2019-05-25 Week 21 — pydata: Huiming's learning notes

Python regular expressions to clean the RMP data.
import re

# Sample HTML fragment (RMP rule-table markup) used as input for the regular
# expressions below.
# NOTE: the trailing backslash on the `<pre class="code">` line is a Python
# line continuation inside the string literal, so "!" and "=empty" on the next
# line are joined into "!=empty" in the actual string value.
strtest = """  3602433631519" />                                </td>
                <td> 7 </td>
                <td>< = "> </a></td>
                <td>HRB_HighClaim_Sideline</td>

                <!-- Align Rule condition to variable expression -->
                    <td></td>

                <td>MVEL</td>
                <td>
                        <pre class="code">get(&quot;$BFS.hrb_claims_by_customer_us.n_claim_count&quot;)!\
=empty &amp;&amp; 
get(&quot;$var_001&quot;)!=empty &amp;&amp; 
get(&quot;$var_002&quot;)!=empty &amp;&amp; 
$var_001&gt;0 &amp;&amp; 
$var_002&lt; 3650</pre>
                </td>
                <td> 
                <td>
                    <table>
                                </tr>
                        <tr>
                            <td> <a href=
                        </tr>
                    </table>
                </td>tail"> 
                                <td>
                                 </td>
                <td> 8 </td>
                <td><a href=" 05</a></td>
                <td>fortress_test_bf_continue_mo</td>
                <td>
                    <table>
                                <tr>
                                    <td> <a href=" " ><i class="fa fa-bell"></i></a> </td>
                                </tr>
                        <tr>
                            <td> <a href="#"  </td>
                        </tr>
                    </table>
                </td>
                                <td>
                                  <a class="btn btn-link" href="/ru
                                  </div>
                                </td>
            </tr>
                <td> 999 </td>
                <td><a href="/rule/show</td>
                <td>fraudAmtRatioInHL30ForASIN</td>
"""

# 1. Find the row-index cells, e.g. <td> 7 </td>, <td> 8 </td>, <td> 999 </td>,
#    and split the document on them so that each piece holds one table row.
pattern = re.compile(r"<td> [0-9]+ </td>")
row_markers = pattern.findall(strtest)  # kept in a name so the result is inspectable
str_split = pattern.split(strtest)

# 2. Extract the text inside <pre class="code"> ... </pre>.
#    Newlines and tabs are stripped first because `.` does not match a newline
#    by default, and the rule condition spans several lines.
pattern2 = re.compile(r'<pre class=\"code\">(.*?)</pre>')

res = []
for x in str_split:
    res.extend(pattern2.findall(x.replace('\n', '').replace('\t', '')))

# 3. Remove the `get(...)!=empty` existence checks. Per the original note, if
#    they are not removed each variable is counted twice.
#    The parentheses are escaped and the argument is limited to `[^)]*` so a
#    match cannot run past the closing paren of one get() call into a later
#    `!=empty` (the unescaped `get(.*?)!=empty` form could swallow a real
#    condition such as `get(a)>5 && get(b)!=empty`).
pattern3 = re.compile(r'get\([^)]*\)!=empty')

# Clean every extracted condition rather than only res[0], which would raise
# IndexError when no <pre class="code"> block was found.
without_empty_checks = [pattern3.sub('', condition) for condition in res]

# Only `&amp;` is decoded here, as in the original notes; `&gt;`, `&lt;` and
# `&quot;` are left encoded (html.unescape would decode all of them).
cleaned = [text.replace('&amp;', '&') for text in without_empty_checks]