Python中正则表达式用法

数据格式:

<PAIR>
<TEXT>AAA</TEXT>
<LINK>BBB</LINK>
</PAIR>
......

<PAIR>
<TEXT>XXX</TEXT>
<LINK>YYY</LINK>
</PAIR>

代码:特别注意re.DOTALL(它使“.”也能匹配换行符,这样才能匹配跨多行的<PAIR>块)

    def getAnchors(self):
        """Extract anchor texts and links from the "ANCHOR" field.

        The value returned by ``self._get("ANCHOR")`` is scanned for
        ``<PAIR>...</PAIR>`` blocks. From each block the first ``<TEXT>``
        and the first ``<LINK>`` payloads are taken and stripped of
        surrounding whitespace.
        NOTE(review): assumes ``self._get("ANCHOR")`` returns a str -- confirm.

        Returns:
            A ``(texts, links)`` tuple of two parallel lists, one entry per
            ``<PAIR>`` block, in document order. A text that is missing or
            blank is replaced by ``"EMPTY_ANCHOR_TEXT"``; a missing link
            becomes ``""``.
        """
        astr = self._get("ANCHOR")
        # DOTALL lets "." match newlines, so a tag payload (and a whole
        # <PAIR> block) may span several lines; the non-greedy ".*?" stops
        # at the nearest closing tag.
        pairpat = re.compile(r"(<PAIR>.*?</PAIR>)", re.DOTALL | re.MULTILINE)
        textpat = re.compile(r"<TEXT>(.*?)</TEXT>", re.DOTALL | re.MULTILINE)
        linkpat = re.compile(r"<LINK>(.*?)</LINK>", re.DOTALL | re.MULTILINE)

        texts = []
        links = []
        for pair_match in pairpat.finditer(astr):
            anchor = pair_match.group()
            # search() returns None when the tag is absent; treat that like
            # an empty payload instead of failing on .group().
            text_match = textpat.search(anchor)
            link_match = linkpat.search(anchor)
            text = text_match.group(1).strip() if text_match else ""
            if not text:
                text = "EMPTY_ANCHOR_TEXT"
            link = link_match.group(1).strip() if link_match else ""
            # Always append to both lists so they stay index-aligned.
            texts.append(text)
            links.append(link)
        return texts, links

Leave a Reply

Your email address will not be published. Required fields are marked *