数据格式:
<PAIR> <TEXT>AAA</TEXT> <LINK>BBB</LINK> </PAIR> ...... <PAIR> <TEXT>XXX</TEXT> <LINK>YYY</LINK> </PAIR>
代码:特别注意 re.DOTALL(使 "." 也能匹配换行符,从而支持跨行的标签内容)
def getAnchors(self):
    """Extract anchor texts and links from the "ANCHOR" field.

    The value returned by ``self._get("ANCHOR")`` is scanned for
    ``<PAIR> <TEXT>...</TEXT> <LINK>...</LINK> </PAIR>`` entries
    (presumably a single string; confirm against ``_get``).

    Returns:
        A ``(texts, links)`` tuple of two parallel lists, one entry per
        ``<PAIR>`` in order of appearance. Values are whitespace-stripped.
        An empty or missing ``<TEXT>`` yields ``"EMPTY_ANCHOR_TEXT"``;
        an empty or missing ``<LINK>`` yields ``""``. Both lists are empty
        when the field is empty or ``None``.
    """
    texts = []
    links = []
    astr = self._get("ANCHOR")
    if not astr:
        # Nothing to parse; avoids a TypeError from finditer(None).
        return texts, links

    # DOTALL lets "." cross newlines so tag contents may span several lines;
    # the non-greedy ".*?" stops at the first closing tag.
    pairpat = re.compile("(<PAIR>.*?</PAIR>)", re.DOTALL | re.MULTILINE)
    textpat = re.compile("<TEXT>(.*?)</TEXT>", re.DOTALL | re.MULTILINE)
    linkpat = re.compile("<LINK>(.*?)</LINK>", re.DOTALL | re.MULTILINE)

    for pair_match in pairpat.finditer(astr):
        anchor = pair_match.group()

        # search() returns None when the tag is absent; treat a missing tag
        # the same as an empty one instead of crashing on .group().
        text_match = textpat.search(anchor)
        link_match = linkpat.search(anchor)

        text = text_match.group(1).strip() if text_match else ""
        if not text:
            text = "EMPTY_ANCHOR_TEXT"
        link = link_match.group(1).strip() if link_match else ""

        # Always append to both lists so they stay index-aligned.
        texts.append(text)
        links.append(link)

    return texts, links