# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def extract(self, wiki_document):
    """Extract the article text and its link annotations.

    First delegates to the general ``wikiextractor.WikiExtractor`` to
    clean the raw document, then scans the cleaned text for HTML anchor
    links and records each link's target URI, surface form, and
    character offset into ``annotations``.

    :param wiki_document: the wiki document object to process
    :return: ``None`` when the general extractor rejects the document
    """
    annotations = []
    # Extract the article using the general WikiExtractor:
    wiki_document = wikiextractor.WikiExtractor.extract(self, wiki_document)
    if not wiki_document:
        return None
    # Tracks the length difference between the original article with
    # <a href=".."> links and the version that keeps only each link's
    # label.  NOTE(review): it is never incremented in the visible
    # code, so every offset here is relative to the raw linked text.
    deltaStringLength = 0
    # Find all links in the article and save their positions into the
    # annotations object.
    # FIX: the previous pattern '</a><a href="([^">([^>]+)</a>' was
    # garbled -- it contained only one (malformed) capture group, so
    # m.group(2) raised IndexError and no real link could match.
    ms = re.finditer(r'<a href="([^"]+)">([^<]+)</a>', wiki_document.text)
    for m in ms:
        # Skip links whose target carries a URL-encoded anchor ("#")
        # fragment, unless anchors are explicitly kept.
        # NOTE(review): ``keep_anchors`` is not defined in this block;
        # presumably a module-level flag -- confirm it exists.
        if urllib.quote("#") not in m.group(1) or keep_anchors:
            annotations.append({
                "uri": m.group(1),
                "surface_form": m.group(2),
                "offset": m.start() - deltaStringLength
            })
def extract(self, wiki_document):
    """Extract the article text and its link annotations.

    NOTE(review): this redefines ``extract`` -- an earlier definition
    with the same name appears above in this file, and this later one
    wins at class-creation time.  It also uses abbreviated annotation
    keys ("u"/"s"/"o") where the earlier copy used "uri"/
    "surface_form"/"offset"; confirm which schema downstream consumers
    expect.  The keys are kept unchanged here to preserve behavior.

    :param wiki_document: the wiki document object to process
    :return: ``None`` when the general extractor rejects the document
    """
    annotations = []
    # Extract the article using the general WikiExtractor:
    wiki_document = wikiextractor.WikiExtractor.extract(self, wiki_document)
    if not wiki_document:
        return None
    # Tracks the length difference between the original article with
    # <a href=".."> links and the version that keeps only each link's
    # label.  NOTE(review): never incremented in the visible code.
    deltaStringLength = 0
    # Find all links in the article and save their positions into the
    # annotations object.
    # FIX: the previous pattern '</a><a href="([^">([^>]+)</a>' was
    # garbled -- it contained only one (malformed) capture group, so
    # m.group(2) raised IndexError and no real link could match.
    ms = re.finditer(r'<a href="([^"]+)">([^<]+)</a>', wiki_document.text)
    for m in ms:
        # Skip links whose target carries a URL-encoded anchor ("#")
        # fragment, unless anchors are explicitly kept.
        # NOTE(review): ``keep_anchors`` is not defined in this block;
        # presumably a module-level flag -- confirm it exists.
        if urllib.quote("#") not in m.group(1) or keep_anchors:
            annotations.append({
                "u": m.group(1),
                "s": m.group(2),
                "o": m.start() - deltaStringLength
            })