How to use the refextract.references.regexs.regex_match_list function in refextract

To help you get started, we’ve selected a few refextract examples that show popular ways the regex_match_list function is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github inspirehep / refextract / tests / test_regexs.py View on Github external
def test_regex_match_list():
    """Exercise regex_match_list with a matching and a non-matching pattern list."""
    text = 'ABC'
    non_matching = re.compile('C.C')
    matching = re.compile('A.C')

    # One of the supplied patterns fits the text: a truthy result is expected.
    found = regexs.regex_match_list(text, [non_matching, matching])
    assert found

    # None of the supplied patterns fits the text: the result must be None.
    not_found = regexs.regex_match_list(text, [non_matching])
    assert not_found is None
github inspirehep / refextract / refextract / references / find.py View on Github external
else:
        mk_patterns = get_reference_line_numeration_marker_patterns()

    current_reference_count = 0
    while x < len(docbody) and not section_ended:
        # save the reference count
        num_match = regex_match_list(docbody[x].strip(), mk_patterns)
        if num_match:
            try:
                current_reference_count = int(num_match.group('marknum'))
            except (ValueError, IndexError):
                # non numerical references marking
                pass
        # look for a likely section title that would follow a reference
        # section:
        end_match = regex_match_list(docbody[x].strip(), t_patterns)
        if not end_match:
            # didn't match a section title - try looking for keywords that
            # suggest the end of a reference section:
            end_match = regex_match_list(docbody[x].strip(), kw_patterns)
        else:
            # Is it really the end of the reference section? Check the lines that
            # follow (up to index x + 200, exclusive) for other reference
            # numeration markers:
            y = x + 1
            line_found = False
            while y < x + 200 and y < len(docbody) and not line_found:
                num_match = regex_match_list(docbody[y].strip(), mk_patterns)
                if num_match and not num_match.group(0).isdigit():
                    try:
                        num = int(num_match.group('marknum'))
                        if current_reference_count + 1 == num:
                            line_found = True
github inspirehep / refextract / refextract / references / find.py View on Github external
mark_pattern = mark_match.re.pattern

            # Look for [2] in next 10 lines:
            next_test_lines = 10

            index = len(docbody) - reversed_index
            zone_to_check = docbody[index:index + next_test_lines]
            if len(zone_to_check) < 5:
                # We found a 1 towards the end, we assume
                # we only have one reference
                found = True
            else:
                # Check for number 2
                found = False
                for line_ in zone_to_check:
                    mark_match2 = regex_match_list(line_.strip(), marker_patterns)
                    if mark_match2 and mark_match2.group('marknum') == '2':
                        found = True
                        break

            if found:
                # Found next reference line:
                found_ref_sect = True
                ref_start_line = len(docbody) - 1 - reversed_index
                ref_line_marker = mark_match.group('mark')
                ref_line_marker_pattern = mark_pattern
                break

    if found_ref_sect:
        ref_sectn_details = {
            'start_line': ref_start_line,
            'title_string': None,
github inspirehep / refextract / refextract / references / find.py View on Github external
def find_numeration_in_title(docbody, title):
    ref_details = None
    found_title = False

    try:
        first_line = docbody[0]
    except IndexError:
        return ref_details, found_title

    # Need to escape to avoid problems like 'References['
    title = re.escape(title)

    mk_with_title_ptns = \
        get_reference_line_numeration_marker_patterns(title)
    mk_with_title_match = \
        regex_match_list(first_line, mk_with_title_ptns)
    if mk_with_title_match:
        mk = mk_with_title_match.group('mark')
        mk_ptn = mk_with_title_match.re.pattern
        m_num = re_num.search(mk)
        if m_num and m_num.group(0) == '1':
            # Mark found
            found_title = True
            ref_details = {
                'marker': mk,
                'marker_pattern': mk_ptn,
                'title_marker_same_line': True
            }
        else:
            ref_details = {
                'marker': mk,
                'marker_pattern': mk_ptn,
github inspirehep / refextract / refextract / references / find.py View on Github external
}
                 Much of this information is used by later functions to rebuild
                 a reference section.
         -- OR --
                (None) - when the reference section could not be found.
    """
    if not docbody:
        return None

    ref_start_line = ref_line_marker = None

    # try to find first reference line in the reference section:
    found_ref_sect = False

    for reversed_index, line in enumerate(reversed(docbody)):
        mark_match = regex_match_list(line.strip(), marker_patterns)
        if mark_match and mark_match.group('marknum') == '1':
            # Get marker recognition pattern:
            mark_pattern = mark_match.re.pattern

            # Look for [2] in next 10 lines:
            next_test_lines = 10

            index = len(docbody) - reversed_index
            zone_to_check = docbody[index:index + next_test_lines]
            if len(zone_to_check) < 5:
                # We found a 1 towards the end, we assume
                # we only have one reference
                found = True
            else:
                # Check for number 2
                found = False
github inspirehep / refextract / refextract / references / find.py View on Github external
# No numeration unless we find one
    ref_details = {
        'title_marker_same_line': False,
        'marker': None,
        'marker_pattern': None,
    }

    for line in docbody:
        # Move past blank lines
        if line.isspace():
            continue

        # Is this line numerated like a reference line?
        m_num = None
        mark_match = regex_match_list(line, marker_patterns)
        if mark_match:
            # Check if it's the first reference
            # Something like [1] or (1), etc.
            try:
                m_num = mark_match.group('marknum')
                if m_num != '1':
                    continue
            except IndexError:
                pass

            mark = mark_match.group('mark')
            mk_ptn = mark_match.re.pattern
            ref_details = {
                'marker': mark,
                'marker_pattern': mk_ptn,
                'title_marker_same_line': False,