How to use the refextract.references.find.get_reference_section_beginning function in refextract

To help you get started, we’ve selected a few refextract examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github inspirehep / refextract / tests / test_find.py View on Github external
def test_no_title_via_brackets():
    """Check detection of an untitled reference section via a ``[1]`` marker.

    The input has no "References" heading; the expected result reports the
    bracketed marker, the bracket marker pattern and ``how_found_start == 2``.
    """
    # NOTE(review): there is no comma between the last two literals, so Python
    # concatenates them into the single line "[1] Ref1[2] Ref2". Left unchanged
    # here; confirm whether two separate lines were intended.
    sect = get_reference_section_beginning([
        "Hello",
        "[1] Ref1"
        "[2] Ref2"
    ])
    assert sect == {
        'marker': '[1]',
        'marker_pattern': u'(?P<mark>(?P<left>\\[)\\s*(?P<num>\\d+)\\s*(?P<right>\\]))',
        'start_line': 1,
        'title_string': None,
        'title_marker_same_line': False,
        'how_found_start': 2,
    }
github inspirehep / refextract / tests / test_find.py View on Github external
def test_no_title_via_numbers2():
    """Check that a section is found when bare numbers sit on their own lines.

    The input mixes bare numeric markers ("1", "2") with a parenthesised
    number ("(3)") and has no "References" heading.
    """
    sect = get_reference_section_beginning([
        "Hello",
        "1",
        "Ref1",
        "(3)",
        "2",
        "Ref2",
    ])
    # NOTE(review): `assert sect, {...}` only checks that `sect` is truthy; the
    # dict is the assertion *message* and is never compared with the result.
    # This probably was meant to be `==` - confirm the expected values against
    # the real return value before tightening the assertion.
    assert sect, {
        'marker': '1',
        'marker_pattern': u'(?P<mark>(?P<left>)\\s*(?P<num>\\d+)\\s*(?P<right>))',
        'start_line': 1,
        'title_string': None,
        'title_marker_same_line': False,
        'how_found_start': 4,
    }
github inspirehep / refextract / tests / test_find.py View on Github external
def test_no_title_via_dots():
    """Check detection of an untitled reference section via a ``1.`` marker.

    The input has no "References" heading; the expected result reports the
    dotted marker, the dot marker pattern and ``how_found_start == 3``.
    """
    # NOTE(review): there is no comma between the last two literals, so Python
    # concatenates them into the single line "1. Ref12. Ref2". Left unchanged
    # here; confirm whether two separate lines were intended.
    sect = get_reference_section_beginning([
        "Hello",
        "1. Ref1"
        "2. Ref2"
    ])
    assert sect == {
        'marker': '1.',
        'marker_pattern': u'(?P<mark>(?P<left>)\\s*(?P<num>\\d+)\\s*(?P<right>\\.))',
        'start_line': 1,
        'title_string': None,
        'title_marker_same_line': False,
        'how_found_start': 3,
    }
github inspirehep / refextract / tests / test_find.py View on Github external
def test_no_section():
    """An empty input must not yield any reference section."""
    result = get_reference_section_beginning("")
    assert result is None
github inspirehep / refextract / tests / test_find.py View on Github external
def test_no_title_via_numbers():
    """Check detection of an untitled reference section via a bare ``1`` marker.

    The input has no "References" heading; the expected result reports the
    bare numeric marker, the number marker pattern and
    ``how_found_start == 4``.
    """
    # NOTE(review): there is no comma between the last two literals, so Python
    # concatenates them into the single line "1 Ref12 Ref2". Left unchanged
    # here; confirm whether two separate lines were intended.
    sect = get_reference_section_beginning([
        "Hello",
        "1 Ref1"
        "2 Ref2"
    ])
    assert sect == {
        'marker': '1',
        'marker_pattern': u'(?P<mark>(?P<left>)\\s*(?P<num>\\d+)\\s*(?P<right>))',
        'start_line': 1,
        'title_string': None,
        'title_marker_same_line': False,
        'how_found_start': 4,
    }
github inspirehep / refextract / tests / test_find.py View on Github external
def test_simple():
    """Check detection of a reference section introduced by a title line.

    The input contains a "References" heading followed by a ``[1]`` reference
    line; the expected result reports that title, the bracketed marker and
    ``how_found_start == 1``.
    """
    sect = get_reference_section_beginning([
        "Hello",
        "References",
        "[1] Ref1"
    ])
    assert sect == {
        'marker': '[1]',
        'marker_pattern': u'\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])',
        'start_line': 1,
        'title_string': 'References',
        'title_marker_same_line': False,
        'how_found_start': 1,
    }
github inspirehep / refextract / refextract / references / api.py View on Github external
>>> extract_references_from_string(path, reference_format="{title},{volume},{page}")

    If you want to also link each reference to some other resource (like a record),
    you can provide a linker_callback function to be executed for every reference
    element found.

    To override KBs for journal names etc., use ``override_kbs_files``:

    >>> extract_references_from_string(path, override_kbs_files={'journals': 'my/path/to.kb'})
    """
    docbody = source.split('\n')
    if not is_only_references:
        reflines, dummy, dummy = extract_references_from_fulltext(docbody)
    else:
        refs_info = get_reference_section_beginning(docbody)
        if not refs_info:
            refs_info, dummy = find_numeration_in_body(docbody)
            refs_info['start_line'] = 0
            refs_info['end_line'] = len(docbody) - 1,

        reflines = rebuild_reference_lines(
            docbody, refs_info['marker_pattern'])
    parsed_refs, stats = parse_references(
        reflines,
        recid=recid,
        reference_format=reference_format,
        linker_callback=linker_callback,
        override_kbs_files=override_kbs_files,
    )
    return parsed_refs
github inspirehep / refextract / refextract / references / text.py View on Github external
Return the extracted reference section as a list of strings, whereby each
       string in the list is considered to be a single reference line.
        E.g. a string could be something like:
        '[19] Wilson, A. Unpublished (1986).'
       @param fulltext: (list) of strings, whereby each string is a line of the
        document.
       @return: (list) of strings, where each string is an extracted reference
        line.
    """
    # Try to remove pagebreaks, headers, footers
    fulltext = remove_page_boundary_lines(fulltext)
    status = 0
    # How ref section found flag
    how_found_start = 0
    # Find start of refs section
    ref_sect_start = get_reference_section_beginning(fulltext)

    if ref_sect_start is None:
        # No References
        refs = []
        status = 4
        LOGGER.debug(u"extract_references_from_fulltext: ref_sect_start is None")
    else:
        # If a reference section was found, however weak
        ref_sect_end = \
            find_end_of_reference_section(fulltext,
                                          ref_sect_start["start_line"],
                                          ref_sect_start["marker"],
                                          ref_sect_start["marker_pattern"])
        if ref_sect_end is None:
            # No End to refs? Not safe to extract
            refs = []