How to use the hepcrawl.utils.split_fullname function in hepcrawl

To help you get started, we’ve selected a few hepcrawl examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github inspirehep / hepcrawl / tests / unit / test_utils.py View on Github external
def test_split_fullname():
    """Check ``split_fullname`` against a table of author name strings.

    Each row holds the raw author string, any extra keyword arguments for
    the call, and the ``(surname, given_names)`` tuple expected back.
    """
    # Rows are kept in the same order as the original one-per-line asserts.
    cases = [
        ('Doe, John Magic', {}, ('Doe', 'John Magic')),
        ('Doe Boe, John Magic', {}, ('Doe Boe', 'John Magic')),
        ('Doe Boe John Magic', {'switch_name_order': True},
         ('Doe', 'Boe John Magic')),
        ('John Magic Doe', {}, ('Doe', 'John Magic')),
        ('John Magic Doe Boe', {}, ('Boe', 'John Magic Doe')),
        ('John Magic, Doe Boe', {'switch_name_order': True},
         ('Doe Boe', 'John Magic')),
        # An empty input is expected to produce two empty strings.
        ('', {}, ('', '')),
    ]
    for fullname, options, expected in cases:
        assert split_fullname(fullname, **options) == expected
github inspirehep / hepcrawl / tests / unit / test_utils.py View on Github external
def test_split_fullname():
    """Check ``split_fullname`` against a table of author name strings.

    Each row holds the raw author string, any extra keyword arguments for
    the call, and the ``(surname, given_names)`` tuple expected back.
    """
    # Rows are kept in the same order as the original one-per-line asserts.
    cases = [
        ('Doe, John Magic', {}, ('Doe', 'John Magic')),
        ('Doe Boe, John Magic', {}, ('Doe Boe', 'John Magic')),
        ('Doe Boe John Magic', {'switch_name_order': True},
         ('Doe', 'Boe John Magic')),
        ('John Magic Doe', {}, ('Doe', 'John Magic')),
        ('John Magic Doe Boe', {}, ('Boe', 'John Magic Doe')),
        ('John Magic, Doe Boe', {'switch_name_order': True},
         ('Doe Boe', 'John Magic')),
        # An empty input is expected to produce two empty strings.
        ('', {}, ('', '')),
    ]
    for fullname, options, expected in cases:
        assert split_fullname(fullname, **options) == expected
github inspirehep / hepcrawl / tests / unit / test_utils.py View on Github external
def test_split_fullname():
    """Check ``split_fullname`` against a table of author name strings.

    Each row holds the raw author string, any extra keyword arguments for
    the call, and the ``(surname, given_names)`` tuple expected back.
    """
    # Rows are kept in the same order as the original one-per-line asserts.
    cases = [
        ('Doe, John Magic', {}, ('Doe', 'John Magic')),
        ('Doe Boe, John Magic', {}, ('Doe Boe', 'John Magic')),
        ('Doe Boe John Magic', {'switch_name_order': True},
         ('Doe', 'Boe John Magic')),
        ('John Magic Doe', {}, ('Doe', 'John Magic')),
        ('John Magic Doe Boe', {}, ('Boe', 'John Magic Doe')),
        ('John Magic, Doe Boe', {'switch_name_order': True},
         ('Doe Boe', 'John Magic')),
        # An empty input is expected to produce two empty strings.
        ('', {}, ('', '')),
    ]
    for fullname, options, expected in cases:
        assert split_fullname(fullname, **options) == expected
github inspirehep / hepcrawl / tests / unit / test_utils.py View on Github external
def test_split_fullname():
    """Check ``split_fullname`` against a table of author name strings.

    Each row holds the raw author string, any extra keyword arguments for
    the call, and the ``(surname, given_names)`` tuple expected back.
    """
    # Rows are kept in the same order as the original one-per-line asserts.
    cases = [
        ('Doe, John Magic', {}, ('Doe', 'John Magic')),
        ('Doe Boe, John Magic', {}, ('Doe Boe', 'John Magic')),
        ('Doe Boe John Magic', {'switch_name_order': True},
         ('Doe', 'Boe John Magic')),
        ('John Magic Doe', {}, ('Doe', 'John Magic')),
        ('John Magic Doe Boe', {}, ('Boe', 'John Magic Doe')),
        ('John Magic, Doe Boe', {'switch_name_order': True},
         ('Doe Boe', 'John Magic')),
        # An empty input is expected to produce two empty strings.
        ('', {}, ('', '')),
    ]
    for fullname, options, expected in cases:
        assert split_fullname(fullname, **options) == expected
github inspirehep / hepcrawl / hepcrawl / spiders / mit_spider.py View on Github external
"""Return authors dictionary """
        authors_raw = node.xpath(
            "//td[contains(text(), 'dc.contributor.author')]/following-sibling::td[1]/text()").extract()
        affiliation = node.xpath(
            "//td[contains(text(), 'dc.contributor.department')]/following-sibling::td[1]/text()").extract_first()

        authors = []
        strip_years_pattern = re.compile(r"(.*)\,\s\d{4}.?")
        full_given_names_pattern = re.compile(r".?\((.*)\).?")
        for author in authors_raw:
            try:
                # Might contain birthdate
                author = strip_years_pattern.search(author).group(1)
            except AttributeError:
                pass
            surname, given_names = split_fullname(author)
            try:
                # Might contain full given_names in parentheses
                given_names = full_given_names_pattern.search(given_names).group(1)
            except AttributeError:
                pass

            authdict = {
                'surname': surname,
                'given_names': given_names,
            }
            if affiliation:
                authdict["affiliations"] = [{"value": affiliation}]
            authors.append(authdict)

        return authors
github inspirehep / hepcrawl / hepcrawl / inputs.py View on Github external
def parse_authors(value):
    """Complete the name fields of a single author record, in place.

    When the record carries a ``raw_name`` but no ``surname``, the raw name
    is split with ``split_fullname`` to fill in ``surname`` and
    ``given_names``. A non-empty ``given_names`` is then run through
    ``collapse_initials`` (which, per the original docstring, removes the
    spaces from initials) and ``full_name`` becomes
    ``"<surname>, <given_names>"``. Without given names, ``full_name`` is
    just the surname.

    :param value: mapping describing one author; it is mutated.
    :returns: the same ``value`` object with ``full_name`` set.
    """
    must_split = "raw_name" in value and "surname" not in value
    if must_split:
        surname, given_names = split_fullname(value['raw_name'])
        value['surname'] = surname
        value['given_names'] = given_names

    # Missing or empty given names: the surname alone is the full name.
    if not value.get('given_names'):
        value['full_name'] = value['surname']
        return value

    value['given_names'] = collapse_initials(value['given_names'])
    value['full_name'] = u'{0}, {1}'.format(
        value['surname'], value['given_names'])
    return value
github inspirehep / hepcrawl / hepcrawl / spiders / magic_spider.py View on Github external
"""Parses the line where there are data about the author(s)

        Note that author surnames and given names are not comma separated, so
        `split_fullname` might get a wrong surname.
        """
        authors_raw = node.xpath(
            "//div[@id='content']/p[@class='author']/text()").extract()
        affiliation = node.xpath(
            "//h2[contains(text(), 'School')]/following-sibling::p/strong/text()"
        ).extract_first()
        if not affiliation:
            affiliation = ''

        authors = []
        for author in authors_raw:
            surname, given_names = split_fullname(author)
            authdict = {
                'surname': surname,
                'given_names': given_names,
            }
            if affiliation:
                authdict["affiliations"] = [{"value": affiliation}]
            authors.append(authdict)

        return authors
github inspirehep / hepcrawl / hepcrawl / parsers / arxiv.py View on Github external
some_affiliation_contains_collaboration = True
                else:
                    affiliations_without_collaborations.append(aff)
            for aff in affiliations_with_collaborations:
                coll, author_name = coll_cleanforthe(aff)
                if coll and coll not in collaborations:
                    collaborations.append(coll)

            # Check if name is a collaboration, else append to authors
            collaboration_in_name = ' for the ' in name_string.lower() or any(
                phrase in name_string.lower() for phrase in collab_phrases
            )
            if collaboration_in_name:
                coll, author_name = coll_cleanforthe(name_string)
                if author_name:
                    surname, given_names = split_fullname(author_name)
                    authors.append({
                        'full_name': surname + ', ' + given_names,
                        'surname': surname,
                        'given_names': given_names,
                        'affiliations': [],
                    })
                if coll and coll not in collaborations:
                    collaborations.append(coll)
            elif name_string.strip() == ':':
                # DANGERZONE : this might not be correct - add a warning for the cataloger
                warning_tags.append(' %s %s ' % (next_forenames, next_keyname))
                if not some_affiliation_contains_collaboration:
                    # everything up to now seems to be collaboration info
                    for author_info in authors:
                        name_string = " %s %s " % \
                            (author_info['given_names'], author_info['surname'])