How to use the goatools.semantic.TermCounts class in goatools

To help you get started, we’ve selected a few goatools examples based on popular ways TermCounts is used in public projects.
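
Before the project examples, here is a minimal, self-contained sketch of the typical TermCounts workflow: load the GO DAG, load gene-to-GO associations, build a TermCounts object, and use it for information-content and similarity calculations. The import paths, file names (go-basic.obo, goa_human.gaf), and GO IDs mirror the examples below, but treat them as assumptions to adapt to your own data.

# Minimal sketch of the TermCounts workflow (assumed paths and files; adapt as needed).
from goatools.base import get_godag                # assumed helper for downloading/loading go-basic.obo
from goatools.associations import dnld_assc        # assumed helper for downloading/loading a GAF file
from goatools.semantic import TermCounts, get_info_content, lin_sim, resnik_sim

godag = get_godag("go-basic.obo")                  # the GO DAG
associations = dnld_assc("goa_human.gaf", godag)   # gene/protein ID -> set of GO IDs

# TermCounts tallies, for each GO term, the annotations made to the term
# or to any of its descendants; these counts drive information content (IC).
termcounts = TermCounts(godag, associations)

go_a, go_b = "GO:0048364", "GO:0044707"            # example IDs used in the tests below
print("IC({})    =".format(go_a), get_info_content(go_a, termcounts))
print("Resnik sim =", resnik_sim(go_a, go_b, godag, termcounts))
print("Lin sim    =", lin_sim(go_a, go_b, godag, termcounts))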


github tanghaibao / goatools / tests / i148_semsim_lin.py
    dnld_annofile(fin_gpad, 'gpad')

    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    annoobj = GpadReader(fin_gpad, godag=godag)

    goids = [
        'GO:0042581',
        'GO:0101002',
        'GO:0042582',
        'GO:0070820',
        'GO:0008021',
        'GO:0005766',
        'GO:0016591']

    associations = annoobj.get_id2gos('CC')
    termcounts = TermCounts(godag, associations)

    # Calculate Lin values
    p2v = {frozenset([a, b]): lin_sim(a, b, godag, termcounts) for a, b in combo_w_rplc(goids, 2)}
    _prt_values(goids, p2v, prt=sys.stdout)
github tanghaibao / goatools / tests / similarity_top_parent.py
def _test_path_bp_mf(branch_dist, godag, prt):
    """Test distances between BP branch and MF branch."""
    go_mf = 'GO:0003676' # level-03 depth-03 nucleic acid binding [molecular_function]
    go_bp = 'GO:0007516' # level-04 depth-05 hemocyte development [biological_process]
    dst_none = semantic_distance(go_mf, go_bp, godag)
    sim_none = semantic_similarity(go_mf, go_bp, godag)
    assc = dnld_assc("tair.gaf", godag)
    termcounts = TermCounts(godag, assc)
    fmt = '({GO1}, {GO2}) {TYPE:6} score = {VAL}\n'
    sim_r = resnik_sim(go_mf, go_bp, godag, termcounts)
    sim_l = lin_sim(go_mf, go_bp, godag, termcounts)
    if prt is not None:
        prt.write(fmt.format(TYPE='semantic distance', GO1=go_mf, GO2=go_bp, VAL=dst_none))
        prt.write(fmt.format(TYPE='semantic similarity', GO1=go_mf, GO2=go_bp, VAL=sim_none))
        prt.write(fmt.format(TYPE='Resnik similarity', GO1=go_mf, GO2=go_bp, VAL=sim_r))
        prt.write(fmt.format(TYPE='Lin similarity', GO1=go_mf, GO2=go_bp, VAL=sim_l))
    assert dst_none is None
    assert sim_none is None
    assert sim_r is None
    assert sim_l is None
    sim_d = semantic_distance(go_mf, go_bp, godag, branch_dist)
    if prt is not None:
        prt.write(fmt.format(TYPE='semantic distance', GO1=go_mf, GO2=go_bp, VAL=sim_d))
    assert sim_d == godag[go_mf].depth + godag[go_bp].depth + branch_dist
github tanghaibao / goatools / tests / test_yang_fig2a.py
def test_semantic_similarity():
    """Test initializing TermCounts with annotations made to alternate GO ID"""
    godag = GODag(os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.obo'))
    file_id2gos = os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.anno')
    name2go = {o.name: o.item_id for o in godag.values()}
    assoc = _get_id2gos(file_id2gos, godag, name2go, NAME2NUM)
    tcntobj = TermCounts(godag, assoc)
    # N_v: Test accuracy of Python equivalent to Java: getNumberOfAnnotations
    # Test number of unique genes annotated to a GO Term PLUS genes annotated to a descendant
    assert tcntobj.gocnts[name2go['A']] == 100, tcntobj.gocnts
    assert tcntobj.gocnts[name2go['B']] == 40, tcntobj.gocnts
    assert tcntobj.gocnts[name2go['C']] == 50, tcntobj.gocnts
    assert tcntobj.gocnts[name2go['D']] == 10, tcntobj.gocnts
    assert tcntobj.gocnts[name2go['E']] == 10, tcntobj.gocnts
    assert tcntobj.gocnts[name2go['F']] == 10, tcntobj.gocnts
    assert tcntobj.gocnts[name2go['G']] == 30, tcntobj.gocnts
github tanghaibao / goatools / tests / test_semantic_similarity_best4lex.py
]
    # Association files to download for each species (human, yeast).
    associations = [
        ('human', 'goa_human.gaf'),
        ('yeast', 'sgd.gaf'),
    ]


    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    for species, assc_name in associations:  # Limit test numbers for speed
        print()
        # Get all the annotations for the current species
        fin_assc = os.path.join(REPO, assc_name)
        assc_gene2gos = dnld_assc(fin_assc, godag, namespace='MF', prt=None)
        # Calculate the information content of the single term, GO:0048364
        termcounts = TermCounts(godag, assc_gene2gos)

        # Print information values for each GO term
        for goid in sorted(goids):
            infocontent = get_info_content(goid, termcounts)
            term = godag[goid]
            print('{SPECIES} Information content {INFO:8.6f} {NS} {GO} {NAME}'.format(
                SPECIES=species, GO=goid, INFO=infocontent, NS=term.namespace, NAME=term.name))

        # Print semantic similarities between each pair of GO terms
        print("GO #1      GO #2      Resnik Lin")
        print("---------- ---------- ------ -------")
        for go_a, go_b in itertools.combinations(sorted(goids), 2):
            # Resnik's similarity measure is defined as the information content of the most
            # informative common ancestor. That is, the most specific common parent-term in the GO.
            sim_r = resnik_sim(go_a, go_b, godag, termcounts)
            # Lin similarity score (GO:0048364, GO:0044707) = -0.607721957763
github tanghaibao / goatools / tests / utils.py
def get_termcounts(fin_anno, godag, namespace='all', **kws):
    """Get termcounts object"""
    objanno = get_objanno(fin_anno, godag, namespace)
    id2gos = objanno.get_id2gos(namespace=namespace, **kws)
    return TermCounts(godag, id2gos)
github tanghaibao / goatools / tests / test_semantic_similarity.py
    # Now we can calculate the semantic distance and semantic similarity, as so:
    #       "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25.
    go_id3 = 'GO:0048364' # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707' # BP level-02 depth-02 single-multicellular organism process
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    print(godag[go_id3])
    print(godag[go_id4])

    # Then we can calculate the information content of the single term, GO:0048364.
    #       "Information content (GO:0048364) = 7.75481392334

    # First get the counts of each GO term.
    termcounts = TermCounts(godag, associations)

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO. Then we can calculate this as follows:
    #       Resnik similarity score (GO:0048364, GO:0044707) = 0.0 because DCA is BP top
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    dca = deepest_common_ancestor([go_id3, go_id4], godag)
    assert dca == NS2GO['BP']
    assert sim_r == get_info_content(dca, termcounts)
    assert sim_r == 0.0
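
For reference, Lin's measure is conventionally defined in terms of Resnik's: sim_lin(a, b) = 2 * IC(MICA) / (IC(a) + IC(b)), where the MICA is the deepest common ancestor used above. The short sketch below checks that relationship with the functions already shown; it assumes goatools' lin_sim follows the standard definition, and the association file (tair.gaf) is borrowed from an earlier example.

# Hedged sketch: relate lin_sim to resnik_sim via the standard Lin formula
# (assumes goatools' lin_sim follows the conventional definition).
from goatools.base import get_godag
from goatools.associations import dnld_assc
from goatools.semantic import (TermCounts, deepest_common_ancestor,
                               get_info_content, lin_sim, resnik_sim)

godag = get_godag('go-basic.obo')
termcounts = TermCounts(godag, dnld_assc('tair.gaf', godag))   # associations as in _test_path_bp_mf above

go_a, go_b = 'GO:0048364', 'GO:0044707'
ic_a, ic_b = get_info_content(go_a, termcounts), get_info_content(go_b, termcounts)
sim_r = resnik_sim(go_a, go_b, godag, termcounts)   # IC of the deepest common ancestor (MICA)
sim_l = lin_sim(go_a, go_b, godag, termcounts)
print('DCA:   ', deepest_common_ancestor([go_a, go_b], godag))
print('Resnik:', sim_r)
print('Lin:   ', sim_l)
if sim_r is not None and (ic_a + ic_b):             # guard against zero ICs at the namespace root
    print('2*IC(DCA)/(IC(a)+IC(b)) =', 2.0 * sim_r / (ic_a + ic_b))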
github tanghaibao / goatools / tests / test_yang_fig1b.py
def test_semantic_similarity():
    """Test faster version of sematic similarity"""
    godag = GODag(os.path.join(REPO, 'tests/data/yangRWC/fig1b.obo'))
    name2go = {o.name: o.item_id for o in godag.values()}
    assoc = _get_id2gos(os.path.join(REPO, 'tests/data/yangRWC/fig1b.anno'), godag, name2go)
    tcntobj = TermCounts(godag, assoc)
    assert tcntobj.gocnts[name2go['I']] == 20
    assert tcntobj.gocnts[name2go['L']] == 21
    assert tcntobj.gocnts[name2go['M']] == 20
    assert tcntobj.gocnts[name2go['N']] == 20
github tanghaibao / goatools / tests / test_yang_fig1a.py
def test_semantic_similarity():
    """Test faster version of sematic similarity"""
    godag = GODag(os.path.join(REPO, 'tests/data/yangRWC/fig1a.obo'))
    name2go = {o.name: o.item_id for o in godag.values()}
    assoc = _get_id2gos(os.path.join(REPO, 'tests/data/yangRWC/fig1a.anno'), godag, name2go)
    tcntobj = TermCounts(godag, assoc)
    assert tcntobj.gocnts[name2go['I']] == 50
    assert tcntobj.gocnts[name2go['L']] == 50
    assert tcntobj.gocnts[name2go['M']] == 50
    assert tcntobj.gocnts[name2go['N']] == 50
github tanghaibao / goatools / goatools / cli / find_enrichment.py
def __init__(self, objcli, godag_version):
        # _goids = set(o.id for o in godag.values() if not o.children)
        _goids = set(r.GO for r in objcli.results_all)
        _tobj = TermCounts(objcli.godag, objcli.objgoeans.get_assoc())
        # pylint: disable=line-too-long
        self.gosubdag = GoSubDag(_goids, objcli.godag, relationships=True, tcntobj=_tobj, prt=sys.stdout)
        self.grprdflt = GrouperDflts(self.gosubdag, objcli.args.goslim)
        self.hdrobj = HdrgosSections(self.grprdflt.gosubdag, self.grprdflt.hdrgos_dflt, objcli.sections)
        self.pval_fld = objcli.get_pval_field()  # primary pvalue of interest
        self.ver_list = [godag_version,
                         self.grprdflt.ver_goslims,
                         "Sections: {S}".format(S=objcli.args.sections)]
        # self.objaartall = self._init_objaartall()
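
The last example shows the other common use of TermCounts: passing it to GoSubDag so the sub-DAG carries term counts and information content alongside the hierarchy. A stripped-down sketch, where the GoSubDag import path is an assumption and the call signature mirrors the example above:

# Hedged sketch: feed TermCounts into a GoSubDag (keyword arguments as in the example above).
import sys

from goatools.base import get_godag
from goatools.associations import dnld_assc
from goatools.semantic import TermCounts
from goatools.gosubdag.gosubdag import GoSubDag    # assumed import path

godag = get_godag("go-basic.obo")
tcntobj = TermCounts(godag, dnld_assc("goa_human.gaf", godag))

goids = {"GO:0008021", "GO:0005766"}               # GO IDs of interest (taken from the first example)
gosubdag = GoSubDag(goids, godag, relationships=True, tcntobj=tcntobj, prt=sys.stdout)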