How to use the pygtftk.gtf_interface.GTF class in pygtftk

To help you get started, we’ve selected a few pygtftk examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github dputhier / pygtftk / pygtftk / plugins / select_by_key.py View on Github external
elif file_with_values is None:
        if key is None or value is None:
            message("Key and value are mandatory. Alternatively use -e/t/g/f or -f with -k.",
                    type="ERROR")

    elif file_with_values is not None:
        if key is None:
            message("Please set -k.", type="ERROR")
        if value is not None:
            message("The -f and -v arguments are mutually exclusive.", type="ERROR")

    # ----------------------------------------------------------------------
    # Load file with value
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)
    all_values = gtf.extract_data(key, as_list=True, no_na=True, nr=True)

    if log:
        feat_before = len(gtf)

    if not file_with_values:
        value_list = value.split(",")
        gtf = gtf.select_by_key(key, value, invert_match)
    else:
        value_list = []

        for line in file_with_values:
            cols = line.split("\t")
            value_list += [cols[col - 1]]
        file_with_values.close()
        file_with_values = open(file_with_values.name)
github dputhier / pygtftk / pygtftk / plugins / del_attr.py View on Github external
def del_attr(
        inputfile=None,
        outputfile=None,
        key="transcript_id",
        reg_exp=False,
        invert_match=False):
    """
    Delete extended attributes in the target gtf file. attr_list can be a
    comma-separated list of attributes.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    if reg_exp:
        try:
            rgxp = re.compile(key)
        except:
            message("Check the regular expression please.", type="ERROR")
        key_list = [key]
    else:
        key_list = key.split(",")

    for i in gtf:

        feature_keys = i.get_attr_names()

        if not invert_match:
            for k in key_list:
github dputhier / pygtftk / pygtftk / plugins / join_attr.py View on Github external
# -----------------------------------------------------------

    if matrix is True:
        if new_key is not None:
            message("--new-key and --matrix are mutually exclusive.",
                    type="ERROR")
    else:
        if new_key is None:
            message("--new-key is required when --matrix is False.",
                    type="ERROR")

    # -----------------------------------------------------------
    #  load the GTF
    # -----------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # -----------------------------------------------------------
    #  Check target feature
    # -----------------------------------------------------------

    feat_list = gtf.get_feature_list(nr=True)

    if target_feature is not None:
        target_feature_list = target_feature.split(",")

        for i in target_feature_list:
            if i not in feat_list + ["*"]:
                message("Feature " + i + " not found.",
                        type="ERROR")
    else:
        target_feature = ",".join(feat_list)
github dputhier / pygtftk / pygtftk / plugins / convergent.py View on Github external
downstream=1500,
        chrom_info=None):
    """
    Find transcript with convergent tts.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    tx_to_convergent_nm = dict()
    dist_to_convergent = dict()
    tts_pos = dict()

    message("Loading GTF.")

    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")

    tx_feat = gtf.select_by_key("feature", "transcript")

    message("Getting tts coordinates.")

    tts_bo = tx_feat.get_tts(name=["transcript_id", "gene_id"],
                             sep="||")

    # get tts position
    for i in tts_bo:
        tx_id_ov, gn_id_ov = i.name.split("||")
        tts_pos[tx_id_ov] = int(i.start)

    message("Getting tts coordinates.")
github dputhier / pygtftk / pygtftk / gtf_interface.py View on Github external
if not token[0].isdigit():
                raise GTFtkError("Column 1 of intput file should be an int.")

        new_data = self._dll.add_attr_to_pos(self._data,
                                             native_str(input_file.name),
                                             native_str(new_key))

        return self._clone(new_data)


if __name__ == "__main__":

    # Quick manual check: load the first file returned by
    # get_example_file() and write every record selected with the
    # ("feature", "transcript") key to standard output.
    from pygtftk.utils import get_example_file

    example_files = get_example_file()
    example_gtf = GTF(example_files[0])
    for record in example_gtf["feature", "transcript"]:
        record.write(sys.stdout)
github dputhier / pygtftk / pygtftk / plugins / divergent.py View on Github external
no_strandness=False,
        no_annotation=False):
    """
Find transcript with divergent promoters.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    tx_with_divergent = dict()
    dist_to_divergent = dict()
    tss_pos = dict()

    message("Loading GTF.")

    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")

    tx_feat = gtf.select_by_key("feature",
                                "transcript")
    message("Getting tss coordinates.")

    tss_bo = tx_feat.get_tss(name=["transcript_id", "gene_id"],
                             sep="||")

    # get tss position
    for i in tss_bo:
        tx_id_tss, gn_id_tss = i.name.split("||")
        tss_pos[tx_id_tss] = int(i.start)

    message("Getting promoter coordinates.")
github dputhier / pygtftk / pygtftk / plugins / select_by_max_exon_nb.py View on Github external
def select_by_max_exon_nb(inputfile=None,
                          outputfile=None):
    """
    Select transcripts based on the number of exons.

    Loads *inputfile* as a GTF (with check_ensembl_format disabled), applies
    GTF.select_by_max_exon_nb() and writes the result to *outputfile*.

    :param inputfile: The GTF input, passed as-is to GTF().
    :param outputfile: The destination, passed as-is to GTF.write().
    """

    message("Selecting transcript with the highest number of exon for each gene.")

    selected = GTF(inputfile, check_ensembl_format=False).select_by_max_exon_nb()

    # gc_off=True is forwarded untouched to the writer.
    selected.write(outputfile, gc_off=True)
github dputhier / pygtftk / pygtftk / plugins / tss_numbering.py View on Github external
def tss_numbering(
        inputfile=None,
        outputfile=None,
        compute_dist=False,
        key_name='tss_number',
        key_name_dist='dist_to_first_tss',
        add_nb_tss_to_gene=False,
        gene_key='nb_tss'):
    """
    Computes the distance between TSS of gene transcripts.

    The lines shown here load the GTF, fetch the TSS of every transcript
    and group these positions by gene.

    :param inputfile: The GTF input, passed as-is to GTF().

    NOTE(review): outputfile, compute_dist, key_name, key_name_dist,
    add_nb_tss_to_gene and gene_key are not referenced in the lines shown
    here; presumably they are used further down -- confirm.
    """

    # Load the GTF with the ensembl format check enabled.
    gtf = GTF(inputfile, check_ensembl_format=True)

    # gene_id -> {transcript_id -> TSS position as int}
    gn_tss_dist = defaultdict(dict)

    message("Getting TSSs.")
    # Requested as a dict; its keys are used below to index tx_to_gn, so
    # they are presumably transcript ids -- confirm against get_tss().
    tss = gtf.get_tss(name=["transcript_id"], as_dict=True)
    tx_to_gn = gtf.get_tx_to_gn()

    # File each transcript's TSS position under its gene.
    for k in tss:
        gn_id = tx_to_gn[k]
        gn_tss_dist[gn_id][k] = int(tss[k])

    # If as_dict_of_dict is true, get_gn_to_tx() returns a dict of dict
    # that maps gene_id to transcript_id and transcript_id to TSS
    # numbering (1 for most 5', then 2...). For transcripts having
    # the same TSSs, the tss number will be the same.
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)
github dputhier / pygtftk / pygtftk / plugins / splicing_site.py View on Github external
def splicing_site(inputfile=None,
                  outputfile=None,
                  exon_numbering_key=False,
                  names="exon_id,transcript_id,gene_id",
                  separator="\t"):
    """
    Compute the locations of splice donor and acceptor sites. You may extend them in 3' and 5' depending on your needs.

    :param inputfile: The GTF input, passed as-is to GTF().
    :param exon_numbering_key: Name of the attribute holding the exon number; it is concatenated into the list of extracted columns.
    :param names: Comma-separated keys appended at the end of the list of extracted columns.

    NOTE(review): exon_numbering_key defaults to False but is concatenated
    to a str below, which raises TypeError unless the caller supplies a
    string -- confirm that callers always pass one.
    NOTE(review): outputfile and separator are not referenced in the lines
    shown here; presumably they are used further down -- confirm.
    """

    gtf = GTF(inputfile)

    # NOTE(review): not used in the lines shown here.
    nb_exons = gtf.nb_exons()

    # Columns to extract, in order: 0=feature, 1=seqid, 2=start, 3=end,
    # 4=transcript_id, 5=<exon_numbering_key>, 6=strand, then the keys
    # listed in `names`.
    info = "feature,seqid,start,end,transcript_id," + exon_numbering_key
    info += ",strand," + names

    exon_info = gtf.extract_data_iter_list(info)

    for i in exon_info:

        # Only exon lines are considered.
        if i[0] == "exon":
            # Column 5 holds the exon numbering value; "." is reported
            # as a missing numbering.
            if i[5] == ".":
                message("Some exon lines do not contain any numbering. "
                        "Use add_exon_nb or set --exon-numbering-key to the proper key.",
                        type="ERROR")
github dputhier / pygtftk / pygtftk / plugins / intergenic.py View on Github external
def intergenic(
        inputfile=None,
        outputfile=None,
        chrom_info=None):
    """
    Extract intergenic regions.

    Loads *inputfile* as a GTF, asks it for its intergenic regions given
    *chrom_info*, names them region_1, region_2, ... in iteration order and
    writes each one to *outputfile*.

    :param inputfile: The GTF input, passed as-is to GTF().
    :param outputfile: The destination handed to write_properly().
    :param chrom_info: Forwarded untouched to GTF.get_intergenic().
    """

    message("Searching for intergenic regions.")

    annotation = GTF(inputfile)
    regions = annotation.get_intergenic(chrom_info)

    # Number the regions starting from 1.
    for rank, region in enumerate(regions, start=1):
        region.name = "region_" + str(rank)
        write_properly(chomp(str(region)), outputfile)

    # The garbage collector is switched off before the files are closed.
    gc.disable()
    close_properly(outputfile, inputfile)