How to use the pyani.pyani_files module in pyani

To help you get started, we’ve selected a few pyani examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github widdowquinn / pyani / pyani / anib.py View on Github external
:param mode:  str, analysis type (ANIb or ANIblastall)
    :param logger:  a logger for messages

    Returns the following pandas dataframes in an ANIResults object;
    query sequences are rows, subject sequences are columns:

    - alignment_lengths - non-symmetrical: total length of alignment
    - percentage_identity - non-symmetrical: ANIb (Goris) percentage identity
    - alignment_coverage - non-symmetrical: coverage of query
    - similarity_errors - non-symmetrical: count of similarity errors

    May throw a ZeroDivisionError if one or more BLAST runs failed, or a
    very distant sequence was included in the analysis.
    """
    # Process directory to identify input files
    blastfiles = pyani_files.get_input_files(blast_dir, ".blast_tab")
    # Hold data in ANIResults object
    results = ANIResults(list(org_lengths.keys()), mode)

    # Fill diagonal NA values for alignment_length with org_lengths
    for org, length in list(org_lengths.items()):
        results.alignment_lengths[org][org] = length

    # Process .blast_tab files assuming that the filename format holds:
    # org1_vs_org2.blast_tab:
    for blastfile in blastfiles:
        qname, sname = blastfile.stem.split("_vs_")

        # We may have BLAST files from other analyses in the same directory
        # If this occurs, we raise a warning, and skip the file
        if qname not in list(org_lengths.keys()):
            if logger:
github widdowquinn / pyani / pyani / scripts / subcommands / subcmd_index.py View on Github external
def subcmd_index(args: Namespace) -> int:
    """Generate a file with the MD5 hash for each genome in an input directory.

    :param args:  Namespace, received command-line arguments
    :param logger:  logging object

    Identify the genome files in the input directory, and generate a single
    MD5 for each so that .fna produces .md5

    Genome files (FASTA) are identified from the file extension.
    """
    logger = logging.getLogger(__name__)

    # Get list of FASTA files in the input directory
    logger.info("Scanning directory %s for FASTA files", args.indir)
    fpaths = pyani_files.get_fasta_paths(args.indir)
    logger.info("Found FASTA files:")
    logger.info([f"\t{fpath}\n" for fpath in fpaths])

    # Lists of class/label information
    classes = []
    labels = []

    # Create MD5 hash for each file, if needed
    for fpath in fpaths:
        hashfname = fpath.with_suffix(".md5")
        if hashfname.is_file():
            logger.info("%s already indexed (using existing hash)", fpath)
            with open(hashfname, "r") as ifh:
                datahash = ifh.readline().split()[0]
        else:
            # Write an .md5 hash file
github widdowquinn / pyani / pyani / anim.py View on Github external
:param org_lengths:  dictionary of total sequence lengths, keyed by sequence

    Returns the following pandas dataframes in an ANIResults object;
    query sequences are rows, subject sequences are columns:

    - alignment_lengths - symmetrical: total length of alignment
    - percentage_identity - symmetrical: percentage identity of alignment
    - alignment_coverage - non-symmetrical: coverage of query and subject
    - similarity_errors - symmetrical: count of similarity errors

    May throw a ZeroDivisionError if one or more NUCmer runs failed, or a
    very distant sequence was included in the analysis.
    """
    # Process directory to identify input files - as of v0.2.4 we use the
    # .filter files that result from delta-filter (1:1 alignments)
    deltafiles = pyani_files.get_input_files(delta_dir, ".filter")

    # Hold data in ANIResults object
    results = ANIResults(list(org_lengths.keys()), "ANIm")

    # Fill diagonal NA values for alignment_length with org_lengths
    for org, length in list(org_lengths.items()):
        results.alignment_lengths[org][org] = length

    # Process .delta files assuming that the filename format holds:
    # org1_vs_org2.delta
    for deltafile in deltafiles:
        qname, sname = deltafile.stem.split("_vs_")

        # We may have .delta files from other analyses in the same directory
        # If this occurs, we raise a warning, and skip the .delta file
        if qname not in list(org_lengths.keys()):
github widdowquinn / pyani / pyani / anim.py View on Github external
def get_fasta_files(dirname: Path = Path(".")) -> Iterable:
    """Return iterable of FASTA files in the passed directory.

    :param dirname:  Path, path to input directory (defaults to the
        current working directory)

    FASTA files are recognised by extension: .fasta, .fas, .fa, .fna,
    or .fsa_nt. Identification is delegated to
    pyani_files.get_input_files().
    """
    # Delegate directory scanning/filtering to the shared helper so all
    # callers agree on which extensions count as FASTA input.
    return pyani_files.get_input_files(
        dirname, ".fasta", ".fas", ".fa", ".fna", ".fsa_nt"
    )
github widdowquinn / pyani / pyani / scripts / average_nucleotide_identity.py View on Github external
# Ensure argument validity and get method function/config
    test_class_label_paths(args, logger)
    test_scheduler(args, logger)
    method_function, method_config = get_method(args, logger)
    make_outdirs(args)

    # Skip calculations (or not) depending on rerender option
    if args.rerender:
        logger.warning(
            "--rerender option used. Producing graphics with no new recalculations"
        )
    else:
        # Run ANI comparisons
        logger.info("Identifying FASTA files in %s", args.indirname)
        infiles = pyani_files.get_fasta_files(args.indirname)
        logger.info("Input files:\n\t%s", "\n\t".join([str(_) for _ in infiles]))

        # Are we subsampling? If so, make the selection here
        if args.subsample:
            infiles = subsample_input(args, logger, infiles)
            logger.info(
                "Sampled input files:\n\t%s", "\n\t".join([str(_) for _ in infiles])
            )

        # Get lengths of input sequences
        logger.info("Processing input sequence lengths")
        org_lengths = pyani_files.get_sequence_lengths(infiles)
        seqlens = os.linesep.join(
            ["\t%s: %d" % (k, v) for k, v in list(org_lengths.items())]
        )
        logger.info("Sequence lengths:\n%s", seqlens)
github widdowquinn / pyani / pyani / scripts / subcommands.py View on Github external
name = args.name

    # Add info for this analysis to the database
    logger.info("Adding analysis information to database %s", args.dbpath)
    run_id = pyani_db.add_run(args.dbpath, "ANIm", args.cmdline,
                              start_time, "started", name)
    logger.info("Current analysis has ID %s in this database", run_id)

    # Identify input files for comparison, and populate the database
    logger.info("Identifying input genome/hash files:")
    infiles = pyani_files.get_fasta_and_hash_paths(args.indir)
    # Get hash string and sequence description for each FASTA/hash pair,
    # and add info to the current database
    for fastafile, hashfile in infiles:
        # Get genome data
        inhash, filecheck = pyani_files.read_hash_string(hashfile)
        indesc = pyani_files.read_fasta_description(fastafile)
        abspath = os.path.abspath(fastafile)
        genome_len = pyani_tools.get_genome_length(abspath)
        outstr = ["FASTA file:\t%s" % abspath,
                  "description:\t%s" % indesc,
                  "hash file:\t%s" % hashfile,
                  "MD5 hash:\t%s" % inhash,
                  "Total length:\t%d" % genome_len]
        logger.info('\t' + '\n\t'.join(outstr))

        # Attempt to add current genome/path combination to database
        logger.info("Adding genome data to database...")
        try:
            genome_id = pyani_db.add_genome(args.dbpath, inhash,
                                            abspath, genome_len, indesc)
        except sqlite3.IntegrityError:  # genome data already in database