How to use toil - 9 common examples

To help you get started, we’ve selected a few toil examples based on popular ways the library is used in public projects.

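Toil builds workflows out of jobs: you wrap a plain Python function with Job.wrapJobFn, hand the resulting root job to a Toil context manager, and Toil schedules it (and any children or follow-ons it creates) against the job store named on the command line. Before diving into the project snippets, here is a minimal self-contained sketch; the hello function is ours, not taken from any project below:

from toil.common import Toil
from toil.job import Job

def hello(job, name):
    # job.log writes into the workflow's log.
    job.log("Hello, {}!".format(name))
    return "Hello, {}!".format(name)

if __name__ == "__main__":
    parser = Job.Runner.getDefaultArgumentParser()
    options = parser.parse_args()  # first positional argument is the job store locator
    with Toil(options) as toil:
        greeting = toil.start(Job.wrapJobFn(hello, "world"))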

From edraizen/molmimic (generate_data/calculate_bsa.py):
import os

# Helper functions (get_interfaces_path, iter_cdd, observed_bsa, inferred_bsa)
# come from the surrounding molmimic module.
def start_toil(job, dataset_name, name="bsa"):
    path = os.path.join(get_interfaces_path(dataset_name), "by_superfamily")
    for cdd, sfam_id in iter_cdd(use_id=True, group_superfam=True):
        sfam_path = os.path.join(path, str(int(sfam_id)), str(int(sfam_id)))
        if not os.path.isfile(sfam_path+".observed_interactome"):
            continue
        # Child job computes BSA for the observed interactome...
        cjob = job.addChildJobFn(observed_bsa, dataset_name, sfam_id)
        if not os.path.isfile(sfam_path+".inferred_interactome"):
            continue
        # ...and a follow-on handles the inferred interactome afterwards.
        cjob.addFollowOnJobFn(inferred_bsa, dataset_name, sfam_id)

if __name__ == "__main__":
    from toil.common import Toil
    from toil.job import Job

    parser = Job.Runner.getDefaultArgumentParser()
    options = parser.parse_args()
    options.logLevel = "DEBUG"
    options.clean = "always"
    dataset_name = options.jobStore.split(":")[-1]

    job = Job.wrapJobFn(start_toil, dataset_name)
    with Toil(options) as toil:
        toil.start(job)
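calculate_bsa.py leans on Toil's two successor types: a child job runs after its parent's body returns, while a follow-on attached to a job runs only once that job and all of its children have finished. A sketch of the ordering, with hypothetical step_a and step_b job functions:

def parent(job):
    # step_a runs after parent's body returns.
    a = job.addChildJobFn(step_a)
    # step_b runs after step_a, and after any jobs step_a itself creates.
    a.addFollowOnJobFn(step_b)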
From edraizen/molmimic (molmimic/generate_data/get_inferred_structural_interactome.py):
    # Snippet from within a job function; work_dir, mol_sfam_id, table,
    # sfamFileStoreIDs, and tableInfStoreID come from the enclosing scope.
    sfamFileStoreID = sfamFileStoreIDs[mol_sfam_id]
    obsFilePath = get_file(job, "{}_obs.h5".format(int(mol_sfam_id)),
        sfamFileStoreID, work_dir=work_dir)

    observed_interactome = pd.read_hdf(obsFilePath, "table")
    RealtimeLogger.info("Obs has {} rows".format(observed_interactome.shape))


    tableInfPath = get_file(job, "IBIS_inferred_{}.h5".format(table), tableInfStoreID)
    try:
        inf_int_ids = filter_hdf_chunks(tableInfPath, "Intrac{}".format(table), chunksize=100,
            nbr_superfam_id=mol_sfam_id)
    except (RuntimeError, TypeError):
        job.log("Unable to find sfam {} in table {}, Skipping".format(mol_sfam_id, table))
        return

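RealtimeLogger, used above and again in the next example, is Toil's channel for streaming messages from workers back to the leader while the workflow runs, rather than waiting for a job's own log to be collected when the job finishes. A minimal sketch:

from toil.realtimeLogger import RealtimeLogger

def report_progress(job, n_rows):
    # Shipped to the leader's log in (near) real time.
    RealtimeLogger.info("processed {} rows".format(n_rows))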
From edraizen/molmimic (molmimic/generate_data/calculate_features.py):
    # Snippet from within a job function; the branch that handles a raw PDB
    # path is not shown. Here pdb_or_key is a key of the form
    # "<pdb>_<chain>_sdi<N>_d<M>".
    assert pdb_or_key.count("_") == 3
    key = os.path.splitext(pdb_or_key)[0]
    pdb, chain, sdi, domNo = os.path.basename(key).split("_")
    sdi, domNo = sdi[3:], domNo[1:]

    try:
        pdb_path = os.path.join(work_dir, os.path.basename(key)+".pdb")
        in_store.read_input_file(key+".pdb", pdb_path)

        s = ProteinFeaturizer(pdb_path, pdb, chain, sdi=sdi, domNo=domNo,
            work_dir=work_dir, job=job)

        _, atom_features = s.calculate_flat_features()
        RealtimeLogger.info("Finished atom features")
        _, residue_features = s.calculate_flat_features(course_grained=True)
        RealtimeLogger.info("Finished residue features")
        graph_features = s.calculate_graph()
        RealtimeLogger.info("Finished edge features")

        out_store.write_output_file(atom_features, key+"_atom.npy")
        out_store.write_output_file(residue_features, key+"_residue.npy")
        out_store.write_output_file(graph_features, key+"_edges.gz")

        # Remove local copies once they have been uploaded to the store.
        for f in (pdb_path, atom_features, residue_features, graph_features):
            try:
                os.remove(f)
            except OSError:
                pass
    except (SystemExit, KeyboardInterrupt):
        raise
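Note that in_store and out_store above are molmimic's own IOStore wrapper around object storage, not part of Toil. Toil's built-in equivalent is the per-job file store; a sketch of the same read-compute-write pattern with it (featurize and do_work are hypothetical):

def featurize(job, input_file_id):
    # Copy a file from the job store into this job's local temp directory.
    local_path = job.fileStore.readGlobalFile(input_file_id)
    result_path = do_work(local_path)
    # Store the result globally and return its ID for downstream jobs.
    return job.fileStore.writeGlobalFile(result_path)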
From BD2KGenomics/toil-scripts (rna-seq-pipeline/rna-seq_pipeline.py):
    # Snippet from within an upload step; s3_dir, work_dir, ids, and uuid
    # come from the enclosing job function.
    bucket_name = s3_dir.split('/')[0]
    bucket_dir = '/'.join(s3_dir.split('/')[1:])
    # I/O
    uuid_tar = return_input_paths(job, work_dir, ids, 'uuid.tar.gz')
    # Upload to S3
    conn = boto.connect_s3()
    bucket = conn.get_bucket(bucket_name)
    k = Key(bucket)
    k.key = os.path.join(bucket_dir, uuid + '.tar.gz')
    k.set_contents_from_filename(uuid_tar)


if __name__ == "__main__":
    # Define Parser object and add to toil
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()

    # Store input_URLs for downloading
    inputs = {'config': args.config,
              'unc.bed': args.unc,
              'hg19.transcripts.fa': args.fasta,
              'composite_exons.bed': args.composite_exons,
              'normalize.pl': args.normalize,
              'output_dir': args.output_dir,
              'rsem_ref.zip': args.rsem_ref,
              'chromosomes.zip': args.chromosomes,
              'ebwt.zip': args.ebwt,
              'ssec': args.ssec,
              's3_dir': args.s3_dir,
              'uuid': None,
              'samples.zip': None}
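Where the molmimic examples build a parser with Job.Runner.getDefaultArgumentParser(), this pipeline uses the other common pattern: Job.Runner.addToilOptions bolts Toil's arguments (job store, restart, logging and so on) onto an argparse parser you already own. A sketch, with a hypothetical --config flag; the exact options added vary by Toil version:

import argparse
from toil.job import Job

parser = argparse.ArgumentParser(description='my pipeline')
parser.add_argument('--config', required=True, help='path to a config file')
Job.Runner.addToilOptions(parser)
args = parser.parse_args()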
From BD2KGenomics/toil-scripts (src/toil_scripts/defuse_pipeline/defuse_pipeline.py):
    parser_run.add_argument('--manifest', default='manifest-toil-defuse.tsv', type=str,
                            help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                                 '\nDefault value: "%(default)s"')
    parser_run.add_argument('--fq', default=None, type=str,
                            help='URL for the sample BAM. URLs can take the form: http://, file://, s3://, '
                                 'and gnos://. The UUID for the sample must be given with the "--uuid" flag.')
    parser_run.add_argument('--uuid', default=None, type=str, help='Provide the UUID of a sample when using the '
                                                                   '"--bam" option')

    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    writeToDebug('Debug log')

    Job.Runner.addToilOptions(parser)

    args = parser.parse_args()

    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-defuse.yaml'), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-defuse.tsv'), generate_manifest)
    if 'generate' in args.command:
        sys.exit()
    if args.command == 'run':
        # Read in config yaml file and set the default value to None
        config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()}
        check_for_required_parameters(config)

        # Program checks
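The config line above is Python 2 era code (a bare yaml.load plus dict.iteritems). A Python 3 sketch of the same idea, using the safe loader:

import yaml

with open(args.config) as fh:
    config = {k.replace('-', '_'): v for k, v in yaml.safe_load(fh).items()}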
From BD2KGenomics/toil-scripts (src/toil_scripts/exome_variant_pipeline/exome_variant_pipeline.py):
    parser_run.add_argument('--manifest', default='manifest-toil-exome.tsv', type=str,
                            help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                                 '\nDefault value: "%(default)s"')
    parser_run.add_argument('--normal', default=None, type=str,
                            help='URL for the normal BAM. URLs can take the form: http://, ftp://, file://, s3://, '
                                 'and gnos://. The UUID for the sample must be given with the "--uuid" flag.')
    parser_run.add_argument('--tumor', default=None, type=str,
                            help='URL for the tumor BAM. URLs can take the form: http://, ftp://, file://, s3://, '
                                 'and gnos://. The UUID for the sample must be given with the "--uuid" flag.')
    parser_run.add_argument('--uuid', default=None, type=str, help='Provide the UUID of a sample when using the '
                                                                   '"--tumor" and "--normal" options')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-exome.yaml'), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-exome.tsv'), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run '
                                             '"toil-rnaseq generate-config"'.format(args.config))
        if args.normal or args.tumor or args.uuid:
            require(args.normal and args.tumor and args.uuid, '"--tumor", "--normal" and "--uuid" must all be supplied')
            samples = [[args.uuid, args.normal, args.tumor]]
        else:
            samples = parse_manifest(args.manifest)
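Once samples is assembled, pipelines in this style typically hand it to a root job and start or resume the workflow. A sketch of that final step; run_pipeline is a hypothetical root job function:

from toil.common import Toil
from toil.job import Job

with Toil(args) as toil:
    if not toil.options.restart:
        toil.start(Job.wrapJobFn(run_pipeline, samples))
    else:
        toil.restart()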
From BD2KGenomics/toil-scripts (src/toil_scripts/rnaseq_cgl/rnaseq_cgl_pipeline.py):
    group.add_argument('--manifest', default='manifest-toil-rnaseq.tsv', type=str,
                       help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                            '\nDefault value: "%(default)s"')
    group.add_argument('--samples', default=None, nargs='+', type=str,
                       help='Space delimited sample URLs (any number). Samples must be tarfiles/tarballs that contain '
                            'fastq files. URLs follow the format: http://foo.com/sample.tar, '
                            'file:///full/path/to/file.tar. The UUID for the sample will be derived from the file. '
                            'Samples passed in this way will be assumed to be paired-end; if using single-end data, '
                            'please use the manifest option.')
    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()
    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'config-toil-rnaseq.yaml'), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, 'manifest-toil-rnaseq.tsv'), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run '
                                             '"toil-rnaseq generate-config"'.format(args.config))
        if not args.samples:
            require(os.path.exists(args.manifest), '{} not found and no samples provided. Please '
                                                   'run "toil-rnaseq generate-manifest"'.format(args.manifest))
            samples = parse_samples(path_to_manifest=args.manifest)
        else:
From edraizen/molmimic (generate_data/prepare_protein.py):
    # Snippet from the end of a job function; j2 is a job created earlier.
    j2.addFollowOnJobFn(create_data_loader, dataset_name, cdd)
    j2.addFollowOnJobFn(convert_pdb_to_mmtf, dataset_name, cdd)


if __name__ == "__main__":
    from toil.common import Toil
    from toil.job import Job

    parser = Job.Runner.getDefaultArgumentParser()
    options = parser.parse_args()
    options.logLevel = "DEBUG"
    options.clean = "always"
    dataset_name = options.jobStore.split(":")[-1]

    job = Job.wrapJobFn(start_toil, dataset_name)
    with Toil(options) as toil:
        toil.start(job)
From edraizen/molmimic (molmimic/generate_data/main.py):
action="store_true",
        default=False)
    options = parser.parse_args()
    options.logLevel = "DEBUG"
    #options.clean = "always"
    options.targetTime = 1

    if options.cathcode is not None:
        options.cathcode = [c.split(".") for c in options.cathcode]

    sfam_file = os.path.abspath("cath.h5")
    if not os.path.isfile(sfam_file):
        store = IOStore.get("aws:us-east-1:molmimic-cath")
        store.read_input_file("cath-domain-description-file-small.h5", sfam_file)

    with Toil(options) as workflow:
        if not workflow.options.restart:
            cathFileStoreID = workflow.importFile("file://" + os.path.abspath(sfam_file))
            job = Job.wrapJobFn(start_toil, cathFileStoreID, cathcode=options.cathcode,
                update_features=options.features, force=options.force)
            workflow.start(job)
        else:
            workflow.restart()
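The importFile/restart split in this last example is the standard way to make a workflow resumable: inputs are imported into the job store exactly once, and rerunning with --restart resumes from the existing store instead. The matching call for getting results back out is exportFile; a sketch, with a hypothetical process job function and paths:

with Toil(options) as workflow:
    if not workflow.options.restart:
        input_id = workflow.importFile("file:///data/input.h5")
        output_id = workflow.start(Job.wrapJobFn(process, input_id))
    else:
        output_id = workflow.restart()
    workflow.exportFile(output_id, "file:///data/output.h5")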