How to use the hail.read_matrix_table function in hail

To help you get started, we’ve selected a few hail examples based on popular ways it is used in public projects.

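At its simplest, hl.read_matrix_table loads a MatrixTable that was previously written with MatrixTable.write. A minimal sketch, assuming a hypothetical dataset path:

import hail as hl

hl.init()

# Reading is lazy: this returns a MatrixTable handle without scanning the data.
mt = hl.read_matrix_table('gs://my-bucket/dataset.mt')  # hypothetical path

# Inspect the schema and dimensions before running heavier queries.
mt.describe()
print(mt.count())  # (n_rows, n_cols)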

github hail-is / hail / hail / python / cluster-tests / cluster-vep-check.py
import hail as hl

GOLD_STD = 'gs://hail-common/vep/vep/vep_examplars/vep_no_csq_4dc19bc1b.mt/'
GOLD_STD_CSQ = 'gs://hail-common/vep/vep/vep_examplars/vep_csq_4dc19bc1b.mt/'

for path, csq in [(GOLD_STD, False), (GOLD_STD_CSQ, True)]:
    print(f"Checking 'hl.vep' replicates on '{path}'")
    expected = hl.read_matrix_table(path)
    actual = hl.vep(expected.rows().select(), 'gs://hail-common/vep/vep/vep85-loftee-gcloud-testing.json', csq=csq)
    actual._force_count()
    # vep_result_agrees = actual._same(expected)
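For context, expected.rows().select() above keeps only the row key of the table, which is the minimal input hl.vep needs. A sketch of that key-only projection, with a hypothetical path:

import hail as hl

mt = hl.read_matrix_table('gs://my-bucket/dataset.mt')  # hypothetical path

# rows() yields the row Table; select() with no arguments drops all
# non-key fields, leaving only the key (typically locus and alleles).
sites = mt.rows().select()
sites.describe()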
github macarthur-lab / gnomad_hail / resources / basics.py
    :param bool full_meta: Whether to add all metadata (warning: large)
    :param str fam_version: Version of metadata (default to current)
    :param str fam_root: Where to put the pedigree information. Set to None if no pedigree information is desired.
    :param str duplicate_mapping_root: Where to put the duplicate genome/exome samples ID mapping (default is None -- do not annotate)
    :param bool release_samples: When set, filters the data to release samples only
    :param str release_annotations: One of the RELEASES to add variant annotations (into va), or None for no data
    :return: gnomAD hardcalls dataset with chosen annotations
    :rtype: MatrixTable
    """
    from gnomad_hail.utils import filter_to_adj

    if raw and split:
        raise DataException('No split raw data. Use of hardcalls is recommended.')

    if non_refs_only:
        mt = hl.read_matrix_table(get_gnomad_data_path(data_type, split=split, non_refs_only=non_refs_only, hail_version=hail_version))
    else:
        mt = hl.read_matrix_table(get_gnomad_data_path(data_type, hardcalls=not raw, split=split, hail_version=hail_version))

    if adj:
        mt = filter_to_adj(mt)

    if meta_root:
        meta_ht = get_gnomad_meta(data_type, meta_version, full_meta=full_meta)
        mt = mt.annotate_cols(**{meta_root: meta_ht[mt.s]})

    if duplicate_mapping_root:
        dup_ht = hl.import_table(genomes_exomes_duplicate_ids_tsv_path, impute=True,
                                 key='exome_id' if data_type == "exomes" else 'genome_id')
        mt = mt.annotate_cols(**{duplicate_mapping_root: dup_ht[mt.s]})

    if fam_root:
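The pattern above, indexing a keyed Table into annotate_cols, joins per-sample metadata onto the MatrixTable's columns. A minimal standalone sketch, with hypothetical paths and field names:

import hail as hl

mt = hl.read_matrix_table('gs://my-bucket/dataset.mt')  # hypothetical path
meta_ht = hl.import_table('gs://my-bucket/meta.tsv',    # hypothetical path
                          impute=True, key='s')

# Join each column (sample) to its metadata row, keyed by sample ID.
mt = mt.annotate_cols(meta=meta_ht[mt.s])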
github hail-is / hail / benchmark / python / benchmark_hail / run / matrix_table_benchmarks.py
def matrix_table_take_row(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.info.AF.take(100)
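Expression.take(n) collects the first n values of a field to the driver, which makes it a cheap way to touch a dataset in a benchmark. A sketch on a simulated dataset:

import hail as hl

# balding_nichols_model simulates a small genotype MatrixTable.
mt = hl.balding_nichols_model(n_populations=1, n_samples=10, n_variants=200)
print(mt.locus.take(5))  # first 5 row key values, collected locally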
github hail-is / hail / benchmark / python / benchmark_hail / run / shuffle_benchmarks.py
def shuffle_key_rows_by_mt(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt = mt.annotate_rows(reversed_position_locus=hl.struct(
        contig=mt.locus.contig,
        position=-mt.locus.position))
    mt = mt.key_rows_by(mt.reversed_position_locus)
    mt._force_count_rows()
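Replacing the row key with a computed field, as above, reorders the rows and therefore triggers a shuffle, which is what this benchmark measures (_force_count_rows is an internal way to force evaluation). A public-API sketch of the same operation on a simulated dataset:

import hail as hl

mt = hl.balding_nichols_model(n_populations=1, n_samples=10, n_variants=100)

# Re-keying rows by a computed field forces a shuffle of the row data.
mt = mt.annotate_rows(neg_pos=-mt.locus.position)
mt = mt.key_rows_by('neg_pos')
print(mt.count_rows())  # forces evaluation through the public API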
github hail-is / hail / hail / python / hail / experimental / vcf_combiner.py
        return
    tmp_path += f'{uuid.uuid4()}/'

    ht = hl.read_matrix_table(out_paths[0]).rows()
    intervals = calculate_new_intervals(ht, TARGET_RECORDS)

    mts = [hl.read_matrix_table(path, _intervals=intervals) for path in out_paths]
    combined_mts = [combine_gvcfs(mt) for mt in chunks(mts, MAX_COMBINE_NUMBER)]
    i = 0
    while len(combined_mts) > 1:
        tmp = tmp_path + f'{i}/'
        pad = len(str(len(combined_mts)))
        hl.experimental.write_matrix_tables(combined_mts, tmp, overwrite=True)
        paths = [tmp + str(n).zfill(pad) + '.mt' for n in range(len(combined_mts))]

        ht = hl.read_matrix_table(paths[0]).rows()
        intervals = calculate_new_intervals(ht, TARGET_RECORDS)

        mts = [hl.read_matrix_table(path, _intervals=intervals) for path in paths]
        combined_mts = [combine_gvcfs(mt) for mt in chunks(mts, MAX_COMBINE_NUMBER)]
        i += 1
    combined_mts[0].write(out_file, overwrite=overwrite)
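Two details worth noting above: the _intervals argument is an internal, underscore-prefixed parameter that repartitions the MatrixTable on read to the given intervals, and the while loop merges GVCFs hierarchically in chunks of MAX_COMBINE_NUMBER so that no single combine step grows too large. A minimal sketch of repartitioning on read, with a hypothetical path; since _intervals is internal, it may change between Hail versions:

import hail as hl

intervals = [hl.Interval(hl.Locus('1', 1), hl.Locus('1', 1_000_000))]
mt = hl.read_matrix_table('gs://my-bucket/dataset.mt',  # hypothetical path
                          _intervals=intervals)
print(mt.n_partitions())  # one partition per interval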
github macarthur-lab / hail-elasticsearch-pipelines / hail_scripts / v02 / utils / hail_utils.py
def read_mt(mt_path: str):
    """Read MT from the given path."""
    logger.info(f"\n==> read in: {mt_path}")

    return hl.read_matrix_table(mt_path)
github macarthur-lab / tx_annotation / tx_annotation.py
def read_tx_annotation_tables(mt_path, gtex_tx_summary_path, mt_type="mt"):
    if mt_type == "mt":
        mt = hl.read_matrix_table(mt_path)

    elif mt_type == "ht":
        mt = hl.read_table(mt_path)
        mt = hl.MatrixTable.from_rows_table(mt)

    gtex = hl.read_table(gtex_tx_summary_path)
    return mt, gtex
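MatrixTable.from_rows_table converts a keyed Table into a MatrixTable with no columns, which lets downstream code that expects a MatrixTable work on sites-only data. A minimal sketch with a hypothetical path:

import hail as hl

ht = hl.read_table('gs://my-bucket/sites.ht')  # hypothetical path
mt = hl.MatrixTable.from_rows_table(ht)        # zero-column MatrixTable
print(mt.count())  # (n_rows, 0)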
github hail-is / hail / hail / python / scripts / drive_combiner.py
    sample_names, paths = [list(x) for x in zip(*samples)]
    sample_names = [[n] for n in sample_names]
    assert len(paths) == len(samples)
    out_paths = stage_one(paths, sample_names, tmp_path, intervals, header, out_file)
    if not out_paths:
        return
    tmp_path += f'{uuid.uuid4()}/'
    mts = [hl.read_matrix_table(path) for path in out_paths]
    combined_mts = [comb.combine_gvcfs(mt) for mt in chunks(mts, MAX_COMBINE_NUMBER)]
    i = 0
    while len(combined_mts) > 1:
        tmp = tmp_path + f'{i}/'
        pad = len(str(len(combined_mts)))
        hl.experimental.write_matrix_tables(combined_mts, tmp, overwrite=True)
        paths = [tmp + str(n).zfill(pad) + '.mt' for n in range(len(combined_mts))]
        mts = [hl.read_matrix_table(path) for path in paths]
        combined_mts = [comb.combine_gvcfs(mt) for mt in chunks(mts, MAX_COMBINE_NUMBER)]
        i += 1
    combined_mts[0].write(out_file, overwrite=overwrite)
github hail-is / hail / python / hail / methods / statgen.py
    else:
        field = Env.get_uid()
        mt = mt.select_entries(**{field: call_expr})
    mt = mt.select_rows().select_cols()
    mt = mt.distinct_by_row()
    locally_pruned_table_path = new_temp_file()
    (_local_ld_prune(require_biallelic(mt, 'ld_prune'), field, r2, bp_window_size, memory_per_core)
        .add_index()
        .write(locally_pruned_table_path, overwrite=True))
    locally_pruned_table = hl.read_table(locally_pruned_table_path)

    locally_pruned_ds_path = new_temp_file()
    mt = mt.annotate_rows(info=locally_pruned_table[mt.row_key])
    (mt.filter_rows(hl.is_defined(mt.info))
        .write(locally_pruned_ds_path, overwrite=True))
    locally_pruned_ds = hl.read_matrix_table(locally_pruned_ds_path)

    n_locally_pruned_variants = locally_pruned_ds.count_rows()
    info(f'ld_prune: local pruning stage retained {n_locally_pruned_variants} variants')

    standardized_mean_imputed_gt_expr = hl.or_else(
        (locally_pruned_ds[field].n_alt_alleles() - locally_pruned_ds.info.mean) * locally_pruned_ds.info.centered_length_rec,
        0.0)

    std_gt_bm = BlockMatrix.from_entry_expr(standardized_mean_imputed_gt_expr, block_size=block_size)
    r2_bm = (std_gt_bm @ std_gt_bm.T) ** 2

    _, stops = hl.linalg.utils.locus_windows(locally_pruned_table.locus, bp_window_size)

    entries = r2_bm.sparsify_row_intervals(range(stops.size), stops, blocks_only=True).entries()
    entries = entries.filter((entries.entry >= r2) & (entries.i < entries.j))
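The second stage above computes pairwise r² with a BlockMatrix product and then sparsifies to a banded window before pulling entries out as a Table. A standalone sketch of that primitive on a simulated dataset:

import hail as hl
from hail.linalg import BlockMatrix

mt = hl.balding_nichols_model(n_populations=1, n_samples=50, n_variants=100)

# Build a variants-by-samples block matrix from an entry expression.
bm = BlockMatrix.from_entry_expr(hl.float64(mt.GT.n_alt_alleles()), block_size=32)

# Pairwise row products via matrix multiply, then entries() to a Table
# with fields i, j, and entry.
prod = bm @ bm.T
prod.entries().show(5)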