How to use the hail.agg.count_where function in hail

To help you get started, we’ve selected a few hail examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github macarthur-lab / gnomad_hail / utils.py View on Github external
def get_projectmax(mt: hl.MatrixTable, loc: hl.expr.StringExpression) -> hl.MatrixTable:
    """
    First pass of projectmax (returns aggregated MT with project_max field)

    :param MatrixTable mt: Input MT
    :param StringExpression loc: Column expression location of project ID (e.g. mt.meta.pid)
    :return: Frequency data with annotated project_max
    :rtype: MatrixTable
    """
    # TODO: add hom count
    mt = mt.annotate_cols(project=loc)
    agg_mt = mt.group_cols_by(mt.project).aggregate(ac=hl.agg.sum(mt.GT.n_alt_alleles()),
                                                    an=2 * hl.agg.count_where(hl.is_defined(mt.GT)),
                                                    hom=hl.agg.count_where(mt.GT.is_hom_var()))
    agg_mt = agg_mt.annotate_entries(af=agg_mt.ac / agg_mt.an)
    return agg_mt.annotate_rows(project_max=hl.agg.take(hl.struct(project=agg_mt.project, ac=agg_mt.ac,
                                                                  af=agg_mt.af, an=agg_mt.an, hom=agg_mt.hom),
                                                        5, -agg_mt.af))
github macarthur-lab / gnomad_hail / gnomad / variant_qc / pipeline.py View on Github external
n_de_novo_singleton=hl.agg.filter(
            ht.ac_raw == 1, hl.agg.sum(fam.n_de_novos_raw)
        ),
        n_de_novos_adj=hl.agg.sum(fam.n_de_novos_adj),
        n_de_novo=hl.agg.sum(fam.n_de_novos_raw),
        n_trans_singletons=hl.agg.filter(
            ht.ac_raw == 2, hl.agg.sum(fam.n_transmitted_raw)
        ),
        n_untrans_singletons=hl.agg.filter(
            (ht.ac_raw < 3) & (ht.ac_qc_samples_unrelated_raw == 1),
            hl.agg.sum(fam.n_untransmitted_raw),
        ),
        n_train_trans_singletons=hl.agg.filter(
            (ht.ac_raw == 2) & ht.positive_train_site, hl.agg.sum(fam.n_transmitted_raw)
        ),
        n_omni=hl.agg.count_where(truth_data.omni),
        n_mills=hl.agg.count_where(truth_data.mills),
        n_hapmap=hl.agg.count_where(truth_data.hapmap),
        n_kgp_phase1_hc=hl.agg.count_where(truth_data.kgp_phase1_hc),
    )
github hail-is / hail / benchmark / python / benchmark_hail / run / matrix_table_benchmarks.py View on Github external
def many_aggs(mt):
    aggs = [
        hl.agg.count_where(mt.GT.is_hom_ref()),
        hl.agg.count_where(mt.GT.is_het()),
        hl.agg.count_where(mt.GT.is_hom_var()),
        hl.agg.count_where(mt.GT.is_non_ref()),
        hl.agg.count_where(mt.GT.n_alt_alleles() == 2),
        hl.agg.count_where(mt.GT.phased),
        hl.agg.count_where(mt.GT.is_haploid()),
        hl.agg.count_where(mt.GT.is_diploid()),
        hl.agg.count_where(mt.GT.ploidy == 2),
        hl.agg.fraction(mt.AD[0] > 0),
        hl.agg.fraction(mt.AD[0] < 0),
        hl.agg.fraction(mt.AD.length() < 0),
        hl.agg.fraction(mt.AD.length() > 0),
        hl.agg.fraction(mt.PL[0] > 0),
        hl.agg.fraction(mt.PL[0] < 0),
        hl.agg.fraction(mt.PL.length() < 0),
        hl.agg.fraction(mt.PL.length() > 0),
        hl.agg.fraction(mt.GQ < 0),
        hl.agg.fraction(mt.GQ > 0),
github macarthur-lab / gnomad_hail / gnomad / variant_qc / pipeline.py View on Github external
ht.ac_raw == 1, hl.agg.sum(fam.n_de_novos_raw)
        ),
        n_de_novos_adj=hl.agg.sum(fam.n_de_novos_adj),
        n_de_novo=hl.agg.sum(fam.n_de_novos_raw),
        n_trans_singletons=hl.agg.filter(
            ht.ac_raw == 2, hl.agg.sum(fam.n_transmitted_raw)
        ),
        n_untrans_singletons=hl.agg.filter(
            (ht.ac_raw < 3) & (ht.ac_qc_samples_unrelated_raw == 1),
            hl.agg.sum(fam.n_untransmitted_raw),
        ),
        n_train_trans_singletons=hl.agg.filter(
            (ht.ac_raw == 2) & ht.positive_train_site, hl.agg.sum(fam.n_transmitted_raw)
        ),
        n_omni=hl.agg.count_where(truth_data.omni),
        n_mills=hl.agg.count_where(truth_data.mills),
        n_hapmap=hl.agg.count_where(truth_data.hapmap),
        n_kgp_phase1_hc=hl.agg.count_where(truth_data.kgp_phase1_hc),
    )
github hail-is / hail / benchmark / python / benchmark_hail / run / matrix_table_benchmarks.py View on Github external
def many_aggs(mt):
    aggs = [
        hl.agg.count_where(mt.GT.is_hom_ref()),
        hl.agg.count_where(mt.GT.is_het()),
        hl.agg.count_where(mt.GT.is_hom_var()),
        hl.agg.count_where(mt.GT.is_non_ref()),
        hl.agg.count_where(mt.GT.n_alt_alleles() == 2),
        hl.agg.count_where(mt.GT.phased),
        hl.agg.count_where(mt.GT.is_haploid()),
        hl.agg.count_where(mt.GT.is_diploid()),
        hl.agg.count_where(mt.GT.ploidy == 2),
        hl.agg.fraction(mt.AD[0] > 0),
        hl.agg.fraction(mt.AD[0] < 0),
        hl.agg.fraction(mt.AD.length() < 0),
        hl.agg.fraction(mt.AD.length() > 0),
        hl.agg.fraction(mt.PL[0] > 0),
        hl.agg.fraction(mt.PL[0] < 0),
        hl.agg.fraction(mt.PL.length() < 0),
        hl.agg.fraction(mt.PL.length() > 0),
        hl.agg.fraction(mt.GQ < 0),
        hl.agg.fraction(mt.GQ > 0),
        hl.agg.fraction(mt.GQ % 2 == 0),
        hl.agg.fraction(mt.GQ % 2 != 0),
        hl.agg.fraction(mt.GQ / 5 < 10),
github hail-is / hail / benchmark / python / benchmark_hail / run / matrix_table_benchmarks.py View on Github external
def many_aggs(mt):
    aggs = [
        hl.agg.count_where(mt.GT.is_hom_ref()),
        hl.agg.count_where(mt.GT.is_het()),
        hl.agg.count_where(mt.GT.is_hom_var()),
        hl.agg.count_where(mt.GT.is_non_ref()),
        hl.agg.count_where(mt.GT.n_alt_alleles() == 2),
        hl.agg.count_where(mt.GT.phased),
        hl.agg.count_where(mt.GT.is_haploid()),
        hl.agg.count_where(mt.GT.is_diploid()),
        hl.agg.count_where(mt.GT.ploidy == 2),
        hl.agg.fraction(mt.AD[0] > 0),
        hl.agg.fraction(mt.AD[0] < 0),
        hl.agg.fraction(mt.AD.length() < 0),
        hl.agg.fraction(mt.AD.length() > 0),
        hl.agg.fraction(mt.PL[0] > 0),
        hl.agg.fraction(mt.PL[0] < 0),
        hl.agg.fraction(mt.PL.length() < 0),
        hl.agg.fraction(mt.PL.length() > 0),
        hl.agg.fraction(mt.GQ < 0),
        hl.agg.fraction(mt.GQ > 0),
        hl.agg.fraction(mt.GQ % 2 == 0),
        hl.agg.fraction(mt.GQ % 2 != 0),
        hl.agg.fraction(mt.GQ / 5 < 10),
        hl.agg.fraction(mt.GQ / 5 <= 10),
        hl.agg.fraction(mt.GQ / 5 > 10),
        hl.agg.fraction(mt.GQ / 5 >= 10),
github macarthur-lab / gnomad_hail / gnomad / utils / liftover.py View on Github external
def check_mismatch(ht: hl.Table) -> hl.expr.expressions.StructExpression:
    """
    Checks for mismatches between reference allele and allele in reference fasta

    :param ht: Table to be checked
    :return: StructExpression containing counts for mismatches and count for all variants on negative strand
    """

    mismatch = ht.aggregate(hl.struct(total_variants=hl.agg.count(),
                                total_mismatch=hl.agg.count_where(ht.reference_mismatch),
                                negative_strand=hl.agg.count_where(ht.new_locus.is_negative_strand),
                                negative_strand_mismatch=hl.agg.count_where(ht.new_locus.is_negative_strand & ht.reference_mismatch)
                                )
                            )
    return mismatch
github macarthur-lab / gnomad_hail / gnomad / utils / liftover.py View on Github external
def check_mismatch(ht: hl.Table) -> hl.expr.expressions.StructExpression:
    """
    Checks for mismatches between reference allele and allele in reference fasta

    :param ht: Table to be checked
    :return: StructExpression containing counts for mismatches and count for all variants on negative strand
    """

    mismatch = ht.aggregate(hl.struct(total_variants=hl.agg.count(),
                                total_mismatch=hl.agg.count_where(ht.reference_mismatch),
                                negative_strand=hl.agg.count_where(ht.new_locus.is_negative_strand),
                                negative_strand_mismatch=hl.agg.count_where(ht.new_locus.is_negative_strand & ht.reference_mismatch)
                                )
                            )
    return mismatch
github macarthur-lab / gnomad_hail / gnomad / utils / liftover.py View on Github external
def check_mismatch(ht: hl.Table) -> hl.expr.expressions.StructExpression:
    """
    Checks for mismatches between reference allele and allele in reference fasta

    :param ht: Table to be checked
    :return: StructExpression containing counts for mismatches and count for all variants on negative strand
    """

    mismatch = ht.aggregate(hl.struct(total_variants=hl.agg.count(),
                                total_mismatch=hl.agg.count_where(ht.reference_mismatch),
                                negative_strand=hl.agg.count_where(ht.new_locus.is_negative_strand),
                                negative_strand_mismatch=hl.agg.count_where(ht.new_locus.is_negative_strand & ht.reference_mismatch)
                                )
                            )
    return mismatch
github macarthur-lab / gnomad_hail / gnomad / variant_qc / pipeline.py View on Github external
grch37_resources.reference_data.get_truth_ht()
        if build == "GRCh37"
        else grch38_resources.reference_data.get_truth_ht()
    )[ht.key]
    fam = fam_stats_ht[ht.key]

    return dict(
        min_score=hl.agg.min(ht.score),
        max_score=hl.agg.max(ht.score),
        n=hl.agg.count(),
        n_ins=hl.agg.count_where(hl.is_insertion(ht.alleles[0], ht.alleles[1])),
        n_del=hl.agg.count_where(hl.is_deletion(ht.alleles[0], ht.alleles[1])),
        n_ti=hl.agg.count_where(hl.is_transition(ht.alleles[0], ht.alleles[1])),
        n_tv=hl.agg.count_where(hl.is_transversion(ht.alleles[0], ht.alleles[1])),
        n_1bp_indel=hl.agg.count_where(indel_length == 1),
        n_mod3bp_indel=hl.agg.count_where((indel_length % 3) == 0),
        n_singleton=hl.agg.count_where(ht.singleton),
        fail_hard_filters=hl.agg.count_where(
            (ht.info.QD < 2) | (ht.info.FS > 60) | (ht.info.MQ < 30)
        ),
        n_pos_train=hl.agg.count_where(ht.positive_train_site),
        n_neg_train=hl.agg.count_where(ht.negative_train_site),
        n_clinvar=hl.agg.count_where(hl.is_defined(clinvar)),
        n_de_novos_singleton_adj=hl.agg.filter(
            ht.ac == 1, hl.agg.sum(fam.n_de_novos_adj)
        ),
        n_de_novo_singleton=hl.agg.filter(
            ht.ac_raw == 1, hl.agg.sum(fam.n_de_novos_raw)
        ),
        n_de_novos_adj=hl.agg.sum(fam.n_de_novos_adj),
        n_de_novo=hl.agg.sum(fam.n_de_novos_raw),
        n_trans_singletons=hl.agg.filter(