"""Estimate reasonable bin sizes from BAM read counts or depths."""
from __future__ import absolute_import, division, print_function
import logging
import os
import tempfile
import numpy as np
import pandas as pd
from skgenome import tabio, GenomicArray as GA
from . import coverage, samutil
from .antitarget import compare_chrom_names
from .descriptives import weighted_median
[docs]def midsize_file(fnames):
"""Select the median-size file from several given filenames.
If an even number of files is given, selects the file just below the median.
"""
return sorted(fnames, key=lambda f: os.stat(f).st_size
)[len(fnames) // 2 - 1]
[docs]def do_autobin(bam_fname, method, targets=None, access=None,
bp_per_bin=100000., target_min_size=20, target_max_size=50000,
antitarget_min_size=500, antitarget_max_size=1000000):
"""Quickly calculate reasonable bin sizes from BAM read counts.
Parameters
----------
bam_fname : string
BAM filename.
method : string
One of: 'wgs' (whole-genome sequencing), 'amplicon' (targeted amplicon
capture), 'hybrid' (hybridization capture).
targets : GenomicArray
Targeted genomic regions (for 'hybrid' and 'amplicon').
access : GenomicArray
Sequencing-accessible regions of the reference genome (for 'hybrid' and
'wgs').
bp_per_bin : int
Desired number of sequencing read nucleotide bases mapped to each bin.
Returns
-------
2-tuple of 2-tuples:
((target depth, target avg. bin size),
(antitarget depth, antitarget avg. bin size))
"""
if method in ('amplicon', 'hybrid'):
if targets is None:
raise ValueError("Target regions are required for method %r "
"but were not provided." % method)
if not len(targets):
raise ValueError("Target regions are required for method %r "
"but were not provided." % method)
# Closes over bp_per_bin
def depth2binsize(depth, min_size, max_size):
if depth:
bin_size = int(round(bp_per_bin / depth))
if bin_size < min_size:
logging.info("Limiting est. bin size %d to given min. %d",
bin_size, min_size)
bin_size = min_size
elif bin_size > max_size:
logging.info("Limiting est. bin size %d to given max. %d",
bin_size, max_size)
bin_size = max_size
return bin_size
samutil.ensure_bam_index(bam_fname)
rc_table = samutil.idxstats(bam_fname, drop_unmapped=True)
read_len = samutil.get_read_length(bam_fname)
logging.info("Estimated read length %s", read_len)
# Dispatch
if method == 'amplicon':
# From BAM index
# rc_table = update_chrom_length(rc_table, targets)
# tgt_depth = average_depth(rc_table, read_len)
# By sampling
tgt_depth = sample_region_cov(bam_fname, targets)
anti_depth = None
elif method == 'hybrid':
tgt_depth, anti_depth = hybrid(rc_table, read_len, bam_fname, targets,
access)
elif method == 'wgs':
if access is not None and len(access):
rc_table = update_chrom_length(rc_table, access)
tgt_depth = average_depth(rc_table, read_len)
anti_depth = None
# Clip bin sizes to specified ranges
tgt_bin_size = depth2binsize(tgt_depth, target_min_size, target_max_size)
anti_bin_size = depth2binsize(anti_depth, antitarget_min_size,
antitarget_max_size)
return ((tgt_depth, tgt_bin_size),
(anti_depth, anti_bin_size))
[docs]def hybrid(rc_table, read_len, bam_fname, targets, access=None):
"""Hybrid capture sequencing."""
# Identify off-target regions
if access is None:
access = idxstats2ga(rc_table, bam_fname)
# Verify BAM chromosome names match those in target BED
compare_chrom_names(access, targets)
antitargets = access.subtract(targets)
# Only examine chromosomes present in all 2-3 input datasets
rc_table, targets, antitargets = shared_chroms(rc_table, targets,
antitargets)
# Deal with targets
target_depth = sample_region_cov(bam_fname, targets)
# Antitargets: subtract captured reads from total
target_length = region_size_by_chrom(targets)['length']
target_reads = (target_length * target_depth / read_len).values
anti_table = update_chrom_length(rc_table, antitargets)
anti_table = anti_table.assign(mapped=anti_table.mapped - target_reads)
anti_depth = average_depth(anti_table, read_len)
return target_depth, anti_depth
# ---
[docs]def average_depth(rc_table, read_length):
"""Estimate the average read depth across the genome.
Returns
-------
float
Median of the per-chromosome mean read depths, weighted by chromosome
size.
"""
mean_depths = read_length * rc_table.mapped / rc_table.length
return weighted_median(mean_depths, rc_table.length)
[docs]def idxstats2ga(table, bam_fname):
return GA(table.assign(start=0, end=table.length)
.loc[:, ('chromosome', 'start', 'end')],
meta_dict={'filename': bam_fname})
[docs]def sample_region_cov(bam_fname, regions, max_num=100):
"""Calculate read depth in a randomly sampled subset of regions."""
midsize_regions = sample_midsize_regions(regions, max_num)
with tempfile.NamedTemporaryFile(suffix='.bed', mode='w+t') as f:
tabio.write(regions.as_dataframe(midsize_regions), f, 'bed4')
f.flush()
table = coverage.bedcov(f.name, bam_fname, 0)
# Mean read depth across all sampled regions
return table.basecount.sum() / (table.end - table.start).sum()
[docs]def sample_midsize_regions(regions, max_num):
"""Randomly select a subset of up to `max_num` regions."""
sizes = regions.end - regions.start
lo_size, hi_size = np.percentile(sizes[sizes > 0], [25, 75])
midsize_regions = regions.data[(sizes >= lo_size) & (sizes <= hi_size)]
if len(midsize_regions) > max_num:
midsize_regions = midsize_regions.sample(max_num, random_state=0xA5EED)
return midsize_regions
[docs]def shared_chroms(*tables):
"""Intersection of DataFrame .chromosome values."""
chroms = tables[0].chromosome.drop_duplicates()
for tab in tables[1:]:
if tab is not None:
new_chroms = tab.chromosome.drop_duplicates()
chroms = chroms[chroms.isin(new_chroms)]
return [None if tab is None else tab[tab.chromosome.isin(chroms)]
for tab in tables]
[docs]def update_chrom_length(rc_table, regions):
if regions is not None and len(regions):
chrom_sizes = region_size_by_chrom(regions)
rc_table = rc_table.merge(chrom_sizes, on='chromosome', how='inner')
rc_table['length'] = rc_table['length_y'] # ?
rc_table = rc_table.drop(['length_x', 'length_y'], axis=1)
return rc_table
[docs]def region_size_by_chrom(regions):
chromgroups = regions.data.groupby('chromosome', sort=False)
# sizes = chromgroups.apply(total_region_size) # XXX
sizes = [total_region_size(g) for _key, g in chromgroups]
return pd.DataFrame({'chromosome': regions.chromosome.drop_duplicates(),
'length': sizes})
[docs]def total_region_size(regions):
"""Aggregate area of all genomic ranges in `regions`."""
return (regions.end - regions.start).sum()