生信分析进阶3 - pysam操作bam文件统计unique reads和mapped reads高级技巧合辑
pysam操作bam文件统计unique reads和mapped reads高级技巧
1. Linux服务器读取bam文件
服务器查看bam常用方法。
# bam_path: bam文件路径
samtools view -h bam_path|grep -v '^@'|less -S
2. samtools + python os库读取bam文件
缺点速度较慢。
import os# 读取bam
# bam_path: bam文件路径
read_lines = os.popen("samtools view {} -h".format(bam_path)).readlines()# 遍历bam文件每行
for index, line in enumerate(read_lines):# 去除末尾\nline = line.strip()# 过滤@开头行if line.startswith('@'):continueelse:# 打印关键信息print(index, '\t', line)
3. 使用pysam对bam进行读取操作
速度快,需要注意:samtools的坐标是1-base,而pysam坐标是0-base。
import pysambam_path = "/path/sample.bam"###### 方法1 ######
# 读取bam文件
bam_file = pysam.AlignmentFile(bam_path, 'rb')
# 读取sam文件,可直接读取
# bam_file = pysam.AlignmentFile(bam_path, 'r')
# 读取cram文件
# bam_file = pysam.AlignmentFile(bam_path, 'rc')for line in bam_file:record = lineprint(record)break# 关闭bam
bam_file.close()###### 方法2 ######
# 无需close关闭
with pysam.AlignmentFile(bam_path, 'rb') as bam_file :for line in bam_file :print(line )break
4. 使用pysam对bam进行写入操作
bam_path = "/path/sample.bam"
bam_out_path = "/path/sample-out.bam"with pysam.AlignmentFile(bam_out_path, 'wb', template=bam_file) as write_bam_file:# 写入读取的record至bam文件末尾write_bam_file.write(record)
5. 使用pysam检查bam index是否存在
# 检查是否存在bam index, 存在返回True, 否则为False
bam_file.check_index()# 与samtools联用, 不存在index则samtools index生成
if not bam_file.check_index():os.system("samtools index {}".format(bam_path))
6. 使用pysam对bam进行排序
bam_out_path = "/path/sample.sorted.bam"
# 对bam进行sort排序
pysam.sort("-o", bam_sort_path, bam_path)
7. 使用pysam提取基因组某个区域的比对结果
region = bam_file.fetch(contig='chr1', start=60000, stop=60200)
print(region)
8. 使用pysam提取基因组某个区域的reads数量
# 返回reads数目值
region_reads_count = bam_file.count(contig='chr1', start=500000, stop=1000000)
print(region_reads_count)
# 56604
9. 使用pysam提取基因组某个区域的每个碱基覆盖深度
# 返回区间每个碱基的的覆盖深度
region_coverage = bam_file.count_coverage(contig='chr5', start=5000000, stop=5000100)
print(region_coverage)
10. 使用pysam获取bam文件中每条染色体的mapped和unmapped reads数量
bam_file.get_index_statistics()
# [IndexStats(contig='chr1', mapped=3115437, unmapped=3233, total=3118670),
# IndexStats(contig='chr2', mapped=2426504, unmapped=2287, total=2428791),
# IndexStats(contig='chr3', mapped=2092895, unmapped=1806, total=2094701),
# IndexStats(contig='chr4', mapped=1189202, unmapped=1058, total=1190260),
# IndexStats(contig='chr5', mapped=1357775, unmapped=1167, total=1358942),
# IndexStats(contig='chr6', mapped=1772859, unmapped=1572, total=1774431),
# IndexStats(contig='chr7', mapped=1484016, unmapped=1528, total=1485544),
# IndexStats(contig='chr8', mapped=1122131, unmapped=1034, total=1123165),
# IndexStats(contig='chr9', mapped=1341301, unmapped=1486, total=1342787),
# IndexStats(contig='chr10', mapped=1046607, unmapped=909, total=1047516),
# IndexStats(contig='chr11', mapped=1731600, unmapped=1797, total=1733397),
# IndexStats(contig='chr12', mapped=1829623, unmapped=1534, total=1831157),
# IndexStats(contig='chr13', mapped=568294, unmapped=420, total=568714),
# IndexStats(contig='chr14', mapped=1013704, unmapped=907, total=1014611),
# IndexStats(contig='chr15', mapped=1270915, unmapped=1089, total=1272004),
# IndexStats(contig='chr16', mapped=1937126, unmapped=2096, total=1939222),
# IndexStats(contig='chr17', mapped=2175991, unmapped=2195, total=2178186),
# IndexStats(contig='chr18', mapped=547370, unmapped=530, total=547900),
# IndexStats(contig='chr19', mapped=1790984, unmapped=2367, total=1793351),
# IndexStats(contig='chr20', mapped=693052, unmapped=775, total=693827),
# IndexStats(contig='chr21', mapped=421361, unmapped=496, total=421857),
# IndexStats(contig='chr22', mapped=967377, unmapped=1140, total=968517),
# IndexStats(contig='chrX', mapped=2187343, unmapped=1680, total=2189023),
# IndexStats(contig='chrY', mapped=35027, unmapped=27, total=35054),
# IndexStats(contig='chrMT', mapped=1334891, unmapped=1494, total=1336385)]
11. 使用pysam获取bam文件每条染色体长度
# 比对上reads数量
bam_file.mapped# 未比对上的reads数量
bam_file.unmapped###################
chrom_names = bam_file.references
chrom_lengths = bam_file.lengths# 打印数据类型
print(type(chrom_names))
# <class 'tuple'># zip压缩2个元组
for chrom, length in zip(chrom_names, chrom_lengths):print(chrom, ':', length)# chr1 : 249250621
# chr2 : 243199373
# chr3 : 198022430
# chr4 : 191154276
# chr5 : 180915260
# chr6 : 171115067
# chr7 : 159138663
# chr8 : 146364022
# chr9 : 141213431
# chr10 : 135534747
# chr11 : 135006516
# chr12 : 133851895
# chr13 : 115169878
# chr14 : 107349540
# chr15 : 102531392
# chr16 : 90354753
# chr17 : 81195210
# chr18 : 78077248
# chr19 : 59128983
# chr20 : 63025520
# chr21 : 48129895
# chr22 : 51304566
# chrX : 155270560
# chrY : 59373566
# chrMT : 16569
12. 使用pysam操作bam文件用法
with pysam.AlignmentFile(bam_path, 'rb') as bam_file:# bam_file对象每一行都是一条readsfor reads in bam_file:# 打印与reads配对的另一条readsline_match = bam_file.mate(reads)print(line_match)# 判断是否是PCR重复序列print(reads.is_duplicate)# 判断是否是成对readsprint(reads.is_paired)# 判断是否是左端readsprint(reads.is_read1)# 判断是否是右端readsprint(reads.is_read2)# 判断是否是质控失败print(reads.is_qcfail)# 判断是否比对到参考基因组print(reads.is_unmapped)# 输出reads的比对质量print(reads.mapping_quality)# 输出reads比对到参考基因组上的起始和结束坐标print(reads.get_blocks())# [(10179, 10206), (10206, 10227), (10228, 10273), (10273, 10297), (10297, 10303), (10303, 10325)]# 输出在指定区域坐标的重叠区域长度print(reads.get_overlap(500000,600000))# 0# 输出reads在比对到参考基因组的上参考序列print(reads.get_reference_sequence())# tAACCCTAACCCTAACCCTAACCCTAACCCTAACCCtAACCCtAACCCTAAcCCCtAACCCtAACCCTAAACCCtAAACCCTAACCCTAACCCTAACCCtAACCCtAACCCcAACCCCAACCCCAaCCCCAACCCcAACCCcAACCbreak
13. 使用pysam计算指定区域的unique mapped的reads数量
# 计算指定区域的unique mapping的reads数目
import pysam#### 输入参数 ####
chr='chr16'
start=1250000
stop=1260000bam_file = pysam.AlignmentFile(bam_path, 'rb')# 定义集合,避免reads重复,其大小就是unique reads的数量
region_reads = set()
count = 0for read in bam_file.fetch(chr, start, stop):region_reads.add(read.query_name)count = count +1# 打印结果
print('{} unique reads in {}:{}-{}'.format(len(region_reads), chr, start, stop))
print('{} alignment in {}:{}-{}'.format(count, chr, start, stop))# 4778 unique reads in chr16:1250000-1260000
# 9514 alignment in chr16:1250000-1260000
14. 使用pysam计算非重叠区间(窗口)的unique mapping的 reads数量
import pysam# 设置统计窗口大小100kb
bin_size = 100000bam_file = pysam.AlignmentFile(bam_path, "rb")
print(bam_file.references)
print(bam_file.lengths)# 将结果写入文件
with open("bam.window.reads.count", 'w') as fwrite:# 写入headerfwrite.write("ref\tstart\tstop\tunique_reads\tmapped_reads\n")# 遍历bam文件中比对到参考基因组上的染色体名称for i in range(len(bam_file.references)):# 获取比对到参考基因组上的染色体名称和长度refname = bam_file.references[i]seqlen = bam_file.lengths[i]# 以bin_size窗口大小将染色体分割为j个窗口mapped_count = 0for j in range(1, seqlen, bin_size):# pysam为0-base, 坐标需-1# 获取stop坐标, j为start坐标if j + bin_size - 1 < bam_file.lengths[i]:stop = j + bin_size - 1else:stop = bam_file.lengths[i]region_reads = set()# 获取unique_reads, mapped_readsfor read in bam_file.fetch(refname, j, stop):region_reads.add(read.query_name)mapped_count += 1# 写入文件fwrite.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(refname, j, stop, len(region_reads), mapped_count))