#! /opt/local/bin python3.4 """ filter_vcf_for_targets.py Toby Spribille, 22 July 2014 This is a script to filter lines out of a .vcf file based on a list of target sequences of interest. The .vcf file should use ONLY the table part - the header should be removed - and should look like this: comp9859_c0_seq1 1373 . A C 30.57 PASS AC=1;AF=0.250;AN=4;BaseQRankSum=-0.922;DP=7;Dels=0.00;FS=0.000;HaplotypeScore=0.0000;MLEAC=1;MLEAF=0.250;MQ=60.00;MQ0=0;MQRankSum=0.198;POSITIVE_TRAIN_SITE;QD=5.10;ReadPosRankSum=-0.922;VQSLOD=5.34;culprit=HaplotypeScore GT:AD:DP:GQ:PL ./. ./. 0/1:4,2:6:57:57,0,135 0/0:1,0:1:3:0,3,39 comp987_c0_seq1 453 . T C 120.17 PASS AC=2;AF=0.250;AN=8;BaseQRankSum=-0.157;DP=22;Dels=0.00;FS=8.704;HaplotypeScore=0.0000;MLEAC=2;MLEAF=0.250;MQ=60.00;MQ0=0;MQRankSum=-0.157;POSITIVE_TRAIN_SITE;QD=8.58;ReadPosRankSum=-0.940;VQSLOD=2.89;culprit=HaplotypeScore GT:AD:DP:GQ:PL 0/1:1,2:3:33:70,0,33 0/0:4,0:4:6:0,6,89 0/1:8,3:11:83:83,0,254 0/0:4,0:4:12:0,12,154 comp987_c1_seq1 1059 . A C 64.45 PASS AC=2;AF=0.250;AN=8;BaseQRankSum=1.449;DP=17;Dels=0.00;FS=0.000;HaplotypeScore=0.0000;MLEAC=2;MLEAF=0.250;MQ=60.00;MQ0=0;MQRankSum=0.441;POSITIVE_TRAIN_SITE;QD=6.45;ReadPosRankSum=1.071;VQSLOD=5.71;culprit=HaplotypeScore GT:AD:DP:GQ:PL 0/1:1,1:2:33:35,0,33 0/0:2,0:2:6:0,6,80 0/1:6,2:8:62:62,0,181 0/0:5,0:5:9:0,9,128 The targets file should look like this: comp9859_c0_seq1 comp987_c1_seq1 It has to run on python 3.4 due to the dependency on the pandas module. Call with /Volumes/Macintosh_HD/usr/local/bin/python3.4 """ infilename = "/Volumes/Macintosh_HD/Users/tobyspribille/Desktop/bryoria_SNP_2015/hard_filter_pass_SNPs_220115_table_only.vcf" infile = open(infilename, 'r') linenumber = 0 infile_ncols = 25 infile_nrows = 1483282 infile2name = "/Volumes/Macintosh_HD/Users/tobyspribille/Desktop/bryoria_SNP_2015/targets_only_all_algae.txt" infile2 = open(infile2name, 'r') infile2_ncols = 1 infile2_nrows = 2046 #outfilename = infilename + ".counts" #OUT = open(outfilename, 'w') vcf_file = [[0 for x in range(infile_ncols)]for x in range(infile_nrows)] for line in infile: if linenumber >= 0: line = line.strip('\n') elementlist = line.split('\t') i = 0 for element in elementlist: vcf_file[linenumber][i] = element i += 1 linenumber = linenumber + 1 import pandas df = pandas.DataFrame(vcf_file, columns=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'fre10', 'fre11', 'fre5', 'fre6', 'fre8', 'fre9', 'index12', 'index13', 'index14', 'index15', 'tor1', 'tor12', 'tor14', 'tor2', 'tor3', 'tor4']) linenumber = 0 transcripts_to_include = [[0 for x in range(infile2_ncols)]for x in range(infile2_nrows)] for line in infile2: if linenumber >= 1: line = line.strip('\n') elementlist = line.split('\t') #print(elementlist) i = 0 for element in elementlist: transcripts_to_include[linenumber][i] = element i += 1 linenumber = linenumber + 1 df2 = pandas.DataFrame(transcripts_to_include, columns=['transcripts']) select_transcripts_only = df.merge(df2, how='inner', left_on='CHROM', right_on='transcripts') select_transcripts_only.to_csv("/Volumes/Macintosh_HD/Users/tobyspribille/Desktop/bryoria_SNP_2015/algal_transcripts_only_180215.vcf", sep='\t', na_rep='', float_format=None, cols=None, header=True) infile.close() #OUT.close()