''' Created on Mar 1, 2015 @author: Markus Pfenninger ''' #import necessary modules import glob import re import csv #prepare output file and fields outfname = "TE_in_exons.txt" genefile = "Crip_exon.txt" #required format: .gff infile = "MF_TEs.txt" outf = open(outfname, "w") scaffID = [] rangeStart = [] rangeEnd = [] annotation = [] #Get all scaffold IDs, rangestarts and -ends from gff output file and write them in a respective list with open(genefile) as to_read: reader = csv.reader(to_read, delimiter = "\t") counter = 0 for row in reader: scaffID.append(row[0]) rangeStart.append(row[3]) rangeEnd.append(row[4]) annotation.append(row[7]) counter = counter + 1 print (counter, " exons read in") #open SNP location file file with open(infile) as to_read: reader = csv.reader(to_read, delimiter = "\t") pos_counter = 0 succ_counter = 0 fail_counter = 0 for row in reader: pos_counter = pos_counter + 1 sID = row[0] pos = row[1] #print(sID, "\t", pos) #infer lowest index for the respective scaffold if sID in scaffID: counter = scaffID.index(sID) if scaffID[counter] == sID: if float(pos) >= float(rangeStart[counter]) and float(pos) <= float(rangeEnd[counter]): #print("hit!") succ_counter = succ_counter + 1 outf.write(str(row[0]) + "\t" + str(row[1]) + "\t" + str(annotation[counter]) + "\t1"+ "\t" + str(row[2]) + "\n") elif float(pos) <= float(rangeStart[counter]) or float(pos) >= float(rangeEnd[counter]): fail_counter = fail_counter + 1 #outf.write(str(row[0]) + "\t" + str(row[1]) + "\t" + str(row[3]) + "\t" + str(row[4]) + "\t0"+ "\n") else: fail_counter = fail_counter + 1 #outf.write(str(row[0]) + "\t" + str(row[1]) + "\t" + str(row[3]) + "\t" + str(row[4]) + "\t0"+ "\n") print ("looked at ", pos_counter, " SNP positions") print ("found ",succ_counter, " hits in exons") print (fail_counter , " positions were not in exons") outf.close()