Summary ======= Contains SAS script for estimating genotyping error rates using Mendelian incompatibilities. ####Formatting File for running script All 0’s indicating missing genotype data must be replaced with . Column heads must be tab delimited First column head is locus, second column head is position and then sample number with A and B for the two alleles #### Running script Set the number of genecopies (2 times the number of individuals in the input file) Set “minloci” (any dyad sharing less than this number is will be dropped) Add array of individuals (must match individual ID name in input file, all individuals must be listed here) Coverage Threshold SAS code for reference-aligned and de novo assembled data ============================================================================ * This code selects observations ABOVE a certain read depth or QV score, then estimates the proportion of mismatching loci for a list of dyads; **********************NEED TO REPLACE 0'S WITH . IN STRUCTURE FILE*******************************************************************************; **********************MAKE SURE COLUMN HEADS ARE TAB DELIMITED. FIRST COLUMN IS LOCUS SECOND COLUMN IS POSITION THEN CH NUMBER WTIH A AND B FOR THE TWO ALLELES**************; %let Genecopies = 4; ****************** Number of genetic copies. 2 times the number of individuals in input file; %LET MINLOCI = 100; *ANY INDIVIDUAL WITH FEWER THAN THIS NUMBER OF LOCI WILL BE DELETED; * Import data; PROC IMPORT OUT= WORK.SNPs DATAFILE= "C:\Users\structure.txt" DBMS=TAB REPLACE; GETNAMES=YES; DATAROW=2; proc sort data = SNPs; by locus position; Data Final; set SNPs; array Individual(&Genecopies.) "offspring"a "offspring"b "mother"a "mother"b; *****************NEED TO PROVIDE LIST OF INDIVIDUALS!!!!!!!!!!!!!!; array AlleleA(&Genecopies.) AlleleA1-AlleleA&Genecopies.; array AlleleC(&Genecopies.) AlleleC1-AlleleC&Genecopies.; array AlleleG(&Genecopies.) AlleleG1-AlleleG&Genecopies.; array AlleleT(&Genecopies.) AlleleT1-AlleleT&Genecopies.; do i = 1 to &Genecopies.; if Individual(i) = 1 then AlleleA(i) = 1; if Individual(i) ne 1 and Individual(i) ne . then AlleleA(i) = 0; if Individual(i) = 2 then AlleleC(i) = 1; if Individual(i) ne 2 and Individual(i) ne . then AlleleC(i) = 0; if Individual(i) = 3 then AlleleG(i) = 1; if Individual(i) ne 3 and Individual(i) ne . then AlleleG(i) = 0; if Individual(i) = 4 then AlleleT(i) = 1; if Individual(i) ne 4 and Individual(i) ne . then AlleleT(i) = 0; end; FreqA = mean(of AlleleA1-AlleleA&Genecopies.); FreqC = mean(of AlleleC1-AlleleC&Genecopies.); FreqG = mean(of AlleleG1-AlleleG&Genecopies.); FreqT = mean(of AlleleT1-AlleleT&Genecopies.); PA = max(of FreqA--FreqT); PB = 1 - PA; drop AlleleA1-AlleleA&Genecopies. AlleleC1-AlleleC&Genecopies. AlleleG1-AlleleG&Genecopies. AlleleT1-AlleleT&Genecopies. FreqA--FreqT; PME = PA * PB * (2 - (1 - 0.5*PB) - (1 - 0.5*PA) + 0.5 * 1); %macro Dyads(JuvMajorAllele,JuvMinorAllele,MomMajorAllele,MomMinorAllele,DyadNumber); Data Dyad&DyadNumber.; set Final; if &JuvMajorAllele. = . or &MomMajorAllele. = . then delete; * Get rid of loci that are not shared between mom and baby; match = 0; if &JuvMajorAllele. = &MomMajorAllele. then match = 1; if &JuvMajorAllele. = &MomMinorAllele. then match + 1; if &JuvMinorAllele. = &MomMajorAllele. then match + 1; if &JuvMinorAllele. = &MomMinorAllele. then match + 1; if match > 0 then error = 0; if match = 0 then error = 1; proc means noprint; var error; output out = Error_&DyadNumber. mean(error)=MeanError sum(PME) = ExpectedMEs sum(Error)= ObservedMEs n(Error)=nloci; Data Error_&DyadNumber.; set Error_&DyadNumber.; Sloth1 = "&JuvMajorAllele."; Sloth2 = "&MomMajorAllele."; drop _TYPE_ _FREQ_; %mend; ************************NEED TO PROVIDE LIST OF DYADS OF INTEREST HERE************************************; %Dyads("offspring"a,"offspring"b,"mother"a,"mother"b,1); ***********************NEED TO SPECIFIC NUMBER OF DYADS OF INTEREST HERE********************************; %let NumberOfDyads = 1; Data Error; %macro combine; set %do i = 1 %to &NumberOfDyads.; Error_&i. %end; %mend; %combine; GE = ObservedMEs/ExpectedMEs; IF NLOCI < &MINLOCI. THEN DELETE; proc print data = Error; proc means data = Error mean; var MeanError GE nLoci; PROC MEANS noprint; VAR OBSERVEDMES EXPECTEDMES; OUTPUT OUT = POOLEDERROR SUM(OBSERVEDMES)=SUMOBSERVEDMES SUM(EXPECTEDMES)=SUMEXPECTEDMES; DATA POOLEDERROR; SET POOLEDERROR; POOLEDERROR = SUMOBSERVEDMES/SUMEXPECTEDMES; PROC PRINT; run; ################################################################################################################################################## SAS code for coverage and quality score intervals for reference-aligned data =========================================================================== * This code selects observations within a certain read depth or QV score, then estimates the proportion of mismatching loci for a list of dyads; **********************IN OTHER WORDS FOR EACH COMBINATION OF STRINGENT VS LIBERAL AND BATCH VS STRUCTURE FILES***********************************; **********************NEED TO REPLACE 0'S WITH . IN STRUCTURE FILE*******************************************************************************; **********************MAKE SURE COLUMN HEADS ARE TAB DELIMITED. FIRST COLUMN IS LOCUS SECOND COLUMN IS POSITION THEN CH NUMBER WTIH A AND B FOR THE TWO ALLELES**************; %let Genecopies = 4; ****************** Number of genetic copies. 2 times the number of individuals in input file. MUST BE CORRECT!!!!; %LET MINLOCI = 100; *ANY INDIVIDUAL WITH FEWER THAN THIS NUMBER OF LOCI WILL BE DELETED; PROC IMPORT OUT= WORK.SNPsStringent DATAFILE= "C:\Users\highercoverage_batch.txt" DBMS=TAB REPLACE; GETNAMES=YES; DATAROW=2; Data SNPsStringent; set SNPsStringent; keep __Catalog_ID SNPs Chr BP; Data SNPsStringent; set SNPsStringent; rename __Catalog_ID = locus SNPs = position; proc sort; by Chr Locus; Data SNPsStringent; set SNPsStringent; position = compress(position,'>;ATCG'); p1 = scan(position, 1, ','); p2 = scan(position, 2, ','); p3 = scan(position, 3, ','); if p3 = p2 then p3 = ''; if p2 = p1 then p2 = ''; DATA SNPsStringent; SET SNPsStringent; ARRAY ap(3) p1 - p3 ; DO i = 1 to 3; newposition = ap(i); OUTPUT; END; DATA SNPsStringent; SET SNPsStringent; newpositionnumeric = newposition * 1; drop position p1-p3; Data SNPsStringent; set SNPsStringent; rename newpositionnumeric = position; Data SNPsStringent; set SNPsStringent; if locus = . then delete; if position = . then delete; BATCHSTRINGENT = "YES"; proc sort; by locus position; PROC IMPORT OUT= WORK.SNPsStructureStringent DATAFILE= "C:\Users\highercoverage_structure.txt" DBMS=TAB REPLACE; GETNAMES=YES; DATAROW=2; data SNPsStructureStringent; set SNPsStructureStringent; STRUCTURESTRINGENT = "YES"; proc sort; by locus position; Data FinalSNPsStringent; merge SNPsStringent SNPsStructureStringent; by locus position; Rename locus = LocusStringent; Data FinalSNPsStringent; set FinalSNPsStringent; STRINGENT = "YES"; if StructureStringent ne "YES" or BatchStringent ne "YES" then delete; Keep LocusStringent Chr BP Position Stringent; proc sort; by Chr BP position; * Import input file with relatively high read depth or QV; PROC IMPORT OUT= WORK.SNPsLiberal DATAFILE= "C:\Users\lowercoverage_batch.txt" DBMS=TAB REPLACE; GETNAMES=YES; DATAROW=2; Data SNPsLiberal; set SNPsLiberal; keep __Catalog_ID SNPs Chr BP; Data SNPsLiberal; set SNPsLiberal; rename __Catalog_ID = locus SNPs = position; proc sort; by Chr Locus; Data SNPsLiberal; set SNPsLiberal; position = compress(position,'>;ATCG'); p1 = scan(position, 1, ','); p2 = scan(position, 2, ','); p3 = scan(position, 3, ','); if p3 = p2 then p3 = ''; if p2 = p1 then p2 = ''; DATA SNPsLiberal; SET SNPsLiberal; ARRAY ap(3) p1 - p3 ; DO i = 1 to 3; newposition = ap(i); OUTPUT; END; DATA SNPsLiberal; SET SNPsLiberal; newpositionnumeric = newposition * 1; drop position p1-p3; Data SNPsLiberal; set SNPsLiberal; rename newpositionnumeric = position; Data SNPsLiberal; set SNPsLiberal; if locus = . then delete; if position = . then delete; BATCHLIBERAL = "YES"; proc sort; by locus position; PROC IMPORT OUT= WORK.SNPsStructureLiberal DATAFILE= "C:\Users\lowercoverage_structure.txt" DBMS=TAB REPLACE; GETNAMES=YES; DATAROW=2; data SNPsStructureLiberal; set SNPsStructureLiberal; STRUCTURELIBERAL = "YES"; proc sort; by locus position; Data FinalSNPsLiberal; merge SNPsLiberal SNPsStructureLiberal; by locus position; Rename locus = LocusStringent; Data FinalSNPsLiberal; set FinalSNPsLiberal; LIBERAL = "YES"; if StructureLiberal ne "YES" or BatchLiberal ne "YES" then delete; proc sort; by Chr BP position; Data Final; merge FinalSNPsStringent FinalSNPsLiberal; by Chr BP position; if Liberal = '' then delete; if Stringent = 'YES' then delete; drop batchliberal structureliberal; array Individual(&Genecopies.) "offspring"a "offspring"b "mother"a "mother"b; *****************NEED TO PROVIDE LIST OF INDIVIDUALS!!!!!!!!!!!!!!; array AlleleA(&Genecopies.) AlleleA1-AlleleA&Genecopies.; array AlleleC(&Genecopies.) AlleleC1-AlleleC&Genecopies.; array AlleleG(&Genecopies.) AlleleG1-AlleleG&Genecopies.; array AlleleT(&Genecopies.) AlleleT1-AlleleT&Genecopies.; do i = 1 to &Genecopies.; if Individual(i) = 1 then AlleleA(i) = 1; if Individual(i) ne 1 and Individual(i) ne . then AlleleA(i) = 0; if Individual(i) = 2 then AlleleC(i) = 1; if Individual(i) ne 2 and Individual(i) ne . then AlleleC(i) = 0; if Individual(i) = 3 then AlleleG(i) = 1; if Individual(i) ne 3 and Individual(i) ne . then AlleleG(i) = 0; if Individual(i) = 4 then AlleleT(i) = 1; if Individual(i) ne 4 and Individual(i) ne . then AlleleT(i) = 0; end; FreqA = mean(of AlleleA1-AlleleA&Genecopies.); FreqC = mean(of AlleleC1-AlleleC&Genecopies.); FreqG = mean(of AlleleG1-AlleleG&Genecopies.); FreqT = mean(of AlleleT1-AlleleT&Genecopies.); PA = max(of FreqA--FreqT); PB = 1 - PA; drop AlleleA1-AlleleA&Genecopies. AlleleC1-AlleleC&Genecopies. AlleleG1-AlleleG&Genecopies. AlleleT1-AlleleT&Genecopies. FreqA--FreqT; PME = PA * PB * (2 - (1 - 0.5*PB) - (1 - 0.5*PA) + 0.5 * 1); %macro Dyads(JuvMajorAllele,JuvMinorAllele,MomMajorAllele,MomMinorAllele,DyadNumber); Data Dyad&DyadNumber.; set Final; if &JuvMajorAllele. = . or &MomMajorAllele. = . then delete; * Get rid of loci that are not shared between mom and baby; match = 0; if &JuvMajorAllele. = &MomMajorAllele. then match = 1; if &JuvMajorAllele. = &MomMinorAllele. then match + 1; if &JuvMinorAllele. = &MomMajorAllele. then match + 1; if &JuvMinorAllele. = &MomMinorAllele. then match + 1; if match > 0 then error = 0; if match = 0 then error = 1; proc means noprint; var error; output out = Error_&DyadNumber. mean(error)=MeanError sum(PME) = ExpectedMEs sum(Error)= ObservedMEs n(error)=nloci; Data Error_&DyadNumber.; set Error_&DyadNumber.; Sloth1 = "&JuvMajorAllele."; Sloth2 = "&MomMajorAllele."; drop _TYPE_ _FREQ_; %mend; ************************NEED TO PROVIDE LIST OF DYADS OF INTEREST HERE************************************; %Dyads("offspring"a,"offspring"b,"mother"a,"mother"b,1); ***********************NEED TO SPECIFIC NUMBER OF DYADS OF INTEREST HERE********************************; %let NumberOfDyads = 1; Data Error; %macro combine; set %do i = 1 %to &NumberOfDyads.; Error_&i. %end; %mend; %combine; GE = ObservedMEs/ExpectedMEs; IF NLOCI < &MINLOCI. THEN DELETE; proc print data = Error; proc means data = Error mean; var MeanError GE nLoci; PROC MEANS noprint; VAR OBSERVEDMES EXPECTEDMES; OUTPUT OUT = POOLEDERROR SUM(OBSERVEDMES)=SUMOBSERVEDMES SUM(EXPECTEDMES)=SUMEXPECTEDMES; DATA POOLEDERROR; SET POOLEDERROR; POOLEDERROR = SUMOBSERVEDMES/SUMEXPECTEDMES; PROC PRINT; run; SAS code for coverage intervals for de novo assembled data * This code selects observations within a certain read depth or QV score, then estimates the proportion of mismatching loci for a list of dyads; **********************NEED TO REPLACE 0'S WITH . IN STRUCTURE FILE*******************************************************************************; **********************MAKE SURE COLUMN HEADS ARE TAB DELIMITED. FIRST COLUMN IS LOCUS SECOND COLUMN IS POSITION THEN CH NUMBER WTIH A AND B FOR THE TWO ALLELES**************; %let Genecopies = 4; ****************** Number of genetic copies. 2 times the number of individuals in input file. MUST BE CORRECT!!!!; %LET MINLOCI = 100; *ANY INDIVIDUAL WITH FEWER THAN THIS NUMBER OF LOCI WILL BE DELETED; PROC IMPORT OUT= WORK.SNPsStructureStringent DATAFILE= "C:\Users\highcoverage_structure.txt" DBMS=TAB REPLACE; GETNAMES=YES; DATAROW=2; data SNPsStructureStringent; set SNPsStructureStringent; proc sort; by locus position; Data FinalSNPsStringent; set SNPsStructureStringent; Rename locus = LocusStringent; Data FinalSNPsStringent; set FinalSNPsStringent; STRINGENT = "YES"; Keep LocusStringent Position Stringent; proc sort; by LOCUSTSTRINGENT position; PROC IMPORT OUT= WORK.SNPsStructureLiberal DATAFILE= "C:\Users\lowercoverage_structure.txt" DBMS=TAB REPLACE; GETNAMES=YES; DATAROW=2; data SNPsStructureLiberal; set SNPsStructureLiberal; proc sort; by locus position; Data FinalSNPsLiberal; set SNPsStructureLiberal; Rename locus = LocusStringent; Data FinalSNPsLiberal; set FinalSNPsLiberal; LIBERAL = "YES"; proc sort; by LOCUSSTRINGENT position; Data Final; merge FinalSNPsStringent FinalSNPsLiberal; by /*Chr BP*/LOCUSSTRINGENT position; if Stringent = 'YES' then delete; n = _n_; array Individual(&Genecopies.) "offspring"a "offspring"b "mother"a "mother"b; *****************NEED TO PROVIDE LIST OF INDIVIDUALS!!!!!!!!!!!!!!; array AlleleA(&Genecopies.) AlleleA1-AlleleA&Genecopies.; array AlleleC(&Genecopies.) AlleleC1-AlleleC&Genecopies.; array AlleleG(&Genecopies.) AlleleG1-AlleleG&Genecopies.; array AlleleT(&Genecopies.) AlleleT1-AlleleT&Genecopies.; do i = 1 to &Genecopies.; if Individual(i) = 1 then AlleleA(i) = 1; if Individual(i) ne 1 and Individual(i) ne . then AlleleA(i) = 0; if Individual(i) = 2 then AlleleC(i) = 1; if Individual(i) ne 2 and Individual(i) ne . then AlleleC(i) = 0; if Individual(i) = 3 then AlleleG(i) = 1; if Individual(i) ne 3 and Individual(i) ne . then AlleleG(i) = 0; if Individual(i) = 4 then AlleleT(i) = 1; if Individual(i) ne 4 and Individual(i) ne . then AlleleT(i) = 0; end; FreqA = mean(of AlleleA1-AlleleA&Genecopies.); FreqC = mean(of AlleleC1-AlleleC&Genecopies.); FreqG = mean(of AlleleG1-AlleleG&Genecopies.); FreqT = mean(of AlleleT1-AlleleT&Genecopies.); PA = max(of FreqA--FreqT); PB = 1 - PA; drop AlleleA1-AlleleA&Genecopies. AlleleC1-AlleleC&Genecopies. AlleleG1-AlleleG&Genecopies. AlleleT1-AlleleT&Genecopies. FreqA--FreqT; PME = PA * PB * (2 - (1 - 0.5*PB) - (1 - 0.5*PA) + 0.5 * 1); %macro Dyads(JuvMajorAllele,JuvMinorAllele,MomMajorAllele,MomMinorAllele,DyadNumber); Data Dyad&DyadNumber.; set Final; if &JuvMajorAllele. = . or &MomMajorAllele. = . then delete; * Get rid of loci that are not shared between mom and baby; match = 0; if &JuvMajorAllele. = &MomMajorAllele. then match = 1; if &JuvMajorAllele. = &MomMinorAllele. then match + 1; if &JuvMinorAllele. = &MomMajorAllele. then match + 1; if &JuvMinorAllele. = &MomMinorAllele. then match + 1; if match > 0 then error = 0; if match = 0 then error = 1; proc means noprint; var error; output out = Error_&DyadNumber. mean(error)=MeanError sum(PME) = ExpectedMEs sum(Error)= ObservedMEs n(error)=nloci; Data Error_&DyadNumber.; set Error_&DyadNumber.; Sloth1 = "&JuvMajorAllele."; Sloth2 = "&MomMajorAllele."; drop _TYPE_ _FREQ_; %mend; ************************NEED TO PROVIDE LIST OF DYADS OF INTEREST HERE************************************; %Dyads("offspring"a,"offspring"b,"mother"a,"mother"b,1); ***********************NEED TO SPECIFIC NUMBER OF DYADS OF INTEREST HERE********************************; %let NumberOfDyads = 1; Data Error; %macro combine; set %do i = 1 %to &NumberOfDyads.; Error_&i. %end; %mend; %combine; GE = ObservedMEs/ExpectedMEs; IF NLOCI < &MINLOCI. THEN DELETE; proc print data = Error; proc means data = Error mean; var MeanError GE nLoci; PROC MEANS noprint; VAR OBSERVEDMES EXPECTEDMES; OUTPUT OUT = POOLEDERROR SUM(OBSERVEDMES)=SUMOBSERVEDMES SUM(EXPECTEDMES)=SUMEXPECTEDMES; DATA POOLEDERROR; SET POOLEDERROR; POOLEDERROR = SUMOBSERVEDMES/SUMEXPECTEDMES; PROC PRINT; run;