#Perl -Sx "{0}"; Exit {Status} #!perl #This script will only work properly if there is a variant call for every sample at every variant analyzed use warnings; use strict; use Data::Dumper; # use Statistics::Basic qw(:all nofill); # use Number::Format qw(:subs); # use Math::NumberCruncher; if ($#ARGV != 1) {die "\n\tUsage: perl vcfSNPs.pl [infile] [# generations]\n\n"; } my $infile = $ARGV[0]; my $gen_count = $ARGV[1]; my $outfile = $infile."_var_sharing_sampling_".$gen_count."gens.txt"; #read in all vars and memorize genotypes for each sample, store var in array my $samples = []; my $all_vars = []; my $var_count = 0; open (my $fp, '<', $infile) or die "Couldn't open $infile for reading\n"; while (<$fp>) {chomp; if ($_ =~ m/^\#\#/) { next; } if ($_ =~ m/^\#/) { $samples = getSampleNames ( $samples, $_); next; } my @data = split(' ', $_); my $chrom = shift(@data); my $pos = shift(@data); my $rsid = shift(@data); my $ref = shift(@data); my $alt = shift(@data); my $qual = shift(@data); my $filter = shift(@data); my $info = shift(@data); my $format = shift(@data); my $gen_calls = \@data; if (isBiallelic($alt) == 1) { $var_count++; my $var_info = {}; $var_info->{'chrom'} = $chrom; $var_info->{'pos'} = $pos; $var_info->{'samples'} = {}; my $sample_vars = $var_info->{'samples'}; for (my $i = 0; $i <= $#$gen_calls; $i++) { my $sample_id = $samples->[$i]->{'id'}; # my $vars = $samples->[$i]->{'vars'}; my $var_calls = $gen_calls->[$i]; my $gen_sum = getGenSum ($var_calls); $var_info->{'samples'}->{$sample_id} = $gen_sum; } push (@$all_vars, $var_info); } } close ($fp); # $var_count = 3000; my $shares = {}; for (my $gen = 1; $gen<=$gen_count; $gen++) { #randomly sample from vars array until as many vars have been sampled as exist in the data my $gen_scores = {}; for (my $i = 0; $i< $var_count; $i++) { my $index = int(rand(@{$all_vars})); # print STDERR "Sampling the $index var of $var_count total\n"; my $rand_var_info = $all_vars->[$index]; my $sample_vars = $rand_var_info->{'samples'}; foreach my $sample (keys %$sample_vars) { push ( @{$gen_scores->{$sample}}, $sample_vars->{$sample}); } } #analyze and store results for gen- that's the stuff below # $gen_scores is a hash with all the samples as keys. Each key's value is an array that holds scores for the samples for (my $i = 0; $i <= $#$samples-1; $i++) { my $sample1_info = $samples->[$i]; my $sample1_id = $sample1_info->{'id'}; for (my $j= $i; $j <= $#$samples; $j++) { my $sample2_info = $samples->[$j]; my $sample2_id = $sample2_info->{'id'}; my $sample_share = 0; if ( $#{$gen_scores->{$sample1_id}} != $#{$gen_scores->{$sample2_id}}) {print STDERR "Error: Unequal variant list lengths\n"; } # print Dumper ($gen_scores); exit; for (my $k = 0; $k <= $#{$gen_scores->{$sample1_id}}; $k++ ) { my $sample1_gen = $gen_scores->{$sample1_id}->[$k]; my $sample2_gen = $gen_scores->{$sample2_id}->[$k]; # print STDERR "$sample1_gen\t$sample2_gen\n"; my $diff = abs($sample1_gen - $sample2_gen); if ($diff == 0){ $sample_share++; } elsif ($diff == 1) {$sample_share += 0.5; } elsif ($diff == 2) {} else { print STDERR "Error: How do I parse $diff?\n"; } } my $total_share = $sample_share/$var_count; push (@{$shares->{$sample1_id}->{$sample2_id}}, $total_share); } } } #print results with standard deviations open (my $outfp, '>', $outfile) or die "Couldn't open $outfile for printing\n"; for (my $i = 0; $i <= $#$samples-1; $i++) { my $sample1_info = $samples->[$i]; my $sample1_id = $sample1_info->{'id'}; for (my $j= $i; $j <= $#$samples; $j++) { my $sample2_info = $samples->[$j]; my $sample2_id = $sample2_info->{'id'}; my $share_scores = $shares->{$sample1_id}->{$sample2_id}; # print STDERR "$sample1_id\t$sample2_id\t@$share_scores\n"; # my $mean = mean ( @{$share_scores} ); # my $stddv = stddev ( @{$share_scores} ); my $mean = getMean( $share_scores ); my $stddv = getStddev ( $share_scores, $mean ); print $outfp "$sample1_id\t$sample2_id\t$mean\t$stddv\n"; print STDERR "$sample1_id\t$sample2_id\t$mean\t$stddv\n"; } } close ($outfp); exit; sub getMean { my $numbers = shift; my $count = 0; my $sum = 0; foreach my $num (@$numbers) { $sum += $num; $count++; } return ($sum/$count); } # Calculate the Mean of your data set. # For each number: subtract the Mean and square the result. # Calculate the Mean of those squared differences. The result is called the Variance. # Calculate the square root of the Variance. The result is the Standard Deviation. # Important note- If the data set is only a sample of the whole population, change the formula to divide by one less when calculating the Variance. This is called Bessel's correction. sub getStddev { my $numbers = shift; my $mean = shift; my $dev = 0; my $count = 0; foreach my $num (@$numbers) { $dev += ($num-$mean)**2; $count++; } my $variance = $dev/($count-1); return ( sqrt($variance) ); } sub isBiallelic { my $alt = shift; my @alts = split(',', $alt); if ( $#alts > 0 ) { return 0; } else {return 1; } } sub getGenSum { my $var_calls = shift; my @alleles = split(':', $var_calls); my @genotypes = split('\/', $alleles[0]); my $gen_sum = $genotypes[0] + $genotypes[1]; if ($gen_sum > 2) {die "Is this really bi_allelic?\n@alleles\n"; } return $gen_sum; } sub getSampleNames { my $samples = shift; my $line = shift; my @fields = split(' ', $line); for (my $i = 9; $i <= $#fields; $i++) { my $sample_info = {}; $sample_info->{'id'} = $fields[$i]; # $sample_info->{'vars'} = []; push (@{$samples}, $sample_info); } return $samples; }