{\rtf1\ansi\ansicpg1252\cocoartf1404\cocoasubrtf470
{\fonttbl\f0\froman\fcharset0 TimesNewRomanPSMT;}
{\colortbl;\red255\green255\blue255;}
\paperw11900\paperh16840\margl1440\margr1440\vieww16820\viewh13360\viewkind0
\deftab720
\pard\tx566\tx1133\tx1700\tx2267\tx2834\tx3401\tx3968\tx4535\tx5102\tx5669\tx6236\tx6803\pardeftab720\ri380\partightenfactor0

\f0\fs24 \cf0 \ul \ulc0 ##Downsampling script \
\pard\tx566\tx1133\tx1700\tx2267\tx2834\tx3401\tx3968\tx4535\tx5102\tx5669\tx6236\tx6803\pardeftab720\ri380\partightenfactor0
\cf0 \ulnone ##Bash script to randomly subsample bamfiles and calculate autosomal and sex chromosome coverage in ancient horse data\
##The goal is to generate 20 randomly subsampled bamfiles, per individual bam file, for a random subset number of reads\
## Script usage: "Resampling_bam_files <name_of_file>"\
## Per BAM file, a directory with the resampled bamfiles and coverage estimates is created.\
## We are resampling (WITHOUT replacement) up to 20000 reads, with increments of 500. We use shuf for subsampling\
## WARNING: If the original BAM file contains less than 20000 reads there is no random resampling and the same file is returned at each iteration\
## We use Paleomix 1.2.13 to calculate coverage of the different chromosomes\
## Questions: email bastiaan.star@ibv.uio.no\
\
module purge\
#Set local environment\
source /projects/cees/bin/paleomix/loadModules_no_java_Paleomix_1.2.13.sh\
\
mkdir -p temp_$1\
\
#Get header original BAM file\
samtools view -H $1 > temp_$1/header_$1\
\
#Get list of reads from original BAM file\
samtools view $1 |awk '\{print $1\}' > temp_$1/list_read_names_$1\
\
#Get total number of reads from the original BAM file\
read_n=$(wc -l temp_$1/list_read_names_$1 |cut -d " " -f 1)\
\
echo "Total read number is $read_n"\
\
#Check if the BAM file contains sufficient reads to allow resampling. Keep in mind that if you have just this over number, resampling becomes less random (you are essentially returning the original file)\
\
if [ 20000 -gt $read_n ]; then echo "WARNING, BAM file $\{1\} has too few reads to resample 20000" ; else echo "BAM file $\{1\} contains a sufficient number of reads for resampling" ; fi\
\
#Create directory for downsampled BAM files\
mkdir -p downsampled_$1\
\
#Subsample for this number of reads\
for y in 00500 01000 01500 02000 02500 03000 03500 04000 04500 05000 05500 06000 06500 07000 07500 08000 08500 09000 09500 10000 10500 11000 11500 12000 12500 13000 13500 14000 14500 15000 15500 16000 16500 17000 17500 18000 18500 19000 19500 20000;\
\
do\
echo "Resample $y random reads..."\
\
#Subsampling and coverage\
for x in \{001..20\};\
do\
echo ".....iteration $x"\
shuf -n $y temp_$1/list_read_names_$1 -o temp_$1/shuf_$x\
samtools view $1 | fgrep -w -f temp_$1/shuf_$x > temp_$1/$1_$x.sam 2> /dev/null\
cat temp_$1/header_$1 temp_$1/$1_$x.sam > temp_$1/temp.$1_$x.sam 2> /dev/null\
samtools view -Shub temp_$1/temp.$1_$x.sam > temp_$1/$1_$x.bam 2> /dev/null\
samtools sort temp_$1/$1_$x.bam downsampled_$1/$1_$\{y\}_$\{x\} 2> /dev/null\
samtools index downsampled_$1/$1_$\{y\}_$\{x\}.bam 2> /dev/null\
\
#Get coverage of file\
paleomix coverage downsampled_$1/$1_$\{y\}_$\{x\}.bam --overwrite > /dev/null 2>&1\
\
#Use awk to calculate the ratio of x/y chromosome versus autosomal coverage\
cat downsampled_$1/$1_$\{y\}_$\{x\}.coverage |grep chrX |awk '$2=="*" \{print $14\}' > downsampled_$1/x_cov_$x\
cat downsampled_$1/$1_$\{y\}_$\{x\}.coverage |grep chrY |awk '$2=="*" \{print $14\}' > downsampled_$1/y_cov_$x\
autosomal_cov=$(cat downsampled_$1/$1_$\{y\}_$\{x\}.coverage |grep -v chrX |grep -v chrY |grep -v chrUn |awk '$2=="*" \{sumx+=$14; countx++\}; END\{print sumx/countx\}')\
awk -v aut=$autosomal_cov -v name=$1 -v iteration=$y -v cycle=$x '\{print name"\\t"iteration"\\t"$0/aut"\\t"cycle\}' downsampled_$1/x_cov_$x > downsampled_$1/$\{1\}_$\{y\}_$\{x\}_x_auto_ratio.txt\
awk -v aut=$autosomal_cov -v name=$1 -v iteration=$y -v cycle=$x '\{print name"\\t"iteration"\\t"$0/aut"\\t"cycle\}' downsampled_$1/y_cov_$x > downsampled_$1/$\{1\}_$\{y\}_$\{x\}_y_auto_ratio.txt\
rm downsampled_$1/x_cov_$x\
rm downsampled_$1/y_cov_$x\
\
done\
\
done\
}