#!/bin/bash # # Script Name: preprocessing.sh # Description: # - Demultiplex metabarcoding paired end fastq files using cutadapt # - Transform fastq files into tab files using seqkit # - Join R1 and R2 tab files # Requirements: cutadapt, seqkit, gnumeric # Arguments: # - corr_tag_file: an excel file provided by SPYGEN describing the correspondences between tags and samples # - fastq_dir: Folder directory containing fastq files # Author: Morgane Bruno # Email: morgane.bruno@cefe.cnrs.fr # Licence: MIT # Usage: bash preprocessing.sh corr_tag.xlsx path/to/fastq/ # #_input corr_tag_file=$1 fastq_dir=$2 out_path="01_demultiplex/" #_main mkdir -p $out_path # GNUMERIC: Convert XLSX into CSV metadata_path="${out_path}corr_tag/" mkdir -p $metadata_path xlsx_base=$(basename $corr_tag_file .xlsx) corr_tag_csv="${metadata_path}${xlsx_base}.csv" if [ ! -f "$corr_tag_csv" ]; then ssconvert $corr_tag_file $corr_tag_csv fi # Check corr_tag column name run_idx=$(head -n 1 $corr_tag_csv | tr "," "\n" | grep -n "^RUN$") if [ -z "${run_idx}" ]; then echo "::error:: Missing column: RUN" exit 0 fi tag_idx=$(head -n 1 $corr_tag_csv | tr "," "\n" | grep -n "^TAG$") if [ -z "${tag_idx}" ]; then echo "::error:: Missing column: TAG" exit 0 fi sample_idx=$(head -n 1 $corr_tag_csv | tr "," "\n" | grep -n "^SAMPLE$") if [ -z "${sample_idx}" ]; then echo "::error:: Missing column: SAMPLE" exit 0 fi # Extract RUN name run_name=$(awk -F"," -v idx=$run_idx 'NR>1 {h[$idx]++} END{for(run in h) print run}' $corr_tag_csv) # Demultiplexing for run in $run_name do echo "::info:: Start demultiplexing ${run} ..." run_outdir="${out_path}${run}/" mkdir -p $run_outdir awk -F',' -v r=$run -v ridx=$run_idx -v sidx=$sample_idx -v tidx=$tag_idx -v path=$run_outdir '($ridx == r) { if ($sidx ~ /SPY/) { split($tidx, tag5, ""); tag3 = ""; for (i = length(tag5); i > 0; i--) { nt = tag5[i]; if (nt ~ /c/) tag3 = tag3 "g"; if (nt ~ /g/) tag3 = tag3 "c"; if (nt ~ /t/) tag3 = tag3 "a"; if (nt ~ /a/) tag3 = tag3 "t"; } print ">" path "/" $sidx "\n" $tidx "..." tag3 } } ' $corr_tag_csv > $run_outdir/tags_cutadapt.fa if [ -s $run_outdir/tags_cutadapt.fa ]; then # if SPY filter in RUN # Check fastq files fq_r1=$(find $fastq_dir -name *$run*R1*) if [ -z "${fq_r1}" ]; then echo "::error:: Missing file for run: ${run}" exit 0 fi fq_r2=$(echo $fq_r1 | sed 's/R1/R2/') # CUTADAPT: DEMULTIPLEXING cutadapt --revcomp -O 8 --no-indels -g file:$run_outdir/tags_cutadapt.fa -G file:$run_outdir/tags_cutadapt.fa --output {name}.1.fastq.gz --paired-output {name}.2.fastq.gz $fq_r1 $fq_r2 # SEQKIT: Convert FASTQ into TSV for sample_r1 in $run_outdir*1.fastq.gz do tab_r1=$(echo $sample_r1 | sed 's/gz/tsv/') csv_r1=$(echo $sample_r1 | sed 's/gz/csv/') seqkit fx2tab -Q $sample_r1 > $tab_r1 awk -F' ' '{ print $1 "," $3 }' $tab_r1 > $csv_r1 sample_r2=$(echo $sample_r1 | sed 's/1.fastq/2.fastq/') tab_r2=$(echo $sample_r2 | sed 's/gz/tsv/') csv_r2=$(echo $sample_r2 | sed 's/gz/csv/') seqkit fx2tab -Q $sample_r2 > $tab_r2 awk -F' ' '{ print $1 "," $3 }' $tab_r2 > $csv_r2 csv_out=$(echo $sample_r1 | sed 's/1.fastq.gz/csv/') echo "Forward,Reverse" > $csv_out join -t$',' -o1.2,2.2 -e "NA" $csv_r1 $csv_r2 >> $csv_out rm $tab_r1 rm $tab_r2 rm $csv_r1 rm $csv_r2 done else echo "::info:: No SPY filters were processed in run: ${run}" rm -r $run_outdir fi done