Skip to content
Snippets Groups Projects

Demultiplexing and convert fastq into tsv

  • Clone with SSH
  • Clone with HTTPS
  • Embed
  • Share
    The snippet can be accessed without any authentication.
    Authored by mbruno
    • Demultiplex metabarcoding paired end fastq files using cutadapt

    • Transform fastq files into tab files using seqkit

    • Join R1 and R2 tab files

    Edited
    preprocessing.sh 3.51 KiB
    #!/bin/bash
    
    #
    # Script Name: preprocessing.sh
    # Description:
    #   - Demultiplex metabarcoding paired end fastq files using cutadapt
    #   - Transform fastq files into tab files using seqkit
    #   - Join R1 and R2 tab files
    # Requirements: cutadapt, seqkit, gnumeric
    # Arguments:
    #   - corr_tag_file: an excel file provided by SPYGEN describing the correspondences between tags and samples
    #   - fastq_dir: Folder directory containing fastq files
    # Author: Morgane Bruno
    # Email: morgane.bruno@cefe.cnrs.fr
    # Licence: MIT
    # Usage: bash preprocessing.sh corr_tag.xlsx path/to/fastq/
    #
    
    #_input
    corr_tag_file=$1
    fastq_dir=$2
    out_path="01_demultiplex/"
    
    #_main
    mkdir -p $out_path
    
    # GNUMERIC: Convert XLSX into CSV
    metadata_path="${out_path}corr_tag/"
    mkdir -p $metadata_path
    xlsx_base=$(basename $corr_tag_file .xlsx)
    corr_tag_csv="${metadata_path}${xlsx_base}.csv"
    if [ ! -f "$corr_tag_csv" ]; then
      ssconvert $corr_tag_file $corr_tag_csv 
    fi
    
    # Check corr_tag column name
    run_idx=$(head -n 1 $corr_tag_csv | tr "," "\n" | grep -n "^RUN$")
    if [ -z "${run_idx}" ]; then 
      echo "::error:: Missing column: RUN"
      exit 0
    fi
    tag_idx=$(head -n 1 $corr_tag_csv | tr "," "\n" | grep -n "^TAG$")
    if [ -z "${tag_idx}" ]; then 
      echo "::error:: Missing column: TAG"
      exit 0
    fi
    sample_idx=$(head -n 1 $corr_tag_csv | tr "," "\n" | grep -n "^SAMPLE$")
    if [ -z "${sample_idx}" ]; then
      echo "::error:: Missing column: SAMPLE"
      exit 0
    fi
    
    # Extract RUN name
    run_name=$(awk -F"," -v idx=$run_idx 'NR>1 {h[$idx]++} END{for(run in h) print run}' $corr_tag_csv)
    
    # Demultiplexing
    for run in $run_name
    do
      echo "::info:: Start demultiplexing ${run} ..."
      run_outdir="${out_path}${run}/"
      mkdir -p $run_outdir
      awk -F',' -v r=$run -v ridx=$run_idx -v sidx=$sample_idx -v tidx=$tag_idx -v path=$run_outdir '($ridx == r) {
      if ($sidx ~ /SPY/) {
        split($tidx, tag5, "");
        tag3 = "";
        for (i = length(tag5); i > 0; i--) {
          nt = tag5[i];
          if (nt ~ /c/) tag3 = tag3 "g";
          if (nt ~ /g/) tag3 = tag3 "c";
          if (nt ~ /t/) tag3 = tag3 "a";
          if (nt ~ /a/) tag3 = tag3 "t";
        }
        print ">" path "/" $sidx "\n" $tidx "..." tag3
       }
      } ' $corr_tag_csv > $run_outdir/tags_cutadapt.fa
      if [ -s $run_outdir/tags_cutadapt.fa ]; then # if SPY filter in RUN
        # Check fastq files
        fq_r1=$(find $fastq_dir -name *$run*R1*)
        if [ -z "${fq_r1}" ]; then
          echo "::error:: Missing file for run: ${run}"
          exit 0
        fi
        fq_r2=$(echo $fq_r1 | sed 's/R1/R2/')
        # CUTADAPT: DEMULTIPLEXING
        cutadapt --revcomp -O 8 --no-indels  -g file:$run_outdir/tags_cutadapt.fa -G file:$run_outdir/tags_cutadapt.fa --output {name}.1.fastq.gz --paired-output {name}.2.fastq.gz $fq_r1 $fq_r2
        # SEQKIT: Convert FASTQ  into TSV
        for sample_r1 in $run_outdir*1.fastq.gz
        do
          tab_r1=$(echo $sample_r1 | sed 's/gz/tsv/')
          csv_r1=$(echo $sample_r1 | sed 's/gz/csv/')
          seqkit fx2tab -Q $sample_r1 > $tab_r1
          awk -F' ' '{ print $1 "," $3 }' $tab_r1 > $csv_r1
          sample_r2=$(echo $sample_r1 | sed 's/1.fastq/2.fastq/')
          tab_r2=$(echo $sample_r2 | sed 's/gz/tsv/')
          csv_r2=$(echo $sample_r2 | sed 's/gz/csv/')
          seqkit fx2tab -Q $sample_r2 > $tab_r2
          awk -F' ' '{ print $1 "," $3 }' $tab_r2 > $csv_r2
          csv_out=$(echo $sample_r1 | sed 's/1.fastq.gz/csv/')
          echo "Forward,Reverse" > $csv_out
          join -t$',' -o1.2,2.2 -e "NA" $csv_r1 $csv_r2 >> $csv_out
          rm $tab_r1
          rm $tab_r2
          rm $csv_r1
          rm $csv_r2
        done
      else
        echo "::info:: No SPY filters were processed in run: ${run}"
        rm -r $run_outdir
      fi
    done
    0% Loading or .
    You are about to add 0 people to the discussion. Proceed with caution.
    Finish editing this message first!
    Please register or to comment