edna
exploitation
eDNA_intra_pipeline_comparison

Repository

# relative path to the YAML file describing the conda environment
ENVYAML=./dada2_and_obitools/obitools_env_conda.yaml
# create the conda environment from that YAML file
conda env create -f $ENVYAML
# enter the "obitools" environment (presumably the name declared in the YAML file - confirm)
conda activate obitools
# leave the environment
conda deactivate
# dada2 is distributed through Bioconductor, not CRAN, so install.packages("dada2")
# cannot find it; it has to be installed with BiocManager
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("dada2")
# download the swarm sources
git clone https://github.com/torognes/swarm.git
# build swarm from the root of the cloned repository
cd swarm/
make
# lulu is not available on CRAN, so install.packages("lulu") fails;
# it is installed from its GitHub repository with devtools
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
devtools::install_github("tobiasgf/lulu")
# download the vsearch sources
git clone https://github.com/torognes/vsearch.git
cd vsearch
# generate the configure script, configure the build and compile
./autogen.sh
./configure
make
# install the compiled program (run with sudo)
sudo make install
# the OBITools commands below are run inside the "obitools" conda environment
conda activate obitools
illuminapairedend --score-min=40 -r mullus_surmuletus_data/Aquarium_2_F.fastq mullus_surmuletus_data/Aquarium_2_R.fastq > Aquarium_2.fastq
# a new .fastq file is created; it contains the sequences obtained after merging the forward and reverse reads
# alignments with a quality score higher than 40 (--score-min=40) are merged and annotated "aligned", while alignments with a lower quality score are concatenated and annotated "joined"
obigrep -p 'mode!="joined"' Aquarium_2.fastq > Aquarium_2.ali.fastq
# -p requires a Python expression
# the command writes a new file (.ali.fastq) which only contains the sequences annotated "aligned"
# each sequence is matched against the tags described in Med_corr_tags.txt (-t); sequences that cannot be assigned are written to the -u file
ngsfilter -t mullus_surmuletus_data/Med_corr_tags.txt -u Aquarium_2.unidentified.fastq Aquarium_2.ali.fastq > Aquarium_2.ali.assigned.fastq
# the command creates two new files:
# the ".unidentified.fastq" file contains the sequences that were not assigned with a correct tag
# the ".ali.assigned.fastq" file contains the sequences that were assigned with a correct tag, so they contain only the barcode sequences
mkdir samples
# creates the "samples" folder
mv -t samples Aquarium_2.ali.assigned.fastq
# moves the latest ".fastq" file into that folder
cd samples
obisplit -t samples --fastq sample/Aquarium_2.ali.assigned.fastq
# splits the file into separate files, one per sample
# NOTE(review): after "cd samples" the input file sits in the current folder, so the "sample/" prefix looks wrong; ngsfilter normally stores the sample name in the "sample" attribute, so "-t samples" may need to be "-t sample" - confirm before running
mv -t ./dada2_and_obitools Aquarium_2.ali.assigned.fastq
# moves the original (unsplit) file out of the folder
# NOTE(review): "./dada2_and_obitools" is resolved relative to the "samples" folder here - confirm the intended destination
library("dada2")
# NOTE: "path" must already contain the folder holding the per-sample ".fastq" files
fns <- sort(list.files(path, pattern = "\\.fastq$", full.names = TRUE))
# only the files whose name ends with ".fastq" are listed, and they are returned with their whole path
# (the pattern is a regular expression: the dot is escaped and "$" anchors it at the end of the name)
sample.names <- vapply(
  strsplit(basename(fns), ".fastq", fixed = TRUE),
  "[",
  character(1),
  1
)
# the function "basename" removes all the path up to the file name
# the function "strsplit" cuts the name at the literal text ".fastq"; the first piece is kept as the sample name
filts <- file.path(path, "filtered", paste0(sample.names, ".filt.fastq.gz"))
# builds the path to the new folder, which will be located in the path already used and whose name will be "filtered"
# the files are named as described before with sample.names, and the suffix ".filt.fastq.gz" is added
out <- filterAndTrim(fns, filts,
                     truncLen = 235,
                     maxN = 0,
                     maxEE = 1,
                     compress = TRUE,
                     verbose = TRUE)
# "truncLen" is chosen considering the marker length and defines where the reads are trimmed (after 235 bp here); reads which are shorter than this value are filtered out
# "maxN" is the number of N tolerated in the sequences after filtering (0 here)
# "maxEE" defines the maximal number of expected errors tolerated in a read (1 here), based on the quality score (EE = sum(10^(-Q/10)))
# "compress = TRUE" means that the files will be gzipped
# "verbose = TRUE" means that information concerning the number of sequences after filtering will be given
# dereplication of the filtered files: each unique sequence is annotated with its abundance
derep <- derepFastq(fls = filts)
# sequence table built from the dereplicated reads
seqtab <- makeSequenceTable(samples = derep)
# unique sequences of the table, written to a ".fasta" file
uniqueSeqs <- getUniques(object = seqtab)
uniquesToFasta(unqs = uniqueSeqs, fout = "./path/Aquarium2.derep.fasta")
obiclean -r 0.05 -H Aquarium_2.fasta > Aquarium_2.clean.fasta
# here, the command returns only the sequences tagged "head" by the algorithm (-H), and the chosen ratio (-r) is 0.05
# NOTE: "k" is not defined in this snippet; presumably it is the index of the sample being processed - it must be set beforehand
err <- learnErrors(derep[k], randomize = TRUE)
# builds the error model
dadas <- dada(derep[k], err)
# eliminates the false sequences identified by the model to keep only the ASVs
seqtab <- makeSequenceTable(dadas)
# constructs a sequence table with the filtered sequences
uniqueSeqs <- getUniques(seqtab)
uniquesToFasta(uniqueSeqs, paste0("PipelineB_", sample.names[k], ".fasta"))
# creates a new ".fasta" file containing the ASVs
swarm -z -d 1 -o stats_Aquarium_2.txt -w Aquarium_2.clustered.fasta < Aquarium_2.fasta
# the "-z" option makes swarm accept the abundance in the header, provided that there is no space in the header and that the value is preceded by "size="
# "-d" is the maximal number of differences tolerated between 2 sequences to be gathered in the same OTU (1 here)
# the "-o" option writes a ".txt" file in which each line corresponds to an OTU with all the amplicons belonging to this OTU
# the "-w" option writes a "fasta" file with the representative sequence of each OTU
# builds the match list read afterwards for the LULU curation: the sequences are searched against themselves (same file as query and as database)
# the query, target and identity of each reported match (-userfields) are written to match_list_Aquarium_2.txt (--userout)
# "--id .84" is the identity threshold and "--query_cov .9" the query coverage threshold given to vsearch
vsearch --usearch_global Aquarium_2.fasta --db Aquarium_2.fasta --self --id .84 --iddef 1 --userout match_list_Aquarium_2.txt -userfields query+target+id --maxaccepts 0 --query_cov .9 --maxhits 10
library("lulu")

# file names must be quoted: unquoted, R looks for objects called
# Aquarium_2.clustered.fasta / match_list_Aquarium_2.txt and stops with "object not found"
# NOTE(review): lulu() expects the OTU table as a data frame of abundances (OTUs in rows, samples in columns);
# confirm that the object returned by read.fasta() is converted accordingly.
# read.fasta() is not provided by lulu, so the package defining it must be loaded first.
OTUtable <- read.fasta("Aquarium_2.clustered.fasta")
# the match list is read without header and with its columns kept as character strings
matchlist <- read.table("match_list_Aquarium_2.txt",
                        header = FALSE,
                        as.is = TRUE,
                        stringsAsFactors = FALSE)
# prepare the files needed for LULU processing

curated_results <- lulu(OTUtable, matchlist)
# LULU processing with the lulu R function

curated_results
# shows the OTU names and their abundance after the curation
obigrep -p 'count>=10' Aquarium_2.fasta > Aquarium_2.grep.fasta
# the "-p 'count>=10'" option eliminates the sequences with an abundance lower than 10
# the file name must be quoted, otherwise R looks for an object called Aquarium_2.txt
# NOTE(review): presumably the table has "sequence" and "abundance" columns, as needed to build a sequence table - confirm
tab <- read.table("Aquarium_2.txt", header = TRUE)
seqtab_1 <- makeSequenceTable(tab)
seqtab_2 <- removeBimeraDenovo(seqtab_1, verbose = TRUE)
# processes the bimera removal

uniqueSeqs <- getUniques(seqtab_2)
# NOTE(review): "sample.names" must hold a single name here so that one output file is written - confirm
uniquesToFasta(uniqueSeqs, paste0(sample.names, ".fasta"))
# creates the new file without bimeras
# de novo chimera detection with vsearch: the sequences not flagged as chimeras are written to Aquarium2_uchime3.fasta (--nonchimeras)
vsearch --uchime3_denovo Aquarium_2.fasta --nonchimeras Aquarium2_uchime3.fasta