edna
exploitation
swarm_and_obitools

Repository

# --- Installation: Anaconda, the "obitools" conda environment, and swarm ---

# Run the Anaconda installer (the installer is interactive).
bash Anaconda3-2020.07-Linux-x86_64.sh
# Load conda's shell integration into the current shell so that the `conda`
# command and `conda activate` are usable in the commands below.
# Assumes the installer used the ~/anaconda3 prefix.
eval "$("$HOME/anaconda3/bin/conda" shell.bash hook)"
# Do not activate the "base" environment automatically in new shells.
conda config --set auto_activate_base false

# Create the conda environment described by the YAML file.
# NOTE(review): the path points into ./dada2_and_obitools; confirm this is the
# environment file intended for this pipeline.
ENVYAML=./dada2_and_obitools/obitools_env_conda.yaml
conda env create -f "$ENVYAML"
# Check that the new environment can be activated, then leave it again.
conda activate obitools
conda deactivate

# Build swarm from source and copy the binary to /usr/local/bin.
# The build runs in a subshell so the working directory of the calling shell
# is unchanged afterwards; the && chain stops at the first failing step.
# Writing to /usr/local/bin typically requires elevated privileges.
git clone https://github.com/torognes/swarm.git
(
  cd swarm/ &&
    make &&
    cp ./bin/swarm /usr/local/bin
)
# --- Pipeline: merge, demultiplex, filter, assign taxa, cluster into OTUs ---

# Extract the raw data and enter the OBITools environment.
unzip mullus_surmuletus_data.zip
conda activate obitools

# Remember the starting directory: the relative paths used further down
# (./swarm_and_obitools, ./only_obitools) are written relative to it, but the
# working directory changes to "samples" part-way through.
root_dir=$PWD

illuminapairedend --score-min=40 -r mullus_surmuletus_data/Aquarium_2_R1.fastq mullus_surmuletus_data/Aquarium_2_R2.fastq > Aquarium_2.fastq
# a new .fastq file is created, it contains the sequences after the merging of forward and reverse strands
# alignments which have a quality score higher than 40 (--score-min=40) are merged and annotated "aligned", while alignments with a lower quality score are concatenated and annotated "joined"
obigrep -p 'mode!="joined"' Aquarium_2.fastq > Aquarium_2.ali.fastq
# -p requires a python expression
# a new dataset (.ali.fastq) is created which only contains the sequences annotated "aligned"
ngsfilter -t mullus_surmuletus_data/Med_corr_tags.txt -u Aquarium_2.unidentified.fastq Aquarium_2.ali.fastq > Aquarium_2.ali.assigned.fastq
# the command creates new files:
# ".unidentified.fastq" file contains the sequences that were not assigned with a correct tag
# ".ali.assigned.fastq" file contains the sequences that were assigned with a correct tag, so they contain only the barcode sequences
mkdir samples
# create the folder
mv -t samples Aquarium_2.ali.assigned.fastq
# place the latest ".fastq" file in the folder
cd samples
obisplit -t sample --fastq Aquarium_2.ali.assigned.fastq
# separate the sequences into one file per sample, using the "sample" attribute written by ngsfilter
# the input file is read from the current directory, because it was moved into "samples" just above
mv -t "$root_dir/swarm_and_obitools" Aquarium_2.ali.assigned.fastq
# remove the original file from the "samples" folder
# NOTE(review): assumes a "swarm_and_obitools" directory exists in the starting directory - confirm
obiuniq Aquarium_2.fastq > Aquarium_2.uniq.fasta
# NOTE(review): this reads Aquarium_2.fastq from the "samples" folder, i.e. it assumes obisplit wrote a file with that name (a sample called "Aquarium_2") - confirm
obigrep -l 20 -p 'count>=10' Aquarium_2.uniq.fasta > Aquarium_2.grep.fasta
# "-l 20" option eliminates sequences with a length shorter than 20 bp
# "-p 'count>=10'" option eliminates sequences with an abundance lower than 10
obiclean -r 0.05 -H Aquarium_2.grep.fasta > Aquarium_2.clean.fasta
# here, the command returns only the sequences tagged "head" by the algorithm, and the chosen ratio is 0.05
ecotag -m 0.5 -d "$root_dir/only_obitools/embl_std" -R "$root_dir/only_obitools/base_ref_finale_formated.fasta" Aquarium_2.clean.fasta > Aquarium_2.tag.fasta
# "-m 0.5" sets the minimum similarity score to 0.5 for a sequence to be annotated to its corresponding taxon
# NOTE(review): confirm that 0.5 is the intended threshold (0.95 may have been meant)
# the reference database paths are anchored to the starting directory, because the working directory is now "samples"
obiannotate -k count Aquarium_2.tag.fasta > Aquarium_2.tag_1.fasta
# only the attribute "count" is conserved
swarm -z -d 1 -o stats_Aquarium_2.txt -w Aquarium_2.end.fasta < Aquarium_2.tag_1.fasta
# "-z" option permits to accept the abundance in the header, provided that there is no space in the header and that the value is preceded by "size="
# NOTE(review): the headers kept by obiannotate carry a "count" attribute - confirm they satisfy the "size=" requirement before running swarm
# "-d" is the maximal number of differences tolerated between 2 sequences to be gathered in the same OTU
# "-o" option returns a ".txt" file in which each line corresponds to an OTU with all the amplicons belonging to this OTU
# "-w" option gives a "fasta" file with the representative sequence of each OTU