diff --git a/I - Pre-processing steps/dada2_pre-processing.R b/I - Pre-processing steps/dada2_pre-processing.R
index 991f06539abee6c803822dac13b0191317fedcf5..c7edc366f8db042cf53b31c09a8ef361ef90c000 100644
--- a/I - Pre-processing steps/dada2_pre-processing.R
+++ b/I - Pre-processing steps/dada2_pre-processing.R
@@ -1,66 +1,84 @@
-#STEP 1 : Be prepared
-
-## load the package :
-library("dada2")
-
-## create a path to your ".fastq" files :
-path <- "./edna_intra_pipeline_comparison/samples"
-
-## select the ".fastq" files you want to analyze :
-fns <- sort(list.files(path, pattern = ".fastq", full.names = T))
-
-## the function only extracts files that end with the chosen pattern and
-## they are extracted with their whole path
-
-## then you can only keep the part of your files name you want :
-sample.names <- sapply(strsplit(basename(fns), ".fastq"), '[', 1)
-
-## the function "basename" removes all the path up to the file name
-
-## the function "strsplit" removes the pattern written
-
-########################################################################
-#STEP 2 : Filtering & Trimming
-
-## begin the creation of the new files and folder :
-filts <- file.path(path, "filtered", paste0(sample.names, ".filt.fastq.gz"))
-
-## builds the path to the new folder, which will be located in the path
-## already used and which name will be "filtered"
-
-## the files are named as described before with "sample.names", and
-## the pattern ".filt.fastq.gz" will be added
-
-## from the ".fastq files" of "fns", create the new ".fastq" files of
-## "filts" after filtering and trimming :
-out <- filterAndTrim(fns, filts,
-                     truncLen = 235,
-                     maxN = 0,
-                     maxEE = 1,
-                     compress = T,
-                     verbose = T)
-
-## "truncLen" value is chosen considering the marker length and define
-## were the reads will be trimmed (after 235 bp here), and reads which
-## are shortened than this value are filtered
-
-## "maxN" is the number of N tolerated in the sequences after
-## filtering (0 here)
-
-## "maxEE" defines the maximal number of expected errors tolerated in a
-## read (1 here), based on the quality score (EE = sum(10^(-Q/10)))
-
-## "compress = T" means that the files will be gzipped
-
-## "verbose = T" means that information concerning the number of sequences after
-## sequences after filtering will be given
-
-########################################################################
-#STEP 3 : Dereplication
-
-## "derepFastq" function eliminates all the replications of each sequence in the files
-derep <- derepFastq(filts)
-
-## the function annotates each sequence with his abundance
-
-########################################################################
\ No newline at end of file
+#STEP 1 : Be prepared
+
+## load the package :
+library("dada2")
+
+## create a path to your ".fastq" files :
+path <- "./edna_intra_pipeline_comparison/samples"
+
+## regular expression matching the end of a ".fastq" or ".fastq.gz" file
+## name : the dot is escaped and "$" anchors the match at the end
+fastq.pattern <- "\\.fastq(\\.gz)?$"
+
+## select the ".fastq" files you want to analyze :
+fns <- sort(list.files(path, pattern = fastq.pattern, full.names = TRUE))
+
+## the function only extracts files whose name ends with the chosen pattern
+## and, with "full.names = TRUE", they are extracted with their whole path
+
+## stop early, with a clear message, if no input file was found :
+if (length(fns) == 0) {
+  stop("no \".fastq\" file found in ", path, call. = FALSE)
+}
+
+## then you can only keep the part of your files name you want :
+sample.names <- sub(fastq.pattern, "", basename(fns))
+
+## the function "basename" removes all the path up to the file name
+
+## the function "sub" removes the extension matched by the pattern
+
+########################################################################
+#STEP 2 : Filtering & Trimming
+
+## begin the creation of the new files and folder :
+filts <- file.path(path, "filtered", paste0(sample.names, ".filt.fastq.gz"))
+
+## builds the path to the new folder, which will be located in the path
+## already used and whose name will be "filtered"
+
+## the files are named as described before with "sample.names", and
+## the pattern ".filt.fastq.gz" will be added
+
+## from the ".fastq" files of "fns", create the new ".fastq" files of
+## "filts" after filtering and trimming :
+out <- filterAndTrim(fns, filts,
+                     truncLen = 235,
+                     maxN = 0,
+                     maxEE = 1,
+                     compress = TRUE,
+                     verbose = TRUE)
+
+## "truncLen" value is chosen considering the marker length and defines
+## where the reads will be trimmed (after 235 bp here), and reads which
+## are shorter than this value are filtered
+
+## "maxN" is the number of N tolerated in the sequences after
+## filtering (0 here)
+
+## "maxEE" defines the maximal number of expected errors tolerated in a
+## read (1 here), based on the quality score (EE = sum(10^(-Q/10)))
+
+## "compress = TRUE" means that the files will be gzipped
+
+## "verbose = TRUE" means that information concerning the number of
+## sequences after filtering will be given
+
+########################################################################
+#STEP 3 : Dereplication
+
+## "filterAndTrim" may not write an output file for a sample that loses
+## all of its reads, so only the filtered files that exist are kept :
+has.filt <- file.exists(filts)
+if (!all(has.filt)) {
+  warning("no filtered file for : ",
+          paste(sample.names[!has.filt], collapse = ", "),
+          call. = FALSE)
+}
+
+## "derepFastq" function eliminates all the replications of each sequence in the files
+derep <- derepFastq(filts[has.filt])
+
+## the function annotates each sequence with its abundance
+
+########################################################################