diff --git a/Snakefile b/Snakefile
index 9fd4744208cb2eeb5b3e5ac65411b7f427bf510c..3e1b9ea01b5b75badda6f2fb8f1702e16e3a6fed 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,19 +1,18 @@
 #===============================================================================
 #HEADER
 #===============================================================================
-__author__ = ["Pierre-Edouard Guerin", "Morgane Bruno"]
-__credits__ = ["Pierre-Edouard Guerin", "Virginie Marques", "Morgane Bruno"]
+__author__ = "Pierre-Edouard Guerin"
+__credits__ = ["Pierre-Edouard Guerin", "Virginie Marques"]
 __license__ = "MIT"
-__version__ = "1.3"
-__maintainer__ = "Morgane Bruno"
-__email__ = "morgane.bruno@cefe.cnrs.fr"
-__status__ = "Development"
+__version__ = "1.2"
+__maintainer__ = "Pierre-Edouard Guerin"
+__email__ = "pierre-edouard.guerin@cefe.cnrs.fr"
+__status__ = "Production"
 """
 Codes for scientific papers related to metabarcoding studies

 AUTHORS
 =======
-* Morgane Bruno | morgane.bruno@cefe.cnrs.fr
 * Pierre-Edouard Guerin | pierre-edouard.guerin@cefe.cnrs.fr
 * Virginie Marques | virginie.marques@cefe.cnrs.fr
 * CNRS/CEFE, CNRS/MARBEC | Montpellier, France
@@ -28,38 +27,403 @@
 it will return a demultiplexing.csv file. The output contains all required
 wildcards and related information necessary to run the next workflow steps.
 """
-from snakemake.utils import min_version
-min_version("6.10.0")
-import os, csv
+
+###############################################################################
+# MODULES
+###############################################################################
+import pandas
+import glob
+import os
+from Bio.Seq import Seq
+
+###############################################################################
+# FUNCTIONS
+###############################################################################
+
+## create every results subfolder; os.makedirs also creates the parent
+## "results" folder when it does not exist yet
+def mkdir_results(results_subfolders):
+    for subfolder in results_subfolders:
+        os.makedirs(os.path.join("results", subfolder), exist_ok=True)
+
+
+## read a sample description .dat file and return a dataframe object
+def read_dat(filedat):
+    dfdat = pandas.read_csv(filedat, sep="\t", header=None)
+    dfdat.columns = ['experiment','plaque','barcode','primer5','primer3','F']
+    return dfdat
+
+
+## join the elements of the columns named "*cols" of the dataframe "df" into
+## strings separated by "sep". It returns the joined column elements as a
+## Series of strings.
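+## Illustrative example (hypothetical column values):
+##   str_join(df, '/', 'marker', 'run') -> Series of strings like "teleo/run1"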
+def str_join(df, sep, *cols):
+    from functools import reduce
+    return reduce(lambda x, y: x.astype(str).str.cat(y.astype(str), sep=sep), [df[col] for col in cols])
+
+
+###############################################################################
+# GLOBAL VARIABLES
+###############################################################################
+
+
+results_subfolders = ['00_flags',
+                      '01_settings',
+                      '02_illuminapairedend',
+                      '02b_scaterred',
+                      '03_remove_unaligned',
+                      '04_demultiplex_dat',
+                      '05_demultiplex_flags',
+                      '06_assign_marker_sample_to_sequence',
+                      '07_split_fastq_by_sample',
+                      '08_samples',
+                      '09_dereplicate_samples',
+                      '10_goodlength_samples',
+                      '11_clean_pcrerr_samples',
+                      '12_rm_internal_samples',
+                      '13_cat_samples_into_runs',
+                      '14_dereplicate_runs',
+                      '15_taxonomic_assignment',
+                      '16_remove_annotations',
+                      '17_sort_abundance_assigned_sequences',
+                      '18_table_assigned_sequences'
+                      ]
+
+
+mkdir_results(results_subfolders)
+
+
+
+## check format (CLASSIC or RAPIDRUN)
+if config['format'] == "CLASSIC":
+    print("CLASSIC data: one single marker for each run")
+    dfrClassic = pandas.DataFrame(columns=['plaque','run','sample','projet','marker'])
+    for run in config['fichiers']['dat']:
+        thisRunDatfile = config['fichiers']['dat'][run]
+        thisDat = read_dat(thisRunDatfile)
+        for index, datRow in thisDat.iterrows():
+            thisRow = {
+                "plaque": datRow['plaque'],
+                "run": run,
+                "sample": datRow['plaque'],
+                "projet": datRow['experiment'],
+                "marker": run
+            }
+            dfrClassic = dfrClassic.append(thisRow, ignore_index=True)
+    print(dfrClassic)
+    export_allsample = dfrClassic.to_csv(r'results/01_settings/all_samples_classic.csv', index=None, header=False, sep=';')
+    rapidrunfile = "results/01_settings/all_samples_classic.csv"
+else:
+    print("RAPIDRUN data: many markers for many runs")
+    #configfile: "01_infos/config.yaml"
+    rapidrunfile = config['fichiers']['rapidrun']
+    #rapidrunfile="01_infos/all_samples.tsv"
+
+
+## read 'rapidrun' .tsv file
+dfr = pandas.read_csv(rapidrunfile, sep=";")
+dfr.columns = ['plaque', 'run', 'sample', 'projet', 'marker']
+## remove blacklisted projets
+blacklistedProjets = config['blacklist']['projet']
+dfrp = dfr[dfr.projet.isin(blacklistedProjets) == False]
+## remove blacklisted runs
+blacklistedRuns = config['blacklist']['run']
+dfrm = dfrp[dfrp.run.isin(blacklistedRuns) == False]
+## get list of `run`, `projet` and `marker` wildcards
+uniqRuns = dfrm.run.unique()
+uniqProjets = dfrm.projet.unique()
+uniqMarkers = dfrm.marker.unique()
+
+
+###############################################################################
+# MAIN
+###############################################################################
+
+## dataframes of the `marker` sample description .dat files
+all_dat = {}
+for marker in uniqMarkers:
+    all_dat[marker] = read_dat(config['fichiers']['dat'][marker])
+## init dataframe with all columns we want to output
+dfMulti = pandas.DataFrame(columns=["demultiplex","projet","marker","run","plaque","sample","barcode5","barcode3","primer5","primer3","min_f","min_r","lenBarcode5","lenBarcode3"])
+## fill the dataframe; each row belongs to one `projet`/`marker`/`run`/`sample` wildcards combination
+for run in uniqRuns:
+    for marker in uniqMarkers:
+        runMarkerDfrm = dfrm[(dfrm.run == run) & (dfrm.marker == marker)]
+        for plaque in runMarkerDfrm.plaque:
+            selectedRow = runMarkerDfrm[(runMarkerDfrm.plaque == plaque)]
+            projet = selectedRow['projet'].values[0]
+            sample = selectedRow['sample'].values[0]
+            plaque_dat = all_dat[marker][(all_dat[marker].plaque == plaque)]
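+            ## the 3' barcode is the reverse complement of the 5' barcode,
+            ## e.g. str(Seq("aattc").reverse_complement()) -> "gaatt"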
+            barcode5 = plaque_dat["barcode"].values[0]
+            barcode3 = str(Seq(barcode5).reverse_complement())
+            primer5 = plaque_dat["primer5"].values[0]
+            #primer3=str(Seq(plaque_dat["primer3"].values[0]).reverse_complement())
+            ## note: here we deliberately take primer3 from the ngsfilter .dat
+            ## file as-is, not its reverse complement
+            primer3 = plaque_dat["primer3"].values[0]
+            lenBarcode5 = len(barcode5)
+            lenBarcode3 = len(barcode3)
+            min_f = int(len(primer5)*2/3)
+            min_r = int(len(primer3)*2/3)
+            sa_fastq = projet+"_"+marker+"/"+sample+".fastq"
+            ru_fastq = run+".fastq"
+            #print(marker, run, plaque, projet, sample)
+            #print(ru_fastq)
+            #print(sa_fastq)
+            #print(barcode5, barcode3, primer5, primer3)
+            #print(min_f, min_r, lenBarcode5, lenBarcode3)
+            #print("========================")
+            demultiplex = projet+"/"+marker+"/"+run+"/"+sample
+            projmarkrun = projet+"/"+marker+"/"+run
+            thisRow = {
+                "demultiplex": demultiplex,
+                "projmarkrun": projmarkrun,
+                "projet": projet,
+                "marker": marker,
+                "run": run,
+                "plaque": plaque,
+                "sample": sample,
+                "barcode5": barcode5,
+                "barcode3": barcode3,
+                "primer5": primer5,
+                "primer3": primer3,
+                "min_f": min_f,
+                "min_r": min_r,
+                "lenBarcode5": lenBarcode5,
+                "lenBarcode3": lenBarcode3,
+            }
+            #print(thisRow)
+            dfMulti = dfMulti.append(thisRow, ignore_index=True)
+## write the demultiplexing .csv file
+export_csv = dfMulti.to_csv(r'results/01_settings/all_demultiplex.csv', index=None, header=True)
+## print an overview of the dataframe we wrote
+print(dfMulti)
+
+if config['format'] != "CLASSIC":
+    rapidrunfile = config['fichiers']['rapidrun']
+else:
+    rapidrunfile = "results/01_settings/all_samples_classic.csv"
+    if not os.path.isfile(rapidrunfile):
+        raise Exception("ERROR: "+rapidrunfile+" is not a file. You must run step 01_settings first in order to generate this file for the CLASSIC format.")
+
+## read the rapidrun file as dataframe
+dfr = pandas.read_csv(rapidrunfile, sep=";")
+dfr.columns = ['plaque', 'run', 'sample', 'projet', 'marker']
+## remove blacklisted projets
+blacklistedProjets = config['blacklist']['projet']
+dfrp = dfr[dfr.projet.isin(blacklistedProjets) == False]
+## remove blacklisted runs
+blacklistedRuns = config['blacklist']['run']
+dfrm = dfrp[dfrp.run.isin(blacklistedRuns) == False]
+## names of `run` wildcards
+uniqRuns = dfrm.run.unique()
+## set number of chunks
+listChunks = [*range(1, (config['illuminapairedend']['nb_chunk']+1))]
+
+
+
+## keep only marker and run information for demultiplexing
+dfRunMarker = dfrm.loc[:, ('marker','run')]
+dfRunMarker['runMarker'] = str_join(dfrm, '/', 'marker', 'run')
+dfRunMarker['projMarker'] = str_join(dfrm, '/', 'projet', 'marker')
+dfRunMarker = dfRunMarker.drop_duplicates()
+
+
+listRunMarker = list(dfRunMarker.runMarker)
+dfRunMarker['projMarkRun'] = dfRunMarker['projMarker']+"/"+dfRunMarker['run']
+## write .dat files for ngsfilter for each projet/marker/run
+for projmarkerrun in dfRunMarker['projMarkRun'].unique():
+    thisProj = projmarkerrun.split("/")[0]
+    thisMarker = projmarkerrun.split("/")[1]
+    thisRun = projmarkerrun.split("/")[2]
+    fileName = "results/04_demultiplex_dat/" + str(thisProj) + "_" + str(thisMarker) + "_" + str(thisRun) + ".tsv"
+    dfprojmarkerrunMulti = dfMulti[(dfMulti.projet == thisProj) & (dfMulti.marker == thisMarker) & (dfMulti.run == thisRun)]
+    if not dfprojmarkerrunMulti.empty:
+        print("Writing sample description files for project and marker", thisProj, thisMarker, ":", fileName)
+        thisExp = list(dfprojmarkerrunMulti['projet'])
+        thisSample = list(dfprojmarkerrunMulti['sample'])
+        thisTags = list(dfprojmarkerrunMulti['barcode5'])
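+        ## gather the six ngsfilter columns: #exp, sample, tags,
+        ## forward_primer, reverse_primer, extra_information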
+        ThisForward_primer = list(dfprojmarkerrunMulti['primer5'])
+        ThisReverse_primer = list(dfprojmarkerrunMulti['primer3'])
+        This_extra_info = list(pandas.Series(["F"]).repeat(len(thisExp)))
+        thisDat = { '#exp': thisExp, 'sample': thisSample, 'tags': thisTags, 'forward_primer': ThisForward_primer, 'reverse_primer': ThisReverse_primer, 'extra_information': This_extra_info }
+        dfDat = pandas.DataFrame(thisDat, columns=['#exp', 'sample', 'tags', 'forward_primer', 'reverse_primer', 'extra_information'])
+        dfDat = dfDat.drop_duplicates()
+        #print(dfDat)
+        ## remove the file first if it already exists
+        files_present = glob.glob(fileName)
+        if files_present:
+            os.remove(fileName)
+        dfDat.to_csv(fileName, index=False, sep="\t")
+    else:
+        print("Writing sample description files for project and marker", thisProj, thisMarker, ": WARNING, no information about it; check the file", rapidrunfile, "or blacklist this project")
+
+dfRunMarker['dat'] = ["results/04_demultiplex_dat/"+ele.replace('/','_')+".tsv" for ele in dfRunMarker['projMarkRun']]
+
+
+projetMarkerRuns = dfMulti.loc[:, ('projmarkrun','marker','projet')].drop_duplicates()
+dfpmr = projetMarkerRuns
+## attribute reference database files to each row with the corresponding `marker`
+if config['format'] == "CLASSIC":
+    thisMarker = str(list(config["assign_taxon"]["bdr"])[0])
+    markerDic = {}
+    for dmarker in dfpmr['marker']:
+        markerDic[dmarker] = thisMarker
+    dfpmr['bdr'] = dfpmr['marker'].map(markerDic).map(config["assign_taxon"]["bdr"])
+    dfpmr['fasta'] = dfpmr['marker'].map(markerDic).map(config["assign_taxon"]["fasta"])
+else:
+    dfpmr['bdr'] = dfpmr['marker'].map(config["assign_taxon"]["bdr"])
+    dfpmr['fasta'] = dfpmr['marker'].map(config["assign_taxon"]["fasta"])
+
+## also perform taxonomic assignment with a custom reference database
+if config['custom_baseref']:
+    print("also perform taxonomic assignment with custom reference database")
+    for crmarker in config['assign_customtax']['bdr'].keys():
+        print(crmarker)
+        df_crmarker = dfpmr[(dfpmr.marker == crmarker)].copy()
+        df_crmarker['bdr'] = config['assign_customtax']['bdr'][crmarker]
+        df_crmarker['fasta'] = config['assign_customtax']['fasta'][crmarker]
+        df_crmarker['projmarkrun'] = df_crmarker['projmarkrun'].astype(str) + '_custom'
+        dfpmr = pandas.concat([dfpmr, df_crmarker], ignore_index=True)
+    customprojmarkrun = dfpmr[dfpmr['projmarkrun'].str.contains('_custom')]['projmarkrun']
+    realprojmarkrun = customprojmarkrun.str.replace('_custom','')
+    d_ln_custom = { 'customprojmarkrun': customprojmarkrun, 'realprojmarkrun': realprojmarkrun }
+    df_ln_custom = pandas.DataFrame(d_ln_custom)
+    print(df_ln_custom)
+    customLnTsvFile = 'results/01_settings/customlinkprojtmarkrun.tsv'
+    df_ln_custom.to_csv(r'./'+customLnTsvFile, index=None, header=False, sep='\t')
+
+
+## display selected `projet`/`marker`/`run` with related information
+print(dfpmr)
+
 ###############################################################################
 # WORKFLOW
 ###############################################################################
-#_rules
-include: "rules/common.smk"
+
+rule all:
+    input:
+        'results/00_flags/table_assigned_sequences.flag'
+
 if config['illuminapairedend']['nb_chunk'] != 0:
     include: "rules/split_fastq.smk"
     include: "rules/chunk_illuminapairedend.smk"
 else:
     include: "rules/illuminapairedend.smk"
 include: "rules/remove_unaligned.smk"
-include: "rules/ngsfilter.smk"
-include: "rules/obisplit.smk"
-include: "rules/obiuniq.smk"
+
+
+rule flag_assembly:
+    input:
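+        ## every run must be assembled and cleaned before demultiplexing starts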
+        expand('results/03_remove_unaligned/{run}.ali.fastq', run=uniqRuns)
+    output:
+        touch('results/00_flags/assembly.flag')
+rule flag_demultiplex_init:
+    input:
+        'results/00_flags/assembly.flag'
+    output:
+        expand('results/05_demultiplex_flags/{demultiplex}.flag', demultiplex=dfRunMarker.projMarkRun)
+    run:
+        for dem in dfRunMarker.projMarkRun:
+            fileFlag = "results/05_demultiplex_flags/"+dem+".flag"
+            open(fileFlag, 'a').close()
+
+
+include: "rules/assign_marker_sample_to_sequence.smk"
+include: "rules/split_fastq_by_sample.smk"
+
+rule flag_demultiplex_done:
+    input:
+        expand('results/07_split_fastq_by_sample/flags/{demultiplex}.done', demultiplex=dfRunMarker.projMarkRun)
+    output:
+        touch('results/00_flags/demultiplex.flag')
+
+
+
+rule check_samples:
+    input:
+        'results/00_flags/demultiplex.flag'
+    output:
+        expand('results/08_samples/{sample}.fasta', sample=dfMulti['demultiplex'])
+    run:
+        for sample in dfMulti['demultiplex']:
+            file_sample = "results/07_split_fastq_by_sample/"+sample+".fasta"
+            if not os.path.exists(file_sample):
+                print("WARNING: results/07_split_fastq_by_sample/"+sample+".fasta not found. We removed it from this analysis.")
+                open("results/08_samples/"+str(sample)+".fasta", 'a').close()
+            else:
+                os.replace("results/07_split_fastq_by_sample/"+str(sample)+".fasta", "results/08_samples/"+str(sample)+".fasta")
+
+
+
+
+include: "rules/dereplicate_samples.smk"
 include: "rules/goodlength_samples.smk"
 include: "rules/clean_pcrerr_samples.smk"
 include: "rules/rm_internal_samples.smk"
-include: "rules/samples_to_runs.smk"
+
+
+rule flag_filtering_done:
+    input:
+        expand('results/12_rm_internal_samples/{sample}.c.r.l.u.fasta', sample=dfMulti['demultiplex'])
+    output:
+        touch('results/00_flags/filtering.flag')
+
+
+if config['custom_baseref']:
+    rule custom_bdr_cat_samples_into_runs:
+        input:
+            'results/00_flags/filtering.flag'
+        output:
+            expand('results/13_cat_samples_into_runs/{projmarkrun}.fasta', projmarkrun=dfpmr['projmarkrun'])
+        params:
+            cltf= customLnTsvFile
+        shell:
+            '''
+            bash scripts/cat_samples_into_runs.sh
+            bash scripts/link_custom_runs.sh {params.cltf}
+            '''
+else:
+    rule cat_samples_into_runs:
+        input:
+            'results/00_flags/filtering.flag'
+        output:
+            expand('results/13_cat_samples_into_runs/{projmarkrun}.fasta', projmarkrun=dfpmr['projmarkrun'])
+        shell:
+            '''
+            bash scripts/cat_samples_into_runs.sh
+            '''
+
 include: "rules/dereplicate_runs.smk"
-include: "rules/taxonomic_assignment.smk"
+if config['custom_baseref']:
+    include: "rules/custom_taxonomic_assignment.smk"
+else:
+    include: "rules/taxonomic_assignment.smk"
 include: "rules/remove_annotations.smk"
 include: "rules/sort_abundance_assigned_sequences.smk"
 include: "rules/table_assigned_sequences.smk"
-#_targets
-rule all:
+
+## copy and rename final results
+rule flag_table_assigned_sequences:
     input:
-        get_targets()
+        expand('results/18_table_assigned_sequences/{projmarkrun}.csv', projmarkrun=dfpmr['projmarkrun'])
+    output:
+        touch('results/00_flags/table_assigned_sequences.flag')
+    run:
+        print("Final results...", end='')
+        for pmr in dfpmr['projmarkrun']:
+            thisProjet = pmr.split('/')[0]
+            thisMarker = pmr.split('/')[1]
+            thisRun = pmr.split('/')[2]
+            tableNameFile = "results/18_table_assigned_sequences/"+pmr+".csv"
+            if "custom" in thisRun:
+                newTableNameFile = "results/"+thisProjet+"_"+thisMarker+"_"+thisRun+"_ecotag.csv"
+            else:
+                newTableNameFile = "results/"+thisProjet+"_"+thisMarker+"_"+thisRun+"_ncbi_ecotag.csv"
+            print(thisProjet+"_"+thisMarker+"_"+thisRun+", ", end='')
+            os.system("cp {0} {1}".format(tableNameFile, newTableNameFile))
+        print("done")
diff --git a/rules/chunk_illuminapairedend.smk b/rules/chunk_illuminapairedend.smk
index 7d966bdaf9d22a0fa694fdc26007c7922d3c1ee8..ad167bd555cf4c8c37a9502c508dbf00169adf13 100644
--- a/rules/chunk_illuminapairedend.smk
+++ b/rules/chunk_illuminapairedend.smk
@@ -1,13 +1,14 @@
-__author__ = ["Pierre-Edouard Guerin", "Morgane Bruno"]
+__author__ = "Pierre-Edouard Guerin"
 __license__ = "MIT"
+
 ## paired-end alignment on a scattered fastq chunk, then keep reads with quality > 40
 checkpoint illuminapairedend:
     input:
         R1='results/02b_scaterred/{run}_R1_{chunk}.fastq',
         R2='results/02b_scaterred/{run}_R2_{chunk}.fastq'
     output:
-        fq=temp('results/02_illuminapairedend/{run}_{chunk}.fastq')
+        fq='results/02_illuminapairedend/{run}_{chunk}.fastq'
     conda:
         '../envs/obitools_envs.yaml'
     singularity:
@@ -18,13 +19,20 @@ checkpoint illuminapairedend:
         s_min=config["illuminapairedend"]["s_min"],
         gathered=lambda wildcards: 'results/02_illuminapairedend/' + wildcards.run + '.fastq',
     shell:
-        '''illuminapairedend -r {input.R2} {input.R1} --score-min={params.s_min} > {output.fq} 2> {log}'''
+        '''illuminapairedend -r {input.R2} {input.R1} --score-min={params.s_min} > {output.fq} 2> {log};'''
+
+## function to get the list of chunk files for each run
+def get_chunks(wildcards):
+    meschunks = expand(rules.illuminapairedend.output.fq, **wildcards, chunk=listChunks)
+    return meschunks

 ## gather `chunk` files into a single `run` fastq file
 rule merge_chunks:
     input:
-        expand('results/02_illuminapairedend/{{run}}_{chunk}.fastq', chunk = range(1, config['illuminapairedend']['nb_chunk']+1))
+        get_chunks
     output:
         fq='results/02_illuminapairedend/{run}.fastq'
     shell:
         '''cat {input} > {output}'''
+
+
diff --git a/rules/clean_pcrerr_samples.smk b/rules/clean_pcrerr_samples.smk
index 5acd4f62aff5c6a8ead68dd888c9c0c60cbb32bf..7b1cbb65de67c4334ebb6e23b6cebf3e7a360810 100644
--- a/rules/clean_pcrerr_samples.smk
+++ b/rules/clean_pcrerr_samples.smk
@@ -5,19 +5,19 @@ __license__ = "MIT"
 ### Clean the sequences for PCR/sequencing errors (sequence variants)
 rule clean_pcrerr_samples:
     input:
-        'results/10_goodlength_samples/{demultiplex}/{sample}.l.u.fasta'
+        'results/10_goodlength_samples/{sample}.l.u.fasta'
     output:
-        temp('results/11_clean_pcrerr_samples/{demultiplex}/{sample}.r.l.u.fasta')
+        'results/11_clean_pcrerr_samples/{sample}.r.l.u.fasta'
     conda:
         '../envs/obitools_envs.yaml'
     singularity:
         config["singularity"]["obitools"]
     log:
-        'logs/11_clean_pcrerr_samples/{demultiplex}/{sample}.log'
+        'logs/11_clean_pcrerr_samples/{sample}.log'
     params:
+        dmulti= lambda wildcards: dfMulti[dfMulti.demultiplex == wildcards.sample].to_dict('records')[0],
         r=config["clean_pcrerr_samples"]["r"]
     shell:
-        '''
-        mkdir -p $(dirname {output})
-        [ -s {input} ] && obiclean -r {params.r} {input} > {output} 2> {log} || touch {output}
-        '''
+        '''
+        mkdir -p results/11_clean_pcrerr_samples/{params.dmulti[projmarkrun]};
+        if [[ -s {input} ]]; then obiclean -r {params.r} {input} > {output} 2> {log}; else touch {output} 2> {log}; fi'''
\ No newline at end of file
diff --git a/rules/custom_taxonomic_assignment.smk b/rules/custom_taxonomic_assignment.smk
index 13a3597ff99bd45c5b72af8ca09f4dbd986f7bb6..a46c5d06a5f9d132dfbb42d69b91a2a8791b6e9d 100644
--- a/rules/custom_taxonomic_assignment.smk
+++ b/rules/custom_taxonomic_assignment.smk
@@ -14,16 +14,18 @@ rule custom_taxonomic_assignment:
         '../envs/obitools_envs.yaml'
     singularity:
         config["singularity"]["obitools"]
+    params:
+        drun= lambda wildcards: dfpmr[dfpmr.projmarkrun == wildcards.projmarkrun].to_dict('records')[0],
     log:
         'logs/15_taxonomic_assignment/{projmarkrun}.log'
     shell:
         '''
-        mkdir -p $(dirname {output});
+        mkdir -p results/15_taxonomic_assignment/{params.drun[projet]}/{params.drun[marker]};
        SUB="_custom"
        STR={input}
        if [[ "$STR" == *"$SUB"* ]]; then
-            ecotag -t {params.drun[bdr]} -R {params.drun[fasta]} {input} > {output} 2> {log}
-        else
+            ecotag -t {params.drun[bdr]} -R {params.drun[fasta]} {input} > {output} 2> {log}
+        else
            ecotag -d {params.drun[bdr]} -R {params.drun[fasta]} {input} > {output} 2> {log}
        fi
-    '''
+    '''
\ No newline at end of file
diff --git a/rules/dereplicate_runs.smk b/rules/dereplicate_runs.smk
index cc1cf124473d0192af10cbada2eb336daa2c6355..f10e40d3c8a06904a01318cce67d6b06d15ecd4e 100644
--- a/rules/dereplicate_runs.smk
+++ b/rules/dereplicate_runs.smk
@@ -5,17 +5,19 @@ __license__ = "MIT"
 ## Dereplicate and merge samples together
 rule dereplicate_runs:
     input:
-        'results/13_cat_samples_into_runs/{demultiplex}.fasta'
+        'results/13_cat_samples_into_runs/{projmarkrun}.fasta'
     output:
-        'results/14_dereplicate_runs/{demultiplex}.uniq.fasta'
+        'results/14_dereplicate_runs/{projmarkrun}.uniq.fasta'
     conda:
         '../envs/obitools_envs.yaml'
-    container:
+    singularity:
         config["singularity"]["obitools"]
+    params:
+        drun= lambda wildcards: dfpmr[dfpmr.projmarkrun == wildcards.projmarkrun].to_dict('records')[0],
     log:
-        'logs/14_dereplicate_runs/{demultiplex}.log'
+        'logs/14_dereplicate_runs/{projmarkrun}.log'
     shell:
         '''
-        mkdir -p $(dirname {output})
+        mkdir -p results/14_dereplicate_runs/{params.drun[projet]}/{params.drun[marker]}
         obiuniq -m sample {input} > {output} 2> {log}
-        '''
+        '''
\ No newline at end of file
diff --git a/rules/dereplicate_samples.smk b/rules/dereplicate_samples.smk
index f6053140a22f96d003e5436ecaa4aeeb230e3fa6..2e51b0de092156f8429ae1dd024f1decc566dd76 100644
--- a/rules/dereplicate_samples.smk
+++ b/rules/dereplicate_samples.smk
@@ -17,4 +17,4 @@ rule dereplicate_samples:
         dmulti= lambda wildcards: dfMulti[dfMulti.demultiplex == wildcards.sample].to_dict('records')[0],
     shell: '''
         mkdir -p results/09_dereplicate_samples/{params.dmulti[projmarkrun]};
-        if [[ -s {input} ]]; then obiuniq -m sample {input} > {output} 2> {log}; else touch {output}; fi;'''
+        if [[ -s {input} ]]; then obiuniq -m sample {input} > {output} 2> {log}; else touch {output}; fi;'''
\ No newline at end of file
diff --git a/rules/goodlength_samples.smk b/rules/goodlength_samples.smk
index a156734dcece856b7a875e608abcbb0019e0dbcc..0c070766bf2f629faf859719d1b7373496e8e21a 100644
--- a/rules/goodlength_samples.smk
+++ b/rules/goodlength_samples.smk
@@ -5,21 +5,21 @@ __license__ = "MIT"
 ## keep only sequences longer than 20bp, with no IUPAC ambiguity, and with total coverage greater than 10 reads
 rule goodlength_samples:
     input:
-        'results/09_dereplicate_samples/{demultiplex}/{sample}.uniq.fasta'
+        'results/09_dereplicate_samples/{sample}.uniq.fasta'
     output:
-        temp('results/10_goodlength_samples/{demultiplex}/{sample}.l.u.fasta')
+        'results/10_goodlength_samples/{sample}.l.u.fasta'
     conda:
         '../envs/obitools_envs.yaml'
     singularity:
         config["singularity"]["obitools"]
     log:
-        'logs/10_goodlength_samples/{demultiplex}/{sample}.log'
-    priority: 50
+        'logs/10_goodlength_samples/{sample}.log'
     params:
+        dmulti= lambda wildcards: dfMulti[dfMulti.demultiplex == wildcards.sample].to_dict('records')[0],
         seq_count=config["good_length_samples"]["seq_count"],
         seq_length_min=config["good_length_samples"]["seq_length_min"]
     shell:
         '''
-        mkdir -p $(dirname {output})
-        [ -s {input} ] && obigrep -p "count>{params.seq_count}" -s "^[ACGT]+$" -p "seq_length>{params.seq_length_min}" {input} > {output} 2> {log} || touch {output}
-        '''
+        mkdir -p results/10_goodlength_samples/{params.dmulti[projmarkrun]};
+        if [[ -s {input} ]]; then obigrep -p 'count>{params.seq_count}' -s '^[ACGT]+$' -p 'seq_length>{params.seq_length_min}' {input} > {output} 2> {log};
+        else touch {output}; fi'''
\ No newline at end of file
diff --git a/rules/illuminapairedend.smk b/rules/illuminapairedend.smk
index b9ca031196f03f530e2796ca7ecf70d42d98ffc2..04424d945d212d60aa6eaa45baeacd35657e28a1 100644
--- a/rules/illuminapairedend.smk
+++ b/rules/illuminapairedend.smk
@@ -16,7 +16,7 @@ rule illuminapairedend:
     log:
         'logs/02_illuminapairedend/{run}.log'
     params:
-        s_min=config["illuminapairedend"]["s_min"]
+        s_min=config["illuminapairedend"]["s_min"]
     shell:
         '''illuminapairedend -r {input.R2} {input.R1} --score-min={params.s_min} > {output.fq} 2> {log}'''
diff --git a/rules/remove_annotations.smk b/rules/remove_annotations.smk
index 12416ccdfabd3a95d6f65e313090c7addafd45b7..ed5d371a0c8e3f31b558eec3bc2a8a887d2fcf9f 100644
--- a/rules/remove_annotations.smk
+++ b/rules/remove_annotations.smk
@@ -5,25 +5,26 @@ __license__ = "MIT"
 ## Some unneeded attributes can be removed at this stage
 rule remove_annotations:
     input:
-        'results/15_taxonomic_assignment/{demultiplex}_{bdr}.tag.u.fasta'
+        'results/15_taxonomic_assignment/{projmarkrun}.tag.u.fasta'
     output:
-        'results/16_remove_annotations/{demultiplex}_{bdr}.a.t.u.fasta'
+        'results/16_remove_annotations/{projmarkrun}.a.t.u.fasta'
     conda:
         '../envs/obitools_envs.yaml'
     singularity:
         config["singularity"]["obitools"]
+    params:
+        drun= lambda wildcards: dfpmr[dfpmr.projmarkrun == wildcards.projmarkrun].to_dict('records')[0],
     log:
-        'logs/16_remove_annotations/{demultiplex}_{bdr}.log'
+        'logs/16_remove_annotations/{projmarkrun}.log'
     shell:
         '''
-        mkdir -p $(dirname {output});
+        mkdir -p results/16_remove_annotations/{params.drun[projet]}/{params.drun[marker]};
         obiannotate --delete-tag=scientific_name_by_db --delete-tag=obiclean_samplecount \
-        --delete-tag=obiclean_count --delete-tag=obiclean_singletoncount \
-        --delete-tag=obiclean_cluster --delete-tag=obiclean_internalcount \
-        --delete-tag=obiclean_head --delete-tag=obiclean_headcount \
-        --delete-tag=id_status --delete-tag=rank_by_db --delete-tag=obiclean_status \
-        --delete-tag=seq_length_ori --delete-tag=sminL --delete-tag=sminR \
-        --delete-tag=reverse_score --delete-tag=reverse_primer --delete-tag=reverse_match --delete-tag=reverse_tag \
-        --delete-tag=forward_tag --delete-tag=forward_score --delete-tag=forward_primer --delete-tag=forward_match \
-        --delete-tag=tail_quality {input} > {output} 2> {log}
-        '''
+        --delete-tag=obiclean_count --delete-tag=obiclean_singletoncount \
+        --delete-tag=obiclean_cluster --delete-tag=obiclean_internalcount \
+        --delete-tag=obiclean_head --delete-tag=obiclean_headcount \
+        --delete-tag=id_status --delete-tag=rank_by_db --delete-tag=obiclean_status \
+        --delete-tag=seq_length_ori --delete-tag=sminL --delete-tag=sminR \
+        --delete-tag=reverse_score --delete-tag=reverse_primer --delete-tag=reverse_match --delete-tag=reverse_tag \
+        --delete-tag=forward_tag --delete-tag=forward_score --delete-tag=forward_primer --delete-tag=forward_match \
+        --delete-tag=tail_quality {input} > {output} 2> {log}'''
\ No newline at end of file
diff --git a/rules/remove_unaligned.smk b/rules/remove_unaligned.smk
index c46176d7ed063bb0e1c95b9cf48ceafb6fda28ad..fa9ca266de93920d021ed1f8f83bda5b508aba38 100644
--- a/rules/remove_unaligned.smk
+++ b/rules/remove_unaligned.smk
@@ -1,12 +1,13 @@
 __author__ = "Pierre-Edouard Guerin"
 __license__ = "MIT"
+
 ## Remove unaligned sequence records
 rule remove_unaligned:
     input:
-        'results/02_illuminapairedend/{run}.fastq'
+        fq='results/02_illuminapairedend/{run}.fastq'
     output:
-        'results/03_remove_unaligned/{run}.ali.fastq'
+        ali='results/03_remove_unaligned/{run}.ali.fastq'
     conda:
         '../envs/obitools_envs.yaml'
     singularity:
@@ -14,4 +15,4 @@ rule remove_unaligned:
     log:
         'logs/03_remove_unaligned/{run}.log'
     shell:
-        '''obigrep -p 'mode!=\"joined\"' {input} > {output} 2> {log}'''
+        '''obigrep -p 'mode!=\"joined\"' {input.fq} > {output.ali} 2> {log}'''
diff --git a/rules/rm_internal_samples.smk b/rules/rm_internal_samples.smk
index 979c93746e193eb4b002d77f98589c41b5b92189..35ab385c2c5dee8ce629af3dfb55188c4696053e 100644
--- a/rules/rm_internal_samples.smk
+++ b/rules/rm_internal_samples.smk
@@ -5,17 +5,19 @@ __license__ = "MIT"
 ## Remove sequences which are classified as 'internal' by obiclean
 rule rm_internal_samples:
     input:
-        'results/11_clean_pcrerr_samples/{demultiplex}/{sample}.r.l.u.fasta'
+        'results/11_clean_pcrerr_samples/{sample}.r.l.u.fasta'
     output:
-        temp('results/12_rm_internal_samples/{demultiplex}/{sample}.c.r.l.u.fasta')
+        'results/12_rm_internal_samples/{sample}.c.r.l.u.fasta'
     conda:
         '../envs/obitools_envs.yaml'
+    params:
+        dmulti= lambda wildcards: dfMulti[dfMulti.demultiplex == wildcards.sample].to_dict('records')[0],
     singularity:
         config["singularity"]["obitools"]
     log:
-        'logs/12_rm_internal_samples/{demultiplex}/{sample}.log'
+        'logs/12_rm_internal_samples/{sample}.log'
     shell:
         '''
-        mkdir -p $(dirname {output})
-        [ -s {input} ] && obigrep -p "obiclean_internalcount == 0" {input} > {output} 2> {log} || touch {output}
+        mkdir -p results/12_rm_internal_samples/{params.dmulti[projmarkrun]};
+        if [[ -s {input} ]]; then obigrep -p "obiclean_internalcount == 0" {input} > {output} 2> {log} ; else touch {output} 2> {log}; fi;
         '''
diff --git a/rules/sort_abundance_assigned_sequences.smk b/rules/sort_abundance_assigned_sequences.smk
index a9b172c048adba5591e7846a4ff954cac8945475..84e6381afee6a5bb64a7170127bedaa02d945e45 100644
--- a/rules/sort_abundance_assigned_sequences.smk
+++ b/rules/sort_abundance_assigned_sequences.smk
@@ -5,17 +5,18 @@ __license__ = "MIT"
 ## The sequences can be sorted by decreasing order of count
 rule sort_abundance_assigned_sequences:
     input:
-        'results/16_remove_annotations/{demultiplex}_{bdr}.a.t.u.fasta'
+        'results/16_remove_annotations/{projmarkrun}.a.t.u.fasta'
     output:
-        'results/17_sort_abundance_assigned_sequences/{demultiplex}_{bdr}.s.a.t.u.fasta'
+        'results/17_sort_abundance_assigned_sequences/{projmarkrun}.s.a.t.u.fasta'
     conda:
         '../envs/obitools_envs.yaml'
     singularity:
         config["singularity"]["obitools"]
+    params:
+        drun= lambda wildcards: dfpmr[dfpmr.projmarkrun == wildcards.projmarkrun].to_dict('records')[0],
     log:
-        'logs/17_sort_abundance_assigned_sequences/{demultiplex}_{bdr}.log'
+        'logs/17_sort_abundance_assigned_sequences/{projmarkrun}.log'
     shell:
         '''
-        mkdir -p $(dirname {output})
-        obisort -k count -r {input} > {output} 2> {log}
-        '''
+        mkdir -p results/17_sort_abundance_assigned_sequences/{params.drun[projet]}/{params.drun[marker]};
+        obisort -k count -r {input} > {output} 2> {log}'''
\ No newline at end of file
diff --git a/rules/split_fastq.smk b/rules/split_fastq.smk
index a37748435972176121544641c1615cfa59b0a97a..3fd61e4666b3a21a33b3b38376bd09378c747ede 100644
--- a/rules/split_fastq.smk
+++ b/rules/split_fastq.smk
@@ -1,15 +1,15 @@
 __author__ = "Pierre-Edouard Guerin"
 __license__ = "MIT"

-listChunks = [*range(1,(config['illuminapairedend']['nb_chunk']+1))]
+
 ## scatter a fastq file
 rule split_fastq:
     input:
         R1=config["fichiers"]["folder_fastq"]+'{run}_R1.fastq.gz',
         R2=config["fichiers"]["folder_fastq"]+'{run}_R2.fastq.gz'
     output:
-        R1=temp('results/02b_scaterred/{run}_R1_{chunk}.fastq'),
-        R2=temp('results/02b_scaterred/{run}_R2_{chunk}.fastq')
+        R1='results/02b_scaterred/{run}_R1_{chunk}.fastq',
+        R2='results/02b_scaterred/{run}_R2_{chunk}.fastq'
     conda:
         '../envs/fastqsplitter_envs.yaml'
     params:
@@ -17,3 +17,5 @@ rule split_fastq:
         R2=lambda wildcards: '-o '+' -o '.join([ 'results/02b_scaterred/' + wildcards.run +'_R2_'+ str(i) + '.fastq' for i in listChunks])
     shell:
         '''CMD="fastqsplitter -i {input.R1}"; eval $CMD {params.R1}; CMD="fastqsplitter -i {input.R2}"; eval $CMD {params.R2};'''
+
+
diff --git a/rules/table_assigned_sequences.smk b/rules/table_assigned_sequences.smk
index d2718b86219e340bc28857ac802c652a34c3c10b..60898f314851e4d84d4bfae5dfc0516eeb3cfec1 100644
--- a/rules/table_assigned_sequences.smk
+++ b/rules/table_assigned_sequences.smk
@@ -5,17 +5,17 @@ __license__ = "MIT"
 ## Generate a table of final results
 rule table_assigned_sequences:
     input:
-        rules.sort_abundance_assigned_sequences.output
+        'results/17_sort_abundance_assigned_sequences/{projmarkrun}.s.a.t.u.fasta'
     output:
-        'results/{demultiplex}_{bdr}_ecotag.csv'
-    log:
-        'logs/18_table_assigned_sequences/{demultiplex}_{bdr}_obitab.log'
+        'results/18_table_assigned_sequences/{projmarkrun}.csv'
     conda:
         '../envs/obitools_envs.yaml'
-    container:
+    singularity:
        config["singularity"]["obitools"]
+    params:
+        drun= lambda wildcards: dfpmr[dfpmr.projmarkrun == wildcards.projmarkrun].to_dict('records')[0],
+    log:
+        'logs/18_table_assigned_sequences/{projmarkrun}.log'
     shell:
-        '''
-        obitab -o {input} > {output} 2> {log}
-        '''
-
+        '''mkdir -p results/18_table_assigned_sequences/{params.drun[projet]}/{params.drun[marker]};
+        obitab -o {input} > {output} 2> {log}'''
diff --git a/rules/taxonomic_assignment.smk b/rules/taxonomic_assignment.smk
index 4530a32de881c5908bb7bfae289dc11e7a17c423..73acc961660d98240773f235911fae33c044e9da 100644
--- a/rules/taxonomic_assignment.smk
+++ b/rules/taxonomic_assignment.smk
@@ -5,46 +5,21 @@ __license__ = "MIT"
 ## Assign each sequence to a taxon
 rule taxonomic_assignment:
     input:
-        'results/14_dereplicate_runs/{demultiplex}.uniq.fasta'
+        'results/14_dereplicate_runs/{projmarkrun}.uniq.fasta'
     output:
-        'results/15_taxonomic_assignment/{demultiplex}_ncbi.tag.u.fasta'
+        'results/15_taxonomic_assignment/{projmarkrun}.tag.u.fasta'
     threads:
         workflow.cores * 0.5
     conda:
         '../envs/obitools_envs.yaml'
     singularity:
         config["singularity"]["obitools"]
-    log:
-        'logs/15_taxonomic_assignment/{demultiplex}.log'
     params:
-        db = lambda w: config['assign_taxon']['bdr'][str({w.demultiplex}).split('/')[1]],
-        fasta = lambda w: config['assign_taxon']['fasta'][str({w.demultiplex}).split('/')[1]]
+        drun= lambda wildcards: dfpmr[dfpmr.projmarkrun == wildcards.projmarkrun].to_dict('records')[0],
+    log:
+        'logs/15_taxonomic_assignment/{projmarkrun}.log'
     shell:
         '''
-        mkdir -p $(dirname {output})
-        ecotag -d {params.db} -R {params.fasta} {input} > {output} 2> {log}
-        '''
-
-if config['custom_baseref']:
-    rule custom_bdr_taxonomic_assignment:
-        input:
-            get_custom_assignment_input
-        output:
-            'results/15_taxonomic_assignment/{demultiplex}_custom.tag.u.fasta'
-        threads:
-            workflow.cores * 0.5
-        conda:
-            '../envs/obitools_envs.yaml'
-        singularity:
-            config["singularity"]["obitools"]
-        log:
-            'logs/15_taxonomic_assignment/{demultiplex}.log'
-        params:
-            db = lambda w: config['assign_customtax']['bdr'][str({w.demultiplex}).split('/')[1]],
-            fasta = lambda w: config['assign_customtax']['fasta'][str({w.demultiplex}).split('/')[1]]
-        shell:
-            '''
-            mkdir -p $(dirname {output})
-            ecotag -t {params.db} -R {params.fasta} {input} > {output} 2> {log}
-            '''
-
+        mkdir -p results/15_taxonomic_assignment/{params.drun[projet]}/{params.drun[marker]};
+        ecotag -d {params.drun[bdr]} -R {params.drun[fasta]} {input} > {output} 2> {log}
+        '''
\ No newline at end of file
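
Note, not part of the patch: the rules above share one params idiom, a `drun`/`dmulti` lambda that looks a wildcard up in the settings dataframes built by the Snakefile. A minimal sketch, with hypothetical dataframe values, of what that lambda evaluates to:

    import pandas

    ## toy stand-in for the dfpmr dataframe built in the Snakefile (hypothetical values)
    dfpmr = pandas.DataFrame({
        'projmarkrun': ['projA/teleo/run1', 'projA/teleo/run1_custom'],
        'marker':      ['teleo', 'teleo'],
        'projet':      ['projA', 'projA'],
        'bdr':         ['db/embl_r143', 'db/custom_teleo'],
        'fasta':       ['db/embl_r143.fasta', 'db/custom_teleo.fasta'],
    })

    class Wildcards:
        ## mimics the Snakemake wildcards object passed to a params lambda
        projmarkrun = 'projA/teleo/run1'

    ## same expression as the rules' params: the matching row as a plain dict
    drun = dfpmr[dfpmr.projmarkrun == Wildcards.projmarkrun].to_dict('records')[0]
    print(drun['projet'], drun['marker'], drun['bdr'], drun['fasta'])

Because `.to_dict('records')[0]` yields a plain dict, the shell templates can substitute `{params.drun[bdr]}` and `{params.drun[fasta]}` directly.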