diff --git a/pipeline/UniProt2Reactome_All_Levels.txt.gz b/pipeline/UniProt2Reactome_All_Levels.txt.gz
deleted file mode 100644
index f8ab478..0000000
Binary files a/pipeline/UniProt2Reactome_All_Levels.txt.gz and /dev/null differ
diff --git a/pipeline/merge_data.py b/pipeline/merge_data.py
index 1094a13..fe3e338 100755
--- a/pipeline/merge_data.py
+++ b/pipeline/merge_data.py
@@ -20,44 +20,45 @@
 indir = args.indir # directory with outputs from pull_data.sh
 outdir = args.outdir # (default is '.') directory where outputs from this will go
 flybase = args.flybase # (default is 'NA') FB for Flybase and DME Reactome annotations, NA for none
-orthologs = args.orthologs # $3/orthofinder/Orthologues_"$noext"-cluster/"$noext"-cluster__v__dromel-cluster.tsv from pathannotator.sh script
+orthologs = args.orthologs # $outdir/orthofinder/Orthologues_"$noext"-cluster/"$noext"-cluster__v__dromel-cluster.tsv from pathannotator.sh script
 outbase = args.outbase # file basename for output files supplied to the pathannotator.sh wrapper script
 pd.set_option('display.max_columns', None)
 
 #READ API TABLES INTO PANDAS DATAFRAMES
 if kofam == "no" and species != "NA":
-    ncbi_ver = pd.read_table(f"{indir}/ncbiver.tsv", dtype=str)
-    ncbi_spec = pd.read_table(f"{indir}/conv_ncbi-proteinid_{species}.tsv", dtype=str)
-    spec_ko = pd.read_table(f"{indir}/link_{species}_ko.tsv", dtype=str)
-    spec_pathway = pd.read_table(f"{indir}/link_pathway_{species}.tsv", dtype=str)
-    list_pathway_spec = pd.read_table(f"{indir}/list_pathway_{species}.tsv", dtype=str)
-    ko_pathway = pd.read_table(f"{indir}/link_ko_pathway.tsv", dtype=str)
-    pathway = pd.read_table(f"{indir}/list_pathway.tsv", dtype=str)
+    if os.path.exists(f"{indir}/ko_ncbi.tsv") and os.path.getsize(f"{indir}/ko_ncbi.tsv") > 0:
+        ncbi_ver = pd.read_table(f"{indir}/ncbiver.tsv", dtype=str)
+        ncbi_spec = pd.read_table(f"{indir}/conv_ncbi-proteinid_{species}.tsv", dtype=str)
+        spec_ko = pd.read_table(f"{indir}/link_{species}_ko.tsv", dtype=str)
+        spec_pathway = pd.read_table(f"{indir}/link_pathway_{species}.tsv", dtype=str)
+        list_pathway_spec = pd.read_table(f"{indir}/list_pathway_{species}.tsv", dtype=str)
+        ko_pathway = pd.read_table(f"{indir}/link_ko_pathway.tsv", dtype=str)
+        pathway = pd.read_table(f"{indir}/list_pathway.tsv", dtype=str)
     #ADD HEADERS TO DATAFRAME COLUMNS
-    ncbi_ver.columns = ['Input_protein_ID_version', 'Input_protein_ID']
-    ncbi_spec.columns = ['KEGG_genes_ID', 'Input_protein_ID']
-    spec_ko.columns = ['KEGG_KO', 'KEGG_genes_ID']
-    ko_pathway.columns = ['KEGG_ref_pathway', 'KEGG_KO']
-    pathway.columns = ['KEGG_ref_pathway', 'KEGG_ref_pathway_name']
-    spec_pathway.columns = ['KEGG_genes_ID', f"KEGG_{species}_pathway"]
-    list_pathway_spec.columns = [f"KEGG_{species}_pathway", f"KEGG_{species}_pathway_name"]
+        ncbi_ver.columns = ['Input_protein_ID_version', 'Input_protein_ID']
+        ncbi_spec.columns = ['KEGG_genes_ID', 'Input_protein_ID']
+        spec_ko.columns = ['KEGG_KO', 'KEGG_genes_ID']
+        ko_pathway.columns = ['KEGG_ref_pathway', 'KEGG_KO']
+        pathway.columns = ['KEGG_ref_pathway', 'KEGG_ref_pathway_name']
+        spec_pathway.columns = ['KEGG_genes_ID', f"KEGG_{species}_pathway"]
+        list_pathway_spec.columns = [f"KEGG_{species}_pathway", f"KEGG_{species}_pathway_name"]
     #MERGE DATAFRAMES INTO ONE FOR REFERENCE PATHWAYS
-    ncbi_ver_spec = pd.merge(ncbi_ver, ncbi_spec, on='Input_protein_ID', how='inner')
-    ncbi_ver_spec_ko = pd.merge(ncbi_ver_spec, spec_ko, on='KEGG_genes_ID', how='inner')
-    ncbi_ver_spec_ko_pathway = pd.merge(ncbi_ver_spec_ko, ko_pathway, on='KEGG_KO', how='inner')
-    ncbi_ver_spec_ko_pathway_pathname = pd.merge(ncbi_ver_spec_ko_pathway, pathway, on='KEGG_ref_pathway', how='left')
-    ncbi_ver_spec_ko_pathway_pathname.drop(['Input_protein_ID', 'KEGG_genes_ID'], axis=1, inplace=True)
-    ncbi_ver_spec_ko_pathway_pathname.rename(columns={'Input_protein_ID_version': 'Input_protein_ID'}, inplace=True)
-    ncbi_ver_spec_ko_pathway_pathname = ncbi_ver_spec_ko_pathway_pathname.drop_duplicates()
-    ncbi_ver_spec_ko_pathway_pathname.to_csv(f"{outdir}/{outbase}_KEGG_ref.tsv", sep='\t', index=False)
+        ncbi_ver_spec = pd.merge(ncbi_ver, ncbi_spec, on='Input_protein_ID', how='inner')
+        ncbi_ver_spec_ko = pd.merge(ncbi_ver_spec, spec_ko, on='KEGG_genes_ID', how='inner')
+        ncbi_ver_spec_ko_pathway = pd.merge(ncbi_ver_spec_ko, ko_pathway, on='KEGG_KO', how='inner')
+        ncbi_ver_spec_ko_pathway_pathname = pd.merge(ncbi_ver_spec_ko_pathway, pathway, on='KEGG_ref_pathway', how='left')
+        ncbi_ver_spec_ko_pathway_pathname.drop(['Input_protein_ID', 'KEGG_genes_ID'], axis=1, inplace=True)
+        ncbi_ver_spec_ko_pathway_pathname.rename(columns={'Input_protein_ID_version': 'Input_protein_ID'}, inplace=True)
+        ncbi_ver_spec_ko_pathway_pathname = ncbi_ver_spec_ko_pathway_pathname.drop_duplicates()
+        ncbi_ver_spec_ko_pathway_pathname.to_csv(f"{outdir}/{outbase}_KEGG_ref.tsv", sep='\t', index=False)
     #MERGE DATAFRAMES INTO ONE FOR { species } PATHWAYS
-    ncbi_ver_spec_ko_specpath = pd.merge(ncbi_ver_spec_ko, spec_pathway, on='KEGG_genes_ID', how='inner')
-    ncbi_ver_spec_ko_specpath_specpathname = pd.merge(ncbi_ver_spec_ko_specpath, list_pathway_spec, on=f"KEGG_{species}_pathway", how='left')
-    ncbi_ver_spec_ko_specpath_specpathname.drop(['Input_protein_ID', 'KEGG_genes_ID'], axis=1, inplace=True)
-    ncbi_ver_spec_ko_specpath_specpathname.rename(columns={'Input_protein_ID_version': 'Input_protein_ID'}, inplace=True)
-    ncbi_ver_spec_ko_specpath_specpathname = ncbi_ver_spec_ko_specpath_specpathname.drop_duplicates()
-    ncbi_ver_spec_ko_specpath_specpathname.to_csv(f"{outdir}/{outbase}_KEGG_species.tsv", sep='\t', index=False)
+        ncbi_ver_spec_ko_specpath = pd.merge(ncbi_ver_spec_ko, spec_pathway, on='KEGG_genes_ID', how='inner')
+        ncbi_ver_spec_ko_specpath_specpathname = pd.merge(ncbi_ver_spec_ko_specpath, list_pathway_spec, on=f"KEGG_{species}_pathway", how='left')
+        ncbi_ver_spec_ko_specpath_specpathname.drop(['Input_protein_ID', 'KEGG_genes_ID'], axis=1, inplace=True)
+        ncbi_ver_spec_ko_specpath_specpathname.rename(columns={'Input_protein_ID_version': 'Input_protein_ID'}, inplace=True)
+        ncbi_ver_spec_ko_specpath_specpathname = ncbi_ver_spec_ko_specpath_specpathname.drop_duplicates()
+        ncbi_ver_spec_ko_specpath_specpathname.to_csv(f"{outdir}/{outbase}_KEGG_species.tsv", sep='\t', index=False)
 
     #ADD FLYBASE AND REACTOME ANNOTATIONS WHEN DME IS THE SPECIFIED SPECIES
     if flybase == "FB" and species == "dme":
         #READ INTO DATAFRAMES
@@ -156,24 +157,25 @@
         print("You have not requested Flybase annotations.")
 elif kofam == "yes" and species == "NA":
     #READ API TABLES INTO PANDAS DATAFRAMES
-    ncbi_ver = pd.read_table(f"{indir}/ncbiver.tsv", dtype=str)
-    ncbi_ko = pd.read_table(f"{indir}/ko_ncbi.tsv", dtype=str)
-    ko_pathway = pd.read_table(f"{indir}/link_ko_pathway.tsv", dtype=str)
-    pathway = pd.read_table(f"{indir}/list_pathway.tsv", dtype=str)
+    if os.path.exists(f"{indir}/ko_ncbi.tsv") and os.path.getsize(f"{indir}/ko_ncbi.tsv") > 0:
+        ncbi_ver = pd.read_table(f"{indir}/ncbiver.tsv", dtype=str)
+        ncbi_ko = pd.read_table(f"{indir}/ko_ncbi.tsv", dtype=str)
+        ko_pathway = pd.read_table(f"{indir}/link_ko_pathway.tsv", dtype=str)
pd.read_table(f"{indir}/link_ko_pathway.tsv", dtype=str) + pathway = pd.read_table(f"{indir}/list_pathway.tsv", dtype=str) #ADD HEADERS TO DATAFRAME COLUMNS - ncbi_ver.columns = ['Input_protein_ID_version', 'Input_protein_ID'] - ncbi_ko.columns = ['KEGG_KO', 'Input_protein_ID'] - ko_pathway.columns = ['KEGG_ref_pathway', 'KEGG_KO'] - pathway.columns = ['KEGG_ref_pathway', 'KEGG_ref_pathway_name'] + ncbi_ver.columns = ['Input_protein_ID_version', 'Input_protein_ID'] + ncbi_ko.columns = ['KEGG_KO', 'Input_protein_ID'] + ko_pathway.columns = ['KEGG_ref_pathway', 'KEGG_KO'] + pathway.columns = ['KEGG_ref_pathway', 'KEGG_ref_pathway_name'] #MERGE DATAFRAMES INTO ONE FOR REFERENCE PATHWAYS - ncbi_ver_ko = pd.merge(ncbi_ver, ncbi_ko, on='Input_protein_ID', how='inner') - ncbi_ver_ko_pathway = pd.merge(ncbi_ver_ko, ko_pathway, on='KEGG_KO', how='inner') - ncbi_ver_ko_pathway_pathname = pd.merge(ncbi_ver_ko_pathway, pathway, on='KEGG_ref_pathway', how='left') - ncbi_ver_ko_pathway_pathname = ncbi_ver_ko_pathway_pathname[["Input_protein_ID_version","Input_protein_ID","KEGG_KO","KEGG_ref_pathway","KEGG_ref_pathway_name"]] - ncbi_ver_ko_pathway_pathname.drop('Input_protein_ID', axis=1, inplace=True) - ncbi_ver_ko_pathway_pathname.rename(columns={"Input_protein_ID_version": "Input_protein_ID"}, inplace=True) - ncbi_ver_ko_pathway_pathname = ncbi_ver_ko_pathway_pathname.drop_duplicates() - ncbi_ver_ko_pathway_pathname.to_csv(f"{outdir}/{outbase}_KEGG_ref.tsv", sep='\t', index=False) + ncbi_ver_ko = pd.merge(ncbi_ver, ncbi_ko, on='Input_protein_ID', how='inner') + ncbi_ver_ko_pathway = pd.merge(ncbi_ver_ko, ko_pathway, on='KEGG_KO', how='inner') + ncbi_ver_ko_pathway_pathname = pd.merge(ncbi_ver_ko_pathway, pathway, on='KEGG_ref_pathway', how='left') + ncbi_ver_ko_pathway_pathname = ncbi_ver_ko_pathway_pathname[["Input_protein_ID_version","Input_protein_ID","KEGG_KO","KEGG_ref_pathway","KEGG_ref_pathway_name"]] + ncbi_ver_ko_pathway_pathname.drop('Input_protein_ID', axis=1, inplace=True) + ncbi_ver_ko_pathway_pathname.rename(columns={"Input_protein_ID_version": "Input_protein_ID"}, inplace=True) + ncbi_ver_ko_pathway_pathname = ncbi_ver_ko_pathway_pathname.drop_duplicates() + ncbi_ver_ko_pathway_pathname.to_csv(f"{outdir}/{outbase}_KEGG_ref.tsv", sep='\t', index=False) if flybase == "FB": #READ INTO DATAFRAMES fbgn_CG = pd.read_table(f"{indir}/Fbgn_CG.tsv", dtype=str) @@ -230,39 +232,40 @@ print("You have not requested Flybase annotations.") elif kofam == "yes" and species != "NA": #READ API TABLES INTO PANDAS DATAFRAMES - ncbi_ver = pd.read_table(f"{indir}/ncbiver.tsv", dtype=str) - ncbi_ko = pd.read_table(f"{indir}/ko_ncbi.tsv", dtype=str) - ko_pathway = pd.read_table(f"{indir}/link_ko_pathway.tsv", dtype=str) - pathway = pd.read_table(f"{indir}/list_pathway.tsv", dtype=str) - spec_ko = pd.read_table(f"{indir}/link_{species}_ko.tsv", dtype=str) - spec_pathway = pd.read_table(f"{indir}/link_pathway_{species}.tsv", dtype=str) - list_pathway_spec = pd.read_table(f"{indir}/list_pathway_{species}.tsv", dtype=str) + if os.path.exists(f"{indir}/ko_ncbi.tsv") and os.path.getsize(f"{indir}/ko_ncbi.tsv") > 0: + ncbi_ver = pd.read_table(f"{indir}/ncbiver.tsv", dtype=str) + ncbi_ko = pd.read_table(f"{indir}/ko_ncbi.tsv", dtype=str) + ko_pathway = pd.read_table(f"{indir}/link_ko_pathway.tsv", dtype=str) + pathway = pd.read_table(f"{indir}/list_pathway.tsv", dtype=str) + spec_ko = pd.read_table(f"{indir}/link_{species}_ko.tsv", dtype=str) + spec_pathway = pd.read_table(f"{indir}/link_pathway_{species}.tsv", 
+        list_pathway_spec = pd.read_table(f"{indir}/list_pathway_{species}.tsv", dtype=str)
     #ADD HEADERS TO DATAFRAME COLUMNS
-    ncbi_ver.columns = ['Input_protein_ID_version', 'Input_protein_ID']
-    ncbi_ko.columns = ['KEGG_KO', 'Input_protein_ID']
-    spec_ko.columns = ['KEGG_KO', 'KEGG_genes_ID']
-    ko_pathway.columns = ['KEGG_ref_pathway', 'KEGG_KO']
-    pathway.columns = ['KEGG_ref_pathway', 'KEGG_ref_pathway_name']
-    spec_pathway.columns = ['KEGG_genes_ID', f"KEGG_{species}_pathway"]
-    list_pathway_spec.columns = [f"KEGG_{species}_pathway", f"KEGG_{species}_pathway_name"]
+        ncbi_ver.columns = ['Input_protein_ID_version', 'Input_protein_ID']
+        ncbi_ko.columns = ['KEGG_KO', 'Input_protein_ID']
+        spec_ko.columns = ['KEGG_KO', 'KEGG_genes_ID']
+        ko_pathway.columns = ['KEGG_ref_pathway', 'KEGG_KO']
+        pathway.columns = ['KEGG_ref_pathway', 'KEGG_ref_pathway_name']
+        spec_pathway.columns = ['KEGG_genes_ID', f"KEGG_{species}_pathway"]
+        list_pathway_spec.columns = [f"KEGG_{species}_pathway", f"KEGG_{species}_pathway_name"]
     #MERGE DATAFRAMES INTO ONE FOR REFERENCE PATHWAYS
-    ncbi_ver_ko = pd.merge(ncbi_ver, ncbi_ko, on='Input_protein_ID', how='inner')
-    ncbi_ver_ko_pathway = pd.merge(ncbi_ver_ko, ko_pathway, on='KEGG_KO', how='inner')
-    ncbi_ver_ko_pathway_pathname = pd.merge(ncbi_ver_ko_pathway, pathway, on='KEGG_ref_pathway', how='left')
-    ncbi_ver_ko_pathway_pathname = ncbi_ver_ko_pathway_pathname[["Input_protein_ID_version","Input_protein_ID","KEGG_KO","KEGG_ref_pathway","KEGG_ref_pathway_name"]]
-    ncbi_ver_ko_pathway_pathname.drop('Input_protein_ID', axis=1, inplace=True)
-    ncbi_ver_ko_pathway_pathname.rename(columns={"Input_protein_ID_version": "Input_protein_ID"}, inplace=True)
-    ncbi_ver_ko_pathway_pathname = ncbi_ver_ko_pathway_pathname.drop_duplicates()
-    ncbi_ver_ko_pathway_pathname.to_csv(f"{outdir}/{outbase}_KEGG_ref.tsv", sep='\t', index=False)
+        ncbi_ver_ko = pd.merge(ncbi_ver, ncbi_ko, on='Input_protein_ID', how='inner')
+        ncbi_ver_ko_pathway = pd.merge(ncbi_ver_ko, ko_pathway, on='KEGG_KO', how='inner')
+        ncbi_ver_ko_pathway_pathname = pd.merge(ncbi_ver_ko_pathway, pathway, on='KEGG_ref_pathway', how='left')
+        ncbi_ver_ko_pathway_pathname = ncbi_ver_ko_pathway_pathname[["Input_protein_ID_version","Input_protein_ID","KEGG_KO","KEGG_ref_pathway","KEGG_ref_pathway_name"]]
+        ncbi_ver_ko_pathway_pathname.drop('Input_protein_ID', axis=1, inplace=True)
+        ncbi_ver_ko_pathway_pathname.rename(columns={"Input_protein_ID_version": "Input_protein_ID"}, inplace=True)
+        ncbi_ver_ko_pathway_pathname = ncbi_ver_ko_pathway_pathname.drop_duplicates()
+        ncbi_ver_ko_pathway_pathname.to_csv(f"{outdir}/{outbase}_KEGG_ref.tsv", sep='\t', index=False)
     #MERGE DATAFRAMES INTO ONE FOR { species } PATHWAYS
-    ncbi_ver_spec_ko = pd.merge(ncbi_ver_ko, spec_ko, on='KEGG_KO', how='inner')
-    ncbi_ver_spec_ko_specpath = pd.merge(ncbi_ver_spec_ko, spec_pathway, on='KEGG_genes_ID', how='inner')
-    ncbi_ver_spec_ko_specpath_specpathname = pd.merge(ncbi_ver_spec_ko_specpath, list_pathway_spec, on=f"KEGG_{species}_pathway", how='left')
-    ncbi_ver_spec_ko_specpath_specpathname = ncbi_ver_spec_ko_specpath_specpathname[["KEGG_genes_ID","Input_protein_ID_version","Input_protein_ID","KEGG_KO",f"KEGG_{species}_pathway",f"KEGG_{species}_pathway_name"]]
-    ncbi_ver_spec_ko_specpath_specpathname.drop(['Input_protein_ID', 'KEGG_genes_ID'], axis=1, inplace=True)
-    ncbi_ver_spec_ko_specpath_specpathname.rename(columns={'Input_protein_ID_version': 'Input_protein_ID'}, inplace=True)
-    ncbi_ver_spec_ko_specpath_specpathname = ncbi_ver_spec_ko_specpath_specpathname.drop_duplicates()
-    ncbi_ver_spec_ko_specpath_specpathname.to_csv(f"{outdir}/{outbase}_KEGG_species.tsv", sep='\t', index=False)
+        ncbi_ver_spec_ko = pd.merge(ncbi_ver_ko, spec_ko, on='KEGG_KO', how='inner')
+        ncbi_ver_spec_ko_specpath = pd.merge(ncbi_ver_spec_ko, spec_pathway, on='KEGG_genes_ID', how='inner')
+        ncbi_ver_spec_ko_specpath_specpathname = pd.merge(ncbi_ver_spec_ko_specpath, list_pathway_spec, on=f"KEGG_{species}_pathway", how='left')
+        ncbi_ver_spec_ko_specpath_specpathname = ncbi_ver_spec_ko_specpath_specpathname[["KEGG_genes_ID","Input_protein_ID_version","Input_protein_ID","KEGG_KO",f"KEGG_{species}_pathway",f"KEGG_{species}_pathway_name"]]
+        ncbi_ver_spec_ko_specpath_specpathname.drop(['Input_protein_ID', 'KEGG_genes_ID'], axis=1, inplace=True)
+        ncbi_ver_spec_ko_specpath_specpathname.rename(columns={'Input_protein_ID_version': 'Input_protein_ID'}, inplace=True)
+        ncbi_ver_spec_ko_specpath_specpathname = ncbi_ver_spec_ko_specpath_specpathname.drop_duplicates()
+        ncbi_ver_spec_ko_specpath_specpathname.to_csv(f"{outdir}/{outbase}_KEGG_species.tsv", sep='\t', index=False)
 
     #ADD FLYBASE AND REACTOME ANNOTATIONS
     if flybase == "FB" and species == "dme":
         #READ INTO DATAFRAMES
diff --git a/pipeline/pathannot_to_gmt.py b/pipeline/pathannot_to_gmt.py
index 3073004..fc16d83 100644
--- a/pipeline/pathannot_to_gmt.py
+++ b/pipeline/pathannot_to_gmt.py
@@ -158,114 +158,30 @@
     justlist = alltogether['Input_protein_ID'].apply(pd.Series)
     alltogether = pd.concat([alltogether.drop('Input_protein_ID', axis=1), justlist], axis=1)
     alltogether.to_csv(f"{outdir}/{outbase}_all_pathways.gmt", sep='\t', header=False, index=False)
-else:
-    print ("Cannot find the proper combination of output files.")
-
-#REMOVE ANY TRAILING TABS FROM EMPTY DF FIELDS
-gmtfile=f"{outdir}/{outbase}_all_pathways.gmt"
-notab=f"{outdir}/gmt.tmp"
-
-with open(gmtfile, 'r') as infile, open(notab, 'w') as outfile:
-    for line in infile:
-        cleaned_line = re.sub(r'\t+\n', '\n', line)
-        outfile.write(cleaned_line)
-
-os.rename(notab, gmtfile)
-
-#############################################################################################################
-'''
-#FIND THE OUTPUT FILES TO COMBINE
-keggref = f"{pathannotator}/*KEGG_ref.tsv"
-keggreffile = glob.glob(keggref)
-keggreffile = str(keggreffile[0])
-#READ TABLES INTO PANDAS DATAFRAMES AND ADD HEADERS TO IPRS
-kr = pd.read_table(f"{keggreffile}", dtype=str)
-kr.columns = ['Input_protein_ID', 'KEGG_KO', 'KEGG_ref_pathway', 'KEGG_ref_pathway_name']
-#DROP UNWANTED COLUMNS (KEEP PROTEIN ACCESSIONS (1), INTERPRO ANNOTATIONS (12) AND PATHWAY ANNOTATIONS (15)
-kr = kr.drop(columns=['KEGG_KO', 'KEGG_ref_pathway_name'])
-#MAKE HEADERS MATCH FOR ALL DFS
-kr.columns = ['Input_protein_ID', 'Pathway_or_domain']
-#ADD SECOND COLUMN TO EACH DF WITH 'OPTIONAL DESCRIPTION' FOR GMT FORMAT
-kr.insert(loc=1, column='Description', value='KEGG_reference_pathway')
-#REMOVE DUPLICATE ROWS
-kr = kr.drop_duplicates()
-
-if os.path.exists(f"{pathannotator}/*KEGG_species.tsv"):
-#keggspec = f"{pathannotator}/*KEGG_species.tsv"
-    keggspecfile = glob.glob(keggspec)
-    keggspecfile = str(keggspecfile[0])
-    #READ TABLES INTO PANDAS DATAFRAMES AND ADD HEADERS TO IPRS
-    ks = pd.read_table(f"{keggspecfile}", dtype=str)
-    ks.columns = ['Input_protein_ID', 'KEGG_KO', 'KEGG_species_pathway', 'KEGG_species_pathway_name']
-    #DROP UNWANTED COLUMNS (KEEP PROTEIN ACCESSIONS (1), INTERPRO ANNOTATIONS (12) AND PATHWAY ANNOTATIONS (15)
-    ks = ks.drop(columns=['KEGG_KO', 'KEGG_species_pathway_name'])
-    #MAKE HEADERS MATCH FOR ALL DFS
-    kr.columns = ['Input_protein_ID', 'Pathway_or_domain']
-    #ADD SECOND COLUMN TO EACH DF WITH 'OPTIONAL DESCRIPTION' FOR GMT FORMAT
-    ks.insert(loc=1, column='Description', value='KEGG_species_pathway')
-    #REMOVE DUPLICATE ROWS
-    ks = ks.drop_duplicates()
-
-if os.path.exists(f"{pathannotator}/*flybase.tsv"):
-#flybase = f"{pathannotator}/*flybase.tsv"
-    flybasefile = glob.glob(flybasetsv)
+elif keggreffile == [] and keggspecfile == [] and flybasefile != [] and reactomefile != []:
+    print ("FB, RT outputs are present")
+    #FIND OUTPUT FILES BASED ON PATTERN MATCH
     flybasefile = str(flybasefile[0])
+    reactomefile = str(reactomefile[0])
     #READ TABLES INTO PANDAS DATAFRAMES AND ADD HEADERS TO IPRS
     fb = pd.read_table(f"{flybasefile}", dtype=str)
     fb.columns = ['Input_protein_ID', 'KEGG_KO', 'Flybase_pathway_ID', 'Flybase_pathway_name']
-    #DROP UNWANTED COLUMNS (KEEP PROTEIN ACCESSIONS (1), INTERPRO ANNOTATIONS (12) AND PATHWAY ANNOTATIONS (15)
-    fb = fb.drop(columns=['KEGG_KO', 'Flybase_pathway_name'])
-    #MAKE HEADERS MATCH FOR ALL DFS
-    fb.columns = ['Input_protein_ID', 'Pathway_or_domain']
-    #ADD SECOND COLUMN TO EACH DF WITH 'OPTIONAL DESCRIPTION' FOR GMT FORMAT
-    fb.insert(loc=1, column='Description', value='FlyBase_pathway')
-    #REMOVE DUPLICATE ROWS
-    fb = fb.drop_duplicates()
-
-if os.path.exists(f"{pathannotator}/*reactome.tsv"):
-#reactome = f"{pathannotator}/*reactome.tsv"
-    reactomefile = glob.glob(reactometsv)
-    reactomefile = str(reactomefile[0])
-    #READ TABLES INTO PANDAS DATAFRAMES AND ADD HEADERS TO IPRS
     rt = pd.read_table(f"{reactomefile}", dtype=str)
     rt.columns = ['Input_protein_ID', 'UniProt_ID', 'Reactome_pathway_ID', 'Reactome_pathway_name']
     #DROP UNWANTED COLUMNS (KEEP PROTEIN ACCESSIONS (1), INTERPRO ANNOTATIONS (12) AND PATHWAY ANNOTATIONS (15)
+    fb = fb.drop(columns=['KEGG_KO', 'Flybase_pathway_name'])
     rt = rt.drop(columns=['UniProt_ID', 'Reactome_pathway_name'])
     #MAKE HEADERS MATCH FOR ALL DFS
+    fb.columns = ['Input_protein_ID', 'Pathway_or_domain']
     rt.columns = ['Input_protein_ID', 'Pathway_or_domain']
     #ADD SECOND COLUMN TO EACH DF WITH 'OPTIONAL DESCRIPTION' FOR GMT FORMAT
+    fb.insert(loc=1, column='Description', value='FlyBase_pathway')
     rt.insert(loc=1, column='Description', value='Reactome_pathway')
     #REMOVE DUPLICATE ROWS
+    fb = fb.drop_duplicates()
     rt = rt.drop_duplicates()
-
-#if 'keggreffile' in locals() and 'keggspecfile' in locals() and 'flybasefile' in locals() and 'reactomefile' in locals():
-if not kr.empty and not ks.empty and not fb.empty and not rt.empty:
-    print ("KR, KS, FB, RT outputs are all present")
-    alltogether = pd.concat([kr, ks, fb, rt])
-    alltogether = alltogether.groupby(['Pathway_or_domain', 'Description'])['Input_protein_ID'].agg(list).reset_index()
-    justlist = alltogether['Input_protein_ID'].apply(pd.Series)
-    alltogether = pd.concat([alltogether.drop('Input_protein_ID', axis=1), justlist], axis=1)
-    alltogether.to_csv(f"{outdir}/{outbase}_all_pathways.gmt", sep='\t', header=False, index=False)
-#elif 'keggreffile' in locals() and 'keggspecfile' in locals() and 'flybasefile' not in locals() and 'reactomefile' not in locals():
-elif not kr.empty and not ks.empty and fb.empty and rt.empty:
-    print ("KR and KS outputs are present")
-    alltogether = pd.concat([kr, ks])
-    alltogether = alltogether.groupby(['Pathway_or_domain', 'Description'])['Input_protein_ID'].agg(list).reset_index()
-    justlist = alltogether['Input_protein_ID'].apply(pd.Series)
-    alltogether = pd.concat([alltogether.drop('Input_protein_ID', axis=1), justlist], axis=1)
-    alltogether.to_csv(f"{outdir}/{outbase}_all_pathways.gmt", sep='\t', header=False, index=False)
-#elif 'keggreffile' in locals() and 'keggspecfile' not in locals() and 'flybasefile' in locals() and 'reactomefile' in locals():
-elif not kr.empty and ks.empty and not rb.empty and not rt.empty:
-    print ("KR, FB and RT outputs are present")
-    alltogether = pd.concat([kr, fb, rt])
-    alltogether = alltogether.groupby(['Pathway_or_domain', 'Description'])['Input_protein_ID'].agg(list).reset_index()
-    justlist = alltogether['Input_protein_ID'].apply(pd.Series)
-    alltogether = pd.concat([alltogether.drop('Input_protein_ID', axis=1), justlist], axis=1)
-    alltogether.to_csv(f"{outdir}/{outbase}_all_pathways.gmt", sep='\t', header=False, index=False)
-#elif 'keggreffile' in locals() and 'keggspecfile' not in locals() and 'flybasefile' not in locals() and 'reactomefile' not in locals():
-elif not kr.empty and ks.empty and fb.empty and rt.empty:
-    print ("only KR output is present")
-    alltogether = kr
+    #BRING ALL DATA TOGETHER IN GMT FORMAT
+    alltogether = pd.concat([fb, rt])
     alltogether = alltogether.groupby(['Pathway_or_domain', 'Description'])['Input_protein_ID'].agg(list).reset_index()
     justlist = alltogether['Input_protein_ID'].apply(pd.Series)
     alltogether = pd.concat([alltogether.drop('Input_protein_ID', axis=1), justlist], axis=1)
@@ -283,4 +199,3 @@
         outfile.write(cleaned_line)
 
 os.rename(notab, gmtfile)
-'''
diff --git a/pipeline/pathannotator.sh b/pipeline/pathannotator.sh
index 9a8c067..e6c2587 100755
--- a/pipeline/pathannotator.sh
+++ b/pipeline/pathannotator.sh
@@ -1,6 +1,10 @@
 #! /bin/bash
 #CHECK FOR OUTDIR. IF IT DOESN'T EXIST CREATE IT
+if [ -z "$outdir" ]; then outdir="."; fi
+if [ ! -d "$outdir" ]; then mkdir -p "$outdir"; fi
+
+
 
 if [ -f "$outdir"/link_ko_pathway.tsv ]; then rm "$outdir"/link_ko_pathway.tsv; fi
 if [ -f "$outdir"/list_pathway.tsv ]; then rm "$outdir"/list_pathway.tsv; fi
 if [ -f "$outdir"/conv_ncbi-proteinid_"$keggcode".tsv ]; then rm "$outdir"/conv_ncbi-proteinid_"$keggcode".tsv; fi
@@ -21,7 +25,7 @@ if [ -n "$(ls $outdir/fbgn_annotation_ID_fb* 2>/dev/null)" ]; then rm $outdir/fb
 if [ -n "$(ls $outdir/dmel-all-translation*.fasta* 2>/dev/null)" ]; then rm $outdir/dmel-all-translation*.fasta*; fi
 if [ -n "$(ls $outdir/fbgn_fbtr_fbpp_fb* 2>/dev/null)" ]; then rm $outdir/fbgn_fbtr_fbpp_fb*; fi
 if [ -f "$outdir"/Fbgn_fbpp.tsv ]; then rm "$outdir"/Fbgn_fbpp.tsv; fi
-if [ -d "$outdir"/tmp ]; then rm -r "$outdir"/tmp; fi
+#if [ -d "$outdir"/tmp ]; then rm -r "$outdir"/tmp; fi
 if [ -f "$outdir"/tmp.txt ]; then rm "$outdir"/tmp.txt; fi
 if [ -f "$outdir"/ncbiversion.tmp ]; then rm "$outdir"/ncbiversion.tmp; fi
 if [ -f "$outdir"/ncbiver.tsv ]; then rm "$outdir"/ncbiver.tsv; fi
@@ -87,10 +91,8 @@ fi
 #######################################################################################################
 #SET DEFAULTS IF OPTIONS NOT PROVIDED
 if [ -z "${flybase}" ]; then $flybase == 'NA'; fi
-if [ -z "${outdir}" ]; then $outdir == '.'; fi
 if [ -z "${keggcode}" ]; then $keggcode == 'NA'; fi
 
-if [ ! -d "$outdir" ]; then mkdir -p "$outdir"; fi
-d "$outdir" ]; then mkdir -p "$outdir"; fi #GETTING NUMBER OF AVAILABLE PROCESSORS FOR USE IN THREADING avail=$(getconf _NPROCESSORS_ONLN) @@ -222,8 +224,14 @@ then #FILTER KOFAM HERE echo "Filtering KofamScan results" grep -P "^\*" $outdir/kofam_result_full.txt >> $outdir/kofam_filtered_asterisk.txt - awk '{ print $3"\t"$2 }' $outdir/kofam_filtered_asterisk.txt > $outdir/ko_ncbi.tsv - sed -i 's/\..*$//' $outdir/ko_ncbi.tsv + if [ -s $outdir/kofam_filtered_asterisk.txt ] + then + echo "Filtered KofamScan results NOT empty. Proceeding with KEGG annotation." + awk '{ print $3"\t"$2 }' $outdir/kofam_filtered_asterisk.txt > $outdir/ko_ncbi.tsv + sed -i 's/\..*$//' $outdir/ko_ncbi.tsv + else + echo "Filtered KofamScan results EMPTY. Moving on to FlyBase and Reactome annotation." + fi #IF FB AND NOT 'DME' RUN ORTHOFINDER AND PROCEED TO MERGE (INCLUDING FLYBASE) if [ "$keggcode" != "dme" ] && [ "$flybase" == "FB" ]; @@ -307,8 +315,14 @@ then #FILTER KOFAM HERE echo "Filtering KofamScan results" grep -P "^\*" $outdir/kofam_result_full.txt >> $outdir/kofam_filtered_asterisk.txt - awk '{ print $3"\t"$2 }' $outdir/kofam_filtered_asterisk.txt > $outdir/ko_ncbi.tsv - sed -i 's/\..*$//' $outdir/ko_ncbi.tsv + if [ -s $outdir/kofam_filtered_asterisk.txt ] + then + echo "Filtered KofamScan results NOT empty. Proceeding with KEGG annotation." + awk '{ print $3"\t"$2 }' $outdir/kofam_filtered_asterisk.txt > $outdir/ko_ncbi.tsv + sed -i 's/\..*$//' $outdir/ko_ncbi.tsv + else + echo "Filtered KofamScan results EMPTY. Moving on to FlyBase and Reactome annotation." + fi #IF FB AND NOT 'DME' RUN ORTHOFINDER AND PROCEED TO MERGE (INCLUDING FLYBASE) if [ "$keggcode" != "dme" ] && [ "$flybase" == "FB" ]; @@ -405,8 +419,14 @@ else #ELSE MEANS THESE ARE NOT NCBI PROTEIN IDS. #FILTER KOFAM HERE echo "Filtering KofamScan results" grep -P "^\*" $outdir/kofam_result_full.txt >> $outdir/kofam_filtered_asterisk.txt - awk '{ print $3"\t"$2 }' $outdir/kofam_filtered_asterisk.txt > $outdir/ko_ncbi.tsv - sed -i 's/\..*$//' $outdir/ko_ncbi.tsv + if [ -s $outdir/kofam_filtered_asterisk.txt ] + then + echo "Filtered KofamScan results NOT empty. Proceeding with KEGG annotation." + awk '{ print $3"\t"$2 }' $outdir/kofam_filtered_asterisk.txt > $outdir/ko_ncbi.tsv + sed -i 's/\..*$//' $outdir/ko_ncbi.tsv + else + echo "Filtered KofamScan results EMPTY. Moving on to FlyBase and Reactome annotation." + fi #IF FB RUN ORTHOFINDER AND PROCEED TO MERGE (INCLUDING FLYBASE) if [ "$flybase" == FB ]; @@ -488,8 +508,14 @@ else #ELSE MEANS THESE ARE NOT NCBI PROTEIN IDS. #FILTER KOFAM HERE echo "Filtering KofamScan results" grep -P "^\*" $outdir/kofam_result_full.txt >> $outdir/kofam_filtered_asterisk.txt - awk '{ print $3"\t"$2 }' $outdir/kofam_filtered_asterisk.txt > $outdir/ko_ncbi.tsv - sed -i 's/\..*$//' $outdir/ko_ncbi.tsv + if [ -s $outdir/kofam_filtered_asterisk.txt ] + then + echo "Filtered KofamScan results NOT empty. Proceeding with KEGG annotation." + awk '{ print $3"\t"$2 }' $outdir/kofam_filtered_asterisk.txt > $outdir/ko_ncbi.tsv + sed -i 's/\..*$//' $outdir/ko_ncbi.tsv + else + echo "Filtered KofamScan results EMPTY. Moving on to FlyBase and Reactome annotation." + fi #IF FB RUN ORTHOFINDER AND PROCEED TO MERGE (INCLUDING FLYBASE) if [ "$flybase" == FB ];