First version PeakQC.

pnnl · Jul 15, 2024 · 20b07b8 · 20b07b8
commit 20b07b8
Show file tree

Hide file tree

Showing 684 changed files with 41,702 additions and 0 deletions.
diff --git a/IonToolPack.py b/IonToolPack.py
@@ -0,0 +1,114 @@
+import os
+import pandas as pd
+import sys
+from qc.utils import FormatDataframeSamples
+from qc.qc_pipeline import qc_pipeline
+from tkinter import filedialog, Tk, Button, ttk, Entry, StringVar, messagebox, Label
+
+def import_list_ms_runs():
+    global df
+    global filepath
+    global userIonsFileLabel
+    csv_file_path = filedialog.askopenfilename()
+    print(csv_file_path)
+    separator = "\t"
+    if csv_file_path.endswith('.csv'):
+        separator = ','
+    df = pd.read_csv(csv_file_path, sep=separator)
+
+    df = FormatDataframeSamples(df, basePathCsvRuns=os.path.dirname(csv_file_path))
+
+    if len(df) == 0:
+        return
+
+    filepath.set(os.path.dirname(csv_file_path))
+    if os.path.exists(filepath.get() + "/User-Ions.csv"):
+        userIonsFileLabel.set("User ions file: DETECTED User-Ions.csv")
+    else:
+        userIonsFileLabel.set("User ions file: NONE")
+
+    #Clear the treeview list items
+    for item in tree.get_children():
+        tree.delete(item)
+
+    tree["columns"] = list(df.columns)
+    for col in df.columns:
+        tree.heading(col, text=col)
+    for idx, (_, row) in enumerate(df.iterrows()):
+        if idx % 2 == 0:
+            tree.insert("", "end", text=str(idx), values=list(row), tags=("evenrow",))
+        else:
+            tree.insert("", "end", text=str(idx), values=list(row))
+
+def import_list_ms_runs_clipboard():
+    global df
+    global filepath
+    global userIonsFileLabel
+    runs = []
+    paths = []
+    cb = root.clipboard_get()
+    for item in cb.split('\n'):
+        runx = os.path.basename(item)
+        runs.append(runx)
+        paths.append(item.removesuffix(runx))
+
+    if len(runs) == 0:
+        return
+
+    df = pd.DataFrame({"MSRUN": runs, "MSRUNPATH": paths})
+    df = FormatDataframeSamples(df)
+    #Clear the treeview list items
+    for item in tree.get_children():
+        tree.delete(item)
+
+    tree["columns"] = list(df.columns)
+    for col in df.columns:
+        tree.heading(col, text=col)
+    for idx, (_, row) in enumerate(df.iterrows()):
+        if idx % 2 == 0:
+            tree.insert("", "end", text=str(idx), values=list(row), tags=("evenrow",))
+        else:
+            tree.insert("", "end", text=str(idx), values=list(row))
+    # Take the path of the first run as the result path:
+    filepath.set(df["MSRUNPATH"][0])
+    if os.path.exists(filepath.get() + "/User-Ions.csv"):
+        userIonsFileLabel.set("User ions file: DETECTED User-Ions.csv")
+    else:
+        userIonsFileLabel.set("User ions file: NONE")
+
+def call_backend_process():
+    global df
+    global filepath
+    if df is None or len(df) == 0:
+        messagebox.showerror("Please import a list of MS runs.")
+        return
+
+    if os.path.exists(filepath.get() + "/config.toml"):
+        qc_pipeline(df, filepath.get(), filepath.get() + "/config.toml")
+    else:
+        qc_pipeline(df, filepath.get())
+
+
+if __name__ == "__main__":
+    import multiprocessing
+    multiprocessing.freeze_support()
+    root = Tk()
+    root.geometry('800x600')
+    root.title("IonToolPack v1 | PeakQC")
+    root.iconbitmap(sys.executable)
+
+    tree = ttk.Treeview(root, height=20, show="headings")
+    filepath = StringVar()
+    userIonsFileLabel = StringVar()
+    userIonsFileLabel.set("User ions file: NONE")
+
+    Button(root, text='Import list of MS runs (.csv, .txt)', command=import_list_ms_runs).pack()
+    Button(root, text='Paste MS runs from clipboard', command=import_list_ms_runs_clipboard).pack()
+    tree.pack(fill="both")
+
+    Label(root, text="Output path:").pack() # add label for file path
+    Entry(root, textvariable=filepath).pack() # add text box for file path
+    Label(root, textvariable=userIonsFileLabel).pack() # add label for user ions file
+    Button(root, text='Process', command=call_backend_process).pack()
+
+    root.mainloop()
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,24 @@
+BSD 2-Clause License
+
+Copyright 2022 Battelle Memorial Institute
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/PeakQC_User-guide_v2024-06-29.pdf b/PeakQC_User-guide_v2024-06-29.pdf
diff --git a/README.md b/README.md
@@ -0,0 +1,27 @@
+
+# IonToolPack | PeakQC
+
+IonToolPack is a software suite for housing tools for mass spectrometry. The first one is PeakQC, a software tool for automated quality control (QC) of mass spectrometry (MS) data which is omics-agnostic (works for any ion type, e.g., metabolomics, lipidomics, proteomics, etc.), supports various instrument platforms and acquisition modes, and has a simple graphical user interface.
+
+## Usage
+1. Download the latest version (Release section, right panel) and uncompress it
+2. Double click IonToolPack.exe
+3. Import raw MS files and click “Process”
+4. See more details and examples in PeakQC_User-guide_*.pdf
+
+## MS data supported
+Supported formats include Agilent 'd', Bruker 'd' (improvements in progress), Thermo '.raw', and mzML, and for different types of MS acquisition methods:
+* LC-MS
+* LC-IMS-MS
+* With/without fragmentation spectra in DDA or DIA mode
+* Direct infusion 
+
+## Contact
+
+aivett.bilbao@pnnl.gov
+
+## References
+
+If you use this tool or any portion of this code, please cite:
+* Harrison et al. "PeakQC: A Software Tool for Omics-Agnostic Automated Quality Control of Mass Spectrometry Data". Journal of the American Society for Mass Spectrometry 2024 https://doi.org/10.1021/jasms.4c00146.
+* Bilbao et al. "MZA: A Data Conversion Tool to Facilitate Software Development and Artificial Intelligence Research in Multidimensional Mass Spectrometry". Journal of Proteome Research 2023 https://doi.org/10.1021/acs.jproteome.2c00313.
diff --git a/config.toml b/config.toml
@@ -0,0 +1,29 @@
+# Configuration file
+
+# For overlaid figures, Half Window size of the view (+- center):
+MZVIEWHALFWINDOW = 0.7
+RTVIEWHALFWINDOW = 1.5
+ATVIEWHALFWINDOW = 2
+
+# Mass tolerance for chromatogram extraction, Half Window m/z value (+- center or m/z target):
+MZXICHALFWINDOW = 0.01
+
+# Error thresholds, absolute value:
+MZERRORPPM = 15
+RTERROR = 0.3
+ATERROR = 0.1
+ABUNDANCEERROR = 30 # Percentage absolute error, a percentage of the mean of the ion abundance applied as a threshold to report QC ions outside tolerances
+
+AutoTrackedIonsTopN = 4 # Number of auto-tracked ions to detect per sample group
+MinIntensityPresencePercentage = 80 # Intensity threshold presence/absence 
+
+MinMzDistDetectCentroidMS = 0.0005 # If distance between 2 consecutive points from the max intensity peak is smaller than this value then it is considered profile mode spectrum
+
+FigureLegendMaxNumberLines = 15
+
+# MZA conversion:
+MinIntensityMza = 20
+
+# time-vs-mz images:
+TimeVsMzImageMinIntensityPercentage = 10
+TimeVsMzImageMaxIntensityCeilingPercentage = 70
diff --git a/qc/auto_ion_tracking.py b/qc/auto_ion_tracking.py
@@ -0,0 +1,51 @@
+import numpy as np
+import pandas as pd
+from qc.ion_batch import GetHighResCoordinates
+
+def DetectTopmostIons(dfions, dfruns, topIons=4, minMzDistDetectCentroidMS=0.005):
+    dfions = dfions.copy()
+    dfions["rtRegion"] = 0
+    dfions["MZ"] = dfions["MZ"].astype(int)
+    dftopmost = pd.DataFrame()
+    Nrt = int(topIons) # take more blocks across RT because the mass is usually very stable and good
+    selectedMzs = [] # to keep only the first MZ, each ion will be extracted from all runs
+    for label in dfions['LABELSAMPLEGROUP'].unique():
+        df = dfions[(dfions["LABELSAMPLEGROUP"] == label) & (~dfions["MZ"].isin(selectedMzs))]
+        if len(df) == 0:
+            continue
+        # Step 1: Partition the space
+        rtbins = pd.cut(df['RT'], bins=Nrt, labels=False)
+        # Step 2: Select rows with maximum frequency and intensity in each zone
+        rtRegion = 1
+        for rt_bin in range(Nrt):
+            zone_rows = df[(rtbins == rt_bin) & (~df["MZ"].isin(selectedMzs))]
+            if not zone_rows.empty:
+                zone_rows = zone_rows[zone_rows['FREQ'] == max(zone_rows['FREQ'])]
+                #max_row = zone_rows.loc[(zone_rows['FREQ'] * zone_rows['INTENSITY']).idxmax()]
+                max_row = zone_rows.loc[(zone_rows['INTENSITY']).idxmax()]
+                df.at[max_row.name, 'rtRegion'] = rtRegion
+                rtRegion = rtRegion + 1
+                selectedMzs.append(df.loc[max_row.name, "MZ"])
+        dftopmost = pd.concat([dftopmost, df[df["rtRegion"] > 0]])
+
+    dftopmost.reset_index(inplace=True, drop=True)
+    # keep only the first MS run for each LABELSAMPLEGROUP
+    dfruns = dfruns.copy()
+    dfruns.drop_duplicates(['LABELSAMPLEGROUP'], inplace=True, keep='first')
+    dfruns.reset_index(inplace=True, drop=True)
+    for k in dftopmost.index:
+        ionmz = dftopmost["MZ"][k]
+        ionrt = dftopmost["RT"][k]/10 # <- ToDo: correct scaling in PCA.py
+        mzaFile = list(dfruns["MZAPATH"][dfruns["LABELSAMPLEGROUP"] == (dftopmost["LABELSAMPLEGROUP"][k])])[0] + ".mza"
+        [ionmz,ionrt] = GetHighResCoordinates(mzaFile, ionmz, ionrt, rtrange=1, mzrange=1, minMzDistCentroid=minMzDistDetectCentroidMS) # these tolerances must be kept at unit resolution
+        dftopmost.loc[k,"MZ"] = ionmz
+        dftopmost.loc[k,"RT"] = ionrt 
+
+    dftopmost = dftopmost[(dftopmost["MZ"] > 0) & (dftopmost["RT"] > 0)]
+    dftopmost.sort_values(by=["LABELSAMPLEGROUP", "rtRegion", "FREQ", "INTENSITY"], ascending=[True, True, False, False], inplace=True)
+    dftopmost.drop_duplicates(subset=["LABELSAMPLEGROUP", "rtRegion"], inplace=True, keep='first')
+    dftopmost.drop(columns=["rtRegion"], inplace=True)
+    dftopmost.reset_index(inplace=True, drop=True)
+    dftopmost["MOLECULE"] = ["Ion" + str(k+1) for k in dftopmost.index]
+    #dftopmost["MOLECULE"] += "-MZ" + str(round(dftopmost["MZ"], ndigits=2))+ "-RT" + str(round(dftopmost["RT"], ndigits=1))
+    return dftopmost