Skip to content

Commit

Permalink
First version PeakQC.
Browse files Browse the repository at this point in the history
  • Loading branch information
aivett committed Jul 15, 2024
0 parents commit 20b07b8
Show file tree
Hide file tree
Showing 684 changed files with 41,702 additions and 0 deletions.
114 changes: 114 additions & 0 deletions IonToolPack.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import os
import pandas as pd
import sys
from qc.utils import FormatDataframeSamples
from qc.qc_pipeline import qc_pipeline
from tkinter import filedialog, Tk, Button, ttk, Entry, StringVar, messagebox, Label

def import_list_ms_runs():
    """Load a list of MS runs from a user-selected file and show it in the table.

    Opens a file-selection dialog and reads the chosen file with pandas
    (comma separated when the name ends in ``.csv``, tab separated
    otherwise). The table is passed through ``FormatDataframeSamples`` with
    the file's directory as ``basePathCsvRuns``. If rows remain, the output
    path is set to that directory, the "User ions file" label reports whether
    ``User-Ions.csv`` exists there, and the tree view is repopulated.

    Nothing happens when the dialog is cancelled.

    Side effects: rebinds the module-level ``df`` and updates the ``filepath``
    and ``userIonsFileLabel`` Tk variables and the ``tree`` widget.
    """
    global df
    global filepath
    global userIonsFileLabel
    csv_file_path = filedialog.askopenfilename()
    print(csv_file_path)
    # A cancelled dialog yields an empty value; there is no file to read.
    if not csv_file_path:
        return

    # Extension check is case-insensitive so "RUNS.CSV" is also read as comma separated.
    separator = "\t"
    if csv_file_path.lower().endswith('.csv'):
        separator = ','
    df = pd.read_csv(csv_file_path, sep=separator)

    df = FormatDataframeSamples(df, basePathCsvRuns=os.path.dirname(csv_file_path))

    if len(df) == 0:
        return

    filepath.set(os.path.dirname(csv_file_path))
    if os.path.exists(filepath.get() + "/User-Ions.csv"):
        userIonsFileLabel.set("User ions file: DETECTED User-Ions.csv")
    else:
        userIonsFileLabel.set("User ions file: NONE")

    # Clear the treeview list items
    for item in tree.get_children():
        tree.delete(item)

    tree["columns"] = list(df.columns)
    for col in df.columns:
        tree.heading(col, text=col)
    # Even rows carry the "evenrow" tag; odd rows are inserted untagged.
    for idx, (_, row) in enumerate(df.iterrows()):
        if idx % 2 == 0:
            tree.insert("", "end", text=str(idx), values=list(row), tags=("evenrow",))
        else:
            tree.insert("", "end", text=str(idx), values=list(row))

def import_list_ms_runs_clipboard():
    """Build the list of MS runs from paths pasted on the clipboard.

    Each non-blank clipboard line is treated as the path of one run: its base
    name becomes ``MSRUN`` and the remaining prefix becomes ``MSRUNPATH``.
    The resulting table is passed through ``FormatDataframeSamples`` and shown
    in the tree view. The path of the first run is used as the output path,
    and the "User ions file" label reports whether ``User-Ions.csv`` exists
    there.

    Nothing is changed in the UI when the clipboard holds no usable lines or
    the formatted table is empty.

    Side effects: rebinds the module-level ``df`` and updates the ``filepath``
    and ``userIonsFileLabel`` Tk variables and the ``tree`` widget.
    """
    global df
    global filepath
    global userIonsFileLabel
    runs = []
    paths = []
    cb = root.clipboard_get()
    # splitlines() also handles "\r\n" line endings; blank lines (for example
    # the one produced by a trailing newline) are not runs and are skipped.
    for item in cb.splitlines():
        item = item.strip()
        if not item:
            continue
        runx = os.path.basename(item)
        runs.append(runx)
        paths.append(item.removesuffix(runx))

    if len(runs) == 0:
        return

    df = pd.DataFrame({"MSRUN": runs, "MSRUNPATH": paths})
    df = FormatDataframeSamples(df)
    # Same guard as the file-based import: leave the UI untouched if no rows remain.
    if len(df) == 0:
        return

    # Clear the treeview list items
    for item in tree.get_children():
        tree.delete(item)

    tree["columns"] = list(df.columns)
    for col in df.columns:
        tree.heading(col, text=col)
    # Even rows carry the "evenrow" tag; odd rows are inserted untagged.
    for idx, (_, row) in enumerate(df.iterrows()):
        if idx % 2 == 0:
            tree.insert("", "end", text=str(idx), values=list(row), tags=("evenrow",))
        else:
            tree.insert("", "end", text=str(idx), values=list(row))
    # Take the path of the first run as the result path (positional access, so
    # it does not depend on the index labels of the formatted table):
    filepath.set(df["MSRUNPATH"].iloc[0])
    if os.path.exists(filepath.get() + "/User-Ions.csv"):
        userIonsFileLabel.set("User ions file: DETECTED User-Ions.csv")
    else:
        userIonsFileLabel.set("User ions file: NONE")

def call_backend_process():
    """Run the QC pipeline on the imported list of MS runs.

    Shows an error dialog and returns if no run list has been imported yet.
    Otherwise calls ``qc_pipeline`` with the run table and the output path
    from the ``filepath`` Tk variable, additionally passing ``config.toml``
    from that directory when the file exists.
    """
    global filepath
    # ``df`` is only created by the import callbacks, so it may not exist yet
    # when "Process" is pressed first; look it up without raising NameError.
    runs = globals().get("df")
    if runs is None or len(runs) == 0:
        # showerror(title, message): the text must go in the message argument,
        # otherwise it is used as the window title and the dialog body is empty.
        messagebox.showerror("IonToolPack", "Please import a list of MS runs.")
        return

    if os.path.exists(filepath.get() + "/config.toml"):
        qc_pipeline(runs, filepath.get(), filepath.get() + "/config.toml")
    else:
        qc_pipeline(runs, filepath.get())


if __name__ == "__main__":
    import multiprocessing
    from tkinter import TclError

    # Must be called first in the main module so the program keeps working
    # when it is frozen into an executable and uses multiprocessing.
    multiprocessing.freeze_support()

    # No run list exists until one of the import buttons is used; define the
    # global up front so the "Process" callback can test it safely.
    df = None

    root = Tk()
    root.geometry('800x600')
    root.title("IonToolPack v1 | PeakQC")
    try:
        root.iconbitmap(sys.executable)
    except TclError:
        # The window icon is cosmetic: keep the default icon when the running
        # executable cannot be used as an icon source.
        pass

    tree = ttk.Treeview(root, height=20, show="headings")
    filepath = StringVar()
    userIonsFileLabel = StringVar()
    userIonsFileLabel.set("User ions file: NONE")

    # Widgets are packed top to bottom in the order they are created below.
    Button(root, text='Import list of MS runs (.csv, .txt)', command=import_list_ms_runs).pack()
    Button(root, text='Paste MS runs from clipboard', command=import_list_ms_runs_clipboard).pack()
    tree.pack(fill="both")

    Label(root, text="Output path:").pack()  # add label for file path
    Entry(root, textvariable=filepath).pack()  # add text box for file path
    Label(root, textvariable=userIonsFileLabel).pack()  # add label for user ions file
    Button(root, text='Process', command=call_backend_process).pack()

    root.mainloop()
24 changes: 24 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
BSD 2-Clause License

Copyright 2022 Battelle Memorial Institute

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Binary file added PeakQC_User-guide_v2024-06-29.pdf
Binary file not shown.
27 changes: 27 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@

# IonToolPack | PeakQC

IonToolPack is a software suite for housing tools for mass spectrometry. The first one is PeakQC, a software tool for automated quality control (QC) of mass spectrometry (MS) data which is omics-agnostic (works for any ion type, e.g.: metabolomics, lipidomics, proteomics, etc.), supports various instrument platforms and acquisition modes and has a simple graphical user interface.

## Usage
1. Download the latest version (Release section, right panel) and uncompress it
2. Double click IonToolPack.exe
3. Import raw MS files and click “Process”
4. See more details and examples in PeakQC_User-guide_*.pdf

## MS data supported
Supported formats include Agilent '.d', Bruker '.d' (improvements in progress), Thermo '.raw', and mzML, covering different types of MS acquisition methods:
* LC-MS
* LC-IMS-MS
* With/without fragmentation spectra in DDA or DIA mode
* Direct infusion

## Contact

aivett.bilbao@pnnl.gov

## References

If you use this tool or any portions of this code please cite:
* Harrison et al. "PeakQC: A Software Tool for Omics-Agnostic Automated Quality Control of Mass Spectrometry Data". Journal of the American Society for Mass Spectrometry 2024 https://doi.org/10.1021/jasms.4c00146.
* Bilbao et al. "MZA: A Data Conversion Tool to Facilitate Software Development and Artificial Intelligence Research in Multidimensional Mass Spectrometry". Journal of Proteome Research 2023 https://doi.org/10.1021/acs.jproteome.2c00313.
29 changes: 29 additions & 0 deletions config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Configuration file

# For overlaid figures, Half Window size of the view (+- center):
MZVIEWHALFWINDOW = 0.7
RTVIEWHALFWINDOW = 1.5
ATVIEWHALFWINDOW = 2

# Mass tolerance for chromatogram extraction, Half Window m/z value (+- center or m/z target):
MZXICHALFWINDOW = 0.01

# Error thresholds, absolute value:
MZERRORPPM = 15
RTERROR = 0.3
ATERROR = 0.1
ABUNDANCEERROR = 30 # Percentage absolute error, a percentage of the mean of the ion abundance applied as a threshold to report QC ions outside tolerances

AutoTrackedIonsTopN = 4 # Number of auto-tracked ions to detect per sample group
MinIntensityPresencePercentage = 80 # Intensity threshold presence/absence

MinMzDistDetectCentroidMS = 0.0005 # If distance between 2 consecutive points from the max intensity peak is smaller than this value then it is considered profile mode spectrum

FigureLegendMaxNumberLines = 15

# MZA conversion:
MinIntensityMza = 20

# time-vs-mz images:
TimeVsMzImageMinIntensityPercentage = 10
TimeVsMzImageMaxIntensityCeilingPercentage = 70
51 changes: 51 additions & 0 deletions qc/auto_ion_tracking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import numpy as np
import pandas as pd
from qc.ion_batch import GetHighResCoordinates

def DetectTopmostIons(dfions, dfruns, topIons=4, minMzDistDetectCentroidMS=0.005):
    """Select the top ions per sample group across retention time and refine them.

    For every ``LABELSAMPLEGROUP`` in ``dfions`` the RT range of that group is
    cut into ``topIons`` equal-width bins. In each non-empty bin the row with
    the highest ``FREQ`` is selected, ties being broken by the highest
    ``INTENSITY``. m/z values are truncated to integers for this selection,
    and an integer m/z that was already selected is never selected again,
    including for later groups. The m/z and RT of every selected ion are then
    replaced by the values returned by ``GetHighResCoordinates`` for the
    ``.mza`` file of the first run of the ion's sample group.

    Args:
        dfions: DataFrame with at least the columns ``LABELSAMPLEGROUP``,
            ``MZ``, ``RT``, ``FREQ`` and ``INTENSITY``. It is not modified.
        dfruns: DataFrame with at least the columns ``LABELSAMPLEGROUP`` and
            ``MZAPATH`` (path without the ``.mza`` extension). It is not
            modified.
        topIons: Number of RT bins, i.e. the maximum number of ions selected
            per sample group.
        minMzDistDetectCentroidMS: Forwarded to ``GetHighResCoordinates`` as
            ``minMzDistCentroid``.

    Returns:
        DataFrame with the columns of ``dfions`` plus ``MOLECULE`` ("Ion1",
        "Ion2", ...), holding one row per selected ion whose refined ``MZ``
        and ``RT`` are both positive. Empty (but with those columns) when no
        ion could be selected.
    """
    dfions = dfions.copy()
    dfions["rtRegion"] = 0
    dfions["MZ"] = dfions["MZ"].astype(int)
    Nrt = int(topIons)  # take more blocks across RT because the mass is usually very stable and good
    selectedMzs = []  # to keep only the first MZ, each ion will be extracted from all runs
    selected_frames = []
    for label in dfions['LABELSAMPLEGROUP'].unique():
        # Work on an explicit copy: rows of this group are flagged below and
        # must not be written through a slice of ``dfions``.
        df = dfions[(dfions["LABELSAMPLEGROUP"] == label) & (~dfions["MZ"].isin(selectedMzs))].copy()
        if len(df) == 0:
            continue
        # Step 1: Partition the space
        rtbins = pd.cut(df['RT'], bins=Nrt, labels=False)
        # Step 2: Select rows with maximum frequency and intensity in each zone
        rtRegion = 1
        for rt_bin in range(Nrt):
            # selectedMzs grows inside this loop, so the exclusion is re-evaluated per bin.
            zone_rows = df[(rtbins == rt_bin) & (~df["MZ"].isin(selectedMzs))]
            if zone_rows.empty:
                continue
            zone_rows = zone_rows[zone_rows['FREQ'] == zone_rows['FREQ'].max()]
            best_idx = zone_rows['INTENSITY'].idxmax()
            df.at[best_idx, 'rtRegion'] = rtRegion
            rtRegion = rtRegion + 1
            selectedMzs.append(df.at[best_idx, "MZ"])
        selected_frames.append(df[df["rtRegion"] > 0])

    if selected_frames:
        dftopmost = pd.concat(selected_frames)
    else:
        # Nothing selected (e.g. empty input): keep the column layout so the
        # remaining steps and the callers still find the expected columns.
        dftopmost = dfions.iloc[0:0].copy()
    dftopmost.reset_index(inplace=True, drop=True)
    # The refined coordinates are floating point values; make the columns float
    # up front instead of relying on an implicit upcast of the integer m/z column.
    dftopmost["MZ"] = dftopmost["MZ"].astype(float)
    dftopmost["RT"] = dftopmost["RT"].astype(float)

    # keep only the first MS run for each LABELSAMPLEGROUP
    dfruns = dfruns.copy()
    dfruns.drop_duplicates(['LABELSAMPLEGROUP'], inplace=True, keep='first')
    dfruns.reset_index(inplace=True, drop=True)
    for k in dftopmost.index:
        ionmz = dftopmost.at[k, "MZ"]
        ionrt = dftopmost.at[k, "RT"] / 10  # <- ToDo: correct scaling in PCA.py
        group_runs = dfruns["MZAPATH"][dfruns["LABELSAMPLEGROUP"] == dftopmost.at[k, "LABELSAMPLEGROUP"]]
        mzaFile = group_runs.iloc[0] + ".mza"
        # these tolerances must be kept at unit resolution
        ionmz, ionrt = GetHighResCoordinates(mzaFile, ionmz, ionrt, rtrange=1, mzrange=1, minMzDistCentroid=minMzDistDetectCentroidMS)
        dftopmost.loc[k, "MZ"] = ionmz
        dftopmost.loc[k, "RT"] = ionrt

    # Drop ions whose refined coordinates are not positive, then keep the best
    # row (highest FREQ, then INTENSITY) per sample group and RT region.
    dftopmost = dftopmost[(dftopmost["MZ"] > 0) & (dftopmost["RT"] > 0)]
    dftopmost = dftopmost.sort_values(by=["LABELSAMPLEGROUP", "rtRegion", "FREQ", "INTENSITY"], ascending=[True, True, False, False])
    dftopmost = dftopmost.drop_duplicates(subset=["LABELSAMPLEGROUP", "rtRegion"], keep='first')
    dftopmost = dftopmost.drop(columns=["rtRegion"])
    dftopmost = dftopmost.reset_index(drop=True)
    dftopmost["MOLECULE"] = ["Ion" + str(k + 1) for k in dftopmost.index]
    return dftopmost
Loading

0 comments on commit 20b07b8

Please sign in to comment.