Merge pull request #65 from m-beau/m-beau

m-beau authored Sep 10, 2021
2 parents be1eb44 + 78205b3 commit 4d323c8
Showing 14 changed files with 388 additions and 776 deletions.
24 changes: 14 additions & 10 deletions README.md
@@ -1,16 +1,18 @@
# Npyx: loading, processing and plotting Neuropixels data
# Npyx: loading, processing and plotting Neuropixels data in python

[![PyPI Version](https://img.shields.io/pypi/v/npyx.svg)](https://pypi.org/project/npyx/)

Npyx is a pure python library built for electrophysiologists using Neuropixels. It features a suite of core utility functions for loading, processing and plotting Neuropixels data.
Npyx is a python library built for electrophysiologists using Neuropixels electrodes. It features a suite of core utility functions for loading, processing and plotting Neuropixels data.

This package results from the needs of an experimentalist who could not stand MATLAB, hence wrote himself a suite of functions to emotionally bear doing neuroscience. There isn't any dedicated preprint available yet, so if you enjoy this package and use it for your research, please cite [this paper](https://www.nature.com/articles/s41593-019-0381-8). Cheers!
This package results from the needs of an experimentalist who could not stand MATLAB and hence wrote himself a suite of functions to make doing neuroscience emotionally bearable. There isn't any dedicated preprint available yet, so if you enjoy this package and use it for your research, please star [the github repo](https://github.com/m-beau/NeuroPyxels) (click on the top-right star button!) and cite [this paper](https://www.nature.com/articles/s41593-019-0381-8). Cheers!

## Documentation:
Npyx works in harmony with the data formatting employed by [SpikeGLX](https://billkarsh.github.io/SpikeGLX/) used in combination with [Kilosort](https://github.com/MouseLand/Kilosort) and [Phy](https://phy.readthedocs.io/en/latest/).

Most npyx functions take at least one input: **`dp`**, which is the path to your Kilosort/phy dataset. You can find a [full description of the structure of such datasets](https://phy.readthedocs.io/en/latest/sorting_user_guide/#installation) in the phy documentation.

Importantly, `dp` can also be the path to a merged dataset, generated by npyx.merger (formerly circuitProphyler) - every function will (normally) treat a merged dataset folder like any regular dataset folder.

More precisely, every function requires the files `myrecording.ap.meta`, `spike_times.npy` and `spike_clusters.npy`. Beyond that, specific functions require specific files: loading waveforms with `npyx.spk_wvf.wvf` or extracting your sync channel with `npyx.io.get_npix_sync` requires the raw data `myrecording.ap.bin`, extracting the spike-sorting group of your units requires `cluster_group.tsv`, and so on.
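
A minimal workflow might look like the sketch below (a sketch only - check the exact signatures of `get_units`, `trn` and `get_npix_sync` against their docstrings); fuller use cases follow.

```python
from npyx.gl import get_units
from npyx.spk_t import trn
from npyx.io import get_npix_sync

dp = 'path/to/myrecording'                  # Kilosort/phy output folder
good_units = get_units(dp, quality='good')  # units labelled 'good' in cluster_group.tsv
t = trn(dp, good_units[0])                  # spike train of one unit, in samples
onsets, offsets = get_npix_sync(dp)         # sync channel edges; requires myrecording.ap.bin
```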

Example use cases are:
@@ -79,22 +81,25 @@ u=234
# plot waveform, 2.8ms around center, on 8 channels around peak channel,
# with no single waveforms in the background (sample_lines)
fig = plot_wvf(dp, u, Nchannels=8, t_waveforms=2.8, sample_lines=0)
```

```python
# plot ccg between 234 and 92
fig = plot_ccg(dp, [u,92], cbin=0.2, cwin=80)
fig = plot_ccg(dp, [u,92], cbin=0.2, cwin=80, as_grid=True)
```
![ccg](images/ccg.png)

### Merge datasets acquired on two probes simultaneously
```python
# The three recordings need to include the same sync channel.
from npyx.Prophyler import Prophyler
from npyx.merger import Merger
dps = ['same_folder/lateralprobe_dataset',
       'same_folder/medialprobe_dataset',
       'same_folder/anteriorprobe_dataset']
probenames = ['lateral','medial','anterior']
dp_dict = {p:dp for p, dp in zip(probenames, dps)} # {probe_name: dataset_path}
multipro = Prophyler(dp_dic)
dp=multipro.dp_pro
merged = Merger(dp_dict)
dp=merged.dp_merged
# This will merge the 3 datasets (only relevant information, not the raw data) in a new folder at
# same_folder/prophyler_lateralprobe_dataset_medialprobe_dataset_anteriorprobe_dataset
# which can then be used as if it were a single dataset by all npyx functions.
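# For instance (an illustrative sketch: the unit IDs are placeholders, and the exact
# unit-naming convention for merged datasets is an assumption - check the npyx.merger docstrings):
fig = plot_ccg(dp, [234, 92], cbin=0.2, cwin=80)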
@@ -153,8 +158,7 @@ Useful link to [create a python package from a git repository](https://towardsda

### Push local updates to github:
```bash
# DO NOT DO THAT WITHOUT CHECKING WITH MAXIME FIRST
# ONLY ON DEDICATED BRANCH CREATED WITH THE GITHUB BROWSER INTERFACE
# ONLY ON DEDICATED BRANCH
cd path/to/save_dir/NeuroPyxels
git checkout DEDICATED_BRANCH_NAME # ++++++ IMPORTANT
@@ -165,7 +169,7 @@ git push origin DEDICATED_BRANCH_NAME # ++++++ IMPORTANT
# Then open a pull request to the master branch using the online github green button! Do not forget this last step, so that everyone else's repo can stay in sync.
```

### Push local updates to PyPI (only Maxime)
### Push local updates to PyPI (Maxime)
First change the version in ./setup.py in a text editor
```python
setup(name='npyx',
2 changes: 1 addition & 1 deletion build/lib/npyx/__init__.py
@@ -29,4 +29,4 @@
npyx.stats
"""

__version__ = '1.6.4'
__version__ = '1.6.5'
2 changes: 1 addition & 1 deletion build/lib/npyx/feat.py
@@ -1916,6 +1916,6 @@ def filter_df(dfram):
"""
all_conds = np.isclose(np.sum(dfram.iloc[:,-6:-1], axis = 1), 0) | np.isclose(np.sum(dfram.iloc[:,-11:-6], axis=1), 0) | np.isclose(np.sum(dfram.iloc[:,16:-12], axis = 1), 0) | np.isclose(np.sum(dfram.iloc[:,2:17], axis =1), 0) | np.array(dfram.iloc[:,:-1].isnull().any(axis=1)).flatten() | np.isclose(dfram.iloc[:,-12], 0)

return dfram[~all_conds]
return dfram[~all_conds]


30 changes: 15 additions & 15 deletions build/lib/npyx/gl.py
@@ -24,32 +24,32 @@ def get_rec_len(dp, unit='seconds'):
    if unit=='milliseconds':t_end*=1e3
    return t_end

def regenerate_cluster_groups(dp):
    units=np.unique(np.load(Path(dp,"spike_clusters.npy")))
    qualities=pd.DataFrame({'cluster_id':units, 'group':['unsorted']*len(units)})
    return qualities

def load_units_qualities(dp, again=False):
    f='cluster_group.tsv'
    regenerate=False
    if os.path.isfile(Path(dp, f)):
    regenerate=False if not again else True
    if Path(dp, f).exists():
        qualities = pd.read_csv(Path(dp, f),delimiter='\t')
        if 'group' not in qualities.columns:
            print('WARNING there does not seem to be any group column in cluster_group.tsv - kilosort >2 weirdness. Making a fresh file.')
            regenerate=True
        else:
            if 'unsorted' not in qualities['group'].values:
                regenerate=True
        if regenerate:
            qualities_new=regenerate_cluster_groups(dp)
            missing_clusters_m=(~np.isin(qualities_new['cluster_id'], qualities['cluster_id']))
            qualities_missing=qualities_new.loc[missing_clusters_m,:]
            qualities=qualities.append(qualities_missing).sort_values('cluster_id')
            qualities.to_csv(Path(dp, 'cluster_group.tsv'), sep='\t', index=False)

    else:
        print('cluster groups table not found in provided data path. Generated from spike_clusters.npy.')
        regenerate=True

    if regenerate:
        units=np.unique(np.load(Path(dp,"spike_clusters.npy")))
        qualities=pd.DataFrame({'cluster_id':units, 'group':['unsorted']*len(units)})
        qualities.to_csv(Path(dp, 'cluster_group.tsv'), sep='\t', index=False)
        return qualities

    if again: # file was found if this line is reached
        units=np.unique(np.load(Path(dp,"spike_clusters.npy")))
        new_unsorted_units=units[~np.isin(units, qualities['cluster_id'])]
        qualities=qualities.append(pd.DataFrame({'cluster_id':new_unsorted_units, 'group':['unsorted']*len(new_unsorted_units)}), ignore_index=True)
        qualities=qualities.sort_values('cluster_id')
        qualities=regenerate_cluster_groups(dp)
        qualities.to_csv(Path(dp, 'cluster_group.tsv'), sep='\t', index=False)

    return qualities
Binary file added images/ccg.png
2 changes: 1 addition & 1 deletion npyx.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: npyx
Version: 1.6.4
Version: 1.6.5
Summary: Python routines dealing with Neuropixels data.
Home-page: https://github.com/Npix-routines/NeuroPyxels
Author: Maxime Beau
17 changes: 13 additions & 4 deletions npyx/__init__.py
@@ -1,9 +1,18 @@
# -*- coding: utf-8 -*-

from . import utils, io, gl, spk_t, spk_wvf, corr, plot, ml, behav, circuitProphyler, stats

from .utils import npa, sign, thresh, smooth, zscore

from . import utils, io, gl, spk_t, spk_wvf, corr, plot, ml, behav, merger, stats

from .utils import *
from .io import *
from .gl import *
from .spk_t import *
from .spk_wvf import *
from .corr import *
from .plot import *
from .ml import *
from .behav import *
from .merger import *
from .stats import *

__doc__="""
44 changes: 27 additions & 17 deletions npyx/feat.py
@@ -15,6 +15,7 @@
"""
import tqdm

import numpy as np
from pathlib import Path
@@ -73,7 +74,7 @@ def neg_peak_amp_time(waves, axes= 1):

def cross_times(waves, axis = 1):
"""
find where the wvf 'crosses' 0 between the most negative and the next peak
find where the wvf 'crosses' 0 between the most negative and the next peak
"""
if len(waves.shape)==1:
@@ -401,10 +402,10 @@ def end_amp_time(waves, axis = 1):
peak_t, peak_v = detect_peaks(waves)
onset_v = peak_v[-1]*0.05

# now we find the last crossing of this value
# now we find the last crossing of this value
# get section after the last peak
after_peak= waves[peak_t[-1]:]
# now we have list of crossings where
# now we have list of crossings where
crossings = np.where(np.diff(np.sign(after_peak-onset_v)))

# if there are crossings, get the first of these
@@ -591,7 +592,7 @@ def pos_half_width(waves, axes = 1):

neg_t, neg_v, pos_t, pos_v, flipped_order = peaks_order(waves)

# regradless of the order of the peaks, return the half width time for the
# regradless of the order of the peaks, return the half width time for the
# the positive peak
perc_50 = 0.5 * pos_v

@@ -614,7 +615,7 @@ def pos_half_width(waves, axes = 1):
return cross_start, cross_end,perc_50, cross_end - cross_start


# old
# old
# get the wvf peaks
# peak_t, peak_v = detect_peaks(waves)
#
@@ -637,7 +638,7 @@ def pos_half_width(waves, axes = 1):
# # also need this in case there are other crossings that might happen at
# # other times of the recording
#
# # depending on the orientation of the peaks, find the corresponding
# # depending on the orientation of the peaks, find the corresponding
# # pos half-width
# start_interval = cross_times(waves)[0].astype(int)
# end_interval = end_amp_time(waves)[0]
@@ -883,7 +884,7 @@ def in_distance_surface(dp, dist_limit=2000 ):
filter out neurons that are so deep as to be in the nuclei.
"""

# get the peak channels for each unit sorted by channel location
# get the peak channels for each unit sorted by channel location
peak_chan_depth = get_depthSort_peakChans(dp, units = [], quality = 'good')

surface_chan = peak_chan_depth[0,1]
@@ -930,7 +931,7 @@ def chan_spread(all_wav, chan_path, unit, n_chans = 20, chan_spread_dist = 25.6)

_,_, p2p = consecutive_peaks_amp(all_wav.T)

# search for the file that has the given peak chan
# search for the file that has the given peak chan
max_chan_path = list(Path(chan_path/'routinesMemory').glob(f'dsm_{unit}_peakchan*'))[0]
max_chan = int(np.load(max_chan_path))

@@ -942,7 +943,7 @@ def chan_spread(all_wav, chan_path, unit, n_chans = 20, chan_spread_dist = 25.6)

# I want a distribution of amplitudes with the x axis being distance
# so sort the matrix so the first column is sorted
# plot the second column
# plot the second column
sort_dist = np.argsort(dists)
sort_dist_p2p = np.vstack((dists[sort_dist], p2p[sort_dist])).T

@@ -1034,7 +1035,7 @@ def chan_plot(waves, peak_chan, n_chans=20):
fig,ax = plt.subplots(n_chans+1,2)

for i in range(plot_waves.shape[0]):
# find where to have the plot depending on
# find where to have the plot depending on

y_axis = (i//2)%(plot_waves.shape[0]+1 )
x_axis = i%2
@@ -1428,11 +1429,11 @@ def wvf_feat(dp, units):
if len_unit_spks > 10_000:
# if there is enough ram then the process is much faster
# How much RAM is available and how much is needed
# number of spikes * number of channels per spike * number of datapoints per chan (including padding)
# number of spikes * number of channels per spike * number of datapoints per chan (including padding)
# ram_needed = len_unit_spks * 384 *182
# ram_available = vmem().available
# if ram_needed < ram_available:
# # if there is enough ram to store all the spikes in memory, FAST
# # if there is enough ram to store all the spikes in memory, FAST
# mean_pc, extracted_waves, _, max_chan = wvf_dsmatch(dp,unit, prnt=False, again=False,fast =True, save = True)
# else:
# # not enough RAM to store all spikes in memory, Slow
@@ -1596,6 +1597,7 @@ def process_all(recs_fn, show = False, again = False):

all_feat = []
for i, ds in list(recs.items())[:]:
print(f"/nProcessing dataset {ds['dp']}...")
data_root = Path(ds['dp'])/'routinesMemory'
features_folder = data_root / 'features'
acg_folder = data_root / 'acg'
@@ -1637,7 +1639,10 @@

within_bounds = in_distance_surface(ds['dp'])

print("Extracting waveform and temporal features...")
pbar = tqdm.tqdm(total=len(good_units))
for unit in good_units:
pbar.update(1)
acg_filename = rec_name + '_' + 'acg' + '_' + '_' + str(unit) + '_' + cell_type + '.npy'
wvf_filename = rec_name + '_' + 'wvf' + '_' + '_' + str(unit) + '_' + cell_type + '.npy'
mean_wvf_path = wvf_folder / wvf_filename
@@ -1680,6 +1685,7 @@ def process_all(recs_fn, show = False, again = False):
wvf_files = []
acg_files = []

print("Computing PCA features across datasets...")
for i, ds in list(recs.items())[:]:
# data_root = Path('/home/npyx/projects/optotag/proc_data')
data_root = Path(ds['dp'])/'routinesMemory'
@@ -1726,7 +1732,7 @@ def process_all(recs_fn, show = False, again = False):
all_acgs = np.vstack(load_acg)

# find where either of the matrices are 0 or inf valued and filter these vectors
# from both
# from both
zero_rows_acg = np.where(np.sum(all_acgs, axis = 1) ==0)[0].tolist()
zero_rows_wvf = np.where(np.sum(all_wvfs, axis = 1) ==0)[0].tolist()

@@ -1748,7 +1754,7 @@ def process_all(recs_fn, show = False, again = False):
masked_acg_files = acg_files[mask]
masked_wvf_files = wvf_files[mask]

# push all the processed wvf and acg through a PCA
# push all the processed wvf and acg through a PCA
# and get the first few principal components

acg_projected, acg_pca = get_pca_weights(masked_acgs, show = show, titl = 'ACG')
@@ -1759,7 +1765,7 @@ def process_all(recs_fn, show = False, again = False):
# now that we have the pca of some of the projected pcas
# need to make a new vector that can be then merged back with the dataframe

### MANUAL FILTERING here
### MANUAL FILTERING here
### THERE ARE some clear outlier wvf visible in the pca space that I am removing
### I checked these items being filtered out by hand and they can be thrown out

@@ -1783,7 +1789,11 @@ def process_all(recs_fn, show = False, again = False):
# add them to a dataframe
# join the dataframe with the wvf and acg PCA
# remove the rows where the features, wvf or pca are 0
cols1 = ['file', 'unit', 'mfr', 'mifr', 'med_isi', 'mode_isi', 'prct5ISI', 'entropy','CV2_mean', 'CV2_median', 'CV', 'IR', 'Lv', 'LvR', 'LcV', 'SI', 'skw', 'neg voltage', 'neg time', 'pos voltage', 'pos time' , 'pos 10-90 time', 'neg 10-90 time', 'pos 50%', 'neg 50%', 'onset time', 'onset voltage', 'wvf duration', 'peak/trough ratio','recovery slope', 'repolarisation slope','chan spread', 'backprop', 'under2mm']
cols1 = ['file', 'unit', 'mfr', 'mifr', 'med_isi', 'mode_isi', 'prct5ISI',
'entropy','CV2_mean', 'CV2_median', 'CV', 'IR', 'Lv', 'LvR', 'LcV', 'SI', 'skw',
'neg voltage', 'neg time', 'pos voltage', 'pos time' , 'pos 10-90 time', 'neg 10-90 time',
'pos 50%', 'neg 50%', 'onset time', 'onset voltage', 'wvf duration', 'peak/trough ratio','recovery slope',
'repolarisation slope','chan spread', 'backprop', 'under2mm']

all_feat_df = pd.DataFrame()

@@ -1843,7 +1853,7 @@ def process_all(recs_fn, show = False, again = False):

# need to add cell type data
# create new list of NaN values
# loop over all the files and if there are ones that
# loop over all the files and if there are ones that

cell_type_tagged = []

