90 changes: 70 additions & 20 deletions doc/modules/curation.rst
@@ -206,7 +206,7 @@ Curation format
SpikeInterface internally supports a JSON-based manual curation format.
When manual curation is necessary, modifying a dataset in place is a bad practice.
Instead, to ensure the reproducibility of spike sorting pipelines, we have introduced a simple, JSON-based manual curation format.
This format defines at the moment : merges + deletions + manual tags.
This format currently defines: manual labelling, removal, merging, splitting, and removal of spikes from a unit.
This file can be kept alongside the output of a sorter and applied to the result to obtain a "clean" result.

This format has two parts:
@@ -216,21 +216,26 @@
* "format_version" : format specification
* "unit_ids" : the list of unit_ds
* "label_definitions" : list of label categories and possible labels per category.
Every category can be *exclusive=True* onely one label or *exclusive=False* several labels possible
If a unit can only have one label, the category can be set to *exclusive=True*. If several labels can be used at once, the category can be set to *exclusive=False*.

* **manual output** curation with the following keys:

* "manual_labels"
* "merge_unit_groups"
* "removed_units"
* "merges"
* "removed"
* "splits"
* "discard_spikes"

Here is the description of the format with a simple example (the first part of the
format is the definition; the second part of the format is manual action):
The first three ("manual_labels", "merges" and "removed") act at the unit level: they label, merge or remove whole units. The final two
("splits" and "discard_spikes") act at the spike level: they define which spikes from a unit are split into a new unit, or which
spikes from a unit are discarded. Note that all spike indices are with respect to the original analyzer.

Here is a simple example of the format:

.. code-block:: json

    {
        "format_version": "1",
        "format_version": "3",
        "unit_ids": [
            "u1",
            "u2",
@@ -266,25 +271,31 @@
"manual_labels": [
{
"unit_id": "u1",
"quality": [
"good"
]
"labels": {
"quality": [
"good"
]
}
},
{
"unit_id": "u2",
"quality": [
"noise"
],
"putative_type": [
"excitatory",
"pyramidal"
]
"labels": {
"quality": [
"noise"
],
"putative_type": [
"excitatory",
"pyramidal"
]
}
},
{
"unit_id": "u3",
"putative_type": [
"inhibitory"
]
"labels": {
"putative_type": [
"inhibitory"
]
}
}
],
"merge_unit_groups": [
@@ -301,9 +312,48 @@
"removed_units": [
"u31",
"u42"
],
"splits": [
{
"unit_id": "u1",
"mode": "indices",
"indices": [
[
10,
20,
30
]
],
"new_unit_ids": [
"u1-1",
"u1-2"
]
}
],
"discard_spikes": [
{
"unit_id": "u10",
"indices": [
56,
57,
59,
60
]
},
{
"unit_id": "u14",
"indices": [
123,
321
]
}
]
}

Note that a unit cannot be both split and merged in the same curation. In the ``splits`` entry above, the single list of
indices implicitly defines two groups (the listed spikes and the remaining spikes), which is why two ``new_unit_ids`` are given.

We do not expect users to write their own curation JSON files by hand. Instead, our internal curation algorithms output
results which can easily be transformed into this format. We also hope that external packages can use our format.

The curation format can be loaded into a dictionary and directly applied to
a ``BaseSorting`` or ``SortingAnalyzer`` object using the :py:func:`~spikeinterface.curation.apply_curation` function.
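
For example, the following sketch loads a curation file and applies it to an existing
``SortingAnalyzer`` (the file name and the ``sorting_analyzer`` variable are illustrative):

.. code-block:: python

    import json

    from spikeinterface.curation import apply_curation

    # load the curation file (e.g. produced by a curation tool)
    with open("curation.json") as f:
        curation_dict = json.load(f)

    # returns a curated copy; the original object is not modified in place
    curated_analyzer = apply_curation(sorting_analyzer, curation_dict)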
135 changes: 95 additions & 40 deletions src/spikeinterface/core/sorting_tools.py
@@ -517,6 +517,7 @@ def apply_splits_to_sorting(
    sorting: BaseSorting,
    unit_splits: dict[int | str, list[list[int | str]]],
    new_unit_ids: list[list[int | str]] | None = None,
    discard_spikes_unit_ids: list[int | str] | None = None,
    return_extra: bool = False,
    new_id_strategy: str = "append",
):
@@ -543,6 +544,8 @@
        List of new unit_ids for each split. If given, it needs to have the same length as `unit_splits`,
        and each element must have the same length as the corresponding list of split indices.
        If None, new ids will be generated.
    discard_spikes_unit_ids : list | None, default: None
        List of units which contain spikes to discard.
    return_extra : bool, default: False
        If True, also return the new_unit_ids.
    new_id_strategy : "append" | "split", default: "append"
@@ -565,7 +568,11 @@
    # this is true when running via apply_curation

    new_unit_ids = generate_unit_ids_for_split(
        sorting.unit_ids, unit_splits, new_unit_ids=new_unit_ids, new_id_strategy=new_id_strategy
        sorting.unit_ids,
        unit_splits,
        new_unit_ids=new_unit_ids,
        new_id_strategy=new_id_strategy,
        discard_spikes_unit_ids=discard_spikes_unit_ids,
    )
    all_unit_ids = _get_ids_after_splitting(sorting.unit_ids, unit_splits, new_unit_ids)
    all_unit_ids = list(all_unit_ids)
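
A usage sketch of the new argument (not part of the diff; assumes an existing ``BaseSorting`` named
``sorting`` with integer unit ids, and illustrative spike indices; per the id-generation convention
below, the first group of a unit listed in ``discard_spikes_unit_ids`` holds the spikes to discard):

.. code-block:: python

    from spikeinterface.core.sorting_tools import apply_splits_to_sorting

    # split unit 1 into two groups of spike indices; listing unit 1 in
    # discard_spikes_unit_ids marks its first group as the spikes to discard
    split_sorting = apply_splits_to_sorting(
        sorting,
        unit_splits={1: [[10, 20, 30], [40, 50, 60]]},
        discard_spikes_unit_ids=[1],
        new_id_strategy="append",
    )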
@@ -660,7 +667,9 @@ def set_properties_after_splits(
    sorting_post_split.set_property("is_split", is_split)


def generate_unit_ids_for_split(old_unit_ids, unit_splits, new_unit_ids=None, new_id_strategy="append"):
def generate_unit_ids_for_split(
    old_unit_ids, unit_splits, new_unit_ids=None, new_id_strategy="append", discard_spikes_unit_ids=None
):
"""
Function to generate new units ids during a splitting procedure. If `new_units_ids`
are provided, it will return these unit ids, checking that they are consistent with
@@ -682,6 +691,8 @@
* "split" : new_unit_ids will be the created as {split_unit_id]-{split_number}
(e.g. when splitting unit "13" in 2: "13-0" / "13-1").
Only works if unit_ids are str otherwise switch to "append"
discard_spikes_unit_ids : list | None, default: None
List of units which contain spikes to discard.
Returns
-------
@@ -691,46 +702,90 @@
    assert new_id_strategy in ["append", "split"], "new_id_strategy should be 'append' or 'split'"
    old_unit_ids = np.asarray(old_unit_ids)

    if new_unit_ids is not None:
        for split_unit, new_split_ids in zip(unit_splits.values(), new_unit_ids):
            # then only doing a consistency check
            assert len(split_unit) == len(new_split_ids), "new_unit_ids should have the same len as unit_splits.values"
            # new_unit_ids can also be part of old_unit_ids only inside the same group:
            assert all(
                new_split_id not in old_unit_ids for new_split_id in new_split_ids
            ), "new_unit_ids already exists but outside the split groups"
    else:
        dtype = old_unit_ids.dtype
        if np.issubdtype(dtype, np.integer) and new_id_strategy == "split":
            warnings.warn("new_id_strategy 'split' is not compatible with integer unit_ids. Switching to 'append'.")
            new_id_strategy = "append"

        new_unit_ids = []
        current_unit_ids = old_unit_ids.copy()
        for unit_to_split, split_indices in unit_splits.items():
            num_splits = len(split_indices)
            # select new_unit_ids greater than the max id, even greater than the numerical str ids
            if new_id_strategy == "append":
                if np.issubdtype(dtype, np.character):
                    # dtype str
                    if all(p.isdigit() for p in current_unit_ids):
                        # all str are digits: we can generate a max
                        m = max(int(p) for p in current_unit_ids) + 1
                        new_units_for_split = [str(m + i) for i in range(num_splits)]
                    else:
                        # we cannot automatically find new names
                        new_units_for_split = [f"{unit_to_split}-split{i}" for i in range(num_splits)]
                else:
                    # dtype int
                    new_units_for_split = list(max(current_unit_ids) + 1 + np.arange(num_splits, dtype=dtype))
                # we append the new split unit ids to continue to increment the max id
                current_unit_ids = np.concatenate([current_unit_ids, new_units_for_split])
            elif new_id_strategy == "split":
                # we made sure that dtype is not integer
    new_new_unit_ids = []
    if discard_spikes_unit_ids is None:
        discard_spikes_unit_ids = []

    if new_unit_ids is not None and len(new_unit_ids) > 0:
        for split_unit, new_split_ids in zip(unit_splits.keys(), new_unit_ids):
            if split_unit in discard_spikes_unit_ids:
                # make the discard unit have the original unit_id, which we'll delete later
                new_new_unit_ids.append([split_unit] + new_split_ids)
            else:
                new_new_unit_ids.append(new_split_ids)

        return new_new_unit_ids

    dtype = old_unit_ids.dtype
    if np.issubdtype(dtype, np.integer) and new_id_strategy == "split":
        warnings.warn("new_id_strategy 'split' is not compatible with integer unit_ids. Switching to 'append'.")
        new_id_strategy = "append"

    # if unit_ids are a list of numeric strings, we convert to int, use the "append" code with int unit_ids,
    # then convert back to strings
    all_strings = False
    if np.issubdtype(dtype, np.character):
        if all(p.isdigit() for p in old_unit_ids):
            all_strings = True
            old_unit_ids = [int(unit_id) for unit_id in old_unit_ids]
        else:
            # we cannot automatically find new names
            warnings.warn(
                "new_id_strategy 'append' is not compatible with non-numeric string unit_ids. Switching to 'split'."
            )
            new_id_strategy = "split"

    if new_id_strategy == "append":
        next_max_unit_id = max(old_unit_ids) + 1
        highest_possible_unit_id = max(old_unit_ids) + len(discard_spikes_unit_ids)

    for unit_to_split, split_indices in unit_splits.items():

        num_splits = len(split_indices)

        # decide if unit is a simple discard, a simple split, or a discard and split
        just_discard = False
        discard_and_split = False
        if unit_to_split in discard_spikes_unit_ids:
            if num_splits == 2:
                just_discard = True
            elif num_splits > 2:
                discard_and_split = True

        # the new units made from the discard_spikes must be at the *front* of the new_units_for_split list
        if new_id_strategy == "append":
            if just_discard:
                # give the discard unit the highest possible id, leaving the cleaned unit's id untouched
                new_units_for_split = [highest_possible_unit_id, unit_to_split]
                highest_possible_unit_id -= 1
            elif discard_and_split:
                # give the discard unit the id `unit_to_split`, and the other units their expected append ids
                new_units_for_split = [unit_to_split] + list(next_max_unit_id + np.arange(num_splits - 1))
                next_max_unit_id += len(new_units_for_split) - 1
            else:
                # there is no discard unit
                new_units_for_split = list(next_max_unit_id + np.arange(num_splits))
                next_max_unit_id += len(new_units_for_split)

        elif new_id_strategy == "split":
            if just_discard:
                new_units_for_split = [f"{unit_to_split}-{i}" for i in np.arange(len(split_indices) - 1)] + [
                    unit_to_split
                ]
            elif discard_and_split:
                new_units_for_split = [unit_to_split] + [
                    f"{unit_to_split}-{i}" for i in np.arange(len(split_indices) - 1)
                ]
            else:
                new_units_for_split = [f"{unit_to_split}-{i}" for i in np.arange(len(split_indices))]
            new_unit_ids.append(new_units_for_split)

    return new_unit_ids
        new_new_unit_ids.append(new_units_for_split)

    if all_strings:
        new_new_unit_ids = [
            [str(unit_id) for unit_id in new_units_for_split] for new_units_for_split in new_new_unit_ids
        ]

    return new_new_unit_ids
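
A quick trace of the id-generation rules above (a sketch, assuming the function behaves as the diff reads,
with integer unit ids, the default "append" strategy, and illustrative spike indices):

.. code-block:: python

    from spikeinterface.core.sorting_tools import generate_unit_ids_for_split

    old_unit_ids = [0, 1, 2]

    # plain split of unit 1: both groups get fresh ids after the current maximum
    assert generate_unit_ids_for_split(old_unit_ids, {1: [[10, 11], [12, 13]]}) == [[3, 4]]

    # split with discard: the discarded group (the first one) gets a throwaway id,
    # while the kept group retains the original id 1
    assert generate_unit_ids_for_split(
        old_unit_ids, {1: [[10, 11], [12, 13]]}, discard_spikes_unit_ids=[1]
    ) == [[3, 1]]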


def check_unit_splits_consistency(unit_splits, sorting):
5 changes: 4 additions & 1 deletion src/spikeinterface/core/sortinganalyzer.py
@@ -1386,6 +1386,7 @@ def split_units(
        new_unit_ids: list[list[int | str]] | None = None,
        new_id_strategy: str = "append",
        return_new_unit_ids: bool = False,
        discard_spikes_unit_ids=None,
        format: str = "memory",
        folder: Path | str | None = None,
        verbose: bool = False,
@@ -1435,7 +1436,9 @@

        check_unit_splits_consistency(split_units, self.sorting)

        new_unit_ids = generate_unit_ids_for_split(self.unit_ids, split_units, new_unit_ids, new_id_strategy)
        new_unit_ids = generate_unit_ids_for_split(
            self.unit_ids, split_units, new_unit_ids, new_id_strategy, discard_spikes_unit_ids
        )
        all_unit_ids = _get_ids_after_splitting(self.unit_ids, split_units, new_unit_ids=new_unit_ids)

        new_analyzer = self._save_or_select_or_merge_or_split(
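
A usage sketch of the new argument at the analyzer level (assuming an existing ``SortingAnalyzer``
named ``analyzer`` with a unit "u1"; the spike indices are illustrative):

.. code-block:: python

    # the first group of "u1" holds the spikes to discard, since "u1"
    # is listed in discard_spikes_unit_ids
    split_analyzer = analyzer.split_units(
        split_units={"u1": [[10, 20, 30], [40, 50, 60]]},
        discard_spikes_unit_ids=["u1"],
        new_id_strategy="split",
    )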