Enable modular station selection

tarolangner · tarolangner · commit 690af57d3b20 · 2020-10-09T10:16:56.000+02:00
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# Neural networks for kidney segmentation in UK Biobank neck-to-knee body MRI
+# Neural networks for semantic segmentation of UK Biobank neck-to-knee body MRI
 
 This repository contains PyTorch code for cross-validation and inference with neural networks for kidney segmentation on UK Biobank neck-to-knee body MRI, as described in:
 [_"Kidney segmentation in neck-to-knee body MRI of 40,000 UK Biobank participants"_](https://arxiv.org/abs/2006.06996) [1]
@@ -8,7 +8,7 @@ The included inference pipeline and trained snapshot enables measurements of lef
 Contents:
 - 2.5D U-Net architecture with residual connections
 - Infrastructure for training and *cross-validation*
-- Pipeline for *inference* on MRI DICOMs
+- Pipeline for *inference* on neck-to-knee body MRI DICOMs
 - Code for *quality_controls* based on numerical metrics
 - Trained snapshot for parenchymal kidney tissue can be found at *TODO*
 
diff --git a/cross_validation/README.md b/cross_validation/README.md
@@ -6,3 +6,5 @@ Using the code samples in the *scripts* subfolder, a segmentation model can be t
 3. Run *crossValidate.py* to train and evaluate a model, with the results stored to the directory "networks"
 
 To re-train a network for inference using all data, the cross-validation split can simply be set to contain all images in one split set.
+
+Note that the first run on new training data may be very slow, whereas subsequent runs will benefit from caching by the data loader.
diff --git a/cross_validation/scripts/createNewSplit.py b/cross_validation/scripts/createNewSplit.py
@@ -5,7 +5,7 @@
 # Any ids that should be added only for training can be listed 
 # in a separate file at a later stage.
 
-split_name = "kidney_64_8fold"
+split_name = "liver_99_8fold"
 split_path = "../splits/" + split_name + "/"
 
 id_list = split_path + "id_list.txt"
diff --git a/cross_validation/scripts/createTrainingSlices.py b/cross_validation/scripts/createTrainingSlices.py
@@ -24,10 +24,22 @@
 def main(argv):
 
     #
-    path_img = "/media/taro/DATA/Taro/UKBiobank/segmentations/kidney/combined_128/signals/NRRD/"
-    path_seg = "/media/taro/DATA/Taro/UKBiobank/segmentations/kidney/combined_128/segmentations/NRRD/"
-    path_ids = "/media/taro/DATA/Taro/UKBiobank/segmentations/kidney/combined_128/subject_ids.txt"
-    output_path = "../image_data/kidney_128/"
+    #path_img = "/media/taro/DATA/Taro/UKBiobank/segmentations/kidney/combined_128/signals/NRRD/"
+    #path_seg = "/media/taro/DATA/Taro/UKBiobank/segmentations/kidney/combined_128/segmentations/NRRD/"
+    #path_ids = "/media/taro/DATA/Taro/UKBiobank/segmentations/kidney/combined_128/subject_ids.txt"
+    #output_path = "../image_data/kidney_128/"
+
+    #
+    #path_img = "/media/taro/DATA/Taro/Projects/ukb_segmentation/github/temp_volumes/liver/signals/NRRD_3/"
+    #path_seg = "/media/taro/DATA/Taro/Projects/ukb_segmentation/github/temp_volumes/liver/segmentations/NRRD_fixedHeaders/"
+    #path_ids = "/media/taro/DATA/Taro/Projects/ukb_segmentation/github/temp_volumes/liver/ids.txt"
+    #output_path = "../image_data/liver_allStations/"
+
+    #
+    path_img = "/media/taro/DATA/Taro/UKBiobank/segmentations/liver/Andres_refined/signals/"
+    path_seg = "/media/taro/DATA/Taro/UKBiobank/segmentations/liver/Andres_refined/segmentations/"
+    path_ids = "/media/taro/DATA/Taro/UKBiobank/segmentations/liver/Andres_refined/ids_add.txt"
+    output_path = "../image_data/liver_refined_99_add/"
 
     #####
     createFolders(output_path, overwrite=True)
@@ -92,7 +104,7 @@ def convertSubject(subject_id, files_img, files_seg, path_img, path_seg, output_
             (slices_seg, shape_seg) = formatSeg(path_seg + file_s, shape_img)
 
             if not np.array_equal(shape_img, shape_seg):
-                print("ERROR: Mismatching dimensions for img and seg of {}".format(name))
+                print("ERROR: Mismatching dimensions for img and seg of {} ({} vs {})".format(name, shape_img, shape_seg))
                 sys.exit()
 
         # For each axial slice, save outputs
diff --git a/cross_validation/scripts/crossValidate.py b/cross_validation/scripts/crossValidate.py
@@ -21,6 +21,11 @@
 import dataLoading
 
 
+# After the slice-wise training, validation fuses all specified imaging stations into a common space
+# with voxel spacing c_target_spacing to calculate subject-wise Dice scores and other evaluation metrics.
+c_target_spacing = np.array((2.23214293, 2.23214293, 4.5)) # abdominal spacing
+#c_target_spacing = np.array((2.23214293, 2.23214293, 3.)) # top station spacing
+
 def main(argv):
 
     path_network_out = "../networks/kidney_64_8fold/"
@@ -32,14 +37,18 @@ def main(argv):
     path_stations_img = "/media/taro/DATA/Taro/UKBiobank/segmentations/kidney/combined_128/signals/NRRD/"
     path_stations_gt = "/media/taro/DATA/Taro/UKBiobank/segmentations/kidney/combined_128/segmentations/NRRD_fixedHeaders/"
 
-    # Optional path to list of ids which are to be used as additional training samples on each split.
+    # Select which MRI stations to use for training and evaluation
+    station_ids = [0, 1, 2]
+
+    # Optional name of a list file in the split path, containing ids
+    # which are to be used as additional training samples in each split.
     # Set to None for conventional cross-validation
     path_train_ids_add = None
 
-    runExperiment(path_network_out, path_training_slices, path_split, path_stations_img, path_stations_gt, path_train_ids_add)
+    runExperiment(path_network_out, path_training_slices, path_split, path_stations_img, path_stations_gt, path_train_ids_add, station_ids)
 
 
-def runExperiment(path_network_out, path_training_slices, path_split, path_stations_img, path_stations_gt, path_train_ids_add):
+def runExperiment(path_network_out, path_training_slices, path_split, path_stations_img, path_stations_gt, path_train_ids_add, station_ids):
 
     I = 80000 # Training iterations
     save_step = 5000 # Iterations between checkpoint saving
@@ -50,7 +59,7 @@ def runExperiment(path_network_out, path_training_slices, path_split, path_stati
     class_count = 2 # Number of labels, including background
     class_weights = torch.FloatTensor([1, 1]) # Background, L1, L2...
 
-    start_k = 0 # First cross-validation set to validate against
+    start_k = 0 # First cross-validation set to train and validate against
 
     do_train = True
     do_predict = True
@@ -97,13 +106,13 @@ def runExperiment(path_network_out, path_training_slices, path_split, path_stati
             os.makedirs(path_out_k)
             os.makedirs(path_checkpoints)
 
-            loader_train = getDataloader(path_training_slices + "data/", path_out_k + "train_files.txt", train_subsets, path_split, B=1, sigma=2, points=8, path_train_ids_add=path_train_ids_add)
+            loader_train = getDataloader(path_training_slices + "data/", path_out_k + "train_files.txt", train_subsets, path_split, B=1, sigma=2, points=8, path_train_ids_add=path_train_ids_add, station_ids=station_ids)
             time = train.train(net, loader_train, I, path_checkpoints, save_step, class_weights, I_reduce_lr)
 
             with open(path_out_k + "training_time.txt", "w") as f: f.write("{}".format(time))
 
         if do_predict:
-            evaluate.evaluateSnapshots(path_checkpoints, path_stations_img, path_stations_gt, path_split, val_subset, path_out_k + "eval/", net)
+            evaluate.evaluateSnapshots(path_checkpoints, path_stations_img, path_stations_gt, path_split, val_subset, path_out_k + "eval/", net, station_ids, c_target_spacing)
 
         evaluate.writeSubsetTrainingCurve(path_out_k)
         
@@ -121,7 +130,7 @@ def createDocumentation(network_path, split_path):
 
 
 #
-def getDataloader(input_path, output_path, subsets, path_split, B, sigma, points, path_train_ids_add):
+def getDataloader(input_path, output_path, subsets, path_split, B, sigma, points, path_train_ids_add, station_ids):
 
     # Get chosen volumes
     subject_ids = []
@@ -140,10 +149,10 @@ def getDataloader(input_path, output_path, subsets, path_split, B, sigma, points
 
     print("Loading data for {} subjects".format(len(subject_ids)))
 
-    # For each subject, use stations 1 and 2
+    # For each subject, use the specified stations
     stations = []
-    stations.extend([f + "_station1" for f in subject_ids])
-    stations.extend([f + "_station2" for f in subject_ids])
+    for s in station_ids:
+        stations.extend([f + "_station{}".format(s) for f in subject_ids])
 
     # Get training samples
     files = [f for f in os.listdir(input_path) if os.path.isfile(os.path.join(input_path, f))]
diff --git a/cross_validation/scripts/evaluate.py b/cross_validation/scripts/evaluate.py
@@ -217,7 +217,7 @@ def plotCurve(path_out, x, y, label_x, label_y):
     plt.close()
 
 
-def evaluateSnapshots(path_checkpoints, path_stations_img, path_stations_gt, path_split, val_subset, path_out, net):
+def evaluateSnapshots(path_checkpoints, path_stations_img, path_stations_gt, path_split, val_subset, path_out, net, station_ids, target_spacing):
 
     time_start = time.time()
 
@@ -236,8 +236,7 @@ def evaluateSnapshots(path_checkpoints, path_stations_img, path_stations_gt, pat
 
     # Fuse and store reference segmentation
     for i in range(N):
-
-        fuseStationsGt(val_subjects[i], path_stations_gt, path_out + "volumes/")
+        fuseStationsGt(val_subjects[i], path_stations_img, path_stations_gt, path_out + "volumes/", station_ids, target_spacing)
 
     # Find checkpoints
     checkpoint_files = [f for f in os.listdir(path_checkpoints) if os.path.isfile(os.path.join(path_checkpoints, f))]
@@ -250,7 +249,7 @@ def evaluateSnapshots(path_checkpoints, path_stations_img, path_stations_gt, pat
         print("   Evaluating snapshot {}...".format(i))
         checkpoint_i = path_checkpoints + checkpoint_files[i]
 
-        predictWithCheckpoint(checkpoint_i, path_stations_img, val_subjects, net, path_out + "volumes/")
+        predictWithCheckpoint(checkpoint_i, path_stations_img, val_subjects, net, path_out + "volumes/", station_ids, target_spacing)
 
         iteration = checkpoint_files[i].split("_")[1].split(".")[0]
         evaluateAgreement(path_out, iteration, val_subjects)
@@ -281,6 +280,7 @@ def evaluateAgreement(path_out, iteration, val_subjects):
             voxel_dim = np.array((space_dir[0][0], space_dir[1][1], space_dir[2][2]))
             voxel_scale = np.prod(voxel_dim) / (10*10*10)
 
+
             # Get positives, true positives, false positives
             p = np.count_nonzero(gt)
             tp = np.count_nonzero(np.multiply(gt, out))
@@ -295,7 +295,7 @@ def evaluateAgreement(path_out, iteration, val_subjects):
             f.write("{},{},{},{},{},{}\n".format(val_subjects[i],dice,p,tp,fp,voxel_scale))
 
 
-def predictWithCheckpoint(path_checkpoint, path_stations_img, val_subjects, net, path_out):
+def predictWithCheckpoint(path_checkpoint, path_stations_img, val_subjects, net, path_out, station_ids, target_spacing):
 
     # Load network weights
     checkpoint = torch.load(path_checkpoint, map_location={"cuda" : "cpu"})
@@ -309,32 +309,71 @@ def predictWithCheckpoint(path_checkpoint, path_stations_img, val_subjects, net,
 
         print("Subject {}".format(val_subjects[i]))
 
-        (img_1, header_1) = nrrd.read(path_stations_img + "{}_station1_W.nrrd".format(val_subjects[i]))
-        (img_2, header_2) = nrrd.read(path_stations_img + "{}_station2_W.nrrd".format(val_subjects[i]))
+        stations = []
+        headers = []
 
-        (img, out, header, _, _) = predictForSubject.predictForSubject([img_1, img_2], [header_1, header_2], net)
+        for s in station_ids:
+
+            (station, header) = nrrd.read(path_stations_img + "{}_station{}_W.nrrd".format(val_subjects[i], s))
+            stations.append(station)
+            headers.append(header)
+            #(img_2, header_2) = nrrd.read(path_stations_img + "{}_station2_W.nrrd".format(val_subjects[i]))
 
         if not os.path.exists(path_out + "{}_img.nrrd".format(val_subjects[i])):
+            fuse_img = True
+        else:
+            fuse_img = False
+
+        (img, out, header, _, _) = predictForSubject.predictForSubject(stations, headers, net, target_spacing, fuse_img)
+
+        if fuse_img:
             nrrd.write(path_out + "{}_img.nrrd".format(val_subjects[i]), img, header, compression_level=1)
 
         nrrd.write(path_out + "{}_out.nrrd".format(val_subjects[i]), out, header, compression_level=1)
 
 
-def fuseStationsGt(subject_id, path_stations_gt, path_out):
+def fuseStationsGt(subject_id, path_stations_img, path_stations_gt, path_out, station_ids, target_spacing):
 
-    (gt_1, header_1) = nrrd.read(path_stations_gt + "{}_station1.nrrd".format(subject_id))
-    (gt_2, header_2) = nrrd.read(path_stations_gt + "{}_station2.nrrd".format(subject_id))
+    volumes_gt = []
+    headers_gt = []
+    positions = []
+    spacings = []
 
-    # Rounding before fusion appears to give best results for SmartPaint values
-    gt_1 = np.around(gt_1)
-    gt_2 = np.around(gt_2)
+    for s in station_ids:
 
-    #
-    (W, W_size, W_end, scalings, offsets) = fuseVolumes.getResamplingParameters([gt_1, gt_2], [header_1, header_2])
+        path_s = path_stations_gt + "{}_station{}.nrrd".format(subject_id, s)
+
+        if not os.path.exists(path_s): 
+            print("WARNING: Could not find ground truth segmentation, assuming empty segmentation for {}".format(path_s))
+            path_s = path_stations_img + "{}_station{}_W.nrrd".format(subject_id, s)
+
+            # Load signal instead and set values to 0
+            (volume_gt, header) = nrrd.read(path_s)
+            volume_gt[:] = 0
 
-     (gt, seg_fusion_cost) = fuseVolumes.fuseStations(gt_1, gt_2, W, W_size, W_end, scalings, offsets, False)
+        else:
+            (volume_gt, header) = nrrd.read(path_s)
+
+            # Round the volume to binarize the segmentation from SmartPaint; using the float values appears to provide no benefit.
+            volume_gt = np.around(volume_gt)
+
+        volumes_gt.append(volume_gt)
+        headers_gt.append(header)
+
+        #
+        positions.append(header["space origin"])
+
+        spacing = header["space directions"]
+        spacing = np.array((spacing[0][0], spacing[1][1], spacing[2][2]))
+
+        spacings.append(spacing)
+
+    #
+    (gt, gt_origin,seg_fusion_cost) = fuseVolumes.fuseStations(volumes_gt, positions, spacings, target_spacing, False)
 
-    header = header_1
+    header = headers_gt[0]
     header["sizes"] = gt.shape
+    header["space origin"] = gt_origin
+    for i in range(3): header["space directions"][i][i] = target_spacing[i]
 
     nrrd.write(path_out + "{}_gt.nrrd".format(subject_id), gt, header, compression_level=1)
diff --git a/image_fusion/dicomToVolume.py b/image_fusion/dicomToVolume.py
@@ -42,7 +42,7 @@
 c_use_gpu = True # If yes, use numba for gpu access, otherwise use scipy on cpu
 
 
-def dicomToVolume(input_path_zip):
+def dicomToVolume(input_path_zip, station_ids):
 
     if not os.path.exists(input_path_zip):
         print("Could not find input file {}".format(input_path_zip))
@@ -59,7 +59,7 @@ def dicomToVolume(input_path_zip):
     headers = []
     
     # Only use abdominal imaging stations
-    for i in range(1, 3):
+    for i in station_ids:
 
         #
         voxel_data_w[i] = np.flip(voxel_data_w[i], 2)
diff --git a/image_fusion/fuseVolumes.py b/image_fusion/fuseVolumes.py
diff --git a/image_fusion/predictForSubject.py b/image_fusion/predictForSubject.py
diff --git a/inference/infer.py b/inference/infer.py