0 导入函数库 ⤒
print("\n... IMPORTS STARTING ...\n")
print("\n\tVERSION INFORMATION")
# Machine Learning and Data Science Imports
import tensorflow as tf; print(f"\t\t– TENSORFLOW VERSION: {tf.__version__}");
import tensorflow_hub as tfhub; print(f"\t\t– TENSORFLOW HUB VERSION: {tfhub.__version__}");
import tensorflow_addons as tfa; print(f"\t\t– TENSORFLOW ADDONS VERSION: {tfa.__version__}");
import pandas as pd; pd.options.mode.chained_assignment = None;
import numpy as np; print(f"\t\t– NUMPY VERSION: {np.__version__}");
import sklearn; print(f"\t\t– SKLEARN VERSION: {sklearn.__version__}");
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
from pandarallel import pandarallel; pandarallel.initialize();
from sklearn.model_selection import GroupKFold, StratifiedKFold
from scipy.spatial import cKDTree
# # RAPIDS
# import cudf, cupy, cuml
# from cuml.neighbors import NearestNeighbors
# from cuml.manifold import TSNE, UMAP
# Built In Imports
from kaggle_datasets import KaggleDatasets
from collections import Counter
from datetime import datetime
from glob import glob
import warnings
import requests
import hashlib
import imageio
import IPython
import sklearn
import urllib
import zipfile
import pickle
import random
import shutil
import string
import json
import math
import time
import gzip
import ast
import sys
import io
import os
import gc
import re
# Visualization Imports
from matplotlib.colors import ListedColormap
from matplotlib.patches import Rectangle
import matplotlib.patches as patches
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm; tqdm.pandas();
import plotly.express as px
import seaborn as sns
from PIL import Image, ImageEnhance
import matplotlib; print(f"\t\t– MATPLOTLIB VERSION: {matplotlib.__version__}");
from matplotlib import animation, rc; rc('animation', html='jshtml')
import plotly
import PIL
import cv2
import plotly.io as pio
print(pio.renderers)
def seed_it_all(seed=7):
    """Seed every relevant RNG (hash, random, NumPy, TensorFlow) for reproducibility.

    Args:
        seed (int): the seed value applied to all generators.
    """
    # NOTE(review): setting PYTHONHASHSEED after interpreter start does not
    # change the current process's hash randomization — confirm if relied upon.
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The three generators below are independent, so seeding order is irrelevant.
    tf.random.set_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
print("\n\n... IMPORTS COMPLETE ...\n")
\
1 背景知识 ⤒
\
1.1 基本知识
主要任务描述
在本次比赛中,您将创建一个模型,在 MRI 扫描中自动分割胃和肠。 MRI 扫描来自实际的癌症患者,他们在放射治疗期间的不同日子进行了 1-5 次 MRI 扫描。 您将基于这些扫描的数据集制定您的算法,以提出创造性的深度学习解决方案,帮助癌症患者获得更好的护理。
基本背景信息
2019 年,全球估计有 500 万人被诊断出患有胃肠道癌症。 在这些患者中,大约一半有资格接受放射治疗,通常每天进行 10-15 分钟,持续 1-6 周。 放射肿瘤学家尝试使用指向肿瘤的 X 射线束进行高剂量辐射,同时避开胃和肠。 借助集成磁共振成像和直线加速器系统(也称为 MR-Linacs)等新技术,肿瘤学家能够可视化肿瘤和肠道的每日位置,可能每天都在变化。
在这些扫描中,放射肿瘤学家必须手动勾勒出胃和肠的位置,以便调整 X 射线束的方向,以增加向肿瘤输送的剂量并避开胃和肠。 这是一个耗时且劳动密集型的过程,可以将治疗从每天 15 分钟延长到每天一个小时,这对于患者来说可能难以忍受——除非深度学习可以帮助自动化分割过程。 分割胃和肠的方法可以使治疗更快,让更多的患者得到更有效的治疗。
课题主办方信息
UW-Madison Carbone 癌症中心是基于 MR-Linac 放射治疗的先驱,自 2015 年以来一直根据患者的日常解剖结构对患者进行 MRI 引导放射治疗。UW-Madison 已慷慨同意支持该项目,该项目为接受治疗的患者提供匿名 MRI 在威斯康星大学麦迪逊卡本癌症中心。 威斯康星大学麦迪逊分校是威斯康星州麦迪逊市的一所公立研究型大学。 威斯康星大学的理念是大学向国家、国家和世界的承诺,他们的努力将使所有公民受益。
可视化
上面的肿瘤(粉红色粗线)靠近胃(红色粗线)。 高剂量的放射线直接照射到肿瘤上,同时避开胃部。 剂量水平用颜色表示。 较高剂量用红色表示,较低剂量用绿色表示。\
MRI 是一种出色的软组织可视化成像方式。 这对于腹部肿瘤特别有用,例如下面显示的胰腺癌。 左图显示了患者呼气期间的解剖结构,而右图显示了最大吸气屏气 (MIBH) 期间的解剖结构变化。 在 MIBH 图像中,我们可以看到几乎所有软组织的运动,这为我们在治疗过程中对齐肿瘤提供了卓越的能力。 我们正在分析使用这些治疗计划和实施技术的临床影响以及我们的患者遵守自我引导呼吸操作的能力。 [REF]
相关报告
癌症需要付出足够的代价。 如果成功,您将使放射肿瘤学家能够安全地向肿瘤提供更高剂量的辐射,同时避开胃和肠。 这将使癌症患者的日常治疗更快,并让他们获得更有效的治疗,副作用更少,更好地长期控制癌症。
\
2 SETUP ⤒
\
2.1 加速检测
为了使用 TPU,我们使用 TPUClusterResolver 进行初始化,这是连接到远程集群和初始化云 TPU 所必需的。 让我们回顾两个重要的点
1. 在 Kaggle 上使用 TPU 时,不需要为 **TPUClusterResolver** 指定参数;2. 但是,在 Google Compute Engine (GCE) 上,您需要执行以下操作:
\
# Example resolver configuration for Google Compute Engine (GCE) only —
# on Kaggle the parameterless TPUClusterResolver() below is sufficient.
# The name you gave to the TPU to use
TPU_WORKER = 'my-tpu-name'
# or you can also specify the grpc path directly
# TPU_WORKER = 'grpc://xxx.xxx.xxx.xxx:8470'
# The zone you chose when you created the TPU to use on GCP.
ZONE = 'us-east1-b'
# The name of the GCP project where you created the TPU to use on GCP.
PROJECT = 'my-tpu-project'
tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=TPU_WORKER, zone=ZONE, project=PROJECT)
print(f"\n... ACCELERATOR SETUP STARTING ...\n")
# Detect hardware and build the appropriate tf.distribute strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is set. On Kaggle this is always the case.
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    # The resolver raises ValueError when no TPU can be located.
    TPU = None
if TPU:
    print(f"\n... RUNNING ON TPU - {TPU.master()}...")
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    strategy = tf.distribute.experimental.TPUStrategy(TPU)
else:
    print(f"\n... RUNNING ON CPU/GPU ...")
    # Yield the default distribution strategy in Tensorflow
    # --> Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
# What Is a Replica?
# --> A single Cloud TPU device consists of FOUR chips, each of which has TWO TPU cores.
# --> Therefore, for efficient utilization of Cloud TPU, a program should make use of each of the EIGHT (4x2) cores.
# --> Each replica is essentially a copy of the training graph that is run on each core and
#     trains a mini-batch containing 1/8th of the overall batch size
N_REPLICAS = strategy.num_replicas_in_sync
print(f"... # OF REPLICAS: {N_REPLICAS} ...\n")
# BUG FIX: user-facing message previously read "COMPLTED".
print(f"\n... ACCELERATOR SETUP COMPLETED ...\n")
\
2.2 数据访问
TPU 读取数据必须直接从 Google Cloud Storage (GCS) 读取。 Kaggle 提供了一个实用程序库——KaggleDatasets——它有一个实用程序函数 .get_gcs_path,它允许我们访问我们的输入数据集在 GCS 中的位置。\
print("\n... DATA ACCESS SETUP STARTED ...\n")
# TPUs must stream data straight from Google Cloud Storage; on CPU/GPU we
# read from the locally mounted Kaggle input directory instead.
if TPU is not None:
    # Google Cloud Dataset path to training and validation images
    DATA_DIR = KaggleDatasets().get_gcs_path('uw-madison-gi-tract-image-segmentation')
    save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
    load_locally = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
else:
    # Local path to training and validation images
    DATA_DIR = "/kaggle/input/uw-madison-gi-tract-image-segmentation"
    save_locally, load_locally = None, None
print(f"\n... DATA DIRECTORY PATH IS:\n\t--> {DATA_DIR}")
print(f"\n... IMMEDIATE CONTENTS OF DATA DIRECTORY IS:")
for _entry in tf.io.gfile.glob(os.path.join(DATA_DIR, "*")):
    print(f"\t--> {_entry}")
print("\n\n... DATA ACCESS SETUP COMPLETED ...\n")
\
2.3 利用XLA优化
XLA(加速线性代数)是用于线性代数的特定领域编译器,可以加速 TensorFlow 模型,而无需更改源代码。 结果是提高了速度和内存使用率。
\
当一个 TensorFlow 程序运行时,所有的操作都由 TensorFlow 执行器单独执行。 每个 TensorFlow 操作都有一个预编译的 GPU/TPU 内核实现,执行器会分派到该内核实现。
XLA 为我们提供了另一种运行模型的模式:它将 TensorFlow 图编译成专门为给定模型生成的计算内核序列。 因为这些内核是模型独有的,所以它们可以利用模型特定的信息进行优化。
\
print(f"\n... XLA OPTIMIZATIONS STARTING ...\n")
print(f"\n... CONFIGURE JIT (JUST IN TIME) COMPILATION ...\n")
# Enable XLA optimizations globally (10% speedup when using @tf.function calls):
# every subsequently built tf.function graph is JIT-compiled by XLA.
tf.config.optimizer.set_jit(True)
print(f"\n... XLA OPTIMIZATIONS COMPLETED ...\n")
\
2.4 数据定义以及初始化
print("\n... BASIC DATA SETUP STARTING ...\n\n")
# --- Training data -------------------------------------------------------
TRAIN_DIR = os.path.join(DATA_DIR, "train")
TRAIN_CSV = os.path.join(DATA_DIR, "train.csv")
train_df = pd.read_csv(TRAIN_CSV)
# Recursively collect every training slice image.
all_train_images = glob(os.path.join(TRAIN_DIR, "**", "*.png"), recursive=True)
print("\n... ORIGINAL TRAINING DATAFRAME... \n")
display(train_df)
# --- Test data (empty until Kaggle swaps in the hidden test set) ---------
TEST_DIR = os.path.join(DATA_DIR, "test")
SS_CSV = os.path.join(DATA_DIR, "sample_submission.csv")
ss_df = pd.read_csv(SS_CSV)
all_test_images = glob(os.path.join(TEST_DIR, "**", "*.png"), recursive=True)
print("\n\n\n... ORIGINAL SUBMISSION DATAFRAME... \n")
display(ss_df)
# An empty sample submission means the hidden test set is absent -> debug
# mode, in which we fabricate a small submission frame from the train rows.
DEBUG = ss_df.empty
if DEBUG:
    TEST_DIR = TRAIN_DIR
    all_test_images = all_train_images
    ss_df = train_df.iloc[:10][["id", "class"]]
    ss_df["predicted"] = ""
    print("\n\n\n... DEBUG SUBMISSION DATAFRAME... \n")
    display(ss_df)
# Short-form <-> long-form organ-name lookup tables.
SF2LF = {"lb": "Large Bowel", "sb": "Small Bowel", "st": "Stomach"}
LF2SF = {v: k for k, v in SF2LF.items()}
print(f"\n\n\n... ARE WE DEBUGGING: {DEBUG}... \n")
print("\n... BASIC DATA SETUP FINISHED ...\n\n")
\
2.5 更新数据用于获得额外信息
我将逻辑包装在预处理函数中,但也逐步完成,以便人们可以验证他们是否愿意
注意:为简洁起见,我已将列标识符更改如下:
- large_bowel --> lb
- small_bowel --> sb
- stomach --> st
def get_filepath_from_partial_identifier(_ident, file_list):
    """Return the first path in `file_list` whose text contains `_ident`.

    Raises IndexError when nothing matches.
    """
    matches = [fpath for fpath in file_list if _ident in fpath]
    return matches[0]
def df_preprocessing(df, globbed_file_list, is_test=False):
    """ The preprocessing steps applied to get column information.

    Parses case/day/slice identifiers out of `id`, resolves each row's image
    path from `globbed_file_list`, extracts slice dimensions and pixel
    spacing from the filename, and (train only) pivots the 3 per-class rows
    into a single row with one RLE column per organ.

    Args:
        df (pd.DataFrame): raw dataframe containing `id` (and, for train,
            `class` and `segmentation` columns).
        globbed_file_list (list of str): candidate image file paths; the
            filename pattern is assumed to be
            .../case###/case###_day##/scans/slice_####_<w>_<h>_<sw>_<sh>.png
            — TODO confirm against the dataset layout.
        is_test (bool): when True, skip the RLE pivot and keep `class`.

    Returns:
        pd.DataFrame: one row per slice with the derived columns.
    """
    # 1. Get Case-ID as a column (str and int)
    df["case_id_str"] = df["id"].apply(lambda x: x.split("_", 2)[0])
    df["case_id"] = df["id"].apply(lambda x: int(x.split("_", 2)[0].replace("case", "")))
    # 2. Get Day as a column
    df["day_num_str"] = df["id"].apply(lambda x: x.split("_", 2)[1])
    df["day_num"] = df["id"].apply(lambda x: int(x.split("_", 2)[1].replace("day", "")))
    # 3. Get Slice Identifier as a column
    df["slice_id"] = df["id"].apply(lambda x: x.split("_", 2)[2])
    # 4. Get full file paths for the representative scans
    #    (rsplit("/", 4)[0] strips the last 4 path components, yielding the data root)
    df["_partial_ident"] = (globbed_file_list[0].rsplit("/", 4)[0]+"/"+ # /kaggle/input/uw-madison-gi-tract-image-segmentation/train/
                            df["case_id_str"]+"/"+ # .../case###/
                            df["case_id_str"]+"_"+df["day_num_str"]+ # .../case###_day##/
                            "/scans/"+df["slice_id"]) # .../slice_####
    # Inner merge on the partial identifier attaches the real file path to each row.
    _tmp_merge_df = pd.DataFrame({"_partial_ident":[x.rsplit("_",4)[0] for x in globbed_file_list], "f_path":globbed_file_list})
    df = df.merge(_tmp_merge_df, on="_partial_ident").drop(columns=["_partial_ident"])
    # 5. Get slice dimensions from filepath (int in pixels)
    df["slice_h"] = df["f_path"].apply(lambda x: int(x[:-4].rsplit("_",4)[1]))
    df["slice_w"] = df["f_path"].apply(lambda x: int(x[:-4].rsplit("_",4)[2]))
    # 6. Pixel spacing from filepath (float in mm)
    df["px_spacing_h"] = df["f_path"].apply(lambda x: float(x[:-4].rsplit("_",4)[3]))
    df["px_spacing_w"] = df["f_path"].apply(lambda x: float(x[:-4].rsplit("_",4)[4]))
    if not is_test:
        # 7. Merge 3 Rows Into A Single Row (As This/Segmentation-RLE Is The Only Unique Information Across Those Rows)
        l_bowel_df = df[df["class"]=="large_bowel"][["id", "segmentation"]].rename(columns={"segmentation":"lb_seg_rle"})
        s_bowel_df = df[df["class"]=="small_bowel"][["id", "segmentation"]].rename(columns={"segmentation":"sb_seg_rle"})
        stomach_df = df[df["class"]=="stomach"][["id", "segmentation"]].rename(columns={"segmentation":"st_seg_rle"})
        df = df.merge(l_bowel_df, on="id", how="left")
        df = df.merge(s_bowel_df, on="id", how="left")
        df = df.merge(stomach_df, on="id", how="left")
        # After the merges every id appears 3x (one per class) with identical data;
        # keep one row per slice.
        df = df.drop_duplicates(subset=["id",]).reset_index(drop=True)
        # Boolean presence flags + count of organs annotated on this slice.
        df["lb_seg_flag"] = df["lb_seg_rle"].apply(lambda x: not pd.isna(x))
        df["sb_seg_flag"] = df["sb_seg_rle"].apply(lambda x: not pd.isna(x))
        df["st_seg_flag"] = df["st_seg_rle"].apply(lambda x: not pd.isna(x))
        df["n_segs"] = df["lb_seg_flag"].astype(int)+df["sb_seg_flag"].astype(int)+df["st_seg_flag"].astype(int)
    # 8. Reorder columns to the a new ordering (drops class and segmentation as no longer necessary)
    new_col_order = ["id", "f_path", "n_segs", "lb_seg_rle", "lb_seg_flag", "sb_seg_rle", "sb_seg_flag", "st_seg_rle", "st_seg_flag", "slice_h", "slice_w", "px_spacing_h", "px_spacing_w", "case_id_str", "case_id", "day_num_str", "day_num", "slice_id",]
    if is_test: new_col_order.insert(1, "class")
    # Only keep columns that actually exist (test frames lack the RLE columns).
    new_col_order = [_c for _c in new_col_order if _c in df.columns]
    df = df[new_col_order]
    return df
print("\n... UPDATING DATAFRAMES WITH ACCESSIBLE INFORMATION STARTED ...\n\n")
# CONSISTENCY FIX: the training dataframe previously repeated steps 1-8 of
# `df_preprocessing` inline (~40 duplicated lines). The helper implements the
# exact same pipeline (id parsing, path resolution via the globbed file list,
# slice/pixel-spacing extraction, RLE pivot, column reorder), so call it.
train_df = df_preprocessing(train_df, all_train_images, is_test=False)
gc.collect()
# 9. Display updated dataframes
print("\n... UPDATED TRAINING DATAFRAME... \n")
display(train_df)
ss_df = df_preprocessing(ss_df, all_test_images, is_test=True)
print("\n\n\n... UPDATED SUBMISSION DATAFRAME... \n")
display(ss_df)
print("\n... UPDATING DATAFRAMES WITH ACCESSIBLE INFORMATION FINISHED ...\n\n")
\
3 辅助函数 & 类 ⤒
# ref: https://www.kaggle.com/paulorzp/run-length-encode-and-decode
# modified from: https://www.kaggle.com/inversion/run-length-decoding-quick-start
def rle_decode(mask_rle, shape, color=1):
    """Decode a run-length-encoded string into a mask array.

    Args:
        mask_rle (str): run-length encoding as "start length start length ..."
            with 1-indexed starts (competition format).
        shape (tuple of ints): (height, width) or (height, width, depth) of
            the array to return.
        color (int or float): value written at masked positions.

    Returns:
        np.ndarray (float32) with `color` at masked positions, 0 elsewhere.
    """
    # Parse the whitespace-separated tokens into an integer array.
    runs = np.array(mask_rle.split(), dtype=int)
    run_starts = runs[0::2] - 1  # starts are 1-indexed in the RLE
    run_ends = run_starts + runs[1::2]
    # RLE describes a flattened image, so build flat and reshape at the end.
    if len(shape) == 3:
        height, width, depth = shape
        flat = np.zeros((height * width, depth), dtype=np.float32)
    else:
        height, width = shape
        flat = np.zeros((height * width,), dtype=np.float32)
    for begin, end in zip(run_starts, run_ends):
        flat[begin:end] = color
    return flat.reshape(shape)
# https://www.kaggle.com/namgalielei/which-reshape-is-used-in-rle
def rle_decode_top_to_bot_first(mask_rle, shape):
    """Decode an RLE string assuming top-to-bottom-first (column-major) order.

    Args:
        mask_rle (str): run-length encoding "start length ..." (1-indexed starts).
        shape (tuple of ints): (height, width) of the array to return.

    Returns:
        np.ndarray (uint8): 1 at masked positions, 0 elsewhere.
    """
    tokens = np.asarray(mask_rle.split(), dtype=int)
    run_starts = tokens[0::2] - 1
    run_ends = run_starts + tokens[1::2]
    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat[begin:end] = 1
    # Fill top -> bottom first: reshape column-major, then transpose back.
    return flat.reshape((shape[1], shape[0]), order='F').T
# ref.: https://www.kaggle.com/stainsby/fast-tested-rle
def rle_encode(img):
    """Encode a binary mask as an RLE string ("start length ...", 1-indexed).

    Args:
        img (np.array): 1 indicating mask, 0 indicating background.

    Returns:
        str: run-length encoding; empty string for an all-zero mask.
    """
    flat = img.flatten()
    # Pad both ends with 0 so every run has a rising and a falling edge.
    padded = np.concatenate([[0], flat, [0]])
    # Positions where the value changes mark run starts/ends (1-indexed).
    change_points = np.where(padded[1:] != padded[:-1])[0] + 1
    change_points[1::2] -= change_points[::2]  # turn end positions into lengths
    return ' '.join(str(v) for v in change_points)
def flatten_l_o_l(nested_list):
    """Flatten one level of nesting: [[a, b], [c]] -> [a, b, c]."""
    flattened = []
    for inner in nested_list:
        flattened.extend(inner)
    return flattened
def load_json_to_dict(json_path):
    """Read the JSON file at `json_path` and return the parsed object."""
    with open(json_path) as fp:
        return json.load(fp)
def tf_load_png(img_path):
    """Read the PNG at `img_path` and decode it as a 3-channel tensor."""
    raw_bytes = tf.io.read_file(img_path)
    return tf.image.decode_png(raw_bytes, channels=3)
def open_gray16(_path, normalize=True, to_rgb=False):
    """Open a 16-bit grayscale image file.

    Args:
        _path (str): path to the image.
        normalize (bool): scale values to [0, 1] by dividing by 65535.
        to_rgb (bool): tile the single channel into a 3-channel array.

    Returns:
        np.ndarray: HxW (or HxWx3 when `to_rgb`) image array.
    """
    img = cv2.imread(_path, cv2.IMREAD_ANYDEPTH)
    if normalize:
        img = img / 65535.
    if to_rgb:
        img = np.tile(np.expand_dims(img, axis=-1), 3)
    return img
\
4 数据探索 ⤒
\
4.0 在调查之前先看一个例子
我们这样做只是为了确保一切都在应有的位置,并且我们了解如何访问所有相关数据的基础知识。
我们将把这个基本探索功能包装为单个函数,以便于检查任何通过的标识符
def get_overlay(img_path, rle_strs, img_shape, _alpha=0.999, _beta=0.35, _gamma=0):
    """Blend a grayscale scan with its three RLE masks into one RGB overlay.

    Args:
        img_path (str): path to the 16-bit grayscale scan.
        rle_strs (list): three RLE strings (lb, sb, st); None -> empty channel.
        img_shape (tuple): shape passed through to `rle_decode`.
        _alpha/_beta/_gamma: cv2.addWeighted blend coefficients.

    Returns:
        np.ndarray (float32): the blended overlay image.
    """
    base = open_gray16(img_path, to_rgb=True)
    # Min-max normalize so the blend weights behave consistently across scans.
    base = ((base - base.min()) / (base.max() - base.min())).astype(np.float32)
    channels = [
        rle_decode(rle_str, shape=img_shape, color=1) if rle_str is not None
        else np.zeros(img_shape, dtype=np.float32)
        for rle_str in rle_strs
    ]
    masks_rgb = np.stack(channels, axis=-1).astype(np.float32)
    return cv2.addWeighted(src1=base, alpha=_alpha, src2=masks_rgb, beta=_beta, gamma=_gamma)
def examine_id(ex_id, df=train_df, plot_overlay=True, print_meta=False, plot_grayscale=False, plot_binary_segmentation=False):
    """ Wrapper function to allow for easy visual exploration of an example.

    Args:
        ex_id (str): the `id` value of the row to examine.
        df (pd.DataFrame): dataframe to search (defaults to the global train_df).
        plot_overlay (bool): plot the image with the RGB mask overlay.
        print_meta (bool): display the full metadata row.
        plot_grayscale (bool): plot the raw grayscale image.
        plot_binary_segmentation (bool): plot each organ's binary mask.
    """
    print(f"\n... ID ({ex_id}) EXPLORATION STARTED ...\n\n")
    demo_ex = df[df.id==ex_id].squeeze()
    if print_meta:
        # BUG FIX: previously interpolated the global `DEMO_ID` (defined much
        # later in the notebook) instead of the `ex_id` actually examined,
        # causing a NameError / wrong label when print_meta=True.
        print(f"\n... WITH DEMO_ID=`{ex_id}` WE HAVE THE FOLLOWING DEMO EXAMPLE TO WORK FROM... \n\n")
        display(demo_ex.to_frame())
    if plot_grayscale:
        print(f"\n\n... GRAYSCALE IMAGE PLOT ...\n")
        plt.figure(figsize=(12,12))
        plt.imshow(open_gray16(demo_ex.f_path), cmap="gray")
        plt.title(f"Original Grayscale Image For ID: {demo_ex.id}", fontweight="bold")
        plt.axis(False)
        plt.show()
    if plot_binary_segmentation:
        print(f"\n\n... BINARY SEGMENTATION MASKS ...\n")
        plt.figure(figsize=(20,10))
        for i, _seg_type in enumerate(["lb", "sb", "st"]):
            # Skip organs that have no annotation on this slice.
            if pd.isna(demo_ex[f"{_seg_type}_seg_rle"]): continue
            plt.subplot(1,3,i+1)
            plt.imshow(rle_decode(demo_ex[f"{_seg_type}_seg_rle"], shape=(demo_ex.slice_w, demo_ex.slice_h), color=1))
            plt.title(f"RLE Encoding For {SF2LF[_seg_type]} Segmentation", fontweight="bold")
            plt.axis(False)
        plt.tight_layout()
        plt.show()
    if plot_overlay:
        print(f"\n\n... IMAGE WITH RGB SEGMENTATION MASK OVERLAY ...\n")
        # Normalization happens inside get_overlay; None -> empty mask channel.
        _rle_strs = [demo_ex[f"{_seg_type}_seg_rle"] if not pd.isna(demo_ex[f"{_seg_type}_seg_rle"]) else None for _seg_type in ["lb", "sb", "st"]]
        seg_overlay = get_overlay(demo_ex.f_path, _rle_strs, img_shape=(demo_ex.slice_w, demo_ex.slice_h))
        plt.figure(figsize=(12,12))
        plt.imshow(seg_overlay)
        plt.title(f"Segmentation Overlay For ID: {demo_ex.id}", fontweight="bold")
        # Legend swatches matching the R/G/B mask channels.
        handles = [Rectangle((0,0),1,1, color=_c) for _c in [(0.667,0.0,0.0), (0.0,0.667,0.0), (0.0,0.0,0.667)]]
        labels = ["Large Bowel Segmentation Map", "Small Bowel Segmentation Map", "Stomach Segmentation Map"]
        plt.legend(handles,labels)
        plt.axis(False)
        plt.show()
    print("\n\n... SINGLE ID EXPLORATION FINISHED ...\n\n")
print("\n... SINGLE ID EXPLORATION STARTED ...\n\n")
# Pull the single row for the demo id; squeeze() turns the 1-row frame into a Series.
DEMO_ID = "case123_day20_slice_0082"
demo_ex = train_df[train_df.id==DEMO_ID].squeeze()
print(f"\n... WITH DEMO_ID=`{DEMO_ID}` WE HAVE THE FOLLOWING DEMO EXAMPLE TO WORK FROM... \n\n")
display(demo_ex.to_frame())
print(f"\n\n... LET'S PLOT THE IMAGE FIRST ...\n")
plt.figure(figsize=(12,12))
plt.imshow(open_gray16(demo_ex.f_path), cmap="gray")
plt.title(f"Original Grayscale Image For ID: {demo_ex.id}", fontweight="bold")
plt.axis(False)
plt.show()
print(f"\n\n... LET'S PLOT THE 3 SEGMENTATION MASKS ...\n")
plt.figure(figsize=(20,10))
# One subplot per organ; organs without an RLE annotation are skipped.
for i, _seg_type in enumerate(["lb", "sb", "st"]):
    if pd.isna(demo_ex[f"{_seg_type}_seg_rle"]): continue
    plt.subplot(1,3,i+1)
    # NOTE(review): shape is passed as (slice_w, slice_h) — consistent with the
    # rest of this notebook, but confirm the width/height order is intended.
    plt.imshow(rle_decode(demo_ex[f"{_seg_type}_seg_rle"], shape=(demo_ex.slice_w, demo_ex.slice_h), color=1))
    plt.title(f"RLE Encoding For {SF2LF[_seg_type]} Segmentation", fontweight="bold")
    plt.axis(False)
plt.tight_layout()
plt.show()
print(f"\n\n... LET'S PLOT THE IMAGE WITH AN RGB SEGMENTATION MASK OVERLAY ...\n")
# We need to normalize the loaded image values to be between 0 and 1 or else our plot will look weird
_img = open_gray16(demo_ex.f_path, to_rgb=True)
_img = ((_img-_img.min())/(_img.max()-_img.min())).astype(np.float32)
# Stack the three organ masks as R/G/B channels (missing organs -> zero channel).
_seg_rgb = np.stack([rle_decode(demo_ex[f"{_seg_type}_seg_rle"], shape=(demo_ex.slice_w, demo_ex.slice_h), color=1) if not pd.isna(demo_ex[f"{_seg_type}_seg_rle"]) else np.zeros((demo_ex.slice_w, demo_ex.slice_h)) for _seg_type in ["lb", "sb", "st"]], axis=-1).astype(np.float32)
# Alpha-blend the grayscale scan with the RGB mask channels.
seg_overlay = cv2.addWeighted(src1=_img, alpha=0.99,
                              src2=_seg_rgb, beta=0.33, gamma=0.0)
plt.figure(figsize=(12,12))
plt.imshow(seg_overlay)
plt.title(f"Segmentation Overlay For ID: {demo_ex.id}", fontweight="bold")
# Legend swatches matching the R/G/B mask channels.
handles = [Rectangle((0,0),1,1, color=_c) for _c in [(0.667,0.0,0.0), (0.0,0.667,0.0), (0.0,0.0,0.667)]]
labels = ["Large Bowel Segmentation Map", "Small Bowel Segmentation Map", "Stomach Segmentation Map"]
plt.legend(handles,labels)
plt.axis(False)
plt.show()
print(f"\n\n... LET'S PRINT THE RELEVANT INFORMATION ...\n")
print(f"\t--> IMAGE CASE ID : {demo_ex.case_id}")
print(f"\t--> IMAGE DAY NUMBER : {demo_ex.day_num}")
print(f"\t--> IMAGE SLICE WIDTH : {demo_ex.slice_w}")
print(f"\t--> IMAGE SLICE HEIGHT : {demo_ex.slice_h}")
print(f"\t--> IMAGE PIXEL SPACING WIDTH : {demo_ex.px_spacing_w}")
print(f"\t--> IMAGE PIXEL SPACING HEIGHT : {demo_ex.px_spacing_h}")
print("\n\n... SINGLE ID EXPLORATION FINISHED ...\n\n")
既然我们写好了这个函数……让我们迭代一些所有目标区域(胃、大肠、小肠)都存在分割的例子并绘制它们。
# Plot 10 random ids where all three organ masks are present (max one id per case).
N_TO_PLOT = 10
_sampled_ids = train_df[train_df.n_segs == 3].groupby("case_id")["id"].first().sample(N_TO_PLOT)
for _ex_id in _sampled_ids:
    examine_id(_ex_id)
\
4.1 调查发生分割地图类型
很明显,并非所有图像都有各个区域(胃、大肠、小肠)的分割图,因此我们将确定这些区域的分割图单独出现的频率,以及它们共同出现的频率。
观察
-
共有 38,496 个示例。
-
可以观察到,超过一半的给定示例没有注释存在!
- 有 21,906 (56.9046%) 个示例不存在注释/掩码/分段
- 相反,有 16,590 (43.0954%) 个示例存在一个或多个注释
-
有 2,468 (6.41%) 个示例带有一个注释。
-
可以观察到绝大多数单mask标注都是Stomach!
- 在这些注释中,2286 (~92.6%) 是胃
- 在这些注释中,123 (~4.98%) 是大肠
- 在这些注释中,59 (~2.39%) 是小肠
-
有 10,921 (28.37%) 个示例带有两个注释。
-
与单个注释示例相比,可以观察到大多数注释不包括胃,即**'大肠,小肠'**!
- 在这些注释中,7781 (~71.3%) 是**'大肠、小肠'**
- 在这些注释中,2980 (~27.3%) 是**'大肠、胃'**
- 在这些注释中,160 (~1.47%) 是**'小肠、胃'**
-
最后,有 3,201 (8.32%) 个示例,所有三个注释都存在。
def get_seg_combo_str(row):
    """Return a comma-separated list of the organs masked in `row`, or "No Mask".

    Args:
        row: mapping with boolean `lb_seg_flag`, `sb_seg_flag`, `st_seg_flag`.
    """
    flag_to_name = (("lb_seg_flag", "Large Bowel"),
                    ("sb_seg_flag", "Small Bowel"),
                    ("st_seg_flag", "Stomach"))
    present = [name for flag, name in flag_to_name if row[flag]]
    return ", ".join(present) if present else "No Mask"
# Label every row with the combination of organs present, then plot the
# distribution of mask counts broken down by that combination.
train_df["seg_combo_str"] = train_df.progress_apply(get_seg_combo_str, axis=1)
fig = px.histogram(train_df, train_df["n_segs"].astype(str), color="seg_combo_str", title="<b>Number of Segmentation Masks Per Image</b>",
                   labels={"x":"Number of Segmentation Masks Per Image", "seg_combo_str":"<b>Segmentation Masks Present</b>"})
fig.show()
\
4.2 研究图像大小
可以观察到,并非所有图像都具有相同的大小......但是,鉴于此,图像切片大小之间没有太大差异。
观察可以发现
-
请记住,总共有 38,496 个示例。
-
在全体范围内,我们可以看到 3 个图像形状是正方形,而一个是矩形,它们都属于相对较小尺寸的相当紧凑的分布
-
其中有 4 个独特的尺寸:
-
- Least frequent image size
- Smallest image size
- Only 144 of the 38,496 occurences are this size (0.37%)
-
- Most frequent image size
- Second smallest image size
- 25,920 of the 38,496 occurences are this size (67.33%)
-
- Second least frequent image size
- Second largest image size
- 1,200 of the 38,496 occurences are this size (3.12%)
-
- Second most frequent image size
- Largest image size
- 11,232 of the 38,496 occurences are this size (29.17%)
-
# Bubble chart of unique (slice_w, slice_h) combinations. Bubble size is the
# number of images with that size: the groupby-transform produces a per-row
# count, and .iloc[...] aligns it to the deduplicated rows being plotted.
fig = px.scatter(train_df.drop_duplicates(subset=["slice_w", "slice_h"]), x="slice_w", y="slice_h",
                 size=train_df.groupby(["slice_w", "slice_h"])["id"].transform("count").iloc[train_df.drop_duplicates(subset=["slice_w", "slice_h"]).index],
                 color="("+train_df.drop_duplicates(subset=["slice_w", "slice_h"])["slice_w"].astype(str)+","+train_df.drop_duplicates(subset=["slice_w", "slice_h"])["slice_h"].astype(str)+")",
                 title="<b>Bubble Chart Showing The Various Image Sizes</b>",
                 labels={"color":"<b>Size Legend</b>",
                         "size":"<b>Number Of Observations</b>",
                         "slice_h":"<b>Image Slice Height (pixels)</b>",
                         "slice_w":"<b>Image Slice Width (pixels)</b>"},
                 size_max=160)
fig.show()
\
4.3 研究像素间距
可以观察到,并非所有图像都具有相同的像素间距……但是,鉴于此,像素间距之间的差异并不大。
观察发现
-
请记住,总共有 38,496 个示例。
-
在全体范围内,我们可以看到所有像素间距都是正方形的,并且绝大多数是相同的。
-
There are only 2 unique sets of pixel spacings:
-
- Most frequent pixel spacing
- Smallest pixel spacing (barely)
- 37,296 of the 38,496 occurences are this size (96.88%)
-
- Least frequent image size
- Largest pixel spacing (barely)
- 1,200 of the 38,496 occurences are this size (3.12%)
-
# Same bubble-chart construction as for image sizes, but over the unique
# (px_spacing_w, px_spacing_h) combinations; bubble size = occurrence count.
fig = px.scatter(train_df.drop_duplicates(subset=["px_spacing_w", "px_spacing_h"]), x="px_spacing_w", y="px_spacing_h",
                 size=train_df.groupby(["px_spacing_w", "px_spacing_h"])["id"].transform("count").iloc[train_df.drop_duplicates(subset=["px_spacing_w", "px_spacing_h"]).index],
                 color="("+train_df.drop_duplicates(subset=["px_spacing_w", "px_spacing_h"])["px_spacing_w"].astype(str)+","+train_df.drop_duplicates(subset=["px_spacing_w", "px_spacing_h"])["px_spacing_h"].astype(str)+")",
                 title="<b>Bubble Chart Showing The Various Pixel Spacings</b>",
                 labels={"color":"<b>Pixel Spacing Sets Legend</b>",
                         "size":"<b>Number Of Observations</b>",
                         "px_spacing_h":"<b>Pixel Spacing Height (mm)</b>",
                         "px_spacing_w":"<b>Pixel Spacing Width (mm)</b>"},
                 size_max=160)
fig.show()
\
4.4 调查案例id
这是 case_id 的主要描述
“本次比赛中的每个案例都由多组扫描切片表示(每组以扫描发生的日期标识)。有些案例按时间划分(早期扫描在训练集中,后期扫描在测试集中),而有些案例则是按案例整体拆分的——整个案例要么在训练集中,要么在测试集中。本次比赛的目标是能够泛化到部分可见和完全未见的案例。”
我并没有真正观察到与任何特定 case_id 值相关的任何奇怪之处。 在分层/创建折叠时,我可能会尝试将它们分组......但是,它们似乎并没有造成明显的偏见。
当我们按 day 着色时,我们可以看到所有案例(大部分)都是由不同日期的 144 组或更少见的 80 组图像组成的。
# Images per case, stacked/colored by scan day — shows each case consists of
# groups of 144 (or, rarely, 80) slices per day.
fig = px.histogram(train_df, train_df.case_id.astype(str), color="day_num_str", title="<b>Distribution Of Images Per Case ID</b>",
                   labels={"x":"<b>Case ID</b>", "day_num_str": "<b>The Day The Scan Took Place</b>"}, text_auto=True, width=2000)
fig.show()
from matplotlib import gridspec
def plot_case(case_id, day=None, df=train_df, _figsize=(20, 30), n_cols=16):
    """ Plot every slice overlay for a given case (optionally one day) in a grid.

    Args:
        case_id (int): case to plot.
        day (int, optional): restrict to a single day; reverts to all days if
            the case has no samples for that day.
        df (pd.DataFrame): source dataframe (defaults to the global train_df).
        _figsize (tuple): base figure size; height is scaled for single days.
        n_cols (int): number of grid columns.
    """
    # Initialize
    case_df = df[df.case_id==case_id]
    if day is not None:
        _case_df = case_df[(case_df.day_num==day) | (case_df.day_num_str==str(day))]
        if len(_case_df)>0:
            # Shrink figure height roughly in proportion to the day's share of slices.
            approx_shrink = len(_case_df)/len(case_df)
            case_df=_case_df
            _figsize = (_figsize[0], int(np.ceil(1.25*_figsize[1]*approx_shrink)))
        else:
            print("There are no valid samples for the passed `day`. Reverting to all days in case.")
        del _case_df
    n_ex = len(case_df)
    print("...Preparing...")
    # Get relevant data
    case_paths = case_df["f_path"].tolist()
    case_rles = [[_rle if not pd.isna(_rle) else None for _rle in _rles] for _rles in case_df[["lb_seg_rle", "sb_seg_rle", "st_seg_rle"]].values.tolist()]
    case_img_shapes = [(_w,_h) for _w,_h in zip(case_df["slice_w"].tolist(), case_df["slice_h"].tolist())]
    all_overlays = [get_overlay(img_path, rle_strs, img_shape) for img_path, rle_strs, img_shape in zip(case_paths, case_rles, case_img_shapes)]
    print("...Plotting...")
    # Plot in a tight grid with no spacing between cells.
    plt.figure(figsize=_figsize)
    n_rows = int(np.ceil(n_ex/n_cols))
    gs = gridspec.GridSpec(n_rows, n_cols,
                           wspace=0.0, hspace=0.0,
                           top=1.-0.5/(n_rows+1), bottom=0.5/(n_rows+1),
                           left=0.5/(n_cols+1), right=1-0.5/(n_cols+1))
    # BUG FIX: the original popped overlays off the END of the list, drawing
    # the slices in reverse order; iterate forward so slices appear in scan order.
    for _idx, _overlay in enumerate(all_overlays):
        ax = plt.subplot(gs[_idx//n_cols, _idx%n_cols])
        ax.imshow(_overlay)
        ax.axis(False)
        ax.set_xticklabels([])
        ax.set_yticklabels([])
    print("...Displaying...")
    plt.show()
# Show two whole cases, then one case restricted to specific days.
DEMO_CASE = 134
print(f"\n\n... PLOTTING DEMO CASE ID #{DEMO_CASE} ...\n\n")
plot_case(DEMO_CASE)
DEMO_CASE = 9
print(f"\n\n\n\n... PLOTTING DEMO CASE ID #{DEMO_CASE} ...\n\n")
plot_case(DEMO_CASE)
DEMO_CASE = 7
# Days to plot individually for case 7.
DEMO_DAYS = [0, 13, 19]
for _dday in DEMO_DAYS:
    print(f"\n\n\n... PLOTTING DEMO CASE ID #{DEMO_CASE} - FOR DAY #{_dday} ...\n\n")
    plot_case(DEMO_CASE, day=_dday)
\
4.5 MASK 尺寸/区域
我们知道,RLE 编码中的每个其他数字都代表一系列掩码……因此,如果我们将所有这些数字相加,我们将得到图像中被掩码像素的总数。 这比打开和关闭每个图像要快得多。
观察
-
可以观察到,遮罩区域的分布大部分是正常的,尽管它略微偏向较小的一侧......
-
尽管 Stomach 分布在 400-750 像素之间存在奇数差距,但所有分布都相似。
-
有趣的是,虽然不常见,但我们确实有一些非常大MASK(>7500 像素)
- 另外,最大的MASK是给小肠用的,这很有趣
def get_mask_area(rle):
    """Total masked pixel count: the sum of the run-length (odd-index) tokens."""
    tokens = rle.split()
    return sum(map(int, tokens[1::2]))
# Mask area = sum of RLE run lengths — far cheaper than decoding every image.
for _prefix in ("lb", "sb", "st"):
    train_df[f"{_prefix}_seg_area"] = train_df[f"{_prefix}_seg_rle"].apply(
        lambda x: None if pd.isna(x) else get_mask_area(x))
fig = px.histogram(train_df, ["lb_seg_area", "sb_seg_area", "st_seg_area"], title="<b>Mask Areas Overlaid</b>", barmode="overlay",
                   labels={"value":"<b>Mask Area</b>"})
fig.show()
print("\n\n\n... EXAMINE AN EXAMPLE WITH A LARGE AMOUNT OF SEGMENTATION MASK ...\n")
examine_id("case134_day22_slice_0102")
\
4.6 MASK数据集创建、类重叠和 MASK HEATMAP
确定MASK是否相互重叠(multilabel)或不重叠(multiclass)非常重要。为此,我们将快速创建 npy 文件的数据集。在这个创建过程中,我们将检查重叠。
观察
-
存在重叠,虽然并不常见,但某些图像显示出高度重叠。
-
这意味着我们不能将问题描述为简单的分类语义分割。
-
我们必须将问题描述为多标签语义分割
-
这意味着我们的MASK将采用以下形式 -->
- 通道维度是每个分割类型的二进制掩码
- 这将允许MASK重叠
注意以下绘制的图像:
- 在下面的检查图像中,我们可以看到小肠的一部分完全位于大肠的较大部分内。
- 这说明了为什么将其视为多标签语义分割如此重要!
def is_overlap(_arr):
    """True when any pixel belongs to more than one mask channel (last axis)."""
    per_pixel_total = _arr.sum(axis=-1)
    return per_pixel_total.max() > 1
def make_seg_mask(row, output_dir="/kaggle/working/npy_files", check_overlap=False):
    """Decode the row's three organ RLEs, save them as a stacked uint8 mask
    (.npy, channels = lb/sb/st) under `output_dir`, and optionally return the
    overlap area.

    Args:
        row: dataframe row with slice_w/slice_h, *_seg_rle columns and id.
        output_dir (str): directory the "<id>_mask.npy" file is written to.
        check_overlap (bool): when True, return the number of pixels claimed
            by more than one organ (0 if none); otherwise return None.
    """
    slice_shape = (row.slice_w, row.slice_h)
    channel_masks = []
    # One binary channel per organ; a missing RLE becomes an all-zero channel.
    for _rle in (row.lb_seg_rle, row.sb_seg_rle, row.st_seg_rle):
        if not pd.isna(_rle):
            channel_masks.append(rle_decode(_rle, slice_shape))
        else:
            channel_masks.append(np.zeros(slice_shape))
    mask_arr = np.stack(channel_masks, axis=-1).astype(np.uint8)
    # BUG FIX: the save path was hard-coded to "./npy_files/..." and silently
    # ignored the `output_dir` parameter; join against it instead.
    np.save(os.path.join(output_dir, f"{row.id}_mask"), mask_arr)
    if check_overlap:
        if is_overlap(mask_arr):
            # Count pixels claimed by more than one organ.
            return np.where(mask_arr.sum(axis=-1) > 1, 1, 0).sum()
        return 0
NPY_DIR = "/kaggle/working/npy_files"
# IDIOM FIX: makedirs(exist_ok=True) is already a no-op when the directory
# exists, so the previous `if not os.path.isdir(...)` guard was redundant.
os.makedirs(NPY_DIR, exist_ok=True)
# Write one stacked mask .npy per slice and record the per-slice overlap area.
train_df["seg_overlap_area"] = train_df.progress_apply(
    lambda x: make_seg_mask(x, output_dir=NPY_DIR, check_overlap=True), axis=1)
print("\n... LET'S EXAMINE THE IMAGE WITH THE HIGHEST AMOUNT OF OVERLAP ...\n")
# Look at the slice whose organs overlap the most.
_max_overlap = train_df.seg_overlap_area.max()
examine_id(train_df[train_df.seg_overlap_area == _max_overlap].id.values[0])
# Distribution of non-zero overlaps, colored by which organs are present.
fig = px.histogram(train_df[train_df.seg_overlap_area > 0], "seg_overlap_area", color="seg_combo_str", nbins=50,
                   log_y=True, title="<b>Distribution of Non-Zero Segmentation Overlaps <sub>(Count Is Logarithmic)</sub></b>",
                   labels={"seg_overlap_area":"<b>Area of Mask Overlap</b>",
                           "seg_combo_str":"<b>Segmentation Masks In Image</b>"})
fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.995))
fig.show()
# Accumulate every saved mask (resized to 256x256) into a per-organ heat map.
heatmap = np.zeros((256, 256, 3), dtype=np.float32)
for _, _row in tqdm(train_df.iterrows(), total=len(train_df)):
    if (_row.lb_seg_flag or _row.sb_seg_flag or _row.st_seg_flag):
        # Nearest-neighbour keeps the masks binary when resizing.
        _mask = cv2.resize(np.load(f"./npy_files/{_row.id}_mask.npy"), (256, 256), interpolation=cv2.INTER_NEAREST)
        heatmap += _mask
# Normalize to [0, 1] for display.
heatmap = heatmap / heatmap.max()
plt.figure(figsize=(20, 12))
# One panel per organ channel, then the combined RGB panel.
_channel_titles = ("Large Bowel Segmentation Mask – Heat Map",
                   "Small Bowel Segmentation Mask – Heat Map",
                   "Stomach Segmentation Mask – Heat Map")
for _ch, _title in enumerate(_channel_titles):
    plt.subplot(1, 4, _ch + 1)
    plt.imshow(heatmap[..., _ch], cmap="magma")
    plt.title(_title, fontweight="bold")
    plt.axis(False)
plt.subplot(1, 4, 4)
plt.imshow(heatmap)
# BUG FIX: title previously misspelled "Sementation".
plt.title("All Segmentation Masks Combined – Heat Map", fontweight="bold")
plt.axis(False)
plt.tight_layout()
plt.show()
\
4.7 数据集中的像素值
分析数据集很重要,因为我们需要对数据进行规范化以将其转换为更适合机器学习的格式(uint8 (0-255) 或 float32 (0-1))。 在不知道图像限制的情况下,我们可能会在归一化时意外降低数据的分辨率。
观察
有趣的是,数据集中的最大值等于不到 int16 的一半或 uint16 的四分之一。
-
Max Value for UINT16
- 65535
-
Max Value for INT16
- 32767
-
Half of Max Value for INT16
- 16384
-
Actual Max Value in the dataset
- 15865
def get_image_vals(row):
    """Append per-image pixel statistics to `row` and return it.

    Adds: nonzero_num_pxs, max/min/mean pixel value, and the mean over the
    non-zero pixels only.
    """
    img = cv2.imread(row.f_path, -1)
    nonzero_count = np.count_nonzero(img)
    row["nonzero_num_pxs"] = nonzero_count
    row["max_px_value"] = img.max()
    row["min_px_value"] = img.min()
    row["mean_px_value"] = img.mean()
    # NOTE(review): an all-zero image would divide by zero here — presumably
    # none exist in this dataset; confirm before reusing elsewhere.
    row["nonzero_mean_px_value"] = img.sum() / nonzero_count
    return row
# Compute per-image pixel statistics for every row (slow: reads every image).
train_df = train_df.progress_apply(get_image_vals, axis=1)
print(f"\n\n\n... UPDATED TRAIN DATAFRAME ...\n")
display(train_df.head())
print("\n\n")
_stat_cols = ("nonzero_num_pxs", "max_px_value", "min_px_value", "mean_px_value", "nonzero_mean_px_value")
for _c in _stat_cols:
    print(f"\n... STATS FOR COLUMN --> `{_c}`...")
    for _label, _val in (("MIN", train_df[_c].min()),
                         ("MEAN", train_df[_c].mean()),
                         ("MAX", train_df[_c].max())):
        print(f"\t--> {_label} VAL: {_val:.1f}")
\
4.8 确定任何关于分割的启发式或规则
SLICEWISE 观察
对于给定的 case-id 和 day number,存在两种不同数量的扫描
-
144 个切片 --> 259 个实例
-
80 片 ---> 15 个实例
-
没有具有任何分割掩码的切片编号 1、138、139、140、141、142、143 或 144 的示例
-
如果我们按器官分解,我们会为每个器官得到以下无值切片
- 大肠 – 1、138、139、140、141、142、143、144
- 小肠 – 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 138, 139, 140, 141, 142, 143, 144
- 胃 – 1, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144
滚动切片观察
接下来,我们要确定在之前或之后没有分割掩码时是否存在任何分割掩码(理想情况下,如果存在,我们可以测量其普遍性,如果不存在,我们可以确定最小数量需要的连续段)。
# Flag "isolated" masks: a mask present on a slice whose previous AND next
# rows have no mask for that organ.
# NOTE(review): the shifts run over the raw row order, so they can compare
# slices across case/day boundaries — confirm the dataframe ordering makes
# this acceptable.
for _prefix in ("lb", "sb", "st"):
    _flag_col = f"{_prefix}_seg_flag"
    _iso_col = f"{_prefix}_seg_isolated"
    train_df[_iso_col] = False
    train_df.loc[
        (train_df[_flag_col] == True) &
        (train_df[_flag_col] != train_df[_flag_col].shift(1, fill_value=False)) &
        (train_df[_flag_col] != train_df[_flag_col].shift(-1, fill_value=False)), _iso_col
    ] = True
# case43_day18 (lb), case138_day0 (lb), case7_day0 (st)
train_df[train_df.lb_seg_isolated]