作者简介: 本文作者系大学统计学专业教师,多年从事统计学的教学科研工作,在随机过程、统计推 断、机器学习领域有深厚的理论积累与应用实践。个人主页
在2019年,据估计全球约有500万人被诊断为胃肠癌。这其中有半数患者接受为期1~6周、每天10—15分钟的化疗。在化疗过程中,使用X-射线的放射剂量照射肿瘤而尽量避开正常的胃肠器官。在MRI(磁共振成像)扫描中,为了靶向肿瘤位置而避开正常的胃肠器官,医生必须手工标出胃肠器官的位置。这是一个耗时费力的工作,并且使化疗过程延长15分至1小时,给患者增加了治疗的困难。

开发在MRI扫描中自动识别分割胃肠器官与癌症靶点的算法,将使化疗过程更快速、高效,减轻医生的负担和患者的困难。

在这个问题里,我们要分割MRI图像的胃肠器官。图像是16-bit灰质png格式。
数据集的 class有3个不同的值:large bowel, small bowel, stomach.

print(clr.S+"--- train.csv ---"+clr.E)
train = pd.read_csv("../input/uw-madison-gi-tract-image-segmentation/train.csv")
print(clr.S+"shape:"+clr.E, train.shape)
print(clr.S+"Unique ID cases:"+clr.E, train["id"].nunique())
print(clr.S+"Missing Values Column:"+clr.E, train.isna().sum().index[-1])
print("\t", clr.S+"with a total missing rows of:"+clr.E, train.isna().sum().values[-1])
print("\t", clr.S+"% of missing rows:"+clr.E,
len(train[train["segmentation"].isna()==False]), "\n")
print(clr.S+"Sample of train.csv:"+clr.E)
train.sample(5, random_state=26)

# Show a dataframe of missing values
sns.displot(
data=train.isna().melt(value_name="missing"),
y="variable",
hue="missing",
multiple="fill",
# Change aspect of the chart
aspect=3,
height=6,
# Change colors
palette=[my_colors[5], my_colors[2]],
legend=False)
plt.title("- [train.csv] %Perc Missing Values per variable -", size=18, weight="bold")
plt.xlabel("Total Percentage")
plt.ylabel("Dataframe Variable")
plt.legend(["Missing", "Not Missing"]);
plt.show();
print("\n")
# Plot 2
plt.figure(figsize=(24,6))
cbar_kws = {
"ticks": [0, 1],
}
sns.heatmap(train.isna(), cmap=[my_colors[5], my_colors[2]], cbar_kws=cbar_kws)
plt.title("- [train.csv] Missing Values per observation -", size=18, weight="bold")
plt.xlabel("")
plt.ylabel("Observation")
plt.show();


def read_image(path):
'''Reads and converts the image.
path: the full complete path to the .png file'''
# Read image in a corresponding manner
# convert int16 -> float32
image = cv2.imread(path, cv2.IMREAD_UNCHANGED).astype('float32')
# Scale to [0, 255]
image = cv2.normalize(image, None, alpha = 0, beta = 255,
norm_type = cv2.NORM_MINMAX, dtype = cv2.CV_32F)
image = image.astype(np.uint8)
return image

分割由一列包括不同像素点和长度的数字组成。例如,
‘28094 3 28358 7 28623 9 28889 9 29155 9 29421 9 29687 9 29953 9 30219 9 30484 10 30750 10 31016 10 31282 10 31548 10 31814 10 32081 9 32347 8 32614 6’, 在这个数字列里,
28094是像素的开始点,3是长度。这样,我们可以计算每个分割的结束点
def mask_from_segmentation(segmentation, shape):
'''Returns the mask corresponding to the inputed segmentation.
segmentation: a list of start points and lengths in this order
max_shape: the shape to be taken by the mask
return:: a 2D mask'''
# Get a list of numbers from the initial segmentation
segm = np.asarray(segmentation.split(), dtype=int)
# Get start point and length between points
start_point = segm[0::2] - 1
length_point = segm[1::2]
# Compute the location of each endpoint
end_point = start_point + length_point
# Create an empty list mask the size of the original image
# take onl
case_mask = np.zeros(shape[0]*shape[1], dtype=np.uint8)
# Change pixels from 0 to 1 that are within the segmentation
for start, end in zip(start_point, end_point):
case_mask[start:end] = 1
case_mask = case_mask.reshape((shape[0], shape[1]))
return case_mask
# Example
segmentation = '45601 5 45959 10 46319 12 46678 14 47037 16 47396 18 47756 18 48116 19 48477 18 48837 19 \
49198 19 49558 19 49919 19 50279 20 50639 20 50999 21 51359 21 51719 22 52079 22 52440 22 52800 22 53161 21 \
53523 20 53884 20 54245 19 54606 19 54967 18 55328 17 55689 16 56050 14 56412 12 56778 4 57855 7 58214 9 58573 12 \
58932 14 59292 15 59651 16 60011 17 60371 17 60731 17 61091 17 61451 17 61812 15 62172 15 62532 15 62892 14 \
63253 12 63613 12 63974 10 64335 7'
shape = (310, 360)
case_mask = mask_from_segmentation(segmentation, shape)
wandb_mask = []
wandb_mask.append(wandb.Image(case_mask))
plt.figure(figsize=(5, 5))
plt.title("Mask Example:")
plt.imshow(case_mask)
plt.axis("off")
plt.show();

对于每个图像ID, 我们产生三层图像:

由原始图像产生full masks, 这些masks提供有关健康组织的有价值信息。

def get_id_mask(ID, verbose=False):
'''Returns a mask for each case ID. If no segmentation was found, the mask will be empty
- meaning formed by only 0
ID: the case ID from the train.csv file
verbose: True if we want any prints
return: segmentation mask'''
# ~~~ Get the data ~~~
# Get the portion of dataframe where we have ONLY the speciffied ID
ID_data = train[train["id"]==ID].reset_index(drop=True)
# Split the dataframe into 3 series of observations
# each for one speciffic class - "large_bowel", "small_bowel", "stomach"
observations = [ID_data.loc[k, :] for k in range(3)]
# ~~~ Create the mask ~~~
# Get the maximum height out of all observations
# if max == 0 then no class has a segmentation
# otherwise we keep the length of the mask
max_height = np.max([obs.image_height for obs in observations])
max_width = np.max([obs.image_width for obs in observations])
# Get shape of the image
# 3 channels of color/classes
shape = (max_height, max_width, 3)
# Create an empty mask with the shape of the image
mask = np.zeros(shape, dtype=np.uint8)
# If there is at least 1 segmentation found in the group of 3 classes
if max_height != 0:
for k, location in enumerate(["large_bowel", "small_bowel", "stomach"]):
observation = observations[k]
segmentation = observation.segmentation
# If a segmentation is found
# Append a new channel to the mask
if pd.isnull(segmentation) == False:
mask[..., k] = mask_from_segmentation(segmentation, shape)
# If no segmentation was found skip
elif max_segmentation == 0:
mask = None
if verbose:
print("None of the classes have segmentation.")
return mask
# Full Example
# Read image
path = '../input/uw-madison-gi-tract-image-segmentation/train/case131/case131_day0/scans/slice_0066_360_310_1.50_1.50.png'
img = read_image(path)
# Get mask
ID = "case131_day0_slice_0066"
mask = get_id_mask(ID, verbose=False)
plot_original_mask(img, mask, alpha=1)

我们产生3D masks文件夹, 每个mask对应3层:
# Create folder to save masks
os.mkdir("masks_png")
# Get a list of unique ids
unique_ids = train[train["segmentation"].isna()==False]["id"].unique()
for ID in tqdm(unique_ids):
# Get the mask
mask = get_id_mask(ID, verbose=False)
# Write it in folder
cv2.imwrite(f"masks_png/{ID}.png", mask)
# Save to zip file
shutil.make_archive('zip_masks3D', 'zip', 'masks_png')
# Delete the initial folder
shutil.rmtree('masks_png')
定义矩阵:

# Create folder to save masks
os.mkdir("masks2D_png")
# Get unique paths for the 3D masks (generated at step 3.1)
all_mask_paths = train[train["segmentation"].isna()==False]["mask_path"].unique()
for mask_path in tqdm(all_mask_paths):
# Get file name
name = mask_path.split("/")[-1]
# Read mask
mask = cv2.imread(mask_path)
# Change masks from 3D to 2D
mask = mask_3D_to_2D(mask)
# Write it in folder
cv2.imwrite(f"masks2D_png/{name}", mask)
# Save to zip file
shutil.make_archive('zip_masks2D', 'zip', 'masks2D_png')
# Delete the initial folder
shutil.rmtree('masks2D_png')
完