「这是我参与2022首次更文挑战的第25天,活动详情查看:2022首次更文挑战」。
移动小屋 - 基于飞桨技术的智能车内互动玩伴
- aistudio地址: aistudio.baidu.com/aistudio/pr…
- github地址:github.com/L0113408/Jo…
💕 github 、 aistu 💕
💕💕 dio欢迎一 💕💕
💕💕💕键三连💕💕💕
1. 项目介绍
设计概述:移动小屋为年轻父母设计智能出行时的新育儿体验,在陌生与新奇的移动空间内,通过车窗交互连结前后排的隔绝空间,并提供在城市间、长途上和黑夜里等多场景下的互动方式。
1. 1 方案思路💡
1)需求验证:我们通过桌面学习和亲友访谈了解到,年轻家长会因为新生儿的到来而购买空间更大行驶更稳的智能汽车,同时也因安抚艰难而不知如何度过平均1-2小时的出行时间。
2)设计机会:调查显示幼少儿对新奇事物有着天然的好奇心和注意力,只需多加引导即可长期受益;同时我们扫描了智能座舱内的原始触点,最终选择车窗作为可交互的最易操作区域。
3)设计方案:选定行程中不同路段和其行驶特性进行针对性的创意脑爆。最终为城市里走走停停的场景提供图像识别的快速问答题用于提升注意力,在长途高架上行驶的场景设计了图像分割渲染的涂绘功能 用于激活创意性思维,在隧道或黑夜里视线模糊的场景我们渲染了夜色星空用于展现共享记忆图片。
1.2 技术概述 🤖️
模型介绍:目标检测模块主要使用yolov5的模型,利用flask框架搭建图片检测的API。bot通过http请求调用API接口,并存储记录为后续的提问环节做数据支撑。
🤖️ 技术分类:
1)行车图像中的目标检测
2)图像分割与动漫渲染
3)Wechaty图片与对话交互
1.3 项目视频 📺
了解更多的项目视频在此>>>> 手动点点赞吧
b站视频链接:www.bilibili.com/video/BV1D6…
2. 实现内容一 图像目标识别
2.1 用途
将路途中系统拍摄到的图像进行分析处理,运用paddle hub里已训练的物体模型进行目标识别
2.2 步骤
- bot接受图片
- 调http服务发送图片到服务器api
- 推理完毕
- 返回目标对象坐标,类别,数量
- 在图像上绘制,存储
- bot机器人根据返回数据提问,构建回答,答案
3. 实现内容二 facade图像分割
import os
import io
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image as PilImage
import paddle
from paddle.nn import functional as F
# Select the GPU as the Paddle device. The bare expression that follows only
# has a visible effect in a notebook, where it presumably echoes the version.
paddle.set_device('gpu')
paddle.__version__
# Unpack the dataset. These are notebook shell escapes, not Python; the tar
# step is left commented out.
!pwd
# !tar -xf data/data91009/facade.tar.gz
3.1查看数据
# Build the inputs for the train/test list files.
# IMAGE_SIZE is the spatial size later appended to (-1, 3) for model.summary.
IMAGE_SIZE = (512, 683)
train_images_path = "facade/images/"
label_images_path = "facade/labels_signle/"

# The sample count is taken from the PNG label files, not from the JPG images.
image_count = sum(
    1
    for image_name in os.listdir(label_images_path)
    if image_name.endswith('.png')
)
print("图像样本的总数量是:", image_count)
# 对数据集进行处理,划分训练集、测试集
def _sort_images(image_dir, image_type):
    """Return the sorted file names in ``image_dir`` with extension ``image_type``.

    Names starting with '.' are skipped. Bare file names are returned, not
    paths joined with ``image_dir``.
    """
    suffix = '.{}'.format(image_type)
    return sorted(
        name
        for name in os.listdir(image_dir)
        if name.endswith(suffix) and not name.startswith('.')
    )
def write_file(mode, images, labels):
    """Write ``./<mode>.txt`` with one ``image<TAB>label`` line per image.

    ``labels`` is indexed by the position of each entry in ``images``.
    """
    list_path = './{}.txt'.format(mode)
    with open(list_path, 'w') as f:
        for position, image_name in enumerate(images):
            f.write('{}\t{}\n'.format(image_name, labels[position]))
# The raw files are loose in two folders, while training needs (image, label)
# pairs. An image and its label share the same file stem and differ only in
# extension, so both folders are listed in sorted order and paired by stem.
images = _sort_images(train_images_path, 'jpg')
print(len(images))
labels = _sort_images(label_images_path, 'png')

# Keep only images that have a label, then derive the label list from the
# surviving images. Filtering the images alone (as before) leaves the label
# list longer and shifted, so images[i] and labels[i] could stop matching.
available_labels = set(labels)
images = [image for image in images if image[:-4] + '.png' in available_labels]
labels = [image[:-4] + '.png' for image in images]
print(len(images))

# Hold out 15% of the paired samples. An explicit split index is used because
# the slice images[:-0] is empty, which would leave the training list empty
# whenever the hold-out size rounds down to zero.
eval_num = int(len(images) * 0.15)
split_at = len(images) - eval_num
write_file('train', images[:split_at], labels[:split_at])
write_file('test', images[split_at:], labels[split_at:])
write_file('predict', images[split_at:], labels[split_at:])
# Preview the data: print shape and label values for the first entries of
# train.txt and display the first three image/label pairs side by side.
with open('./train.txt', 'r') as f:
    i = 0
    for line in f:
        image_path, label_path = line.strip().split('\t')
        image = np.array(PilImage.open(os.path.join(train_images_path, image_path)))
        label = np.array(PilImage.open(os.path.join(label_images_path, label_path)))
        print(image.shape)
        print(np.unique(label.astype('uint8')))
        if i > 2:
            break
        # Draw the pair in a one-row, two-column figure.
        plt.figure()
        panels = [('Train Image', image, {}), ('Label', label, {'cmap': 'gray'})]
        for slot, (caption, pixels, options) in enumerate(panels, start=1):
            plt.subplot(1, 2, slot)
            plt.title(caption)
            plt.imshow(pixels.astype('uint8'), **options)
            plt.axis('off')
        plt.show()
        i = i + 1
3.2 数据集定义
import random
from paddle.io import Dataset
from paddle.vision.transforms import transforms as T
class FacadeDataset(Dataset):
    """Facade segmentation dataset driven by plain-text list files.

    In 'train' and 'test' mode every item is an ``(image, label)`` pair whose
    file names come from ``./train.txt`` / ``./test.txt`` (tab separated).
    In 'predict' mode items are images only, listed one per line in
    ``facade/predict_list.txt`` and stored under ``facade/predict``.
    """

    def __init__(self, mode='train'):
        """Read the list file for ``mode`` and collect the file paths.

        Args:
            mode: 'train', 'test' or 'predict' (case-insensitive).

        Raises:
            AssertionError: if ``mode`` is not one of the supported values.
        """
        # Stored for reference only; nothing in this class resizes images.
        self.image_size = IMAGE_SIZE
        self.mode = mode.lower()

        assert self.mode in ['train', 'test', 'predict'], \
            "mode should be 'train' or 'test' or 'predict', but got {}".format(self.mode)

        self.train_images = []
        self.label_images = []

        if self.mode == 'predict':
            with open('facade/predict_list.txt', 'r') as f:
                for line in f:
                    image_file = os.path.join('facade/predict', line.strip())
                    self.train_images.append(image_file)
                    print(image_file)
        else:
            with open('./{}.txt'.format(self.mode), 'r') as f:
                for line in f:
                    image, label = line.strip().split('\t')
                    self.train_images.append(os.path.join(train_images_path, image))
                    self.label_images.append(os.path.join(label_images_path, label))

    def _load_img(self, path, color_mode='rgb', transforms=None):
        """Open an image, coerce its colour mode and apply ``transforms``.

        Args:
            path: image file to read.
            color_mode: 'grayscale', 'rgb' or 'rgba'.
            transforms: optional list of transforms passed to ``T.Compose``;
                ``None`` means no transforms. A ``None`` default replaces the
                former shared mutable ``[]`` default.

        Returns:
            Whatever the composed transforms return for the loaded image.

        Raises:
            ValueError: if ``color_mode`` is not a supported value.
        """
        if transforms is None:
            transforms = []
        with open(path, 'rb') as f:
            # Read through BytesIO so the image no longer depends on the
            # file handle once this ``with`` block exits.
            img = PilImage.open(io.BytesIO(f.read()))
            if color_mode == 'grayscale':
                # Convert to 8-bit grayscale unless the image is already an
                # 8-, 16- or 32-bit grayscale image.
                if img.mode not in ('L', 'I;16', 'I'):
                    img = img.convert('L')
            elif color_mode == 'rgba':
                if img.mode != 'RGBA':
                    img = img.convert('RGBA')
            elif color_mode == 'rgb':
                if img.mode != 'RGB':
                    img = img.convert('RGB')
            else:
                raise ValueError('color_mode must be "grayscale", "rgb", or "rgba"')

            return T.Compose(transforms)(img)

    def _load_input(self, idx):
        """Load input image ``idx`` as a float32 array.

        The image goes through ``T.Transpose`` and is then normalised with
        mean 127.5 and std 127.5.
        """
        image = self._load_img(
            self.train_images[idx],
            transforms=[
                T.Transpose(),
                T.Normalize(mean=127.5, std=127.5),
            ],
        )
        return np.array(image, dtype='float32')

    def __getitem__(self, idx):
        """Return the image in 'predict' mode, otherwise ``(image, label)``.

        The label is loaded as grayscale and returned as an int64 array.
        """
        train_image = self._load_input(idx)
        if self.mode == 'predict':
            return train_image

        label_image = self._load_img(
            self.label_images[idx],
            color_mode='grayscale',
            transforms=[T.Grayscale()],
        )
        return train_image, np.array(label_image, dtype='int64')

    def __len__(self):
        """Return the number of input images in the dataset."""
        return len(self.train_images)
3.3 U-net 网络定义
class DoubleConv(paddle.nn.Layer):
    """Two consecutive (3x3 Conv2D -> BatchNorm2D -> ReLU) stages."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        stages = []
        # The first stage maps in_channels -> out_channels; the second keeps
        # out_channels. Layers are created in the same order as before.
        for stage_in in (in_channels, out_channels):
            stages.append(paddle.nn.Conv2D(stage_in, out_channels, kernel_size=3, padding=1))
            stages.append(paddle.nn.BatchNorm2D(out_channels))
            stages.append(paddle.nn.ReLU())
        self.double_conv = paddle.nn.Sequential(*stages)

    def forward(self, x):
        """Apply both convolution stages to ``x``."""
        return self.double_conv(x)
class Down(paddle.nn.Layer):
    """Encoder step: 2x2 max pooling (stride 2) followed by a DoubleConv."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        pool = paddle.nn.MaxPool2D(kernel_size=2, stride=2, padding=0)
        conv = DoubleConv(in_channels, out_channels)
        self.maxpool_conv = paddle.nn.Sequential(pool, conv)

    def forward(self, x):
        """Pool ``x`` and run the double convolution on the result."""
        return self.maxpool_conv(x)
class Up(paddle.nn.Layer):
    """Decoder step: upsample, pad to the skip tensor's size, concat, DoubleConv."""

    def __init__(self, in_channels, out_channels, bilinear=True):
        """Build the upsampling layer and the convolution that follows it.

        Args:
            in_channels: channel count after concatenating skip and upsampled
                tensors (the input of the DoubleConv).
            out_channels: channel count produced by the DoubleConv.
            bilinear: use bilinear ``Upsample`` when True, otherwise a
                learnable transposed convolution.
        """
        super(Up, self).__init__()
        if bilinear:
            self.up = paddle.nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        else:
            # Paddle's transposed convolution is Conv2DTranspose;
            # ``ConvTranspose2d`` is the PyTorch spelling and is not
            # available in paddle.nn.
            self.up = paddle.nn.Conv2DTranspose(in_channels // 2, in_channels // 2, kernel_size=2, stride=2)
        self.conv = DoubleConv(in_channels, out_channels)

    def forward(self, x1, x2):
        """Upsample ``x1``, align it with skip tensor ``x2`` and fuse both."""
        x1 = self.up(x1)

        # Size difference along axes 2 and 3, as plain Python ints so that
        # F.pad receives a list of ints.
        diff_y = x2.shape[2] - x1.shape[2]
        diff_x = x2.shape[3] - x1.shape[3]

        # Split each difference between the two sides; any odd remainder goes
        # to the second side of the pair.
        x1 = F.pad(x1, [diff_x // 2, diff_x - diff_x // 2,
                        diff_y // 2, diff_y - diff_y // 2])
        x = paddle.concat([x2, x1], axis=1)
        return self.conv(x)
class U_Net(paddle.nn.Layer):
    """U-Net: a 4-step encoder, a 4-step decoder with skips, and a 1x1 output conv."""

    def __init__(self, num_classes, bilinear=True):
        super().__init__()
        self.num_classes = num_classes
        self.bilinear = bilinear

        # Encoder
        self.inc = DoubleConv(3, 64)
        self.down1 = Down(64, 128)
        self.down2 = Down(128, 256)
        self.down3 = Down(256, 512)
        self.down4 = Down(512, 512)

        # Decoder: each Up receives the concatenated (skip + upsampled) channels.
        self.up1 = Up(1024, 256, bilinear)
        self.up2 = Up(512, 128, bilinear)
        self.up3 = Up(256, 64, bilinear)
        self.up4 = Up(128, 64, bilinear)

        # Per-pixel class scores
        self.output_conv = paddle.nn.Conv2D(64, num_classes, kernel_size=1)

    def forward(self, inputs):
        """Return the ``num_classes``-channel output of the final 1x1 conv."""
        # Collect encoder outputs so the decoder can consume them in reverse.
        skips = [self.inc(inputs)]
        for down in (self.down1, self.down2, self.down3, self.down4):
            skips.append(down(skips[-1]))

        x = skips.pop()
        for up in (self.up1, self.up2, self.up3, self.up4):
            x = up(x, skips.pop())

        return self.output_conv(x)
3.4 模型训练
# Ten segmentation classes
num_classes = 10
network = U_Net(num_classes)
model = paddle.Model(network)

# Print a summary of the network
model.summary((-1, 3,) + IMAGE_SIZE)

# Load the datasets
train_dataset = FacadeDataset(mode='train')  # training set
val_dataset = FacadeDataset(mode='test')  # validation set

optim = paddle.optimizer.RMSProp(
    learning_rate=0.001,
    rho=0.9,
    momentum=0.0,
    epsilon=1e-07,
    centered=False,
    parameters=model.parameters(),
)
# NOTE(review): beta1/beta2 are not passed to anything in the visible code.
beta1 = paddle.to_tensor([0.9], dtype="float32")
beta2 = paddle.to_tensor([0.99], dtype="float32")

# axis=1 makes the loss read class scores from axis 1 of the network output.
loss_fn = paddle.nn.CrossEntropyLoss(axis=1)
model.prepare(optim, loss_fn)

# Train with the high-level API
model.fit(train_dataset, val_dataset, epochs=50, batch_size=1, verbose=1)
3.5 模型评价
# Run prediction. The dataset class defined in this file is FacadeDataset;
# ``PetDataset`` is not defined anywhere here and raised a NameError.
predict_dataset = FacadeDataset(mode='predict')
predict_results = model.predict(predict_dataset)
# 分割评价
import numpy as np
def diceCoeff(preds, gts, eps=1e-5, endlabel=9):
    """Return the mean per-label Dice coefficient over a batch of masks.

    For every sample, a Dice score is computed for each label value present
    in that sample's ground truth::

        dice = (2 * |pred == label AND gt == label| + eps)
               / (|pred == label| + |gt == label| + eps)

    The scores are averaged over the labels of the sample and then over all
    samples.

    Args:
        preds: sequence of single-channel predicted label maps.
        gts: sequence of ground-truth label maps, same shapes as ``preds``.
        eps: smoothing term that keeps the ratio finite.
        endlabel: kept for interface compatibility; not used. Labels are
            taken from the ground truth itself.

    Returns:
        The averaged Dice score as a float.
    """
    pred = np.stack([np.asarray(p) for p in preds], axis=0)
    gt = np.stack([np.asarray(g) for g in gts], axis=0)
    print('label:', np.unique(gt))

    N = pred.shape[0]
    pred_flat = pred.reshape(N, -1)
    gt_flat = gt.reshape(N, -1)

    final_eval = 0
    for i in range(N):
        labels = np.unique(gt_flat[i])
        eval_result = 0
        for label in labels:
            # Boolean masks leave the data untouched. The former in-place
            # relabelling overwrote the arrays and zeroed the intersection
            # for every label other than 1.
            pred_mask = pred_flat[i] == label
            gt_mask = gt_flat[i] == label
            intersection = np.sum(pred_mask & gt_mask)
            size_sum = np.sum(pred_mask) + np.sum(gt_mask)
            eval_result += (2 * intersection + eps) / (size_sum + eps)
        final_eval += eval_result / len(labels)
    return final_eval / N
3.6预测结果对比
# Show the input image next to its predicted mask for the first three entries
# of the prediction list. The prediction set has no ground-truth labels, so
# no 'Label' panel is drawn; it previously displayed a leftover array from the
# training-data preview.
plt.figure(figsize=(10, 10))
i = 0
mask_idx = 0
pred_list = []
with open('facade/predict_list.txt', 'r') as f:
    for line in f:
        if mask_idx >= 3:
            break
        label_path = line.strip()
        print(os.path.join('facade/predict', label_path))
        image = PilImage.open(os.path.join('facade/predict', label_path))
        image = np.array(image).astype('uint8')

        plt.subplot(3, 2, i + 1)
        plt.imshow(image)
        plt.title('Input Image')
        plt.axis("off")

        # The model has a single output, so predict_results[0] holds the
        # per-batch predictions. Take the first sample of batch ``mask_idx``
        # and move its first axis last so argmax runs over that axis.
        data = predict_results[0][mask_idx][0].transpose((1, 2, 0))
        mask = np.argmax(data, axis=-1)
        print(mask)
        print(mask.shape)
        print(50*'*')
        pred_list.append(mask)

        plt.subplot(3, 2, i + 2)
        plt.imshow(mask.astype('uint8'))
        plt.title('Predict')
        plt.axis("off")

        i += 2
        mask_idx += 1
plt.show()
3.7 模型保存
# Save the trained model through the high-level API. 'mymodel' is presumably
# treated as a file prefix, not a directory — see the paddle.Model.save docs.
model.save('mymodel')
3.8 模型载入预测
# Rebuild the network and load the saved weights
from paddle.static import InputSpec
num_classes = 10
network = U_Net(num_classes)
# The input spec reuses IMAGE_SIZE so it cannot drift from the training setup.
model = paddle.Model(network, inputs=[InputSpec([-1, 3, *IMAGE_SIZE], 'float32', name='image')])
# Load with the same prefix that model.save('mymodel') was given. The former
# 'mymodel/mymodel' pointed at files that the save call does not write.
model.load('mymodel')
# Configure the model (no optimizer or loss is needed for prediction)
model.prepare()
# Prediction dataset. The dataset class defined in this file is FacadeDataset;
# ``PetDataset`` is not defined anywhere here.
predict_dataset = FacadeDataset(mode='predict')
predict_results = model.predict(predict_dataset)
3.9 分类--颜色变换
def generate_rgb(gray_array):
    """Map a 2-D array of class ids to an RGB colour image.

    Args:
        gray_array: 2-D array-like of integer class ids.

    Returns:
        An integer array of shape ``(rows, cols, 3)``. Pixels whose id has no
        palette entry (id 1 and anything outside 0-9) stay black.
    """
    gray_array = np.asarray(gray_array)
    x, y = gray_array.shape
    rgb = np.zeros([x, y, 3], int)

    # Class legend: 0 other, 1 various, 2 building, 3 car, 4 door, 5 pavement,
    # 6 road, 7 sky, 8 vegetation, 9 window.
    # NOTE(review): the legend that accompanied this function listed
    # car = 205:238:38, door = 144:80:41 and window = 254:255:205. The values
    # below are the ones the code has always emitted — confirm which is intended.
    palette = {
        0: (255, 255, 255),
        2: (213, 113, 156),
        3: (205, 208, 38),
        4: (114, 80, 41),
        5: (197, 170, 100),
        6: (23, 143, 210),
        7: (178, 254, 255),
        8: (152, 255, 152),
        9: (254, 255, 255),
    }
    # One vectorised assignment per class replaces the per-pixel Python loop.
    for class_id, color in palette.items():
        rgb[gray_array == class_id] = color
    return rgb
# Save every predicted mask as a recoloured PNG
import tqdm
output_dir = "predict_results_rgb"
# Create the output folder up front; saving into a missing folder fails.
os.makedirs(output_dir, exist_ok=True)
mask_idx = 0
pred_list = []
with open('facade/predict_list.txt', 'r') as f:
    for mask_idx, line in enumerate(tqdm.tqdm(f.readlines())):
        label_path = line.strip()
        # Take the first sample of batch ``mask_idx``, move its first axis
        # last and argmax over that axis to get one class id per pixel.
        data = predict_results[0][mask_idx][0].transpose((1, 2, 0))
        mask = generate_rgb(np.argmax(data, axis=-1))
        # Reuse the input file's stem and store the result as PNG.
        stem = os.path.splitext(label_path)[0]
        PilImage.fromarray(np.uint8(mask)).save(os.path.join(output_dir, stem + ".png"))
4. Wechaty 对话整合
利用wechaty 和 状态机制作一个多轮对话任务系统。
4.1. 状态机设计说明
以“多分支路径选择”事件为例:
4.1.1 Json脚本定义
"状态编码":{
"state":"P01", // code of the current state
"question": "您已进入移动小屋,我们即将启程!请发送1进入自主探索,发送2选择系统推送。", // message text for this state
"action":"Switch", // event type; the state machine calls the handler method with this name
"imgpath":"", // image path
"wait":"true", // whether to wait for a reply
"1":{
"next":"P02" // code of the next state when the reply is "1"
},
"2":{
"next":"P04" // code of the next state when the reply is "2"
}
},
4.1.2 事件处理
# 多分支路径选择
def Switch(self, user_Info : User):
    """Pick the next state from the user's reply (multi-branch selection).

    The reply text is looked up as a key of the current state's definition.
    If it names a branch (a mapping with a 'next' entry) the user moves to
    that state and its 'question' is returned. Otherwise the state is left
    unchanged and ``None`` is returned as the message.

    Args:
        user_Info: the user record; its ``state`` attribute is read and
            updated.

    Returns:
        ``(send, user_Info)`` where ``send`` is the next question or ``None``.
    """
    # Text of the incoming reply
    text: str = self.msg.text()
    # Definition of the state the user is currently in
    cur_Process = processes[user_Info.state]

    # Only real branch entries qualify. A reply that equals another key of
    # the definition (e.g. "question") maps to a plain string and previously
    # raised a TypeError on the ['next'] lookup.
    branch = cur_Process.get(text)
    if not isinstance(branch, dict) or 'next' not in branch:
        return None, user_Info

    user_Info.state = branch['next']
    send = processes[user_Info.state]['question']
    return send, user_Info
4.1.3 状态机操作
# State machine dispatch: read the handler name stored under 'action' for the
# user's current state and invoke the method of that name on an Action object.
a = Action(self.msg)
cur_Process = processes[user_Info.state]
method = cur_Process['action']
# Assuming this is operator.methodcaller (its import is not shown here), the
# call below is equivalent to getattr(a, method)(user_Info).
result = methodcaller(method, user_Info)(a)
4.2. 目标识别,看图问答
# 看图问答
def Count(self, user_Info : User):
    """Send the state's image to the detection service and ask a counting question.

    The image configured under 'imgpath' for the current state is uploaded to
    the detection API. On success the returned boxes are drawn on the image,
    the annotated copy is saved under ``./DataBase/image/``, one detected
    class is chosen at random and stored in ``user_Info.cls``, and the user
    moves to the state named by ``cur_Process['A00']['next']``.

    Args:
        user_Info: the user record; ``state`` is read, and ``cls`` and
            ``state`` are written on success.

    Returns:
        ``(send, user_Info)`` where ``send`` is the question, or the failure
        message when the request fails or nothing was detected.
    """
    failure_message = '图片识别失败,请重新发送!……'
    names_CN = processes['objects']
    cur_Process = processes[user_Info.state]
    img_path : str = cur_Process['imgpath']

    # NOTE(review): this dict looks like HTTP headers, but it has always been
    # passed as the second positional argument of requests.post, which is
    # ``data`` — so it is sent as form fields. That wire behaviour is kept;
    # confirm whether the fields are needed at all.
    form_fields = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Connection": "keep-alive",
        "Host": "36kr.com/newsflashes",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:55.0) Gecko/20100101 Firefox/55.0"
    }

    print('向服务器发送请求!')
    url = 'http://192.168.1.200:5001/api'
    # The context manager closes the upload handle even if the request raises.
    with open(img_path, mode='rb') as file:
        files = {'file': (f'{img_path[:-4]}', file, 'image/jpg')}
        response = requests.post(url, data=form_fields, files=files)

    if response.status_code != 200:
        return failure_message, user_Info

    data = json.loads(response.content)
    imgName = data['imageName']
    boxList = data['resJson']
    clsList = data['resCls']

    img = Image.open(f'./images/{imgName}.jpg')
    imgArray = np.array(img)

    # Draw every returned box on the image
    for eachbox in boxList:
        label2 = f"{names_CN[f'{eachbox[-2]}']} {eachbox[-1]*100}"
        label = '%s %.2f%%' % (eachbox[-2], eachbox[-1]*100)
        print(label2)
        plot_one_box(eachbox, imgArray, label=label, color=None, line_thickness=3)
    img = Image.fromarray(imgArray)
    img.save(f'./DataBase/image/{imgName}.jpg')

    # With no detected classes there is nothing to ask about; randrange(0)
    # would raise, so report a recognition failure instead.
    if not clsList:
        return failure_message, user_Info

    # Build the question from one randomly chosen class
    num = random.randrange(len(clsList))
    user_Info.cls = clsList[num]
    send = f"图片中{names_CN[f'{clsList[num][1]}']['name']}数量是多少呢?"
    user_Info.state = cur_Process['A00']['next']
    return send, user_Info
4.3. 图像分割
# Image segmentation
def facade(self, user_Info : User):
    """Send the state's image to the online segmentation service and store the result.

    The image configured under 'imgpath' is resized to 704x480, posted to the
    service as base64, and the returned image is written to
    ``target_img/target_img<original file name>``. The user then moves to the
    state named by ``cur_Process['A00']['next']``.

    Args:
        user_Info: the user record; its ``state`` attribute is read and
            updated.

    Returns:
        ``(send, user_Info)`` where ``send`` is the next state's 'question'.
    """
    cur_Process = processes[user_Info.state]
    img_path : str = cur_Process['imgpath']

    # org_im is the picture the user sent; the bot stored it on the server.
    org_im = cv2.imread(img_path)
    org_im = cv2.resize(org_im, (704, 480))
    data = {'image': cv2_to_base64(org_im)}

    # NOTE(review): the API key is hard-coded in the URL. Consider loading it
    # from configuration or the environment before publishing this code.
    url = "https://aistudio.baidu.com/serving/online/6716?apiKey=95564e53-5804-4e51-bd8d-c1a035eb36c4"
    r = requests.post(url=url, data=json.dumps(data))
    print(r)
    # Parse the response body once and reuse it.
    payload = r.json()
    print(payload)

    # Base64 payload of the processed image
    base64_img = payload['result']['image']

    # Make sure the output folder exists before writing the result file,
    # which the bot later sends back to the user.
    os.makedirs('target_img', exist_ok=True)
    target_img = os.path.join('target_img', 'target_img' + img_path.split('/')[-1])
    with open(target_img, 'wb') as f:
        # Drop an optional "data:...," prefix before decoding.
        f.write(base64.b64decode(base64_img.split(',')[-1]))

    user_Info.state = cur_Process['A00']['next']
    cur_Process = processes[user_Info.state]
    send = cur_Process['question']
    return send, user_Info
5. 下一步计划
我们的项目始于一次脑爆会,期间受到来自商业、设计、营销等领域的小伙伴的支持和指导。希望可以在技术的挖掘和探索下再深入一些!
下一步的计划 NEXT STEP:
- 需求调研和论证
- 多模态的交互方式与界面设计,引入手势、声控等方式
- 硬件研发和测试
欢迎感兴趣的伙伴邮件联系Kk karliekou@gmail.com