因为深度学习的训练数据较少,需要生成样本。虽然样本是按区间的形式生成的,但领导还是担心会有重复样本,怕训练时出现问题、不贴合实际场景。因此,先简单统计一下生成的样本中有多少是重复的。下面的代码用于判断同一个文件夹下有多少图像样本是完全相同的:
import hashlib
import os
import os.path as osp
import sys

import cv2
import numpy as np
def get_file_list(src_dir, ext='.jpg'):
    """Return the extension-less names of the files directly inside a folder.

    Only the top level of ``src_dir`` is scanned; files in sub-directories
    are ignored.

    Args:
        src_dir: Directory to scan (``str`` or path-like object).
        ext: Suffix a file name must end with to be kept. The match is
            case-sensitive.

    Returns:
        A sorted list of the matching file names with their extension
        removed. The list is empty when ``src_dir`` does not exist or holds
        no matching file.
    """
    # os.walk yields the top directory first, so taking only its first item
    # limits the scan to src_dir itself. This avoids comparing the yielded
    # path with src_dir, which fails for path-like arguments because os.walk
    # always yields str paths. A missing directory yields nothing at all.
    _, _, files = next(os.walk(src_dir), (src_dir, [], []))
    # Sorted so the result does not depend on the file system's listing order.
    return sorted(
        osp.splitext(file_name)[0]
        for file_name in files
        if file_name.endswith(ext)
    )
def CheckImages(src_dir, im_name_list, src_ext):
    """Print and return the fraction of images that are exact duplicates.

    Every image is read once with ``cv2.imread(..., cv2.IMREAD_COLOR)`` and
    reduced to a SHA-256 digest of its shape and pixel bytes. An image is
    counted as a duplicate when an earlier image in the list produced the
    same digest, so a group of ``k`` identical images adds ``k - 1`` to the
    count. That is the same total as counting the images that have an
    identical image later in the list.

    Args:
        src_dir: Directory that contains the images.
        im_name_list: Image file names without extension.
        src_ext: Extension (including the dot) appended to each name.

    Returns:
        The duplicate count divided by ``len(im_name_list)``; ``0.0`` when
        the list is empty.
    """
    num = len(im_name_list)
    seen_digests = set()
    count = 0
    for i, name in enumerate(im_name_list):
        im_name = osp.join(src_dir, name + src_ext)
        print(i, ' of ', num)
        img = cv2.imread(im_name, cv2.IMREAD_COLOR)
        if img is None:
            # cv2.imread returns None instead of raising when the file is
            # missing or cannot be decoded; skip it rather than crash.
            print('skip unreadable image', im_name)
            continue
        # The shape is part of the digest so that images with the same bytes
        # but different dimensions are not treated as identical.
        hasher = hashlib.sha256(repr(img.shape).encode('ascii'))
        hasher.update(img.tobytes())
        digest = hasher.digest()
        if digest in seen_digests:
            count += 1
        else:
            seen_digests.add(digest)
    # Guard against an empty list, which would otherwise divide by zero.
    ratio = float(count) / num if num else 0.0
    print('ratio of the same', ratio)
    return ratio
if __name__ == '__main__':
    # Usage: python <script> [src_dir] [ext]
    # Both arguments are optional; the values below are used when omitted.
    src_dir = '/media/ada/m-disk-6t/pycharmprojects/mysamples/samples/data'
    ext = '.png'
    if len(sys.argv) > 1:
        src_dir = sys.argv[1]
    if len(sys.argv) > 2:
        ext = sys.argv[2]
    name_list = get_file_list(src_dir, ext=ext)
    CheckImages(src_dir, name_list, ext)
最终测试发现,1000张图像中有1张重复。