This is a program that finds duplicate images scattered under a folder. Each image is shrunk, converted to monochrome, and vectorized while keeping its minimal distinguishing features; then the distances between the vectors are computed. Pairs whose distance is zero or close to zero are judged to be the same image.
Then convert it to numpy.array for easier calculation.
def img2vec(filename):
    """Shrink the image at *filename* to 200x200, binarize it, and
    return it as a numpy array of pixels."""
    image = Image.open(filename)
    image = image.resize((200, 200), resample=Image.BILINEAR)  # shrink
    image = image.convert('1')  # binarize to mode '1'
    # image.save(get_mono_filename(filename))  # uncomment to inspect the mono image
    return np.array(image)
The 200 x 200 size set here may be too large — running this program on about 22,000 images took roughly 9 hours — so around 50 x 50 should be sufficient.
numpy makes it very easy to compute the distance between two vectors.
def normdiff(vec1, vec2):
    """Return the Euclidean (L2) distance between the two vectors."""
    return np.linalg.norm(vec1 - vec2)
# Example: distance between two images; values at or near 0 indicate the
# two files are (near-)duplicates.
norm = normdiff(img2vec("picture1.bmp"), img2vec("picture2.bmp"))
print(norm)
Running a test like this with the functions above displays what might be called the degree of similarity between the two images.
import csv
import datetime
import glob
import multiprocessing as mulproc
import numpy as np
import os
from PIL import Image
import sys
def log(*args):
    """Echo a timestamped message to stdout and append it to log.txt."""
    now = datetime.datetime.now()
    pieces = [now.strftime("%Y/%m/%d %H:%M:%S")]
    pieces.extend(str(a) for a in args)
    line = " ".join(pieces)
    print(line)
    with open("log.txt", mode="a", encoding="utf-8") as out:
        out.write(line + "\n")
def get_mono_filename(filename):
    """Map an image path to its counterpart inside the local 'mono' folder."""
    return os.path.join("mono", os.path.basename(filename))
def img2vec(filename, size=(200, 200)):
    """Load an image, shrink it, binarize it, and return it as a numpy array.

    Generalized: the output size is now a parameter instead of a hard-coded
    200x200; the default preserves the original behavior. A smaller size
    (e.g. (50, 50)) runs much faster with little loss of matching accuracy.

    Pillow resampling filters available here include NEAREST, BILINEAR,
    BICUBIC and ANTIALIAS/LANCZOS; BILINEAR is used.

    Args:
        filename: path of the image file to open.
        size: (width, height) tuple to shrink the image to.

    Returns:
        np.ndarray of the binarized (mode '1') image pixels.
    """
    img = Image.open(filename)
    img = img.resize(size, resample=Image.BILINEAR)  # shrink, keep gross features
    img = img.convert('1')  # binarize to mode '1'
    # img.save(get_mono_filename(filename))  # uncomment to inspect the mono image
    return np.array(img)
def normdiff(vec1, vec2):
    """Euclidean (L2) distance between two image vectors; 0 means identical."""
    difference = vec1 - vec2
    return np.linalg.norm(difference)
def walk_img_files(walk_dir):
    """Yield the path of every file under *walk_dir*, recursively.

    Bug fix: the original also yielded each directory path itself
    (`yield root`), so a directory whose name happened to end in
    ".png"/".jpg" would be mistaken for an image by callers filtering
    with is_picture_filename(). Only real files are yielded now.
    """
    for root, dirs, files in os.walk(walk_dir):
        for name in files:
            yield os.path.join(root, name)
def is_picture_filename(filename):
    """Return True if *filename* has a recognized image extension.

    Generalized: the check is now case-insensitive (".JPG", ".Png", ...
    also match), which is backward compatible — every name accepted
    before is still accepted. Uses the tuple form of str.endswith
    instead of a manual loop.
    """
    return filename.lower().endswith((".png", ".jpg"))
def save_vector(filename, vec):
    """Dump the rows of *vec* to *filename* as CSV, one image row per line."""
    with open(filename, mode="w", encoding="utf-8") as out:
        csv.writer(out, lineterminator='\n').writerows(vec)
def save_labels(filenames):
    """Record the list of image paths, one per line, in ./labels.txt."""
    joined = "\n".join(filenames)
    with open("./labels.txt", mode="w", encoding="utf-8") as out:
        out.write(joined)
def create_vector_dump(search_dir):
    """Vectorize every picture under *search_dir* and dump each vector as CSV.

    Writes one "<basename>.vector" CSV file into the local "vector" folder
    for each image, then records all image paths in labels.txt.
    """
    img_files = [f for f in walk_img_files(search_dir) if is_picture_filename(f)]
    for img_file in img_files:
        vec = img2vec(img_file)
        basename = os.path.basename(img_file)
        out_path = os.path.join("vector", basename) + ".vector"
        save_vector(out_path, list(vec.astype(int)))
    save_labels(img_files)
def load_labels():
    """Read labels.txt and return its non-empty lines as a list of paths."""
    with open("./labels.txt", mode="r", encoding="utf-8") as rf:
        lines = rf.read().split("\n")
    return [line for line in lines if line]
def load_vecs(labels):
    """Load the dumped CSV vector for every path in *labels*.

    Each label is mapped to "vector/<basename>.vector" (the file written by
    create_vector_dump) and parsed with np.loadtxt; returns all vectors
    stacked into one ndarray, in label order.
    """
    log("start load vectoes")
    vecs = []
    total = len(labels)
    for i, label in enumerate(labels):
        csv_path = os.path.join("vector", os.path.basename(label) + ".vector")
        vecs.append(np.loadtxt(csv_path, delimiter=","))
        log("load vectoes {}/{} complete".format(i, total))
    log("end load vectoes")
    return np.array(vecs)
def save_results(rows):
    """Write the duplicate-detection result rows to results.csv."""
    with open("results.csv", mode="w", encoding="utf-8") as out:
        csv.writer(out, lineterminator='\n').writerows(rows)
def create_join_imgs(filename, vecs):
    """Concatenate the image vectors side by side and save them as one
    binary image file, so a group of duplicates can be inspected visually.

    Bug fix: the original did `vecs *= 255` in place, which raises a
    casting error when the vectors are boolean arrays (the direct img2vec
    output), and handed Image.fromarray a float64 array (mode 'F') for
    vectors loaded via np.loadtxt — a mode Pillow cannot reliably convert
    to '1'. Converting to uint8 first gives Pillow a well-defined 8-bit
    grayscale image to binarize and save.
    """
    joined = np.concatenate(vecs, axis=1)
    joined = (joined * 255).astype(np.uint8)  # 0/1 -> 0/255, 8-bit grayscale
    Image.fromarray(joined).convert("1").save(filename)
def create_dup_imgs(approximates, vecs, labels):
    """For every image that has near-duplicates, save a side-by-side image
    of it and its matches into the local "dupulicate" folder.

    *approximates* is a list parallel to *labels*: entry i holds the indexes
    of the vectors judged close to vector i.
    """
    for idx, matches in enumerate(approximates):
        if not matches:
            continue
        joined = [vecs[idx]] + [vecs[m] for m in matches]
        out_name = os.path.join("dupulicate", os.path.basename(labels[idx]))
        create_join_imgs(out_name, joined)
class EnumApproximate:
    """Finds groups of near-identical images by comparing their dumped vectors."""

    def __init__(self):
        """Load the label list and the pre-dumped vectors from disk."""
        labels = load_labels()
        # labels = labels[0:1000]  # uncomment to limit the run while testing
        self.labels = labels
        self.vecs = load_vecs(labels)
        # Distance at or below which two images count as duplicates.
        # (The attribute keeps the original spelling for compatibility.)
        self.threthold = float(10.0)

    def enum_approximate(self, index):
        """Return the indexes of every vector within the threshold distance
        of the vector at *index*, excluding *index* itself."""
        base = self.vecs[index]
        matches = []
        for i, other in enumerate(self.vecs):
            if i == index:
                continue
            if normdiff(base, other) <= self.threthold:
                matches.append(i)
        return matches

    def exec(self):
        """Run the full pass: compare every image against every other,
        write results.csv, and save joined duplicate images."""
        log("start")
        approximates = []
        total = len(self.labels)
        for i in range(total):
            log("enum_approximate vectoes {}/{} complete".format(i, total))
            approximates.append(self.enum_approximate(i))
        rows = []
        for i, label in enumerate(self.labels):
            rows.append([label] + [self.labels[j] for j in approximates[i]])
        save_results(rows)
        create_dup_imgs(approximates, self.vecs, self.labels)
        log("end")
def main():
    """Entry point: enumerate duplicate images from the dumped vectors."""
    EnumApproximate().exec()
if __name__ == '__main__':
    # Step 1: vectorize every image under the search folder.
    # NOTE(review): the search path is hard-coded to the author's machine —
    # change it before running.
    create_vector_dump(r"O:\picture\KanColle")
    # Step 2: enumerate duplicates from the dumped vectors.
    main()
Create folders named `vector` and `dupulicate` (the spelling the code uses) in the same directory as the source file.
create_vector_dump(r"O:\picture\KanColle")
Executing this single line creates CSV files of the vectorized images in the `vector` folder. Run `main` when vectorization is complete; the two steps can also be run at the same time.
When execution completes, a CSV file named `results.csv` is created, along with images in the `dupulicate` folder in which each set of duplicate images is joined side by side.
Finding duplicates does not delete them; if you want to act on the results, you can write a script based on `results.csv`.
Recommended Posts