This is a program that finds duplicate images scattered under a folder. Each image is shrunk, converted to monochrome, and vectorized while keeping its minimal distinguishing features; then the distances between the vectors are computed. Pairs whose distance is zero or close to zero are judged to be the same image.
Then convert it to numpy.array for easier calculation.
def img2vec(filename):
    """Shrink the image at *filename* to 200x200, binarize it, and
    return it as a numpy array of pixels."""
    image = Image.open(filename)
    image = image.resize((200, 200), resample=Image.BILINEAR)  # shrink
    image = image.convert('1')  # binarize to mode '1'
    # image.save(get_mono_filename(filename))  # uncomment to inspect the mono image
    return np.array(image)
The 200 x 200 size set here may be too large — running this program on about 22,000 images took roughly 9 hours — so around 50 x 50 should be sufficient.
numpy makes it very easy to compute the distance between two vectors.
def normdiff(vec1, vec2):
    """Return the Euclidean (L2) distance between the two vectors."""
    return np.linalg.norm(vec1 - vec2)
# Example: distance between two images; values at or near 0 indicate the
# two files are (near-)duplicates.
norm = normdiff(img2vec("picture1.bmp"), img2vec("picture2.bmp"))
print(norm)
Running a test like this with the functions above displays what might be called the degree of similarity between the two images.
import csv
import datetime
import glob
import multiprocessing as mulproc
import numpy as np
import os
from PIL import Image
import sys
def log(*args):
    """Echo a timestamped message to stdout and append it to log.txt."""
    now = datetime.datetime.now()
    pieces = [now.strftime("%Y/%m/%d %H:%M:%S")]
    pieces.extend(str(a) for a in args)
    line = " ".join(pieces)
    print(line)
    with open("log.txt", mode="a", encoding="utf-8") as out:
        out.write(line + "\n")
def get_mono_filename(filename):
    """Map an image path to its counterpart inside the local 'mono' folder."""
    return os.path.join("mono", os.path.basename(filename))
def img2vec(filename, size=(200, 200)):
    """Load an image, shrink it, binarize it, and return it as a numpy array.

    Generalized: the output size is now a parameter instead of a hard-coded
    200x200; the default preserves the original behavior. A smaller size
    (e.g. (50, 50)) runs much faster with little loss of matching accuracy.

    Pillow resampling filters available here include NEAREST, BILINEAR,
    BICUBIC and ANTIALIAS/LANCZOS; BILINEAR is used.

    Args:
        filename: path of the image file to open.
        size: (width, height) tuple to shrink the image to.

    Returns:
        np.ndarray of the binarized (mode '1') image pixels.
    """
    img = Image.open(filename)
    img = img.resize(size, resample=Image.BILINEAR)  # shrink, keep gross features
    img = img.convert('1')  # binarize to mode '1'
    # img.save(get_mono_filename(filename))  # uncomment to inspect the mono image
    return np.array(img)
def normdiff(vec1, vec2):
    """Euclidean (L2) distance between two image vectors; 0 means identical."""
    difference = vec1 - vec2
    return np.linalg.norm(difference)
def walk_img_files(walk_dir):
    """Yield the path of every file under *walk_dir*, recursively.

    Bug fix: the original also yielded each directory path itself
    (`yield root`), so a directory whose name happened to end in
    ".png"/".jpg" would be mistaken for an image by callers filtering
    with is_picture_filename(). Only real files are yielded now.
    """
    for root, dirs, files in os.walk(walk_dir):
        for name in files:
            yield os.path.join(root, name)
def is_picture_filename(filename):
    """Return True if *filename* has a recognized image extension.

    Generalized: the check is now case-insensitive (".JPG", ".Png", ...
    also match), which is backward compatible — every name accepted
    before is still accepted. Uses the tuple form of str.endswith
    instead of a manual loop.
    """
    return filename.lower().endswith((".png", ".jpg"))
def save_vector(filename, vec):
    """Dump the rows of *vec* to *filename* as CSV, one image row per line."""
    with open(filename, mode="w", encoding="utf-8") as out:
        csv.writer(out, lineterminator='\n').writerows(vec)
def save_labels(filenames):
    """Record the list of image paths, one per line, in ./labels.txt."""
    joined = "\n".join(filenames)
    with open("./labels.txt", mode="w", encoding="utf-8") as out:
        out.write(joined)
def create_vector_dump(search_dir):
    """Vectorize every picture under *search_dir* and dump each vector as CSV.

    Writes one "<basename>.vector" CSV file into the local "vector" folder
    for each image, then records all image paths in labels.txt.
    """
    img_files = [f for f in walk_img_files(search_dir) if is_picture_filename(f)]
    for img_file in img_files:
        vec = img2vec(img_file)
        basename = os.path.basename(img_file)
        out_path = os.path.join("vector", basename) + ".vector"
        save_vector(out_path, list(vec.astype(int)))
    save_labels(img_files)
def load_labels():
    """Read labels.txt and return its non-empty lines as a list of paths."""
    with open("./labels.txt", mode="r", encoding="utf-8") as rf:
        lines = rf.read().split("\n")
    return [line for line in lines if line]
def load_vecs(labels):
    """Load the dumped CSV vector for every path in *labels*.

    Each label is mapped to "vector/<basename>.vector" (the file written by
    create_vector_dump) and parsed with np.loadtxt; returns all vectors
    stacked into one ndarray, in label order.
    """
    log("start load vectoes")
    vecs = []
    total = len(labels)
    for i, label in enumerate(labels):
        csv_path = os.path.join("vector", os.path.basename(label) + ".vector")
        vecs.append(np.loadtxt(csv_path, delimiter=","))
        log("load vectoes {}/{} complete".format(i, total))
    log("end load vectoes")
    return np.array(vecs)
def save_results(rows):
    """Write the duplicate-detection result rows to results.csv."""
    with open("results.csv", mode="w", encoding="utf-8") as out:
        csv.writer(out, lineterminator='\n').writerows(rows)
def create_join_imgs(filename, vecs):
    """Concatenate the image vectors side by side and save them as one
    binary image file, so a group of duplicates can be inspected visually.

    Bug fix: the original did `vecs *= 255` in place, which raises a
    casting error when the vectors are boolean arrays (the direct img2vec
    output), and handed Image.fromarray a float64 array (mode 'F') for
    vectors loaded via np.loadtxt — a mode Pillow cannot reliably convert
    to '1'. Converting to uint8 first gives Pillow a well-defined 8-bit
    grayscale image to binarize and save.
    """
    joined = np.concatenate(vecs, axis=1)
    joined = (joined * 255).astype(np.uint8)  # 0/1 -> 0/255, 8-bit grayscale
    Image.fromarray(joined).convert("1").save(filename)
def create_dup_imgs(approximates, vecs, labels):
    """For every image that has near-duplicates, save a side-by-side image
    of it and its matches into the local "dupulicate" folder.

    *approximates* is a list parallel to *labels*: entry i holds the indexes
    of the vectors judged close to vector i.
    """
    for idx, matches in enumerate(approximates):
        if not matches:
            continue
        joined = [vecs[idx]] + [vecs[m] for m in matches]
        out_name = os.path.join("dupulicate", os.path.basename(labels[idx]))
        create_join_imgs(out_name, joined)
class EnumApproximate:
    """Finds groups of near-identical images by comparing their dumped vectors."""

    def __init__(self):
        """Load the label list and the pre-dumped vectors from disk."""
        labels = load_labels()
        # labels = labels[0:1000]  # uncomment to limit the run while testing
        self.labels = labels
        self.vecs = load_vecs(labels)
        # Distance at or below which two images count as duplicates.
        # (The attribute keeps the original spelling for compatibility.)
        self.threthold = float(10.0)

    def enum_approximate(self, index):
        """Return the indexes of every vector within the threshold distance
        of the vector at *index*, excluding *index* itself."""
        base = self.vecs[index]
        matches = []
        for i, other in enumerate(self.vecs):
            if i == index:
                continue
            if normdiff(base, other) <= self.threthold:
                matches.append(i)
        return matches

    def exec(self):
        """Run the full pass: compare every image against every other,
        write results.csv, and save joined duplicate images."""
        log("start")
        approximates = []
        total = len(self.labels)
        for i in range(total):
            log("enum_approximate vectoes {}/{} complete".format(i, total))
            approximates.append(self.enum_approximate(i))
        rows = []
        for i, label in enumerate(self.labels):
            rows.append([label] + [self.labels[j] for j in approximates[i]])
        save_results(rows)
        create_dup_imgs(approximates, self.vecs, self.labels)
        log("end")
def main():
    """Entry point: enumerate duplicate images from the dumped vectors."""
    EnumApproximate().exec()
if __name__ == '__main__':
    # Step 1: vectorize every image under the search folder.
    # NOTE(review): the search path is hard-coded to the author's machine —
    # change it before running.
    create_vector_dump(r"O:\picture\KanColle")
    # Step 2: enumerate duplicates from the dumped vectors.
    main()
Create folders named `vector` and `dupulicate` (the spelling the code uses) in the same directory as the source file.
create_vector_dump(r"O:\picture\KanColle")
Executing this single line creates CSV files of the vectorized images in the `vector` folder. Run `main` when vectorization is complete; the two steps can also be run at the same time.
When execution completes, a CSV file named `results.csv` is created, along with images in the `dupulicate` folder in which each set of duplicate images is joined side by side.
Finding duplicates does not delete them; if you want to act on the results, you can write a script based on `results.csv`.
Recommended Posts