A program that searches for the same image

A program that searches for the same image

This program finds duplicate images scattered under a given folder. Each image is shrunk and converted to monochrome so that only a minimum of its features remain, then it is turned into a vector, and the distance between the vectors is calculated. Images whose distance is 0 or close to 0 are judged to be the same image. That is the kind of program this is.

Make the image as small as possible

Then convert it to numpy.array for easier calculation.

def img2vec(filename, size=(200, 200)):
    """Load an image and reduce it to a small black-and-white NumPy array.

    Args:
        filename: Path of the image file to read.
        size: (width, height) the image is shrunk to before binarization.
            Smaller sizes make the later comparison cheaper.

    Returns:
        The array produced by ``np.array`` from the shrunken mode "1" image.
    """
    # The context manager closes the image file as soon as the pixels
    # have been read by resize().
    with Image.open(filename) as src:
        small = src.resize(size, resample=Image.BILINEAR)  # Shrink
    mono = small.convert('1')  # Binarization
    # mono.save(get_mono_filename(filename))  # If you want to check the image
    return np.array(mono)

The 200 x 200 size set here may be too large: running this program on 22,000 images took about 9 hours, so I think about 50 x 50 would be fine.

Compare images

Find the distance between the vectors with NumPy; NumPy makes this very easy.

def normdiff(vec1, vec2):
    """Return the Euclidean (Frobenius) distance between two arrays.

    Both inputs are converted to float before subtracting. Without this,
    boolean arrays (what a mode "1" image becomes) make NumPy raise a
    TypeError on ``-``, and unsigned integer arrays wrap around instead of
    going negative.

    Args:
        vec1: First array (or array-like).
        vec2: Second array (or array-like) with a compatible shape.

    Returns:
        The norm of ``vec1 - vec2``; 0 means the arrays are identical.
    """
    diff = np.asarray(vec1, dtype=float) - np.asarray(vec2, dtype=float)
    return np.linalg.norm(diff)

Test code

    # A mode "1" image becomes a boolean array, and NumPy refuses to subtract
    # boolean arrays; cast to int first, as the full program does before
    # saving the vectors.
    vec1 = img2vec("picture1.bmp").astype(int)
    vec2 = img2vec("picture2.bmp").astype(int)
    norm = normdiff(vec1, vec2)
    print(norm)

If you run test code like this using the functions above, the degree of similarity between the two images (the distance between their vectors) is displayed.

Whole source

import csv
import datetime
import glob
import multiprocessing as mulproc
import numpy as np
import os
from PIL import Image
import sys

def log(*args):
    """Print a timestamped message and append the same line to log.txt.

    Args:
        *args: Values to log; each is converted with ``str`` and the pieces
            are joined with single spaces after the timestamp.
    """
    stamp = datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S")
    line = " ".join(map(str, (stamp, *args)))
    print(line)
    with open("log.txt", mode="a", encoding="utf-8") as log_file:
        log_file.write(line + "\n")

def get_mono_filename(filename):
    """Return the path inside the "mono" folder with the same base name.

    Args:
        filename: Path of the original image; only its last component is used.

    Returns:
        ``"mono"`` joined with the base name of *filename*.
    """
    return os.path.join("mono", os.path.basename(filename))

def img2vec(filename, size=(200, 200)):
    """Load an image and reduce it to a small black-and-white NumPy array.

    Args:
        filename: Path of the image file to read.
        size: (width, height) the image is shrunk to before binarization.
            Smaller sizes make the later comparison cheaper.

    Returns:
        The array produced by ``np.array`` from the shrunken mode "1" image.
    """
    # Other resampling filters that can be tried here: NEAREST, BICUBIC and
    # LANCZOS (the filter formerly called ANTIALIAS).
    # The context manager closes the image file as soon as the pixels
    # have been read by resize().
    with Image.open(filename) as src:
        small = src.resize(size, resample=Image.BILINEAR)
    mono = small.convert('1')
    # mono.save(get_mono_filename(filename))  # If you want to check the image
    return np.array(mono)

def normdiff(vec1, vec2):
    """Return the Euclidean (Frobenius) distance between two arrays.

    Both inputs are converted to float before subtracting. Without this,
    boolean arrays (what a mode "1" image becomes) make NumPy raise a
    TypeError on ``-``, and unsigned integer arrays wrap around instead of
    going negative. Arrays that are already float64 are used as they are.

    Args:
        vec1: First array (or array-like).
        vec2: Second array (or array-like) with a compatible shape.

    Returns:
        The norm of ``vec1 - vec2``; 0 means the arrays are identical.
    """
    diff = np.asarray(vec1, dtype=float) - np.asarray(vec2, dtype=float)
    return np.linalg.norm(diff)

def walk_img_files(walk_dir):
    """Recursively yield the paths found under *walk_dir*.

    For each directory visited by ``os.walk`` the directory path itself is
    yielded first, followed by the path of every file directly inside it.
    Despite the name, nothing is filtered by file type here; callers are
    expected to pick out the pictures themselves.

    Args:
        walk_dir: Root directory to walk.

    Yields:
        Directory paths and file paths as strings.
    """
    for current_dir, _subdirs, names in os.walk(walk_dir):
        yield current_dir
        yield from (os.path.join(current_dir, name) for name in names)

def is_picture_filename(filename, extensions=("png", "jpg", "jpeg")):
    """Tell whether *filename* has one of the picture extensions.

    The comparison ignores case, so ``photo.JPG`` is accepted as well as
    ``photo.jpg``.

    Args:
        filename: File name or path to check.
        extensions: Extensions (without the leading dot) treated as pictures.

    Returns:
        True if *filename* ends with ``"." + ext`` for any given extension,
        otherwise False.
    """
    suffixes = tuple("." + ext.lower() for ext in extensions)
    return filename.lower().endswith(suffixes)

def save_vector(filename, vec):
    """Write *vec* to *filename* as CSV, one row of *vec* per line.

    Args:
        filename: Path of the file to create or overwrite.
        vec: Iterable of rows; each row is an iterable of values.
    """
    with open(filename, mode="w", encoding="utf-8") as out:
        row_writer = csv.writer(out, lineterminator='\n')
        for row in vec:
            row_writer.writerow(row)

def save_labels(filenames):
    """Store the given file names in ./labels.txt, one per line.

    The names are separated by newlines; no newline is written after the
    last one.

    Args:
        filenames: Iterable of strings to store.
    """
    with open("./labels.txt", mode="w", encoding="utf-8") as label_file:
        text = "\n".join(filenames)
        label_file.write(text)

def create_vector_dump(search_dir):
    """Vectorize every picture under *search_dir* and record the file list.

    Each picture is converted with img2vec and written as CSV to
    ``vector/<base name>.vector``; the list of picture paths is then saved
    with save_labels.

    NOTE(review): the vector file is named after the base name only, so two
    pictures with the same file name in different folders overwrite each
    other's vector. load_vecs uses the same naming, so changing it requires
    updating both functions together.

    Args:
        search_dir: Root folder that is searched recursively for pictures.
    """
    # Make sure the output folder exists so the first save does not fail.
    os.makedirs("vector", exist_ok=True)

    # walk_img_files yields directory paths too, so keep regular files only;
    # a folder named like "album.png" must not be opened as an image.
    img_files = [
        path for path in walk_img_files(search_dir)
        if is_picture_filename(path) and os.path.isfile(path)
    ]

    for img_file in img_files:
        vec = img2vec(img_file)
        vector_name = os.path.basename(img_file) + ".vector"
        # Cast to int so the CSV holds 0/1 rather than False/True.
        save_vector(os.path.join("vector", vector_name), vec.astype(int))
    save_labels(img_files)

def load_labels():
    """Read ./labels.txt and return its non-empty lines.

    Returns:
        A list of strings, one per non-empty line, in file order.
    """
    with open("./labels.txt", mode="r", encoding="utf-8") as label_file:
        lines = label_file.read().split("\n")
    return [line for line in lines if line]

def load_vecs(labels):
    """Load the saved vector of every label into one NumPy array.

    For each label the file ``vector/<base name of label>.vector`` is read
    with ``np.loadtxt`` (comma-delimited), and progress is reported through
    log.

    Args:
        labels: Sequence of image paths, as returned by load_labels.

    Returns:
        ``np.array`` built from the list of loaded vectors, in label order.
    """
    log("start load vectoes")
    total = len(labels)
    vecs = []
    # Count from 1 so the message after the last file reads "N/N complete"
    # rather than stopping at "N-1/N".
    for count, label in enumerate(labels, start=1):
        vector_path = os.path.join("vector", os.path.basename(label) + ".vector")
        vecs.append(np.loadtxt(vector_path, delimiter=","))
        log("load vectoes {}/{} complete".format(count, total))
    log("end load vectoes")
    return np.array(vecs)

def save_results(rows):
    """Write *rows* to results.csv in the current directory.

    Args:
        rows: Iterable of rows; each row is an iterable of values and
            becomes one comma-separated line.
    """
    with open("results.csv", mode="w", encoding="utf-8") as out:
        row_writer = csv.writer(out, lineterminator='\n')
        for row in rows:
            row_writer.writerow(row)

def create_join_imgs(filename, vecs):
    """Join several image arrays side by side and save them as one image.

    The arrays are concatenated along axis 1, multiplied by 255, turned into
    a PIL image, converted to mode "1" and saved.

    Args:
        filename: Path of the image file to write.
        vecs: Sequence of arrays to join; presumably the 0/1-valued 2-D
            arrays produced by load_vecs (verify against the caller).
    """
    strip = np.concatenate(vecs, axis=1)
    # Scale in place; concatenate returned a new array, so the inputs are
    # left untouched.
    np.multiply(strip, 255, out=strip)
    joined = Image.fromarray(strip)
    joined.convert("1").save(filename)

def create_dup_imgs(approximates, vecs, labels):
    """Save one joined image for every picture that has near-duplicates.

    For each index whose list of near-duplicates is not empty, the picture's
    own vector followed by the vectors of its near-duplicates is passed to
    create_join_imgs and written to ``dupulicate/<base name of label>``.

    Args:
        approximates: For each index, the list of indexes judged to be the
            same image.
        vecs: Image vectors, indexed like *labels*.
        labels: Image paths, indexed like *approximates*.
    """
    # Make sure the output folder exists so saving does not fail.
    os.makedirs("dupulicate", exist_ok=True)
    for i, approximate in enumerate(approximates):
        if not approximate:
            continue
        img_vecs = [vecs[i]] + [vecs[ai] for ai in approximate]
        out_name = os.path.join("dupulicate", os.path.basename(labels[i]))
        create_join_imgs(out_name, img_vecs)

class EnumApproximate:
    """Find, for every image, the other images whose vectors are close to it."""

    def __init__(self, threshold=10.0):
        """Load the saved labels and vectors.

        Args:
            threshold: Largest distance (as computed by normdiff) at which
                two images are still judged to be the same.
        """
        # To try the search on a subset, slice the labels here
        # (for example labels[:1000]).
        labels = load_labels()
        self.labels = labels  # image paths, one per vector
        self.vecs = load_vecs(labels)  # vectors, indexed like self.labels
        # The attribute keeps its original (misspelled) name so that existing
        # code reading ``threthold`` continues to work.
        self.threthold = float(threshold)

    def enum_approximate(self, index):
        """Return the indexes of the vectors close to ``self.vecs[index]``.

        Args:
            index: Index of the vector to compare against all others.

        Returns:
            Ascending list of indexes (excluding *index* itself) whose
            distance to the vector at *index* is at most ``self.threthold``.
        """
        vec = self.vecs[index]
        return [
            i for i, v in enumerate(self.vecs)
            if i != index and normdiff(vec, v) <= self.threthold
        ]

    def exec(self):
        """Run the search and write results.csv and the joined images.

        Each row of results.csv holds an image path followed by the paths of
        the images judged to be the same.
        """
        log("start")
        total = len(self.labels)
        approximates = []
        for i in range(total):
            # Logged before the comparison, so i is the number already done.
            log("enum_approximate vectoes {}/{} complete".format(i, total))
            approximates.append(self.enum_approximate(i))
        rows = [
            [label] + [self.labels[j] for j in idxs]
            for label, idxs in zip(self.labels, approximates)
        ]
        save_results(rows)
        create_dup_imgs(approximates, self.vecs, self.labels)
        log("end")

def main():
    """Load the saved vectors and run the duplicate search."""
    EnumApproximate().exec()

if __name__ == '__main__':
    # Folder to scan: the first command-line argument if one is given,
    # otherwise the original hard-coded location.
    search_dir = sys.argv[1] if len(sys.argv) > 1 else r"O:\picture\KanColle"
    create_vector_dump(search_dir)
    main()

About execution

Create two folders named vector and dupulicate in the same directory as the source file.

# Vectorize every picture found under the given folder.
create_vector_dump(r"O:\picture\KanColle")

Executing this one line creates, in the vector folder, one CSV file per image containing the vectorized image. Run main once vectorization is complete. You can also run both in one go, as the source above does.

When execution completes, a CSV-like file called results.csv is created, and images in which the duplicate images are joined side by side are created in the dupulicate folder.

43164116_big_p1.png

The program only finds duplicates; it does not delete them. If you want to do something with them from there, you can write a script based on results.csv.

Recommended Posts

A program that searches for the same image
Python program that looks for the same file name
Hashing algorithm for determining the same image
[Python] A program that rounds the score
A shell program that displays the Fibonacci sequence
Created a fooling image for the caption generative model
[Python] A program that counts the number of valleys
[Python] A program that compares the positions of kangaroos.
The image is a slug
A story about improving the program for partial filling of 3D binarized image data
Try a similar search for Image Search using the Python SDK [Search]
A program that just presses and releases the Esc key
[Python] A program that finds the most common bird types
[Golang] A program that determines the turn with random numbers
I wrote a script that splits the image in two
A program that automatically resizes the iOS app icon to the required image size in Python
The story of writing a program
Let's write a simple simulation program for the "Monty Hall problem"
[Ev3dev] Create a program that captures the LCD (screen) using python
Try to write a program that abuses the program and sends 100 emails
A program that answers a few questions and predicts the next answer
I made a program that automatically calculates the zodiac with tkinter
[Python] A program that rotates the contents of the list to the left
Try to generate a death metal jacket image with DCGAN + scrape the metal database site for that
Understand the probabilities and statistics that can be used for progress management with a python program
Download the top n Google image searches
Disclose the know-how that created a similar image search service for AV actresses by deep learning by chainer
[Python] A program that creates stairs with #
[Python] A program that calculates the number of chocolate segments that meet the conditions
If you create a program that automatically starts / terminates the verification environment for pull request, the verification has progressed.
A class that hits the DMM API
Find the dates for a jarring tournament
[Python] A program that calculates the number of socks to be paired
Latin learning for the purpose of writing a Latin sentence analysis program (Part 1)
Change the list in a for statement
A program that will slowly recover the economy from any news headline
A python program that resizes a video and turns it into an image
A code that corrects the yoon / sokuon (sokuon)
A program that plays rock-paper-scissors using Python
Set up a server that processes multiple connections at the same time
I made a LINE BOT that returns a terrorist image using the Flickr API
A simple workaround for bots to try to post tweets with the same content
Find a building on Google Earth that is the same height as Shingodzilla
[Python] A program that finds the minimum and maximum values without using methods
Let's display a simple template that is ideal for Django for the first time
[Python] A program that calculates the difference between the total numbers on the diagonal line.
[Python] A program that calculates the number of updates of the highest and lowest records
Turn multiple lists with a for statement at the same time in Python
A model that identifies the guitar with fast.ai
Created a Python wrapper for the Qiita API
Nogizaka46 A program that automatically saves blog images
Make a histogram for the time being (matplotlib)
A program that removes duplicate statements in Python
Python: Prepare a serializer for the class instance:
Image processing? The story of starting Python for
Detect folders with the same image in ImageHash
I made a Python program for Raspberry Pi that operates Omron's environmental sensor in the mode with data storage
A python script that generates a sample dataset for checking the operation of a classification tree
A program that notifies slack of the operating status of fully automatic botanical photography equipment
A python script that gets the number of jobs for a specified condition from indeed.com
A program that asks for a few kilograms to reach BMI and standard weight [Python]