I want to automatically find high-quality parts from the videos I shot

What I want to do





Output (to actually use the result, combine the following two files with ffmpeg or video editing software)


import datetime
import os

import cv2
import librosa
import numpy as np
import scipy

# Console-output helper; purely cosmetic.
def pretty_print_sec(sec):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Args:
        sec: Duration in seconds. Any fractional part is truncated.

    Returns:
        The duration as ``HH:MM:SS``. Each field is padded to at least two
        digits; the hour field grows as needed instead of being cut to its
        last two characters (100 hours renders as ``100:00:00``).
    """
    hours, remainder = divmod(int(sec), 3600)
    minutes, seconds = divmod(remainder, 60)
    return "{:02d}:{:02d}:{:02d}".format(hours, minutes, seconds)

# Tells whether a given second falls inside one of the clipping ranges.
def is_in(tuple_list, val):
    """Return True when ``val`` lies within any range of ``tuple_list``.

    Each entry is indexed as ``(lower, upper)`` and both bounds are
    inclusive. An empty list yields False.
    """
    return any(bounds[0] <= val <= bounds[1] for bounds in tuple_list)

# Percentile-based selection: keep every frame at least as loud as a threshold.
def cut_by_max_rms(rms, percentile, frame_sec=30):
    """Select runs of frames whose RMS is at or above a percentile threshold.

    Consecutive selected frames are merged into a single range.

    Args:
        rms: Two-dimensional RMS array; only the first row ``rms[0]`` is
            read, one value per frame.
        percentile: Percentile (0-100) of ``rms[0]`` used as the threshold.
        frame_sec: Length in seconds represented by one RMS frame.

    Returns:
        A list of ``(start_sec, end_sec)`` float tuples. Frame ``i`` maps to
        the interval ``[i * frame_sec, (i + 1) * frame_sec]``.
    """
    levels = np.asarray(rms[0])
    if levels.size == 0:
        # Nothing to threshold; np.percentile is not meaningful on no data.
        return []

    threshold = np.percentile(levels, percentile)
    cut_list = []
    is_on = False
    start = 0.0

    for i, val in enumerate(levels):
        if val >= threshold and not is_on:
            # A new loud run begins at this frame.
            is_on = True
            start = float(i) * frame_sec
        elif val < threshold and is_on:
            # The run ended on the previous frame, i.e. at this frame's start.
            is_on = False
            cut_list.append((start, float(i) * frame_sec))
        # Otherwise the current state continues unchanged.

    if is_on:
        # The last run extends to the end of the final frame.
        cut_list.append((start, float(levels.size) * frame_sec))

    return cut_list

# Local-maximum-based selection: keep the frames where the volume peaks.
def cut_by_local_max_rms(rms, max_frame_num, frame_sec=30):
    """Select frames that are local maxima of the RMS curve.

    The neighbourhood size (``order``) passed to ``scipy.signal.argrelmax``
    is widened step by step until fewer than ``max_frame_num`` peaks remain.

    Args:
        rms: Two-dimensional RMS array; only the first row ``rms[0]`` is
            read, one value per frame.
        max_frame_num: Exclusive upper bound on the number of peaks returned.
        frame_sec: Length in seconds represented by one RMS frame.

    Returns:
        A list of ``(start_sec, end_sec)`` tuples, one per selected frame,
        where frame ``i`` maps to ``(i * frame_sec, (i + 1) * frame_sec)``.
    """
    # A bare `import scipy` is not guaranteed to load the `signal` submodule.
    from scipy import signal

    levels = np.asarray(rms[0])
    if max_frame_num <= 1 or levels.size == 0:
        # "Fewer than one peak" can only mean no peaks. Without this guard
        # the widening loop below can never satisfy its exit condition while
        # a strict global maximum keeps being reported.
        return []

    order = 1
    while True:
        peaks = signal.argrelmax(levels, order=order)[0]
        if len(peaks) < max_frame_num:
            break
        order += 1

    return [(int(p) * frame_sec, (int(p) + 1) * frame_sec) for p in peaks]

# Decide which parts of the recording to cut out, based on volume.
def decide_cut_frames(cut_type, voice_file):
    """Compute the time ranges to extract from the loudness of a voice track.

    Args:
        cut_type: ``"local_max"`` to pick frames where the volume peaks, or
            ``"max"`` to pick the loudest 5% of frames.
        voice_file: Path of the audio file to analyse.

    Returns:
        A list of ``(start_sec, end_sec)`` tuples.

    Raises:
        ValueError: If ``cut_type`` is not one of the supported values.
    """
    # Load at a low sample rate (8000 Hz, mono) to keep the analysis light.
    y_voice, sr_voice = librosa.load(voice_file, sr=8000, mono=True)

    # One RMS value per 30 seconds of audio.
    window = int(sr_voice * 30)
    # NOTE(review): with librosa's default centering the frames may be offset
    # by half a window relative to the [30*i, 30*(i+1)] ranges produced by the
    # cut functions -- confirm whether center=False is wanted here.
    rms = librosa.feature.rms(
        y=y_voice,
        frame_length=window,
        hop_length=window,
    )

    if cut_type == "local_max":
        # Frames where the volume peaks, fewer than 20 of them.
        return cut_by_local_max_rms(rms, 20)
    if cut_type == "max":
        # Loudest 5% of frames, i.e. everything at or above the 95th percentile.
        return cut_by_max_rms(rms, 100 - 5)

    raise ValueError(
        "unknown cut_type {!r}; expected 'local_max' or 'max'".format(cut_type)
    )

# Video crop
def cut_movie(cut_list, movie_file, output_movie_file):
    """Write the frames of ``movie_file`` that fall inside ``cut_list``.

    Each written frame is stamped with its position in the source video.

    NOTE(review): several statements of this function were missing from the
    file (output removal, loop exits, the text-drawing call, the frame write).
    They were reconstructed from the surrounding code and comments; the
    overlay text, font scale, thickness and line type are assumptions.

    Args:
        cut_list: List of ``(start_sec, end_sec)`` ranges to keep.
        movie_file: Path of the source video.
        output_movie_file: Path of the video to create; replaced if present.

    Raises:
        IOError: If the source video cannot be opened.
    """
    movie = cv2.VideoCapture(movie_file)
    if not movie.isOpened():
        movie.release()
        raise IOError("could not open video file: {}".format(movie_file))

    out = None
    try:
        fps = movie.get(cv2.CAP_PROP_FPS)
        height = movie.get(cv2.CAP_PROP_FRAME_HEIGHT)
        width = movie.get(cv2.CAP_PROP_FRAME_WIDTH)
        print(fps, int(width), int(height))

        # Output format; note that it may differ depending on the OS.
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")

        # An existing output file causes an error, so remove it first.
        if os.path.exists(output_movie_file):
            os.remove(output_movie_file)

        # Pass the frame rate unrounded: truncating e.g. 29.97 to 29 would
        # stretch the output relative to the separately cut audio.
        out = cv2.VideoWriter(
            output_movie_file, fourcc, fps, (int(width), int(height))
        )

        font = cv2.FONT_HERSHEY_SIMPLEX
        text_origin = (10, int(height * 0.9))

        for start, end in cut_list:
            i = start * fps
            movie.set(cv2.CAP_PROP_POS_MSEC, start * 1000)

            # Read frame by frame from start and stop once end is exceeded.
            while movie.isOpened():
                sec = float(i / fps)
                if sec % 60 == 0:
                    print(pretty_print_sec(sec), datetime.datetime.now(), flush=True)

                ret, frame = movie.read()
                if not ret:
                    break

                # Stamp the current source time onto the frame.
                cv2.putText(
                    frame,
                    pretty_print_sec(sec),
                    text_origin,
                    font,
                    1,
                    (0, 255, 0),
                    2,
                    cv2.LINE_AA,
                )

                if is_in(cut_list, sec):
                    out.write(frame)

                i += 1
                if sec > end:
                    break
    finally:
        # Always release the capture and flush/close the writer.
        movie.release()
        if out is not None:
            out.release()


# Audio crop
def cut_audio(cut_list, voice_file, output_audio_file):
    """Write the samples of ``voice_file`` that fall inside ``cut_list``.

    NOTE(review): the statement that wrote the output was missing from the
    file. ``scipy.io.wavfile.write`` is used here because scipy is already a
    dependency of this script; confirm the resulting WAV format is acceptable.

    Args:
        cut_list: List of ``(start_sec, end_sec)`` ranges to keep.
        voice_file: Path of the source audio.
        output_audio_file: Path of the WAV file to create.
    """
    from scipy.io import wavfile

    # sr=None keeps the file's own sample rate instead of resampling.
    y_full, sr_full = librosa.load(voice_file, sr=None, mono=False)
    # A mono file loads as a 1-D array; normalise to (channels, samples).
    y_full = np.atleast_2d(y_full)
    total_samples = y_full.shape[1]

    segments = []
    for start, end in cut_list:
        first = max(int(start * sr_full), 0)
        # The end sample is inclusive; clamp so a range that runs past the
        # end of the audio does not index out of bounds.
        last = min(int(end * sr_full) + 1, total_samples)
        if first >= last:
            continue

        print(pretty_print_sec(start), datetime.datetime.now(), flush=True)
        segments.append(y_full[:, first:last])

    if segments:
        cut = np.concatenate(segments, axis=1)
    else:
        cut = y_full[:, :0]

    # wavfile.write expects data laid out as (samples, channels).
    wavfile.write(output_audio_file, int(sr_full), np.ascontiguousarray(cut.T))

def main():
    """Pick the ranges to keep from the voice track, then cut video and audio."""
    # Inputs: the full video, its extracted audio, and a voice-only track.
    source_video = "full.mp4"
    source_audio = "full.mp3"
    source_voice = "voice.wav"

    # Selection strategy: "local_max" picks volume peaks; the alternative
    # "max" picks by maximum value.
    selection_mode = "local_max"

    segments = decide_cut_frames(selection_mode, source_voice)

    # Outputs are produced separately and are meant to be combined afterwards.
    cut_movie(segments, source_video, "cut.mp4")
    cut_audio(segments, source_audio, "cut.wav")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


