From 27c0b9904a13cdfb6b01b777f6c9e64949e0cca6 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Sun, 19 Mar 2023 19:45:16 +0200 Subject: [PATCH 1/4] Convert to package --- README.md | 26 +++++++++++++------------- polymath/__init__.py | 0 polymath.py => polymath/__main__.py | 0 3 files changed, 13 insertions(+), 13 deletions(-) create mode 100644 polymath/__init__.py rename polymath.py => polymath/__main__.py (100%) mode change 100755 => 100644 diff --git a/README.md b/README.md index 2e70233..db2c7fb 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ docker run \ -v "$(pwd)"/separated:/polymath/separated \ -v "$(pwd)"/library:/polymath/library \ -v "$(pwd)"/input:/polymath/input \ - polymath python /polymath/polymath.py -a ./input/song1.wav + polymath python -m polymath -a ./input/song1.wav ``` ## Run Polymath @@ -73,54 +73,54 @@ docker run \ ##### Add YouTube video to library (auto-download) ```bash -python polymath.py -a n6DAqMFe97E +python -m polymath -a n6DAqMFe97E ``` ##### Add audio file (wav or mp3) ```bash -python polymath.py -a /path/to/audiolib/song.wav +python -m polymath -a /path/to/audiolib/song.wav ``` ##### Add multiple files at once ```bash -python polymath.py -a n6DAqMFe97E,eaPzCHEQExs,RijB8wnJCN0 -python polymath.py -a /path/to/audiolib/song1.wav,/path/to/audiolib/song2.wav -python polymath.py -a /path/to/audiolib/ +python -m polymath -a n6DAqMFe97E,eaPzCHEQExs,RijB8wnJCN0 +python -m polymath -a /path/to/audiolib/song1.wav,/path/to/audiolib/song2.wav +python -m polymath -a /path/to/audiolib/ ``` Songs are automatically analyzed once which takes some time. Once in the database, they can be access rapidly. The database is stored in the folder "/library/database.p". To reset everything, simply delete it. ### 2. 
Quantize songs in the Polymath Library ##### Quantize a specific songs in the library to tempo 120 BPM (-q = database audio file ID, -t = tempo in BPM) ```bash -python polymath.py -q n6DAqMFe97E -t 120 +python -m polymath -q n6DAqMFe97E -t 120 ``` ##### Quantize all songs in the library to tempo 120 BPM ```bash -python polymath.py -q all -t 120 +python -m polymath -q all -t 120 ``` ##### Quantize a specific songs in the library to the tempo of the song (-k) ```bash -python polymath.py -q n6DAqMFe97E -k +python -m polymath -q n6DAqMFe97E -k ``` Songs are automatically quantized to the same tempo and beat-grid and saved to the folder “/processed”. ### 3. Search for similar songs in the Polymath Library ##### Search for 10 similar songs based on a specific songs in the library (-s = database audio file ID, -sa = results amount) ```bash -python polymath.py -s n6DAqMFe97E -sa 10 +python -m polymath -s n6DAqMFe97E -sa 10 ``` ##### Search for similar songs based on a specific songs in the library and quantize all of them to tempo 120 BPM ```bash -python polymath.py -s n6DAqMFe97E -sa 10 -q all -t 120 +python -m polymath -s n6DAqMFe97E -sa 10 -q all -t 120 ``` ##### Include BPM as search criteria (-st) ```bash -python polymath.py -s n6DAqMFe97E -sa 10 -q all -t 120 -st -k +python -m polymath -s n6DAqMFe97E -sa 10 -q all -t 120 -st -k ``` Similar songs are automatically found and optionally quantized and saved to the folder "/processed". This makes it easy to create for example an hour long mix of songs that perfectly match one after the other. ### 4. Convert Audio to MIDI ##### Convert all processed audio files and stems to MIDI (-m) ```bash -python polymath.py -a n6DAqMFe97E -q all -t 120 -m +python -m polymath -a n6DAqMFe97E -q all -t 120 -m ``` Generated Midi Files are currently always 120BPM and need to be time adjusted in your DAW. This will be resolved [soon](https://github.com/spotify/basic-pitch/issues/40). 
The current Audio2Midi model gives mixed results with drums/percussion. This will be resolved with additional audio2midi model options in the future. diff --git a/polymath/__init__.py b/polymath/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/polymath.py b/polymath/__main__.py old mode 100755 new mode 100644 similarity index 100% rename from polymath.py rename to polymath/__main__.py From 01c6cd26e53e55980b95daa792a416f5ab3dba82 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Sun, 19 Mar 2023 19:45:38 +0200 Subject: [PATCH 2/4] Delete unused function --- polymath/__main__.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/polymath/__main__.py b/polymath/__main__.py index 9e302bc..5fb9289 100644 --- a/polymath/__main__.py +++ b/polymath/__main__.py @@ -550,10 +550,6 @@ def get_nearest(query,videos,querybpm, searchforbpm): # print("get_nearest",nearest.id) return nearest -def getNearest(k, array): - k = k / 10 # HACK - return min(enumerate(array), key=lambda x: abs(x[1]-k)) - ################## MAIN ################## From ee42a3e8b69329796dea49c71031d05fc15ddfc4 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Sun, 19 Mar 2023 20:00:43 +0200 Subject: [PATCH 3/4] Split into modules --- .gitignore | 4 +- polymath/__init__.py | 1 + polymath/__main__.py | 554 +------------------------------------ polymath/audio_features.py | 165 +++++++++++ polymath/extract.py | 78 ++++++ polymath/ingest.py | 141 ++++++++++ polymath/library.py | 28 ++ polymath/midi.py | 18 ++ polymath/nearest.py | 40 +++ polymath/quantize.py | 107 +++++++ 10 files changed, 591 insertions(+), 545 deletions(-) create mode 100644 polymath/audio_features.py create mode 100644 polymath/extract.py create mode 100644 polymath/ingest.py create mode 100644 polymath/library.py create mode 100644 polymath/midi.py create mode 100644 polymath/nearest.py create mode 100644 polymath/quantize.py diff --git a/.gitignore b/.gitignore index fe504ed..bc170d9 100644 --- a/.gitignore +++ 
b/.gitignore @@ -1,3 +1,5 @@ +*.py[cod] +.idea library/ processed/ -separated/ \ No newline at end of file +separated/ diff --git a/polymath/__init__.py b/polymath/__init__.py index e69de29..412727a 100644 --- a/polymath/__init__.py +++ b/polymath/__init__.py @@ -0,0 +1 @@ +from .library import Video # noqa - required for compatibility with old database files diff --git a/polymath/__main__.py b/polymath/__main__.py index 5fb9289..734c696 100644 --- a/polymath/__main__.py +++ b/polymath/__main__.py @@ -1,557 +1,23 @@ #!/usr/bin/env python +########################################## +################ POLYMATH ################ +############## by samim.io ############### +########################################## import os -import sys import pickle import argparse -import subprocess -import fnmatch -import hashlib -import shutil -from math import log2, pow -import numpy as np -import librosa -import crepe -import soundfile as sf -import pyrubberband as pyrb -from yt_dlp import YoutubeDL -from sf_segmenter.segmenter import Segmenter import tensorflow as tf from basic_pitch import ICASSP_2022_MODEL_PATH -from basic_pitch.inference import predict_and_save -from basic_pitch.inference import predict - -########################################## -################ POLYMATH ################ -############## by samim.io ############### -########################################## - -class Video: - def __init__(self,name,video,audio): - self.id = "" - self.url = "" - self.name = name - self.video = video - self.audio = audio - self.video_features = [] - self.audio_features = [] - -### Library - -LIBRARY_FILENAME = "library/database.p" -basic_pitch_model = "" - -def write_library(videos): - with open(LIBRARY_FILENAME, "wb") as lib: - pickle.dump(videos, lib) - - -def read_library(): - try: - with open(LIBRARY_FILENAME, "rb") as lib: - return pickle.load(lib) - except: - print("No Database file found:", LIBRARY_FILENAME) - return [] - - -################## VIDEO PROCESSING 
################## - -def audio_extract(vidobj,file): - print("audio_extract",file) - command = "ffmpeg -hide_banner -loglevel panic -i "+file+" -ab 160k -ac 2 -ar 44100 -vn -y " + vidobj.audio - subprocess.call(command,shell=True) - return vidobj.audio - -def video_download(vidobj,url): - print("video_download",url) - ydl_opts = { - 'outtmpl': 'library/%(id)s', - 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio/best--merge-output-format mp4', - } - with YoutubeDL(ydl_opts) as ydl: - ydl.download(url) - - with ydl: result = ydl.extract_info(url, download=True) - - if 'entries' in result: video = result['entries'][0] # Can be a playlist or a list of videos - else: video = result # Just a video - - filename = f"library/{video['id']}.{video['ext']}" - print("video_download: filename",filename,"extension",video['ext']) - vidobj.id = video['id'] - vidobj.name = video['title'] - vidobj.video = filename - vidobj.url = url - return vidobj - -def video_process(vids,videos): - for vid in vids: - print('------ process video',vid) - # check if id already in db - download_vid = True - for video in videos: - if video.id == vid: - print("already in db",vid) - download_vid = False - break - - # analyse videos and save to disk - if download_vid: - video = Video(vid,vid,f"library/{vid}.wav") - video = video_download(video,f"https://www.youtube.com/watch?v={vid}") - audio_extract(video,video.video) - videos.append(video) - print("NAME",video.name,"VIDEO",video.video,"AUDIO",video.audio) - write_library(videos) - print("video_process DONE",len(videos)) - return videos - -################## AUDIO PROCESSING ################## - -def audio_directory_process(vids, videos): - filesToProcess = [] - for vid in vids: - path = vid - pattern = "*.mp3" - for filename in fnmatch.filter(os.listdir(path), pattern): - filepath = os.path.join(path, filename) - print(filepath) - if os.path.isfile(filepath): - filesToProcess.append(filepath) - - print('Found', len(filesToProcess), 
'wav or mp3 files') - if len(filesToProcess) > 0: - videos = audio_process(filesToProcess, videos) - return videos - -def audio_process(vids, videos): - for vid in vids: - print('------ process audio',vid) - # extract file name - audioname = vid.split("/")[-1] - audioname, _ = audioname.split(".") - - # generate a unique ID based on file path and name - hash_object = hashlib.sha256(vid.encode()) - audioid = hash_object.hexdigest() - audioid = f"{audioname}_{audioid}" - - # check if id already in db - process_audio = True - for video in videos: - if video.id == audioid: - print("already in db",vid) - process_audio = False - break - - # check if is mp3 and convert it to wav - if vid.endswith(".mp3"): - # convert mp3 to wav and save it - print('converting mp3 to wav:', vid) - y, sr = librosa.load(path=vid, sr=None, mono=False) - path = os.path.join(os.getcwd(), 'library', audioid+'.wav') - # resample to 44100k if required - if sr != 44100: - print('converting audio file to 44100:', vid) - y = librosa.resample(y, orig_sr=sr, target_sr=44100) - sf.write(path, np.ravel(y), 44100) - vid = path - - # check if is wav and copy it to local folder - elif vid.endswith(".wav"): - path1 = vid - path2 = os.path.join(os.getcwd(), 'library', audioid+'.wav') - y, sr = librosa.load(path=vid, sr=None, mono=False) - if sr != 44100: - print('converting audio file to 44100:', vid) - y = librosa.resample(y, orig_sr=sr, target_sr=44100) - sf.write(path2, y, 44100) - else: - shutil.copy2(path1, path2) - vid = path2 - - # analyse videos and save to disk - if process_audio: - video = Video(audioname,'',vid) - video.id = audioid - video.url = vid - videos.append(video) - write_library(videos) - print("Finished procesing files:",len(videos)) - - return videos - -################## AUDIO FEATURES ################## - -def root_mean_square(data): - return float(np.sqrt(np.mean(np.square(data)))) - -def loudness_of(data): - return root_mean_square(data) - -def normalized(list): - """Given an audio 
buffer, return it with the loudest value scaled to 1.0""" - return list.astype(np.float32) / float(np.amax(np.abs(list))) - -neg80point8db = 0.00009120108393559096 -bit_depth = 16 -default_silence_threshold = (neg80point8db * (2 ** (bit_depth - 1))) * 4 - -def start_of(list, threshold=default_silence_threshold, samples_before=1): - if int(threshold) != threshold: - threshold = threshold * float(2 ** (bit_depth - 1)) - index = np.argmax(np.absolute(list) > threshold) - if index > (samples_before - 1): - return index - samples_before - else: - return 0 - -def end_of(list, threshold=default_silence_threshold, samples_after=1): - if int(threshold) != threshold: - threshold = threshold * float(2 ** (bit_depth - 1)) - rev_index = np.argmax( - np.flipud(np.absolute(list)) > threshold - ) - if rev_index > (samples_after - 1): - return len(list) - (rev_index - samples_after) - else: - return len(list) - -def trim_data( - data, - start_threshold=default_silence_threshold, - end_threshold=default_silence_threshold -): - start = start_of(data, start_threshold) - end = end_of(data, end_threshold) - - return data[start:end] -def load_and_trim(file): - y, rate = librosa.load(file, mono=True) - y = normalized(y) - trimmed = trim_data(y) - return trimmed, rate +from polymath.extract import get_audio_features +from polymath.ingest import video_process, audio_directory_process, audio_process +from polymath.library import write_library, read_library +from polymath.nearest import get_nearest +from polymath.quantize import quantizeAudio -def get_loudness(file): - loudness = -1 - try: - audio, rate = load_and_trim(file) - loudness = loudness_of(audio) - except Exception as e: - sys.stderr.write(f"Failed to run on {file}: {e}\n") - return loudness - -def get_volume(file): - volume = -1 - avg_volume = -1 - try: - audio, rate = load_and_trim(file) - volume = librosa.feature.rms(y=audio)[0] - avg_volume = np.mean(volume) - loudness = loudness_of(audio) - except Exception as e: - 
sys.stderr.write(f"Failed to get Volume and Loudness on {file}: {e}\n") - return volume, avg_volume, loudness - -def get_key(freq): - A4 = 440 - C0 = A4*pow(2, -4.75) - name = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"] - h = round(12*log2(freq/C0)) - octave = h // 12 - n = h % 12 - return name[n] + str(octave) - -def get_average_pitch(pitch): - pitches = [] - confidences_thresh = 0.8 - i = 0 - while i < len(pitch): - if(pitch[i][2] > confidences_thresh): - pitches.append(pitch[i][1]) - i += 1 - if len(pitches) > 0: - average_frequency = np.array(pitches).mean() - average_key = get_key(average_frequency) - else: - average_frequency = 0 - average_key = "A0" - return average_frequency,average_key - -def get_intensity(y, sr, beats): - # Beat-synchronous Loudness - Intensity - CQT = librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('A1')) - freqs = librosa.cqt_frequencies(CQT.shape[0], fmin=librosa.note_to_hz('A1')) - perceptual_CQT = librosa.perceptual_weighting(CQT**2, freqs, ref=np.max) - CQT_sync = librosa.util.sync(perceptual_CQT, beats, aggregate=np.median) - return CQT_sync - -def get_pitch(y_harmonic, sr, beats): - # Chromagram - C = librosa.feature.chroma_cqt(y=y_harmonic, sr=sr) - # Beat-synchronous Chroma - Pitch - C_sync = librosa.util.sync(C, beats, aggregate=np.median) - return C_sync - -def get_timbre(y, sr, beats): - # Mel spectogram - S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128) - log_S = librosa.power_to_db(S, ref=np.max) - # MFCC - Timbre - mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=13) - delta_mfcc = librosa.feature.delta(mfcc) - delta2_mfcc = librosa.feature.delta(mfcc, order=2) - M = np.vstack([mfcc, delta_mfcc, delta2_mfcc]) - # Beat-synchronous MFCC - Timbre - M_sync = librosa.util.sync(M, beats) - return M_sync - -def get_segments(audio_file): - segmenter = Segmenter() - boundaries, labs = segmenter.proc_audio(audio_file) - return boundaries,labs - -def get_pitch_dnn(audio_file): - # DNN Pitch Detection - pitch = 
[] - audio, sr = librosa.load(audio_file) - time, frequency, confidence, activation = crepe.predict(audio, sr, model_capacity="tiny", viterbi=True, center=True, step_size=10, verbose=1) # tiny|small|medium|large|full - i = 0 - while i < len(time): - pitch.append([time[i],frequency[i],confidence[i]]) - i += 1 - return pitch - -def stemsplit(destination, demucsmodel): - subprocess.run(["demucs", destination, "-n", demucsmodel]) # '--mp3' - -def extractMIDI(audio_paths, output_dir): - print('- Extract Midi') - save_midi = True - sonify_midi = False - save_model_outputs = False - save_notes = False - - predict_and_save(audio_path_list=audio_paths, - output_directory=output_dir, - save_midi=save_midi, - sonify_midi=sonify_midi, - save_model_outputs=save_model_outputs, - save_notes=save_notes) - - -def quantizeAudio(vid, bpm=120, keepOriginalBpm = False, pitchShiftFirst = False, extractMidi = False): - print("Quantize Audio: Target BPM", bpm, - "-- id:",vid.id, - "bpm:",round(vid.audio_features["tempo"],2), - "frequency:",round(vid.audio_features['frequency'],2), - "key:",vid.audio_features['key'], - "timbre:",round(vid.audio_features['timbre'],2), - "name:",vid.name, - 'keepOriginalBpm:', keepOriginalBpm - ) - - # load audio file - y, sr = librosa.load(vid.audio, sr=None) - - # Keep Original Song BPM - if keepOriginalBpm: - bpm = float(vid.audio_features['tempo']) - print('Keep original audio file BPM:', vid.audio_features['tempo']) - # Pitch Shift audio file to desired BPM first - elif pitchShiftFirst: # WORK IN PROGRESS - print('Pitch Shifting audio to desired BPM', bpm) - # Desired tempo in bpm - original_tempo = vid.audio_features['tempo'] - speed_factor = bpm / original_tempo - # Resample the audio to adjust the sample rate accordingly - sr_stretched = int(sr / speed_factor) - y = librosa.resample(y=y, orig_sr=sr, target_sr=sr_stretched) #, res_type='linear' - y = librosa.resample(y, orig_sr=sr, target_sr=44100) - - # extract beat - y_harmonic, y_percussive = 
librosa.effects.hpss(y) - tempo, beats = librosa.beat.beat_track(sr=sr, onset_envelope=librosa.onset.onset_strength(y=y_percussive, sr=sr), trim=False) - beat_frames = librosa.frames_to_samples(beats) - - # generate metronome - fixed_beat_times = [] - for i in range(len(beat_frames)): - fixed_beat_times.append(i * 120 / bpm) - fixed_beat_frames = librosa.time_to_samples(fixed_beat_times) - - # construct time map - time_map = [] - for i in range(len(beat_frames)): - new_member = (beat_frames[i], fixed_beat_frames[i]) - time_map.append(new_member) - - # add ending to time map - original_length = len(y+1) - orig_end_diff = original_length - time_map[i][0] - new_ending = int(round(time_map[i][1] + orig_end_diff * (tempo / bpm))) - new_member = (original_length, new_ending) - time_map.append(new_member) - - # time strech audio - print('- Quantize Audio: source') - strechedaudio = pyrb.timemap_stretch(y, sr, time_map) - - path_suffix = ( - f"Key {vid.audio_features['key']} - " - f"Freq {round(vid.audio_features['frequency'], 2)} - " - f"Timbre {round(vid.audio_features['timbre'], 2)} - " - f"BPM Original {int(vid.audio_features['tempo'])} - " - f"BPM {bpm}" - ) - path_prefix = ( - f"{vid.id} - {vid.name}" - ) - - audiofilepaths = [] - # save audio to disk - path = os.path.join(os.getcwd(), 'processed', path_prefix + " - " + path_suffix +'.wav') - sf.write(path, strechedaudio, sr) - audiofilepaths.append(path) - - # process stems - stems = ['bass', 'drums', 'guitar', 'other', 'piano', 'vocals'] - for stem in stems: - path = os.path.join(os.getcwd(), 'separated', 'htdemucs_6s', vid.id, stem +'.wav') - print(f"- Quantize Audio: {stem}") - y, sr = librosa.load(path, sr=None) - strechedaudio = pyrb.timemap_stretch(y, sr, time_map) - # save stems to disk - path = os.path.join(os.getcwd(), 'processed', path_prefix + " - Stem " + stem + " - " + path_suffix +'.wav') - sf.write(path, strechedaudio, sr) - audiofilepaths.append(path) - - # metronome click (optinal) - click = False - 
if click: - clicks_audio = librosa.clicks(times=fixed_beat_times, sr=sr) - print(len(clicks_audio), len(strechedaudio)) - clicks_audio = clicks_audio[:len(strechedaudio)] - path = os.path.join(os.getcwd(), 'processed', vid.id + '- click.wav') - sf.write(path, clicks_audio, sr) - - if extractMidi: - output_dir = os.path.join(os.getcwd(), 'processed') - extractMIDI(audiofilepaths, output_dir) - - -def get_audio_features(file,file_id,extractMidi = False): - print("------------------------------ get_audio_features:",file_id,"------------------------------") - print('1/8 segementation') - segments_boundaries,segments_labels = get_segments(file) - - print('2/8 pitch tracking') - frequency_frames = get_pitch_dnn(file) - average_frequency,average_key = get_average_pitch(frequency_frames) - - print('3/8 load sample') - y, sr = librosa.load(file, sr=None) - song_duration = librosa.get_duration(y=y, sr=sr) - - print('4/8 sample separation') - y_harmonic, y_percussive = librosa.effects.hpss(y) - - print('5/8 beat tracking') - tempo, beats = librosa.beat.beat_track(sr=sr, onset_envelope=librosa.onset.onset_strength(y=y_percussive, sr=sr), trim=False) - - print('6/8 feature extraction') - CQT_sync = get_intensity(y, sr, beats) - C_sync = get_pitch(y_harmonic, sr, beats) - M_sync = get_timbre(y, sr, beats) - volume, avg_volume, loudness = get_volume(file) - - print('7/8 feature aggregation') - intensity_frames = np.matrix(CQT_sync).getT() - pitch_frames = np.matrix(C_sync).getT() - timbre_frames = np.matrix(M_sync).getT() - - print('8/8 split stems') - stemsplit(file, 'htdemucs_6s') - - if extractMidi: - audiofilepaths = [] - stems = ['bass', 'drums', 'guitar', 'other', 'piano', 'vocals'] - for stem in stems: - path = os.path.join(os.getcwd(), 'separated', 'htdemucs_6s', file_id, stem +'.wav') - audiofilepaths.append(path) - output_dir = os.path.join(os.getcwd(), 'separated', 'htdemucs_6s', file_id) - extractMIDI(audiofilepaths, output_dir) - - audio_features = { - "id":file_id, 
- "tempo":tempo, - "duration":song_duration, - "timbre":np.mean(timbre_frames), - "timbre_frames":timbre_frames, - "pitch":np.mean(pitch_frames), - "pitch_frames":pitch_frames, - "intensity":np.mean(intensity_frames), - "intensity_frames":intensity_frames, - "volume": volume, - "avg_volume": avg_volume, - "loudness": loudness, - "beats":librosa.frames_to_time(beats, sr=sr), - "segments_boundaries":segments_boundaries, - "segments_labels":segments_labels, - "frequency_frames":frequency_frames, - "frequency":average_frequency, - "key":average_key - } - return audio_features - -################## SEARCH NEAREST AUDIO ################## - -previous_list = [] - -def get_nearest(query,videos,querybpm, searchforbpm): - global previous_list - # print("Search: query:", query.name, '- Incl. BPM in search:', searchforbpm) - nearest = {} - smallest = 1000000000 - smallestBPM = 1000000000 - smallestTimbre = 1000000000 - smallestIntensity = 1000000000 - for vid in videos: - if vid.id != query.id: - comp_bpm = abs(querybpm - vid.audio_features['tempo']) - comp_timbre = abs(query.audio_features["timbre"] - vid.audio_features['timbre']) - comp_intensity = abs(query.audio_features["intensity"] - vid.audio_features['intensity']) - #comp = abs(query.audio_features["pitch"] - vid.audio_features['pitch']) - comp = abs(query.audio_features["frequency"] - vid.audio_features['frequency']) - - if searchforbpm: - if vid.id not in previous_list and comp < smallest and comp_bpm < smallestBPM:# and comp_timbre < smallestTimbre: - smallest = comp - smallestBPM = comp_bpm - smallestTimbre = comp_timbre - nearest = vid - else: - if vid.id not in previous_list and comp < smallest: - smallest = comp - smallestBPM = comp_bpm - smallestTimbre = comp_timbre - nearest = vid - #print("--- result",i['file'],i['average_frequency'],i['average_key'],"diff",comp) - # print(nearest) - previous_list.append(nearest.id) - - if len(previous_list) >= len(videos)-1: - previous_list.pop(0) - # print("getNearestPitch: 
previous_list, pop first") - # print("get_nearest",nearest.id) - return nearest - - -################## MAIN ################## def main(): print("---------------------------------------------------------------------------- ") @@ -562,7 +28,7 @@ def main(): for directory in ("processed", "library", "separated", "separated/htdemucs_6s"): os.makedirs(directory, exist_ok=True) - + # Parse command line input parser = argparse.ArgumentParser(description='polymath') parser.add_argument('-a', '--add', help='youtube id', required=False) diff --git a/polymath/audio_features.py b/polymath/audio_features.py new file mode 100644 index 0000000..460d8a0 --- /dev/null +++ b/polymath/audio_features.py @@ -0,0 +1,165 @@ +import sys +from math import pow, log2 + +import crepe +import librosa +import numpy as np +from sf_segmenter import Segmenter + +neg80point8db = 0.00009120108393559096 +bit_depth = 16 +default_silence_threshold = (neg80point8db * (2 ** (bit_depth - 1))) * 4 + + +def root_mean_square(data): + return float(np.sqrt(np.mean(np.square(data)))) + + +def loudness_of(data): + return root_mean_square(data) + + +def normalized(list): + """Given an audio buffer, return it with the loudest value scaled to 1.0""" + return list.astype(np.float32) / float(np.amax(np.abs(list))) + + +def start_of(list, threshold=default_silence_threshold, samples_before=1): + if int(threshold) != threshold: + threshold = threshold * float(2 ** (bit_depth - 1)) + index = np.argmax(np.absolute(list) > threshold) + if index > (samples_before - 1): + return index - samples_before + else: + return 0 + + +def end_of(list, threshold=default_silence_threshold, samples_after=1): + if int(threshold) != threshold: + threshold = threshold * float(2 ** (bit_depth - 1)) + rev_index = np.argmax( + np.flipud(np.absolute(list)) > threshold + ) + if rev_index > (samples_after - 1): + return len(list) - (rev_index - samples_after) + else: + return len(list) + + +def trim_data( + data, + 
start_threshold=default_silence_threshold, + end_threshold=default_silence_threshold +): + start = start_of(data, start_threshold) + end = end_of(data, end_threshold) + + return data[start:end] + + +def load_and_trim(file): + y, rate = librosa.load(file, mono=True) + y = normalized(y) + trimmed = trim_data(y) + return trimmed, rate + + +def get_loudness(file): + loudness = -1 + try: + audio, rate = load_and_trim(file) + loudness = loudness_of(audio) + except Exception as e: + sys.stderr.write(f"Failed to run on {file}: {e}\n") + return loudness + + +def get_volume(file): + volume = -1 + avg_volume = -1 + try: + audio, rate = load_and_trim(file) + volume = librosa.feature.rms(y=audio)[0] + avg_volume = np.mean(volume) + loudness = loudness_of(audio) + except Exception as e: + sys.stderr.write(f"Failed to get Volume and Loudness on {file}: {e}\n") + return volume, avg_volume, loudness + + +def get_key(freq): + A4 = 440 + C0 = A4*pow(2, -4.75) + name = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"] + h = round(12*log2(freq/C0)) + octave = h // 12 + n = h % 12 + return name[n] + str(octave) + + +def get_average_pitch(pitch): + pitches = [] + confidences_thresh = 0.8 + i = 0 + while i < len(pitch): + if(pitch[i][2] > confidences_thresh): + pitches.append(pitch[i][1]) + i += 1 + if len(pitches) > 0: + average_frequency = np.array(pitches).mean() + average_key = get_key(average_frequency) + else: + average_frequency = 0 + average_key = "A0" + return average_frequency,average_key + + +def get_intensity(y, sr, beats): + # Beat-synchronous Loudness - Intensity + CQT = librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('A1')) + freqs = librosa.cqt_frequencies(CQT.shape[0], fmin=librosa.note_to_hz('A1')) + perceptual_CQT = librosa.perceptual_weighting(CQT**2, freqs, ref=np.max) + CQT_sync = librosa.util.sync(perceptual_CQT, beats, aggregate=np.median) + return CQT_sync + + +def get_pitch(y_harmonic, sr, beats): + # Chromagram + C = 
librosa.feature.chroma_cqt(y=y_harmonic, sr=sr) + # Beat-synchronous Chroma - Pitch + C_sync = librosa.util.sync(C, beats, aggregate=np.median) + return C_sync + + +def get_timbre(y, sr, beats): + # Mel spectogram + S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128) + log_S = librosa.power_to_db(S, ref=np.max) + # MFCC - Timbre + mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=13) + delta_mfcc = librosa.feature.delta(mfcc) + delta2_mfcc = librosa.feature.delta(mfcc, order=2) + M = np.vstack([mfcc, delta_mfcc, delta2_mfcc]) + # Beat-synchronous MFCC - Timbre + M_sync = librosa.util.sync(M, beats) + return M_sync + + +def get_segments(audio_file): + segmenter = Segmenter() + boundaries, labs = segmenter.proc_audio(audio_file) + return boundaries,labs + + +def get_pitch_dnn(audio_file): + # DNN Pitch Detection + pitch = [] + audio, sr = librosa.load(audio_file) + time, frequency, confidence, activation = crepe.predict(audio, sr, model_capacity="tiny", viterbi=True, center=True, step_size=10, verbose=1) # tiny|small|medium|large|full + i = 0 + while i < len(time): + pitch.append([time[i],frequency[i],confidence[i]]) + i += 1 + return pitch + + diff --git a/polymath/extract.py b/polymath/extract.py new file mode 100644 index 0000000..2f8006f --- /dev/null +++ b/polymath/extract.py @@ -0,0 +1,78 @@ +import os +import subprocess + +import librosa +import numpy as np + +from polymath.audio_features import get_segments, get_pitch_dnn, get_average_pitch, get_intensity, get_pitch, \ + get_timbre, get_volume +from polymath.midi import extractMIDI + + +def stemsplit(destination, demucsmodel): + subprocess.run(["demucs", destination, "-n", demucsmodel]) # '--mp3' + + +def get_audio_features(file,file_id,extractMidi = False): + print("------------------------------ get_audio_features:",file_id,"------------------------------") + print('1/8 segementation') + segments_boundaries,segments_labels = get_segments(file) + + print('2/8 pitch tracking') + frequency_frames = 
get_pitch_dnn(file) + average_frequency,average_key = get_average_pitch(frequency_frames) + + print('3/8 load sample') + y, sr = librosa.load(file, sr=None) + song_duration = librosa.get_duration(y=y, sr=sr) + + print('4/8 sample separation') + y_harmonic, y_percussive = librosa.effects.hpss(y) + + print('5/8 beat tracking') + tempo, beats = librosa.beat.beat_track(sr=sr, onset_envelope=librosa.onset.onset_strength(y=y_percussive, sr=sr), trim=False) + + print('6/8 feature extraction') + CQT_sync = get_intensity(y, sr, beats) + C_sync = get_pitch(y_harmonic, sr, beats) + M_sync = get_timbre(y, sr, beats) + volume, avg_volume, loudness = get_volume(file) + + print('7/8 feature aggregation') + intensity_frames = np.matrix(CQT_sync).getT() + pitch_frames = np.matrix(C_sync).getT() + timbre_frames = np.matrix(M_sync).getT() + + print('8/8 split stems') + stemsplit(file, 'htdemucs_6s') + + if extractMidi: + audiofilepaths = [] + stems = ['bass', 'drums', 'guitar', 'other', 'piano', 'vocals'] + for stem in stems: + path = os.path.join(os.getcwd(), 'separated', 'htdemucs_6s', file_id, stem +'.wav') + audiofilepaths.append(path) + output_dir = os.path.join(os.getcwd(), 'separated', 'htdemucs_6s', file_id) + extractMIDI(audiofilepaths, output_dir) + + audio_features = { + "id":file_id, + "tempo":tempo, + "duration":song_duration, + "timbre":np.mean(timbre_frames), + "timbre_frames":timbre_frames, + "pitch":np.mean(pitch_frames), + "pitch_frames":pitch_frames, + "intensity":np.mean(intensity_frames), + "intensity_frames":intensity_frames, + "volume": volume, + "avg_volume": avg_volume, + "loudness": loudness, + "beats":librosa.frames_to_time(beats, sr=sr), + "segments_boundaries":segments_boundaries, + "segments_labels":segments_labels, + "frequency_frames":frequency_frames, + "frequency":average_frequency, + "key":average_key + } + return audio_features diff --git a/polymath/ingest.py b/polymath/ingest.py new file mode 100644 index 0000000..f232f02 --- /dev/null +++ 
# ---- polymath/ingest.py ----
"""Ingest audio into the Polymath library (YouTube downloads and local files)."""
import fnmatch
import hashlib
import os
import shutil
import subprocess

import librosa
import numpy as np
import soundfile as sf

from yt_dlp import YoutubeDL

from polymath.library import write_library, Video


def audio_extract(vidobj,file):
    """Extract the audio track of *file* into ``vidobj.audio`` using ffmpeg.

    The output is stereo, 44100 Hz, and overwrites an existing file (``-y``).
    Returns the path of the written audio file (``vidobj.audio``).
    """
    print("audio_extract",file)
    # Pass an argument list instead of a shell string: paths containing
    # spaces or shell metacharacters are handed to ffmpeg verbatim.
    command = [
        "ffmpeg", "-hide_banner", "-loglevel", "panic",
        "-i", file,
        "-ab", "160k", "-ac", "2", "-ar", "44100", "-vn", "-y",
        vidobj.audio,
    ]
    returncode = subprocess.call(command)
    if returncode != 0:
        # Stay best-effort like the original (no exception), but say so.
        print("audio_extract: ffmpeg exited with code", returncode, "for", file)
    return vidobj.audio


def video_download(vidobj,url):
    """Download *url* with yt-dlp into ``library/`` and fill in *vidobj*.

    Sets ``id``, ``name``, ``video`` (downloaded file path) and ``url`` on
    *vidobj* and returns it. If *url* resolves to a playlist, only its first
    entry is used.
    """
    print("video_download",url)
    ydl_opts = {
        # Include the extension so the file on disk always matches the
        # ``filename`` computed below, also for non-merged downloads.
        'outtmpl': 'library/%(id)s.%(ext)s',
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio/best',
        # The original had the CLI flag "--merge-output-format mp4" pasted
        # into the format selector; this is the equivalent API option.
        'merge_output_format': 'mp4',
    }
    # A single extract_info(download=True) both downloads and returns the
    # metadata; the previous extra ydl.download() fetched everything twice.
    with YoutubeDL(ydl_opts) as ydl:
        result = ydl.extract_info(url, download=True)

    if 'entries' in result:
        video = result['entries'][0]  # Can be a playlist or a list of videos
    else:
        video = result  # Just a video

    filename = f"library/{video['id']}.{video['ext']}"
    print("video_download: filename",filename,"extension",video['ext'])
    vidobj.id = video['id']
    vidobj.name = video['title']
    vidobj.video = filename
    vidobj.url = url
    return vidobj


def video_process(vids,videos):
    """Download and register every YouTube id in *vids* not yet in *videos*.

    *videos* is extended in place, persisted after every new entry, and
    returned.
    """
    for vid in vids:
        print('------ process video',vid)
        # check if id already in db
        if any(video.id == vid for video in videos):
            print("already in db",vid)
            continue

        # analyse videos and save to disk
        video = Video(vid,vid,f"library/{vid}.wav")
        video = video_download(video,f"https://www.youtube.com/watch?v={vid}")
        audio_extract(video,video.video)
        videos.append(video)
        print("NAME",video.name,"VIDEO",video.video,"AUDIO",video.audio)
        write_library(videos)
        print("video_process DONE",len(videos))
    return videos


def audio_directory_process(vids, videos):
    """Ingest every mp3/wav file found directly inside the directories *vids*.

    Returns the (possibly extended) *videos* list.
    """
    patterns = ("*.mp3", "*.wav")  # the original only matched "*.mp3"
    filesToProcess = []
    for path in vids:
        # sorted() makes the ingest order deterministic across platforms
        for filename in sorted(os.listdir(path)):
            if not any(fnmatch.fnmatch(filename, pattern) for pattern in patterns):
                continue
            filepath = os.path.join(path, filename)
            print(filepath)
            if os.path.isfile(filepath):
                filesToProcess.append(filepath)

    print('Found', len(filesToProcess), 'wav or mp3 files')
    if filesToProcess:
        videos = audio_process(filesToProcess, videos)
    return videos


def _import_to_library(source, target, allow_copy):
    """Store *source* as a 44100 Hz wav at *target* and return *target*.

    When *allow_copy* is true and the source already is 44100 Hz the file is
    copied unchanged; otherwise it is decoded, resampled if necessary, and
    re-written.
    """
    os.makedirs(os.path.dirname(target), exist_ok=True)
    y, sr = librosa.load(path=source, sr=None, mono=False)
    if allow_copy and sr == 44100:
        shutil.copy2(source, target)
        return target
    if sr != 44100:
        print('converting audio file to 44100:', source)
        y = librosa.resample(y, orig_sr=sr, target_sr=44100)
    # librosa yields (channels, frames); soundfile expects (frames, channels).
    # The transpose is a no-op for mono. The original flattened stereo with
    # np.ravel (channels concatenated) or wrote the untransposed array.
    sf.write(target, y.T, 44100)
    return target


def audio_process(vids, videos):
    """Register the local audio files *vids* in the library.

    mp3 and wav files are converted/copied into ``<cwd>/library`` as 44100 Hz
    wav; any other file is registered at its original path. Files whose id
    (file name stem + SHA-256 of the given path) is already present are
    skipped without being converted again. *videos* is extended in place,
    persisted after every new entry, and returned.
    """
    library_dir = os.path.join(os.getcwd(), 'library')
    for vid in vids:
        print('------ process audio',vid)
        # extract file name; splitext copes with zero or several dots,
        # where unpacking split(".") raised ValueError
        audioname = os.path.splitext(os.path.basename(vid))[0]

        # generate a unique ID based on file path and name
        audioid = f"{audioname}_{hashlib.sha256(vid.encode()).hexdigest()}"

        # check if id already in db (before doing any expensive conversion)
        if any(video.id == audioid for video in videos):
            print("already in db",vid)
            continue

        extension = os.path.splitext(vid)[1].lower()
        target = os.path.join(library_dir, audioid+'.wav')
        if extension == ".mp3":
            # convert mp3 to wav and save it
            print('converting mp3 to wav:', vid)
            vid = _import_to_library(vid, target, allow_copy=False)
        elif extension == ".wav":
            # copy (or resample) the wav into the local library folder
            vid = _import_to_library(vid, target, allow_copy=True)

        # analyse videos and save to disk
        video = Video(audioname,'',vid)
        video.id = audioid
        video.url = vid
        videos.append(video)
        write_library(videos)
        print("Finished procesing files:",len(videos))

    return videos
# ---- polymath/library.py ----
import pickle

LIBRARY_FILENAME = "library/database.p"


def write_library(videos):
    """Pickle *videos* (the whole library list) to ``LIBRARY_FILENAME``."""
    with open(LIBRARY_FILENAME, "wb") as lib:
        pickle.dump(videos, lib)


def read_library():
    """Return the pickled library list, or ``[]`` if no database exists yet.

    Only a missing file is treated as "empty library". Other failures (for
    example a corrupt database) propagate instead of being reported as
    "not found" and later overwritten by ``write_library``.
    """
    try:
        lib = open(LIBRARY_FILENAME, "rb")
    except FileNotFoundError:
        print("No Database file found:", LIBRARY_FILENAME)
        return []
    with lib:
        # NOTE(review): pickle.load can execute arbitrary code; the database
        # file must only ever come from a trusted source.
        return pickle.load(lib)


class Video:
    """One library entry: a media file plus its extracted features."""

    def __init__(self,name,video,audio):
        self.id = ""    # filled in by the ingest code after construction
        self.url = ""   # filled in by the ingest code after construction
        self.name = name
        self.video = video
        self.audio = audio
        self.video_features = []
        # Starts as an empty list; get_nearest and quantizeAudio index it by
        # string key, so it is presumably replaced by a dict before use.
        self.audio_features = []


# ---- polymath/midi.py ----
from basic_pitch.inference import predict_and_save


def extractMIDI(audio_paths, output_dir):
    """Run basic-pitch on *audio_paths* and write MIDI files to *output_dir*.

    Only MIDI output is requested: no sonification, raw model outputs or
    note lists.
    """
    print('- Extract Midi')
    predict_and_save(
        audio_path_list=audio_paths,
        output_directory=output_dir,
        save_midi=True,
        sonify_midi=False,
        save_model_outputs=False,
        save_notes=False,
    )


# ---- polymath/nearest.py ----
# ids returned by recent get_nearest calls; they are excluded from later calls
previous_list = []


def get_nearest(query,videos,querybpm, searchforbpm):
    """Return the entry of *videos* closest in average frequency to *query*.

    *query* itself and ids in the module-level ``previous_list`` are never
    returned. With *searchforbpm* true, a candidate only replaces the current
    best when its tempo distance to *querybpm* is smaller as well. The
    returned id is appended to ``previous_list``, whose oldest entry is
    dropped once it holds ``len(videos) - 1`` ids.

    Raises:
        ValueError: if no eligible candidate exists (for example the library
            holds only *query*). The original failed here with an opaque
            AttributeError on an empty dict.
    """
    nearest = None
    smallest = float("inf")
    smallestBPM = float("inf")
    query_frequency = query.audio_features["frequency"]
    for vid in videos:
        if vid.id == query.id or vid.id in previous_list:
            continue
        comp_bpm = abs(querybpm - vid.audio_features['tempo'])
        comp = abs(query_frequency - vid.audio_features['frequency'])
        if comp < smallest and (not searchforbpm or comp_bpm < smallestBPM):
            smallest = comp
            smallestBPM = comp_bpm
            nearest = vid

    if nearest is None:
        raise ValueError(f"no similar entry available for {query.id!r}")

    previous_list.append(nearest.id)
    if len(previous_list) >= len(videos)-1:
        previous_list.pop(0)
    return nearest


# ---- polymath/quantize.py ----
import os

import librosa
import pyrubberband as pyrb
import soundfile as sf

from polymath.midi import extractMIDI


def quantizeAudio(vid, bpm=120, keepOriginalBpm = False, pitchShiftFirst = False, extractMidi = False):
    """Time-stretch *vid* and its six stems onto a fixed beat grid at *bpm*.

    Beats are tracked on the percussive component, every detected beat is
    mapped to an evenly spaced grid position, and the audio is warped with
    rubberband. Results are written to ``<cwd>/processed``; stems are read
    from ``<cwd>/separated/htdemucs_6s/<vid.id>``.

    Args:
        vid: library entry; reads ``id``, ``name``, ``audio`` and the
            ``audio_features`` keys tempo, frequency, key and timbre.
        bpm: target tempo, ignored when *keepOriginalBpm* is true.
        keepOriginalBpm: quantize to the entry's own analysed tempo.
        pitchShiftFirst: experimental resampling step (work in progress).
        extractMidi: also convert every written file to MIDI.
    """
    print("Quantize Audio: Target BPM", bpm,
        "-- id:",vid.id,
        "bpm:",round(vid.audio_features["tempo"],2),
        "frequency:",round(vid.audio_features['frequency'],2),
        "key:",vid.audio_features['key'],
        "timbre:",round(vid.audio_features['timbre'],2),
        "name:",vid.name,
        'keepOriginalBpm:', keepOriginalBpm
        )

    # load audio file
    y, sr = librosa.load(vid.audio, sr=None)

    # Keep Original Song BPM
    if keepOriginalBpm:
        bpm = float(vid.audio_features['tempo'])
        print('Keep original audio file BPM:', vid.audio_features['tempo'])
    # Pitch Shift audio file to desired BPM first
    elif pitchShiftFirst: # WORK IN PROGRESS
        print('Pitch Shifting audio to desired BPM', bpm)
        # Desired tempo in bpm
        original_tempo = vid.audio_features['tempo']
        speed_factor = bpm / original_tempo
        # Resample the audio to adjust the sample rate accordingly
        sr_stretched = int(sr / speed_factor)
        y = librosa.resample(y=y, orig_sr=sr, target_sr=sr_stretched) #, res_type='linear'
        # NOTE(review): this changes len(y), so the time map built below no
        # longer ends on the stems' last sample - confirm before enabling.
        y = librosa.resample(y, orig_sr=sr, target_sr=44100)

    # extract beat
    y_harmonic, y_percussive = librosa.effects.hpss(y)
    tempo, beats = librosa.beat.beat_track(sr=sr, onset_envelope=librosa.onset.onset_strength(y=y_percussive, sr=sr), trim=False)
    beat_samples = librosa.frames_to_samples(beats)
    if len(beat_samples) == 0:
        # The original hit a NameError further down in this case.
        print('- Quantize Audio: no beats detected, skipping', vid.id)
        return

    # generate metronome: one grid position every 60/bpm seconds, converted
    # at the real sample rate. The original used 120/bpm together with
    # time_to_samples' default sr=22050, which is the same grid only when
    # sr == 44100.
    fixed_beat_times = [index * 60 / bpm for index in range(len(beat_samples))]
    fixed_beat_samples = librosa.time_to_samples(fixed_beat_times, sr=sr)

    # construct time map: (source sample, target sample) per beat
    time_map = list(zip(beat_samples, fixed_beat_samples))

    # add ending to time map; the last source position must be len(y)
    # (the original's len(y+1) is the same number, via a needless array copy)
    original_length = len(y)
    last_source, last_target = time_map[-1]
    orig_end_diff = original_length - last_source
    new_ending = int(round(last_target + orig_end_diff * (tempo / bpm)))
    time_map.append((original_length, new_ending))

    # time strech audio
    print('- Quantize Audio: source')
    strechedaudio = pyrb.timemap_stretch(y, sr, time_map)

    path_suffix = (
        f"Key {vid.audio_features['key']} - "
        f"Freq {round(vid.audio_features['frequency'], 2)} - "
        f"Timbre {round(vid.audio_features['timbre'], 2)} - "
        f"BPM Original {int(vid.audio_features['tempo'])} - "
        f"BPM {bpm}"
    )
    path_prefix = (
        f"{vid.id} - {vid.name}"
    )

    audiofilepaths = []
    # save audio to disk
    path = os.path.join(os.getcwd(), 'processed', path_prefix + " - " + path_suffix +'.wav')
    sf.write(path, strechedaudio, sr)
    audiofilepaths.append(path)

    # process stems with the time map computed from the full mix
    stems = ['bass', 'drums', 'guitar', 'other', 'piano', 'vocals']
    for stem in stems:
        path = os.path.join(os.getcwd(), 'separated', 'htdemucs_6s', vid.id, stem +'.wav')
        print(f"- Quantize Audio: {stem}")
        y, sr = librosa.load(path, sr=None)
        strechedaudio = pyrb.timemap_stretch(y, sr, time_map)
        # save stems to disk
        path = os.path.join(os.getcwd(), 'processed', path_prefix + " - Stem " + stem + " - " + path_suffix +'.wav')
        sf.write(path, strechedaudio, sr)
        audiofilepaths.append(path)

    # metronome click (optional, disabled)
    click = False
    if click:
        clicks_audio = librosa.clicks(times=fixed_beat_times, sr=sr)
        print(len(clicks_audio), len(strechedaudio))
        clicks_audio = clicks_audio[:len(strechedaudio)]
        path = os.path.join(os.getcwd(), 'processed', vid.id + '- click.wav')
        sf.write(path, clicks_audio, sr)

    if extractMidi:
        output_dir = os.path.join(os.getcwd(), 'processed')
        extractMIDI(audiofilepaths, output_dir)
basic_pitch_model - basic_pitch_model = tf.saved_model.load(str(ICASSP_2022_MODEL_PATH)) - # Tempo tempo = int(args.tempo or 120)