diff --git a/dataset_preprocess.ipynb b/dataset_preprocess.ipynb index 555564c..fa7bca1 100644 --- a/dataset_preprocess.ipynb +++ b/dataset_preprocess.ipynb @@ -2,6 +2,19 @@ "cells": [ { "cell_type": "code", + "execution_count": null, + "id": "02c7e690-19f6-4393-a1e0-bfc1eb5632f9", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -U joblib\n", + "!pip install -U ipywidgets\n", + "!pip install -U tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "5d0fd231-9b57-4bc3-b08f-3a8dce0287ee", "metadata": { "ExecuteTime": { @@ -9,31 +22,43 @@ "start_time": "2024-09-26T03:56:06.674763Z" } }, + "outputs": [], "source": [ "from midi_tokenizer import MIDITokenizer\n", + "\n", "import MIDI \n", "# Note: The original MIDI.py has poor performance because list.pop(0) has poor performance on some pythons or machines. I changed it to list = list[1:]\n", + "\n", "import glob\n", "import re\n", - "from tqdm.notebook import tqdm\n", + "from tqdm import tqdm\n", "from pathlib import Path\n", "import shutil\n", - "import random" - ], - "outputs": [], - "execution_count": 2 + "import random\n", + "\n", + "from joblib import Parallel, delayed, parallel_config" + ] }, { + "cell_type": "code", + "execution_count": null, + "id": "4954c58615b62ea4", "metadata": { "ExecuteTime": { "end_time": "2024-09-26T03:56:07.269183Z", "start_time": "2024-09-26T03:56:07.249515Z" } }, - "cell_type": "code", + "outputs": [], "source": [ "tokenizer = MIDITokenizer()\n", "\n", + "#======================================================================================\n", + "\n", + "move_files = False # Move processed and bad MIDIs or only copy them\n", + "\n", + "#======================================================================================\n", + "\n", "def process_midi_file(midi_file):\n", " try:\n", " with open(midi_file, 'rb') as f:\n", @@ -61,42 +86,45 @@ " path = midi_file.replace(dataset_dir, f\"{processed_dir}/\")\n", " path = Path(path)\n", " 
path.parent.mkdir(parents=True, exist_ok=True)\n", - " shutil.move(midi_file, path)\n", + "\n", + " if move_files:\n", + " shutil.move(midi_file, path)\n", + "\n", + " else:\n", + " shutil.copy2(midi_file, path)\n", " else:\n", " res = \"_\".join(res)\n", " path = midi_file.replace(dataset_dir, f\"{rm_dir}/{res}/\")\n", " path = Path(path)\n", " path.parent.mkdir(parents=True, exist_ok=True)\n", - " shutil.move(midi_file, path)\n", - "\n", + " \n", + " if move_files:\n", + " shutil.move(midi_file, path)\n", "\n", - "def process_all(midi_files):\n", - " for midi_file in tqdm(midi_files):\n", - " process_midi_file(midi_file)" - ], - "id": "4954c58615b62ea4", - "outputs": [], - "execution_count": 3 + " else:\n", + " shutil.copy2(midi_file, path)" + ] }, { + "cell_type": "code", + "execution_count": null, + "id": "70af1fc8172b064b", "metadata": { "ExecuteTime": { "end_time": "2024-09-26T03:56:14.151436Z", "start_time": "2024-09-26T03:56:14.144923Z" } }, - "cell_type": "code", - "source": [ - "dataset_dir = r\"D:\\myprojects\\dataset\\midi_datasets\"\n", - "processed_dir = r\"D:\\myprojects\\dataset\\midi_datasets_processed\" # All processed midi will be moved to here. The folder will be created automatically.\n", - "rm_dir = r\"D:\\myprojects\\dataset\\midi_datasets_rm\" # All bad midi will be moved here. The folder will be created automatically." - ], - "id": "70af1fc8172b064b", "outputs": [], - "execution_count": 4 + "source": [ + "dataset_dir = r\"/home/ubuntu/SOURCE/MIDIs\" # Source MIDI dataset directory\n", + "processed_dir = r\"/home/ubuntu/OUTPUT/processed_midis\" # All processed MIDIs are copied here (moved instead if move_files is True). The folder will be created automatically.\n", + "rm_dir = r\"/home/ubuntu/OUTPUT/bad_midis\" # All bad MIDIs are copied here (moved instead if move_files is True). The folder will be created automatically." 
+ ] }, { "cell_type": "code", + "execution_count": null, "id": "ffc52c85-0a0a-46b0-bbb9-d8dcec4f2a80", "metadata": { "ExecuteTime": { @@ -104,16 +132,16 @@ "start_time": "2024-09-26T03:56:15.029956Z" } }, + "outputs": [], "source": [ "midi_files = glob.glob(f\"{dataset_dir}/**/*\", recursive=True)\n", "midi_files = [file for file in midi_files if re.search(r'\\.midi?$', file, re.IGNORECASE)]\n", "midi_files = sorted(midi_files)" - ], - "outputs": [], - "execution_count": 5 + ] }, { "cell_type": "code", + "execution_count": null, "id": "4a478b13-ef64-4145-aa17-b50779c9efc0", "metadata": { "ExecuteTime": { @@ -121,83 +149,75 @@ "start_time": "2024-09-26T03:56:47.922667Z" } }, + "outputs": [], "source": [ "# check if dataset_dir is correct\n", "print(len(midi_files))\n", "print(random.choice(midi_files))" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "203144\n", - "D:\\myprojects\\dataset\\midi_datasets\\los_midi\\MIDIs\\1\\127e2700fa41458e5cbfde61066eeb33.mid\n" - ] - } - ], - "execution_count": 6 + ] }, { + "cell_type": "code", + "execution_count": null, + "id": "d294f226332745c9", "metadata": { "ExecuteTime": { "end_time": "2024-09-26T03:57:07.239075500Z", "start_time": "2024-09-26T03:56:57.902649Z" } }, - "cell_type": "code", + "outputs": [], "source": [ - "# start processing.\n", - "# If you cancel the run, you will need to reload midi_files before you can run it again\n", - "process_all(midi_files)" - ], - "id": "d294f226332745c9", - "outputs": [ - { - "data": { - "text/plain": [ - " 0%| | 0/203144 [00:00 3\u001B[0m \u001B[43mprocess_all\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmidi_files\u001B[49m\u001B[43m)\u001B[49m\n", - "Cell \u001B[1;32mIn[3], line 41\u001B[0m, in \u001B[0;36mprocess_all\u001B[1;34m(midi_files)\u001B[0m\n\u001B[0;32m 39\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mprocess_all\u001B[39m(midi_files):\n\u001B[0;32m 40\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m midi_file 
\u001B[38;5;129;01min\u001B[39;00m tqdm(midi_files):\n\u001B[1;32m---> 41\u001B[0m \u001B[43mprocess_midi_file\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmidi_file\u001B[49m\u001B[43m)\u001B[49m\n", - "Cell \u001B[1;32mIn[3], line 17\u001B[0m, in \u001B[0;36mprocess_midi_file\u001B[1;34m(midi_file)\u001B[0m\n\u001B[0;32m 15\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m res:\n\u001B[0;32m 16\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m---> 17\u001B[0m mid \u001B[38;5;241m=\u001B[39m \u001B[43mMIDI\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmidi2score\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdatas\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 18\u001B[0m mid \u001B[38;5;241m=\u001B[39m tokenizer\u001B[38;5;241m.\u001B[39mtokenize(mid)\n\u001B[0;32m 19\u001B[0m quality, res \u001B[38;5;241m=\u001B[39m tokenizer\u001B[38;5;241m.\u001B[39mcheck_quality(mid)\n", - "File \u001B[1;32mD:\\myprojects\\python\\universal-midi-model\\MIDI.py:402\u001B[0m, in \u001B[0;36mmidi2score\u001B[1;34m(midi)\u001B[0m\n\u001B[0;32m 398\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mmidi2score\u001B[39m(midi\u001B[38;5;241m=\u001B[39m\u001B[38;5;124mb\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m'\u001B[39m):\n\u001B[0;32m 399\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124mr\u001B[39m\u001B[38;5;124;03m'''\u001B[39;00m\n\u001B[0;32m 400\u001B[0m \u001B[38;5;124;03mTranslates MIDI into a \"score\", using midi2opus() then opus2score()\u001B[39;00m\n\u001B[0;32m 401\u001B[0m \u001B[38;5;124;03m'''\u001B[39;00m\n\u001B[1;32m--> 402\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mopus2score\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmidi2opus\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmidi\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\n", - "File \u001B[1;32mD:\\myprojects\\python\\universal-midi-model\\MIDI.py:353\u001B[0m, in \u001B[0;36mopus2score\u001B[1;34m(opus)\u001B[0m\n\u001B[0;32m 351\u001B[0m 
_clean_up_warnings()\n\u001B[0;32m 352\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m [\u001B[38;5;241m1000\u001B[39m, [], ]\n\u001B[1;32m--> 353\u001B[0m tracks \u001B[38;5;241m=\u001B[39m \u001B[43mcopy\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mdeepcopy\u001B[49m\u001B[43m(\u001B[49m\u001B[43mopus\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;66;03m# couple of slices probably quicker...\u001B[39;00m\n\u001B[0;32m 354\u001B[0m ticks \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mint\u001B[39m(tracks\u001B[38;5;241m.\u001B[39mpop(\u001B[38;5;241m0\u001B[39m))\n\u001B[0;32m 355\u001B[0m score \u001B[38;5;241m=\u001B[39m [ticks, ]\n", - "File \u001B[1;32m~\\AppData\\Local\\Programs\\Python\\Python310\\lib\\copy.py:146\u001B[0m, in \u001B[0;36mdeepcopy\u001B[1;34m(x, memo, _nil)\u001B[0m\n\u001B[0;32m 144\u001B[0m copier \u001B[38;5;241m=\u001B[39m _deepcopy_dispatch\u001B[38;5;241m.\u001B[39mget(\u001B[38;5;28mcls\u001B[39m)\n\u001B[0;32m 145\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m copier \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m--> 146\u001B[0m y \u001B[38;5;241m=\u001B[39m \u001B[43mcopier\u001B[49m\u001B[43m(\u001B[49m\u001B[43mx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmemo\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 147\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 148\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28missubclass\u001B[39m(\u001B[38;5;28mcls\u001B[39m, \u001B[38;5;28mtype\u001B[39m):\n", - "File \u001B[1;32m~\\AppData\\Local\\Programs\\Python\\Python310\\lib\\copy.py:206\u001B[0m, in \u001B[0;36m_deepcopy_list\u001B[1;34m(x, memo, deepcopy)\u001B[0m\n\u001B[0;32m 204\u001B[0m append \u001B[38;5;241m=\u001B[39m y\u001B[38;5;241m.\u001B[39mappend\n\u001B[0;32m 205\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m a \u001B[38;5;129;01min\u001B[39;00m x:\n\u001B[1;32m--> 206\u001B[0m 
append(\u001B[43mdeepcopy\u001B[49m\u001B[43m(\u001B[49m\u001B[43ma\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmemo\u001B[49m\u001B[43m)\u001B[49m)\n\u001B[0;32m 207\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m y\n", - "File \u001B[1;32m~\\AppData\\Local\\Programs\\Python\\Python310\\lib\\copy.py:146\u001B[0m, in \u001B[0;36mdeepcopy\u001B[1;34m(x, memo, _nil)\u001B[0m\n\u001B[0;32m 144\u001B[0m copier \u001B[38;5;241m=\u001B[39m _deepcopy_dispatch\u001B[38;5;241m.\u001B[39mget(\u001B[38;5;28mcls\u001B[39m)\n\u001B[0;32m 145\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m copier \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m--> 146\u001B[0m y \u001B[38;5;241m=\u001B[39m \u001B[43mcopier\u001B[49m\u001B[43m(\u001B[49m\u001B[43mx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmemo\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 147\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 148\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28missubclass\u001B[39m(\u001B[38;5;28mcls\u001B[39m, \u001B[38;5;28mtype\u001B[39m):\n", - "File \u001B[1;32m~\\AppData\\Local\\Programs\\Python\\Python310\\lib\\copy.py:206\u001B[0m, in \u001B[0;36m_deepcopy_list\u001B[1;34m(x, memo, deepcopy)\u001B[0m\n\u001B[0;32m 204\u001B[0m append \u001B[38;5;241m=\u001B[39m y\u001B[38;5;241m.\u001B[39mappend\n\u001B[0;32m 205\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m a \u001B[38;5;129;01min\u001B[39;00m x:\n\u001B[1;32m--> 206\u001B[0m append(\u001B[43mdeepcopy\u001B[49m\u001B[43m(\u001B[49m\u001B[43ma\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmemo\u001B[49m\u001B[43m)\u001B[49m)\n\u001B[0;32m 207\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m y\n", - "File \u001B[1;32m~\\AppData\\Local\\Programs\\Python\\Python310\\lib\\copy.py:177\u001B[0m, in \u001B[0;36mdeepcopy\u001B[1;34m(x, memo, _nil)\u001B[0m\n\u001B[0;32m 175\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m 
y \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m x:\n\u001B[0;32m 176\u001B[0m memo[d] \u001B[38;5;241m=\u001B[39m y\n\u001B[1;32m--> 177\u001B[0m \u001B[43m_keep_alive\u001B[49m\u001B[43m(\u001B[49m\u001B[43mx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmemo\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;66;03m# Make sure x lives at least as long as d\u001B[39;00m\n\u001B[0;32m 178\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m y\n", - "File \u001B[1;32m~\\AppData\\Local\\Programs\\Python\\Python310\\lib\\copy.py:254\u001B[0m, in \u001B[0;36m_keep_alive\u001B[1;34m(x, memo)\u001B[0m\n\u001B[0;32m 244\u001B[0m \u001B[38;5;250m\u001B[39m\u001B[38;5;124;03m\"\"\"Keeps a reference to the object x in the memo.\u001B[39;00m\n\u001B[0;32m 245\u001B[0m \n\u001B[0;32m 246\u001B[0m \u001B[38;5;124;03mBecause we remember objects by their id, we have\u001B[39;00m\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 251\u001B[0m \u001B[38;5;124;03mthe memo itself...\u001B[39;00m\n\u001B[0;32m 252\u001B[0m \u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[0;32m 253\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m--> 254\u001B[0m memo[\u001B[38;5;28;43mid\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mmemo\u001B[49m\u001B[43m)\u001B[49m]\u001B[38;5;241m.\u001B[39mappend(x)\n\u001B[0;32m 255\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m:\n\u001B[0;32m 256\u001B[0m \u001B[38;5;66;03m# aha, this is the first one :-)\u001B[39;00m\n\u001B[0;32m 257\u001B[0m memo[\u001B[38;5;28mid\u001B[39m(memo)]\u001B[38;5;241m=\u001B[39m[x]\n", - "\u001B[1;31mKeyboardInterrupt\u001B[0m: " - ] - } - ], - "execution_count": 7 + "# start processing...\n", + "\n", + "NUMBER_OF_PARALLEL_JOBS = 128 # Number of parallel jobs\n", + "NUMBER_OF_FILES_PER_ITERATION = 256 # Number of files to queue for each parallel iteration\n", + "\n", + "print('=' * 70)\n", + "print('Processing MIDI files. 
Please wait...')\n", + "print('=' * 70)\n", + "\n", + "for i in tqdm(range(0, len(midi_files), NUMBER_OF_FILES_PER_ITERATION)):\n", + "\n", + "    with parallel_config(n_jobs=NUMBER_OF_PARALLEL_JOBS, verbose = 0):\n", + "\n", + "        Parallel(backend='loky', n_jobs=NUMBER_OF_PARALLEL_JOBS, verbose=0)(delayed(process_midi_file)(f) for f in midi_files[i:i+NUMBER_OF_FILES_PER_ITERATION])\n", + "\n", + "print('=' * 70)\n", + "print('Done!')\n", + "print('=' * 70)" + ] }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, - "source": "", - "id": "d15aaa7b7a4d41b1" + "id": "d15aaa7b7a4d41b1", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate the number of processed and bad MIDIs\n", + "\n", + "import os  # required by os.walk / os.path.join below\n", + "\n", + "print('=' * 70)\n", + "print('Scanning processed MIDIs dir...')\n", + "\n", + "processed_midis = []\n", + "for (dirpath, dirnames, filenames) in os.walk(processed_dir):\n", + "    processed_midis += [os.path.join(dirpath, file) for file in filenames]\n", + "\n", + "print('=' * 70)\n", + "print('Scanning bad MIDIs dir...')\n", + "\n", + "bad_midis = []\n", + "for (dirpath, dirnames, filenames) in os.walk(rm_dir):\n", + "    bad_midis += [os.path.join(dirpath, file) for file in filenames]\n", + "\n", + "print('=' * 70)\n", + "print('Number of good MIDIs:', len(processed_midis))\n", + "print('Number of bad MIDIs:', len(bad_midis))\n", + "print('=' * 70)" + ] } ], "metadata": { @@ -216,7 +236,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.12.7" } }, "nbformat": 4,