Skip to content

Commit

Permalink
Merge pull request #32 from asigalov61/main
Browse files Browse the repository at this point in the history
Added joblib for much faster pre-processing
  • Loading branch information
SkyTNT authored Dec 30, 2024
2 parents 17d1774 + 7e7461b commit 52881db
Showing 1 changed file with 108 additions and 88 deletions.
196 changes: 108 additions & 88 deletions dataset_preprocess.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,38 +2,63 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "02c7e690-19f6-4393-a1e0-bfc1eb5632f9",
"metadata": {},
"outputs": [],
"source": [
"!pip install -U joblib\n",
"!pip install -U ipywidgets\n",
"!pip install -U tqdm"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d0fd231-9b57-4bc3-b08f-3a8dce0287ee",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-26T03:56:06.687771Z",
"start_time": "2024-09-26T03:56:06.674763Z"
}
},
"outputs": [],
"source": [
"from midi_tokenizer import MIDITokenizer\n",
"\n",
"import MIDI \n",
"# Note: The original MIDI.py has poor performance because list.pop(0) has poor performance on some pythons or machines. I changed it to list = list[1:]\n",
"\n",
"import glob\n",
"import re\n",
"from tqdm.notebook import tqdm\n",
"from tqdm import tqdm\n",
"from pathlib import Path\n",
"import shutil\n",
"import random"
],
"outputs": [],
"execution_count": 2
"import random\n",
"\n",
"from joblib import Parallel, delayed, parallel_config"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4954c58615b62ea4",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-26T03:56:07.269183Z",
"start_time": "2024-09-26T03:56:07.249515Z"
}
},
"cell_type": "code",
"outputs": [],
"source": [
"tokenizer = MIDITokenizer()\n",
"\n",
"#======================================================================================\n",
"\n",
"move_files = False # Move processed and bad MIDIs or only copy them\n",
"\n",
"#======================================================================================\n",
"\n",
"def process_midi_file(midi_file):\n",
" try:\n",
" with open(midi_file, 'rb') as f:\n",
Expand Down Expand Up @@ -61,143 +86,138 @@
" path = midi_file.replace(dataset_dir, f\"{processed_dir}/\")\n",
" path = Path(path)\n",
" path.parent.mkdir(parents=True, exist_ok=True)\n",
" shutil.move(midi_file, path)\n",
"\n",
" if move_files:\n",
" shutil.move(midi_file, path)\n",
"\n",
" else:\n",
" shutil.copy2(midi_file, path)\n",
" else:\n",
" res = \"_\".join(res)\n",
" path = midi_file.replace(dataset_dir, f\"{rm_dir}/{res}/\")\n",
" path = Path(path)\n",
" path.parent.mkdir(parents=True, exist_ok=True)\n",
" shutil.move(midi_file, path)\n",
"\n",
" \n",
" if move_files:\n",
" shutil.move(midi_file, path)\n",
"\n",
"def process_all(midi_files):\n",
" for midi_file in tqdm(midi_files):\n",
" process_midi_file(midi_file)"
],
"id": "4954c58615b62ea4",
"outputs": [],
"execution_count": 3
" else:\n",
" shutil.copy2(midi_file, path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70af1fc8172b064b",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-26T03:56:14.151436Z",
"start_time": "2024-09-26T03:56:14.144923Z"
}
},
"cell_type": "code",
"source": [
"dataset_dir = r\"D:\\myprojects\\dataset\\midi_datasets\"\n",
"processed_dir = r\"D:\\myprojects\\dataset\\midi_datasets_processed\" # All processed midi will be moved to here. The folder will be created automatically.\n",
"rm_dir = r\"D:\\myprojects\\dataset\\midi_datasets_rm\" # All bad midi will be moved here. The folder will be created automatically."
],
"id": "70af1fc8172b064b",
"outputs": [],
"execution_count": 4
"source": [
"dataset_dir = r\"/home/ubuntu/SOURCE/MIDIs\" # Source MIDI dataset directory\n",
"processed_dir = r\"/home/ubuntu/OUTPUT/processed_midis\" # All processed midi will be moved to here. The folder will be created automatically.\n",
"rm_dir = r\"/home/ubuntu/OUTPUT/bad_midis\" # badAll bad midi will be moved here. The folder will be created automatically."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ffc52c85-0a0a-46b0-bbb9-d8dcec4f2a80",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-26T03:56:17.065171Z",
"start_time": "2024-09-26T03:56:15.029956Z"
}
},
"outputs": [],
"source": [
"midi_files = glob.glob(f\"{dataset_dir}/**/*\", recursive=True)\n",
"midi_files = [file for file in midi_files if re.search(r'\\.midi?$', file, re.IGNORECASE)]\n",
"midi_files = sorted(midi_files)"
],
"outputs": [],
"execution_count": 5
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a478b13-ef64-4145-aa17-b50779c9efc0",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-26T03:56:47.931850Z",
"start_time": "2024-09-26T03:56:47.922667Z"
}
},
"outputs": [],
"source": [
"# check if dataset_dir is correct\n",
"print(len(midi_files))\n",
"print(random.choice(midi_files))"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"203144\n",
"D:\\myprojects\\dataset\\midi_datasets\\los_midi\\MIDIs\\1\\127e2700fa41458e5cbfde61066eeb33.mid\n"
]
}
],
"execution_count": 6
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d294f226332745c9",
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-26T03:57:07.239075500Z",
"start_time": "2024-09-26T03:56:57.902649Z"
}
},
"cell_type": "code",
"outputs": [],
"source": [
"# start processing.\n",
"# If you cancel the run, you will need to reload midi_files before you can run it again\n",
"process_all(midi_files)"
],
"id": "d294f226332745c9",
"outputs": [
{
"data": {
"text/plain": [
" 0%| | 0/203144 [00:00<?, ?it/s]"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "e6c1a9bfdf5f4b22b15241716389db54"
}
},
"metadata": {},
"output_type": "display_data"
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[7], line 3\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;66;03m# start processing. You can increase the thread num if your machine is good\u001B[39;00m\n\u001B[0;32m 2\u001B[0m \u001B[38;5;66;03m# If you cancel the run, you will need to reload midi_files before you can run it again\u001B[39;00m\n\u001B[1;32m----> 3\u001B[0m \u001B[43mprocess_all\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmidi_files\u001B[49m\u001B[43m)\u001B[49m\n",
"Cell \u001B[1;32mIn[3], line 41\u001B[0m, in \u001B[0;36mprocess_all\u001B[1;34m(midi_files)\u001B[0m\n\u001B[0;32m 39\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mprocess_all\u001B[39m(midi_files):\n\u001B[0;32m 40\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m midi_file \u001B[38;5;129;01min\u001B[39;00m tqdm(midi_files):\n\u001B[1;32m---> 41\u001B[0m \u001B[43mprocess_midi_file\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmidi_file\u001B[49m\u001B[43m)\u001B[49m\n",
"Cell \u001B[1;32mIn[3], line 17\u001B[0m, in \u001B[0;36mprocess_midi_file\u001B[1;34m(midi_file)\u001B[0m\n\u001B[0;32m 15\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m res:\n\u001B[0;32m 16\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m---> 17\u001B[0m mid \u001B[38;5;241m=\u001B[39m \u001B[43mMIDI\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mmidi2score\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdatas\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 18\u001B[0m mid \u001B[38;5;241m=\u001B[39m tokenizer\u001B[38;5;241m.\u001B[39mtokenize(mid)\n\u001B[0;32m 19\u001B[0m quality, res \u001B[38;5;241m=\u001B[39m tokenizer\u001B[38;5;241m.\u001B[39mcheck_quality(mid)\n",
"File \u001B[1;32mD:\\myprojects\\python\\universal-midi-model\\MIDI.py:402\u001B[0m, in \u001B[0;36mmidi2score\u001B[1;34m(midi)\u001B[0m\n\u001B[0;32m 398\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mmidi2score\u001B[39m(midi\u001B[38;5;241m=\u001B[39m\u001B[38;5;124mb\u001B[39m\u001B[38;5;124m'\u001B[39m\u001B[38;5;124m'\u001B[39m):\n\u001B[0;32m 399\u001B[0m \u001B[38;5;250m \u001B[39m\u001B[38;5;124mr\u001B[39m\u001B[38;5;124;03m'''\u001B[39;00m\n\u001B[0;32m 400\u001B[0m \u001B[38;5;124;03mTranslates MIDI into a \"score\", using midi2opus() then opus2score()\u001B[39;00m\n\u001B[0;32m 401\u001B[0m \u001B[38;5;124;03m'''\u001B[39;00m\n\u001B[1;32m--> 402\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[43mopus2score\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmidi2opus\u001B[49m\u001B[43m(\u001B[49m\u001B[43mmidi\u001B[49m\u001B[43m)\u001B[49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32mD:\\myprojects\\python\\universal-midi-model\\MIDI.py:353\u001B[0m, in \u001B[0;36mopus2score\u001B[1;34m(opus)\u001B[0m\n\u001B[0;32m 351\u001B[0m _clean_up_warnings()\n\u001B[0;32m 352\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m [\u001B[38;5;241m1000\u001B[39m, [], ]\n\u001B[1;32m--> 353\u001B[0m tracks \u001B[38;5;241m=\u001B[39m \u001B[43mcopy\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mdeepcopy\u001B[49m\u001B[43m(\u001B[49m\u001B[43mopus\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;66;03m# couple of slices probably quicker...\u001B[39;00m\n\u001B[0;32m 354\u001B[0m ticks \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mint\u001B[39m(tracks\u001B[38;5;241m.\u001B[39mpop(\u001B[38;5;241m0\u001B[39m))\n\u001B[0;32m 355\u001B[0m score \u001B[38;5;241m=\u001B[39m [ticks, ]\n",
"File \u001B[1;32m~\\AppData\\Local\\Programs\\Python\\Python310\\lib\\copy.py:146\u001B[0m, in \u001B[0;36mdeepcopy\u001B[1;34m(x, memo, _nil)\u001B[0m\n\u001B[0;32m 144\u001B[0m copier \u001B[38;5;241m=\u001B[39m _deepcopy_dispatch\u001B[38;5;241m.\u001B[39mget(\u001B[38;5;28mcls\u001B[39m)\n\u001B[0;32m 145\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m copier \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m--> 146\u001B[0m y \u001B[38;5;241m=\u001B[39m \u001B[43mcopier\u001B[49m\u001B[43m(\u001B[49m\u001B[43mx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmemo\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 147\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 148\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28missubclass\u001B[39m(\u001B[38;5;28mcls\u001B[39m, \u001B[38;5;28mtype\u001B[39m):\n",
"File \u001B[1;32m~\\AppData\\Local\\Programs\\Python\\Python310\\lib\\copy.py:206\u001B[0m, in \u001B[0;36m_deepcopy_list\u001B[1;34m(x, memo, deepcopy)\u001B[0m\n\u001B[0;32m 204\u001B[0m append \u001B[38;5;241m=\u001B[39m y\u001B[38;5;241m.\u001B[39mappend\n\u001B[0;32m 205\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m a \u001B[38;5;129;01min\u001B[39;00m x:\n\u001B[1;32m--> 206\u001B[0m append(\u001B[43mdeepcopy\u001B[49m\u001B[43m(\u001B[49m\u001B[43ma\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmemo\u001B[49m\u001B[43m)\u001B[49m)\n\u001B[0;32m 207\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m y\n",
"File \u001B[1;32m~\\AppData\\Local\\Programs\\Python\\Python310\\lib\\copy.py:146\u001B[0m, in \u001B[0;36mdeepcopy\u001B[1;34m(x, memo, _nil)\u001B[0m\n\u001B[0;32m 144\u001B[0m copier \u001B[38;5;241m=\u001B[39m _deepcopy_dispatch\u001B[38;5;241m.\u001B[39mget(\u001B[38;5;28mcls\u001B[39m)\n\u001B[0;32m 145\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m copier \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[1;32m--> 146\u001B[0m y \u001B[38;5;241m=\u001B[39m \u001B[43mcopier\u001B[49m\u001B[43m(\u001B[49m\u001B[43mx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmemo\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 147\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[0;32m 148\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28missubclass\u001B[39m(\u001B[38;5;28mcls\u001B[39m, \u001B[38;5;28mtype\u001B[39m):\n",
"File \u001B[1;32m~\\AppData\\Local\\Programs\\Python\\Python310\\lib\\copy.py:206\u001B[0m, in \u001B[0;36m_deepcopy_list\u001B[1;34m(x, memo, deepcopy)\u001B[0m\n\u001B[0;32m 204\u001B[0m append \u001B[38;5;241m=\u001B[39m y\u001B[38;5;241m.\u001B[39mappend\n\u001B[0;32m 205\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m a \u001B[38;5;129;01min\u001B[39;00m x:\n\u001B[1;32m--> 206\u001B[0m append(\u001B[43mdeepcopy\u001B[49m\u001B[43m(\u001B[49m\u001B[43ma\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmemo\u001B[49m\u001B[43m)\u001B[49m)\n\u001B[0;32m 207\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m y\n",
"File \u001B[1;32m~\\AppData\\Local\\Programs\\Python\\Python310\\lib\\copy.py:177\u001B[0m, in \u001B[0;36mdeepcopy\u001B[1;34m(x, memo, _nil)\u001B[0m\n\u001B[0;32m 175\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m y \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m x:\n\u001B[0;32m 176\u001B[0m memo[d] \u001B[38;5;241m=\u001B[39m y\n\u001B[1;32m--> 177\u001B[0m \u001B[43m_keep_alive\u001B[49m\u001B[43m(\u001B[49m\u001B[43mx\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmemo\u001B[49m\u001B[43m)\u001B[49m \u001B[38;5;66;03m# Make sure x lives at least as long as d\u001B[39;00m\n\u001B[0;32m 178\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m y\n",
"File \u001B[1;32m~\\AppData\\Local\\Programs\\Python\\Python310\\lib\\copy.py:254\u001B[0m, in \u001B[0;36m_keep_alive\u001B[1;34m(x, memo)\u001B[0m\n\u001B[0;32m 244\u001B[0m \u001B[38;5;250m\u001B[39m\u001B[38;5;124;03m\"\"\"Keeps a reference to the object x in the memo.\u001B[39;00m\n\u001B[0;32m 245\u001B[0m \n\u001B[0;32m 246\u001B[0m \u001B[38;5;124;03mBecause we remember objects by their id, we have\u001B[39;00m\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 251\u001B[0m \u001B[38;5;124;03mthe memo itself...\u001B[39;00m\n\u001B[0;32m 252\u001B[0m \u001B[38;5;124;03m\"\"\"\u001B[39;00m\n\u001B[0;32m 253\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m--> 254\u001B[0m memo[\u001B[38;5;28;43mid\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mmemo\u001B[49m\u001B[43m)\u001B[49m]\u001B[38;5;241m.\u001B[39mappend(x)\n\u001B[0;32m 255\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mKeyError\u001B[39;00m:\n\u001B[0;32m 256\u001B[0m \u001B[38;5;66;03m# aha, this is the first one :-)\u001B[39;00m\n\u001B[0;32m 257\u001B[0m memo[\u001B[38;5;28mid\u001B[39m(memo)]\u001B[38;5;241m=\u001B[39m[x]\n",
"\u001B[1;31mKeyboardInterrupt\u001B[0m: "
]
}
],
"execution_count": 7
"# start processing...\n",
"\n",
"NUMBER_OF_PARALLEL_JOBS = 128 # Number of parallel jobs\n",
"NUMBER_OF_FILES_PER_ITERATION = 256 # Number of files to queue for each parallel iteration\n",
"\n",
"print('=' * 70)\n",
"print('Processing MIDI files. Please wait...')\n",
"print('=' * 70)\n",
"\n",
"for i in tqdm(range(0, len(midi_files), NUMBER_OF_FILES_PER_ITERATION)):\n",
"\n",
" with parallel_config(n_jobs=NUMBER_OF_PARALLEL_JOBS, verbose = 0):\n",
"\n",
" Parallel(backend='loky', n_jobs=NUMBER_OF_PARALLEL_JOBS, verbose=0)(delayed(process_midi_file)(f) for f in midi_files[i:i+NUMBER_OF_FILES_PER_ITERATION])\n",
"\n",
"print('=' * 70)\n",
"print('Done!')\n",
"print('=' * 70)"
]
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "",
"id": "d15aaa7b7a4d41b1"
"id": "d15aaa7b7a4d41b1",
"metadata": {},
"outputs": [],
"source": [
"# Calculate the number of processed and bad MIDIs\n",
"\n",
"print('=' * 70)\n",
"print('Scannging processed MIDIs dir...')\n",
"\n",
"processed_midis = []\n",
"\n",
"for (dirpath, dirnames, filenames) in os.walk(processed_dir):\n",
" processed_midis += [os.path.join(dirpath, file) for file in filenames]\n",
"\n",
"print('=' * 70)\n",
"print('Scannging bad MIDIs dir...')\n",
"\n",
"bad_midis = []\n",
"\n",
"for (dirpath, dirnames, filenames) in os.walk(rm_dir):\n",
" bad_midis += [os.path.join(dirpath, file) for file in filenames]\n",
"\n",
"print('=' * 70)\n",
"print('Number of good MIDIs:', len(processed_midis))\n",
"print('Number of bad MIDIs:', len(bad_midis))\n",
"print('=' * 70)"
]
}
],
"metadata": {
Expand All @@ -216,7 +236,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
"version": "3.12.7"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 52881db

Please sign in to comment.