Commit

REW CSV parsing with tab as separator. Added unknown manufacturer errors to prompt UI. Fixed SquigCrawler's source_name detection and raw data file names.
jaakkopasanen committed Nov 5, 2023
1 parent 2a60e0e commit 5ce8767
Showing 7 changed files with 68 additions and 15 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -111,7 +111,7 @@ my_results/
 **/data/ignore
 measurements/crinacle/*data*/
 measurements/crinacle/Raw measurements (Patreon)/
-measurements/crinacle/phone_book*.json
+measurements/*/phone_book*.json
 results/crinacle/*/*/*.csv
 measurements/Rtings/crawl_graph_data_urls.json
 measurements/*/raw_data/
11 changes: 3 additions & 8 deletions autoeq/csv.py
@@ -12,8 +12,8 @@
 float_pattern = r'-?\d+(?:\.\d+)?'
 autoeq_pattern = re.compile(rf'{header_pattern}(?:\n{float_pattern}(?:,{float_pattern})+)+')
 rew_float_pattern = rf'(?:{float_pattern}|\?)'
-rew_pattern = re.compile(rf'^(?:\*.*\n)*\* Freq\(Hz\), SPL\(dB\), Phase\(degrees\)\n(?:{rew_float_pattern}, {rew_float_pattern}, {rew_float_pattern})+\n*')
-rew_space_pattern = re.compile(rf'^(?:\*.*\n)*\* Freq\(Hz\) SPL\(dB\) Phase\(degrees\)(?:\n{rew_float_pattern} {rew_float_pattern} {rew_float_pattern})+')
+rew_pattern = re.compile(rf'^(?:\*.*\n)*\* Freq\(Hz\)(?:, ?| |\t)SPL\(dB\)(?:, ?| |\t)Phase\(degrees\)\n(?:{rew_float_pattern}(?:, ?| |\t){rew_float_pattern}(?:, ?| |\t){rew_float_pattern})+\n*')
+#rew_space_pattern = re.compile(rf'^(?:\*.*\n)*\* Freq\(Hz\) SPL\(dB\) Phase\(degrees\)(?:\n{rew_float_pattern} {rew_float_pattern} {rew_float_pattern})+')
 crinacle_pattern = re.compile(rf'[\s\n]?Frequency\tdB\tUnweighted(?:\n{float_pattern}\t{float_pattern})+[.\n]?')


@@ -86,12 +86,7 @@ def parse_csv(csv):
 
     if rew_pattern.match(csv) or crinacle_pattern.match(csv):
         # These two have all sort of junk in them but the first column is frequency and the second SPL, so all good
-        csv = '\n'.join([line for line in lines if numeric_start.match(line)])
-        lines = csv.split('\n')
-
-    if rew_space_pattern.match(csv):
-        csv = '\n'.join([line for line in lines if numeric_start.match(line) and '?' not in line])
-        csv = csv.replace(' ', '\t')
+        csv = '\n'.join([re.sub(r'(?:, ?| |\t)', '\t', line) for line in lines if numeric_start.match(line) and '?' not in line])
         lines = csv.split('\n')
 
     column_separator, decimal_separator = find_csv_separators(csv)
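A note on the parsing change above: rew_pattern now accepts comma-, space-, or tab-separated REW column headers, and the separate rew_space_pattern branch is gone because every numeric line is normalized to tab separators (and rows with unknown '?' values dropped) before find_csv_separators runs. Below is a minimal standalone sketch of that normalization step; the numeric_start pattern and the sample lines are illustrative stand-ins, not copied from the repository.

import re

# Replace each ', ', ',', ' ' or tab separator with a tab, as the re.sub in parse_csv does.
separator_sub = re.compile(r'(?:, ?| |\t)')
numeric_start = re.compile(r'^\s*-?\d')  # assumption: matches lines that begin with a number

sample_lines = [
    '* Freq(Hz)\tSPL(dB)\tPhase(degrees)',  # header comment, dropped because it is not numeric
    '20.000000\t96.774\t36.7401',           # tab-separated REW data row, kept
    '20.25, 68.335, ?',                     # row with an unknown phase value, dropped because of '?'
]
normalized = [separator_sub.sub('\t', line) for line in sample_lines
              if numeric_start.match(line) and '?' not in line]
print(normalized)  # ['20.000000\t96.774\t36.7401']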
4 changes: 4 additions & 0 deletions dbtools/crawler.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import sys
+import urllib
 from pathlib import Path
 import pandas as pd
 from rapidfuzz import fuzz
@@ -240,6 +241,9 @@ def create_prompts(self, max_prompts=100):
         for item in crawled_items:
             if not self.is_prompt_needed(item):
                 continue
+            name = item.source_name or urllib.parse.unquote(item.url.split('/')[-1])
+            if self.manufacturers.find(name)[0] is None:
+                print(f'Cannot detect manufacturer for: {name}')
             self.prompts.append(PromptListItem(NamePrompt(item, self.prompt_callback), self.switch_prompt))
             if len(self.prompts) >= max_prompts:
                 break
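Roughly how the new prompt-UI warning above behaves: when an item has no source_name yet, the percent-encoded file name at the end of its URL is decoded and used for the manufacturer lookup, and a message is printed if nothing matches. The sketch below is illustrative; only urllib.parse.unquote and the URL split mirror the actual change, while find_manufacturer, the manufacturer list, and the URL are made-up stand-ins for self.manufacturers.find().

import urllib.parse

# Hypothetical stand-in for the manufacturers NameIndex used by the crawler.
known_manufacturers = ['7th Acoustics', 'AFUL', 'Juzear']

def find_manufacturer(name):
    # Return the first known manufacturer the name starts with, else None.
    return next((m for m in known_manufacturers if name.lower().startswith(m.lower())), None)

item_source_name = None  # the crawler has not resolved a source name for this item yet
item_url = 'https://example.squig.link/data/Some%20Unknown%20Brand%20X1.txt'  # made-up URL

name = item_source_name or urllib.parse.unquote(item_url.split('/')[-1])  # 'Some Unknown Brand X1.txt'
if find_manufacturer(name) is None:
    print(f'Cannot detect manufacturer for: {name}')  # printed here, since no known manufacturer matches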
27 changes: 26 additions & 1 deletion dbtools/db.ipynb
@@ -292,7 +292,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 35,
+"execution_count": 55,
 "id": "8d357b06-b94d-4719-8aa9-ad4c17edfce4",
 "metadata": {},
 "outputs": [],
@@ -311,6 +311,31 @@
 "display(crawler.widget)"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": 57,
+"id": "acadfe6d-2b93-4b25-b8e2-c9058f416c33",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "5ec0a3548e164d5fa3fe0d19ff914a15",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+" 0%| | 0/37 [00:00<?, ?it/s]"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+}
+],
+"source": [
+"crawler.process(new_only=True)"
+]
+},
 {
 "cell_type": "code",
 "execution_count": 14,
5 changes: 4 additions & 1 deletion dbtools/manufacturers.tsv
@@ -2,6 +2,7 @@
 1MORE
 64 Audio 64audio
 7Hz 7Hertz
+7th Acoustics
 A-Audio A Audio
 AAW Advanced AcousticsWerkes AAW (Advanced AcousticsWerkes)
 Abyss
@@ -13,6 +14,7 @@ Adam
 Advanced ADV Advanced Sound Group Advanced Sound
 Aedle
 AfterShokz
+AFUL
 Aiaiai
 AirBuds
 Akai
@@ -169,7 +171,7 @@ Ferrari
 Fidue
 FIIL
 FiiO
-Final Audio Final Audio Design Final Audio Design (FAD)
+Final Audio Final Audio Design Final Audio Design (FAD) Final
 fineEars
 Fir Audio FiR
 First Harmonic
@@ -247,6 +249,7 @@ JLab Audio
 Jomo Audio Jomo
 JQ
 Just Ear Justear
+Juzear
 JVC Massdrop x JVC
 Kaldas Research
 KBEAR KBEar
6 changes: 3 additions & 3 deletions dbtools/squig_crawler.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 import sys
+import urllib
 from pathlib import Path
 import numpy as np
 import json
@@ -59,7 +60,6 @@ def parse_books(self):
         Returns:
             NameIndex
         """
-        self.measurements_path.joinpath('phone_books').mkdir(parents=True, exist_ok=True)
         book_maps = {}
         for db in self.dbs:
             # 4620 measurements name index
@@ -88,7 +88,7 @@ def crawl(self):
                 anchor = row.find('a')
                 form = 'in-ear' if db['type'] == 'IEMs' else 'over-ear'
                 book = self.book_maps[form]
-                normalized_file_name = self.normalize_file_name(anchor['text'])
+                normalized_file_name = self.normalize_file_name(urllib.parse.unquote(anchor['href']))
                 item = NameItem(
                     url=f'{self.db_url(db)}/{anchor["href"]}',
                     source_name=book[normalized_file_name] if normalized_file_name in book else None,
@@ -100,7 +100,7 @@ def crawl(self):
         return self.crawl_index
 
     def raw_data_path(self, item):
-        return self.measurements_path.joinpath('raw_data', item.form, item.url.split('/')[-1])
+        return self.measurements_path.joinpath('raw_data', item.form, urllib.parse.unquote(item.url.split('/')[-1]))
 
     def get_item_from_url(self, url):
         index_item = self.name_index.find_one(url=url)
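On the unquoting above: anchor hrefs on squig.link pages are percent-encoded, so matching them against the phone book and using the URL tail as a raw-data file name only works after decoding. A small sketch of the decoded path construction; the base path, form, and URL below are made up for illustration.

import urllib.parse
from pathlib import Path

measurements_path = Path('measurements/squig')  # illustrative base path
item_form = 'in-ear'
item_url = 'https://example.squig.link/data/AFUL%20Performer%205.txt'  # illustrative crawled URL

encoded_name = item_url.split('/')[-1]             # 'AFUL%20Performer%205.txt'
decoded_name = urllib.parse.unquote(encoded_name)  # 'AFUL Performer 5.txt'

raw_data_path = measurements_path.joinpath('raw_data', item_form, decoded_name)
print(raw_data_path)  # measurements/squig/raw_data/in-ear/AFUL Performer 5.txt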
28 changes: 27 additions & 1 deletion tests/test_csv.py
@@ -102,12 +102,31 @@
 19999.498 ? 0
 """
 
+csv11 = """* Measurement data measured by REW V5.20.3
+* Source: USB-C to 3.5mm Headphone Jack Adapter, USB-C to 3.5mm Headphone Jack Adapter, 0, volume: 0.138. Timing signal peak level -19.8 dBFS, measurement signal peak level -19.3 dBFS
+* Format: 256k Log Swept Sine, 1 sweep at -12.0 dBFS using an acoustic timing reference
+* Dated: Jul 10, 2023 9:36:37 PM
+* REW Settings:
+* C-weighting compensation: Off
+* Target level: 75.0 dB
+* Note: Delay -0.1027 ms (-35 mm, -1.39 in) using estimated IR delay relative to Acoustic reference on USB-C to 3.5mm Headphone Jack Adapter L with no timing offset
+* Measurement: Duo L Jul 10
+* Smoothing: 1/12 octave
+* Frequency Step: 1/48 octave
+* Start Frequency: 20.000 Hz
+*
+* Freq(Hz) SPL(dB) Phase(degrees)
+20.000000 96.774 36.7401
+20.299999 96.813 36.0714
+20.600000 96.843 35.3904
+"""
+
 
 class TestCsv(unittest.TestCase):
     def test_regex(self):
         pattern_asserts = [
             (csv1, autoeq_pattern), (csv2, None), (csv3, None), (csv4, None), (csv5, rew_pattern), (csv6, None),
-            (csv7, None), (csv8, autoeq_pattern), (csv9, None),
+            (csv7, None), (csv8, autoeq_pattern), (csv9, None), (csv10, rew_pattern), (csv11, rew_pattern)
         ]
         for s, pattern in pattern_asserts:
             if pattern:
@@ -192,3 +211,10 @@ def test_parse_csv10(self):
         self.assertEqual([20.0, 20.25, 19998.498], d['frequency'], )
         self.assertIn('raw', d)
         self.assertEqual([68.334, 68.335, 27.402], d['raw'])
+
+    def test_parse_csv11(self):
+        d = parse_csv(csv11)
+        self.assertIn('frequency', d)
+        self.assertEqual([20.0, 20.299999, 20.600000], d['frequency'], )
+        self.assertIn('raw', d)
+        self.assertEqual([96.774, 96.813, 96.843], d['raw'])
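For reference, a usage sketch built from the expectations in test_parse_csv11 above, with a shortened tab-separated REW export; the import path and the exact fixture whitespace are assumptions, not copied from the test file.

from autoeq.csv import parse_csv

rew_export = (
    '* Measurement data measured by REW V5.20.3\n'
    '* Freq(Hz)\tSPL(dB)\tPhase(degrees)\n'
    '20.000000\t96.774\t36.7401\n'
    '20.299999\t96.813\t36.0714\n'
)
d = parse_csv(rew_export)
print(d['frequency'])  # [20.0, 20.299999]
print(d['raw'])        # [96.774, 96.813]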
