From 34def1bc4e14831ebe41dc1ee726ae779ca2fcec Mon Sep 17 00:00:00 2001 From: H1DDENADM1N <65646535+H1DDENADM1N@users.noreply.github.com> Date: Sun, 12 Jan 2025 20:11:22 +0800 Subject: [PATCH] feat: add 160,435 British and American phonetic symbols to dictionary - Display phonetic symbols only when British and American pronunciations differ - Phonetic data scraped from the Concise Oxford English Dictionary (OALD) - Added logging to track key operations - Implemented performance timing to measure execution time --- .gitignore | 3 + eccedict.py | 478 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 440 insertions(+), 41 deletions(-) diff --git a/.gitignore b/.gitignore index 19d3c51..f94f6c2 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,6 @@ concise-enhanced.mdx concise-enhanced.png concise-enhanced.css Secret.py +/output/* +/logs/* +/oald-fork/* \ No newline at end of file diff --git a/eccedict.py b/eccedict.py index 9d8e705..233f582 100644 --- a/eccedict.py +++ b/eccedict.py @@ -5,10 +5,12 @@ # eccedict.py - # # Created by H1DDENADM1N on 2025/01/09 -# Last Modified: 2025/01/10 19:00 +# Last Modified: 2025/01/12 19:52 # # ====================================================================== import re +import sys +from datetime import datetime from pathlib import Path import duckdb @@ -25,39 +27,308 @@ CHINESE_PATTERN = re.compile(r"[\u4e00-\u9fff]") -def convert_csv_to_duckdb(csv_file: Path, duckdb_file: Path): +def configure_logging(log_dir, rotation="1 week", retention="1 month", level="DEBUG"): """ - 字典转化,csv 转换到 duckdb + 配置日志记录器以输出到控制台和文件,同时保留颜色,并根据当前日期和日志级别生成文件名。 + + :param log_dir: 日志文件的目录。 + :param rotation: 日志文件的轮转策略。 + :param retention: 日志文件的保留策略。 + :param level: 日志级别。 + """ + # 使用 pathlib 创建日志目录路径 + log_path = Path(log_dir) + log_path.mkdir(parents=True, exist_ok=True) + + # 获取当前日期 + current_date = datetime.now().strftime("%Y-%m-%d") + + # 配置控制台日志记录器,显示带颜色的日志 + logger.remove() # 移除默认的日志记录器 + logger.add( + sys.stdout, + colorize=True, + format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}", + level=level, + ) + + # 配置文件日志记录器,根据日志级别生成不同的日志文件 + log_levels = ["TRACE", "DEBUG", "INFO", "SUCCESS", "WARNING", "ERROR", "CRITICAL"] + for log_level in log_levels: + log_file_path = log_path / f"{current_date}_{log_level}.log" + logger.add( + str(log_file_path), + colorize=True, + format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}", + level=log_level, + rotation=rotation, + retention=retention, + filter=lambda record, level=log_level: record["level"].name == level, + ) + + +def convert_csv_to_stardictdb(csv_file: Path, stardictdb_file: Path): + """ + 字典转化 stardict.csv 转换到 stardict.ddb """ if not csv_file.exists(): logger.error(f"{csv_file} 未找到") raise FileNotFoundError(f"{csv_file} 未找到") - if duckdb_file.exists(): - logger.error(f"{duckdb_file} 已存在") - raise FileExistsError(f"{duckdb_file} 已存在") - # 连接到DuckDB数据库(如果数据库不存在,则会自动创建) - conn = duckdb.connect(database=str(duckdb_file), read_only=False) - # 读取CSV文件并导入到DuckDB表中 + if stardictdb_file.exists(): + logger.error(f"{stardictdb_file} 已存在") + raise FileExistsError(f"{stardictdb_file} 已存在") + # 连接到 stardict.ddb 数据库(如果数据库不存在,则会自动创建) + conn = duckdb.connect(database=str(stardictdb_file), read_only=False) + # 读取CSV文件并导入到 stardict.ddb 数据库的 stardict 表中 conn.execute(f"CREATE TABLE stardict AS SELECT * FROM read_csv_auto('{csv_file}')") conn.close() -def convert_duckdb_to_txt(duckdb_file: Path, txt_file: Path, buffer_size: int = 1000): +def update_phonetics_from_phoneticsdb_to_stardictdb( + phoneticsdb_file: Path, stardictdb_file: Path +): + """ + 从 phonetics.ddb 文件中 phon_uk 和 phon_us 列读取英、美音标数据,并更新到 stardict.ddb 文件中的 phonetic 列 + """ + if not stardictdb_file.exists(): + logger.error(f"{stardictdb_file} 未找到") + raise FileNotFoundError(f"{stardictdb_file} 未找到") + if not phoneticsdb_file.exists(): + logger.error(f"{phoneticsdb_file} 未找到") + raise FileNotFoundError(f"{phoneticsdb_file} 未找到") + + # 连接到 stardict.ddb 数据库 + conn = duckdb.connect(database=str(stardictdb_file), read_only=False) + cursor = conn.cursor() + # 连接到 phonetics.ddb 数据库 + phonetics_conn = duckdb.connect(database=str(phoneticsdb_file), read_only=True) + phonetics_cursor = phonetics_conn.cursor() + # 从 stardict.ddb 拿 word 去 phonetics.ddb 取音标 + try: + # 从 stardict.ddb 中获取所有单词 + cursor.execute("SELECT word, phonetic FROM stardict") + rows = cursor.fetchall() + + # 遍历每个单词,从 phonetics.ddb 中获取音标并更新到 stardict.ddb + for row in rows: + (word, phonetic) = row + phonetics_cursor.execute( + """ + SELECT phon_uk, phon_us + FROM words + WHERE word = ? + """, + (word,), + ) + result = phonetics_cursor.fetchone() + + if result: + (phon_uk, phon_us) = result + # 更新 stardict.ddb 中的 phonetic 列 + if phon_uk == phon_us: + new_phonetic = phon_uk.strip("/") + else: + new_phonetic = f"英 {phon_uk.strip('/')} 美 {phon_us.strip('/')}" + cursor.execute( + """ + UPDATE stardict + SET phonetic = ? + WHERE word = ? + """, + (new_phonetic, word), + ) + logger.debug(f"更新单词 {word} 的音标: {new_phonetic}") + else: + pass # 跳过没有找到音标的单词 + + # 提交事务 + conn.commit() + + except Exception as e: + logger.error(f"更新音标时发生错误: {e}") + if conn.in_transaction: # 检查是否有活动的事务 + conn.rollback() # 回滚事务 + finally: + # 关闭连接 + cursor.close() + conn.close() + phonetics_cursor.close() + phonetics_conn.close() + + +def build_phonetics_ddb(oald_txt: Path, phoneticsdb_file: Path): + """ + 从 oald-fork.txt 文件中读取单词和英、美音标数据,并生成 phonetics.ddb 数据库(words 表包含 word、phon_uk、phon_us 三个列) + """ + if not oald_txt.exists(): + logger.error(f"{oald_txt} 未找到") + raise FileNotFoundError(f"{oald_txt} 未找到") + + # 连接到 phonetics.ddb 数据库 + conn = duckdb.connect(phoneticsdb_file) + cursor = conn.cursor() + + # 创建表来存储单词和音标信息 + cursor.execute(""" + CREATE TABLE words ( + word TEXT PRIMARY KEY, + phon_uk TEXT, + phon_us TEXT + ) + """) + + with oald_txt.open("r", encoding="utf-8") as f: + word = None # 当前词条 + # 逐行读取文件 + for line in f: + line = line.strip() # 去除首尾空白字符 + if not line: + continue # 跳过空行 + # 处理 "" 行 + if line.startswith(""): + word = None # 重置当前词条 + continue + # 处理 "@@@LINK=" 行 + elif line.startswith("@@@LINK="): + word = None # 重置当前词条 + continue # 跳过链接行,先建立非链接词条,稍后再处理链接词条 + # 处理普通词条行 + elif line.startswith("" 行 + if line.startswith(""): + word = None # 重置当前词条 + continue + # 处理 "@@@LINK=" 行 + elif line.startswith("@@@LINK="): + # 从 phonetics.ddb 取 phon_uk 和 phon_us + linked_to = line.lstrip("@@@LINK=") + cursor.execute( + """ + SELECT phon_uk, phon_us + FROM words + WHERE word = ?""", + (linked_to,), + ) + result = cursor.fetchone() + + if result: + (phon_uk, phon_us) = result + # 更新 phoneticsdb_file 中的 phon_uk 和 phon_am 列 + if phon_uk is not None and phon_us is not None: + # 将单词和音标信息插入数据库 + try: + cursor.execute( + """ + INSERT INTO words (word, phon_uk, phon_us) + VALUES (?, ?, ?) + """, + (word, phon_uk, phon_us), + ) + except duckdb.duckdb.ConstraintException: + logger.debug(f"链接词条 {linked_to} 已存在,跳过") + pass + # 处理普通词条行 + elif line.startswith("" "1. 开源英汉字典:MIT / CC 双协议
" "2. 标注牛津三千关键词:音标后 K字符
" @@ -437,42 +708,167 @@ def generate_mdx(txt_file: Path, mdx_file: Path): "4. 标注 COCA/BNC 的词频顺序
" "5. 标注考试大纲信息:中高研四六托雅 等
" "6. 增加汉英反查
" - "", # 从 concise-enhanced.info.html 中提取 + "", ) - # 写入 MDX 文件 + # 写入 concise-enhanced.mdx 文件 with mdx_file.open("wb") as outfile: writer.write(outfile) +def calculate_time_interval(log1, log2): + # 提取时间戳部分 + time_format = "%Y-%m-%d %H:%M:%S" + timestamp1 = log1.split(" | ")[0] + timestamp2 = log2.split(" | ")[0] + + # 将时间戳转换为 datetime 对象 + time1 = datetime.strptime(timestamp1, time_format) + time2 = datetime.strptime(timestamp2, time_format) + + # 计算时间差 + time_diff = time2 - time1 + + # 提取小时、分钟、秒 + total_seconds = int(time_diff.total_seconds()) + hours = total_seconds // 3600 + minutes = (total_seconds % 3600) // 60 + seconds = total_seconds % 60 + + # 构建结果字符串,忽略值为 0 的部分 + result = [] + if hours > 0: + result.append(f"{hours}h") + if minutes > 0: + result.append(f"{minutes}m") + if seconds > 0: + result.append(f"{seconds}s") + + # 如果所有值都为 0,返回 1s + if not result: + return "⏱️ 1s" + + return f"⏱️ {''.join(result)}" + + if __name__ == "__main__": - logger.info("开始转换...") + # 配置日志 完整流程耗时 ⏱️2h3m26s + configure_logging("logs", level="DEBUG") + # 记录开始时间 + start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # 0️⃣ 📁 创建输出目录、定义文件路径 ⏱️1s + logger.info("创建输出目录、定义文件路径...") + # 源文件 csv_file = Path("stardict.csv") + oald_txt = Path().cwd() / "oald-fork" / "oald-fork.txt" + # GoldenDict 路径 + goldendict_exe = Path(r"C:\SSS\GoldenDict-ng\goldendict.exe") + # 输出文件 + output_dir = Path("output") + stardictdb_file = Path() / output_dir / "stardict.ddb" + phoneticsdb_file = Path() / output_dir / "phonetics.ddb" + txt_file = Path() / output_dir / "stardict.txt" + mdx_file = Path("concise-enhanced.mdx") + # 检查源文件是否存在 if not csv_file.exists(): - raise FileNotFoundError(f"{duckdb_file} 未找到") - - duckdb_file = Path("stardict.ddb") - if not duckdb_file.exists(): - convert_csv_to_duckdb(csv_file, duckdb_file) - - txt_file = Path("stardict.txt") + raise FileNotFoundError( + f"{csv_file} 未找到 (stardict.csv 由 stardict.7z 解包获得)" + ) + if not oald_txt.exists(): + print( + f"{oald_txt} 未找到 (oald-fork.txt 由 精装牛津十 mdx 使用 AutoMdxBuilder 解包获得)" + ) + # 检查 GoldenDict 是否存在 + if not goldendict_exe.exists(): + raise FileNotFoundError(f"{goldendict_exe} 未找到 (GoldenDict-ng 软件)") + # 清空输出目录 + if not output_dir.exists(): + output_dir.mkdir() + if stardictdb_file.exists(): + # 删除旧的词典数据库文件 + stardictdb_file.unlink() + if phoneticsdb_file.exists(): + # 删除旧的音标数据库文件 + phoneticsdb_file.unlink() if txt_file.exists(): # 删除旧的 TXT 文件 txt_file.unlink() - convert_duckdb_to_txt(duckdb_file, txt_file, buffer_size=1_000_000) - logger.info(f"TXT 文件已生成:{txt_file}") - - mdx_file = Path("concise-enhanced.mdx") if mdx_file.exists(): # 删除旧的 MDX 文件 mdx_file.unlink() + logger.info("输出目录、文件路径配置完成") + + # 记录步骤0结束时间 + step0_end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + time_interval = calculate_time_interval(start_time, step0_end_time) + logger.success(f"步骤0完成,耗时: {time_interval}") + + logger.info("开始转换...") + + # 1️⃣ ⭐ 生成 stardict.ddb ⏱️1s + logger.info("生成 stardict.ddb 文件...") + convert_csv_to_stardictdb(csv_file, stardictdb_file) + logger.info(f"stardict.ddb 文件已生成:{stardictdb_file}") + + # 记录步骤1结束时间 + step1_end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + time_interval = calculate_time_interval(step0_end_time, step1_end_time) + logger.success(f"步骤1完成,耗时: {time_interval}") + + # 2️⃣ 🔖 生成 phonetics.ddb ⏱️14m50s + logger.info("生成 phonetics.ddb 文件...") + build_phonetics_ddb(oald_txt, phoneticsdb_file) + logger.info(f"phonetics.ddb 文件已生成:{phoneticsdb_file}") + + # 记录步骤2结束时间 + step2_end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + time_interval = calculate_time_interval(step1_end_time, step2_end_time) + logger.success(f"步骤2完成,耗时: {time_interval}") + + # 3️⃣ 🆕 更新 stardict.ddb 音标信息 ⏱️1h13m20s + logger.info("更新音标信息...") + update_phonetics_from_phoneticsdb_to_stardictdb(phoneticsdb_file, stardictdb_file) + logger.info(f"更新音标信息完成:{stardictdb_file}") + + # 记录步骤3结束时间 + step3_end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + time_interval = calculate_time_interval(step2_end_time, step3_end_time) + logger.success(f"步骤3完成,耗时: {time_interval}") + + # 4️⃣ 📄 生成 stardict.txt ⏱️26m25s + logger.info("生成 stardict.txt 文件...") + convert_stardictdb_to_txt(stardictdb_file, txt_file, buffer_size=1_000_000) + logger.info(f"stardict.txt 文件已生成:{txt_file}") + + # 记录步骤4结束时间 + step4_end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + time_interval = calculate_time_interval(step3_end_time, step4_end_time) + logger.success(f"步骤4完成,耗时: {time_interval}") + + # 5️⃣ 📦 生成 concise-enhanced.mdx ⏱️8m49s + logger.info("生成 concise-enhanced.mdx 文件...") generate_mdx(txt_file, mdx_file) - logger.info(f"MDX 文件已生成:{mdx_file}") + logger.info(f"concise-enhanced.mdx 文件已生成:{mdx_file}") + + # 记录步骤5结束时间 + step5_end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + time_interval = calculate_time_interval(step4_end_time, step5_end_time) + logger.success(f"步骤5完成,耗时: {time_interval}") + + # 6️⃣ 🔍 打开 GoldenDict,自动重建索引 ⏱️1s + logger.info("打开 GoldenDict...") + import subprocess + + subprocess.Popen(str(goldendict_exe)) + logger.info("GoldenDict 已打开") - # goldendict_exe = Path(r"C:\SSS\GoldenDict-ng\goldendict.exe") - # if goldendict_exe.exists(): - # # 打开 GoldenDict,自动重建索引 - # import subprocess + # 记录步骤6结束时间 + step6_end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + time_interval = calculate_time_interval(step5_end_time, step6_end_time) + logger.success(f"步骤6完成,耗时: {time_interval}") - # subprocess.run(str(goldendict_exe)) + # 记录总耗时 + total_time_interval = calculate_time_interval(start_time, step6_end_time) + logger.success(f"所有步骤完成,总耗时: {total_time_interval}")