From 34def1bc4e14831ebe41dc1ee726ae779ca2fcec Mon Sep 17 00:00:00 2001
From: H1DDENADM1N <65646535+H1DDENADM1N@users.noreply.github.com>
Date: Sun, 12 Jan 2025 20:11:22 +0800
Subject: [PATCH] feat: add 160,435 British and American phonetic symbols to
dictionary
- Show separate British (英) and American (美) phonetics only when the two pronunciations differ; otherwise show a single transcription
- Phonetic data scraped from the Oxford Advanced Learner's Dictionary (OALD)
- Added logging to track key operations
- Implemented performance timing to measure execution time
---
.gitignore | 3 +
eccedict.py | 478 +++++++++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 440 insertions(+), 41 deletions(-)
diff --git a/.gitignore b/.gitignore
index 19d3c51..f94f6c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,6 @@ concise-enhanced.mdx
concise-enhanced.png
concise-enhanced.css
Secret.py
+/output/*
+/logs/*
+/oald-fork/*
\ No newline at end of file
diff --git a/eccedict.py b/eccedict.py
index 9d8e705..233f582 100644
--- a/eccedict.py
+++ b/eccedict.py
@@ -5,10 +5,12 @@
# eccedict.py -
#
# Created by H1DDENADM1N on 2025/01/09
-# Last Modified: 2025/01/10 19:00
+# Last Modified: 2025/01/12 19:52
#
# ======================================================================
import re
+import sys
+from datetime import datetime
from pathlib import Path
import duckdb
@@ -25,39 +27,308 @@
CHINESE_PATTERN = re.compile(r"[\u4e00-\u9fff]")
-def convert_csv_to_duckdb(csv_file: Path, duckdb_file: Path):
+def configure_logging(log_dir, rotation="1 week", retention="1 month", level="DEBUG"):
"""
- 字典转化,csv 转换到 duckdb
+ Configure the shared logger: a colorized console sink plus one log file per level, named "<date>_<LEVEL>.log".
+
+ :param log_dir: Directory for the log files (created if missing).
+ :param rotation: Rotation policy passed to every file sink.
+ :param retention: Retention policy passed to every file sink.
+ :param level: Minimum level of the console sink only; each file sink uses its own level.
+ """
+ # Make sure the log directory exists (parents included)
+ log_path = Path(log_dir)
+ log_path.mkdir(parents=True, exist_ok=True)
+
+ # Current date, used as the log file name prefix
+ current_date = datetime.now().strftime("%Y-%m-%d")
+
+ # Console sink with colorized output
+ logger.remove() # Remove the default handler first
+ logger.add(
+ sys.stdout,
+ colorize=True,
+ format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
+ level=level,
+ )
+
+ # File sinks: one file per level; the filter keeps only records whose level name equals that level
+ log_levels = ["TRACE", "DEBUG", "INFO", "SUCCESS", "WARNING", "ERROR", "CRITICAL"]
+ for log_level in log_levels:
+ log_file_path = log_path / f"{current_date}_{log_level}.log"
+ logger.add(
+ str(log_file_path),
+ colorize=True,
+ format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
+ level=log_level,
+ rotation=rotation,
+ retention=retention,
+ filter=lambda record, level=log_level: record["level"].name == level,
+ )
+
+
+def convert_csv_to_stardictdb(csv_file: Path, stardictdb_file: Path):
+ """
+ Import stardict.csv into a new stardict.ddb DuckDB database. Raises FileNotFoundError if csv_file is missing and FileExistsError if stardictdb_file already exists.
"""
if not csv_file.exists():
logger.error(f"{csv_file} 未找到")
raise FileNotFoundError(f"{csv_file} 未找到")
- if duckdb_file.exists():
- logger.error(f"{duckdb_file} 已存在")
- raise FileExistsError(f"{duckdb_file} 已存在")
- # 连接到DuckDB数据库(如果数据库不存在,则会自动创建)
- conn = duckdb.connect(database=str(duckdb_file), read_only=False)
- # 读取CSV文件并导入到DuckDB表中
+ if stardictdb_file.exists():
+ logger.error(f"{stardictdb_file} 已存在")
+ raise FileExistsError(f"{stardictdb_file} 已存在")
+ # Connect to the stardict.ddb database (created automatically if it does not exist)
+ conn = duckdb.connect(database=str(stardictdb_file), read_only=False)
+ # Read the CSV file and import it into the stardict table of stardict.ddb
conn.execute(f"CREATE TABLE stardict AS SELECT * FROM read_csv_auto('{csv_file}')")
conn.close()
-def convert_duckdb_to_txt(duckdb_file: Path, txt_file: Path, buffer_size: int = 1000):
+def update_phonetics_from_phoneticsdb_to_stardictdb(
+ phoneticsdb_file: Path, stardictdb_file: Path
+):
+ """
+ Read British/American phonetics from the phon_uk and phon_us columns of phonetics.ddb and write them into the phonetic column of stardict.ddb. Raises FileNotFoundError if either database file is missing.
+ """
+ if not stardictdb_file.exists():
+ logger.error(f"{stardictdb_file} 未找到")
+ raise FileNotFoundError(f"{stardictdb_file} 未找到")
+ if not phoneticsdb_file.exists():
+ logger.error(f"{phoneticsdb_file} 未找到")
+ raise FileNotFoundError(f"{phoneticsdb_file} 未找到")
+
+ # Connect to stardict.ddb (read-write)
+ conn = duckdb.connect(database=str(stardictdb_file), read_only=False)
+ cursor = conn.cursor()
+ # Connect to phonetics.ddb (read-only)
+ phonetics_conn = duckdb.connect(database=str(phoneticsdb_file), read_only=True)
+ phonetics_cursor = phonetics_conn.cursor()
+ # Look up each stardict.ddb word in phonetics.ddb
+ try:
+ # Fetch every word (and its current phonetic) from stardict.ddb
+ cursor.execute("SELECT word, phonetic FROM stardict")
+ rows = cursor.fetchall()
+
+ # For each word, fetch its phonetics from phonetics.ddb and update stardict.ddb
+ for row in rows:
+ (word, phonetic) = row
+ phonetics_cursor.execute(
+ """
+ SELECT phon_uk, phon_us
+ FROM words
+ WHERE word = ?
+ """,
+ (word,),
+ )
+ result = phonetics_cursor.fetchone()
+
+ if result:
+ (phon_uk, phon_us) = result
+ # Build the new phonetic value: one transcription when UK and US match, otherwise "英 <uk> 美 <us>" (slashes stripped)
+ if phon_uk == phon_us:
+ new_phonetic = phon_uk.strip("/") # NOTE(review): fails if phon_uk is None - confirm the column is never NULL
+ else:
+ new_phonetic = f"英 {phon_uk.strip('/')} 美 {phon_us.strip('/')}"
+ cursor.execute(
+ """
+ UPDATE stardict
+ SET phonetic = ?
+ WHERE word = ?
+ """,
+ (new_phonetic, word),
+ )
+ logger.debug(f"更新单词 {word} 的音标: {new_phonetic}")
+ else:
+ pass # Skip words that have no phonetics entry
+
+ # Commit the transaction
+ conn.commit()
+
+ except Exception as e:
+ logger.error(f"更新音标时发生错误: {e}") # NOTE(review): the exception is logged but not re-raised, so callers cannot detect the failure
+ if conn.in_transaction: # Check whether a transaction is active
+ conn.rollback() # Roll back the transaction
+ finally:
+ # Close cursors and connections
+ cursor.close()
+ conn.close()
+ phonetics_cursor.close()
+ phonetics_conn.close()
+
+
+def build_phonetics_ddb(oald_txt: Path, phoneticsdb_file: Path):
+ """
+ 从 oald-fork.txt 文件中读取单词和英、美音标数据,并生成 phonetics.ddb 数据库(words 表包含 word、phon_uk、phon_us 三个列)
+ """
+ if not oald_txt.exists():
+ logger.error(f"{oald_txt} 未找到")
+ raise FileNotFoundError(f"{oald_txt} 未找到")
+
+ # 连接到 phonetics.ddb 数据库
+ conn = duckdb.connect(phoneticsdb_file)
+ cursor = conn.cursor()
+
+ # 创建表来存储单词和音标信息
+ cursor.execute("""
+ CREATE TABLE words (
+ word TEXT PRIMARY KEY,
+ phon_uk TEXT,
+ phon_us TEXT
+ )
+ """)
+
+ with oald_txt.open("r", encoding="utf-8") as f:
+ word = None # 当前词条
+ # 逐行读取文件
+ for line in f:
+ line = line.strip() # 去除首尾空白字符
+ if not line:
+ continue # 跳过空行
+ # 处理 ">" 行
+ if line.startswith(">"):
+ word = None # 重置当前词条
+ continue
+ # 处理 "@@@LINK=" 行
+ elif line.startswith("@@@LINK="):
+ word = None # 重置当前词条
+ continue # 跳过链接行,先建立非链接词条,稍后再处理链接词条
+ # 处理普通词条行
+ elif line.startswith("" 行
+ if line.startswith(">"):
+ word = None # 重置当前词条
+ continue
+ # 处理 "@@@LINK=" 行
+ elif line.startswith("@@@LINK="):
+ # 从 phonetics.ddb 取 phon_uk 和 phon_us
+ linked_to = line.lstrip("@@@LINK=")
+ cursor.execute(
+ """
+ SELECT phon_uk, phon_us
+ FROM words
+ WHERE word = ?""",
+ (linked_to,),
+ )
+ result = cursor.fetchone()
+
+ if result:
+ (phon_uk, phon_us) = result
+ # 更新 phoneticsdb_file 中的 phon_uk 和 phon_am 列
+ if phon_uk is not None and phon_us is not None:
+ # 将单词和音标信息插入数据库
+ try:
+ cursor.execute(
+ """
+ INSERT INTO words (word, phon_uk, phon_us)
+ VALUES (?, ?, ?)
+ """,
+ (word, phon_uk, phon_us),
+ )
+ except duckdb.duckdb.ConstraintException:
+ logger.debug(f"链接词条 {linked_to} 已存在,跳过")
+ pass
+ # 处理普通词条行
+ elif line.startswith(""
"1. 开源英汉字典:MIT / CC 双协议
"
"2. 标注牛津三千关键词:音标后 K字符
"
@@ -437,42 +708,167 @@ def generate_mdx(txt_file: Path, mdx_file: Path):
"4. 标注 COCA/BNC 的词频顺序
"
"5. 标注考试大纲信息:中高研四六托雅 等
"
"6. 增加汉英反查
"
- "", # 从 concise-enhanced.info.html 中提取
+ "",
)
- # 写入 MDX 文件
+ # 写入 concise-enhanced.mdx 文件
with mdx_file.open("wb") as outfile:
writer.write(outfile)
+def calculate_time_interval(log1, log2):
+ # Purpose: format the elapsed time between two log lines; first take the timestamp text before the first " | "
+ time_format = "%Y-%m-%d %H:%M:%S"
+ timestamp1 = log1.split(" | ")[0]
+ timestamp2 = log2.split(" | ")[0]
+
+ # Parse both timestamps into datetime objects
+ time1 = datetime.strptime(timestamp1, time_format)
+ time2 = datetime.strptime(timestamp2, time_format)
+
+ # Elapsed time from log1 to log2
+ time_diff = time2 - time1
+
+ # Split into hours, minutes and seconds; NOTE(review): assumes log2 is not earlier than log1 (negative totals are not handled)
+ total_seconds = int(time_diff.total_seconds())
+ hours = total_seconds // 3600
+ minutes = (total_seconds % 3600) // 60
+ seconds = total_seconds % 60
+
+ # Build the result string, omitting components that are 0
+ result = []
+ if hours > 0:
+ result.append(f"{hours}h")
+ if minutes > 0:
+ result.append(f"{minutes}m")
+ if seconds > 0:
+ result.append(f"{seconds}s")
+
+ # If every component is 0, report "1s" rather than an empty duration
+ if not result:
+ return "⏱️ 1s"
+
+ return f"⏱️ {''.join(result)}"
+
+
if __name__ == "__main__":
- logger.info("开始转换...")
+ # Configure logging; author's note: the full pipeline takes about 2h3m26s
+ configure_logging("logs", level="DEBUG")
+ # Record the start time
+ start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+ # Step 0: create the output directory and define file paths (author-noted runtime: 1s)
+ logger.info("创建输出目录、定义文件路径...")
+ # Source files
csv_file = Path("stardict.csv")
+ oald_txt = Path().cwd() / "oald-fork" / "oald-fork.txt"
+ # GoldenDict executable path
+ goldendict_exe = Path(r"C:\SSS\GoldenDict-ng\goldendict.exe")
+ # Output files
+ output_dir = Path("output")
+ stardictdb_file = Path() / output_dir / "stardict.ddb"
+ phoneticsdb_file = Path() / output_dir / "phonetics.ddb"
+ txt_file = Path() / output_dir / "stardict.txt"
+ mdx_file = Path("concise-enhanced.mdx")
+ # Check that the source files exist
if not csv_file.exists():
- raise FileNotFoundError(f"{duckdb_file} 未找到")
-
- duckdb_file = Path("stardict.ddb")
- if not duckdb_file.exists():
- convert_csv_to_duckdb(csv_file, duckdb_file)
-
- txt_file = Path("stardict.txt")
+ raise FileNotFoundError(
+ f"{csv_file} 未找到 (stardict.csv 由 stardict.7z 解包获得)"
+ )
+ if not oald_txt.exists(): # NOTE(review): only prints a message here - confirm a raise is not intended
+ print(
+ f"{oald_txt} 未找到 (oald-fork.txt 由 精装牛津十 mdx 使用 AutoMdxBuilder 解包获得)"
+ )
+ # Check that GoldenDict exists
+ if not goldendict_exe.exists():
+ raise FileNotFoundError(f"{goldendict_exe} 未找到 (GoldenDict-ng 软件)")
+ # Prepare the output directory: create it if missing and delete stale output files
+ if not output_dir.exists():
+ output_dir.mkdir()
+ if stardictdb_file.exists():
+ # Delete the old dictionary database file
+ stardictdb_file.unlink()
+ if phoneticsdb_file.exists():
+ # Delete the old phonetics database file
+ phoneticsdb_file.unlink()
if txt_file.exists():
# 删除旧的 TXT 文件
txt_file.unlink()
- convert_duckdb_to_txt(duckdb_file, txt_file, buffer_size=1_000_000)
- logger.info(f"TXT 文件已生成:{txt_file}")
-
- mdx_file = Path("concise-enhanced.mdx")
if mdx_file.exists():
# 删除旧的 MDX 文件
mdx_file.unlink()
+ logger.info("输出目录、文件路径配置完成")
+
+ # Record the end time of step 0
+ step0_end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ time_interval = calculate_time_interval(start_time, step0_end_time)
+ logger.success(f"步骤0完成,耗时: {time_interval}")
+
+ logger.info("开始转换...")
+
+ # Step 1: build stardict.ddb (author-noted runtime: 1s)
+ logger.info("生成 stardict.ddb 文件...")
+ convert_csv_to_stardictdb(csv_file, stardictdb_file)
+ logger.info(f"stardict.ddb 文件已生成:{stardictdb_file}")
+
+ # Record the end time of step 1
+ step1_end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ time_interval = calculate_time_interval(step0_end_time, step1_end_time)
+ logger.success(f"步骤1完成,耗时: {time_interval}")
+
+ # Step 2: build phonetics.ddb (author-noted runtime: 14m50s)
+ logger.info("生成 phonetics.ddb 文件...")
+ build_phonetics_ddb(oald_txt, phoneticsdb_file)
+ logger.info(f"phonetics.ddb 文件已生成:{phoneticsdb_file}")
+
+ # Record the end time of step 2
+ step2_end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ time_interval = calculate_time_interval(step1_end_time, step2_end_time)
+ logger.success(f"步骤2完成,耗时: {time_interval}")
+
+ # Step 3: update the phonetics in stardict.ddb (author-noted runtime: 1h13m20s)
+ logger.info("更新音标信息...")
+ update_phonetics_from_phoneticsdb_to_stardictdb(phoneticsdb_file, stardictdb_file)
+ logger.info(f"更新音标信息完成:{stardictdb_file}")
+
+ # Record the end time of step 3
+ step3_end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ time_interval = calculate_time_interval(step2_end_time, step3_end_time)
+ logger.success(f"步骤3完成,耗时: {time_interval}")
+
+ # Step 4: generate stardict.txt (author-noted runtime: 26m25s)
+ logger.info("生成 stardict.txt 文件...")
+ convert_stardictdb_to_txt(stardictdb_file, txt_file, buffer_size=1_000_000)
+ logger.info(f"stardict.txt 文件已生成:{txt_file}")
+
+ # Record the end time of step 4
+ step4_end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ time_interval = calculate_time_interval(step3_end_time, step4_end_time)
+ logger.success(f"步骤4完成,耗时: {time_interval}")
+
+ # Step 5: generate concise-enhanced.mdx (author-noted runtime: 8m49s)
+ logger.info("生成 concise-enhanced.mdx 文件...")
generate_mdx(txt_file, mdx_file)
- logger.info(f"MDX 文件已生成:{mdx_file}")
+ logger.info(f"concise-enhanced.mdx 文件已生成:{mdx_file}")
+
+ # Record the end time of step 5
+ step5_end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ time_interval = calculate_time_interval(step4_end_time, step5_end_time)
+ logger.success(f"步骤5完成,耗时: {time_interval}")
+
+ # Step 6: launch GoldenDict so that it rebuilds its index (author-noted runtime: 1s)
+ logger.info("打开 GoldenDict...")
+ import subprocess
+
+ subprocess.Popen(str(goldendict_exe))
+ logger.info("GoldenDict 已打开")
- # goldendict_exe = Path(r"C:\SSS\GoldenDict-ng\goldendict.exe")
- # if goldendict_exe.exists():
- # # 打开 GoldenDict,自动重建索引
- # import subprocess
+ # Record the end time of step 6
+ step6_end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+ time_interval = calculate_time_interval(step5_end_time, step6_end_time)
+ logger.success(f"步骤6完成,耗时: {time_interval}")
- # subprocess.run(str(goldendict_exe))
+ # Record the total elapsed time
+ total_time_interval = calculate_time_interval(start_time, step6_end_time)
+ logger.success(f"所有步骤完成,总耗时: {total_time_interval}")