From 0374361f733d048e3229da0b5682b2907a715e21 Mon Sep 17 00:00:00 2001 From: journey-ad Date: Fri, 21 Jun 2024 02:29:39 +0800 Subject: [PATCH] :sparkles: Support keyword tokenizer using jieba --- app/api/graphql/route.ts | 48 ++++++++++++++++++++------------ app/search/page.tsx | 1 + components/FileList.tsx | 4 +-- components/SearchResultsItem.tsx | 8 +++--- components/SearchResultsList.tsx | 4 ++- lib/jieba.ts | 18 ++++++++++++ next.config.js | 5 ++++ package.json | 1 + utils/index.ts | 13 +++++---- 9 files changed, 71 insertions(+), 31 deletions(-) create mode 100644 lib/jieba.ts diff --git a/app/api/graphql/route.ts b/app/api/graphql/route.ts index e6f59eb..2c26c38 100644 --- a/app/api/graphql/route.ts +++ b/app/api/graphql/route.ts @@ -5,6 +5,7 @@ import { NextRequest } from "next/server"; import { formatTorrent } from "./service"; +import { jiebaExtract } from "@/lib/jieba"; import { query } from "@/lib/pgdb"; import { SEARCH_KEYWORD_SPLIT_REGEX } from "@/config/constant"; @@ -40,6 +41,7 @@ const typeDefs = gql` } type SearchResult { + keywords: [String!]! torrents: [Torrent!]! total_count: Int! has_more: Boolean! @@ -95,21 +97,35 @@ const resolvers = { }; const sizeFilter = sizeFilterMap[queryInput.filterSize] || ""; - // Extract and process keywords - const keywords = Array.from( - new Set( - queryInput.keyword - .trim() - .split(SEARCH_KEYWORD_SPLIT_REGEX) - .filter((k: string) => k.trim().length >= 2), - ), + // Extract keywords using regex tokenizer + let keywords = queryInput.keyword + .trim() + .split(SEARCH_KEYWORD_SPLIT_REGEX); + + // Use jieba to extract additional keywords if input is a full sentence + if (keywords.length === 1 && queryInput.keyword.length >= 4) { + keywords.push(...jiebaExtract(queryInput.keyword)); + } + + // Ensure full keyword is the first item + if (!keywords.includes(queryInput.keyword)) { + keywords.unshift(queryInput.keyword); + } + + // Remove duplicates and filter out keywords shorter than 2 characters to avoid slow SQL queries + keywords = Array.from( + new Set(keywords.filter((k: string) => k.trim().length >= 2)), ); // Construct the keyword filter condition + // The full keyword (first item) is handled separately let keywordFilter = `torrents.name ILIKE $1`; - if (keywords.length > 0) { + // Combine remaining keywords with `AND`, then with full keyword using `OR` + // Ensures the full keyword matches first, followed by individual tokens + if (keywords.length > 1) { keywordFilter += ` OR ${keywords + .slice(1) .map((_: any, i: number) => `torrents.name ILIKE $${i + 2}`) .join(" AND ")}`; } @@ -132,8 +148,8 @@ const resolvers = { ${timeFilter} -- 时间范围过滤条件 ${sizeFilter} -- 大小范围过滤条件 ${orderBy ? `ORDER BY ${orderBy}` : ""} -- 排序方式 - LIMIT $${keywords.length + 2} -- 返回数量 - OFFSET $${keywords.length + 3} -- 分页偏移 + LIMIT $${keywords.length + 1} -- 返回数量 + OFFSET $${keywords.length + 2} -- 分页偏移 ) -- 从过滤后的数据中查询文件信息 SELECT @@ -163,13 +179,12 @@ const resolvers = { `; const params = [ - `%${queryInput.keyword}%`, ...keywords.map((k: any) => `%${k}%`), queryInput.limit, queryInput.offset, ]; - // console.log(sql, params); + // console.log(sql, params, keywords); const queryArr = [query(sql, params)]; @@ -186,10 +201,7 @@ const resolvers = { ${sizeFilter} ) AS limited_total; `; - const countParams = [ - `%${queryInput.keyword}%`, - ...keywords.map((k: any) => `%${k}%`), - ]; + const countParams = [...keywords.map((k: any) => `%${k}%`)]; queryArr.push(query(countSql, countParams)); } else { @@ -207,7 +219,7 @@ const resolvers = { queryInput.withTotalCount && queryInput.offset + queryInput.limit < total_count; - return { torrents, total_count, has_more }; + return { keywords, torrents, total_count, has_more }; }, torrentByHash: async (_: any, { hash }: any) => { // SQL query to fetch torrent data and files information by hash diff --git a/app/search/page.tsx b/app/search/page.tsx index 18c315b..addeb81 100644 --- a/app/search/page.tsx +++ b/app/search/page.tsx @@ -154,6 +154,7 @@ export default async function SearchPage({ diff --git a/components/FileList.tsx b/components/FileList.tsx index 7ecabbd..c871532 100644 --- a/components/FileList.tsx +++ b/components/FileList.tsx @@ -120,7 +120,7 @@ function FileItem({ highlight, }: { file: FileItem | Directory; - highlight?: string; + highlight?: string | string[]; }) { return (
  • @@ -57,7 +57,7 @@ export default function SearchResultsItem({ diff --git a/components/SearchResultsList.tsx b/components/SearchResultsList.tsx index 78b2fbb..b505cc3 100644 --- a/components/SearchResultsList.tsx +++ b/components/SearchResultsList.tsx @@ -12,11 +12,13 @@ import { SEARCH_PARAMS, SEARCH_PAGE_MAX } from "@/config/constant"; export default function SearchResultsList({ resultList, + keywords, cost_time = 0, total_count = 0, searchOption, }: { resultList: SearchResultsListProps["torrents"]; + keywords: string[]; cost_time: number; total_count: number; searchOption: { @@ -127,7 +129,7 @@ export default function SearchResultsList({ {resultList.map((item) => (
    - +
    ))} diff --git a/lib/jieba.ts b/lib/jieba.ts new file mode 100644 index 0000000..421cefc --- /dev/null +++ b/lib/jieba.ts @@ -0,0 +1,18 @@ +import { load, cut, extract } from "@node-rs/jieba"; + +let jiebaLoaded = false; + +export function loadJieba() { + if (!jiebaLoaded) { + load(); + jiebaLoaded = true; + } +} + +export function jiebaCut(text: string) { + return cut(text); +} + +export function jiebaExtract(text: string) { + return extract(text, 3).map((_) => _.keyword); +} diff --git a/next.config.js b/next.config.js index d41608c..fb932a7 100644 --- a/next.config.js +++ b/next.config.js @@ -8,6 +8,11 @@ console.log("[Next] build mode:", mode); /** @type {import('next').NextConfig} */ const nextConfig = { output: mode, + experimental: { + serverComponentsExternalPackages: [ + '@node-rs/jieba' + ] + } } module.exports = withNextIntl(nextConfig); diff --git a/package.json b/package.json index c568763..db00307 100644 --- a/package.json +++ b/package.json @@ -14,6 +14,7 @@ "@nextui-org/react": "^2.4.1", "@nextui-org/system": "2.2.1", "@nextui-org/theme": "2.2.5", + "@node-rs/jieba": "^1.10.3", "@react-aria/ssr": "3.9.4", "@react-aria/visually-hidden": "3.8.12", "@tsparticles/react": "^3.0.0", diff --git a/utils/index.ts b/utils/index.ts index 9f8b7f4..10a2a17 100644 --- a/utils/index.ts +++ b/utils/index.ts @@ -60,15 +60,16 @@ export function getSizeColor(size: number | string) { } } -export function parseHighlight(text: string, highlight: string) { +export function parseHighlight(text: string, highlight: string | string[]) { if (!text || !highlight) { return text; } - const keywords = highlight - .split(SEARCH_KEYWORD_SPLIT_REGEX) - .filter((k: string) => k.trim().length >= 2); - - keywords.unshift(highlight); + const keywords = + typeof highlight === "string" + ? [highlight, ...highlight.split(SEARCH_KEYWORD_SPLIT_REGEX)].filter( + (k: string) => k.trim().length >= 2, + ) + : highlight; // Function to escape HTML special characters to avoid interference function escapeHtml(unsafe: string) {