Skip to content

Commit

Permalink
✨ Support keyword tokenizer using jieba
Browse files Browse the repository at this point in the history
  • Loading branch information
journey-ad committed Jun 20, 2024
1 parent 776f67d commit 0374361
Show file tree
Hide file tree
Showing 9 changed files with 71 additions and 31 deletions.
48 changes: 30 additions & 18 deletions app/api/graphql/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import { NextRequest } from "next/server";

import { formatTorrent } from "./service";

import { jiebaExtract } from "@/lib/jieba";
import { query } from "@/lib/pgdb";
import { SEARCH_KEYWORD_SPLIT_REGEX } from "@/config/constant";

Expand Down Expand Up @@ -40,6 +41,7 @@ const typeDefs = gql`
}
type SearchResult {
keywords: [String!]!
torrents: [Torrent!]!
total_count: Int!
has_more: Boolean!
Expand Down Expand Up @@ -95,21 +97,35 @@ const resolvers = {
};
const sizeFilter = sizeFilterMap[queryInput.filterSize] || "";

// Extract and process keywords
const keywords = Array.from(
new Set(
queryInput.keyword
.trim()
.split(SEARCH_KEYWORD_SPLIT_REGEX)
.filter((k: string) => k.trim().length >= 2),
),
// Extract keywords using regex tokenizer
let keywords = queryInput.keyword
.trim()
.split(SEARCH_KEYWORD_SPLIT_REGEX);

// Use jieba to extract additional keywords if input is a full sentence
if (keywords.length === 1 && queryInput.keyword.length >= 4) {
keywords.push(...jiebaExtract(queryInput.keyword));
}

// Ensure full keyword is the first item
if (!keywords.includes(queryInput.keyword)) {
keywords.unshift(queryInput.keyword);
}

// Remove duplicates and filter out keywords shorter than 2 characters to avoid slow SQL queries
keywords = Array.from(
new Set(keywords.filter((k: string) => k.trim().length >= 2)),
);

// Construct the keyword filter condition
// The full keyword (first item) is handled separately
let keywordFilter = `torrents.name ILIKE $1`;

if (keywords.length > 0) {
// Combine remaining keywords with `AND`, then with full keyword using `OR`
// Ensures the full keyword matches first, followed by individual tokens
if (keywords.length > 1) {
keywordFilter += ` OR ${keywords
.slice(1)
.map((_: any, i: number) => `torrents.name ILIKE $${i + 2}`)
.join(" AND ")}`;
}
Expand All @@ -132,8 +148,8 @@ const resolvers = {
${timeFilter} -- 时间范围过滤条件
${sizeFilter} -- 大小范围过滤条件
${orderBy ? `ORDER BY ${orderBy}` : ""} -- 排序方式
LIMIT $${keywords.length + 2} -- 返回数量
OFFSET $${keywords.length + 3} -- 分页偏移
LIMIT $${keywords.length + 1} -- 返回数量
OFFSET $${keywords.length + 2} -- 分页偏移
)
-- 从过滤后的数据中查询文件信息
SELECT
Expand Down Expand Up @@ -163,13 +179,12 @@ const resolvers = {
`;

const params = [
`%${queryInput.keyword}%`,
...keywords.map((k: any) => `%${k}%`),
queryInput.limit,
queryInput.offset,
];

// console.log(sql, params);
// console.log(sql, params, keywords);

const queryArr = [query(sql, params)];

Expand All @@ -186,10 +201,7 @@ const resolvers = {
${sizeFilter}
) AS limited_total;
`;
const countParams = [
`%${queryInput.keyword}%`,
...keywords.map((k: any) => `%${k}%`),
];
const countParams = [...keywords.map((k: any) => `%${k}%`)];

queryArr.push(query(countSql, countParams));
} else {
Expand All @@ -207,7 +219,7 @@ const resolvers = {
queryInput.withTotalCount &&
queryInput.offset + queryInput.limit < total_count;

return { torrents, total_count, has_more };
return { keywords, torrents, total_count, has_more };
},
torrentByHash: async (_: any, { hash }: any) => {
// SQL query to fetch torrent data and files information by hash
Expand Down
1 change: 1 addition & 0 deletions app/search/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ export default async function SearchPage({
<SearchResultsList
cost_time={cost_time}
resultList={data.torrents}
keywords={data.keywords}
searchOption={searchOption}
total_count={totalCount}
/>
Expand Down
4 changes: 2 additions & 2 deletions components/FileList.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ function FileItem({
highlight,
}: {
file: FileItem | Directory;
highlight?: string;
highlight?: string | string[];
}) {
return (
<li
Expand Down Expand Up @@ -188,7 +188,7 @@ export default function FileList({
max = -1,
}: {
torrent: TorrentItemProps;
highlight?: string;
highlight?: string | string[];
max?: number;
}) {
const t = useTranslations();
Expand Down
8 changes: 4 additions & 4 deletions components/SearchResultsItem.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ import { SEARCH_DISPLAY_FILES_MAX } from "@/config/constant";

export default function SearchResultsItem({
item,
keyword,
keywords,
}: {
item: TorrentItemProps;
keyword: string;
keywords: string | string[];
}) {
const data = {
...item,
Expand All @@ -47,7 +47,7 @@ export default function SearchResultsItem({
<Link isExternal href={data.url} title={data.name}>
<h2
dangerouslySetInnerHTML={{
__html: parseHighlight(data.name, keyword),
__html: parseHighlight(data.name, keywords),
}}
className="text-md leading-normal"
/>
Expand All @@ -57,7 +57,7 @@ export default function SearchResultsItem({
<Divider className="bg-gray-200 dark:bg-slate-700" />
<CardBody className="px-4">
<FileList
highlight={keyword}
highlight={keywords}
max={SEARCH_DISPLAY_FILES_MAX}
torrent={data as TorrentItemProps}
/>
Expand Down
4 changes: 3 additions & 1 deletion components/SearchResultsList.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@ import { SEARCH_PARAMS, SEARCH_PAGE_MAX } from "@/config/constant";

export default function SearchResultsList({
resultList,
keywords,
cost_time = 0,
total_count = 0,
searchOption,
}: {
resultList: SearchResultsListProps["torrents"];
keywords: string[];
cost_time: number;
total_count: number;
searchOption: {
Expand Down Expand Up @@ -127,7 +129,7 @@ export default function SearchResultsList({

{resultList.map((item) => (
<div key={item.hash} className="mb-6">
<SearchResultsItem item={item} keyword={searchOption.keyword} />
<SearchResultsItem item={item} keywords={keywords} />
</div>
))}

Expand Down
18 changes: 18 additions & 0 deletions lib/jieba.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import { load, cut, extract } from "@node-rs/jieba";

/** Set once `load()` has completed successfully via `loadJieba`. */
let jiebaLoaded = false;

/**
 * Call jieba's `load()` at most once through this module.
 *
 * Subsequent calls are no-ops. The flag is only flipped after `load()`
 * returns, so if `load()` throws the next call will attempt it again.
 */
export function loadJieba() {
  if (jiebaLoaded) {
    return;
  }

  load();
  jiebaLoaded = true;
}

/**
 * Segment `text` by delegating to jieba's `cut`.
 *
 * @param text - The string to segment.
 * @returns The result of `cut(text)`, unchanged.
 */
export function jiebaCut(text: string) {
  const segments = cut(text);

  return segments;
}

/**
 * Extract the top keywords from `text` using jieba's `extract`.
 *
 * @param text - The string to extract keywords from.
 * @param topN - Maximum number of keywords requested from `extract`.
 *   Defaults to 3, the value that was previously hard-coded, so existing
 *   single-argument callers behave exactly as before.
 * @returns The `keyword` field of each entry returned by `extract`, in the
 *   order `extract` produced them.
 */
export function jiebaExtract(text: string, topN = 3) {
  return extract(text, topN).map((entry) => entry.keyword);
}
5 changes: 5 additions & 0 deletions next.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ console.log("[Next] build mode:", mode);
/** @type {import('next').NextConfig} */
const nextConfig = {
output: mode,
experimental: {
serverComponentsExternalPackages: [
'@node-rs/jieba'
]
}
}

module.exports = withNextIntl(nextConfig);
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"@nextui-org/react": "^2.4.1",
"@nextui-org/system": "2.2.1",
"@nextui-org/theme": "2.2.5",
"@node-rs/jieba": "^1.10.3",
"@react-aria/ssr": "3.9.4",
"@react-aria/visually-hidden": "3.8.12",
"@tsparticles/react": "^3.0.0",
Expand Down
13 changes: 7 additions & 6 deletions utils/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,16 @@ export function getSizeColor(size: number | string) {
}
}

export function parseHighlight(text: string, highlight: string) {
export function parseHighlight(text: string, highlight: string | string[]) {
if (!text || !highlight) {
return text;
}
const keywords = highlight
.split(SEARCH_KEYWORD_SPLIT_REGEX)
.filter((k: string) => k.trim().length >= 2);

keywords.unshift(highlight);
const keywords =
typeof highlight === "string"
? [highlight, ...highlight.split(SEARCH_KEYWORD_SPLIT_REGEX)].filter(
(k: string) => k.trim().length >= 2,
)
: highlight;

// Function to escape HTML special characters to avoid interference
function escapeHtml(unsafe: string) {
Expand Down

0 comments on commit 0374361

Please sign in to comment.