construct_index.py
# -*- coding: utf-8 -*-
############################################
#
# Author: Chuwei Luo
# Email: luochuwei@gmail.com
# Date: 10/05/2016
# Usage: construct index
#
############################################
import cPickle
import os
import jieba
from whoosh.analysis import Tokenizer, Token
from whoosh.compat import text_type
from whoosh import index
from whoosh import qparser
from whoosh.fields import Schema, TEXT, ID
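# pid_p_r is assumed (from its use below) to be a pickled sequence in which
# pid_p_r[pid][0] holds the UTF-8 passage text for document pid.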
with open(r'pid_p_r.pkl', 'rb') as f:
    pid_p_r = cPickle.load(f)
class ChineseTokenizer(Tokenizer):
    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        # Segment with the jieba library in search mode.  jieba.tokenize()
        # yields (word, start, end) tuples, so repeated words get their true
        # offsets; the original value.find(w) always matched the first
        # occurrence, giving wrong positions for duplicates.
        for pos, (w, start, end) in enumerate(jieba.tokenize(value, mode='search')):
            t.original = t.text = w
            t.boost = 1.0
            if positions:
                t.pos = start_pos + pos
            if chars:
                t.startchar = start_char + start
                t.endchar = start_char + end
            yield t  # yield one Token per segment via the generator
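# A whoosh analyzer is any callable that turns text into a token stream;
# with no extra filters needed, the tokenizer alone serves as the analyzer.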
def ChineseAnalyzer():
return ChineseTokenizer()
analyzer = ChineseAnalyzer()
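# All three fields are stored so that hits can display title/path and the
# highlights() call in find() can work from the stored content text.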
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT(stored=True, analyzer=analyzer))
if not os.path.exists("index"):
os.mkdir("index")
ix = index.create_in("index",schema)
ix = index.open_dir("index")
writer = ix.writer()
for pid in xrange(len(pid_p_r)):
    doc_id = unicode(pid)
    writer.add_document(title=doc_id, path=u"/" + doc_id,
                        content=pid_p_r[pid][0].decode("utf-8"))
writer.commit()
def find(text):
    og = qparser.OrGroup.factory(0.9)
    parser = qparser.QueryParser("content", schema, group=og)
    with ix.searcher() as searcher:
        q = parser.parse(text.decode("utf-8"))
        results = searcher.search(q)
        # A Results object becomes unusable once the searcher is closed, so
        # the original `return results` after the with-block handed back a
        # dead object.  Copy the stored fields out while it is still open.
        hits = []
        for hit in results:
            print hit.highlights("content")
            hits.append(hit.fields())
        return hits
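# Minimal usage sketch: running the module builds the index from pid_p_r.pkl
# at import time, then queries it.  The query string below is the example
# from the original commented-out parse() call.
if __name__ == '__main__':
    hits = find("今天天气不错")  # UTF-8 byte string; find() decodes it
    print "%d hit(s)" % len(hits)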