Python 正向最大匹配分词和逆向最大匹配分词
正向最大匹配
# -*- coding:utf-8 -*-
"""Forward maximum-matching (FMM) word segmentation demo.

Loads a word list from ``words.dic`` (one word per line) and segments a
sample sentence by repeatedly taking the longest dictionary word found at
the front of the remaining text.
"""

CODEC = 'utf-8'

# The text type is ``unicode`` on Python 2 and ``str`` on Python 3.
try:
    _text_type = unicode
except NameError:
    _text_type = str


def u(s, encoding):
    """Return *s* as a text string, decoding it with *encoding* if needed.

    Text strings are returned unchanged; anything else (byte strings) is
    decoded using *encoding*.
    """
    if isinstance(s, _text_type):
        return s
    return _text_type(s, encoding)


def fwd_mm_seg(wordDict, maxLen, str):
    """Segment *str* with the forward maximum-matching algorithm.

    Starting at the left edge, a window of at most *maxLen* characters is
    shrunk from the right until it is a key of *wordDict* or only one
    character remains; that piece is emitted and the scan continues after
    it. Progress is traced on stdout.

    Parameters:
        wordDict: container of known words (membership and iteration are
            the only operations used).
        maxLen: maximum candidate word length, in characters; must be >= 1.
        str: the text to segment (the name shadows the builtin but is kept
            so existing keyword callers keep working).

    Returns:
        The list of segments in left-to-right order; joined together they
        reproduce the input text.

    Raises:
        ValueError: if *maxLen* is smaller than 1 (a zero-length window
            would never consume any input).
    """
    if maxLen < 1:
        raise ValueError("maxLen must be at least 1")

    wordList = []
    segStr = str
    segStrLen = len(segStr)

    for word in wordDict:
        print("word:  %s" % word)
    print("\n")

    while segStrLen > 0:
        # Never look at more characters than are left.
        wordLen = min(maxLen, segStrLen)
        subStr = segStr[0:wordLen]
        print("subStr:  %s" % subStr)

        # Drop trailing characters until a dictionary word is found; a
        # single character is always accepted as a fallback.
        while wordLen > 1:
            if subStr in wordDict:
                print("subStr1: %r" % subStr)
                break
            print("subStr2: %r" % subStr)
            wordLen = wordLen - 1
            subStr = subStr[0:wordLen]

        wordList.append(subStr)
        segStr = segStr[wordLen:]
        segStrLen = segStrLen - wordLen

    for wordstr in wordList:
        print("wordstr:  %s" % wordstr)
    return wordList


def main():
    """Load ``words.dic`` and print the segmentation of a sample sentence."""
    wordDict = {}
    # Binary mode keeps decoding explicit and identical on Python 2 and 3;
    # the ``with`` block guarantees the file is closed.
    with open('words.dic', 'rb') as fp_dict:
        for eachWord in fp_dict:
            wordDict[u(eachWord.strip(), CODEC)] = 1

    segStr = u'你好世界 hello world'
    print(segStr)
    wordList = fwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))


if __name__ == '__main__':
    main()
逆向最大匹配
# -*- coding:utf-8 -*-
"""Backward maximum-matching (BMM) word segmentation demo.

Loads a word list from ``words.dic`` (one word per line) and segments a
sample sentence by repeatedly taking the longest dictionary word found at
the end of the remaining text.
"""

# The text type is ``unicode`` on Python 2 and ``str`` on Python 3.
try:
    _text_type = unicode
except NameError:
    _text_type = str


def u(s, encoding):
    """Return *s* as a text string, decoding it with *encoding* if needed.

    Text strings are returned unchanged; anything else (byte strings) is
    decoded using *encoding*.
    """
    if isinstance(s, _text_type):
        return s
    return _text_type(s, encoding)


CODEC = 'utf-8'


def bwd_mm_seg(wordDict, maxLen, str):
    """Segment *str* with the backward maximum-matching algorithm.

    Starting at the right edge, a window of at most *maxLen* characters is
    shrunk from the left until it is a key of *wordDict* or only one
    character remains; that piece is emitted and the scan continues before
    it. Progress is traced on stdout.

    Parameters:
        wordDict: container of known words (membership and iteration are
            the only operations used).
        maxLen: maximum candidate word length, in characters; must be >= 1.
        str: the text to segment (the name shadows the builtin but is kept
            so existing keyword callers keep working).

    Returns:
        The list of segments in left-to-right order; joined together they
        reproduce the input text.

    Raises:
        ValueError: if *maxLen* is smaller than 1 (a zero-length window
            would never consume any input).
    """
    if maxLen < 1:
        raise ValueError("maxLen must be at least 1")

    wordList = []
    segStr = str
    segStrLen = len(segStr)

    for word in wordDict:
        print("word:  %s" % word)
    print("\n")

    while segStrLen > 0:
        # Never look at more characters than are left.
        wordLen = min(maxLen, segStrLen)
        subStr = segStr[-wordLen:None]
        print("subStr:  %s" % subStr)

        # Drop leading characters until a dictionary word is found; a
        # single character is always accepted as a fallback.
        while wordLen > 1:
            if subStr in wordDict:
                print("subStr1: %r" % subStr)
                break
            print("subStr2: %r" % subStr)
            wordLen = wordLen - 1
            subStr = subStr[-wordLen:None]

        wordList.append(subStr)
        segStr = segStr[0:-wordLen]
        segStrLen = segStrLen - wordLen

    # Segments were collected right-to-left; restore reading order.
    wordList.reverse()
    for wordstr in wordList:
        print("wordstr:  %s" % wordstr)
    return wordList


def main():
    """Load ``words.dic`` and print the segmentation of a sample sentence."""
    wordDict = {}
    # Binary mode keeps decoding explicit and identical on Python 2 and 3;
    # the ``with`` block guarantees the file is closed.
    with open('words.dic', 'rb') as fp_dict:
        for eachWord in fp_dict:
            wordDict[u(eachWord.strip(), CODEC)] = 1

    segStr = u'你好世界 hello world'
    print(segStr)
    wordList = bwd_mm_seg(wordDict, 10, segStr)
    print("==".join(wordList))


if __name__ == '__main__':
    main()