logo资料库

python正向最大匹配分词和逆向最大匹配分词.docx

第1页 / 共3页
第2页 / 共3页
第3页 / 共3页
资料共3页,全文预览结束
python正向最大匹配分词和逆向最大匹配分词
# -*- coding:utf-8 -*-
"""Dictionary-based word segmentation by maximum matching.

Two greedy strategies are provided:

* forward maximum matching (正向最大匹配), scanning from the start of the text;
* backward maximum matching (逆向最大匹配), scanning from the end of the text.

The code runs unchanged on Python 2.6+ and Python 3.
"""
import io

CODEC = 'utf-8'

try:
    _TEXT_TYPE = unicode  # Python 2: text is ``unicode``, ``str`` is bytes
except NameError:
    _TEXT_TYPE = str  # Python 3: ``str`` is already Unicode text


def u(s, encoding):
    """Return *s* as Unicode text.

    Text is returned unchanged; anything else (normally a byte string) is
    decoded with *encoding*.
    """
    if isinstance(s, _TEXT_TYPE):
        return s
    return _TEXT_TYPE(s, encoding)


def _check_max_len(maxLen):
    """Reject window sizes that would stop the segmenters from advancing."""
    if maxLen < 1:
        # With a window of 0 (or less) no character is ever consumed, so the
        # segmentation loop would never terminate.
        raise ValueError("maxLen must be at least 1, got %r" % (maxLen,))


def _print_dict_words(wordDict):
    """Debug output: print every dictionary entry, then a blank separator."""
    for word in wordDict:
        print('word:  %s' % word)
    print("\n")


def fwd_mm_seg(wordDict, maxLen, str):
    """Segment *str* by forward maximum matching.

    Starting at the beginning of the text, take the longest prefix of at
    most *maxLen* characters that is in *wordDict*; if no prefix of two or
    more characters matches, emit a single character. Repeat on the rest.

    Args:
        wordDict: container of known words (only ``in`` is used on it).
        maxLen: maximum candidate word length in characters; must be >= 1.
        str: the text to segment (the name shadows the builtin but is kept
            for backward compatibility with existing callers).

    Returns:
        The list of segments in text order; joined together they reproduce
        the input exactly.

    Raises:
        ValueError: if *maxLen* is smaller than 1.
    """
    _check_max_len(maxLen)
    _print_dict_words(wordDict)

    segStr = str
    wordList = []
    while segStr:
        wordLen = min(maxLen, len(segStr))
        subStr = segStr[:wordLen]
        print('subStr:  %s' % subStr)
        # Shrink the candidate from the right until it is a known word or
        # only one character is left.
        while wordLen > 1:
            if subStr in wordDict:
                print("subStr1: %r" % subStr)
                break
            print("subStr2: %r" % subStr)
            wordLen -= 1
            subStr = subStr[:wordLen]
        wordList.append(subStr)
        segStr = segStr[wordLen:]

    for wordstr in wordList:
        print('wordstr:  %s' % wordstr)
    return wordList


def bwd_mm_seg(wordDict, maxLen, str):
    """Segment *str* by backward maximum matching.

    Starting at the end of the text, take the longest suffix of at most
    *maxLen* characters that is in *wordDict*; if no suffix of two or more
    characters matches, emit a single character. Repeat on what precedes it.

    Args:
        wordDict: container of known words (only ``in`` is used on it).
        maxLen: maximum candidate word length in characters; must be >= 1.
        str: the text to segment (the name shadows the builtin but is kept
            for backward compatibility with existing callers).

    Returns:
        The list of segments in text order; joined together they reproduce
        the input exactly.

    Raises:
        ValueError: if *maxLen* is smaller than 1.
    """
    _check_max_len(maxLen)
    _print_dict_words(wordDict)

    segStr = str
    wordList = []
    while segStr:
        wordLen = min(maxLen, len(segStr))
        subStr = segStr[-wordLen:]
        print('subStr:  %s' % subStr)
        # Shrink the candidate from the left until it is a known word or
        # only one character is left.
        while wordLen > 1:
            if subStr in wordDict:
                print("subStr1: %r" % subStr)
                break
            print("subStr2: %r" % subStr)
            wordLen -= 1
            subStr = subStr[-wordLen:]
        wordList.append(subStr)
        segStr = segStr[:-wordLen]

    # Segments were collected right-to-left; restore text order.
    wordList.reverse()
    for wordstr in wordList:
        print('wordstr:  %s' % wordstr)
    return wordList


def _load_word_dict(path):
    """Read one word per line from the UTF-8 file *path* into a dict."""
    wordDict = {}
    with io.open(path, encoding=CODEC) as fp_dict:
        for eachWord in fp_dict:
            wordDict[u(eachWord.strip(), CODEC)] = 1
    return wordDict


def main():
    """Segment a sample sentence forwards, then backwards, using words.dic."""
    wordDict = _load_word_dict('words.dic')
    segStr = u'你好世界 hello world'
    for segment in (fwd_mm_seg, bwd_mm_seg):
        print(segStr)
        wordList = segment(wordDict, 10, segStr)
        print("==".join(wordList))


if __name__ == '__main__':
    main()
分享到:
收藏