python雙向最大匹配演算法 雙向最大匹配分詞演算法

2021-10-11 12:29:13 字數 3972 閱讀 4750

#!/usr/bin/python

#encoding=gbk

import sys

# Longest candidate word length, counted in decoded characters; initdct()
# raises it when the dictionary contains a longer entry.
dictmaxlength = 5

# Dictionary word -> the full (stripped) dictionary line it came from.
dctdict = {}

# Codec used to decode/encode the dictionary and the text being segmented.
encoding = 'gbk'

# 初始化字典、初始化最大詞長

def initdct(dct):
    """Load the word dictionary from the file at path `dct`.

    Each line is expected to hold a word, optionally followed by
    tab-separated extra fields.  The word (as an encoded byte string) is
    stored in the module-level `dctdict`, mapped to its whole stripped
    line, and the module-level `dictmaxlength` is raised to the length (in
    decoded characters) of the longest word seen.

    Raises IOError/OSError if the file cannot be opened and
    UnicodeDecodeError if a word is not valid in `encoding`.
    """
    global dictmaxlength

    # Binary mode: the words are kept as encoded bytes and decoded
    # explicitly below; `with` closes the file even if decoding fails.
    with open(dct, "rb") as dctobj:
        for line in dctobj:
            line = line.strip()
            word = line.split(b"\t")[0].strip()
            if not word:
                # Blank lines would otherwise register an empty-string key.
                continue
            dctdict[word] = line
            # Length must be measured in characters, not encoded bytes.
            length = len(word.decode(encoding))
            if dictmaxlength < length:
                dictmaxlength = length

# 正向最大匹配演算法

def maximunmathching(sent):
    """Segment `sent` with forward (left-to-right) maximum matching.

    `sent` is a byte string in the module-level `encoding`.  Starting at
    the left edge, the longest slice of at most `dictmaxlength` characters
    that is a key of `dctdict` is taken as the next word; if no
    multi-character slice matches, a single character is emitted.  Single
    space characters are dropped instead of being emitted.

    Returns the words as an encoded byte string, separated by single
    spaces and stripped of leading/trailing whitespace.
    """
    text = sent.strip().decode(encoding)
    sentlen = len(text)
    words = []
    index = 0
    while index < sentlen:
        # Never probe past the end of the sentence, and always try at
        # least one character so the scan is guaranteed to advance.
        window = max(1, min(dictmaxlength, sentlen - index))
        for size in range(window, 0, -1):
            encoded = text[index:index + size].encode(encoding)
            # A lone character is accepted whether or not it is in the
            # dictionary; longer slices must be dictionary words.
            if size == 1 or encoded in dctdict:
                break
        index += size
        if encoded != b" ":
            words.append(encoded)
    return b" ".join(words).strip()

# 逆向最大匹配演算法

def reversemaximunmathching(sent):
    """Segment `sent` with reverse (right-to-left) maximum matching.

    `sent` is a byte string in the module-level `encoding`.  Starting at
    the right edge, the longest slice of at most `dictmaxlength` characters
    that is a key of `dctdict` is taken as the next word; if no
    multi-character slice matches, a single character is emitted.  Single
    space characters are dropped instead of being emitted.

    Returns the words in reading order as an encoded byte string separated
    by single spaces.

    NOTE(review): the published listing initialised the result list with
    no value and never appended to it; the accumulation below mirrors
    maximunmathching() -- confirm against the original script.
    """
    text = sent.strip().decode(encoding)
    words = []
    index = len(text)
    # Stop once the left edge is reached; looping at index == 0 would only
    # produce an empty token.
    while index > 0:
        # Never probe before the start of the sentence, and always try at
        # least one character so the scan is guaranteed to advance.
        window = max(1, min(dictmaxlength, index))
        for size in range(window, 0, -1):
            encoded = text[index - size:index].encode(encoding)
            # A lone character is accepted whether or not it is in the
            # dictionary; longer slices must be dictionary words.
            if size == 1 or encoded in dctdict:
                break
        index -= size
        if encoded != b" ":
            words.append(encoded)
    # Words were collected right-to-left; restore reading order.
    words.reverse()
    return b" ".join(words)

# 非字典詞、單字字典詞、總詞數 越少越好

def segmenter(sent):
    """Segment `sent` with bidirectional maximum matching.

    Runs both maximunmathching() and reversemaximunmathching() on `sent`.
    If the two stripped results are identical that result is returned;
    otherwise bmmresult() chooses between them.
    """
    mm = maximunmathching(sent).strip()
    rmm = reversemaximunmathching(sent).strip()
    if mm == rmm:
        return mm
    return bmmresult(mm, rmm)

# 非字典詞、單字字典詞、總詞數 越少越好

def bmmresult(mm, rmm):
    """Choose between a forward (`mm`) and a reverse (`rmm`) segmentation.

    Both arguments are space-separated, encoded byte strings.  Three
    counts are compared, and on each one the segmentation with the smaller
    count scores a point:

    * words that are not keys of `dctdict`,
    * words that are exactly one character long,
    * total number of words.

    Returns `mm` only when it scores strictly more points than `rmm`;
    ties go to `rmm`.
    """

    def _counts(segmentation):
        # (out-of-dictionary words, single-character words, total words)
        # NOTE(review): the single-character count covers every word,
        # dictionary or not -- the flattened listing does not show whether
        # that test was nested under the dictionary check.
        words = segmentation.split(b" ")
        oov = sum(1 for word in words if word not in dctdict)
        single = sum(1 for word in words if len(word.decode(encoding)) == 1)
        return oov, single, len(words)

    mm_points = 0
    rmm_points = 0
    for mm_count, rmm_count in zip(_counts(mm), _counts(rmm)):
        if mm_count < rmm_count:
            mm_points += 1
        elif mm_count > rmm_count:
            rmm_points += 1

    # The original compared the forward score with itself, so the forward
    # result could never be selected.
    if mm_points > rmm_points:
        return mm
    return rmm

def handlefile(input, output):
    """Segment every line of the file `input` and write it to `output`.

    Each input line is stripped, its ASCII letters are lower-cased, and the
    line is passed to segmenter(); the stripped result is written to
    `output` followed by a newline.  A progress line is printed every
    100000 input lines.

    Lines are handled as Python 2 byte strings in the module-level
    `encoding`.
    """
    # `with` closes both files even if segmentation raises part-way.
    with open(input) as inputobj, open(output, "w") as outputobj:
        for index, line in enumerate(inputobj, 1):
            if index % 100000 == 0:
                print(str(index) + "\r")
            text = line.strip().decode(encoding)
            # Lower-case on decoded text, ASCII letters only.  Calling
            # lower() on the raw bytes corrupts GBK characters whose second
            # byte lies in 0x41-0x5A, and a full Unicode lower() can yield
            # characters that cannot be encoded back.
            lowered = u"".join(
                ch.lower() if ch < u"\x80" else ch for ch in text
            )
            seg = segmenter(lowered.encode(encoding))
            outputobj.write(seg.strip() + "\n")

if __name__ == '__main__':
    # Command line: <dictionary file> <input file> <output file>.
    if len(sys.argv) != 4:
        print("usage %s dict[in] infile[in] outfile[out]." % sys.argv[0])
        sys.exit(-1)

    dct = sys.argv[1]
    infile = sys.argv[2]
    outfile = sys.argv[3]

    # Load the dictionary first; segmentation relies on its contents.
    initdct(dct)
    handlefile(infile, outfile)

基於規則的雙向最大匹配演算法的分詞

雙向最大匹配演算法(bi-direction matching method)是將正向最大匹配法得到的分詞結果和逆向最大匹配法得到的結果,通過雙向最大匹配演算法的規則進行篩選而得到。coding utf 8 project exuding nlp all author texuding time 20...

最大匹配演算法

最大匹配法是最簡單的分詞方法,它完全使用詞典進行分詞,如果詞典好,則分詞的效果好。正向,即從左往右進行匹配。maximum match method 最大匹配法 class mm def init self self.window size 4 def cut self,text result in...

Python 最大逆向匹配演算法

第三次重新寫這個演算法,每次寫都有新的體會。這次最大的感受是把訪問資料夾的包都熟悉了一下,os和shutil。後者用來刪除整個檔案,這種破壞力還是慎用吧。def mk new dir filename 新建乙個資料夾,如果存在,則刪除並重建。if os.path.exists filename is...