CRF 實現分詞

2021-07-23 08:51:15 字數 4762 閱讀 8675

首先介紹crf++的具體安裝和使用

下面我講一下我的思路:

由於task2根目錄下的_crfpp.so沒有引入

所以正常能執行的是資料夾task2_b-i下的內容,此資料夾下對詞語分類只有b i兩種

首先我用msr_training.utf8 通過python程式 make_crf_train_data.py轉化成訓練語料需要的格式,即tag_train_data.utf8,

然後我開始訓練模型,得到model 再利用crf自帶的python工具包,對輸入文字分詞,具體實現是通過python程式 crf_segment.py ,

最後就將msr_test.utf8 分詞得到 crf_tag_result.utf8.

crf_segment.py

#!/usr/bin/python

# -*- coding: utf-8 -*-

# crf_segmenter.py

# usage:python crf_segmenter.py crf_model test_file result_file

# 利用crf自帶的python工具包,對輸入文字進行分詞

import codecs

import sys

import crfpp

def crf_segmenter(input_file, output_file, tagger):
    """Segment Chinese text with a trained CRF++ tagger.

    Reads ``input_file`` (UTF-8) line by line, feeds each character to the
    CRF++ ``tagger`` and writes space-delimited words to ``output_file``.

    Args:
        input_file: path to the UTF-8 plain-text file to segment.
        output_file: path of the UTF-8 result file (overwritten).
        tagger: a ``crfpp.Tagger`` instance loaded with a trained model.
    """
    # `with` guarantees both handles are closed even if tagging raises;
    # the original closed them manually and leaked on error.
    with codecs.open(input_file, 'r', 'utf-8') as input_data, \
            codecs.open(output_file, 'w', 'utf-8') as output_data:
        for line in input_data:
            tagger.clear()
            # One CRF token per character; "o"/"b" are dummy feature columns
            # required by the template — only the character matters at test time.
            for word in line.strip():
                word = word.strip()
                if word:
                    tagger.add((word + "\to\tb").encode('utf-8'))
            tagger.parse()
            size = tagger.size()    # number of tagged tokens
            xsize = tagger.xsize()  # number of feature columns (normally 1)
            for i in range(size):
                for j in range(xsize):
                    char = tagger.x(i, j).decode('utf-8')
                    tag = tagger.y2(i)
                    # b=word begin, m=middle, e=end, s=single-char word;
                    # spaces are emitted around word boundaries.
                    if tag == 'b':
                        output_data.write(' ' + char)
                    elif tag == 'm':
                        output_data.write(char)
                    elif tag == 'e':
                        output_data.write(char + ' ')
                    else:  # tag == 's'
                        output_data.write(' ' + char + ' ')
            output_data.write('\n')

if __name__ == '__main__':
    # usage: python crf_segmenter.py crf_model test_file result_file
    if len(sys.argv) != 4:
        print("usage: python crf_segmenter.py crf_model test_file result_file")
        sys.exit()
    crf_model = sys.argv[1]
    input_file = sys.argv[2]
    output_file = sys.argv[3]
    # The CRF++ SWIG binding exposes the class as `Tagger` (capital T);
    # the lowercase `crfpp.tagger` in the original would raise AttributeError.
    tagger = crfpp.Tagger("-m " + crf_model)
    crf_segmenter(input_file, output_file, tagger)

make_crf_train_data.py

#!/usr/bin/python

# -*- coding: utf-8 -*-

# make_crf_train_data.py

# 得到crf++要求的格式的訓練檔案

# 用法:命令列--python dataprocess.py input_file output_file

import sys

import codecs

# 4 tags for character tagging: b(begin), e(end), m(middle), s(single)

def character_4tagging(input_file, output_file):
    """Convert pre-segmented text to CRF++ 4-tag training format.

    Each line of ``input_file`` holds space-separated words; every character
    is emitted on its own line as ``char<TAB>tag`` where the tag is
    b(begin), m(middle), e(end) or s(single-character word).  A blank line
    separates sentences, as CRF++ requires.

    Args:
        input_file: path to the UTF-8 segmented corpus.
        output_file: path of the UTF-8 training file (overwritten).
    """
    with codecs.open(input_file, 'r', 'utf-8') as input_data, \
            codecs.open(output_file, 'w', 'utf-8') as output_data:
        for line in input_data:
            word_list = line.strip().split()
            for word in word_list:
                if len(word) == 1:
                    output_data.write(word + "\ts\n")
                else:
                    output_data.write(word[0] + "\tb\n")
                    # every interior character of a multi-char word is 'm'
                    for w in word[1:-1]:
                        output_data.write(w + "\tm\n")
                    output_data.write(word[-1] + "\te\n")
            # blank line = sentence boundary for CRF++
            output_data.write("\n")

# 6 tags for character tagging: b(begin), e(end), m(middle), s(single), m1, m2

def character_6tagging(input_file, output_file):

input_data = codecs.open(input_file, 'r', 'utf-8')

output_data = codecs.open(output_file, 'w', 'utf-8')

forline

in input_data.readlines():

word_list = line.strip().split()

forword

in word_list:

iflen(word) == 1:

output_data.write(word + "\ts\n")

elif len(word) == 2:

output_data.write(word[0] + "\tb\n")

output_data.write(word[1] + "\te\n")

elif len(word) == 3:

output_data.write(word[0] + "\tb\n")

output_data.write(word[1] + "\tm\n")

output_data.write(word[2] + "\te\n")

elif len(word) == 4:

output_data.write(word[0] + "\tb\n")

output_data.write(word[1] + "\tm1\n")

output_data.write(word[2] + "\tm\n")

output_data.write(word[3] + "\te\n")

elif len(word) == 5:

output_data.write(word[0] + "\tb\n")

output_data.write(word[1] + "\tm1\n")

output_data.write(word[2] + "\tm2\n")

output_data.write(word[3] + "\tm\n")

output_data.write(word[4] + "\te\n")

elif len(word) > 5:

output_data.write(word[0] + "\tb\n")

output_data.write(word[1] + "\tm1\n")

output_data.write(word[2] + "\tm2\n")

for w in

word[3:len(word) - 1]:

output_data.write(w + "\tm\n")

output_data.write(word[len(word) - 1] + "\te\n")

output_data.write("\n")

input_data.close()

output_data.close()

if __name__ == '__main__':
    # usage: python dataprocess.py input_file output_file
    if len(sys.argv) != 3:
        print("usage: python dataprocess.py inputfile outputfile")
        sys.exit()
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    # 4-tag scheme is the one actually used for training in this task;
    # switch to character_6tagging for the 6-tag experiment.
    character_4tagging(input_file, output_file)


運用CRF技術進行簡單分詞

input data codecs.open pku training,r utf 8 output data codecs.open pku training out,w utf 8 for line in input data.readlines word list line.strip spl...

CRF中文分詞開源版發布啦

crf中文分詞開源版發布啦 langiner gmail.com 中文分詞經過艱苦的研發,終於發布了。中文分詞是網際網路應用不可缺少的基礎技術之一,也是語音和語言產品必不可少的技術元件。自2003年第一屆國際中文分詞評測以來,由字構詞的分詞方法獲得了壓倒性優勢,國內主要通過crf 開源軟體包來學習該...

CRF 及CRF 安裝與解釋

conditional random field 條件隨機場,一種機器學習技術 模型 crf由john lafferty最早用於nlp技術領域,其在nlp技術領域中主要用於文字標註,並有多種應用場景,例如 本文主要描述如何使用crf技術來進行中文分詞。1.crf把分詞當做字的詞位分類問題,通常定義字...