時期實體識別

2021-09-25 10:27:43 字數 3562 閱讀 1638

import re

from datetime import datetime,timedelta

from dateutil.parser import parse

import jieba.posseg as psg

# NOTE(review): the right-hand sides of both tables were lost when this code
# was extracted from the article. The values below are reconstructed from how
# cn2dig/year2dig use them (per-character digit lookup and place-value
# multiplier) -- confirm against the original post.

# Value of a single digit character (Chinese numeral or ASCII digit).
util_cn_num = {
    '零': 0, '一': 1, '二': 2, '兩': 2, '两': 2, '三': 3, '四': 4,
    '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    '0': 0, '1': 1, '2': 2, '3': 3, '4': 4,
    '5': 5, '6': 6, '7': 7, '8': 8, '9': 9,
}

# Multiplier of a Chinese place-value character.
util_cn_unit = {'十': 10, '百': 100, '千': 1000, '萬': 10000, '万': 10000}

def cn2dig(src):
    """Convert a short numeral string (ASCII or Chinese) to an int.

    If ``src`` starts with ASCII digits, that leading run is returned as an
    int and the rest of the string is ignored. Otherwise the characters are
    read right to left: a character found in ``util_cn_unit`` sets the
    current multiplier, and a character found in ``util_cn_num`` is added
    to the total scaled by that multiplier.

    Args:
        src: The numeral text, e.g. ``"28"`` or a Chinese numeral.

    Returns:
        The integer value, or ``None`` when ``src`` is empty/``None`` or
        contains a character that is in neither lookup table.
    """
    if not src:
        return None

    leading_digits = re.match(r"\d+", src)
    if leading_digits:
        return int(leading_digits.group(0))

    total = 0
    unit = 1
    for char in reversed(src):
        if char in util_cn_unit:
            unit = util_cn_unit[char]
        elif char in util_cn_num:
            total += util_cn_num[char] * unit
        else:
            return None

    # A unit with no digit in front of it (the numeral starts with the unit
    # character) implies a leading "one", so add the unit itself.
    if total < unit:
        total += unit
    return total

def year2dig(year):
    """Convert a year written digit by digit to an int.

    Every character found in ``util_cn_num`` is replaced by its digit; other
    characters are kept as they are. The leading run of ASCII digits in the
    result is then read as the year. A two-digit year is placed in the
    current century.

    Args:
        year: The year text without its trailing unit character.

    Returns:
        The year as an int, or ``None`` if the converted text does not start
        with a digit.
    """
    digits = ''.join(
        str(util_cn_num[char]) if char in util_cn_num else char
        for char in year
    )
    matched = re.match(r"\d+", digits)
    if not matched:
        return None

    value = int(matched.group(0))
    if len(matched.group(0)) == 2:
        # `datetime` is the class imported by `from datetime import datetime`,
        # so `.today()` is called on it directly; integer division keeps the
        # century arithmetic exact.
        return datetime.today().year // 100 * 100 + value
    return value

def parse_datetime(msg):
    """Parse a date/time expression into a ``'YYYY-MM-DD HH:MM:SS'`` string.

    ``dateutil.parser.parse`` (fuzzy mode) is tried first. If it fails, the
    text is matched against a pattern of optional year / month / day /
    part-of-day / hour / minute / second components written with ASCII or
    Chinese numerals. Components that are present overwrite the matching
    fields of the current date-time; hour, minute and second default to 0
    when absent.

    Args:
        msg: The candidate time expression.

    Returns:
        The formatted timestamp, or ``None`` when ``msg`` is empty/``None``,
        when no component of the pattern matches, or when the components do
        not form a valid date-time.
    """
    if msg is None or len(msg) == 0:
        return None

    try:
        dt = parse(msg, fuzzy=True)
    except Exception:
        # Deliberate best-effort: any dateutil failure falls through to the
        # Chinese-numeral pattern below.
        pass
    else:
        return dt.strftime('%Y-%m-%d %H:%M:%S')

    # Groups: 1 year, 2 month, 3 day, 4 part of day, 5 hour, 6 minute,
    # 7 second. Both traditional and simplified unit characters are accepted.
    m = re.match(
        r"([0-9零一二兩两三四五六七八九十]+年)?"
        r"([0-9一二兩两三四五六七八九十]+月)?"
        r"([0-9一二兩两三四五六七八九十]+[號号日])?"
        r"([上中下午晚早]+)?"
        r"([0-9零一二兩两三四五六七八九十百]+[點点:\.時时])?"
        r"([0-9零一二三四五六七八九十百]+分?)?"
        r"([0-9零一二三四五六七八九十百]+秒)?",
        msg)
    # Every group is optional, so the pattern always matches; an empty match
    # means that nothing was recognised.
    if m is None or not m.group(0):
        return None

    res = {
        'year': m.group(1),
        'month': m.group(2),
        'day': m.group(3),
        'hour': m.group(5) or '00',
        'minute': m.group(6) or '00',
        'second': m.group(7) or '00',
    }

    unit_chars = '年月號号日點点:.時时分秒'
    params = {}
    for name, raw in res.items():
        if not raw:
            continue
        # Strip the trailing unit marker rather than blindly dropping the
        # last character: the minute's unit is optional in the pattern.
        number_text = raw.rstrip(unit_chars)
        if name == 'year':
            value = year2dig(number_text)
        else:
            value = cn2dig(number_text)
        if value is not None:
            params[name] = int(value)

    try:
        target_date = datetime.today().replace(**params)
    except ValueError:
        # Out-of-range component (e.g. day 31 in a 30-day month, hour 25).
        return None

    # Shift afternoon / evening / noon expressions onto the 24-hour clock.
    if m.group(4) in ('下午', '晚上', '中午') and target_date.hour < 12:
        target_date = target_date.replace(hour=target_date.hour + 12)
    return target_date.strftime('%Y-%m-%d %H:%M:%S')

def check_time_valid(word):
    """Filter and normalise a candidate time expression.

    A candidate made only of digits and at most six characters long is
    rejected. Otherwise, digits that trail a day marker (號/号/日) at the end
    of the string are dropped and the marker is normalised to 日.

    Args:
        word: The candidate expression.

    Returns:
        The cleaned expression, or ``None`` if the candidate is rejected.
    """
    if re.match(r"\d+$", word) and len(word) <= 6:
        return None
    # After the substitution the text ends in 日, so a single pass is enough
    # (the result can neither be all digits nor match the pattern again).
    return re.sub(r'[號号日]\d+$', '日', word)

# Time extraction
def time_extract(text):
    """Extract date/time expressions from ``text`` as timestamp strings.

    The text is segmented with jieba's part-of-speech tagger. Consecutive
    tokens tagged ``'m'`` or ``'t'`` are concatenated into one candidate.
    A relative-day keyword (today / tomorrow / the day after tomorrow)
    starts a new candidate holding the corresponding calendar date. Each
    candidate is then cleaned by ``check_time_valid`` and converted by
    ``parse_datetime``; candidates that either step rejects are dropped.

    Args:
        text: The sentence to scan.

    Returns:
        A list of ``'YYYY-MM-DD HH:MM:SS'`` strings, in order of appearance.
    """
    time_res = []
    word = ''
    # Relative-day keywords and their offset from today, in days
    # (traditional and simplified spellings).
    keydate = {'今天': 0, '明天': 1, '後天': 2, '后天': 2}
    time_flags = ('m', 't')

    for token, flag in psg.cut(text):
        if token in keydate:
            if word != '':
                time_res.append(word)
            target_day = datetime.today() + timedelta(days=keydate[token])
            # Render only the date, so that a time of day following the
            # keyword can still be appended and parsed.
            word = target_day.strftime('%Y{y}%m{m}%d{d}').format(
                y='年', m='月', d='日')
        elif word != '':
            if flag in time_flags:
                word = word + token
            else:
                # A non-time token closes the current candidate.
                time_res.append(word)
                word = ''
        elif flag in time_flags:
            word = token
    if word != '':
        time_res.append(word)

    cleaned = (check_time_valid(candidate) for candidate in time_res)
    parsed = (parse_datetime(candidate)
              for candidate in cleaned if candidate is not None)
    return [stamp for stamp in parsed if stamp is not None]

if __name__ == '__main__':
    # Demo: print each sample sentence with the timestamps extracted from it.
    # Guarded so that importing this module does not run the demo.
    samples = [
        '我要住到明天下午三點',
        '預定28號的房間',
        '我要從26號下午4點住到11月2號',
        '我要預訂今天到30的房間',
        '今天30號呵呵',
    ]
    for sample in samples:
        print(sample, time_extract(sample), sep=':')

ai命名實體識別模型 命名實體識別

CRF 中有兩類特徵函式,分別是狀態特徵和轉移特徵。狀態特徵用當前節點(某個輸出位置可能的狀態中的某個狀態稱為一個節點)的狀態分數表示,轉移特徵用上一個節點到當前節點的轉移分數表示。其損失函式定義如下:CRF 損失函式的計算,需要用到真實路徑分數(包括狀態分數和轉移分數)、其他所有可能的路徑的分數(包括狀...

命名實體識別

簡單的分詞器(如二元分詞器)無法識別 OOV,所以需要運用一些規定的規則來輔助識別。如:在識別音譯人名時,可以設定規則——一旦發現某詞是人名,而該詞後面跟隨人名詞時,將它們合併。針對不同情況,需要設計相應的標註集。拿人名識別舉例:輸入資料集進行訓練後,會將人名拆分為碎片,模擬人名的錯誤切分。接著,檢查拆...

實體識別類別標註

當我們要對字串中的實體進行標註時,需要尋找到實體在字串的開始位置 如下 def index q list in k list q list,k list known q list in k list,find index first time of q list in k list q list l...