Short-text deduplication based on simhash


#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Text deduplication with simhash."""
from simhash import Simhash, SimhashIndex

import jieba
import codecs
import datetime
import os


class dudup(object):

    def __init__(self, data_dir='../data/', model_dir='../model/', file_name='test_data.txt',
                 clean_file_name='test_data_clean.txt'):
        self.data_dir = data_dir
        self.model_dir = model_dir
        self.file_name = file_name
        self.clean_file_name = clean_file_name

    def stop_word_list(self, stop_words_path):
        """Load the stop-word list, one word per line."""
        with codecs.open(stop_words_path, 'r', encoding='utf-8') as f:
            stopwords = [x.strip() for x in f.readlines()]
        return stopwords

    def tokenization(self, line):
        """
        :param line: one line of raw data
        :return: tokens of that line with stop words removed
        """
        result = []
        stopwords = self.stop_word_list(self.data_dir + 'stopwords.txt')
        words = jieba.lcut(line)
        for word in words:
            if word not in stopwords:
                result.append(word)
        return result

    def read_data(self, file):
        """Read a file under data_dir and return its lines as a list."""
        data_list = []
        with open(self.data_dir + file, encoding='utf-8') as data:
            for line in data.readlines():
                data_list.append(line.strip())
        return data_list

    def get_data_dict(self):
        """Build a {line id: cleaned text} dictionary for the whole corpus."""
        data_dic = {}
        index = 1
        clean_data = []
        if not os.path.exists(self.data_dir + self.clean_file_name):
            # First run: tokenize the raw file and cache the cleaned lines.
            with open(self.data_dir + self.clean_file_name, 'w', encoding='utf-8') as cleaned_data:
                for sent in self.read_data(self.file_name):
                    clean_line = self.tokenization(sent)
                    clean_data.append(clean_line)
                    cleaned_data.write(' '.join(clean_line) + '\n')
        else:
            # Later runs: reuse the cached cleaned file.
            clean_data = [line.split() for line in self.read_data(self.clean_file_name)]
        for line in clean_data:
            data_dic[str(index)] = ' '.join(line)
            index += 1
        # print(data_dic)
        return data_dic

    def get_index(self):
        """Fingerprint every line and build a SimhashIndex (k = Hamming-distance tolerance)."""
        data_dic = self.get_data_dict()
        print(data_dic)  # dump the {id: text} dictionary
        line_score = [(id, Simhash(sent)) for id, sent in data_dic.items()]
        index = SimhashIndex(line_score, k=2)
        return index


if __name__ == '__main__':
    start_time = datetime.datetime.now()
    find_dup = dudup()
    sim_hash_index = find_dup.get_index()
    inp = '「全椒縣經開區汙水處理廠****提標改造裝置採購二次'
    inp_sim_hash = Simhash(' '.join(find_dup.tokenization(inp)))
    result_index = sim_hash_index.get_near_dups(inp_sim_hash)
    if len(result_index):
        print('duplicate line id\t', result_index[0])
        raw_data_list = find_dup.read_data(find_dup.file_name)
        print('duplicate title\t', raw_data_list[int(result_index[0]) - 1])
    else:
        print('no duplicate line found')
    end_time = datetime.datetime.now()
    print('consume time is %f minutes.' % ((end_time - start_time).seconds * 1.0 / 60))
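The script above only queries the index it has just built. As a small follow-on sketch (my own addition, not part of the original post; the id and title below are invented for illustration), the same SimhashIndex object can also be updated on the fly with its add() method, so a crawler can check each new title and register it in a single pass:

# continuing from the variables defined in the __main__ block above
new_id = '10001'                       # hypothetical document id
new_title = '某市圖書館圖書採購專案公告'  # hypothetical new title
new_hash = Simhash(' '.join(find_dup.tokenization(new_title)))
if sim_hash_index.get_near_dups(new_hash):
    print('near-duplicate already indexed, skip it')
else:
    # register the fingerprint so later titles are compared against it too
    sim_hash_index.add(new_id, new_hash)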

The principle behind simhash-based text deduplication

Simhash is the algorithm Google uses to deduplicate text at web scale. It maps a document to a 64-bit fingerprint, which we can call its feature signature. To decide whether two documents are duplicates, you only need to compute the Hamming distance between their fingerprints; empirically, a Hamming distance of less than 3 is usually enough to judge the two documents similar. A traditional hash function, by contrast, only maps content to a value, and even a tiny change to the input yields a completely different hash, so it says nothing about how similar two documents are.
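A minimal sketch of that idea, using the same simhash and jieba packages as the script above (the example titles are invented for illustration):

from simhash import Simhash
import jieba

a = Simhash(jieba.lcut('全椒縣經開區汙水處理廠提標改造裝置採購'))
b = Simhash(jieba.lcut('全椒縣經開區汙水處理廠提標改造裝置採購二次'))
c = Simhash(jieba.lcut('某市圖書館圖書採購專案公告'))

print(format(a.value, '064b'))  # the 64-bit fingerprint of the first title
print(a.distance(b))            # Hamming distance; near-duplicates are typically small
print(a.distance(c))            # unrelated titles usually land much further apart

This is also why the script builds its SimhashIndex with k=2: candidates within Hamming distance 2 can be pulled from hash buckets instead of scanning every stored fingerprint.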

Text deduplication with the SimHash algorithm

When text similarity computation comes up, the first approach most people think of is the vector space model (VSM). With VSM you tokenize the text, build a vector for each document, and turn similarity into a distance between feature vectors, such as the cosine of the angle between them, Euclidean distance, or the Jaccard coefficient. The big problem with this approach is that documents have to be compared pairwise, which quickly becomes infeasible on a massive corpus.
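For contrast, a rough pure-Python sketch of the VSM approach described above (the example titles are again invented; a real system would typically use TF-IDF weights, but plain term counts show the idea):

import math
import jieba
from collections import Counter

def cosine(a, b):
    """Cosine similarity between two term-frequency Counters."""
    dot = sum(a[t] * b[t] for t in set(a) & set(b))
    norm = math.sqrt(sum(v * v for v in a.values())) * math.sqrt(sum(v * v for v in b.values()))
    return dot / norm if norm else 0.0

docs = ['全椒縣經開區汙水處理廠提標改造裝置採購',
        '全椒縣經開區汙水處理廠提標改造裝置採購二次']
vectors = [Counter(jieba.lcut(d)) for d in docs]
print(cosine(vectors[0], vectors[1]))  # close to 1.0 for near-duplicate titles

Every new document has to be compared against every existing one, which is exactly the quadratic cost that simhash's fixed-length fingerprints and bucketed index avoid.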
