python 對txt文字的去重處理

2021-10-17 03:14:06 字數 3360 閱讀 9037

總結日常工作需要經常對資料進行去重的處理,在這裡簡單記錄分享下

import os
程式碼如下:

應事先將txt檔案放在對應路徑(./tutu)下

import os


def dedupe_lines(lines):
    """Return the stripped lines with duplicates removed, in first-seen order.

    Args:
        lines: Iterable of text lines (trailing newlines allowed).

    Returns:
        A list of the distinct ``line.strip()`` values, ordered by their
        first occurrence. A blank line is kept once, as an empty string.
    """
    # dict preserves insertion order, so this dedupes without reordering.
    return list(dict.fromkeys(line.strip() for line in lines))


def dedupe_file(src_path, dst_path):
    """Read ``src_path``, drop duplicate lines and write them to ``dst_path``.

    Progress and the before/after line counts are printed to stdout.

    Args:
        src_path: UTF-8 text file to read.
        dst_path: UTF-8 text file to (over)write with the unique lines.

    Returns:
        The list of unique lines that was written.

    Raises:
        OSError: If ``src_path`` cannot be read or ``dst_path`` cannot be
            written.
    """
    # ``with`` guarantees the input handle is closed.
    with open(src_path, "r", encoding='utf-8') as src:
        document = src.readlines()

    document_num = len(document)
    print('原條數:' + str(document_num))
    print('***************=去重中***************=')

    text_list = dedupe_lines(document)
    remaining = len(text_list)
    print('現條數:' + str(remaining))
    print('減少了:' + str(document_num - remaining))

    # Write mode rather than append: appending to a previous result would
    # reintroduce duplicates every time the script is run again.
    with open(dst_path, 'w', encoding='utf-8') as dst:
        dst.writelines(text + '\n' for text in text_list)
    print('***************=儲存檔案成功***************=')
    return text_list


if __name__ == "__main__":
    path = './tutu'
    # The input file a.txt must already be placed inside this directory.
    os.makedirs(path, exist_ok=True)
    dedupe_file("./tutu/a.txt", './tutu/b.txt')

import os

MERGED_NAME = "全部資料整理.txt"
DEDUPED_NAME = '全部資料整理(去重後).txt'


def dedupe_lines(lines):
    """Return the stripped lines with duplicates removed, in first-seen order.

    Args:
        lines: Iterable of text lines (trailing newlines allowed).

    Returns:
        A list of the distinct ``line.strip()`` values, ordered by their
        first occurrence. A blank line is kept once, as an empty string.
    """
    # dict preserves insertion order, so this dedupes without reordering.
    return list(dict.fromkeys(line.strip() for line in lines))


def merge_files(directory, merged_name, skip_names=()):
    """Concatenate every regular file in ``directory`` into one merged file.

    Args:
        directory: Directory whose files are merged.
        merged_name: File name (inside ``directory``) of the merged output.
        skip_names: Additional file names in ``directory`` to leave out.

    Raises:
        OSError: If the directory or one of its files cannot be read, or the
            merged file cannot be written.
    """
    # The merged file lives in the directory being listed, so it must never
    # be treated as one of its own inputs.
    excluded = set(skip_names) | {merged_name}
    merged_path = os.path.join(directory, merged_name)
    # Open the merged file once instead of once per line.
    with open(merged_path, "w", encoding='utf-8') as merged:
        # Sorted so the first-seen order of the result is deterministic.
        for filename in sorted(os.listdir(directory)):
            source_path = os.path.join(directory, filename)
            if filename in excluded or not os.path.isfile(source_path):
                continue
            print(filename)
            with open(source_path, encoding='utf-8') as source:
                for line in source:
                    # A file without a trailing newline would otherwise glue
                    # its last line onto the next file's first line.
                    merged.write(line if line.endswith('\n') else line + '\n')


def merge_and_dedupe(directory):
    """Merge all files in ``directory`` and write the de-duplicated lines.

    The intermediate merged file is removed once the result has been saved.
    Progress and the before/after line counts are printed to stdout.

    Args:
        directory: Directory holding the source text files; the result is
            written into the same directory.

    Returns:
        The list of unique lines that was written.

    Raises:
        OSError: On any failure to read or write the files involved.
    """
    merged_path = os.path.join(directory, MERGED_NAME)
    deduped_path = os.path.join(directory, DEDUPED_NAME)

    print('***************=讀取中***************=')
    # Skip a result left by an earlier run so it is not merged back in.
    merge_files(directory, MERGED_NAME, skip_names=(DEDUPED_NAME,))
    print('***************=已完成txt讀取並寫入新txt***************=')

    with open(merged_path, "r", encoding='utf-8') as merged:
        document = merged.readlines()
    document_num = len(document)
    print('原條數:' + str(document_num))
    print('***************=去重中***************=')

    text_list = dedupe_lines(document)
    remaining = len(text_list)
    print('現條數:' + str(remaining))
    print('減少了:' + str(document_num - remaining))

    # Write mode rather than append: appending to a previous result would
    # reintroduce duplicates every time the script is run again.
    with open(deduped_path, 'w', encoding='utf-8') as f:
        f.writelines(text + '\n' for text in text_list)
    print('***************=儲存去重檔案成功***************=')

    # Delete the intermediate merged file.
    os.remove(merged_path)
    return text_list


if __name__ == "__main__":
    path = './tutu'
    os.makedirs(path, exist_ok=True)
    merge_and_dedupe(path)

自娛自樂的小方法,在這裡分享給大家,如你也有同樣有趣的方法,分享出來,共同學習,一起進步。謝謝~

finger print 文字去重

任何一段資訊文字,都可以對應一個不太長的隨機數,作為區別它和其它資訊的指紋(fingerprint)。只要演算法設計得好,任何兩段資訊的指紋都很難重複,就如同人類的指紋一樣。資訊指紋在加密、資訊壓縮和處理中有著廣泛的應用。string content2 卓爾防線繼續傷筋動骨 隊長梅方出場再補漏說起來卓...

資料 文字去重

先排序,後取重 sort file.txt uniq usr bin python coding utf 8 import sys reload sys sys.setdefaultencoding utf 8 def text duplicate byset sourcepath,destpath...

python中對list去重的方法

lista 1 2 1 2 3 4 2 第一種print set lista 輸出型別為 class set 因為型別是無序集合 所以每次順序都會變輸出結果為 順序不一致 且型別發生改變 第二種print list set lista 輸出型別為 class list 因為是把set強轉成list ...