關鍵字top排行榜業務

2021-08-19 07:28:54 字數 4183 閱讀 6221

# -*- coding: utf-8 -*-

def mkkey0(s: str):
    """Split *s* into lowercase words, treating punctuation as separators.

    ``str.split()`` alone only breaks on whitespace, so a line such as::

        source code: lib/posix'path.py (for posix), lib/ntpath.py (for windows nt)

    keeps tokens like ``lib/posix'path.py`` and ``(for`` intact.  This
    function first replaces every special character with a space, joins the
    characters back into one string and then splits it, so the same line
    yields::

        ['source', 'code', 'lib', 'posix', 'path', 'py', 'for', 'posix',
         'lib', 'ntpath', 'py', 'for', 'windows', 'nt']

    The result is a list, so ``map(mkkey0, words)`` produces one list per
    input word and the caller has to iterate a second time:

    >>> [mkkey0(w) for w in ["goggd`e", "qw,sh", "god"]]
    [['goggd', 'e'], ['qw', 'sh'], ['god']]

    Args:
        s: The text to split.

    Returns:
        A list of lowercase words; empty when *s* holds no word characters.
    """
    # Characters treated as word separators.  Same members as the original
    # literal, with the backslash written as an explicit escape.
    keychar = set("~!@#$%^&*()_+`,.-;'\\:\"/?")
    key = s.lower()
    # Swap each special character for a space; split() then collapses runs.
    strlist = [" " if c in keychar else c for c in key]
    return "".join(strlist).split()

def mkkey1(s: str):
    """Split *s* into lowercase words by slicing between special characters.

    Unlike a plain ``s.split()``, punctuation also ends a word.  The string
    is scanned once; ``offset`` marks where the current word starts and a
    slice is collected each time a special character is reached.

    >>> [mkkey1(w) for w in ["`gog))gd`e", "qw,sh", "god"]]
    [['gog', 'gd', 'e'], ['qw', 'sh'], ['god']]

    Whitespace is not part of the special-character set; it is handled by
    the final ``split()``, so collected slices that still contain blanks are
    broken up there.

    Args:
        s: The text to split.

    Returns:
        A list of lowercase words; empty when *s* holds no word characters.
    """
    # Characters treated as word separators.  Same members as the original
    # literal, with the backslash written as an explicit escape.
    keychar = set("~!@#$%^&*()_+`,.-;'\\:\"/?")
    key = s.lower()
    strlist = []
    offset = 0  # index where the word currently being scanned begins
    for i, v in enumerate(key):
        if v not in keychar:
            continue
        if offset == i:
            # The first character is special, or two specials are adjacent:
            # there is no word to collect, just move the start forward.
            offset += 1
            continue
        strlist.append(key[offset:i])
        offset = i + 1
    # Without this step the last word (not followed by a special character)
    # would never be collected.
    if offset < len(key):
        strlist.append(key[offset:])
    # Join with a blank so the collected words stay separate, then split to
    # also break on any whitespace inside them.
    return " ".join(strlist).split()

# Words that are passed to countwd() as the ignore list.
iterwd = {
    'the', 'is', 'a', 'of', 'on', 'in',
    'or', 'to', 'and', 'if', 'an', 'for',
}

def countwd(words, iterwords=None):
    """Return the ten most frequent words found in a text file.

    Every line is split on whitespace, each token is broken into lowercase
    words with ``mkkey0`` and the words are tallied.

    Args:
        words: Path of the file to read; it is decoded as UTF-8.
        iterwords: Container of words to leave out of the tally.  ``None``
            (the default) ignores nothing.

    Returns:
        A list of at most ten ``(word, count)`` tuples, highest count first.
    """
    ignored = () if iterwords is None else iterwords
    wordict = {}
    # Read-only mode is enough here; the file is never written.
    with open('{}'.format(words), 'r', encoding='utf-8') as f:
        for lines in f:
            # mkkey0 returns a list per token, hence the second loop.
            for wordlst in map(mkkey0, lines.split()):
                for word in wordlst:
                    if word not in ignored:
                        wordict[word] = wordict.get(word, 0) + 1
    # With a modest amount of data it is fine to build the whole ranking in
    # one go and hand it to the display side.  With a lot of data, pushing
    # everything at once puts too much load on the display server; a
    # generator that yields one (word, count) pair at a time is the better
    # fit in that case.
    countword = sorted(wordict.items(), key=lambda x: x[1], reverse=True)[0:10]
    return countword

# Raw string keeps the backslash literal without relying on the invalid
# "\s" escape sequence; the path value is unchanged.
print(countwd(r'd:\sample.txt', iterwords=iterwd))

執行結果:

[('path', 138), ('os', 49), ('return', 30), ('windows', 25), ('file', 24), ('pathname', 17), ('true', 17), ('drive', 17), ('this', 17), ('unix', 16)]

# -*- coding: utf-8 -*-

def mkkey1(key: str, keychar=frozenset("~!@#$%^&*()_+`\n,. -;'\\:\"/?")):
    """Yield the words of *key*, using *keychar* as the set of separators.

    The string is scanned once.  ``offset`` marks where the current word
    starts; whenever a separator is reached the slice since ``offset`` is
    yielded.  Runs of separators and a leading separator produce no empty
    words.  The case of the input is left untouched.

    >>> list(mkkey1("`gog))gd`e"))
    ['gog', 'gd', 'e']

    Args:
        key: The text to split.
        keychar: Container of single characters that end a word.  The
            default holds punctuation plus the blank and the newline.

    Yields:
        Each word of *key*, in order of appearance.
    """
    offset = 0  # index where the word currently being scanned begins
    for i, v in enumerate(key):
        if v not in keychar:
            continue
        if offset == i:
            # Leading separator or two separators in a row: nothing to yield.
            offset += 1
            continue
        yield key[offset:i]
        offset = i + 1
    # Emit the trailing word that is not followed by a separator.
    if offset < len(key):
        yield key[offset:]

def countwd(words, encode, ignorwords):
    """Count how often each word occurs in a text file.

    Each line is split into words with ``mkkey1``; the words are lowercased
    and tallied unless they appear in *ignorwords*.

    Args:
        words: Path of the file to read.
        encode: Text encoding used to decode the file.
        ignorwords: Container of lowercase words to leave out of the tally.

    Returns:
        A dict mapping each counted word to its number of occurrences.
    """
    wordict = {}
    # Read-only mode is enough here; the file is never written.
    with open('{}'.format(words), 'r', encoding=encode) as f:
        for lines in f:
            for word in map(str.lower, mkkey1(lines)):
                if word not in ignorwords:
                    wordict[word] = wordict.get(word, 0) + 1
    return wordict

# Words (plus the single blank) that are passed to countwd() as the ignore list.
iterwd = {
    'the', 'is', 'a', 'of', 'on', 'in', 'or',
    'to', ' ', 'and', 'if', 'an', 'for',
}

def top(wordict=None, n=10):
    """Yield the *n* highest-count ``(word, count)`` pairs of *wordict*.

    Args:
        wordict: Mapping of word to count.  When omitted, the counts are
            obtained by calling ``countwd`` on the sample file with UTF-8
            decoding and ``iterwd`` as the ignore list.  That call is made
            when the generator is first consumed rather than when the
            function is defined, so merely defining ``top`` reads no file.
        n: Maximum number of pairs to yield.

    Yields:
        ``(word, count)`` tuples, highest count first; words with equal
        counts keep the order they have in *wordict*.
    """
    if wordict is None:
        wordict = countwd(words=r'd:\sample.txt', encode='utf-8', ignorwords=iterwd)
    ranked = sorted(wordict.items(), key=lambda item: item[1], reverse=True)
    for i, t in enumerate(ranked):
        # ">=" so that exactly n pairs are produced (indices 0 .. n-1).
        if i >= n:
            break
        yield t

# Print every pair produced by top(), one "word count" line per pair.
for key, v in top():
    print(key, v)

python爬蟲入門 豆瓣電影排行榜top250

1.requests 2.re 正規表示式庫 請求頭 此處複製的火狐瀏覽器請求頭 myheader 標記電影次序 time,初始化為1 排行榜第i頁 link str i 25 正則匹配結果 matchobj import requests import re def get movies 請求頭 ...

設計模式Top10排行榜

在工作中,常常使用到設計模式,增強了軟體的靈活性,然而要為它們排排位置,還真是一件難事,因為每個人對設計模式的理解程度,每個人使用的程式語言,個人的習慣,工作性質等等都會影響排行榜。在這裡暫且列出自己心中的排行榜吧 僅限design pattern中提到的設計模式 冠軍寶座 strategy pat...

設計模式Top10排行榜

在工作中,常常使用到設計模式,增強了軟體的靈活性,然而要為它們排排位置,還真是一件難事,因為每個人對設計模式的理解程度,每個人使用的程式語言,個人的習慣,工作性質等等都會影響排行榜。在這裡暫且列出自己心中的排行榜吧 僅限 design pattern 中提到的設計模式 冠軍寶座 strategy p...