用Python進行檔案批處理例項

2021-09-07 20:26:04 字數 4570 閱讀 1480

import sys

import os

import re

from pyltp import sentencesplitter, segmentor, postagger, parser, namedentityrecognizer, sementicrolelabeller

# Project root; every other location below is derived from it.
rootdir = 'd:/users/liang/pycharmprojects/analysiscase/'

# Input folder, LTP model folder and output folder, in that order.
data, model, result = (
    os.path.join(rootdir, subdir)
    for subdir in ('data/', 'model/ltp_data_v3.4.0/', 'result/')
)

# 名字轉換器

def get_seg(filename):
    """Return the segmentation output name for *filename*.

    The result is ``'seg-<n>.txt'`` where ``<n>`` is the first run of digits
    found in *filename*.

    Raises:
        IndexError: if *filename* contains no digits.
    """
    # Raw string: a plain '\d' is an invalid escape sequence in a str literal.
    return 'seg-{}.txt'.format(re.findall(r'\d+', filename)[0])

def get_postag(filename):
    """Return the POS-tag output name for *filename*.

    The result is ``'postag-<n>.txt'`` where ``<n>`` is the first run of
    digits found in *filename*.

    Raises:
        IndexError: if *filename* contains no digits.
    """
    # Raw string: a plain "\d" is an invalid escape sequence in a str literal.
    return 'postag-{}.txt'.format(re.findall(r"\d+", filename)[0])

def get_txt(filename):
    """Return *filename* with every ``'.txtoriginal'`` occurrence removed.

    A name that does not contain the marker is returned unchanged.
    """
    return filename.replace('.txtoriginal', '')

def get_train(filename):
    """Return *filename* with ``'.txtoriginal'`` replaced by ``'_train_data'``.

    A name that does not contain the marker is returned unchanged.
    """
    return filename.replace('.txtoriginal', '_train_data')

# 檔案路徑 結果路徑 檔名

# Entity category, as spelled in the annotation file, mapped to the marker
# appended after a matching token in the training data.
# NOTE(review): the '**' key looks like a redacted category name - confirm
# against the real annotation files.
_ENTITY_MARKERS = {
    '症狀和體徵': '#s-nss ',
    '檢查和檢驗': '#s-nii ',
    '疾病和診斷': '#s-ndd ',
    '**': '#s-nt ',
    '身體部位': '#s-npb ',
}


def _segment_and_tag(lines, lexicon):
    """Segment and POS-tag each line; return ``(segment_text, postag_text)``.

    Every input line produces one output line in each text: the items of that
    line, each preceded by a tab, followed by a newline.
    """
    # The local names must differ from the imported ``segmentor`` and
    # ``postagger`` callables: assigning to those names inside a function
    # makes them local, so the constructor call raises UnboundLocalError.
    # NOTE(review): pyltp documents these classes as ``Segmentor`` and
    # ``Postagger``; the lowercase names follow this file's import line -
    # confirm the casing.
    seg_model = segmentor()
    # NOTE(review): the lexicon argument is the bare input file name, not a
    # path inside the data directory - confirm that this is intended.
    seg_model.load_with_lexicon(os.path.join(model, 'cws.model'), lexicon)
    try:
        tag_model = postagger()
        tag_model.load(os.path.join(model, 'pos.model'))
        try:
            seg_rows = []
            tag_rows = []
            # Both models are loaded once and reused for every line.
            for line in lines:
                words = seg_model.segment(line)
                tags = tag_model.postag(words)
                seg_rows.append(''.join('\t' + word for word in words) + '\n')
                tag_rows.append(''.join('\t' + tag for tag in tags) + '\n')
        finally:
            tag_model.release()
    finally:
        seg_model.release()
    return ''.join(seg_rows), ''.join(tag_rows)


def _index_entities(annotation_lines, text):
    """Map the text covered by each annotated span to its category.

    Each annotation line is tab-separated; fields 1 and 2 are the inclusive
    start/end offsets into *text* and field 3 is the category. When several
    lines cover identical text the first one wins. Lines with fewer than four
    fields (for example blank lines) are skipped.
    """
    entities = {}
    for row in annotation_lines:
        fields = row.split('\t')
        if len(fields) < 4:
            continue
        span_text = text[int(fields[1]):int(fields[2]) + 1]
        entities.setdefault(span_text, fields[3].rstrip('\n'))
    return entities


def _merge_annotations(seg_text, postag_text, entities):
    """Build the training text: ``token/tag`` plus an entity or ``#o`` marker."""
    tokens = seg_text.split('\t')
    tags = postag_text.split('\t')
    last = len(tokens) - 1
    pieces = []
    for i, token in enumerate(tokens):
        if i == 0:
            # The text starts with a tab, so the first split item is empty.
            continue
        is_last = i == last
        if not is_last:
            # The two lists are parallel, so the tag is looked up by position;
            # searching for the token text would return the tag of an earlier
            # identical token.
            pieces.append(token + '/' + tags[i])
        if token in entities:
            # A category without a marker contributes nothing at all.
            pieces.append(_ENTITY_MARKERS.get(entities[token], ''))
        elif not is_last:
            pieces.append('#o ')
    return ''.join(pieces)


def process_pre(data_dir, result_dir, result_dir2, filename):
    """Segment, POS-tag and entity-label one raw text file.

    Three files are written:

    * ``result_dir/seg-<n>.txt``    - tab-separated words, one line per input line
    * ``result_dir/postag-<n>.txt`` - tab-separated POS tags, same layout
    * ``result_dir2/<train name>``  - ``word/tag`` items, each followed by an
      entity marker or ``#o``

    Args:
        data_dir: folder holding *filename* and its annotation file (the same
            name with ``'.txtoriginal'`` removed).
        result_dir: folder for the segmentation and POS-tag outputs.
        result_dir2: folder for the combined training data.
        filename: name of the raw text file inside *data_dir*.

    Raises:
        IndexError: if *filename* contains no digits.
        OSError: if an input file cannot be read or an output cannot be written.

    Both result folders are created when they do not exist yet.
    """
    with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as source:
        source_lines = source.readlines()

    seg_text, postag_text = _segment_and_tag(source_lines, filename)

    os.makedirs(result_dir or '.', exist_ok=True)
    with open(os.path.join(result_dir, get_seg(filename)), 'w', encoding='utf-8') as out:
        out.write(seg_text)
    with open(os.path.join(result_dir, get_postag(filename)), 'w', encoding='utf-8') as out:
        out.write(postag_text)

    # Annotation offsets index into the whole raw text, not into single lines.
    with open(os.path.join(data_dir, get_txt(filename)), 'r', encoding='utf-8') as annotated:
        entities = _index_entities(annotated.readlines(), ''.join(source_lines))

    os.makedirs(result_dir2 or '.', exist_ok=True)
    with open(os.path.join(result_dir2, get_train(filename)), 'w', encoding='utf-8') as out:
        out.write(_merge_annotations(seg_text, postag_text, entities))

# filesegment.close()

# filepostag.close()

def main():
    """Run ``process_pre`` on every raw text file found below ``data``.

    Each folder below ``data`` is mirrored under ``result + 'one/'`` and
    ``result + 'two/'`` for the two kinds of output.
    """
    for root, _dirs, files in os.walk(data):
        print(root)
        # Path of this folder relative to the data root; empty for the root
        # itself. This keeps nested folders intact on every platform.
        subdir = os.path.relpath(root, data)
        if subdir == os.curdir:
            subdir = ''
        one = data + subdir + '/'                # data folder
        two = result + 'one/' + subdir + '/'     # segmentation / POS-tag results
        three = result + 'two/' + subdir + '/'   # combined training data
        # Select raw files by name rather than by position: os.walk does not
        # guarantee any ordering, so taking every second entry is unreliable.
        for item in sorted(files):
            if '.txtoriginal' not in item:
                continue
            process_pre(one, two, three, item)


if __name__ == '__main__':
    main()

python進行檔案操作

什麼是檔案?檔案是系統儲存區域的一個命名位置,用來儲存一些資訊,便於後續訪問。能夠在非易失性儲存器中實現持續性儲存,比如在硬碟上。當我們要讀取或者寫入檔案時,我們需要開啟檔案;在操作完畢時,我們需要關閉檔案,以便釋放和檔案操作相關的系統資源。因此,檔案操作主要包括以下:開啟檔案 python使用內...

用Python寫批處理

import oscommand labelme json to dataset json os.system command 今天在製作使用經labelme標註過的json檔案資料來源的時候,使用到了這個例項,遇到了一個問題:在command中,如果含有括號的話,切記要記得加引號 coding u...

用批處理進行進制轉換

echo off setlocal enabledelayedexpansion set p a 請輸入要轉換的十進位制數 set aa a set p b 請輸入要轉換的幾進製?set str 0123456789abcde hex set a m a b set a n a b set n st...