Python XML 解析例子

2021-07-01 20:48:39 字數 4690 閱讀 8490

# -*- coding: utf-8 -*-
"""Parse ``cdr_sample.xml`` and export its contents for DNorm.

Created on Thu Apr 16 23:18:27 2015

@author: shifeng

Purpose: parse the cdr_sample.xml file, produce output in the format that
DNorm accepts, and write the training set's "label" values to a file.

XML file: see the shared resources on CSDN.
"""
import codecs

# Module names are case-sensitive: the Python 2 module is ``StringIO``.
# On Python 3 it no longer exists, so fall back to ``io``, which also
# exposes a ``StringIO`` attribute.
try:
    import StringIO
except ImportError:
    import io as StringIO

import xml

from lxml import etree

# NOTE(review): wildcard imports are kept because code outside this view may
# rely on the names they inject (e.g. make_parser).
from xml.sax import *

from xml.sax.handler import *

# The class-holding module is ``ElementTree`` (capital E and T).
from xml.etree import ElementTree as et

import xml.dom.minidom

# Load the sample file through the DOM API and keep its root element.
dom = xml.dom.minidom.parse("cdr_sample.xml")

# minidom attribute names are camelCase and case-sensitive.
root = dom.documentElement

# Handy while exploring the tree:
# print(root.nodeName)
# print(root.nodeValue)
# print(root.nodeType)
# print(root.ELEMENT_NODE)

# -----------
# Approach 1 (not adopted): walk the DOM with getElementsByTagName.
#
# When the names of the child elements are known, fetch them with
# getElementsByTagName. "colloction" is the root node and has four kinds of
# elements; knowing their names, root.getElementsByTagName(i) retrieves them.
#
#     colloction_ele = ["source", "date", "key", "document"]
#     for i in colloction_ele:
#         print(root.getElementsByTagName(i)[0].nodeName)  # tag name
#         # print(root.getElementsByTagName(i)[0].getAttribute)
#
#     # Each document has three tags.
#     document_ele = ["id", "passage", "annotation"]
#     documents = root.getElementsByTagName("document")
#     # print(len(documents))
#     for i in documents:  # for every document,
#         for j in document_ele:  # take each tag
#             node = i.getElementsByTagName(j)[0]
#             print(node.nodeName)  # tag name
#             print(node.firstChild.data)  # data between the tags
#             if j == "annotation":
#                 print(node.getAttribute("id"))  # tag attribute
# -----------

def _print_annotation(annotation):
    """Print the location, mention text and non-type infon of one annotation."""
    for field in annotation:
        tag = field.tag
        if tag == "location":
            print("%s %s" % (field.attrib["offset"], field.attrib["length"]))
        elif tag == "text":
            disease_name = field.text
            print(disease_name)
        elif tag == "infon" and field.attrib.get("key") != "type":
            # Each annotation carries two infon tags; the one whose key is
            # "type" is skipped and the other one is reported.
            print("%s %s" % (field.attrib.get("key"), field.text))


def _process_passage(passage, out):
    """Print the fields of one passage and append its text to ``out``."""
    for field in passage:
        tag = field.tag
        if tag == "offset":
            print("offset: %d" % int(field.text))
        elif tag == "text":
            # Title text and abstract text are written back to back.
            text = field.text
            print("%s :: %s" % (tag, text))
            # An empty <text/> element yields None, which write() rejects.
            out.write(text or "")
        elif tag == "annotation":
            print(10 * "*")
            _print_annotation(field)


# Output: one line per document, "<id>\t<title text><abstract text>".
# The context manager closes the file even if parsing fails half-way, and the
# explicit UTF-8 encoding keeps non-ASCII text from failing on write.
with codecs.open("train_text.txt", "w", encoding="utf-8") as write_text:
    # -----------
    root_2 = et.parse("cdr_sample.xml")
    documents = root_2.findall("./document")
    for per in documents:  # every document
        for child in per:  # its id / passage / annotation children
            child_tag = child.tag
            if child_tag == "id":
                text_id = child.text or ""
                print("%s : %s" % (child_tag, text_id))
                write_text.write(text_id + "\t")  # id followed by a tab
            elif child_tag == "passage":
                _process_passage(child, write_text)
            # Document-level "annotation" children carry nothing to export.
        # After a whole document has been traversed, terminate its line.
        write_text.write("\n")
        print(30 * "*")

# Mapping of the "label" values: to be continued....
#
# NOTE(review): the triple-quoted string below is a bare expression that is
# never assigned or executed. It holds an unfinished lxml/SAX draft that uses
# names not defined in the visible part of this file (userdecodehandler, i),
# so it would not run as written. Left untouched.

'''doc = etree.parse("cdr_sample.xml")

xml_string = etree.tostring(doc)

root = etree.fromstring(xml_string)

parser = make_parser()

# markdecodehandler

# markdecodehandler

handler = userdecodehandler()

parser.setcontenthandle(handler)

parser.parse(root)

for item in handler.marks:

for j in item.items():

print i,j

print type(doc)

print type(root)

# print doc.tag

print root.tag

# with codecs.open("cdr_sample.xml") as xml:

# text = xml.readlines()

# s_xml = ""

# for i in text:

# i=i.strip("\n")

# s_xml+=i

# print s_xml

# soup = beautifulsoup(s_xml)

# print soup.title

# for i in text:

# print i

'''

PythonXML檔案解析

SAX 是一種基於事件驅動的 API。利用 SAX 解析 XML 文件牽涉到兩個部分:解析器和事件處理器。解析器負責讀取 XML 文件,並向事件處理器傳送事件,如元素開始與元素結束事件;而事件處理器則負責對事件作出回應,對傳遞的 XML 資料進行處理。適於處理下面的問題:在 Python 中使用 SAX 方式處理 XML 要先引入...

學習筆記 Python XML解析

xml.sax 菜鳥教程 官網文件 python由xml包 lib xml 提供對xml的支援。python處理xml主要有兩種模型,xml.dom和xml.sax分別定義了兩種處理模型的介面 the xml handling submodules are 手冊 inte ce section pu...

python xml解析和生成

解析使用xml.etree.elementtree 模組,生成使用xml.dom.minidom模組,elementtree比dom快,dom生成簡單且會自動格式化。xml version 1.0 encoding utf 8 baspools bas basprovider 0 basprovid...