Convert xml file to some generic delimited file.
Here's a basic python script to convert XML to x delimited files. No warranty is provided, always back up your files.
The awesome lxml library is required for those using the python script.
Download the Windows ( x86 ) installer setup.exe
or
Download the python script flatten_xml.py.
Example usage:
>flatten_xml.exe -i xml_test.xml -o xml_test.out -d "TAB" -l unix
Automatic row break tag set to < b >
WARNING: New line character "\n" Removed From Data
Successful conversion of xml_test.xml to xml_test.out
For help enter "flatten_xml.exe -h"
Back to www.whyhere.net!
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 import getopt
16 import os
17 import sys
18
19
20 from lxml import etree
21
22 __version__ = '1.0'
23 __author__ = 'Erik Schweller'
24 __date__ = '2010/01/24'
25
26
27 LINE_ENDING_STYLE = 'unix'
28 DELIMITER = '\t'
29 WRITE_HEADER = True
30
31
32 ROW_BREAK_TAG = None
33
34 class XMLToDelimited( object ):
35 def __init__( self, row_break_tag = None, line_ending_style = 'unix',
36 delimiter = '\t', write_header = True, long_attr_name = False ):
37 ending_style = { 'windows': '\r\n', 'unix': '\n', 'mac': '\r' }
38
39 assert line_ending_style in ending_style, \
40 'line_ending_style must be one of %s' % ( str( ending_style.keys() ) )
41
42 self._line_ending = ending_style[ line_ending_style ]
43 self._delimiter = delimiter
44 self._write_header = write_header
45 self._row_break_tag = row_break_tag
46
47
48 self._use_long_attr_name = long_attr_name
49
50
51 self._out_file = None
52 self._in_file = None
53
54
55 self._warnings = dict()
56
57 def __call__( self, xml_file, output_file = 'output.txt', user_columns = None ):
58 '''Callable version - directs to convert_to_delimited'''
59 return self.convert_to_delimited( xml_file, output_file, user_columns )
60
61
62 def convert_to_delimited( self, xml_file, output_file = 'output.txt', user_columns = None ):
63 '''Convert xml_file to output_file'''
64 assert os.path.exists( xml_file ), 'Error, %s does not exist' % ( xml_file )
65 try:
66 try:
67 out = open( output_file, 'wb' )
68 except Exception, err:
69 print 'Failed to open %s' % ( output_file )
70 print err
71 return False
72
73 self._in_file = xml_file
74 self._out_file = out
75
76 columns = self._find_columns( user_columns )
77 if len( columns ) == 0:
78 print 'No columns found in', xml_file
79 return False
80
81 keys = columns.keys()
82 keys.sort( lambda x, y: cmp( columns[ x ], columns[ y ] ) )
83 if self._row_break_tag is None:
84 self._row_break_tag = keys[ -1 ]
85 print 'Automatic row break tag set to <', self._row_break_tag, '>'
86 if self._write_header:
87 try:
88 self._write_file_header( columns )
89 except Exception, err:
90 print 'Failed to write header to %s' % ( output_file )
91 print err
92 return False
93 try:
94 self._dump_to_file( columns )
95 except Exception, err:
96 print 'Failed when processing data from %s' % ( xml_file )
97 print err
98 return False
99
100 for w in self._warnings:
101 if self._warnings[ w ]:
102 print 'WARNING:', w
103 finally:
104
105 self._warnings = dict()
106 return True
107
108
109 def _write_file_header( self, columns ):
110 '''Writes header to the output file'''
111 for c in sorted( columns ):
112 self._out_file.write( c )
113 self._out_file.write( self._delimiter )
114 self._out_file.write( self._line_ending )
115
116 def _write_row( self, row, columns ):
117 '''Writes one row at a time to the output file'''
118 for id in sorted( columns ):
119 if id in row:
120 self._out_file.write( row[ id ] )
121 self._out_file.write( self._delimiter )
122 self._out_file.write( self._line_ending )
123
124 def _dump_to_file( self, columns ):
125 '''Fill output file with data ( text and attribute values )'''
126 row = dict()
127 if self._delimiter == '"':
128 comment = "'"
129 else:
130 comment = '"'
131 for item in self._walk_file( self._in_file ):
132 attr = item.attrib
133 tag = item.tag
134 text = item.text
135
136 if '\n' in text:
137 text = text.replace( '\n', '' )
138 self._warnings[ 'New line character "\\n" Removed From Data' ] = True
139 if tag in columns:
140
141 if self._delimiter in text:
142 text = comment + text + comment
143 self._warnings[ 'Current Delimiter Removed From Data' ] = True
144
145 if text.find( self._line_ending ) != -1:
146 text = text.replace( self._line_ending, '' )
147 self._warnings[ 'Current Line Ending Removed From Data' ] = True
148 row[ tag ] = text if len( text ) > 0 else ''
149
150 if len( attr ) > 0:
151 for a in attr:
152 if self._use_long_attr_name:
153 atag = '<' + tag + ' ' + a + '=' + "'" +attr[ a ] + "'>"
154 else:
155 atag = tag + '_' + a + '_' + attr[ a ]
156 row[ atag ] = '1'
157
158
159 if self._row_break_tag == tag and len( row ) > 0:
160 self._write_row( row, columns )
161 row = dict()
162
163
164 def _walk_file( self, file ):
165 '''Memory friendly walk though an xml file'''
166
167 for event, item in etree.iterparse( file ):
168
169 if len( item ) > 0:
170 previous_item = item.getprevious()
171 while previous_item is not None and len( previous_item ) > 0:
172 p = previous_item.getparent()
173 if len( p ) > 0: p.remove( previous_item )
174 previous_item = item.getprevious()
175 yield item
176
177 def _find_columns( self, user_columns = None ):
178 '''Scan xml for all elements and attributes
179 populates dict with potential data colums
180 filters if user_columns is not None'''
181 columns = dict()
182 for item in self._walk_file( self._in_file ):
183
184
185 attr = item.attrib
186 tag = item.tag
187
188
189 if len( attr ) > 0:
190 for a in attr:
191 if self._use_long_attr_name:
192 atag = '<' + tag + ' ' + a + '=' + "'" +attr[ a ] + "'>"
193 else:
194 atag = tag + '_' + a + '_' + attr[ a ]
195 if atag not in columns:
196 columns[ atag ] = 1
197 else:
198 columns[ atag ] += 1
199 if tag not in columns:
200 columns[ tag ] = 1
201 else:
202 columns[ tag ] += 1
203
204
205
206 if user_columns is not None:
207 for c in columns.keys():
208 if c.lower() not in user_columns:
209 del columns[ c ]
210 return columns
211
212 def usage():
213 '''Describes input arguments'''
214 ex = os.path.basename( sys.argv[ 0 ] )
215 if ex.endswith( '.py' ):
216 ex = 'python ' + ex
217 print '''
218 ---XML Flattener---
219 usage:\t''' + ex + ''' -i in_file -o out_file [ options ]
220
221 Required arguments:
222 -i --in_file : Path to input file
223 -o --out_file : Path to output file
224
225 Optional arguments:
226 -h --help : Display this message
227
228 -c --columns : List of tags to include in the
229 output seperated by commas with
230 no spaces ( e.g., "item,price,qty" )
231
232 -d --delimiter : Delimeter character surrounded by "
233 and escaped ( e.g., "\\t", or "," ).
234 "TAB", "SPACE", and "COMMMA" keywords
235 may be used. To use "\\" as delimiter, enter "\\\\".
236
237 -l --line_ending_style : Line ending style ( e.g., "windows", "unix", "mac" )
238
239 -r --row_break_tag : Tag on which to create a new row in output file
240 If not selected, a new row will be made on most
241 frequent tag.
242
243 -n --long_attr_names : If set, attribures shown in xml-like form in column
244 header, else form is "tag_attribute_value"
245 '''
246
247 def main( argv ):
248 '''Entry point for script based usage
249 requires -i and -o arguments'''
250 try:
251
252 opts, args = getopt.getopt( argv, 'i:o:hc:d:l:r:n', [ 'in_file=', 'out_file=',
253 'help', 'columns=', 'delimiter=', 'line_ending_style=', 'row_break_tag=',
254 'long_attr_names' ] )
255 except getopt.GetoptError, err:
256 print str( err )
257 usage()
258 sys.exit( 2 )
259
260 if len( opts ) < 2 and '-h' not in opts and '--help' not in opts:
261 usage()
262 sys.exit( 2 )
263
264 global WRITE_HEADER
265 global LINE_ENDING_STYLE
266 global DELIMITER
267 global ROW_BREAK_TAG
268
269 delimit_map = { 'SPACE': ' ', 'COMMA': ',', 'TAB': '\t', \
270 r'\t': '\t', r'\n': '\n', r'\c': '\c', r'\r': '\r' }
271
272
273 wh = WRITE_HEADER
274 les = LINE_ENDING_STYLE
275 d = DELIMITER
276 rbt = ROW_BREAK_TAG
277 lan = False
278 in_file = None
279 out_file = None
280 columns = None
281
282
283 for opt, arg in opts:
284 if opt in ( '-h', '--help' ):
285 usage()
286 sys.exit()
287 elif opt in ( '-c', '--columns' ):
288 columns = arg
289 columns = [ c.lower() for c in columns.split( ',' ) if len( c ) > 0 ]
290 elif opt in ( '-r', '--row_break_tag' ):
291 rbt = arg
292 elif opt in ( '-d', '--delimiter' ):
293 d = arg
294 if '"' in d:
295 d = arg[ 1:-1 ]
296 if d in delimit_map:
297 d = delimit_map[ d ]
298 elif opt in ( '-l', '--line_ending_style' ):
299 les = arg
300 elif opt in ( '-n', '--long_attr_names' ):
301 lan = True
302 elif opt in ( '-i', '--in_file' ):
303 in_file = arg
304 elif opt in ( '-o', '--out_file' ):
305 out_file = arg
306 else:
307 assert False, "Error: unhandled option"
308
309 assert len( d ) == 1, 'Delimeter must be a single character'
310 assert in_file is not None, 'Must enter an input file name'
311 assert out_file is not None, 'must enter an output file name'
312
313 converter = XMLToDelimited( row_break_tag = rbt, line_ending_style = les,
314 delimiter = d, write_header = wh , long_attr_name = lan)
315
316
317
318 res = converter( in_file, out_file, user_columns = columns )
319 if res:
320 print 'Successful conversion of %s to %s' % ( in_file, out_file )
321 else:
322 print 'Conversion of %s failed' % ( in_file )
323
324 if __name__ == '__main__':
325 '''Command line run'''
326 if len( sys.argv ) == 1:
327 usage()
328 print 'This is a commandline program only.\nPress RETURN to exit.'
329 raw_input()
330 else:
331 main( sys.argv[ 1: ] )
332
333