Convert xml file to some generic delimited file. Here's a basic python script to convert XML to x delimited files. No warranty is provided, always back up your files.

The awesome lxml library is required for those using the python script.

Download the Windows ( x86 ) installer setup.exe
or
Download the python script flatten_xml.py.

Example usage:
>flatten_xml.exe -i xml_test.xml -o xml_test.out -d "TAB" -l unix
  Automatic row break tag set to < b >
  WARNING: New line character "\n" Removed From Data
  Successful conversion of xml_test.xml to xml_test.out


For help enter "flatten_xml.exe -h"

Back to www.whyhere.net!



Donate


flatten_xml.py
  1 # @Brief:         Convert xml -> delimented ( an xml flattener )
  2 # @Author:        Erik Schweller ( othererik .at. g mail .dot. com )
  3 # @Copyright:     Copyright 2010
  4 #  A two pass system for converting any generic xml file
  5 #  to a delimited file ( flattens an xml file ).
  6 #  Converts both elements and element-attribute
  7 #  pairs to columns.
  8 #  For attributes on each tag, a "1" is placed in column if the attribute
  9 #  is present for the current row, otherwise the cell is left blank.
 10 
 11 # TODO: choose best line ending based on data
 12 # TODO: choose best delimiter based on data
 13 
 14 # python
 15 import getopt
 16 import os
 17 import sys
 18 
 19 # lxml
 20 from lxml import etree
 21 
 22 __version__ = '1.0'
 23 __author__ = 'Erik Schweller'
 24 __date__ = '2010/01/24'
 25 
 26 # GLOBAL DEFAULT SETTINGS
 27 LINE_ENDING_STYLE = 'unix'
 28 DELIMITER = '\t'
 29 WRITE_HEADER = True
 30 # if the follwing is None and no commandline override,
 31 # most repeated tag will be used for row breaks
 32 ROW_BREAK_TAG = None
 33 
 34 class XMLToDelimited( object ):
 35     def __init__( self, row_break_tag = None, line_ending_style = 'unix',
 36         delimiter = '\t', write_header = True, long_attr_name = False ):
 37             ending_style = { 'windows': '\r\n', 'unix': '\n', 'mac': '\r' }
 38 
 39             assert line_ending_style in ending_style, \
 40                 'line_ending_style must be one of %s' % ( str( ending_style.keys() ) )
 41 
 42             self._line_ending = ending_style[ line_ending_style ]
 43             self._delimiter = delimiter
 44             self._write_header = write_header
 45             self._row_break_tag = row_break_tag
 46 
 47             # attributes can be written to header in two ways
 48             self._use_long_attr_name = long_attr_name
 49 
 50             # defined at call to convert_to_delimited
 51             self._out_file = None
 52             self._in_file = None
 53 
 54             # setup warnings
 55             self._warnings = dict()
 56 
 57     def __call__( self, xml_file, output_file = 'output.txt', user_columns = None ):
 58         '''Callable version - directs to convert_to_delimited'''
 59         return self.convert_to_delimited( xml_file, output_file, user_columns )
 60 
 61     # driver
 62     def convert_to_delimited( self, xml_file, output_file = 'output.txt', user_columns = None ):
 63         '''Convert xml_file to output_file'''
 64         assert os.path.exists( xml_file ), 'Error, %s does not exist' % ( xml_file )
 65         try:
 66             try:
 67                 out = open( output_file, 'wb' )
 68             except Exception, err:
 69                 print 'Failed to open %s' % ( output_file )
 70                 print err
 71                 return False
 72 
 73             self._in_file = xml_file # string
 74             self._out_file = out #  stream
 75 
 76             columns = self._find_columns( user_columns )
 77             if len( columns ) == 0:
 78                 print 'No columns found in', xml_file
 79                 return False
 80 
 81             keys = columns.keys()
 82             keys.sort( lambda x, y: cmp( columns[ x ], columns[ y ] ) )
 83             if self._row_break_tag is None:
 84                 self._row_break_tag = keys[ -1 ] # largest count
 85                 print 'Automatic row break tag set to <', self._row_break_tag, '>'
 86             if self._write_header:
 87                 try:
 88                     self._write_file_header( columns )
 89                 except Exception, err:
 90                     print 'Failed to write header to %s' % ( output_file )
 91                     print err
 92                     return False
 93             try:
 94                 self._dump_to_file( columns )
 95             except Exception, err:
 96                 print 'Failed when processing data from %s'  % ( xml_file )
 97                 print err
 98                 return False
 99 
100             for w in self._warnings:
101                 if self._warnings[ w ]:
102                     print 'WARNING:', w
103         finally:
104             # clear warnings, etc.
105             self._warnings = dict()
106         return True
107 
108     # support functions - file out
109     def _write_file_header( self, columns ):
110         '''Writes header to the output file'''
111         for c in sorted( columns ):
112             self._out_file.write( c )
113             self._out_file.write( self._delimiter )
114         self._out_file.write( self._line_ending )
115 
116     def _write_row( self, row, columns ):
117         '''Writes one row at a time to the output file'''
118         for id in sorted( columns ):
119             if id in row:
120                 self._out_file.write( row[ id ] )
121             self._out_file.write( self._delimiter )
122         self._out_file.write( self._line_ending )
123 
124     def _dump_to_file( self, columns ):
125         '''Fill output file with data (  text and attribute values  )'''
126         row = dict()
127         if self._delimiter == '"':
128             comment = "'"
129         else:
130             comment = '"'
131         for item in self._walk_file( self._in_file ):
132             attr = item.attrib
133             tag = item.tag
134             text = item.text
135             # for simplicity, '\n' is removed if discovered
136             if '\n' in text:
137                 text = text.replace( '\n', '' )
138                 self._warnings[ 'New line character "\\n" Removed From Data' ] = True
139             if tag in columns:
140                 # preserves white space if delimiter == whitespace
141                 if self._delimiter in text:
142                     text = comment + text + comment
143                     self._warnings[ 'Current Delimiter Removed From Data' ] = True
144                 # remove newline character and set warning
145                 if text.find( self._line_ending ) != -1:
146                     text = text.replace( self._line_ending, '' )
147                     self._warnings[ 'Current Line Ending Removed From Data' ] = True
148                 row[ tag ] = text if len( text ) > 0 else ''
149 
150             if len( attr ) > 0:
151                 for a in attr:
152                     if self._use_long_attr_name:
153                         atag = '<' + tag + ' ' + a + '=' + "'" +attr[ a ] + "'>"
154                     else:
155                         atag = tag + '_' + a + '_' + attr[ a ]
156                     row[ atag ] = '1'
157 
158             # write the row if
159             if self._row_break_tag  ==  tag and len( row ) > 0:
160                 self._write_row( row, columns )
161                 row = dict()
162 
163     # xml parsing
164     def _walk_file( self, file ):
165         '''Memory friendly walk though an xml file'''
166         # throwing away the event
167         for event, item in etree.iterparse( file ):
168             # do some cleanup to save memory
169             if len( item ) > 0:
170                 previous_item = item.getprevious()
171                 while previous_item is not None and len( previous_item ) > 0:
172                     p = previous_item.getparent()
173                     if len( p ) > 0: p.remove( previous_item )
174                     previous_item = item.getprevious()
175             yield item
176 
177     def _find_columns( self, user_columns = None ):
178         '''Scan xml for all elements and attributes
179            populates dict with potential data colums
180            filters if user_columns is not None'''
181         columns = dict()
182         for item in self._walk_file( self._in_file ):
183             # now read the tags
184             # attributes
185             attr = item.attrib
186             tag = item.tag
187 
188             # attributes are placed in columns as tag_attr_value
189             if len( attr ) > 0:
190                 for a in attr:
191                     if self._use_long_attr_name:
192                         atag = '<' + tag + ' ' + a + '=' + "'" +attr[ a ] + "'>"
193                     else:
194                         atag = tag + '_' + a + '_' + attr[ a ]
195                     if atag not in columns:
196                         columns[ atag ] = 1
197                     else:
198                         columns[ atag ] += 1
199             if tag not in columns:
200                 columns[ tag ] = 1
201             else:
202                 columns[ tag ] += 1
203 
204         # TODO: filter on base tag, not attribute loaded
205         # apply user filter
206         if user_columns is not None:
207             for c in columns.keys():
208                 if c.lower() not in user_columns:
209                     del columns[ c ]
210         return columns
211 
212 def usage():
213     '''Describes input arguments'''
214     ex = os.path.basename( sys.argv[ 0 ] )
215     if ex.endswith( '.py' ):
216         ex = 'python ' + ex
217     print '''
218 ---XML Flattener---
219 usage:\t''' + ex + ''' -i in_file -o out_file [ options ]
220 
221 Required arguments:
222 -i --in_file           : Path to input file
223 -o --out_file          : Path to output file
224 
225 Optional arguments:
226 -h --help              : Display this message
227 
228 -c --columns           : List of tags to include in the
229                           output seperated by commas with
230                           no spaces (  e.g., "item,price,qty"  )
231 
232 -d --delimiter         : Delimeter character surrounded by "
233                           and escaped (  e.g., "\\t", or "," ).
234                           "TAB", "SPACE", and "COMMMA" keywords
235                           may be used. To use "\\" as delimiter, enter "\\\\".
236 
237 -l --line_ending_style : Line ending style ( e.g.,  "windows", "unix", "mac"  )
238 
239 -r --row_break_tag     : Tag on which to create a new row in output file
240                           If not selected, a new row will be made on most
241                           frequent tag.
242 
243 -n --long_attr_names   : If set, attribures shown in xml-like form in column
244                           header, else form is "tag_attribute_value"
245 '''
246 
247 def main( argv ):
248     '''Entry point for script based usage
249        requires -i and -o arguments'''
250     try:
251         # not using args
252         opts, args = getopt.getopt( argv, 'i:o:hc:d:l:r:n', [ 'in_file=', 'out_file=',
253          'help', 'columns=', 'delimiter=', 'line_ending_style=', 'row_break_tag=',
254          'long_attr_names' ] )
255     except getopt.GetoptError, err:
256         print str( err )
257         usage()
258         sys.exit( 2 )
259 
260     if len( opts ) < 2 and '-h' not in opts and '--help' not in opts:
261         usage()
262         sys.exit( 2 )
263 
264     global WRITE_HEADER
265     global LINE_ENDING_STYLE
266     global DELIMITER
267     global ROW_BREAK_TAG
268 
269     delimit_map = { 'SPACE': ' ', 'COMMA': ',', 'TAB': '\t', \
270                     r'\t': '\t', r'\n': '\n', r'\c': '\c', r'\r': '\r' }
271 
272     # set defaults
273     wh = WRITE_HEADER
274     les = LINE_ENDING_STYLE
275     d = DELIMITER
276     rbt = ROW_BREAK_TAG
277     lan = False
278     in_file = None
279     out_file = None
280     columns = None
281 
282     # process commandline inputs, step on options where set
283     for opt, arg in opts:
284         if opt in ( '-h', '--help' ):
285             usage()
286             sys.exit()
287         elif opt in ( '-c', '--columns' ):
288             columns = arg
289             columns = [ c.lower() for c in columns.split( ',' ) if len( c ) > 0 ]
290         elif opt in ( '-r', '--row_break_tag' ):
291             rbt = arg
292         elif opt in ( '-d', '--delimiter' ):
293             d = arg
294             if '"' in d:
295                 d = arg[ 1:-1 ]
296             if d in delimit_map:
297                 d = delimit_map[ d ]
298         elif opt in ( '-l', '--line_ending_style' ):
299             les = arg
300         elif opt in ( '-n', '--long_attr_names' ):
301             lan = True
302         elif opt in ( '-i', '--in_file' ):
303             in_file = arg
304         elif opt in ( '-o', '--out_file' ):
305             out_file = arg
306         else:
307             assert False, "Error: unhandled option"
308 
309     assert len( d ) == 1, 'Delimeter must be a single character'
310     assert in_file is not None, 'Must enter an input file name'
311     assert out_file is not None, 'must enter an output file name'
312     # create instance of converter with default settings
313     converter = XMLToDelimited( row_break_tag = rbt, line_ending_style = les,
314         delimiter = d, write_header = wh , long_attr_name = lan)
315 
316     # use converter for files
317     # default settings may be changed by creating a new converter instance
318     res = converter( in_file, out_file, user_columns = columns )
319     if res:
320         print 'Successful conversion of %s to %s' % ( in_file, out_file )
321     else:
322         print 'Conversion of %s failed' % ( in_file )
323 
324 if __name__ == '__main__':
325     '''Command line run'''
326     if len( sys.argv ) == 1:
327         usage()
328         print 'This is a commandline program only.\nPress RETURN to exit.'
329         raw_input()
330     else:
331         main( sys.argv[ 1: ] )
332 
333