JSON Object to CSV File Troubleshooting
Sahlusar
ahlusar.ahluwalia at gmail.com
Sun Jun 21 10:38:13 EDT 2015
It is difficult to explain this to someone asynchronously and without an in person meeting. Moreover, the strict guidelines for disclosing information make it difficult for me to explain the client's requirements and the problems that they face.
I do agree with you, Denis, that this is an unconventional approach. I was wondering, then, whether I should add additional functionality at the XML-to-JSON step. So far, with JSON objects without nested lists (as values) I have been successful with this (the following is rather lengthy):
import xml.etree.cElementTree as ElementTree
from xml.etree.ElementTree import XMLParser
import json
import csv
import tokenize
import token
try:
from collections import OrderedDict
import json
except ImportError:
from ordereddict import OrderedDict
import simplejson as json
import itertools
import six
import string
from csvkit import CSVKitWriter
class XmlListConfig(list):
    """A list built recursively from a run of same-tagged XML siblings.

    Each child element becomes either a nested XmlDictConfig (children with
    differing tags), a nested XmlListConfig (children sharing one tag), or
    the element's stripped text when it has no children.
    """

    def __init__(self, aList):
        for element in aList:
            if element:
                # The element has children. A single child, or two leading
                # children with different tags, means dict-like content;
                # otherwise the children are a homogeneous run -> list.
                if len(element) == 1 or element[0].tag != element[1].tag:
                    self.append(XmlDictConfig(element))
                else:
                    self.append(XmlListConfig(element))
            elif element.text:
                stripped = element.text.strip()
                if stripped:
                    self.append(stripped)
class XmlDictConfig(dict):
    '''
    Recursively convert an ElementTree element into a plain dictionary.

    Example usage:

    >>> tree = ElementTree.parse('your_file.xml')
    >>> root = tree.getroot()
    >>> xmldict = XmlDictConfig(root)

    Or, if you want to use an XML string:

    >>> root = ElementTree.XML(xml_string)
    >>> xmldict = XmlDictConfig(root)

    And then use xmldict for what it is..a dictionary.
    '''

    def __init__(self, parent_element):
        # Attributes of the root element become top-level dict entries.
        if parent_element.items():
            self.update(dict(parent_element.items()))
        for element in parent_element:
            # NOTE(review): truth-testing an Element checks whether it has
            # children (and is deprecated in newer ElementTree versions).
            if element:
                # treat like dict - we assume that if the first two tags
                # in a series are different, then they are all different.
                if len(element) == 1 or element[0].tag != element[1].tag:
                    aDict = XmlDictConfig(element)
                # treat like list - we assume that if the first two tags
                # in a series are the same, then the rest are the same.
                else:
                    # here, we put the list in dictionary; the key is the
                    # tag name the list elements all share in common, and
                    # the value is the list itself
                    aDict = {element[0].tag: XmlListConfig(element)}
                # if the tag has attributes, add those to the dict
                # (attributes overwrite a child entry with the same name)
                if element.items():
                    aDict.update(dict(element.items()))
                self.update({element.tag: aDict})
            # this assumes that if you've got an attribute in a tag,
            # you won't be having any text. This may or may not be a
            # good idea -- time will tell. It works for the way we are
            # currently doing XML configuration files...
            elif element.items():
                self.update({element.tag: dict(element.items())})
            # finally, if there are no child tags and no attributes, extract
            # the text
            else:
                self.update({element.tag: element.text})
def main(xml_path='C:\\Users\\somefile.xml', json_path='data2.json'):
    """Read an XML file, convert it to a nested dict, and write it as JSON.

    Parameters:
        xml_path: path of the UTF-8 XML file to read (default keeps the
            original hard-coded location for backward compatibility).
        json_path: path the resulting JSON document is written to.
    """
    with open(xml_path, 'r', encoding='utf-8') as f:
        xml_string = f.read()
    # Optional cleanup: a bare '&' is not well-formed XML, so strip
    # ampersands before parsing. (The original post had a mojibake
    # character in this literal; the comment said "remove ampersands".)
    xml_string = xml_string.replace('&', '')
    # Parse the XML and recursively convert it starting at the root node.
    root = ElementTree.XML(xml_string)
    xmldict = XmlDictConfig(root)
    # Round-trip through a string purely to normalize key order/formatting.
    json_str = json.dumps(xmldict, sort_keys=True, indent=4,
                          separators=(',', ': '))
    newly_formatted_data = json.loads(json_str)
    with open(json_path, 'w') as f:  # writing JSON file
        json.dump(newly_formatted_data, f)
Peter Otten was very helpful with subsequently converting a JSON string to a CSV file:
import csv
import json
import sys
def hook(obj):
    """Identity ``object_pairs_hook``: keep each JSON object as its raw
    list of (key, value) pairs instead of collapsing it into a dict."""
    return obj
def flatten(obj):
    """Yield (key, value) pairs from a nested pair-list, depth first.

    *obj* is a list of (key, value) pairs as produced by ``json.load`` with
    ``object_pairs_hook``.  A value that is itself a list of pairs (a nested
    JSON object) is recursed into; any other value — including a plain JSON
    array of scalars — is yielded as-is.

    Fix: the original recursed into *every* list value, so a JSON array of
    scalars (e.g. the poster's lists of "0" strings) crashed while trying to
    unpack each scalar as a (k, v) pair.
    """
    for k, v in obj:
        if isinstance(v, list) and all(
                isinstance(item, tuple) and len(item) == 2 for item in v):
            # A list of 2-tuples is a nested JSON object: flatten it.
            yield from flatten(v)
        else:
            yield k, v
if __name__ == "__main__":
    # Load the JSON document keeping each object as a list of (key, value)
    # pairs so flatten() can walk it, then emit one header row followed by
    # one row of values on stdout.
    with open("somefileneame.json") as source:
        data = json.load(source, object_pairs_hook=hook)
    pairs = list(flatten(data))
    writer = csv.writer(sys.stdout)
    header = writer.writerow([key for key, _ in pairs])
    row = writer.writerow([value for _, value in pairs])  # writer.writerows for any other iterable object
However, for the nested keys as dictionaries with values as dictionaries, I have been taking smaller chunks of the JSON objects and developing some functional programs that allow me to blow up lists:
For example for this example JSON snippet, to de-nest by level the nested dictionaries, I use this function:
def flatten(d, parent_key=''):
    """Recursively flatten nested dicts, joining key paths with '_'.

    ``{'a': {'b': 1}} -> {'a_b': 1}``.  Non-dict values (including lists)
    are kept unchanged under their joined key.

    Parameters:
        d: the (possibly nested) dictionary to flatten.
        parent_key: prefix accumulated from the enclosing keys.

    Returns:
        A flat dict mapping underscore-joined key paths to leaf values.

    Fix: the original used ``try/except AttributeError`` around the
    recursive call, which also silently masked AttributeErrors raised
    anywhere deeper in the recursion; an explicit isinstance check keeps
    the same behavior for dict values without swallowing real errors.
    """
    items = []
    for k, v in d.items():
        key = '%s%s' % (parent_key, k)
        if isinstance(v, dict):
            items.extend(flatten(v, key + '_').items())
        else:
            items.append((key, v))
    return dict(items)
# NOTE(review): data2 is defined *below* this call in the post's snippet
# order; as a runnable script, this line must come after the data2 literal.
final = (flatten(data2, parent_key =''))
##JSON sample:
# NOTE(review): this literal contains duplicate keys. "F" appears twice (the
# "False" entry is silently overwritten by the nested dict) and "PBDS"
# appears twice (the DateTime entry is silently overwritten by the Double
# one) — Python keeps only the last value for a repeated key. That is why
# the flattened result shown below has neither F == "False" nor a
# PBDS_DateTime key.
data2 = {
    "OTF": "0",
    "F": "False",  # shadowed: replaced by the nested "F" dict below
    "F": {
        "Int32": ["0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
                  "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0",
                  "0", "0", "0"]
    },
    "D": {
        "B": ["0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0"]
    },
    # shadowed: replaced by the next "PBDS" entry
    "PBDS": {
        "DateTime": ["1/1/0001 12:00:00 AM", "1/1/0001 12:00:00 AM",
                     "1/1/0001 12:00:00 AM", "1/1/0001 12:00:00 AM",
                     "1/1/0001 12:00:00 AM", "1/1/0001 12:00:00 AM",
                     "1/1/0001 12:00:00 AM", "1/1/0001 12:00:00 AM"]
    },
    "PBDS": {
        "Double": ["0", "0", "0", "0", "0", "0", "0", "0"]
    },
    "SCS": {
        "String": ["1", "2"]
    }
}
The result:
{'D_B': ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'],
'F_Int32': ['0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0',
'0'],
'OTF': '0',
'PBDS_Double': ['0', '0', '0', '0', '0', '0', '0', '0'],
'SCS_String': ['1', '2']}
I then have used a decorator to explode the lists (thank you to Gribouillis for feedback):
from postprocess import post_process
@post_process(dict)
def explode_lists(adict):
    """Expand every all-string list value into numbered scalar entries.

    ``{'k': ['a', 'b']} -> {'k1': 'a', 'k2': 'b'}``.  Values that are not
    lists, or lists containing any non-string item, pass through unchanged.
    """
    for key, value in adict.items():
        all_strings = isinstance(value, list) and all(
            isinstance(item, str) for item in value)
        if all_strings:
            for index, item in enumerate(value, 1):
                yield '{}{}'.format(key, index), item
        else:
            yield key, value
# postprocess.py
def post_process(*filters):
"""Decorator to post process a function's return value through a
sequence of filters (functions with a single argument).
Example:
@post_process(f1, f2, f3)
def f(*args, **kwd):
...
return value
then calling f(...) will actually return f3( f2( f1( f(...)))).
This can also be used to convert a generator to a function
returning a sequence type:
@post_process(dict)
def my_generator():
...
yield key, value
"""
def decorate(func):
from functools import wraps
@wraps(func)
def wrapper(*args, **kwd):
rv = func(*args, **kwd)
for f in filters:
rv = f(rv)
return rv
return wrapper
return decorate
I know that this is a lot of sequential steps. I am wondering if I could insert or conditionally apply these functions when originally parsing the XML, so that the JSON is formatted for more recursive reading of the JSON dictionary and then writing to CSV. I welcome constructive feedback for refactoring.
More information about the Python-list
mailing list