Printing Out Spreadsheet Cell Values by (Hierarchical) Column Using pandas

Building on from Wrangling Complex Spreadsheet Column Headers, I’ve been hacking the spreadsheet published here a bit more so that I can print out each column value from each sheet in a given spreadsheet for a particular local authority (that is, particular key value in a particular column), to get an output of the form:

Housing_Data_printrow

(I guess I could add a print suppressor to only print statements where the value is not 0?)

The original notebook can be found here.

The major novelty over the previous post is the colmapbuilder() function that generates a nested dict from a group of hierarchical column names that terminates with either the column code or the cell value for that column and a given row selector (I need to tidy up the function args…)

import pandas as pd
dfx=pd.ExcelFile('Local_Authority_Housing_Statistics_dataset_2013-14.xlsx')

#Menu sheet parse to identify sheets A-I
import re
def getSheetDetails(dfx):
    sd=re.compile(r'Section (\w) - (.*)$')
    sheetDetails={}
    for row in dfx.parse('Menu')[[1]].values:
        if str(row[0]).startswith('Section'):
            sheetDetails[sd.match(row[0]).group(1)]=sd.match(row[0]).group(2)
    return sheetDetails

def dfgrabber(dfx,sheet):
    #First pass - identify row for headers
    df=dfx.parse(sheet,header=None)
    df=df.dropna(how='all')
    row = df[df.apply(lambda x: (x == "DCLG code").any(), axis=1)].index.tolist()[0]#.values[0] # will be an array
    #Second pass - generate dataframe
    df=dfx.parse(sheet,header=row).dropna(how='all').dropna(how='all',axis=1)
    df=df[df['DCLG code'].notnull()].reset_index(drop=True)
    df.columns=[c.split(' ')[0] for c in df.columns]
    return df,row


import collections
def coldecoder(dfx,sheet,row):
    zz=dfx.parse(sheet,header=None)
    stitle=zz[0][[0]][0]
    
    xx=zz[1:row].dropna(how='all')
    #Fill down
    xx.fillna(method='ffill', axis=0,inplace=True)
    #Fill across
    xx=xx.fillna(method='ffill', axis=1)
    #How many rows in the header?
    keydepth=len(xx)
    header=[i for i in range(0,keydepth)]

    xx=xx.append(zz[row:row+1])
    xx.to_csv('multi_index.csv',header=False,index=False,encoding='utf-8')
    mxx=pd.read_csv('multi_index.csv',header=header,encoding='utf-8')
    for c in mxx.columns.get_level_values(0).tolist():
        if c.startswith('Unnamed'):
            mxx = mxx.drop(c, level=0, axis=1)
    #We need to preserve the order of the header columns
    dd=mxx.to_dict(orient='split')
    ddz=zip(dd['columns'],dd['data'][0])
    keyx=collections.OrderedDict() #{}
    for r in ddz:
        if not pd.isnull(r[1]):
            #print r[1].split(' ')[0]
            keyx[r[1].split(' ')[0]]=r[0]
    return stitle,keyx,keydepth

#Based on http://stackoverflow.com/a/10756547/454773
def myprint(d,l=None):
  if l is None: l=''
  for k, v in d.iteritems():
    if isinstance(v, dict):
      print("{}{}".format(l,k))
      myprint(v,l=l+'-')
    else:
      print "{0} {1} : {2}".format(l,k.encode('utf-8'), v)

def colmapbuilder(dfx,sheet,code=None,retval=True):
    df,row=dfgrabber(dfx,sheet)
    sname,skey,kd=coldecoder(dfx,sheet,row)
    kq=collections.OrderedDict() #{}
    for k in skey:
        kq[k]=[]
        for j in skey[k]:
            if j not in kq[k]: kq[k].append(j)
    colmapper=collections.OrderedDict() #{}
    for kkq in kq:
        curr_level = colmapper
        depth=0
        for path in kq[kkq]:
            depth=depth+1
            if path not in curr_level:
                if depth<len(kq[kkq]):
                    curr_level[path] = collections.OrderedDict() #{}
                    curr_level = curr_level[path]
                else:
                    if retval and code is not None:
                        curr_level[path] = df[df['Current\nONS']==code][kkq].iloc[0]
                    else:
                        curr_level[path] = kkq
            else:
                curr_level = curr_level[path]
    return sname, colmapper

ll=dfx.sheet_names
ll.remove('Menu')
for lll in ll:
    sname,cmb=colmapbuilder(dfx,lll,'E06000046')
    print(sname+'\n')
    myprint(cmb)
    print('\n\n')

I’m not sure how this helps, other than demonstrating how we might be able to quickly generate a crude textualisation of values in a single row in spreadsheet with a complex set of hierarchical column names?

The code is also likely to be brittle, so the main questions are:

– is the method reusable?
– can the code/approach be generalised or at least made a little bit more robust and capable of handling other spreadsheets with particular properties? (And then – what properties, and how might we be able to detect those properties?)

Author: Tony Hirst

I'm a Senior Lecturer at The Open University, with an interest in #opendata policy and practice, as well as general web tinkering...

%d bloggers like this: