From 610fbe4d5cd7b9b834c97683c71b9f21540fd45c Mon Sep 17 00:00:00 2001
From: Elliana May
Date: Mon, 13 Nov 2023 11:26:36 +0800
Subject: [PATCH] refactor: switch to sdmx api

---
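Notes: with this refactor, get_generic_data() takes all dimension filters as
a single dict (the old and_/or_ dotted-string lists are gone) and returns a
pandas object built by pandasdmx rather than raw ABS.Stat JSON. A minimal
calling sketch, mirroring the median.py hunk below; the dataset id here is a
placeholder, not a real ABS dataflow:

    from saau.utils.download.abs import get_generic_data

    df = get_generic_data(
        'SOME_DATASET_ID',        # placeholder; discover real ids via introspect()
        and_={
            'FREQUENCY': 'A',     # annual observations
            'REGIONTYPE': 'SA2',  # Statistical Area Level 2
            'MEASURE': 'MAGE',    # median age
            'STATE': ['0', '1'],  # a list requests several codes for one dimension
        },
        start='2011',
        end='2011',
    )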
 saau/sections/age/median.py                |  26 +--
 saau/sections/ancestry/__init__.py         |   5 +-
 saau/sections/image_provider.py            |   9 +-
 saau/sections/population/density.py        |   3 +-
 saau/sections/population/male_vs_female.py |   2 +-
 saau/utils/download/abs.py                 | 234 +++++++--------------
 6 files changed, 97 insertions(+), 182 deletions(-)

diff --git a/saau/sections/age/median.py b/saau/sections/age/median.py
index dfb6fea..0cd4800 100644
--- a/saau/sections/age/median.py
+++ b/saau/sections/age/median.py
@@ -21,25 +21,13 @@ def has_required_data(self):

     def obtain_data(self):
         data = get_generic_data(
             DATASETID,
-            and_=[
-                'FREQUENCY.A',
-                'REGIONTYPE.SA2',
-                'MEASURE.MAGE'
-            ],
-            or_=[
-                'STATE.0',
-                'STATE.1',
-                'STATE.2',
-                'STATE.3',
-                'STATE.4',
-                'STATE.5',
-                'STATE.6',
-                'STATE.7',
-                'STATE.8',
-                'STATE.9'
-            ]
+            and_={
+                'FREQUENCY': 'A',
+                'REGIONTYPE': 'SA2',
+                'MEASURE': 'MAGE',
+                'STATE': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+            }
         )
-        assert data['series']

         return self.save_json(FILENAME, data)
@@ -49,7 +37,7 @@ def region_lookup(self, sa3):

     def build_image(self):
         colors = get_cmap('Purples')
-        age_data = abs_data_to_dataframe(self.load_json(FILENAME))
+        age_data = abs_data_to_dataframe(self.data_dir_join(FILENAME))
         age_data = [
             (
                 self.region_lookup(data_point.REGION),
diff --git a/saau/sections/ancestry/__init__.py b/saau/sections/ancestry/__init__.py
index c1e2944..545803c 100644
--- a/saau/sections/ancestry/__init__.py
+++ b/saau/sections/ancestry/__init__.py
@@ -76,10 +76,7 @@ def obtain_data(self):
         return self.save_json(self.filename, get_data(self.ancestry_name))

     def build_image(self):
-        data = abs_data_to_dataframe(
-            self.load_json(self.filename),
-            ['ANCP', 'FREQUENCY']
-        )
+        data = abs_data_to_dataframe(self.data_dir_join(self.filename))

         data = data[data.pop('Time') == 2011]
         del data['REGIONTYPE']
diff --git a/saau/sections/image_provider.py b/saau/sections/image_provider.py
index c7fa4d3..bdf9962 100644
--- a/saau/sections/image_provider.py
+++ b/saau/sections/image_provider.py
@@ -48,8 +48,13 @@ def data_dir_join(self, name: PathOrStr) -> str:
         return join(self.data_dir, name)

     def save_json(self, name: PathOrStr, data: Any) -> bool:
-        with open(self.data_dir_join(name), 'w') as fh:
-            json.dump(data, fh, indent=4)
+        import pandas as pd
+
+        if isinstance(data, (pd.DataFrame, pd.Series)):
+            data.to_json(self.data_dir_join(name))
+        else:
+            with open(self.data_dir_join(name), 'w') as fh:
+                json.dump(data, fh, indent=4)
         return True

     def load_json(self, name: PathOrStr) -> Any:
diff --git a/saau/sections/population/density.py b/saau/sections/population/density.py
index 2e7dfbc..b108bb7 100644
--- a/saau/sections/population/density.py
+++ b/saau/sections/population/density.py
@@ -14,8 +14,7 @@


 def get_data(data_dir):
-    with open(join(data_dir, filename)) as fh:
-        return abs_data_to_dataframe(json.load(fh))
+    return abs_data_to_dataframe(join(data_dir, filename))


 def main(services, data_dir):
diff --git a/saau/sections/population/male_vs_female.py b/saau/sections/population/male_vs_female.py
index ff9e05d..833a88b 100644
--- a/saau/sections/population/male_vs_female.py
+++ b/saau/sections/population/male_vs_female.py
@@ -49,7 +49,7 @@ def obtain_data(self):
         return self.save_json(FILENAME, data)

     def load_data(self):
-        df = abs_data_to_dataframe(self.load_json(FILENAME))
+        df = abs_data_to_dataframe(self.data_dir_join(FILENAME))

         df.SEX_ABS = (
             df.SEX_ABS
diff --git a/saau/utils/download/abs.py b/saau/utils/download/abs.py
index c2ae622..ccae417 100644
--- a/saau/utils/download/abs.py
+++ b/saau/utils/download/abs.py
@@ -3,119 +3,67 @@
 https://web.archive.org/web/20141026141936/http://stat.abs.gov.au/itt/r.jsp?api
 """
 import sys
-from itertools import chain
-from functools import lru_cache
+from functools import cache
+from os.path import exists

 import pandas
-import requests
-
-BASE = 'https://itt.abs.gov.au/itt/query.jsp'
+from pandasdmx import Request


 class ABSException(Exception):
     pass


-def query(method, params):
-    params.update({
-        'method': method,
-        'format': 'json'
-    })
-    r = requests.get(
-        BASE,
-        params=params
-    )
-    if r.url.endswith('unavailable'):
-        raise ABSException("Service currently down")
-
-    datum = r.json()
-
-    if 'exception' in datum:
-        raise ABSException(datum['exception'])
-
-    return datum
-
-
-def introspect(datasetid):
-    concepts = get_dataset_concepts(datasetid)
-    assert 'concepts' in concepts, concepts
-    concepts = concepts['concepts']
-
-    for concept in concepts:
-        codes = get_codelist_value(datasetid, concept)['codes']
-        print(concept, '->', ', '.join(
-            '[{description}:{code}]'.format_map(code)
-            for code in codes
-        ))
-        input()
-
-
-def validate_query(datasetid, and_=None, or_={}):
-    datasets = get_dataset_list()
-    datasets = [dataset['id'] for dataset in datasets['datasets']]
-    assert datasetid in datasets
-    concepts = get_dataset_concepts(datasetid)['concepts']
-
-    and_ = and_ or []
-    or_ = or_ or []
-
-    if isinstance(and_, list):
-        and_ = [filt.split('.') for filt in and_]
-    if isinstance(or_, list):
-        or_ = [filt.split('.') for filt in or_]
-    if isinstance(and_, dict):
-        and_ = list(and_.items())
-    if isinstance(or_, dict):
-        or_ = list(or_.items())
-
-    for key, value in chain(and_, or_):
-        assert key in concepts
-        valid = [
-            code['code']
-            for code in get_codelist_value(datasetid, key)['codes']
-        ]
-        assert value in valid, '{} not in {}'.format(value, valid)
-
-
-@lru_cache()
-def get_dataset_list():
-    return query('GetDatasetList', {})
-
-
-@lru_cache()
-def get_dataset_concepts(datasetid):
-    return query('GetDatasetConcepts', {'datasetid': datasetid})
-
-
-@lru_cache()
-def get_codelist_value(datasetid, concept, code=None, relationship=None):
-    assert concept.isupper(), 'Concepts are case sensitive'
-    if relationship is not None:
-        assert relationship in {
-            'parent',
-            'children',
-            'parentCode'
-        }
-    if code and isinstance(code, str):
-        code = [code]
-
-    return query(
-        'GetCodeListValue',
-        {
-            'datasetid': datasetid,
-            'concept': concept,
-            'relationship': relationship,
-            'code': code
-        }
-    )
-
-
-commas = ','.join
-
-
-def get_generic_data(datasetid, and_, or_=None, orParent=None, start=None,
-                     end=None, top=None, bottom=None, series=None,
-                     format='json'):
+def introspect(datasetid: str) -> None:
+    from rich_dataframe import DataFramePrettify
+
+    dataflow = rq().dataflow(datasetid, use_cache=True)
+
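+    # Editor's note (assumes pandasdmx v1 semantics): the structure message
+    # maps dataflow ids to data structure definitions, and each dimension is
+    # either enumerated (backed by a codelist of valid codes) or described by
+    # non-enumerated facets such as data type, hence the two branches below.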
+    for flow_id, flow in dataflow.structure.items():
+        for dimension in flow.dimensions:
+            enum = dimension.local_representation.enumerated
+            if enum:
+                prett = DataFramePrettify(
+                    pandas.DataFrame(
+                        [
+                            {
+                                "id": code.id,
+                                "name": code.name.localized_default(),
+                                "description": code.description.localized_default(),
+                            }
+                            for code in enum
+                        ]
+                    ),
+                )
+                prett.table.title = f'{flow_id} : {dimension.id} : {enum.id}'
+            else:
+                df = pandas.DataFrame(
+                    [
+                        vars(facet)
+                        for facet in dimension.local_representation.non_enumerated
+                    ]
+                )
+                prett = DataFramePrettify(df)
+                prett.table.title = f'{flow_id} : {dimension.id}'
+            prett.prettify()
+            input()
+
+
+@cache
+def rq():
+    return Request("ABS_XML", use_cache=True)
+
+
+def get_generic_data(
+    datasetid,
+    and_,
+    orParent=None,
+    start=None,
+    end=None,
+    top=None,
+    bottom=None,
+    series=None,
+    format=None,  # "json",
+):
     """
     :param datasetid: Any dataset ID in ABS.Stat. These can be retrieved
     using the GetDatasetList method.
@@ -150,68 +98,46 @@ def get_generic_data(datasetid, and_, or_=None, orParent=None, start=None,
     2010
     :param format: see elsewhere
     """
-    validate_query(datasetid, and_, or_)
     assert top is None or isinstance(top, int)
     assert bottom is None or isinstance(bottom, int)
-    assert isinstance(format, str)
-    assert format in {
-        'csv',
-        'htable',
-        'vtable',
-        'json',
-        'latest',
-        'excel'
-    }

-    if isinstance(and_, list):
-        and_ = commas(and_)
-
-    if isinstance(or_, list):
-        or_ = commas(or_)
-
-    return query(
-        'GetGenericData',
-        {
-            'datasetid': datasetid,
-            'and': and_,
-            'or': or_,
-            'orParent': orParent,
-            'start': start,
-            'end': end,
-            'top': top,
-            'bottom': bottom,
-            'series': series,
-            'format': format
-        }
-    )
+    if not isinstance(and_, dict):
+        and_ = dict(item.split(".") for item in and_)
+
+    res = rq().data(
+        datasetid,
+        key=and_,
+        params={
+            "startPeriod": start,
+            "endPeriod": end,
+            "top": top,
+            "bottom": bottom,
+            "series": series,
+            "format": format,
+        },
+        use_cache=True,
+    )
+    return res.to_pandas(attributes="osgd", dtypes_from_dsd=True)


-def collapse_concepts(concepts):
-    return {t['name']: t['Value'] for t in concepts}
-
-
 def abs_data_to_dataframe(data, delete_cols=None):
-    data = [
-        dict(
-            collapse_concepts(locale['concepts']),
-            **observation
-        )
-        for locale in data['series']
-        for observation in locale['observations']
-    ]
-
-    return (
-        pandas.DataFrame(data)
-        .convert_objects(convert_numeric=True)
-        .drop(delete_cols or [], axis=1)
-    )
+    if exists(data):
+        df = pandas.read_json(data)
+    else:
+        df = pandas.read_parquet(data.replace(".json", ".parquet"))
+    if delete_cols:
+        df = df.drop(delete_cols, axis=1)
+    return df


 def main(argv=sys.argv[1:]):
     introspect(argv[0])


-if __name__ == '__main__':
+if __name__ == "__main__":
+    import coloredlogs
+    import logging
+
+    coloredlogs.install(level=logging.INFO)
     main()
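
The module also doubles as a small CLI for browsing a dataflow's dimensions
and codes (a sketch; the dataflow id is a placeholder, and this assumes the
saau package is importable):

    $ python -m saau.utils.download.abs SOME_DATAFLOW_ID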