refactor: switch to sdmx api
Mause committed Nov 13, 2023
1 parent 023d9c4 commit 610fbe4
Showing 6 changed files with 97 additions and 182 deletions.
26 changes: 7 additions & 19 deletions saau/sections/age/median.py
@@ -21,25 +21,13 @@ def has_required_data(self):
     def obtain_data(self):
         data = get_generic_data(
             DATASETID,
-            and_=[
-                'FREQUENCY.A',
-                'REGIONTYPE.SA2',
-                'MEASURE.MAGE'
-            ],
-            or_=[
-                'STATE.0',
-                'STATE.1',
-                'STATE.2',
-                'STATE.3',
-                'STATE.4',
-                'STATE.5',
-                'STATE.6',
-                'STATE.7',
-                'STATE.8',
-                'STATE.9'
-            ]
+            and_={
+                'FREQUENCY': 'A',
+                'REGIONTYPE': 'SA2',
+                'MEASURE': 'MAGE',
+                'STATE': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+            }
         )
-        assert data['series']
 
         return self.save_json(FILENAME, data)

@@ -49,7 +37,7 @@ def region_lookup(self, sa3):
     def build_image(self):
         colors = get_cmap('Purples')
 
-        age_data = abs_data_to_dataframe(self.load_json(FILENAME))
+        age_data = abs_data_to_dataframe(self.data_dir_join(FILENAME))
         age_data = [
             (
                 self.region_lookup(data_point.REGION),
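Note: the dict-style and_ filter replaces the old 'CONCEPT.VALUE' strings. For readers unfamiliar with SDMX, here is a rough sketch of how such a dict maps onto an SDMX REST key. This is illustrative only — to_sdmx_key is a hypothetical helper, not part of this codebase or of pandasdmx, which builds the key internally using the dataflow's own dimension order.

    # Hypothetical helper, for illustration of the SDMX key format only.
    def to_sdmx_key(filters, dimension_order):
        parts = []
        for dim in dimension_order:
            value = filters.get(dim, '')      # empty slot = no constraint
            if isinstance(value, list):
                value = '+'.join(value)       # '+' ORs values within one dimension
            parts.append(value)
        return '.'.join(parts)                # '.' separates dimensions

    # to_sdmx_key({'FREQUENCY': 'A', 'STATE': ['0', '1']},
    #             ['FREQUENCY', 'REGIONTYPE', 'MEASURE', 'STATE'])
    # -> 'A...0+1'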
5 changes: 1 addition & 4 deletions saau/sections/ancestry/__init__.py
@@ -76,10 +76,7 @@ def obtain_data(self):
         return self.save_json(self.filename, get_data(self.ancestry_name))
 
     def build_image(self):
-        data = abs_data_to_dataframe(
-            self.load_json(self.filename),
-            ['ANCP', 'FREQUENCY']
-        )
+        data = abs_data_to_dataframe(self.data_dir_join(self.filename))
         data = data[data.pop('Time') == 2011]
         del data['REGIONTYPE']
9 changes: 7 additions & 2 deletions saau/sections/image_provider.py
@@ -48,8 +48,13 @@ def data_dir_join(self, name: PathOrStr) -> str:
         return join(self.data_dir, name)
 
     def save_json(self, name: PathOrStr, data: Any) -> bool:
-        with open(self.data_dir_join(name), 'w') as fh:
-            json.dump(data, fh, indent=4)
+        import pandas as pd
+
+        if isinstance(data, (pd.DataFrame, pd.Series)):
+            data.to_json(self.data_dir_join(name))
+        else:
+            with open(self.data_dir_join(name), 'w') as fh:
+                json.dump(data, fh, indent=4)
         return True
 
     def load_json(self, name: PathOrStr) -> Any:
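Note: save_json now dispatches on type — pandas objects are written via DataFrame.to_json/Series.to_json, anything else still goes through json.dump. A minimal round-trip sketch; the provider instance here is hypothetical:

    import pandas as pd

    # provider is a hypothetical image-provider instance, for illustration.
    provider.save_json('ages.json', pd.DataFrame({'REGION': ['101'], 'MAGE': [38.0]}))
    df = pd.read_json(provider.data_dir_join('ages.json'))  # read back as a DataFrame

    provider.save_json('meta.json', {'source': 'ABS'})  # plain data still uses json.dump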
3 changes: 1 addition & 2 deletions saau/sections/population/density.py
@@ -14,8 +14,7 @@
 
 
 def get_data(data_dir):
-    with open(join(data_dir, filename)) as fh:
-        return abs_data_to_dataframe(json.load(fh))
+    return abs_data_to_dataframe(join(data_dir, filename))
 
 
 def main(services, data_dir):
2 changes: 1 addition & 1 deletion saau/sections/population/male_vs_female.py
@@ -49,7 +49,7 @@ def obtain_data(self):
         return self.save_json(FILENAME, data)
 
     def load_data(self):
-        df = abs_data_to_dataframe(self.load_json(FILENAME))
+        df = abs_data_to_dataframe(self.data_dir_join(FILENAME))
 
         df.SEX_ABS = (
             df.SEX_ABS
234 changes: 80 additions & 154 deletions saau/utils/download/abs.py
@@ -3,119 +3,67 @@
 https://web.archive.org/web/20141026141936/http://stat.abs.gov.au/itt/r.jsp?api
 """
 import sys
-from itertools import chain
-from functools import lru_cache
+from functools import cache
+from os.path import exists  # needed by the new abs_data_to_dataframe below
 
 import pandas
-import requests
-
-BASE = 'https://itt.abs.gov.au/itt/query.jsp'
+from pandasdmx import Request
 
 
 class ABSException(Exception):
     pass
 

-def query(method, params):
-    params.update({
-        'method': method,
-        'format': 'json'
-    })
-    r = requests.get(
-        BASE,
-        params=params
-    )
-    if r.url.endswith('unavailable'):
-        raise ABSException("Service currently down")
-
-    datum = r.json()
-
-    if 'exception' in datum:
-        raise ABSException(datum['exception'])
-
-    return datum
-
-
-def introspect(datasetid):
-    concepts = get_dataset_concepts(datasetid)
-    assert 'concepts' in concepts, concepts
-    concepts = concepts['concepts']
-
-    for concept in concepts:
-        codes = get_codelist_value(datasetid, concept)['codes']
-        print(concept, '->', ', '.join(
-            '[{description}:{code}]'.format_map(code)
-            for code in codes
-        ))
-        input()
-
-
-def validate_query(datasetid, and_=None, or_={}):
-    datasets = get_dataset_list()
-    datasets = [dataset['id'] for dataset in datasets['datasets']]
-    assert datasetid in datasets
-    concepts = get_dataset_concepts(datasetid)['concepts']
-
-    and_ = and_ or []
-    or_ = or_ or []
-
-    if isinstance(and_, list):
-        and_ = [filt.split('.') for filt in and_]
-    if isinstance(or_, list):
-        or_ = [filt.split('.') for filt in or_]
-    if isinstance(and_, dict):
-        and_ = list(and_.items())
-    if isinstance(or_, dict):
-        or_ = list(or_.items())
-
-    for key, value in chain(and_, or_):
-        assert key in concepts
-        valid = [
-            code['code']
-            for code in get_codelist_value(datasetid, key)['codes']
-        ]
-        assert value in valid, '{} not in {}'.format(value, valid)
-
-
-@lru_cache()
-def get_dataset_list():
-    return query('GetDatasetList', {})
-
-
-@lru_cache()
-def get_dataset_concepts(datasetid):
-    return query('GetDatasetConcepts', {'datasetid': datasetid})
-
-
-@lru_cache()
-def get_codelist_value(datasetid, concept, code=None, relationship=None):
-    assert concept.isupper(), 'Concepts are case sensitive'
-    if relationship is not None:
-        assert relationship in {
-            'parent',
-            'children',
-            'parentCode'
-        }
-    if code and isinstance(code, str):
-        code = [code]
-
-    return query(
-        'GetCodeListValue',
-        {
-            'datasetid': datasetid,
-            'concept': concept,
-            'relationship': relationship,
-            'code': code
-        }
-    )
-
-
-commas = ','.join
-
-
-def get_generic_data(datasetid, and_, or_=None, orParent=None, start=None,
-                     end=None, top=None, bottom=None, series=None,
-                     format='json'):
+def introspect(datasetid: str) -> None:
+    from rich_dataframe import DataFramePrettify
+
+    dataflow = rq().dataflow(datasetid, use_cache=True)
+
+    for flow_id, flow in dataflow.structure.items():
+        for dimension in flow.dimensions:
+            enum = dimension.local_representation.enumerated
+            if enum:
+                prett = DataFramePrettify(
+                    pandas.DataFrame(
+                        [
+                            {
+                                "id": code.id,
+                                "name": code.name.localized_default(),
+                                "description": code.description.localized_default(),
+                            }
+                            for code in enum
+                        ]
+                    ),
+                )
+                prett.table.title = f'{flow_id} : {dimension.id} : {enum.id}'
+            else:
+                df = pandas.DataFrame(
+                    [
+                        vars(facet)
+                        for facet in dimension.local_representation.non_enumerated
+                    ]
+                )
+                prett = DataFramePrettify(df)
+                prett.table.title = f'{flow_id} : {dimension.id}'
+            prett.prettify()
+            input()
+
+
+@cache
+def rq():
+    return Request("ABS_XML", use_cache=True)
+
+
+def get_generic_data(
+    datasetid,
+    and_,
+    orParent=None,
+    start=None,
+    end=None,
+    top=None,
+    bottom=None,
+    series=None,
+    format=None,  # "json"
+):
     """
     :param datasetid: Any dataset ID in ABS.Stat. These can be retrieved using
        the GetDatasetList method.
@@ -150,68 +98,46 @@ def get_generic_data(datasetid, and_, or_=None, orParent=None, start=None,
        2010
     :param format: see elsewhere
     """
-    validate_query(datasetid, and_, or_)
-
-    assert top is None or isinstance(top, int)
-    assert bottom is None or isinstance(bottom, int)
-
-    assert isinstance(format, str)
-    assert format in {
-        'csv',
-        'htable',
-        'vtable',
-        'json',
-        'latest',
-        'excel'
-    }
-
-    if isinstance(and_, list):
-        and_ = commas(and_)
-
-    if isinstance(or_, list):
-        or_ = commas(or_)
-
-    return query(
-        'GetGenericData',
-        {
-            'datasetid': datasetid,
-            'and': and_,
-            'or': or_,
-            'orParent': orParent,
-            'start': start,
-            'end': end,
-            'top': top,
-            'bottom': bottom,
-            'series': series,
-            'format': format
-        }
+    if not isinstance(and_, dict):
+        # accept the legacy 'CONCEPT.VALUE' string form as well as a dict
+        and_ = dict(item.split(".") for item in and_)
+
+    res = rq().data(
+        datasetid,
+        key=and_,
+        params={
+            "startPeriod": start,
+            "endPeriod": end,
+            "top": top,
+            "bottom": bottom,
+            "series": series,
+            "format": format,
+        },
+        use_cache=True,
     )
 
-
-def collapse_concepts(concepts):
-    return {t['name']: t['Value'] for t in concepts}
+    return res.to_pandas(attributes="osgd", dtypes_from_dsd=True)


 def abs_data_to_dataframe(data, delete_cols=None):
-    data = [
-        dict(
-            collapse_concepts(locale['concepts']),
-            **observation
-        )
-        for locale in data['series']
-        for observation in locale['observations']
-    ]
-
-    return (
-        pandas.DataFrame(data)
-        .convert_objects(convert_numeric=True)
-        .drop(delete_cols or [], axis=1)
-    )
+    if exists(data):
+        df = pandas.read_json(data)
+    else:
+        df = pandas.read_parquet(data.replace(".json", ".parquet"))
+    if delete_cols:
+        df = df.drop(columns=delete_cols)  # drop columns, matching the old axis=1 behaviour
+    return df


 def main(argv=sys.argv[1:]):
     introspect(argv[0])
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     import coloredlogs
     import logging
 
     coloredlogs.install(level=logging.INFO)
     main()
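Note: after this refactor the whole download path runs through pandasdmx — rq() holds a cached Request for the ABS SDMX endpoint, get_generic_data fetches with a dict key and returns a pandas object via to_pandas, and abs_data_to_dataframe reads previously saved files back from disk. A usage sketch; the dataset ID and dimension names below are illustrative placeholders, not verified ABS identifiers:

    # Hypothetical call under the new API.
    data = get_generic_data(
        'ABS_ANNUAL_ERP_ASGS',
        and_={'REGIONTYPE': 'SA2', 'FREQUENCY': 'A'},
        start='2010',
        end='2011',
    )
    print(data.head())  # a pandas object; no manual JSON unpacking needed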
