From 1d55a9df0ee134cc744e924cb51db9265681c674 Mon Sep 17 00:00:00 2001 From: "Gonzabato, Nelson" Date: Sun, 9 Oct 2022 16:34:13 +0200 Subject: [PATCH 1/2] Extend reqs, move to joblib --- changelog.md | 3 + example_notebooks/pyfdc_example.ipynb | 372 ++++++++++++++++++++++++++ pyfdc/pyfdc.py | 86 +++--- requirements.txt | 2 + 4 files changed, 426 insertions(+), 37 deletions(-) create mode 100644 example_notebooks/pyfdc_example.ipynb diff --git a/changelog.md b/changelog.md index 940ccd8..f9a8410 100644 --- a/changelog.md +++ b/changelog.md @@ -2,6 +2,9 @@ **Version 0.2.3** +* We now use `joblib`'s 'loky' backend for processing the results in parallel. This is related to +[#4](https://github.com/Nelson-Gon/pyfdc/issues/4). + * Tests are now written with the `pytest` framework which means that this is now a dependency. We also now use `pytest_cov` for coverage reports. This also introduces yet another dependency. diff --git a/example_notebooks/pyfdc_example.ipynb b/example_notebooks/pyfdc_example.ipynb new file mode 100644 index 0000000..d9fe67c --- /dev/null +++ b/example_notebooks/pyfdc_example.ipynb @@ -0,0 +1,372 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "from pyfdc.pyfdc import FoodDataCentral\n", + "from pyfdc.utils import set_api_key\n", + "import os " + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pyfdc_key is already a valid key\n" + ] + } + ], + "source": [ + "# This is blank because we already have an environment variable set for the key \n", + "set_api_key()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "my_search = FoodDataCentral()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/gonzabat/Desktop/pyfdc/pyfdc/pyfdc.py:112: UserWarning: No target_fields were provided, returning fdc_id, ingredients, and description.\n", + " warn(\"No target_fields were provided, returning fdc_id, ingredients, and description.\")\n" + ] + } + ], + "source": [ + "search = my_search.get_food_info(search_phrase=\"cheese\")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
fdc_idingredientsdescription
01943515MILK, SALT, CULTURES, ENZYMES.CHEESE
12083541PARMESAN AND ROMANO CHEESE. MADE FROM PASTEURI...CHEESE
21856944BELLAVITANO CHEESE (PASTEURIZED MILK, CHEESE C...CHEESE
31867792BELLAVITANO CHEESE (PASTEURIZED MILK, CHEESE C...CHEESE
4500370GRUYERE CHEESE AND CHEESE (PASTEURIZED MILK, C...CHEESE
\n", + "
" + ], + "text/plain": [ + " fdc_id ingredients description\n", + "0 1943515 MILK, SALT, CULTURES, ENZYMES. CHEESE\n", + "1 2083541 PARMESAN AND ROMANO CHEESE. MADE FROM PASTEURI... CHEESE\n", + "2 1856944 BELLAVITANO CHEESE (PASTEURIZED MILK, CHEESE C... CHEESE\n", + "3 1867792 BELLAVITANO CHEESE (PASTEURIZED MILK, CHEESE C... CHEESE\n", + "4 500370 GRUYERE CHEESE AND CHEESE (PASTEURIZED MILK, C... CHEESE" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "search.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
descriptionfdc_id
0CHEESE1943515
1CHEESE2083541
2CHEESE1856944
3CHEESE1867792
\n", + "
" + ], + "text/plain": [ + " description fdc_id\n", + "0 CHEESE 1943515\n", + "1 CHEESE 2083541\n", + "2 CHEESE 1856944\n", + "3 CHEESE 1867792" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "my_search.get_food_info(search_phrase=\"cheese\", target_fields=[\"description\", \"fdc_id\"]).head(4)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/gonzabat/Desktop/pyfdc/pyfdc/pyfdc.py:162: UserWarning: No target_field was provided, returning low level results.\n", + " warn(\"No target_field was provided, returning low level results.\")\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01
0fdcId168977
1descriptionAgutuk, meat-caribou (Alaskan ice cream) (Alas...
2publicationDate4/1/2019
3foodNutrients[{'nutrient': {'id': 2045, 'number': '951', 'n...
4dataTypeSR Legacy
5foodClassFinalFood
6nutrientConversionFactors[{'id': 17492, 'value': 6.25, 'type': '.Protei...
7ndbNumber35003
8isHistoricalReferenceTrue
9foodCategory{'id': 24, 'code': '3500', 'description': 'Ame...
\n", + "
" + ], + "text/plain": [ + " 0 \\\n", + "0 fdcId \n", + "1 description \n", + "2 publicationDate \n", + "3 foodNutrients \n", + "4 dataType \n", + "5 foodClass \n", + "6 nutrientConversionFactors \n", + "7 ndbNumber \n", + "8 isHistoricalReference \n", + "9 foodCategory \n", + "\n", + " 1 \n", + "0 168977 \n", + "1 Agutuk, meat-caribou (Alaskan ice cream) (Alas... \n", + "2 4/1/2019 \n", + "3 [{'nutrient': {'id': 2045, 'number': '951', 'n... \n", + "4 SR Legacy \n", + "5 FinalFood \n", + "6 [{'id': 17492, 'value': 6.25, 'type': '.Protei... \n", + "7 35003 \n", + "8 True \n", + "9 {'id': 24, 'code': '3500', 'description': 'Ame... " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "my_search.get_food_details(168977)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.8.13 ('pyfdc')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "d8711246ff82dafb1ff1fc81455883cb4fae0607334e84927198dc99646575f9" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/pyfdc/pyfdc.py b/pyfdc/pyfdc.py index d4da8ec..b2c5322 100644 --- a/pyfdc/pyfdc.py +++ b/pyfdc/pyfdc.py @@ -8,7 +8,9 @@ from utils import key_signup import os from warnings import warn -import re +import re +from joblib import Parallel, delayed +from functools import partial class FoodDataCentral(object): @@ -25,7 +27,8 @@ class FoodDataCentral(object): def __init__(self, api_key=None): if api_key is None: - self.api_key = os.environ.get("pyfdc_key") if "pyfdc_key" in os.environ else key_signup() + self.api_key = os.environ.get( + "pyfdc_key") if "pyfdc_key" in os.environ else key_signup() else: warn("Providing an api_key is discouraged, please consider using set_api_key.") self.api_key = api_key @@ -47,9 +50,10 @@ def __init__(self, api_key=None): def get_food_info_internal(self, search_phrase=None, ingredients=None, brand_owner=None, - target=None, page_number=None, page_size=50, - sort_field=None, sort_direction='asc'): - + page_number=None, page_size=50, + sort_field=None, sort_direction='asc', + target=None + ): """ :param brand_owner: str Defaults to None :param ingredients: str to limit the search to certain ingredients @@ -77,7 +81,8 @@ def get_food_info_internal(self, search_phrase=None, # https://fdc.nal.usda.gov/api-spec/fdc_api.html#/FDC/postFoodsSearch try: - url_response = requests.get(self.base_url, params=search_query, headers={"User-Agent": "Mozilla-5.0"}) + url_response = requests.get(self.base_url, params=search_query, headers={ + "User-Agent": "Mozilla-5.0"}) url_response.raise_for_status() unprocessed_result = json.loads(url_response.content)["foods"] @@ -85,13 +90,15 @@ def get_food_info_internal(self, search_phrase=None, raise else: + res = [] for x in unprocessed_result: - yield [val for key_id, val in x.items() if key_id == self.available_targets[target]] + res.append([val for key_id, val in x.items() if key_id == self.available_targets[target]]) + return res - def get_food_info(self, search_phrase=None, target_fields=None, + def get_food_info(self, search_phrase=None, ingredients=None, brand_owner=None, page_number=1, page_size=50, - sort_field=None, sort_direction='asc'): + sort_field=None, sort_direction='asc', target_fields=None): """ :param search_phrase: A character string to search for. :param target_fields: A list of targets eg ['fdc_id','description'] @@ -105,30 +112,33 @@ def get_food_info(self, search_phrase=None, target_fields=None, :return: A pandas DataFrame """ # TODO: Avoid two functions when one will do aka drop get_food_info_internal - result = [] # Check that page number is not none and is an int (for now) if target_fields is None: - warn("No target_fields were provided, returning fdc_id, ingredients, and description.") + warn( + "No target_fields were provided, returning fdc_id, ingredients, and description.") target_fields = ["fdc_id", "ingredients", "description"] if not isinstance(target_fields, (list, tuple)): - raise TypeError(f"target should be a list or tuple not {type(target_fields).__name__}") - - for target_key in target_fields: - if target_key not in self.available_targets.keys(): - raise KeyError(f"target_key should be one of {self.available_targets.keys()} not {target_key}") - result.append(list(self.get_food_info_internal(search_phrase=search_phrase, target=target_key, - ingredients=ingredients, - brand_owner=brand_owner, - page_number=page_number, - page_size=page_size, - sort_field=sort_field, - sort_direction=sort_direction))) + raise TypeError( + f"target should be a list or tuple not {type(target_fields).__name__}") + + try: + result = Parallel(os.cpu_count() - 1, backend="loky")(delayed( + self.get_food_info_internal)(search_phrase, + ingredients, + brand_owner, + page_number, + page_size, + sort_field, + sort_direction, target) for target in target_fields) + except KeyError: + raise KeyError( + f"target_key should be one of {self.available_targets.keys()}") return DataFrame(list(map(lambda x: list(chain.from_iterable(x)), result)), index=target_fields).transpose() - def get_food_details(self, fdc_id=None, target_field=None, result_format="full",nutrients=None): + def get_food_details(self, fdc_id=None, target_field=None, result_format="full", nutrients=None): """ Accesses the FoodDetails EndPoint :param fdc_id: A FoodDataCentral Food ID @@ -136,18 +146,19 @@ def get_food_details(self, fdc_id=None, target_field=None, result_format="full", a low level result will be returned :return: A DataFrame object with the desired results. """ - + try: # base_url = f"https://api.nal.usda.gov/fdc/v1/{fdc_id}?api_key={self.api_key}" # Replace in base url so we have only for a specific FDC ID. assert fdc_id is not None, "fdc_id should not be None" - assert isinstance(fdc_id, int), f"fdc_id should be an int not {type(fdc_id).__name__}" + assert isinstance( + fdc_id, int), f"fdc_id should be an int not {type(fdc_id).__name__}" base_url = self.base_url.replace("foods/search", f"food/{fdc_id}") - base_url = base_url + "&format=" + result_format - # print(base_url) + base_url = base_url + "&format=" + result_format if nutrients: base_url = base_url + "&nutrients=" + ",".join(nutrients) - url_response = requests.get(base_url, headers={"User-Agent": "Mozilla-5.0"}) + url_response = requests.get( + base_url, headers={"User-Agent": "Mozilla-5.0"}) url_response.raise_for_status() result = url_response.json() @@ -165,19 +176,20 @@ def get_food_details(self, fdc_id=None, target_field=None, result_format="full", else: # if len(target_field) > 1: - # warn("More than one target field was requested, returning only the first") - + # warn("More than one target field was requested, returning only the first") + if target_field == "nutrients": result = json_normalize(result["foodNutrients"]) return result if target_field == "label_nutrients": if not "labelNutrients" in result.keys(): - raise KeyError(f"FDC ID {fdc_id} has no label nutrients.") - label_nutrients_df = json_normalize(result["labelNutrients"]) - label_nutrients_df.columns = [re.sub(".value", "", x) for x in label_nutrients_df] - return label_nutrients_df + raise KeyError( + f"FDC ID {fdc_id} has no label nutrients.") + label_nutrients_df = json_normalize( + result["labelNutrients"]) + label_nutrients_df.columns = [ + re.sub(".value", "", x) for x in label_nutrients_df] + return label_nutrients_df else: return result[target_field] - - diff --git a/requirements.txt b/requirements.txt index 83d8968..ab55880 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,5 @@ m2r==0.2.1 coverage==5.3.1 pytest>=7.0.1 pytest-cov>=3.0.0 +joblib==1.2.0 +autopep8==1.7.0 From 2ca46a1c0dfa1302b8f3d80866b23d75446ef888 Mon Sep 17 00:00:00 2001 From: NelsonGon Date: Tue, 11 Oct 2022 20:22:45 +0200 Subject: [PATCH 2/2] Use less strict joblib requirement to fix issues Joblib 1.2.0 is not available on GH actions. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ab55880..ad77057 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,5 +13,5 @@ m2r==0.2.1 coverage==5.3.1 pytest>=7.0.1 pytest-cov>=3.0.0 -joblib==1.2.0 +joblib>=1.1.0 autopep8==1.7.0