Skip to content

Commit

Permalink
Merge pull request #73 from BAMresearch/72-requested-changes-for-the-…
Browse files Browse the repository at this point in the history
…metadata-extraction-script

Changes at the mixture metadata extraction script
  • Loading branch information
eriktamsen authored Nov 17, 2022
2 parents 76f8c51 + 4939b8c commit fff0be0
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 42 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ dmypy.json

# results of minimum working example
usecases/MinimumWorkingExample/emodul/*
usecases/MinimumWorkingExample/mixture/*

# graphviz
*.gv
Expand Down
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ dependencies:
- python-graphviz
- pytest
- pip
- xlrd
- openpyxl
- pip:
- probeye==2.3.2
- -e .
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,27 @@
# there already
# 3.e Define a function to ignore empty annotations.
# 3.f Extract the metadata for each label which is existing.
# 3.g Depending on wheter an output-path is given or not, return the metadata
# 3.g Depending on whether an output-path is given or not, return the metadata
# dictionary or create a yaml-file.

#------------------------------------------------------------------------------

from cmath import nan
import pandas as pd
# removing 'SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame'
pd.options.mode.chained_assignment = None # default='warn'

import os
import yaml
from loguru import logger
from pathlib import Path


# Set up logger
baseDir = Path(__file__).parents[0]
logPath = os.path.join(baseDir, "logs","file_{time}.log")
#logger.add(logPath, level="DEBUG") # this also displays the log in the console
logger.configure(handlers=[{"sink": logPath, "level": "DEBUG"}])

# function to convert german formatting to english
def replace_comma(string, format = 'float'):
Expand All @@ -48,7 +58,9 @@ def replace_comma(string, format = 'float'):
def isNaN(num):
    """Return True if *num* is NaN.

    Relies on the IEEE-754 property that NaN is the only value that does
    not compare equal to itself; any ordinary value compares equal and
    yields False.
    """
    return not (num == num)


# decorator in case you want to catch errors so that the script won't break
# but just pass without producing output:
# @logger.catch
# extraction script
def extract_metadata_mixture(
locationOfRawData,
Expand All @@ -67,7 +79,8 @@ def extract_metadata_mixture(
Path of the excelsheet (xls or xlsx) containing the metadata in one
or multiple "Rezeptur"-Sheet(s).
locationOfProcessedData : string
Path of the target folder for yaml-file.
Path of the target folder for yaml-file (optional, give only if you
want a yaml-file to be generated).
Output
-------
Expand All @@ -78,21 +91,22 @@ def extract_metadata_mixture(
"""


# Find sheets in the file containing the mixture (keyword: "Rezeptur")
# Find sheets in the file containing the mixture (keyword: "Rezeptur"), allow
# only one sheet per file
excelsheet = os.path.basename(locationOfRawData)
excelfile = pd.read_excel(locationOfRawData, sheet_name= None)
listofkeys = [i for i in excelfile.keys() if 'Rezeptur' in i]
logger.debug('Working on file: '+ excelsheet)
logger.debug('Following sheets contain mixture metadata in this file: ' + str(listofkeys))


for sheet in listofkeys:
logger.debug('Following sheet(s) contain mixture metadata in this file: ' + str(listofkeys))

logger.debug('Working on sheet: '+ sheet)
############## S E T U P ##############
if len(listofkeys) != 1:
logger.error('None or multiple sheets with mixture found in the raw data.')
raise Exception('None or multiple sheets with mixture found in the raw data.')
else:
sheet = listofkeys[0]

# name of yaml-file will be experiment-name + sheet name
name = os.path.basename(excelsheet).split('.xl')[0] + ' ___ ' + sheet
# name of yaml-file will be experiment-name
name = os.path.basename(excelsheet).split('.xl')[0]

# save data from excelsheet into pandas dataframe
exceltodf = excelfile[sheet]
Expand All @@ -107,17 +121,20 @@ def extract_metadata_mixture(
labelcolumn = exceltodf.iloc[:,0] # select first column (containing labels)
for i in range(len(labelcolumn)):
labelcolumn[i] = str(labelcolumn[i]).strip() # remove whitespace

# fill dictionary with labels and corresponding indices, unless the
# label is "addition". Then differ between 1st and 2nd addition
if labelcolumn[i] != 'Zusatzstoff':
labelidx[labelcolumn[i]] = i
elif labelcolumn[i] == 'Zusatzstoff' and 'Zusatzstoff1' not in labelidx.keys():
labelidx['Zusatzstoff1'] = i
elif labelcolumn[i] == 'Zusatzstoff' and 'Zusatzstoff1' in labelidx.keys():
elif labelcolumn[i] == 'Zusatzstoff' and 'Zusatzstoff1' in labelidx.keys() \
and 'Zusatzstoff2' not in labelidx.keys():
labelidx['Zusatzstoff2'] = i
logger.debug("Second addition found in raw data.")
logger.debug('Second addition found in raw data.')
else:
logger.warning('More than 2 additions/Zusatzstoffe found!')
logger.error('More than two additions found in raw data.')
raise Exception('More than two additions found in raw data.')


# Check for missing labels; the following labels should exist (except
Expand All @@ -127,8 +144,11 @@ def extract_metadata_mixture(
'Zusatzmittel', 'Zuschlag (gesamt)']
missing_labels = [i for i in default_labels if i not in labelidx.keys()]
if len(missing_labels) != 0:
logger.warning('Check raw data, there are labels missing:')
logger.warning(missing_labels)
if missing_labels == ['Zusatzstoff2']:
logger.warning('No addition2 in raw data.')
else:
logger.error('Check raw data, there are labels missing: ' + str(missing_labels))
raise KeyError('Check raw data, there are labels missing', missing_labels)


# Some files don't have the type of addition/Zusatzstoff only labeled
Expand Down Expand Up @@ -169,7 +189,7 @@ def no_empty_annotation(name):

# name of specimen
idx = labelidx['Bezeichnung der Proben:']
metadata['specimen_name'] = exceltodf.iloc[idx,3] #4
metadata['specimen_name'] = exceltodf.iloc[idx,3]

#----------------------------------------------------------------------

Expand All @@ -185,7 +205,7 @@ def no_empty_annotation(name):
metadata['cement--Volume'] = float(replace_comma(str(exceltodf.iat[idx,6])))
no_empty_annotation('cement')
else:
logger.warning('cement not included in yaml-file')
logger.error('cement not included in yaml-file')

# total water data ('Wasser (gesamt)')
if 'Wasser (gesamt)' not in missing_labels:
Expand All @@ -195,15 +215,15 @@ def no_empty_annotation(name):
metadata['water_total--Volume'] = float(replace_comma(str(exceltodf.iat[idx,6])))
no_empty_annotation('water_total')
else:
logger.warning('water_total not included in yaml-file')
logger.error('water_total not included in yaml-file')


# water cement ratio ('Wasserzementwert')
if 'Zement' not in missing_labels and 'Wasser (gesamt)' not in missing_labels:
metadata['water_cement_ratio'] = float(metadata['water_total--QuantityInMix']
/ metadata['cement--QuantityInMix'])
else:
logger.warning('water_cement_ratio not included in yaml-file')
logger.error('water_cement_ratio not included in yaml-file')


# effective water data ('Wasser (wirksam)')
Expand All @@ -214,7 +234,7 @@ def no_empty_annotation(name):
metadata['water_effective--Volume'] = replace_comma(str(exceltodf.iat[idx,6]))
no_empty_annotation('water_effective')
else:
logger.warning('water_effective not included in yaml-file')
logger.error('water_effective not included in yaml-file')


# air content data ('Luftgehalt')
Expand All @@ -225,7 +245,7 @@ def no_empty_annotation(name):
metadata['air_content--Volume'] = float(replace_comma(str(exceltodf.iat[idx,6])))
no_empty_annotation('air_content')
else:
logger.warning('air_content not included in yaml-file')
logger.error('air_content not included in yaml-file')


# Addition data ('Zusatzstoff') 1
Expand All @@ -236,7 +256,7 @@ def no_empty_annotation(name):
metadata['addition1--Volume'] = float(replace_comma(str(exceltodf.iat[idx,6])))
no_empty_annotation('addition1')
else:
logger.warning('addition1 not included in yaml-file')
logger.error('addition1 not included in yaml-file')


# Addition data ('Zusatzstoff') 2
Expand All @@ -258,7 +278,7 @@ def no_empty_annotation(name):
metadata['admixture--Volume'] = float(replace_comma(str(exceltodf.iat[idx,6])))
no_empty_annotation('admixture')
else:
logger.warning('admixture not included in yaml-file')
logger.error('admixture not included in yaml-file')


# Aggregate ('Zuschlag (gesamt)')
Expand All @@ -269,7 +289,7 @@ def no_empty_annotation(name):
metadata['aggregate--Volume'] = float(replace_comma(str(exceltodf.iat[idx,6])))
no_empty_annotation('aggregate')
else:
logger.warning('aggregate not included in yaml-file')
logger.error('aggregate not included in yaml-file')



Expand All @@ -279,4 +299,5 @@ def no_empty_annotation(name):
else:
with open(os.path.join(locationOfProcessedData, name + '.yaml'), mode='w') as yamlFile:
yaml.dump(metadata, yamlFile, sort_keys=False, allow_unicode=True)



80 changes: 65 additions & 15 deletions usecases/MinimumWorkingExample/dodo.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,49 +11,99 @@
from lebedigital.raw_data_processing.youngs_modulus_data \
.emodul_generate_processed_data import processed_data_from_rawdata

from lebedigital.raw_data_processing.mixture \
.mixture_metadata_extraction import extract_metadata_mixture

from doit import get_var

# set a variable to define a cheap or full run
# the default "doit" is set to "doit mode=cheap"
# "cheap" option is to reduce computation time once the workflow gets more expensive (calibration)
# - currently this means: all mixes, one tests data + KG
# "single" option is to test the dodo file on a single example (similar to cheap but only a single mix)
# any other mode value runs the expensive version i.e. "doit mode=full"
config = {"mode": get_var('mode', 'cheap')}

# when "cheap option" or "single" is run, only this souce of raw data is processed
single_example_name = 'Wolf 8.2 Probe 1'
# TODO: (if we want to keep using a single example) automatic identification of corresponding mix
single_mix_name = '2014_12_10 Wolf.xls' # corresponding mix for the "single" example


DOIT_CONFIG = {'verbosity': 2}

#parent directory of the minimum working example
ParentDir = os.path.dirname(Path(__file__))

# defining paths
# EMODULE PATHS
# defining paths for emodule
emodul_output_directory = Path(ParentDir, 'emodul') # folder with metadata yaml files
raw_data_emodulus_directory = Path(ParentDir, 'Data', 'E-modul') # folder with folders of raw data files
metadata_emodulus_directory = Path(emodul_output_directory, 'metadata_yaml_files') # folder with metadata yaml files
processed_data_emodulus_directory = Path(emodul_output_directory, 'processed_data') # folder with csv data files
knowledge_graphs_directory = Path(emodul_output_directory, 'knowledge_graphs') # folder with KG ttl files

# when "cheap option" is run, only this souce of raw data is processed
cheap_example_name = 'Wolf 8.2 Probe 1'

# create folder, if it is not there
Path(emodul_output_directory).mkdir(parents=True, exist_ok=True)

# MIXTURE PATHS
# defining paths for mixture
raw_data_mixture_directory = Path(ParentDir, 'Data', 'Mischungen') # folder with raw data files (excel)
mixture_output_directory = Path(ParentDir, 'mixture') # folder with folders
metadata_mixture_directory = Path(mixture_output_directory, 'metadata_yaml_files') # folder with mixture metadata yaml files
mixture_knowledge_graphs_directory = Path(mixture_output_directory, 'knowledge_graphs') # folder with KG ttl files

# List with mixes with problems, to be excluded for now
excluded_mix_list = ['2014_08_04 Rezepturen_auf 85 Liter_Werner_Losert.xlsx']

# create folder, if it is not there
Path(mixture_output_directory).mkdir(parents=True, exist_ok=True)


# TASKS
# extract metadata for the mixture
def task_extract_metadata_mixture():
    """doit task generator: extract mixture metadata from raw excel files.

    Yields one doit sub-task per raw mixture file found in
    ``raw_data_mixture_directory`` (or only the single file named by
    ``single_mix_name`` when running in 'single' mode). Each sub-task calls
    ``extract_metadata_mixture`` to produce a yaml file in
    ``metadata_mixture_directory``. Files listed in ``excluded_mix_list``
    are skipped.
    """
    # create folder, if it is not there
    Path(metadata_mixture_directory).mkdir(parents=True, exist_ok=True)

    # setting for fast test, defining the list
    if config['mode'] == 'single':
        # single mode: process only the one hard-coded mixture file
        list_raw_data_mixture_files = [Path(raw_data_mixture_directory, single_mix_name)]

    else: # make a list of all files
        # NOTE(review): os.scandir yields DirEntry objects while the single-mode
        # branch yields Path objects; both support .is_file() and .name, which is
        # all the loop below uses.
        list_raw_data_mixture_files = os.scandir(raw_data_mixture_directory)

    for f in list_raw_data_mixture_files:
        if f.is_file():
            raw_data_path = Path(f)
            # target yaml is named after the excel file with its .xls/.xlsx
            # extension stripped
            yaml_metadata_file = Path(metadata_mixture_directory, f.name.split(".xls")[0] + '.yaml')

            # skip known-problematic mixture files
            if f.name not in excluded_mix_list:
                yield {
                    'name': f.name,
                    'actions': [(extract_metadata_mixture, [raw_data_path, metadata_mixture_directory])],
                    'targets': [yaml_metadata_file],
                    'clean': [clean_targets]
                }

#extract standardized meta data for Young' modulus tests
def task_extract_metadata_emodul():
# create folder, if it is not there
Path(metadata_emodulus_directory).mkdir(parents=True, exist_ok=True)

# setting for fast test, defining the list
if config['mode'] == 'cheap':
list_raw_data_emodulus_directories = [ Path(raw_data_emodulus_directory, cheap_example_name) ]
if config['mode'] == 'cheap' or config['mode'] == 'single':
list_raw_data_emodulus_directories = [ Path(raw_data_emodulus_directory, single_example_name) ]
else: # go through all files
list_raw_data_emodulus_directories = os.scandir(raw_data_emodulus_directory)

for f in list_raw_data_emodulus_directories:
if f.is_dir():
raw_data_path = Path(f)
raw_data_file = Path(f, 'specimen.dat')
yaml_metadata_file = Path(metadata_emodulus_directory, f.name + '.yaml')
yield {
'name': yaml_metadata_file,
'name': f.name,
'actions': [(emodul_metadata, [raw_data_path, yaml_metadata_file])],
'file_dep': [raw_data_file],
'targets': [yaml_metadata_file],
Expand All @@ -66,8 +116,8 @@ def task_extract_processed_data_emodul():
Path(processed_data_emodulus_directory).mkdir(parents=True, exist_ok=True)

# setting for fast test, defining the list
if config['mode'] == 'cheap':
list_raw_data_emodulus_directories = [ Path(raw_data_emodulus_directory, cheap_example_name) ]
if config['mode'] == 'cheap' or config['mode'] == 'single':
list_raw_data_emodulus_directories = [ Path(raw_data_emodulus_directory, single_example_name) ]
else: # go through all files
list_raw_data_emodulus_directories = os.scandir(raw_data_emodulus_directory)

Expand All @@ -79,7 +129,7 @@ def task_extract_processed_data_emodul():
csv_data_file = Path(processed_data_emodulus_directory, f.name + '.csv')

yield {
'name': csv_data_file,
'name': f.name,
'actions': [(processed_data_from_rawdata, [f, csv_data_file])],
'file_dep': [raw_data_file],
'targets': [csv_data_file],
Expand All @@ -94,8 +144,8 @@ def task_export_knowledgeGraph_emodul():
Path(knowledge_graphs_directory).mkdir(parents=True, exist_ok=True)

# setting for fast test, defining the list
if config['mode'] == 'cheap':
list_metadata_yaml_files = [ Path(metadata_emodulus_directory, cheap_example_name + '.yaml') ]
if config['mode'] == 'cheap' or config['mode'] == 'single':
list_metadata_yaml_files = [ Path(metadata_emodulus_directory, single_example_name + '.yaml') ]
else: # go through all files
# list of all meta data files....
list_metadata_yaml_files = os.scandir(metadata_emodulus_directory)
Expand All @@ -114,10 +164,10 @@ def task_export_knowledgeGraph_emodul():
knowledge_graph_file = Path(knowledge_graphs_directory, name_of_ttl)

yield{
'name': knowledge_graph_file,
'name': name_of_cvs,
'actions': [(generate_knowledge_graph, [metadata_file_path,
knowledge_graph_file])],
'file_dep': [metadata_file_path, processed_data_file_path],
'targets': [knowledge_graph_file],
'clean': [clean_targets]
}
}

0 comments on commit fff0be0

Please sign in to comment.