Skip to content

Commit

Permalink
Merge pull request #73 from BAMresearch/72-requested-changes-for-the-…
Browse files Browse the repository at this point in the history
…metadata-extraction-script

Changes at the mixture metadata extraction script
  • Loading branch information
eriktamsen authored Nov 17, 2022
2 parents 76f8c51 + 4939b8c commit fff0be0
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 42 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ dmypy.json

# results of minimum working example
usecases/MinimumWorkingExample/emodul/*
usecases/MinimumWorkingExample/mixture/*

# graphviz
*.gv
Expand Down
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ dependencies:
- python-graphviz
- pytest
- pip
- xlrd
- openpyxl
- pip:
- probeye==2.3.2
- -e .
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,27 @@
# there already
# 3.e Define a function to ignore empty annotations.
# 3.f Extract the metadata for each label which is existing.
# 3.g Depending on wheter an output-path is given or not, return the metadata
# 3.g Depending on whether an output-path is given or not, return the metadata
# dictionary or create a yaml-file.

#------------------------------------------------------------------------------

from cmath import nan
import pandas as pd
# removing 'SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame'
pd.options.mode.chained_assignment = None # default='warn'

import os
import yaml
from loguru import logger
from pathlib import Path


# Set up logger
baseDir = Path(__file__).parents[0]
logPath = os.path.join(baseDir, "logs","file_{time}.log")
#logger.add(logPath, level="DEBUG") # this also displays the log in the console
logger.configure(handlers=[{"sink": logPath, "level": "DEBUG"}])

# function to convert german formatting to english
def replace_comma(string, format = 'float'):
Expand All @@ -48,7 +58,9 @@ def replace_comma(string, format = 'float'):
def isNaN(num):
    """Return True if *num* is NaN.

    Relies on the IEEE-754 property that NaN is the only value that does
    not compare equal to itself; any ordinary value compares equal and
    yields False.
    """
    return not (num == num)


# decorator in case you want to catch errors so that the script won't break
# but just pass without producing output:
# @logger.catch
# extraction script
def extract_metadata_mixture(
locationOfRawData,
Expand All @@ -67,7 +79,8 @@ def extract_metadata_mixture(
Path of the excelsheet (xls or xlsx) containing the metadata in one
or multiple "Rezeptur"-Sheet(s).
locationOfProcessedData : string
Path of the target folder for yaml-file.
Path of the target folder for yaml-file (optional, give only if you
want a yaml-file to be generated).
Output
-------
Expand All @@ -78,21 +91,22 @@ def extract_metadata_mixture(
"""


# Find sheets in the file containing the mixture (keyword: "Rezeptur")
# Find sheets in the file containing the mixture (keyword: "Rezeptur"), allow
# only one sheet per file
excelsheet = os.path.basename(locationOfRawData)
excelfile = pd.read_excel(locationOfRawData, sheet_name= None)
listofkeys = [i for i in excelfile.keys() if 'Rezeptur' in i]
logger.debug('Working on file: '+ excelsheet)
logger.debug('Following sheets contain mixture metadata in this file: ' + str(listofkeys))


for sheet in listofkeys:
logger.debug('Following sheet(s) contain mixture metadata in this file: ' + str(listofkeys))

logger.debug('Working on sheet: '+ sheet)
############## S E T U P ##############
if len(listofkeys) != 1:
logger.error('None or multiple sheets with mixture found in the raw data.')
raise Exception('None or multiple sheets with mixture found in the raw data.')
else:
sheet = listofkeys[0]

# name of yaml-file will be experiment-name + sheet name
name = os.path.basename(excelsheet).split('.xl')[0] + ' ___ ' + sheet
# name of yaml-file will be experiment-name
name = os.path.basename(excelsheet).split('.xl')[0]

# save data from excelsheet into pandas dataframe
exceltodf = excelfile[sheet]
Expand All @@ -107,17 +121,20 @@ def extract_metadata_mixture(
labelcolumn = exceltodf.iloc[:,0] # select first column (containing labels)
for i in range(len(labelcolumn)):
labelcolumn[i] = str(labelcolumn[i]).strip() # remove whitespace

# fill dictionary with labels and corresponding indices, unless the
# label is "addition". Then differ between 1st and 2nd addition
if labelcolumn[i] != 'Zusatzstoff':
labelidx[labelcolumn[i]] = i
elif labelcolumn[i] == 'Zusatzstoff' and 'Zusatzstoff1' not in labelidx.keys():
labelidx['Zusatzstoff1'] = i
elif labelcolumn[i] == 'Zusatzstoff' and 'Zusatzstoff1' in labelidx.keys():
elif labelcolumn[i] == 'Zusatzstoff' and 'Zusatzstoff1' in labelidx.keys() \
and 'Zusatzstoff2' not in labelidx.keys():
labelidx['Zusatzstoff2'] = i
logger.debug("Second addition found in raw data.")
logger.debug('Second addition found in raw data.')
else:
logger.warning('More than 2 additions/Zusatzstoffe found!')
logger.error('More than two additions found in raw data.')
raise Exception('More than two additions found in raw data.')


# Check for missing labels; the following labels should exist (except
Expand All @@ -127,8 +144,11 @@ def extract_metadata_mixture(
'Zusatzmittel', 'Zuschlag (gesamt)']
missing_labels = [i for i in default_labels if i not in labelidx.keys()]
if len(missing_labels) != 0:
logger.warning('Check raw data, there are labels missing:')
logger.warning(missing_labels)
if missing_labels == ['Zusatzstoff2']:
logger.warning('No addition2 in raw data.')
else:
logger.error('Check raw data, there are labels missing: ' + str(missing_labels))
raise KeyError('Check raw data, there are labels missing', missing_labels)


# Some files don't have the type of addition/Zusatzstoff only labeled
Expand Down Expand Up @@ -169,7 +189,7 @@ def no_empty_annotation(name):

# name of specimen
idx = labelidx['Bezeichnung der Proben:']
metadata['specimen_name'] = exceltodf.iloc[idx,3] #4
metadata['specimen_name'] = exceltodf.iloc[idx,3]

#----------------------------------------------------------------------

Expand All @@ -185,7 +205,7 @@ def no_empty_annotation(name):
metadata['cement--Volume'] = float(replace_comma(str(exceltodf.iat[idx,6])))
no_empty_annotation('cement')
else:
logger.warning('cement not included in yaml-file')
logger.error('cement not included in yaml-file')

# total water data ('Wasser (gesamt)')
if 'Wasser (gesamt)' not in missing_labels:
Expand All @@ -195,15 +215,15 @@ def no_empty_annotation(name):
metadata['water_total--Volume'] = float(replace_comma(str(exceltodf.iat[idx,6])))
no_empty_annotation('water_total')
else:
logger.warning('water_total not included in yaml-file')
logger.error('water_total not included in yaml-file')


# water cement ratio ('Wasserzementwert')
if 'Zement' not in missing_labels and 'Wasser (gesamt)' not in missing_labels:
metadata['water_cement_ratio'] = float(metadata['water_total--QuantityInMix']
/ metadata['cement--QuantityInMix'])
else:
logger.warning('water_cement_ratio not included in yaml-file')
logger.error('water_cement_ratio not included in yaml-file')


# effective water data ('Wasser (wirksam)')
Expand All @@ -214,7 +234,7 @@ def no_empty_annotation(name):
metadata['water_effective--Volume'] = replace_comma(str(exceltodf.iat[idx,6]))
no_empty_annotation('water_effective')
else:
logger.warning('water_effective not included in yaml-file')
logger.error('water_effective not included in yaml-file')


# air content data ('Luftgehalt')
Expand All @@ -225,7 +245,7 @@ def no_empty_annotation(name):
metadata['air_content--Volume'] = float(replace_comma(str(exceltodf.iat[idx,6])))
no_empty_annotation('air_content')
else:
logger.warning('air_content not included in yaml-file')
logger.error('air_content not included in yaml-file')


# Addition data ('Zusatzstoff') 1
Expand All @@ -236,7 +256,7 @@ def no_empty_annotation(name):
metadata['addition1--Volume'] = float(replace_comma(str(exceltodf.iat[idx,6])))
no_empty_annotation('addition1')
else:
logger.warning('addition1 not included in yaml-file')
logger.error('addition1 not included in yaml-file')


# Addition data ('Zusatzstoff') 2
Expand All @@ -258,7 +278,7 @@ def no_empty_annotation(name):
metadata['admixture--Volume'] = float(replace_comma(str(exceltodf.iat[idx,6])))
no_empty_annotation('admixture')
else:
logger.warning('admixture not included in yaml-file')
logger.error('admixture not included in yaml-file')


# Aggregate ('Zuschlag (gesamt)')
Expand All @@ -269,7 +289,7 @@ def no_empty_annotation(name):
metadata['aggregate--Volume'] = float(replace_comma(str(exceltodf.iat[idx,6])))
no_empty_annotation('aggregate')
else:
logger.warning('aggregate not included in yaml-file')
logger.error('aggregate not included in yaml-file')



Expand All @@ -279,4 +299,5 @@ def no_empty_annotation(name):
else:
with open(os.path.join(locationOfProcessedData, name + '.yaml'), mode='w') as yamlFile:
yaml.dump(metadata, yamlFile, sort_keys=False, allow_unicode=True)



80 changes: 65 additions & 15 deletions usecases/MinimumWorkingExample/dodo.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,49 +11,99 @@
from lebedigital.raw_data_processing.youngs_modulus_data \
.emodul_generate_processed_data import processed_data_from_rawdata

from lebedigital.raw_data_processing.mixture \
.mixture_metadata_extraction import extract_metadata_mixture

from doit import get_var

# set a variable to define a cheap or full run
# the default "doit" is set to "doit mode=cheap"
# "cheap" option is to reduce computation time once the workflow gets more expensive (calibration)
# - currently this means: all mixes, one tests data + KG
# "single" option is to test the dodo file on a single example (similar to cheap but only a single mix)
# any other mode value runs the expensive version i.e. "doit mode=full"
config = {"mode": get_var('mode', 'cheap')}

# when "cheap option" or "single" is run, only this souce of raw data is processed
single_example_name = 'Wolf 8.2 Probe 1'
# TODO: (if we want to keep using a single example) automatic identification of corresponding mix
single_mix_name = '2014_12_10 Wolf.xls' # corresponding mix for the "single" example


DOIT_CONFIG = {'verbosity': 2}

#parent directory of the minimum working example
ParentDir = os.path.dirname(Path(__file__))

# defining paths
# EMODULE PATHS
# defining paths for emodule
emodul_output_directory = Path(ParentDir, 'emodul') # folder with metadata yaml files
raw_data_emodulus_directory = Path(ParentDir, 'Data', 'E-modul') # folder with folders of raw data files
metadata_emodulus_directory = Path(emodul_output_directory, 'metadata_yaml_files') # folder with metadata yaml files
processed_data_emodulus_directory = Path(emodul_output_directory, 'processed_data') # folder with csv data files
knowledge_graphs_directory = Path(emodul_output_directory, 'knowledge_graphs') # folder with KG ttl files

# when "cheap option" is run, only this souce of raw data is processed
cheap_example_name = 'Wolf 8.2 Probe 1'

# create folder, if it is not there
Path(emodul_output_directory).mkdir(parents=True, exist_ok=True)

# MIXTURE PATHS
# defining paths for mixture
raw_data_mixture_directory = Path(ParentDir, 'Data', 'Mischungen') # folder with raw data files (excel)
mixture_output_directory = Path(ParentDir, 'mixture') # folder with folders
metadata_mixture_directory = Path(mixture_output_directory, 'metadata_yaml_files') # folder with mixture metadata yaml files
mixture_knowledge_graphs_directory = Path(mixture_output_directory, 'knowledge_graphs') # folder with KG ttl files

# List with mixes with problems, to be excluded for now
excluded_mix_list = ['2014_08_04 Rezepturen_auf 85 Liter_Werner_Losert.xlsx']

# create folder, if it is not there
Path(mixture_output_directory).mkdir(parents=True, exist_ok=True)


# TASKS
# extract metadata for the mixture
def task_extract_metadata_mixture():
    """doit task generator: extract mixture metadata from raw excel files.

    Yields one doit sub-task per raw mixture file found in
    ``raw_data_mixture_directory`` (or only the single file named by
    ``single_mix_name`` when running in 'single' mode). Each sub-task calls
    ``extract_metadata_mixture`` to produce a yaml file in
    ``metadata_mixture_directory``. Files listed in ``excluded_mix_list``
    are skipped.
    """
    # create folder, if it is not there
    Path(metadata_mixture_directory).mkdir(parents=True, exist_ok=True)

    # setting for fast test, defining the list
    if config['mode'] == 'single':
        # single mode: process only the one hard-coded mixture file
        list_raw_data_mixture_files = [Path(raw_data_mixture_directory, single_mix_name)]

    else: # make a list of all files
        # NOTE(review): os.scandir yields DirEntry objects while the single-mode
        # branch yields Path objects; both support .is_file() and .name, which is
        # all the loop below uses.
        list_raw_data_mixture_files = os.scandir(raw_data_mixture_directory)

    for f in list_raw_data_mixture_files:
        if f.is_file():
            raw_data_path = Path(f)
            # target yaml is named after the excel file with its .xls/.xlsx
            # extension stripped
            yaml_metadata_file = Path(metadata_mixture_directory, f.name.split(".xls")[0] + '.yaml')

            # skip known-problematic mixture files
            if f.name not in excluded_mix_list:
                yield {
                    'name': f.name,
                    'actions': [(extract_metadata_mixture, [raw_data_path, metadata_mixture_directory])],
                    'targets': [yaml_metadata_file],
                    'clean': [clean_targets]
                }

#extract standardized meta data for Young' modulus tests
def task_extract_metadata_emodul():
# create folder, if it is not there
Path(metadata_emodulus_directory).mkdir(parents=True, exist_ok=True)

# setting for fast test, defining the list
if config['mode'] == 'cheap':
list_raw_data_emodulus_directories = [ Path(raw_data_emodulus_directory, cheap_example_name) ]
if config['mode'] == 'cheap' or config['mode'] == 'single':
list_raw_data_emodulus_directories = [ Path(raw_data_emodulus_directory, single_example_name) ]
else: # go through all files
list_raw_data_emodulus_directories = os.scandir(raw_data_emodulus_directory)

for f in list_raw_data_emodulus_directories:
if f.is_dir():
raw_data_path = Path(f)
raw_data_file = Path(f, 'specimen.dat')
yaml_metadata_file = Path(metadata_emodulus_directory, f.name + '.yaml')
yield {
'name': yaml_metadata_file,
'name': f.name,
'actions': [(emodul_metadata, [raw_data_path, yaml_metadata_file])],
'file_dep': [raw_data_file],
'targets': [yaml_metadata_file],
Expand All @@ -66,8 +116,8 @@ def task_extract_processed_data_emodul():
Path(processed_data_emodulus_directory).mkdir(parents=True, exist_ok=True)

# setting for fast test, defining the list
if config['mode'] == 'cheap':
list_raw_data_emodulus_directories = [ Path(raw_data_emodulus_directory, cheap_example_name) ]
if config['mode'] == 'cheap' or config['mode'] == 'single':
list_raw_data_emodulus_directories = [ Path(raw_data_emodulus_directory, single_example_name) ]
else: # go through all files
list_raw_data_emodulus_directories = os.scandir(raw_data_emodulus_directory)

Expand All @@ -79,7 +129,7 @@ def task_extract_processed_data_emodul():
csv_data_file = Path(processed_data_emodulus_directory, f.name + '.csv')

yield {
'name': csv_data_file,
'name': f.name,
'actions': [(processed_data_from_rawdata, [f, csv_data_file])],
'file_dep': [raw_data_file],
'targets': [csv_data_file],
Expand All @@ -94,8 +144,8 @@ def task_export_knowledgeGraph_emodul():
Path(knowledge_graphs_directory).mkdir(parents=True, exist_ok=True)

# setting for fast test, defining the list
if config['mode'] == 'cheap':
list_metadata_yaml_files = [ Path(metadata_emodulus_directory, cheap_example_name + '.yaml') ]
if config['mode'] == 'cheap' or config['mode'] == 'single':
list_metadata_yaml_files = [ Path(metadata_emodulus_directory, single_example_name + '.yaml') ]
else: # go through all files
# list of all meta data files....
list_metadata_yaml_files = os.scandir(metadata_emodulus_directory)
Expand All @@ -114,10 +164,10 @@ def task_export_knowledgeGraph_emodul():
knowledge_graph_file = Path(knowledge_graphs_directory, name_of_ttl)

yield{
'name': knowledge_graph_file,
'name': name_of_cvs,
'actions': [(generate_knowledge_graph, [metadata_file_path,
knowledge_graph_file])],
'file_dep': [metadata_file_path, processed_data_file_path],
'targets': [knowledge_graph_file],
'clean': [clean_targets]
}
}

0 comments on commit fff0be0

Please sign in to comment.