Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

136 mixturedesign mapping #139

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 187 additions & 0 deletions lebedigital/mapping/mixture_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
# Script for the e-module ontology (extracted from CPTO) to map e-module metadata
# by reading every line of that ontology and finding/replacing the placeholders.
# Logging through loguru; you can ignore "debug" messages. "Warning" messages
# appear if not everything has been mapped.

# import libraries
import yaml
import os
from pathlib import Path
from loguru import logger
import uuid


def load_metadata(dataPath):
    '''
    Read a yaml metadata file and return its content as a dictionary.

    dataPath : string
        Path to the metadata yaml-file.

    Returns None (after logging the error) if the file cannot be parsed.
    A missing file still raises, since open() is outside the try-block.
    '''

    with open(dataPath, 'r') as file:
        try:
            return yaml.safe_load(file)
        except Exception as e:
            logger.error("Path error: " + str(e))


def generate_placeholder(key):
    '''
    Return the placeholder string "$$<key>_Value$$" for a given key.

    Centralising the placeholder format here means the structure used in
    the ontology can be changed in one place without touching
    placeholderreplacement().
    '''

    return f'$${key}_Value$$'


def placeholderreplacement(
        ontoPath,
        metadataPath,
        outputPath=None
):
    '''
    Maps the values of one given metadata file (for one specimen or
    experiment) to a given ontology, by searching within the ontology
    line by line for all metadata keys and replacing placeholders of the
    form "$$<key>_Value$$" with values from the metadata. Also creates
    a random UUID for the specimen and appends it to entity names.

    Parameter:
    -----
    ontoPath : string
        complete path to Ontology (ttl-format)
    metadataPath : string
        complete path to metadata (yaml-format)
    outputPath : string
        complete path for output

    Output:
    ---
    If no output path is given (f.e. for unittesting), the list of
    modified lines will be returned. If the "ontoPath" is given for
    output, the ontology will be overwritten. To avoid this, give a new
    name to create a new ttl-file.

    '''

    # load metadata and get the keys
    metadata = load_metadata(metadataPath)
    keys = list(metadata.keys())

    # generate a random ID (UUID4) for this specimen's e-module metadata
    specimenID = str(uuid.uuid4())

    # read in the ontology as text linewise, creating a list of lines
    with open(ontoPath, 'r') as file:
        lines = file.readlines()

    # Set up logger and bookkeeping lists for the final mapping report
    logger.debug('S T A R T')
    logger.debug('Loaded ttl-File has ' + str(len(lines)) + ' lines.')
    usedKeys = [] # to count keys that found a placeholder
    ontoPHcounter = [] # to count all placeholders
    remainingPH = [] # to count the placeholders that received no data

    # iterating through the list of lines
    for i in range(len(lines)):

        # record every placeholder present BEFORE any replacement, so the
        # logging section can compare against what is left afterwards
        if '_Value$$' in lines[i]:
            ph = lines[i].split("$$")[1]
            ontoPHcounter.append(ph)

        # iterate through list of metadata-keys
        for key in keys:

            placeholder = generate_placeholder(key)

            # if placeholder is in line, replace it with metadata
            if placeholder in lines[i]:
                logger.debug('Found placeholder "' + placeholder + '" for key "' \
                    + key + '" with value "' + str(metadata[key]) + '".')
                lines[i] = lines[i].replace(placeholder, str(metadata[key]))
                usedKeys.append(key)

            # append the specimen-ID name to "key"_ , works for most keys, except
            # some keys below
            key_ = key + "_ "
            if key_ in lines[i]:
                lines[i] = lines[i].replace(key_, key + "_" + str(specimenID) + " ")


        # append the specimen-ID name to the exceptions
        # NOTE(review): these replace EVERY "_," / "_ " occurrence left in
        # the line, not only entity names — confirm the ontology contains
        # no other tokens ending in "_" followed by "," or " ".
        if "_," in lines[i]:
            #logger.debug('Appended specimen-ID in line ' + str(i + 1) \
            #    + ' to ' + str(lines[i].split("_,")[0] + "_,") + '".')
            lines[i] = lines[i].replace("_,", "_" + str(specimenID) + ",")
        if "_ " in lines[i]:
            #logger.debug('Appended specimen-ID in line ' + str(i + 1) \
            #    + ' to ' + str(lines[i].split("_ ")[0] + "_ ") + '".')
            lines[i] = lines[i].replace("_ ", "_" + str(specimenID) + " ")


        # ID-key is not given by metadata but created in this script, so map it now:
        if generate_placeholder("SpecimenID") in lines[i]:
            logger.debug('Found placeholder "' + generate_placeholder("SpecimenID")+ '".')
            lines[i] = lines[i].replace(generate_placeholder("SpecimenID"), str(specimenID))



        ############################ L O G G I N G #############################

        # create a list of leftover placeholders to see which ones didn't receive a value
        if '_Value$$' in lines[i]:
            ph = lines[i].split("$$")[1]
            remainingPH.append(ph)

    # for metadata: warn about keys that never matched a placeholder
    unusedKeys = [i for i in keys if i not in usedKeys]
    if len(unusedKeys) > 0:
        logger.warning('Mapped only ' + str(len(usedKeys)) + ' keys to the ontology.')
        logger.warning('The following ' + str(len(unusedKeys)) + ' of ' + str(len(keys)) \
            + ' metadata keys have not been mapped: ')
        logger.warning(unusedKeys)
    else:
        logger.debug('All ' + str(len(usedKeys)) + ' metadata keys have been mapped.')

    # for placeholders: warn about placeholders that never got a value
    if len(remainingPH) > 0:
        logger.warning('File has ' + str(len(ontoPHcounter)) + ' placeholders.')
        logger.warning('The following ' + str(len(remainingPH)) + ' of ' + str(len(ontoPHcounter)) \
            + ' placeholders did not recieve a metadata value: ')
        logger.warning(remainingPH)
    else:
        logger.debug('All ' + str(len(ontoPHcounter)) + ' placeholders within the ontology revieced metadata.')

    ############################ O U T P U T #############################
    if outputPath == None:
        return lines

    else:
        # saving the list again to the file
        with open(outputPath, 'w') as file:
            for line in lines:
                file.write(line)



# T E M P O R A R Y !!!
# Local example/demo driver using the author's local testing data.
# Guarded by __name__ so that merely importing this module (e.g. from unit
# tests or other mapping code) no longer executes the mapping against
# files that may not exist on other machines.

if __name__ == "__main__":
    # defining paths : ONTOLOGY
    ontoDir = Path(__file__).parents[2]
    ontoFile = "../lebedigital/ConcreteOntology/MixtureDesignOntology.ttl"
    ontoPath = os.path.join(ontoDir, ontoFile)

    # defining paths : METADATA
    dataDir = Path(__file__).parents[2]
    dataFile = "../lebedigital/mapping/testMixtureMetadata.yaml"
    dataPath = os.path.join(dataDir, dataFile)

    # creating mapped ttl next to this script
    mappedOntoName = os.path.join(Path(__file__).parents[0], 'MixMappedExmpl.ttl')
    placeholderreplacement(ontoPath, dataPath, mappedOntoName)
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
# Script for metadata-extraction for mixes with CEM I and CEM II. Output yaml
# should work with the MixDesign ontology for mapping.


#------------------------------------------------------------------------------

from cmath import nan
import pandas as pd
# removing 'SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame'
pd.options.mode.chained_assignment = None # default='warn'

import os
import yaml
from loguru import logger
from pathlib import Path


# Set up logger: write DEBUG-and-above messages to a timestamped file in a
# "logs" folder next to this script. logger.configure() replaces loguru's
# default stderr sink, so log output goes to the file only.
baseDir = Path(__file__).parents[0]
logPath = os.path.join(baseDir, "logs","file_{time}.log")
#logger.add(logPath, level="DEBUG") # this also displays the log in the console
logger.configure(handlers=[{"sink": logPath, "level": "DEBUG"}])



# function to convert german formatting to english
def replace_comma(string, format = 'float'):
    '''
    Convert a German-formatted number string ("1,23") to English format.

    A string containing '---' marks a missing value in the raw data and is
    returned as nan. Otherwise the decimal comma becomes a dot and the
    result is returned as float (default) or, for format='str', as string.
    '''
    if '---' in string:
        return nan  # missing-value marker in the raw data
    converted = string.replace(',', '.')
    return float(converted) if format == 'float' else converted



# function to check for nan-values independently of the format (str/float)
def isNaN(num):
    '''
    Return True if num is NaN, independently of its type (str/float).

    NaN is the only value that compares unequal to itself, so this works
    without math.isnan(), which would raise on non-numeric input.
    '''
    return not (num == num)



# decorater in case you want to catch errors so that the script won't break
# but just pass without output:
# @logger.catch

# extraction script
def extract_metadata_mixdesign(
        locationOfRawData,
        locationOfProcessedData = None
):

    """
    Extracts the metadata from all "Rezeptur"-sheets of a given datafile
    (xls or xlsx). Creates one yaml-file per sheet containing the keyword
    "Rezeptur".


    Parameter
    ---------
    locationOfRawData : string
        Path of the excelsheet (xls or xlsx) containing the metadata in one
        or multiple "Rezeptur"-Sheet(s).
    locationOfProcessedData : string
        Path of the target folder for yaml-file (optional, give only if you
        want a yaml-file to be generated).

    Output
    -------
    If no output path (locationOfProcessedData) is given (f.e. for unittesting),
    the dict containing the metadata will be returned. Otherwise a yaml file
    will be created.

    Raises
    ------
    Exception if not exactly one "Rezeptur"-sheet is found;
    KeyError if any of the expected row labels is missing.
    """


    # Find sheets in the file containing the mixture (keyword: "Rezeptur"), allow
    # only one sheet per file
    excelsheet = os.path.basename(locationOfRawData)
    excelfile = pd.read_excel(locationOfRawData, sheet_name= None)
    listofkeys = [i for i in excelfile.keys() if 'Rezeptur' in i]
    logger.debug('Working on file: '+ excelsheet)
    logger.debug('Following sheet(s) contain mixture metadata in this file: ' + str(listofkeys))

    if len(listofkeys) != 1:
        logger.error('None or multiple sheets with mixture found in the raw data.')
        raise Exception('None or multiple sheets with mixture found in the raw data.')
    else:
        sheet = listofkeys[0]

    # name of yaml-file will be experiment-name (file name without extension)
    name = os.path.basename(excelsheet).split('.xl')[0]

    # save data from excelsheet into pandas dataframe
    exceltodf = excelfile[sheet]

    # create empty dictionary for metadata
    metadata = {}

    # the layout of the excel table can vary, the indices of labels are not
    # always the same; that's why: find now the indices of the labels and
    # store it in a dictionary
    # NOTE(review): this writes the stripped labels back into a slice of the
    # dataframe — presumably why chained_assignment warnings are suppressed
    # at module level; a plain list would avoid the mutation. Verify.
    labelidx = {}
    labelcolumn = exceltodf.iloc[:,0] # select first column (containing labels)
    for i in range(len(labelcolumn)):
        labelcolumn[i] = str(labelcolumn[i]).strip() # remove whitespace
        labelidx[labelcolumn[i]] = i

    # Check for missing labels; the following labels should exist (except
    # Zusatzstoff 2, not all raw files have two additions/Zusatzstoffe)
    default_labels = ['Bezeichnung der Proben:', 'Zement', 'Wasser (gesamt)',
                      'Luftgehalt', 'Zusatzmittel', 'Zuschlag (gesamt)']
    missing_labels = [i for i in default_labels if i not in labelidx.keys()]
    if len(missing_labels) != 0:
        logger.error('Check raw data, there are labels missing: ' + str(missing_labels))
        raise KeyError('Check raw data, there are labels missing', missing_labels)


    ############### E X T R A C T I O N #############
    # NOTE: a missing default label already raised above, so the
    # "not in missing_labels" checks below are always True for the default
    # labels; they only guard against a future relaxation of that check.

    # get raw data file name
    metadata['RawDataFile'] = locationOfRawData

    # get date and time (always the same position)
    # NOTE(review): assumes the mixing date always sits in header column 9
    # (10th column) of the sheet — confirm for all raw-data layouts.
    metadata['MixingDate'] = str(exceltodf.columns[9])[:10]

    # lab location - hardcoded
    metadata['Lab'] = "BAM"

    #----------------------------------------------------------------------

    # Extraction of the columns 'Stoffmenge' (QuantityInMix), 'Dichte bzw.
    # Rohdichte' (Density). Column index 2 appears to hold the quantity and
    # index 4 the density — TODO confirm against the sheet layout.

    # Cement data ('Zement')
    if 'Zement' not in missing_labels:
        idx = labelidx['Zement']
        metadata['CEMIQtyInMix'] = float(replace_comma(str(exceltodf.iat[idx,2])))
        metadata['CEMIDensity'] = float(replace_comma(str(exceltodf.iat[idx,4])))
    else:
        logger.error('cement not included in yaml-file')


    # total water data ('Wasser (gesamt)')
    if 'Wasser (gesamt)' not in missing_labels:
        idx = labelidx['Wasser (gesamt)']
        metadata['MixingWaterQtyInMix'] = float(replace_comma(str(exceltodf.iat[idx,2])))
        metadata['WaterDensity'] = float(replace_comma(str(exceltodf.iat[idx,4])))
    else:
        logger.error('Water not included in yaml-file')


    # water cement ratio ('Wasserzementwert'), derived from the two values above
    if 'Zement' not in missing_labels and 'Wasser (gesamt)' not in missing_labels:
        metadata['WaterCementRatio'] = float(metadata['MixingWaterQtyInMix']
                                             / metadata['CEMIQtyInMix'])
    else:
        logger.error('WaterCementRatio not included in yaml-file')


    # air content ('Luftgehalt')
    if 'Luftgehalt' not in missing_labels:
        idx = labelidx['Luftgehalt']
        # NOTE(review): quantity and density are hard-coded to 0 instead of
        # being read from the sheet (idx is unused here) — confirm intended.
        metadata['AirContent'] = float(0) # Quantity
        metadata['AirDensity'] = float(0)
    else:
        logger.error('AirContent not included in yaml-file')


    # Admixture/Plasticizer ('Zusatzmittel')
    if 'Zusatzmittel' not in missing_labels:
        idx = labelidx['Zusatzmittel']
        metadata['PlasticizerQtyInMix'] = float(replace_comma(str(exceltodf.iat[idx,2])))
        metadata['PlasticizerDensity'] = float(replace_comma(str(exceltodf.iat[idx,4])))
    else:
        logger.error('Plasticizer/Admixture not included in yaml-file')


    # Aggregate ('Zuschlag (gesamt)')
    if 'Zuschlag (gesamt)' not in missing_labels:
        idx = labelidx['Zuschlag (gesamt)']
        metadata['OkrillaQtyInMix'] = float(replace_comma(str(exceltodf.iat[idx,2])))
        metadata['OkrillaDensity'] = float(replace_comma(str(exceltodf.iat[idx,4])))
    else:
        logger.error('Okrilla/aggregate not included in yaml-file')



    ############################ O U T P U T #############################
    if locationOfProcessedData == None:
        return metadata
    else:
        with open(os.path.join(locationOfProcessedData, name + '.yaml'), mode='w') as yamlFile:
            yaml.dump(metadata, yamlFile, sort_keys=False, allow_unicode=True)


Loading