diff --git a/lebedigital/mapping/mixture_mapping.py b/lebedigital/mapping/mixture_mapping.py new file mode 100644 index 000000000..6283c9d36 --- /dev/null +++ b/lebedigital/mapping/mixture_mapping.py @@ -0,0 +1,187 @@ +# Script for the e-module ontology (extracted from CPTO) to map e-module metadata +# by reading every line of that ontology and finding/ # replacing the placeholders. +# Logging through loguru, you can ignore "debug" messages. "Warning" appear if not +# everything has been mapped. + +# import libraries +import yaml +import os +from pathlib import Path +from loguru import logger +import uuid + + +def load_metadata(dataPath): + ''' + Load metadata from a given path and return it as dictionary. + dataPath : string + Path to the metadata yaml-file. + + ''' + + with open(dataPath, 'r') as file: + try: + metadata = yaml.safe_load(file) + return metadata + except Exception as e: + logger.error("Path error: " + str(e)) + + +def generate_placeholder(key): + ''' + Generates a placeholder (str) in the format $$key_Value$$ for a given key. + This function should allow to easily change the structure of the placeholder + given in the ontology without having to rewrite the function placeholderreplacement. + Just change the structure here. + ''' + + placeholder = '$$' + str(key) + '_Value$$' + return placeholder + + +def placeholderreplacement( + ontoPath, + metadataPath, + outputPath=None + ): + ''' + Maps the values of one given metadata file (for one specimen or + experiment) to a given ontology, by searching within ontology linewise + for all metadata keys and replacing placeholders with values from the + metadata. Also creates and appends an ID for the specimen. + + Parameter: + ----- + ontoPath : string + complete path to Ontology (ttl-format) + metadataPath : string + complete path to metadata (yaml-format) + outputPath : string + complete path for output + + Output: + --- + If no ouput path is given (f.e. for unittesting), the lines will be + returned. 
If the "ontoPath" is given for output, the ontology will + be overwritten. To avoid this, give a new name to create a new ttl-file. + + ''' + + # load metadata and get the keys + metadata = load_metadata(metadataPath) + keys = list(metadata.keys()) + + # generate ID for the e-module metadata + specimenID = str(uuid.uuid4()) + + # read in the ontology as text linewise, creating a list of lines + with open(ontoPath, 'r') as file: + lines = file.readlines() + + # Set up logger + logger.debug('S T A R T') + logger.debug('Loaded ttl-File has ' + str(len(lines)) + ' lines.') + usedKeys = [] # to count keys that found a placeholder + ontoPHcounter = [] # to count all placeholders + remainingPH = [] # to count the placeholders that recieved no data + + # iterating through the list of lines + for i in range(len(lines)): + + # create a list of placeholders + if '_Value$$' in lines[i]: + ph = lines[i].split("$$")[1] + ontoPHcounter.append(ph) + + # iterate through list of metadata-keys + for key in keys: + + placeholder = generate_placeholder(key) + + # if placeholder is in line, replace it with metadata + if placeholder in lines[i]: + logger.debug('Found placeholder "' + placeholder + '" for key "' \ + + key + '" with value "' + str(metadata[key]) + '".') + lines[i] = lines[i].replace(placeholder, str(metadata[key])) + usedKeys.append(key) + + # append the specimen-ID name to "key"_ , works for most keys, except + # some keys below + key_ = key + "_ " + if key_ in lines[i]: + lines[i] = lines[i].replace(key_, key + "_" + str(specimenID) + " ") + + + # append the specimen-ID name to the exceptions + if "_," in lines[i]: + #logger.debug('Appended specimen-ID in line ' + str(i + 1) \ + # + ' to ' + str(lines[i].split("_,")[0] + "_,") + '".') + lines[i] = lines[i].replace("_,", "_" + str(specimenID) + ",") + if "_ " in lines[i]: + #logger.debug('Appended specimen-ID in line ' + str(i + 1) \ + # + ' to ' + str(lines[i].split("_ ")[0] + "_ ") + '".') + lines[i] = 
lines[i].replace("_ ", "_" + str(specimenID) + " ") + + + # ID-key is not given by metadata but created in this script, so map it now: + if generate_placeholder("SpecimenID") in lines[i]: + logger.debug('Found placeholder "' + generate_placeholder("SpecimenID")+ '".') + lines[i] = lines[i].replace(generate_placeholder("SpecimenID"), str(specimenID)) + + + + ############################ L O G G I N G ############################# + + # create a list of leftover placeholders to see which ones didn't recieve a value + if '_Value$$' in lines[i]: + ph = lines[i].split("$$")[1] + remainingPH.append(ph) + + # for metadata + unusedKeys = [i for i in keys if i not in usedKeys] + if len(unusedKeys) > 0: + logger.warning('Mapped only ' + str(len(usedKeys)) + ' keys to the ontology.') + logger.warning('The following ' + str(len(unusedKeys)) + ' of ' + str(len(keys)) \ + + ' metadata keys have not been mapped: ') + logger.warning(unusedKeys) + else: + logger.debug('All ' + str(len(usedKeys)) + ' metadata keys have been mapped.') + + # for placeholders + if len(remainingPH) > 0: + logger.warning('File has ' + str(len(ontoPHcounter)) + ' placeholders.') + logger.warning('The following ' + str(len(remainingPH)) + ' of ' + str(len(ontoPHcounter)) \ + + ' placeholders did not recieve a metadata value: ') + logger.warning(remainingPH) + else: + logger.debug('All ' + str(len(ontoPHcounter)) + ' placeholders within the ontology revieced metadata.') + + ############################ O U T P U T ############################# + if outputPath == None: + return lines + + else: + # saving the list again to the file + with open(outputPath, 'w') as file: + for line in lines: + file.write(line) + + + +# T E M P O R A R Y !!! +# For my personal testing, will be removed later. Will cause test-failures +# because of my own local testing data that doesn't exist on your PC. 
+ +# defining paths : ONTOLOGY +ontoDir = Path(__file__).parents[2] +ontoFile = "../lebedigital/ConcreteOntology/MixtureDesignOntology.ttl" +ontoPath = os.path.join(ontoDir, ontoFile) + +# defining paths : METADATA +dataDir = Path(__file__).parents[2] +dataFile = "../lebedigital/mapping/testMixtureMetadata.yaml" +dataPath = os.path.join(dataDir, dataFile) + +# creating mapped ttl +mappedOntoName = os.path.join(Path(__file__).parents[0], 'MixMappedExmpl.ttl') +placeholderreplacement(ontoPath, dataPath, mappedOntoName) \ No newline at end of file diff --git a/lebedigital/raw_data_processing/mixture/mixdesign_metadata_extraction.py b/lebedigital/raw_data_processing/mixture/mixdesign_metadata_extraction.py new file mode 100644 index 000000000..ece9ea1d5 --- /dev/null +++ b/lebedigital/raw_data_processing/mixture/mixdesign_metadata_extraction.py @@ -0,0 +1,199 @@ +# Script for metadata-extraction for mixes with CEM I and CEM II. Output yaml +# should work with the MixDesign ontology for mapping. + + +#------------------------------------------------------------------------------ + +from cmath import nan +import pandas as pd +# removing 'SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame' +pd.options.mode.chained_assignment = None # default='warn' + +import os +import yaml +from loguru import logger +from pathlib import Path + + +# Set up logger +baseDir = Path(__file__).parents[0] +logPath = os.path.join(baseDir, "logs","file_{time}.log") +#logger.add(logPath, level="DEBUG") # this also displays the log in the console +logger.configure(handlers=[{"sink": logPath, "level": "DEBUG"}]) + + + +# function to convert german formatting to english +def replace_comma(string, format = 'float'): + if '---' in string: + string = nan # maybe None? 
But this will cause errors when float(string) + return string + elif format == 'float': + string = string.replace(',', '.') + return float(string) + else: + string = string.replace(',', '.') + return string + + + +# function to check for nan-values independently of the format (str/float) +def isNaN(num): + return num!= num + + + +# decorater in case you want to catch errors so that the script won't break +# but just pass without output: +# @logger.catch + +# extraction script +def extract_metadata_mixdesign( + locationOfRawData, + locationOfProcessedData = None + ): + + """ + Extracts the metadata from all "Rezeptur"-sheets of a given datafile + (xls or xlsx). Creates one yaml-file per sheet containing the keyword + "Rezeptur". + + + Parameter + --------- + locationOfRawData : string + Path of the excelsheet (xls or xlsx) containing the metadata in one + or multiple "Rezeptur"-Sheet(s). + locationOfProcessedData : string + Path of the target folder for yaml-file (optional, give only if you + want a yaml-file to be generated). + + Output + ------- + If no ouput path (locationOfProcessedData) is given (f.e. for unittesting), + the dict containing the metadata will be returned. Otherwise a yaml file + will be created. 
+ + """ + + + # Find sheets in the file containing the mixture (keyword: "Rezeptur"), allow + # only one sheet per file + excelsheet = os.path.basename(locationOfRawData) + excelfile = pd.read_excel(locationOfRawData, sheet_name= None) + listofkeys = [i for i in excelfile.keys() if 'Rezeptur' in i] + logger.debug('Working on file: '+ excelsheet) + logger.debug('Following sheet(s) contain mixture metadata in this file: ' + str(listofkeys)) + + if len(listofkeys) != 1: + logger.error('None or multiple sheets with mixture found in the raw data.') + raise Exception('None or multiple sheets with mixture found in the raw data.') + else: + sheet = listofkeys[0] + + # name of yaml-file will be experiment-name + name = os.path.basename(excelsheet).split('.xl')[0] + + # save data from excelsheet into pandas dataframe + exceltodf = excelfile[sheet] + + # create empty dictionary for metadata + metadata = {} + + # the layout of the excel table can vary, the indices of labels are not + # always the same; that's why: find now the indices of the labels and + # store it in a dictionary + labelidx = {} + labelcolumn = exceltodf.iloc[:,0] # select first column (containing labels) + for i in range(len(labelcolumn)): + labelcolumn[i] = str(labelcolumn[i]).strip() # remove whitespace + labelidx[labelcolumn[i]] = i + + # Check for missing labels; the following labels should exist (except + # Zusatzstoff 2, not all raw files have two additions/Zusatzstoffe) + default_labels = ['Bezeichnung der Proben:', 'Zement', 'Wasser (gesamt)', + 'Luftgehalt', 'Zusatzmittel', 'Zuschlag (gesamt)'] + missing_labels = [i for i in default_labels if i not in labelidx.keys()] + if len(missing_labels) != 0: + logger.error('Check raw data, there are labels missing: ' + str(missing_labels)) + raise KeyError('Check raw data, there are labels missing', missing_labels) + + + ############### E X T R A C T I O N ############# + + # get raw data file name + metadata['RawDataFile'] = locationOfRawData + + # get date 
and time (always the same position) + metadata['MixingDate'] = str(exceltodf.columns[9])[:10] + + # lab location - hardcoded + metadata['Lab'] = "BAM" + + #---------------------------------------------------------------------- + + # Extraction of the columns 'Stoffmenge' (QuantityInMix), 'Dichte bzw. + # Rohdichte' (Density). + + # Cement data ('Zement') + if 'Zement' not in missing_labels: + idx = labelidx['Zement'] + metadata['CEMIQtyInMix'] = float(replace_comma(str(exceltodf.iat[idx,2]))) + metadata['CEMIDensity'] = float(replace_comma(str(exceltodf.iat[idx,4]))) + else: + logger.error('cement not included in yaml-file') + + + # total water data ('Wasser (gesamt)') + if 'Wasser (gesamt)' not in missing_labels: + idx = labelidx['Wasser (gesamt)'] + metadata['MixingWaterQtyInMix'] = float(replace_comma(str(exceltodf.iat[idx,2]))) + metadata['WaterDensity'] = float(replace_comma(str(exceltodf.iat[idx,4]))) + else: + logger.error('Water not included in yaml-file') + + + # water cement ratio ('Wasserzementwert') + if 'Zement' not in missing_labels and 'Wasser (gesamt)' not in missing_labels: + metadata['WaterCementRatio'] = float(metadata['MixingWaterQtyInMix'] + / metadata['CEMIQtyInMix']) + else: + logger.error('WaterCementRatio not included in yaml-file') + + + # air content ('Luftgehalt') + if 'Luftgehalt' not in missing_labels: + idx = labelidx['Luftgehalt'] + metadata['AirContent'] = float(0) # Quantity + metadata['AirDensity'] = float(0) + else: + logger.error('AirContent not included in yaml-file') + + + # Admixture/Plasticizer ('Zusatzmittel') + if 'Zusatzmittel' not in missing_labels: + idx = labelidx['Zusatzmittel'] + metadata['PlasticizerQtyInMix'] = float(replace_comma(str(exceltodf.iat[idx,2]))) + metadata['PlasticizerDensity'] = float(replace_comma(str(exceltodf.iat[idx,4]))) + else: + logger.error('Plasticizer/Admixture not included in yaml-file') + + + # Aggregate ('Zuschlag (gesamt)') + if 'Zuschlag (gesamt)' not in missing_labels: + idx = 
labelidx['Zuschlag (gesamt)'] + metadata['OkrillaQtyInMix'] = float(replace_comma(str(exceltodf.iat[idx,2]))) + metadata['OkrillaDensity'] = float(replace_comma(str(exceltodf.iat[idx,4]))) + else: + logger.error('Okrilla/aggregate not included in yaml-file') + + + + ############################ O U T P U T ############################# + if locationOfProcessedData == None: + return metadata + else: + with open(os.path.join(locationOfProcessedData, name + '.yaml'), mode='w') as yamlFile: + yaml.dump(metadata, yamlFile, sort_keys=False, allow_unicode=True) + + diff --git a/lebedigital/raw_data_processing/mixture/readme.md b/lebedigital/raw_data_processing/mixture/readme.md new file mode 100644 index 000000000..ae34337d7 --- /dev/null +++ b/lebedigital/raw_data_processing/mixture/readme.md @@ -0,0 +1,36 @@ +There are two scripts for extracting mixture metadata: + +# The first script: mixture_metadata_extraction.py + +This script was the first one and is designed for the raw data we have ("..\usecases\MinimumWorkingExample\Data\Mischungen"). +This raw data contains only one type of cement and up to two additions. We don't use the resulting metadata-yaml-file at the +moment for any ontology. + +# The second script: mixdesign_metadata_extraction.py + +This script is made to make the existing raw data fit the MixDesign ontology, so that we can produce some +data for the Minimum Working Example. The MixDesign ontology is refering to different types of cement, as +stated in this [paper](https://www.sciencedirect.com/science/article/pii/S0008884608000884). + +Main differences to the raw data that we have: +1. There are placeholders for Cem I and Cem II, tho our raw data only has one type of cement. +2. No placeholder(s) for addition ('Zusatzstoff'). +3. Aggregate is always of type "Okrilla". 
The script attempts to extract the following information from the raw data:
- Raw data file name ("$$RawDataFile_Value$$"^^xsd:string)
- Water cement ratio ("$$WaterCementRatio_Value$$"^^xsd:decimal), calculated in the script
- Mixing Date ("$$MixingDate_Value$$"^^xsd:dateTimeStamp)
- Location of the lab ("$$Lab_Value$$"^^xsd:string)
- Cement 1: Quantity and density ("$$CEMIQtyInMix_Value$$"^^xsd:decimal, "$$CEMIDensity_Value$$"^^xsd:decimal)
- Water: Quantity and density ("$$MixingWaterQtyInMix_Value$$"^^xsd:decimal, "$$WaterDensity_Value$$"^^xsd:decimal)
- Aggregates: Quantity and density ("$$OkrillaQtyInMix_Value$$"^^xsd:decimal, "$$OkrillaDensity_Value$$"^^xsd:decimal)
- Admixture: Quantity and density ("$$PlasticizerQtyInMix_Value$$"^^xsd:decimal, "$$PlasticizerDensity_Value$$"^^xsd:decimal)
- AirContent: Quantity and density ("$$AirContent_Value$$"^^xsd:decimal, "$$AirDensity_Value$$"^^xsd:decimal)

Please note:
- The following information does not exist in the raw data:
Cement 2: Quantity and density ("$$CEMIIQtyInMix_Value$$"^^xsd:decimal, "$$CEMIIDensity_Value$$"^^xsd:decimal)
- For calculating the water-cement ratio, Cem I is used.
- 'OkrillaDensity' ('Zuschlag (gesamt)') has no value in most raw data files; instead the volume is given.
- 'AirContent' has no quantity or density value in most raw files; only the volume is given.