Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

136 mixturedesign mapping #139

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 187 additions & 0 deletions lebedigital/mapping/mixture_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
# Script for the e-module ontology (extracted from CPTO) to map e-module metadata
# by reading every line of that ontology and finding/replacing the placeholders.
# Logging through loguru; you can ignore "debug" messages. "Warning" messages
# appear if not everything has been mapped.

# import libraries
import yaml
import os
from pathlib import Path
from loguru import logger
import uuid


def load_metadata(dataPath):
    '''
    Read a yaml metadata file and return its content as a dictionary.

    dataPath : string
        Path to the metadata yaml-file.

    Returns None (after logging the error) if the file cannot be parsed.
    A missing file still raises, since open() is outside the try-block.
    '''

    with open(dataPath, 'r') as file:
        try:
            return yaml.safe_load(file)
        except Exception as e:
            logger.error("Path error: " + str(e))


def generate_placeholder(key):
    '''
    Return the placeholder string "$$<key>_Value$$" for a given key.

    Centralising the placeholder format here means the structure used in
    the ontology can be changed in one place without touching
    placeholderreplacement().
    '''

    return f'$${key}_Value$$'


def placeholderreplacement(
        ontoPath,
        metadataPath,
        outputPath=None
):
    '''
    Maps the values of one given metadata file (for one specimen or
    experiment) to a given ontology, by searching within the ontology
    line by line for all metadata keys and replacing placeholders of the
    form "$$<key>_Value$$" with values from the metadata. Also creates
    a random UUID for the specimen and appends it to entity names.

    Parameter:
    -----
    ontoPath : string
        complete path to Ontology (ttl-format)
    metadataPath : string
        complete path to metadata (yaml-format)
    outputPath : string
        complete path for output

    Output:
    ---
    If no output path is given (f.e. for unittesting), the list of
    modified lines will be returned. If the "ontoPath" is given for
    output, the ontology will be overwritten. To avoid this, give a new
    name to create a new ttl-file.

    '''

    # load metadata and get the keys
    metadata = load_metadata(metadataPath)
    keys = list(metadata.keys())

    # generate a random ID (UUID4) for this specimen's e-module metadata
    specimenID = str(uuid.uuid4())

    # read in the ontology as text linewise, creating a list of lines
    with open(ontoPath, 'r') as file:
        lines = file.readlines()

    # Set up logger and bookkeeping lists for the final mapping report
    logger.debug('S T A R T')
    logger.debug('Loaded ttl-File has ' + str(len(lines)) + ' lines.')
    usedKeys = [] # to count keys that found a placeholder
    ontoPHcounter = [] # to count all placeholders
    remainingPH = [] # to count the placeholders that received no data

    # iterating through the list of lines
    for i in range(len(lines)):

        # record every placeholder present BEFORE any replacement, so the
        # logging section can compare against what is left afterwards
        if '_Value$$' in lines[i]:
            ph = lines[i].split("$$")[1]
            ontoPHcounter.append(ph)

        # iterate through list of metadata-keys
        for key in keys:

            placeholder = generate_placeholder(key)

            # if placeholder is in line, replace it with metadata
            if placeholder in lines[i]:
                logger.debug('Found placeholder "' + placeholder + '" for key "' \
                    + key + '" with value "' + str(metadata[key]) + '".')
                lines[i] = lines[i].replace(placeholder, str(metadata[key]))
                usedKeys.append(key)

            # append the specimen-ID name to "key"_ , works for most keys, except
            # some keys below
            key_ = key + "_ "
            if key_ in lines[i]:
                lines[i] = lines[i].replace(key_, key + "_" + str(specimenID) + " ")


        # append the specimen-ID name to the exceptions
        # NOTE(review): these replace EVERY "_," / "_ " occurrence left in
        # the line, not only entity names — confirm the ontology contains
        # no other tokens ending in "_" followed by "," or " ".
        if "_," in lines[i]:
            #logger.debug('Appended specimen-ID in line ' + str(i + 1) \
            #    + ' to ' + str(lines[i].split("_,")[0] + "_,") + '".')
            lines[i] = lines[i].replace("_,", "_" + str(specimenID) + ",")
        if "_ " in lines[i]:
            #logger.debug('Appended specimen-ID in line ' + str(i + 1) \
            #    + ' to ' + str(lines[i].split("_ ")[0] + "_ ") + '".')
            lines[i] = lines[i].replace("_ ", "_" + str(specimenID) + " ")


        # ID-key is not given by metadata but created in this script, so map it now:
        if generate_placeholder("SpecimenID") in lines[i]:
            logger.debug('Found placeholder "' + generate_placeholder("SpecimenID")+ '".')
            lines[i] = lines[i].replace(generate_placeholder("SpecimenID"), str(specimenID))



        ############################ L O G G I N G #############################

        # create a list of leftover placeholders to see which ones didn't receive a value
        if '_Value$$' in lines[i]:
            ph = lines[i].split("$$")[1]
            remainingPH.append(ph)

    # for metadata: warn about keys that never matched a placeholder
    unusedKeys = [i for i in keys if i not in usedKeys]
    if len(unusedKeys) > 0:
        logger.warning('Mapped only ' + str(len(usedKeys)) + ' keys to the ontology.')
        logger.warning('The following ' + str(len(unusedKeys)) + ' of ' + str(len(keys)) \
            + ' metadata keys have not been mapped: ')
        logger.warning(unusedKeys)
    else:
        logger.debug('All ' + str(len(usedKeys)) + ' metadata keys have been mapped.')

    # for placeholders: warn about placeholders that never got a value
    if len(remainingPH) > 0:
        logger.warning('File has ' + str(len(ontoPHcounter)) + ' placeholders.')
        logger.warning('The following ' + str(len(remainingPH)) + ' of ' + str(len(ontoPHcounter)) \
            + ' placeholders did not recieve a metadata value: ')
        logger.warning(remainingPH)
    else:
        logger.debug('All ' + str(len(ontoPHcounter)) + ' placeholders within the ontology revieced metadata.')

    ############################ O U T P U T #############################
    if outputPath == None:
        return lines

    else:
        # saving the list again to the file
        with open(outputPath, 'w') as file:
            for line in lines:
                file.write(line)



# T E M P O R A R Y !!!
# Local example/demo driver using the author's local testing data.
# Guarded by __name__ so that merely importing this module (e.g. from unit
# tests or other mapping code) no longer executes the mapping against
# files that may not exist on other machines.

if __name__ == "__main__":
    # defining paths : ONTOLOGY
    ontoDir = Path(__file__).parents[2]
    ontoFile = "../lebedigital/ConcreteOntology/MixtureDesignOntology.ttl"
    ontoPath = os.path.join(ontoDir, ontoFile)

    # defining paths : METADATA
    dataDir = Path(__file__).parents[2]
    dataFile = "../lebedigital/mapping/testMixtureMetadata.yaml"
    dataPath = os.path.join(dataDir, dataFile)

    # creating mapped ttl next to this script
    mappedOntoName = os.path.join(Path(__file__).parents[0], 'MixMappedExmpl.ttl')
    placeholderreplacement(ontoPath, dataPath, mappedOntoName)
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
# Script for metadata-extraction for mixes with CEM I and CEM II. Output yaml
# should work with the MixDesign ontology for mapping.


#------------------------------------------------------------------------------

from cmath import nan
import pandas as pd
# removing 'SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame'
pd.options.mode.chained_assignment = None # default='warn'

import os
import yaml
from loguru import logger
from pathlib import Path


# Set up logger: write DEBUG-and-above messages to a timestamped file in a
# "logs" folder next to this script. logger.configure() replaces loguru's
# default stderr sink, so log output goes to the file only.
baseDir = Path(__file__).parents[0]
logPath = os.path.join(baseDir, "logs","file_{time}.log")
#logger.add(logPath, level="DEBUG") # this also displays the log in the console
logger.configure(handlers=[{"sink": logPath, "level": "DEBUG"}])



# function to convert german formatting to english
def replace_comma(string, format = 'float'):
    '''
    Convert a German-formatted number string ("1,23") to English format.

    A string containing '---' marks a missing value in the raw data and is
    returned as nan. Otherwise the decimal comma becomes a dot and the
    result is returned as float (default) or, for format='str', as string.
    '''
    if '---' in string:
        return nan  # missing-value marker in the raw data
    converted = string.replace(',', '.')
    return float(converted) if format == 'float' else converted



# function to check for nan-values independently of the format (str/float)
def isNaN(num):
    '''
    Return True if num is NaN, independently of its type (str/float).

    NaN is the only value that compares unequal to itself, so this works
    without math.isnan(), which would raise on non-numeric input.
    '''
    return not (num == num)



# decorater in case you want to catch errors so that the script won't break
# but just pass without output:
# @logger.catch

# extraction script
def extract_metadata_mixdesign(
        locationOfRawData,
        locationOfProcessedData = None
):

    """
    Extracts the metadata from all "Rezeptur"-sheets of a given datafile
    (xls or xlsx). Creates one yaml-file per sheet containing the keyword
    "Rezeptur".


    Parameter
    ---------
    locationOfRawData : string
        Path of the excelsheet (xls or xlsx) containing the metadata in one
        or multiple "Rezeptur"-Sheet(s).
    locationOfProcessedData : string
        Path of the target folder for yaml-file (optional, give only if you
        want a yaml-file to be generated).

    Output
    -------
    If no output path (locationOfProcessedData) is given (f.e. for unittesting),
    the dict containing the metadata will be returned. Otherwise a yaml file
    will be created.

    Raises
    ------
    Exception if not exactly one "Rezeptur"-sheet is found;
    KeyError if any of the expected row labels is missing.
    """


    # Find sheets in the file containing the mixture (keyword: "Rezeptur"), allow
    # only one sheet per file
    excelsheet = os.path.basename(locationOfRawData)
    excelfile = pd.read_excel(locationOfRawData, sheet_name= None)
    listofkeys = [i for i in excelfile.keys() if 'Rezeptur' in i]
    logger.debug('Working on file: '+ excelsheet)
    logger.debug('Following sheet(s) contain mixture metadata in this file: ' + str(listofkeys))

    if len(listofkeys) != 1:
        logger.error('None or multiple sheets with mixture found in the raw data.')
        raise Exception('None or multiple sheets with mixture found in the raw data.')
    else:
        sheet = listofkeys[0]

    # name of yaml-file will be experiment-name (file name without extension)
    name = os.path.basename(excelsheet).split('.xl')[0]

    # save data from excelsheet into pandas dataframe
    exceltodf = excelfile[sheet]

    # create empty dictionary for metadata
    metadata = {}

    # the layout of the excel table can vary, the indices of labels are not
    # always the same; that's why: find now the indices of the labels and
    # store it in a dictionary
    # NOTE(review): this writes the stripped labels back into a slice of the
    # dataframe — presumably why chained_assignment warnings are suppressed
    # at module level; a plain list would avoid the mutation. Verify.
    labelidx = {}
    labelcolumn = exceltodf.iloc[:,0] # select first column (containing labels)
    for i in range(len(labelcolumn)):
        labelcolumn[i] = str(labelcolumn[i]).strip() # remove whitespace
        labelidx[labelcolumn[i]] = i

    # Check for missing labels; the following labels should exist (except
    # Zusatzstoff 2, not all raw files have two additions/Zusatzstoffe)
    default_labels = ['Bezeichnung der Proben:', 'Zement', 'Wasser (gesamt)',
                      'Luftgehalt', 'Zusatzmittel', 'Zuschlag (gesamt)']
    missing_labels = [i for i in default_labels if i not in labelidx.keys()]
    if len(missing_labels) != 0:
        logger.error('Check raw data, there are labels missing: ' + str(missing_labels))
        raise KeyError('Check raw data, there are labels missing', missing_labels)


    ############### E X T R A C T I O N #############
    # NOTE: a missing default label already raised above, so the
    # "not in missing_labels" checks below are always True for the default
    # labels; they only guard against a future relaxation of that check.

    # get raw data file name
    metadata['RawDataFile'] = locationOfRawData

    # get date and time (always the same position)
    # NOTE(review): assumes the mixing date always sits in header column 9
    # (10th column) of the sheet — confirm for all raw-data layouts.
    metadata['MixingDate'] = str(exceltodf.columns[9])[:10]

    # lab location - hardcoded
    metadata['Lab'] = "BAM"

    #----------------------------------------------------------------------

    # Extraction of the columns 'Stoffmenge' (QuantityInMix), 'Dichte bzw.
    # Rohdichte' (Density). Column index 2 appears to hold the quantity and
    # index 4 the density — TODO confirm against the sheet layout.

    # Cement data ('Zement')
    if 'Zement' not in missing_labels:
        idx = labelidx['Zement']
        metadata['CEMIQtyInMix'] = float(replace_comma(str(exceltodf.iat[idx,2])))
        metadata['CEMIDensity'] = float(replace_comma(str(exceltodf.iat[idx,4])))
    else:
        logger.error('cement not included in yaml-file')


    # total water data ('Wasser (gesamt)')
    if 'Wasser (gesamt)' not in missing_labels:
        idx = labelidx['Wasser (gesamt)']
        metadata['MixingWaterQtyInMix'] = float(replace_comma(str(exceltodf.iat[idx,2])))
        metadata['WaterDensity'] = float(replace_comma(str(exceltodf.iat[idx,4])))
    else:
        logger.error('Water not included in yaml-file')


    # water cement ratio ('Wasserzementwert'), derived from the two values above
    if 'Zement' not in missing_labels and 'Wasser (gesamt)' not in missing_labels:
        metadata['WaterCementRatio'] = float(metadata['MixingWaterQtyInMix']
                                             / metadata['CEMIQtyInMix'])
    else:
        logger.error('WaterCementRatio not included in yaml-file')


    # air content ('Luftgehalt')
    if 'Luftgehalt' not in missing_labels:
        idx = labelidx['Luftgehalt']
        # NOTE(review): quantity and density are hard-coded to 0 instead of
        # being read from the sheet (idx is unused here) — confirm intended.
        metadata['AirContent'] = float(0) # Quantity
        metadata['AirDensity'] = float(0)
    else:
        logger.error('AirContent not included in yaml-file')


    # Admixture/Plasticizer ('Zusatzmittel')
    if 'Zusatzmittel' not in missing_labels:
        idx = labelidx['Zusatzmittel']
        metadata['PlasticizerQtyInMix'] = float(replace_comma(str(exceltodf.iat[idx,2])))
        metadata['PlasticizerDensity'] = float(replace_comma(str(exceltodf.iat[idx,4])))
    else:
        logger.error('Plasticizer/Admixture not included in yaml-file')


    # Aggregate ('Zuschlag (gesamt)')
    if 'Zuschlag (gesamt)' not in missing_labels:
        idx = labelidx['Zuschlag (gesamt)']
        metadata['OkrillaQtyInMix'] = float(replace_comma(str(exceltodf.iat[idx,2])))
        metadata['OkrillaDensity'] = float(replace_comma(str(exceltodf.iat[idx,4])))
    else:
        logger.error('Okrilla/aggregate not included in yaml-file')



    ############################ O U T P U T #############################
    if locationOfProcessedData == None:
        return metadata
    else:
        with open(os.path.join(locationOfProcessedData, name + '.yaml'), mode='w') as yamlFile:
            yaml.dump(metadata, yamlFile, sort_keys=False, allow_unicode=True)


Loading