Skip to content

Commit

Permalink
Merge pull request #75 from joefutrelle/ml_analyzed_2023
Browse files Browse the repository at this point in the history
new ml_analyzed algorithm
  • Loading branch information
joefutrelle authored Sep 18, 2023
2 parents e7ecbd9 + 0ad0b36 commit db00491
Showing 1 changed file with 90 additions and 67 deletions.
157 changes: 90 additions & 67 deletions ifcb/metrics/ml_analyzed.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,10 @@
import numpy as np
import os
import pandas as pd

import scipy.stats as stats
import numpy as np

from ifcb.data.adc import SCHEMA_VERSION_1, SCHEMA_VERSION_2


def read_ml_analyzed(path):
"""read from the legacy matlab files"""
from scipy.io import loadmat
mat = loadmat(path, squeeze_me=True)
# ignore variables other than the following
cols = ['filelist_all', 'looktime', 'minproctime', 'ml_analyzed', 'runtime']
# convert to dataframe
df = pd.DataFrame({ c: mat[c] for c in cols }, columns=cols)
df.index = df.pop('filelist_all') # index by bin LID
return df
FLOW_RATE = 0.25 # milliliters per minute for syringe pump

def compute_ml_analyzed_s1_adc(adc, min_proc_time=0.073):
"""compute ml_analyzed for an old instrument"""
Expand Down Expand Up @@ -51,64 +40,98 @@ def compute_ml_analyzed_s1_adc(adc, min_proc_time=0.073):
ml_analyzed = look_time * FLOW_RATE
return ml_analyzed, look_time, run_time

def compute_ml_analyzed_s1(abin, min_proc_time=0.073):
return compute_ml_analyzed_s1_adc(abin.adc, min_proc_time=min_proc_time)

def compute_ml_analyzed_s2_adc(abin):
"""compute ml_analyzed for a new instrument, based on ADC file"""
FLOW_RATE = 0.25 # ml/minute
s = abin.schema
adc = abin.adc
def ma(row):
run_time = row[s.RUN_TIME]
inhibit_time = row[s.INHIBIT_TIME]
look_time = run_time - inhibit_time
ml_analyzed = FLOW_RATE * (look_time / 60.)
return ml_analyzed, look_time, run_time
last_row = adc.iloc[-1]
ml_analyzed, look_time, run_time = ma(last_row)
if ml_analyzed <= 0 or abs(last_row[s.RUN_TIME] - last_row[s.ADC_TIME]) >= 0.3:
row = adc.iloc[-2]
ml_analyzed, look_time, run_time = ma(row)
if ml_analyzed <= 0:
row = adc.iloc[-2]
run_time = row[s.ADC_TIME]
nz = adc[s.RUN_TIME].to_numpy().nonzero()[0]
mode_inhibit_time = stats.mode(np.diff(adc[s.INHIBIT_TIME].iloc[nz]))[0][0]
last_good_inhibit_time = adc[s.INHIBIT_TIME].iloc[nz[-1]]
inhibit_time = last_good_inhibit_time + (len(adc) - len(nz)) * mode_inhibit_time
look_time = run_time - inhibit_time
ml_analyzed = FLOW_RATE * (look_time / 60)
return ml_analyzed, look_time, run_time
def compute_ml_analyzed_s2_adc(adc):
"""
This function returns the estimate of sample volume analyzed (in milliliters),
assuming a standard IFCB configuration with the sample syringe operating at 0.25 mL per minute.
It applies only to IFCB instruments after 007 and higher (except 008).
"""

def compute_ml_analyzed_s2_hdr(abin):
FLOW_RATE = 0.25 # ml/minute
# ml analyzed is (run time - inhibit time) * flow rate
run_time = abin.header('runTime')
inhibit_time = abin.header('inhibitTime')
look_time = run_time - inhibit_time
ml_analyzed = FLOW_RATE * (look_time / 60.)
return ml_analyzed, look_time, run_time
column_names = ['trigger', 'adc_time', 'pmt_a', 'pmt_b', 'pmt_c', 'pmt_d', 'peak_a', 'peak_b', 'peak_c', 'peak_d', 'time_of_flight', 'grabtime_start', 'grabtime_end', 'roi_x', 'roi_y', 'roi_width', 'roi_height', 'start_byte', 'comparator_out', 'start_point', 'signal_length', 'status', 'runtime', 'inhibit_time']
adc.columns = column_names

if adc.empty:
return np.nan, np.nan, np.nan

diffinh = np.diff(adc['inhibit_time'])

# find indices of rows where inhibitime is not 0 and not less than the previous value (within 0.1 second)
# iii = [0] + [i+1 for i in range(len(diffinh)) if diffinh[i] > -0.1 and diffinh[i] < 5]
iii = np.where((diffinh > -0.1) & (diffinh < 5))[0] + 1
iii = np.insert(iii, 0, 0)


# calculate the mode differential inhibittime from the good records, round to nearest 4 digits before finding mode
# this will be used as the "second best" estimate of the inhibittime for the whole file
rounded_diffinh = round(pd.Series(np.diff(adc.loc[iii+1, 'inhibit_time'])), 4)
modeinhibittime = rounded_diffinh.mode().values[0]

def compute_ml_analyzed_s2(abin):
"""compute ml_analyzed for a new instrument"""
TOLERANCE = 0.05
ml_analyzed_hdr, look_time_hdr, run_time_hdr = compute_ml_analyzed_s2_hdr(abin)
ml_analyzed_adc, look_time_adc, run_time_adc = compute_ml_analyzed_s2_adc(abin)
if look_time_hdr > 0 and abs(ml_analyzed_adc - ml_analyzed_hdr) < TOLERANCE:
return ml_analyzed_hdr, look_time_hdr, run_time_hdr
runtime_offset = 0
inhibittime_offset = 0

if adc.shape[0] > 1: # if there is more than one row in the file
# calculate the offset between runtime and adc_time for the first record
runtime_offset_test = adc['runtime'].iloc[1] - adc['adc_time'].iloc[1]

# if the offset is greater than 10 seconds, use it as the offset for the whole file
if runtime_offset_test > 10:
runtime_offset = runtime_offset_test
# use the second row since the first one is bad occasionally, add two mode increments to account for that
inhibittime_offset = adc['inhibit_time'].iloc[1] + modeinhibittime * 2

if adc.shape[0] == len(iii): # if all records are good
# inhibittime is the last record's inhibit_time minus the offset
# this is the best value--if it's not bad
inhibittime = adc['inhibit_time'].iloc[-1] - inhibittime_offset
else:
# second best estimate, last good row, plus mode as best guess for each bad row
inhibittime = adc['inhibit_time'].iloc[iii[-1]] + (adc.shape[0] - len(iii)) * modeinhibittime - inhibittime_offset

# runtime is the last record's runtime minus the offset
runtime = adc['runtime'].iloc[-1] - runtime_offset

n = min(adc.shape[0], 50) # use the first 50 records to estimate the runtime offset
runtime2 = adc['adc_time'].iloc[-1] + (adc['runtime'].iloc[:n] - adc['adc_time'].iloc[:n]).median() - runtime_offset

# if the difference between the two estimates is greater than 0.2 seconds, use the second one
if abs(runtime - runtime2) > 0.2:
runtime = runtime2
else:
return ml_analyzed_adc, look_time_adc, run_time_adc
return np.nan, np.nan, np.nan

looktime = runtime - inhibittime
ml_analyzed = FLOW_RATE * looktime / 60

def compute_ml_analyzed(abin):
"""returns ml_analyzed, look time, run time"""
s = abin.schema
if abin.pid.instrument == 5 and abin.timestamp >= pd.to_datetime('2015-06-01', utc=True):
return ml_analyzed, looktime, runtime

def compute_ml_analyzed_s2(b):
hdr_runtime = b.header('runtime')
hdr_inhibittime = b.header('inhibittime')
_, adc_looktime, adc_runtime = compute_ml_analyzed_s2_adc(b.adc)
adc_inhibittime = adc_runtime - adc_looktime
rat = hdr_runtime / adc_runtime
if rat < 0.98 or rat > 1.02:
runtime = adc_runtime
else:
runtime = hdr_runtime
rat = hdr_inhibittime / adc_inhibittime
if rat < 0.98 or rat > 1.02:
inhibittime = adc_inhibittime
else:
inhibittime = hdr_inhibittime
looktime = runtime - inhibittime
ml_analyzed = FLOW_RATE * looktime / 60
return ml_analyzed, looktime, runtime


def compute_ml_analyzed(b):
s = b.schema
if b.pid.instrument == 5 and b.timestamp >= pd.to_datetime('2015-06-01', utc=True):
# IFCB5 bins after June 2015 require a non-default min_proc_time
return compute_ml_analyzed_s1(abin, min_proc_time=0.05)
return compute_ml_analyzed_s1_adc(b.adc, min_proc_time=0.05)
elif s is SCHEMA_VERSION_1:
return compute_ml_analyzed_s1(abin)
return compute_ml_analyzed_s1_adc(b.adc)
elif s is SCHEMA_VERSION_2:
return compute_ml_analyzed_s2(abin)
return compute_ml_analyzed_s2(b)
else: # unknown bin type, indicating some upstream error
return np.nan, np.nan, np.nan
return np.nan, np.nan, np.nan

0 comments on commit db00491

Please sign in to comment.