Added drop_columns feature; useful for optimizing cutoffs
monk-04 committed Sep 24, 2023
1 parent ac89588 commit 1db3823
Showing 3 changed files with 329 additions and 3 deletions.
103 changes: 103 additions & 0 deletions tests/test_optimize.py
@@ -0,0 +1,103 @@
import pytest
import numpy as np
import ase
from uf3.data import composition
from uf3.representation.process import *
from uf3.regression import optimize

@pytest.fixture()
def Nb_Sn_chemistry():
element_list = ['Nb', 'Sn']
chemistry_config = composition.ChemicalSystem(element_list,degree=3)
yield chemistry_config

@pytest.fixture()
def bspline_config_larger_cutoff(Nb_Sn_chemistry):
element_list = ['Nb', 'Sn']
chemistry_config = composition.ChemicalSystem(element_list,degree=3)
bspline_config = optimize.get_bspline_config(Nb_Sn_chemistry, rmin=0,
rmax_2b=6, rmax_3b=4,
knot_spacing=0.4)
yield bspline_config


@pytest.fixture()
def bspline_config_smaller_cutoff(Nb_Sn_chemistry):
element_list = ['Nb', 'Sn']
chemistry_config = composition.ChemicalSystem(element_list,degree=3)
bspline_config = optimize.get_bspline_config(Nb_Sn_chemistry, rmin=0,
rmax_2b=4.4, rmax_3b=3.2,
knot_spacing=0.4)
yield bspline_config

@pytest.fixture()
def Nb3Sn_geom():
geom = ase.Atoms('Nb6Sn2',
positions = [[0.000000000, 2.667305470, 1.333652735],
[0.000000000, 2.667305470, 4.000958204],
[1.333652735, 0.000000000, 2.667305470],
[4.000958204, 0.000000000, 2.667305470],
[2.667305470, 1.333652735, 0.000000000],
[2.667305470, 4.000958204, 0.000000000],
[0.000000000, 0.000000000, 0.000000000],
[2.667305470, 2.667305470, 2.667305470]],
pbc = True,
cell = [[5.3346109390, 0.0000000000, 0.0000000000],
[0.0000000000, 5.3346109390, 0.0000000000],
[0.0000000000, 0.0000000000, 5.3346109390]])

yield geom


class TestOptimize:

def test_get_bspline_config(self,bspline_config_larger_cutoff):

for i in bspline_config_larger_cutoff.interactions_map[2]:
assert bspline_config_larger_cutoff.r_min_map[i] == 0
assert bspline_config_larger_cutoff.r_max_map[i] == 6
assert bspline_config_larger_cutoff.resolution_map[i] == 15

for i in bspline_config_larger_cutoff.interactions_map[3]:
assert bspline_config_larger_cutoff.r_min_map[i] == [0,0,0]
assert bspline_config_larger_cutoff.r_max_map[i] == [4, 4, 8]
assert bspline_config_larger_cutoff.resolution_map[i] == [10,10,20]


def test_get_possible_lower_cutoffs(self,bspline_config_larger_cutoff):
cutoff_dict = optimize.get_possible_lower_cutoffs(bspline_config_larger_cutoff)

assert np.allclose(cutoff_dict['rmax_2b_poss'],np.array([0.4, 0.8, 1.2,\
1.6, 2. , 2.4, 2.8, 3.2, 3.6, 4. , 4.4, 4.8, 5.2, 5.6, 6. ]))

assert np.allclose(cutoff_dict['rmax_3b_poss'],np.array([0.8, 1.6, 2.4, 3.2, 4. ]))

def test_drop_columns(self,bspline_config_larger_cutoff,Nb_Sn_chemistry, Nb3Sn_geom):
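# dropping the computed columns from the larger-cutoff features must reproduce
# the smaller-cutoff features exactly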

bspline_handler_larger_cutoff = BasisFeaturizer(bspline_config_larger_cutoff)

columns_to_drop_2b = optimize.get_columns_to_drop_2b(original_bspline_config=bspline_config_larger_cutoff,
modify_2b_cutoff=4.4,
knot_spacing=0.4)

columns_to_drop_3b = optimize.get_columns_to_drop_3b(original_bspline_config=bspline_config_larger_cutoff,
modify_3b_cutoff=3.2,
knot_spacing=0.4)

columns_to_drop = columns_to_drop_2b + columns_to_drop_3b

bspline_config_smaller_cutoff = optimize.get_bspline_config(Nb_Sn_chemistry, rmin=0,
rmax_2b=4.4, rmax_3b=3.2,
knot_spacing=0.4)
bspline_handler_smaller_cutoff = BasisFeaturizer(bspline_config_smaller_cutoff)

assert len(bspline_handler_larger_cutoff.columns) - len(columns_to_drop) == len(bspline_handler_smaller_cutoff.columns)

energy_feature_larger_cutoff = bspline_handler_larger_cutoff.evaluate_configuration(geom=Nb3Sn_geom,energy=-69.44185169)
energy_feature_smaller_cutoff = bspline_handler_smaller_cutoff.evaluate_configuration(geom=Nb3Sn_geom,energy=-69.44185169)

index_to_drop = np.where(np.isin(bspline_handler_larger_cutoff.columns,columns_to_drop))[0]
energy_feature_from_larger_cutoff = np.delete(energy_feature_larger_cutoff['energy'],index_to_drop)

assert np.allclose(energy_feature_smaller_cutoff['energy'],energy_feature_from_larger_cutoff)

29 changes: 26 additions & 3 deletions uf3/regression/least_squares.py
@@ -361,7 +361,8 @@ def fit_from_file(self,
batch_size=2500,
sample_weights: Dict = None,
energy_key="energy",
progress: str = "bar"):
progress: str = "bar",
drop_columns: list = None):
"""
Accumulate inputs and outputs from batched parsing of HDF5 file
and compute direct solution via LU decomposition.
@@ -376,6 +377,10 @@
sample_weights (dict):
energy_key (str): column name for energies, default "energy".
progress (str): style for progress indicators.
drop_columns (list): list of feature columns to drop. Used when reducing
the cutoffs of feature vectors read from the HDF5 file. No internal
checks verify that dropping the provided columns produces features
with the intended cutoffs. Use with caution.
"""
if not os.path.isfile(filename):
raise FileNotFoundError(filename)
@@ -391,6 +396,10 @@
keys = df.index.unique(level=0).intersection(subset)
if len(keys) == 0:
continue

if drop_columns is not None:
df.drop(columns=drop_columns,inplace=True)

intermediates = self.gram_from_df(df,
keys,
e_variance=e_variance,
@@ -477,7 +486,8 @@ def batched_predict(self,
filename: str,
keys: List[str] = None,
table_names: List[str] = None,
score: bool = True):
score: bool = True,
drop_columns: list = None):
"""
Extract inputs and outputs from HDF5 file and predict energies/forces.
@@ -494,13 +504,18 @@
p_f (np.ndarray): predicted values for forces.
rmse_e (np.ndarray): RMSE across energy predictions.
rmse_f (np.ndarray): RMSE across force predictions.
drop_columns (list): list of feature columns to drop. Used when reducing
the cutoffs of feature vectors read from the HDF5 file. No internal
checks verify that dropping the provided columns produces features
with the intended cutoffs. Use with caution.
"""
n_elements = len(self.bspline_config.element_list)
y_e, p_e, y_f, p_f = batched_prediction(self,
filename,
table_names=table_names,
subset_keys=keys,
n_elements=n_elements)
n_elements=n_elements,
drop_columns=drop_columns)
if score:
rmse_e = rmse_metric(y_e, p_e)
rmse_f = rmse_metric(y_f, p_f)
@@ -954,6 +969,7 @@ def batched_prediction(model: WeightedLinearModel,
filename: str,
table_names: Collection = None,
subset_keys: Collection = None,
drop_columns: list = None,
**kwargs):
"""
Convenience function for optimization workflow. Read inputs/outputs
@@ -964,6 +980,10 @@
model (WeightedLinearModel): fitted model.
table_names (list): list of table names to query from HDF5 file.
subset_keys (list): list of keys to query from DataFrame.
drop_columns (list): list of feature columns to drop. Used when reducing
the cutoffs of feature vectors read from the HDF5 file. No internal
checks verify that dropping the provided columns produces features
with the intended cutoffs. Use with caution.
Returns:
y_e (np.ndarray): target values for energies.
@@ -979,6 +999,9 @@
y_f = []
p_f = []
for df in df_batches:
if drop_columns is not None:
df.drop(columns=drop_columns,inplace=True)

predictions = subset_prediction(df,
model,
subset_keys=subset_keys,
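For orientation, a minimal usage sketch of the new drop_columns argument. The feature-file name, the key lists, the positional subset argument of fit_from_file, and the WeightedLinearModel construction are illustrative assumptions; the drop lists come from the helpers in uf3/regression/optimize.py shown below.

from uf3.data import composition
from uf3.regression import least_squares, optimize

# chemistry and the larger-cutoff config that the (hypothetical) feature file was built with
chemistry = composition.ChemicalSystem(['Nb', 'Sn'], degree=3)
config_large = optimize.get_bspline_config(chemistry, rmin=0, rmax_2b=6,
                                           rmax_3b=4, knot_spacing=0.4)

# feature columns to discard so the stored features correspond to cutoffs of 4.4 (2-body) and 3.2 (3-body)
columns_to_drop = (
    optimize.get_columns_to_drop_2b(config_large, modify_2b_cutoff=4.4, knot_spacing=0.4)
    + optimize.get_columns_to_drop_3b(config_large, modify_3b_cutoff=3.2, knot_spacing=0.4))

# model built from the matching smaller-cutoff config (constructor call assumed)
config_small = optimize.get_bspline_config(chemistry, rmin=0, rmax_2b=4.4,
                                           rmax_3b=3.2, knot_spacing=0.4)
model = least_squares.WeightedLinearModel(config_small)

training_keys = ["structure_0"]  # hypothetical structure keys stored in features.h5
holdout_keys = ["structure_1"]

model.fit_from_file("features.h5", training_keys, drop_columns=columns_to_drop)
results = model.batched_predict("features.h5", keys=holdout_keys,
                                drop_columns=columns_to_drop)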
200 changes: 200 additions & 0 deletions uf3/regression/optimize.py
@@ -0,0 +1,200 @@
"""
This module provides functions for optimizing the cutoffs of 2-body and 3-body
terms. If a feature file exists with higher cutoffs, features with lower cutoffs
can be constructed by dropping the appropriate columns. The functions provided only
work when used with uniform knot spacing.
"""

import numpy as np
from uf3.representation import bspline

def get_bspline_config(chemical_system,
rmin: float,
rmax_2b: float,
rmax_3b: float,
knot_spacing: float):
"""
Function for getting a bspline_config object. We recommend using this function
to get the bspline_config object both for
1. creating the HDF5 file with the larger cutoffs, and
2. fitting models with the lower cutoffs.
The rmax_2b and rmax_3b values in the returned bspline_config object can differ
slightly from the requested values depending on the knot_spacing.
Args:
chemical_system: chemical system (composition.ChemicalSystem) defining the
elements and the interaction degree
rmin (float): minimum pair distance; only rmin=0 is supported at present
rmax_2b (float): Two-body cutoff
rmax_3b (float): Three-body cutoff. If the 3-body interaction is A-B-C, then
rmax_3b is the distance between A-B (or equivalently A-C), i.e. the first or
second term in the 3-body rmax of bspline_config.
knot_spacing (float): knot spacing used to create the HDF5 file
Returns:
bspline_config (bspline.BSplineBasis)
"""
if rmin!=0:
raise ValueError("Currrent version is only tested for rmin=0")

if (rmax_2b-rmin)%knot_spacing!=0:
if not (np.isclose((rmax_2b-rmin)%knot_spacing,knot_spacing) or \
np.isclose((rmax_2b-rmin)%knot_spacing,0)):
raise ValueError("Provided rmax_2b does not conatin integer number of \n\
knots, seperated by knot_spacing")

if (rmax_3b-rmin)%knot_spacing!=0:
if not (np.isclose((rmax_3b-rmin)%knot_spacing,knot_spacing) or \
np.isclose((rmax_3b-rmin)%knot_spacing,0)):
raise ValueError("Provided rmax_3b does not conatin integer number of \n\
knots, seperated by knot_spacing")

rmax_3b_double = rmax_3b*2
if not(np.isclose((rmax_3b_double%knot_spacing),0) or \
np.isclose((rmax_3b_double%knot_spacing),knot_spacing)):
raise ValueError("Provided rmax_3b contains integer number of knots sperated \n\
by knot_spacing, but rmax_3b_double does not. Consider changing rmax_3b \n\
to "+str(rmax_3b+knot_spacing))

reso_2b = round((rmax_2b - rmin)/knot_spacing)
reso_3b = round((rmax_3b - rmin)/knot_spacing)

reso_3b_double = round(reso_3b*2)

r_min_map = {i:rmin for i in chemical_system.interactions_map[2]}
r_min_map.update({i:[rmin,rmin,rmin] for i in chemical_system.interactions_map[3]})

r_max_map = {i:rmax_2b for i in chemical_system.interactions_map[2]}
r_max_map.update({i:[rmax_3b,rmax_3b,rmax_3b_double] for i in chemical_system.interactions_map[3]})

trailing_trim = 3
leading_trim = 0

resolution_map = {i:reso_2b for i in chemical_system.interactions_map[2]}
resolution_map.update({i:[reso_3b,reso_3b,reso_3b_double] for i in chemical_system.interactions_map[3]})

bspline_config = bspline.BSplineBasis(chemical_system, r_min_map=r_min_map,
r_max_map=r_max_map,resolution_map=resolution_map,
trailing_trim=trailing_trim,leading_trim=leading_trim)

return bspline_config
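# Worked example (illustrative): with rmin=0, rmax_2b=6.0, rmax_3b=4.0 and
# knot_spacing=0.4, the maps above give a 2-body resolution of 15 for every pair,
# a 3-body resolution of [10, 10, 20] for every triplet, and r_max_map entries of
# 6.0 and [4.0, 4.0, 8.0]; these are the values asserted in tests/test_optimize.py.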


def get_possible_lower_cutoffs(original_bspline_config):
"""
Function for getting the cutoff values obtainable by dropping columns of the HDF5 feature file
Args:
original_bspline_config: bspline_config used to create the HDF5 feature file
(i.e. the file produced with the larger cutoffs).
Returns:
Dict: {"rmax_2b_poss":rmax_2b_poss, "rmax_3b_poss":rmax_3b_poss}
rmax_2b_poss (list): List of possible 2-body cutoffs
rmax_3b_poss (list): List of possible 3-body cutoffs
"""
interaction_2b = original_bspline_config.interactions_map[2][0]
interaction_3b = original_bspline_config.interactions_map[3][0]

rmax_2b_poss = original_bspline_config.knots_map[interaction_2b][4:-3]
rmax_3b_poss = original_bspline_config.knots_map[interaction_3b][0][4:-3][1::2]

for i in range(rmax_2b_poss.shape[0]):
if rmax_2b_poss[i] not in original_bspline_config.knots_map[interaction_2b]:
raise ValueError("Internal check failed-->2B!!")

for i in range(rmax_3b_poss.shape[0]):
if rmax_3b_poss[i] not in original_bspline_config.knots_map[interaction_3b][0]:
raise ValueError("Internal check failed-->3B_0!!")

for i in range(rmax_3b_poss.shape[0]):
if rmax_3b_poss[i] not in original_bspline_config.knots_map[interaction_3b][1]:
raise ValueError("Internal check failed-->3B_1!!")

for i in range(rmax_3b_poss.shape[0]):
if 2*rmax_3b_poss[i] not in original_bspline_config.knots_map[interaction_3b][2]:
raise ValueError("Internal check failed-->3B_2!!")

return {"rmax_2b_poss":rmax_2b_poss, "rmax_3b_poss":rmax_3b_poss}



def get_columns_to_drop_2b(original_bspline_config,
modify_2b_cutoff: float,
knot_spacing: float):
"""
Function for getting appropriate 2-body feature columns to drop for fitting
to lower cutoffs
Args:
original_bspline_config: bspline_config used to create the HDF5 feature file
(i.e. the file produced with the larger cutoffs).
modify_2b_cutoff (float): Intended 2-body cutoff
knot_spacing (float): knot_spacing used to create the HDF5 file
Returns:
columns_to_drop_2b (list): should be passed to the drop_columns argument of
fit_from_file
"""
column_names = original_bspline_config.get_column_names()
interaction_partitions_num = original_bspline_config.get_interaction_partitions()[0]
interaction_partitions_posn = original_bspline_config.get_interaction_partitions()[1]

columns_to_drop_2b = []
for interaction in original_bspline_config.interactions_map[2]:
num_columns_to_drop_2b = round((original_bspline_config.knots_map[interaction][-4]-modify_2b_cutoff)/knot_spacing)

start_ind_2b = 1+interaction_partitions_posn[interaction]
end_ind_2b = start_ind_2b+interaction_partitions_num[interaction]
columns_to_drop_2b.extend(column_names[end_ind_2b-num_columns_to_drop_2b-3:end_ind_2b-3])
return columns_to_drop_2b

def get_columns_to_drop_3b(original_bspline_config,
modify_3b_cutoff: float,
knot_spacing: float):
"""
Function for getting appropriate 3-body feature columns to drop for fitting
to lower cutoffs
Args:
original_bspline_config: bspline_config used to create the HDF5 feature file
(i.e. the file produced with the larger cutoffs).
modify_3b_cutoff (float): Intended 3-body cutoff. If the 3-body interaction
is A-B-C, then modify_3b_cutoff is the maximum distance between A-B (or A-C)
i.e. the first or second term in the 3-body rmax of bspline_config.
knot_spacing (float): knot_spacing used to create the HDF5 file
Returns:
columns_to_drop_3b (list): should be passed to the drop_columns argument of
fit_from_file
"""
column_names = original_bspline_config.get_column_names()
interaction_partitions_num = original_bspline_config.get_interaction_partitions()[0]
interaction_partitions_posn = original_bspline_config.get_interaction_partitions()[1]

columns_to_drop_3b = []
for interaction in original_bspline_config.interactions_map[3]:
num_columns_to_drop_3b = round((original_bspline_config.knots_map[interaction][0][-4]-modify_3b_cutoff)/knot_spacing)
num_columns_to_drop_3b_double = int(num_columns_to_drop_3b*2)
start_ind_3b = 1+interaction_partitions_posn[interaction]
end_ind_3b = start_ind_3b + interaction_partitions_num[interaction]

l_space, m_space, n_space = original_bspline_config.knots_map[interaction]
L = len(l_space) - 4
M = len(m_space) - 4
N = len(n_space) - 4
grid_c = np.chararray((L, M, N))
grid_c[:] = '0'
grid_c = np.char.multiply(grid_c,16)
grid_c.flat[original_bspline_config.template_mask[interaction]] = column_names[start_ind_3b:end_ind_3b]

grid_c = np.delete(grid_c,np.s_[grid_c.shape[2]-3-num_columns_to_drop_3b_double:grid_c.shape[2]-3],axis=2) #4
grid_c = np.delete(grid_c,np.s_[grid_c.shape[1]-3-num_columns_to_drop_3b:grid_c.shape[1]-3],axis=1) #2
grid_c = np.delete(grid_c,np.s_[grid_c.shape[0]-3-num_columns_to_drop_3b:grid_c.shape[0]-3],axis=0) #2

columns_to_keep = grid_c[grid_c[:] != b'0000000000000000'].astype('<U16')

columns_to_keep_index_3b = np.where(np.isin(column_names[start_ind_3b:end_ind_3b],columns_to_keep))[0]

column_to_drop = np.setdiff1d(column_names[start_ind_3b:end_ind_3b],columns_to_keep)
column_to_drop_index = np.where(np.isin(column_names[start_ind_3b:end_ind_3b],column_to_drop))[0]

columns_to_drop_3b.extend(column_to_drop)
return columns_to_drop_3b
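A rough sketch of the cutoff-scanning loop these helpers enable, i.e. the "optimizing cutoffs" use case from the commit message. The feature file, key lists, model construction, and the unpacking of batched_predict's return values (taken from its docstring) are assumptions for illustration.

from uf3.data import composition
from uf3.regression import least_squares, optimize

chemistry = composition.ChemicalSystem(['Nb', 'Sn'], degree=3)
config_large = optimize.get_bspline_config(chemistry, rmin=0, rmax_2b=6,
                                           rmax_3b=4, knot_spacing=0.4)
candidates = optimize.get_possible_lower_cutoffs(config_large)

training_keys = ["structure_0"]  # hypothetical structure keys stored in features.h5
holdout_keys = ["structure_1"]

scores = {}
# scan only the larger candidates here; very small cutoffs may be impractical
for rmax_2b in candidates["rmax_2b_poss"][-4:]:
    for rmax_3b in candidates["rmax_3b_poss"][-3:]:
        # columns to discard so the stored features match the candidate cutoffs
        drop = (optimize.get_columns_to_drop_2b(config_large, rmax_2b, 0.4)
                + optimize.get_columns_to_drop_3b(config_large, rmax_3b, 0.4))
        config_small = optimize.get_bspline_config(chemistry, rmin=0,
                                                   rmax_2b=rmax_2b,
                                                   rmax_3b=rmax_3b,
                                                   knot_spacing=0.4)
        model = least_squares.WeightedLinearModel(config_small)  # assumed constructor
        model.fit_from_file("features.h5", training_keys, drop_columns=drop)
        y_e, p_e, y_f, p_f, rmse_e, rmse_f = model.batched_predict(
            "features.h5", keys=holdout_keys, drop_columns=drop)
        scores[(float(rmax_2b), float(rmax_3b))] = (rmse_e, rmse_f)

best_cutoffs = min(scores, key=lambda c: scores[c][0])  # e.g. lowest energy RMSE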
