add Dataset class for ROCES and further dev of NCES2 and ROCES
Jean-KOUAGOU committed Dec 9, 2024
1 parent c74de16 commit 4c9a9d2
Showing 2 changed files with 215 additions and 59 deletions.
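
The central rename in this commit swaps the DataLoader-suffixed classes in ontolearn.data_struct for Dataset-suffixed ones. As a rough sketch of the new call sites, with argument order taken from the hunks below (data, instance_embeddings, vocab, inv_vocab, max_length and collate_batch are placeholders for values the learner already holds, not a verified API):

    from torch.utils.data import DataLoader
    from ontolearn.data_struct import NCESDataset, NCESDatasetInference

    # Training: wrap the learning problems in the renamed dataset class...
    train_dataset = NCESDataset(data, instance_embeddings, vocab, inv_vocab,
                                shuffle_examples=False, max_length=48, example_sizes=None)
    # ...and keep batching in torch's own DataLoader, exactly as before the rename.
    train_dataloader = DataLoader(train_dataset, batch_size=64, num_workers=4,
                                  collate_fn=collate_batch, shuffle=True)

    # Inference: the *Inference variant takes ("", positive_names, negative_names) triples.
    dataset = NCESDatasetInference([("", pos_names, neg_names)], instance_embeddings,
                                   vocab, inv_vocab, False, True)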
ontolearn/concept_learner.py: 75 changes (41 additions, 34 deletions)
@@ -50,7 +50,7 @@
AbstractHeuristic, EncodedPosNegLPStandardKind
from ontolearn.base_concept_learner import BaseConceptLearner, RefinementBasedConceptLearner
from owlapy.utils import EvaluatedDescriptionSet, ConceptOperandSorter, OperandSetTransform
from ontolearn.data_struct import NCESDataLoader, NCESDataLoaderInference, CLIPDataLoader, CLIPDataLoaderInference
from ontolearn.data_struct import NCESDataset, NCESDatasetInference, CLIPDataset, CLIPDatasetInference
from ontolearn.ea_algorithms import AbstractEvolutionaryAlgorithm, EASimple
from ontolearn.ea_initialization import AbstractEAInitialization, EARandomInitialization, EARandomWalkInitialization
from ontolearn.ea_utils import PrimitiveFactory, OperatorVocabulary, ToolboxVocabulary, Tree, escape, ind_to_string, \
@@ -688,7 +688,7 @@ def pos_neg_to_tensor(self, pos: Union[Set[OWLNamedIndividual]], neg: Union[Set[
assert self.load_pretrained and self.pretrained_predictor_name, \
"No pretrained model found. Please first train length predictors, see the <<train>> method below"

dataset = CLIPDataLoaderInference([("", pos_str, neg_str)], self.instance_embeddings, False, False)
dataset = CLIPDatasetInference([("", pos_str, neg_str)], self.instance_embeddings, False, False)
dataloader = DataLoader(dataset, batch_size=1, num_workers=self.num_workers,
collate_fn=self.collate_batch_inference, shuffle=False)
x_pos, x_neg = next(iter(dataloader))
@@ -781,7 +781,7 @@ def fit(self, *args, **kwargs):
def train(self, data: Iterable[List[Tuple]], epochs=300, batch_size=256, learning_rate=1e-3, decay_rate=0.0,
clip_value=5.0, save_model=True, storage_path=None, optimizer='Adam', record_runtime=True,
example_sizes=None, shuffle_examples=False):
train_dataset = CLIPDataLoader(data, self.instance_embeddings, shuffle_examples=shuffle_examples,
train_dataset = CLIPDataset(data, self.instance_embeddings, shuffle_examples=shuffle_examples,
example_sizes=example_sizes)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=self.num_workers,
collate_fn=self.collate_batch, shuffle=True)
@@ -849,7 +849,7 @@ def get_synthesizer(self, path=None):
pass
except Exception as e:
print(e)
raise RuntimeError
raise FileNotFoundError(f"The path to pretrained models is None and no models could be found in {self.path_of_embeddings.split("embeddings")[0]}trained_models")

if Models:
print("\n Loaded NCES weights!\n")
@@ -860,7 +860,7 @@ def get_synthesizer(self, path=None):

elif len(glob.glob(path+"/*.pt")) == 0:
print("No pretrained model found! If directory is empty or does not exist, set the NCES `load_pretrained` parameter to `False` or make sure `save_model` was set to `True` in the .train() method.")
raise FileNotFoundError
raise FileNotFoundError(f"Path {path} does not contain any pretrained models!")
else:
for file_name in glob.glob(path+"/*.pt"):
for m in Untrained:
@@ -875,7 +875,7 @@ def get_synthesizer(self, path=None):
print("\n Loaded NCES weights!\n")
return Models
else:
print("!!!Returning untrained models, could not load pretrained")
print("!!!Returning untrained models, could not load pretrained models")
return Untrained
else:
print("!!!Returning untrained models, could not load pretrained. Check the `load_pretrained parameter` or train the models using NCES.train(data).")
@@ -888,7 +888,7 @@ def refresh(self, path=None):
self.model = self.get_synthesizer(path)

def sample_examples(self, pos, neg): # pragma: no cover
assert type(pos[0]) == type(neg[0]), "The two iterables pos and neg must be of same type"
assert type(pos[0]) == type(neg[0]), f"The two iterables pos and neg must be of same type, got {type(pos[0])} and {type(neg[0])}"
num_ex = self.num_examples
if min(len(pos), len(neg)) >= num_ex // 2:
if len(pos) > len(neg):
@@ -941,7 +941,7 @@ def fit_one(self, pos: Union[Set[OWLNamedIndividual], Set[str]], neg: Union[Set[
assert self.load_pretrained and self.learner_names, \
"No pretrained model found. Please first train NCES, see the <<train>> method below"

dataset = NCESDataLoaderInference([("", Pos_str, Neg_str) for (Pos_str, Neg_str) in zip(Pos, Neg)],
dataset = NCESDatasetInference([("", Pos_str, Neg_str) for (Pos_str, Neg_str) in zip(Pos, Neg)],
self.instance_embeddings,
self.vocab, self.inv_vocab, False, self.sorted_examples)

@@ -1033,7 +1033,7 @@ def fit_from_iterable(self, dataset: Union[List[Tuple[str, Set[OWLNamedIndividua
assert self.load_pretrained and self.learner_names, \
"No pretrained model found. Please first train NCES, refer to the <<train>> method"
dataset = [self.convert_to_list_str_from_iterable(datapoint) for datapoint in dataset]
dataset = NCESDataLoaderInference(dataset, self.instance_embeddings, self.vocab, self.inv_vocab,
dataset = NCESDatasetInference(dataset, self.instance_embeddings, self.vocab, self.inv_vocab,
shuffle_examples)
dataloader = DataLoader(dataset, batch_size=self.batch_size, num_workers=self.num_workers,
collate_fn=self.collate_batch_inference, shuffle=False)
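
A hedged usage sketch for fit_from_iterable() as wired above: each datapoint pairs a target expression with positive and negative examples, and the method returns the synthesized OWL class expressions (the nces instance and the individual names are hypothetical):

    # One toy learning problem; individuals may also be OWLNamedIndividual objects.
    lps = [("Father", {"markus", "martin"}, {"anna", "michelle"})]
    predictions = nces.fit_from_iterable(lps, shuffle_examples=False)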
@@ -1083,7 +1083,7 @@ def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, num_
batch_size = self.batch_size
if data is None:
data = self.generate_training_data(self.knowledge_base_path, num_lps=num_lps, storage_dir=storage_path)
train_dataset = NCESDataLoader(data, self.instance_embeddings, self.vocab, self.inv_vocab,
train_dataset = NCESDataset(data, self.instance_embeddings, self.vocab, self.inv_vocab,
shuffle_examples=shuffle_examples, max_length=self.max_length,
example_sizes=example_sizes)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers,
@@ -1119,13 +1119,17 @@ def __init__(self, knowledge_base_path,
self.model = self.get_synthesizer()


def _maybe_load_pretrained_config(self):
def _maybe_load_pretrained_config(self, path=None):
if isinstance(self.m, int):
self.m = [self.m]
if self.load_pretrained and len(glob.glob(self.path_of_trained_models + "/*.pt")):
if path:
possible_checkpoints = glob.glob(path + "/*.pt")
else:
possible_checkpoints = glob.glob(self.path_of_trained_models + "/*.pt")
if self.load_pretrained and len(possible_checkpoints):
stop_outer_loop = False
try:
for file_name in glob.glob(self.path_of_trained_models + "/*.pt"):
for file_name in possible_checkpoints:
for m in self.m:
if str(m) in file_name:
try:
@@ -1148,15 +1152,17 @@ def get_synthesizer(self, path=None):
Models = {str(m): {"emb_model": ConEx(self.embedding_dim, self.num_entities, self.num_relations, self.input_dropout,
self.feature_map_dropout, self.kernel_size, self.num_of_output_channels),
"model": SetTransformer(self.knowledge_base_path, self.vocab, self.inv_vocab, self.max_length,
self.input_size, self.proj_dim, self.num_heads, self.num_seeds, m, self.ln)} for m in self.m}
self.input_size, self.proj_dim, self.num_heads, self.num_seeds, m, self.ln)}
for m in self.m}
if self.load_pretrained:
num_loaded = 0
if path is None:
possible_checkpoints = glob.glob(self.path_of_trained_models + "/*.pt")
try:
if len(glob.glob(self.path_of_trained_models + "/*.pt")) == 0:
if len(possible_checkpoints) == 0:
raise FileNotFoundError(f"`path` is None and no models found in {self.path_of_trained_models}")
else:
for file_name in glob.glob(self.path_of_trained_models + "/*.pt"):
for file_name in possible_checkpoints:
for m in self.m:
if str(m) in file_name:
try:
@@ -1172,22 +1178,23 @@ def get_synthesizer(self, path=None):
Models[str(m)]["emb_model"] = emb_model.eval()
except Exception as e:
print(e)
raise FileNotFoundError
raise FileNotFoundError(f"The path to pretrained models is None and no models could be found in {self.path_of_trained_models}")

if num_loaded == len(self.m):
print("\nLoaded NCES2 weights!\n")
return Models
else:
print("!!!Returning untrained models, could not load pretrained")
print("!!!Returning untrained models, could not load pretrained models")
return Models

elif len(glob.glob(path+"/*.pt")) == 0:
elif len(glob.glob(path + "/*.pt")) == 0:
print("No pretrained model found! If directory is empty or does not exist, set the NCES2 `load_pretrained` parameter to `False` or make sure `save_model` was set to `True` in the .train() method.")
raise FileNotFoundError
raise FileNotFoundError(f"Path {path} does not contain any pretrained models!")
else:
possible_checkpoints = glob.glob(path + "/*.pt")
num_loaded = 0
for file_name in glob.glob(path+"/*.pt"):
for m, model, emb_model in zip(self.m, Untrained_Synt, Untrained_Emb):
for file_name in possible_checkpoints:
for m in self.m:
if str(m) in file_name:
try:
weights = torch.load(file_name, map_location=self.device, weights_only=True)
Expand All @@ -1207,17 +1214,18 @@ def get_synthesizer(self, path=None):
print("!!!Returning untrained models, could not load pretrained")
return Models
else:
print("!!!Returning untrained models, could not load pretrained. Check the `load_pretrained parameter` or train the models using NCES2.train(data).")
print("!!!Returning untrained models, could not load any pretrained model. Check the `load_pretrained parameter` or train the models using NCES2.train(data).")
return Models


def refresh(self, path=None):
if path is not None:
self.load_pretrained = True
self._maybe_load_pretrained_config(path)
self.model = self.get_synthesizer(path)

def sample_examples(self, pos, neg): # pragma: no cover
assert type(pos[0]) == type(neg[0]), "The two iterables pos and neg must be of same type"
assert type(pos[0]) == type(neg[0]), f"The two iterables pos and neg must be of same type, got {type(pos[0])} and {type(neg[0])}"
num_ex = self.num_examples
if min(len(pos), len(neg)) >= num_ex // 2:
if len(pos) > len(neg):
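
For illustration, how the updated refresh() above would be used to hot-swap checkpoints: with the two added lines, passing an explicit path now also re-enables load_pretrained before the synthesizer is rebuilt (the instance and directory names are hypothetical):

    # Reload *.pt checkpoints from a new directory; refresh() without a path keeps the old behaviour.
    synthesizer.refresh(path="./trained_models")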
@@ -1255,7 +1263,6 @@ def get_prediction(self, models, x1, x2):
return prediction

def fit_one(self, pos: Union[Set[OWLNamedIndividual], Set[str]], neg: Union[Set[OWLNamedIndividual], Set[str]]):
#print("\n\n#### In fit one\n\n")
if isinstance(pos[0], OWLNamedIndividual):
pos_str = [ind.str.split("/")[-1] for ind in pos]
neg_str = [ind.str.split("/")[-1] for ind in neg]
Expand All @@ -1267,17 +1274,18 @@ def fit_one(self, pos: Union[Set[OWLNamedIndividual], Set[str]], neg: Union[Set[
Pos = np.random.choice(pos_str, size=(self.num_predictions, len(pos_str)), replace=True)
Neg = np.random.choice(neg_str, size=(self.num_predictions, len(neg_str)), replace=True)

assert self.load_pretrained and self.learner_names, \
"No pretrained model found. Please first train NCES, see the <<train>> method below"
assert self.load_pretrained and self.m, \
"No pretrained model found. Please first train NCES2"

dataset = NCESDataLoaderInference([("", Pos_str, Neg_str) for (Pos_str, Neg_str) in zip(Pos, Neg)],
dataset = NCES2DatasetInference([("", Pos_str, Neg_str) for (Pos_str, Neg_str) in zip(Pos, Neg)],
self.instance_embeddings,
self.vocab, self.inv_vocab, False, self.sorted_examples)

dataloader = DataLoader(dataset, batch_size=self.batch_size,
num_workers=self.num_workers,
collate_fn=self.collate_batch_inference, shuffle=False)
x_pos, x_neg = next(iter(dataloader))
# Initialize a simple solution constructor
simpleSolution = SimpleSolution(list(self.vocab), self.atomic_concept_names)
predictions_raw = self.get_prediction(self.model, x_pos, x_neg)
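
The two np.random.choice calls in fit_one() above draw num_predictions bootstrap resamples of the example names, so each prediction pass sees a different arrangement of the same individuals. A small self-contained illustration:

    import numpy as np

    pos_str = ["anna", "bob", "carl"]  # toy individual names
    num_predictions = 4
    # Each row is one sample drawn with replacement from pos_str.
    Pos = np.random.choice(pos_str, size=(num_predictions, len(pos_str)), replace=True)
    print(Pos.shape)  # (4, 3); rows may repeat entries, e.g. ['bob' 'bob' 'anna']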

@@ -1362,7 +1370,7 @@ def fit_from_iterable(self, dataset: Union[List[Tuple[str, Set[OWLNamedIndividua
assert self.load_pretrained and self.learner_names, \
"No pretrained model found. Please first train NCES, refer to the <<train>> method"
dataset = [self.convert_to_list_str_from_iterable(datapoint) for datapoint in dataset]
dataset = NCESDataLoaderInference(dataset, self.instance_embeddings, self.vocab, self.inv_vocab,
dataset = NCESDatasetInference(dataset, self.instance_embeddings, self.vocab, self.inv_vocab,
shuffle_examples)
dataloader = DataLoader(dataset, batch_size=self.batch_size, num_workers=self.num_workers,
collate_fn=self.collate_batch_inference, shuffle=False)
@@ -1386,8 +1394,8 @@ def fit_from_iterable(self, dataset: Union[List[Tuple[str, Set[OWLNamedIndividua
return predictions_as_owl_class_expressions

@staticmethod
def generate_training_data(kb_path, num_lps=1000, storage_dir="./NCES_Training_Data"):
lp_gen = LPGen(kb_path=kb_path, max_num_lps=num_lps, storage_dir=storage_dir)
def generate_training_data(kb_path, num_lps=1000, beyond_alc=False, storage_dir="./NCES_Training_Data"):
lp_gen = LPGen(kb_path=kb_path, max_num_lps=num_lps, beyond_alc=beyond_alc, storage_dir=storage_dir)
lp_gen.generate()
print("Loading generated data...")
with open(f"{storage_dir}/LPs.json") as file:
Expand All @@ -1396,7 +1404,6 @@ def generate_training_data(kb_path, num_lps=1000, storage_dir="./NCES_Training_D
return lps



def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, num_lps=1000, learning_rate=1e-4, decay_rate=0.0,
clip_value=5.0, num_workers=8, save_model=True, storage_path=None, optimizer='Adam', record_runtime=True,
example_sizes=None, shuffle_examples=False):
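
A hedged end-to-end sketch of the training path above, assuming generate_training_data() is exposed on the NCES2 class as the nearby NCES2.train(data) message suggests; beyond_alc presumably lets LPGen emit learning problems beyond the ALC description logic (inferred from the parameter name alone), and both the knowledge base path and the model instance are hypothetical:

    # Generate learning problems (written to storage_dir/LPs.json), then train on them.
    data = NCES2.generate_training_data("KGs/Family/family.owl", num_lps=1000,
                                        beyond_alc=True, storage_dir="./NCES_Training_Data")
    # model is a hypothetical NCES2 instance; constructor arguments elided.
    model.train(data, epochs=50, batch_size=64, save_model=True,
                storage_path="./NCES_Training_Data")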
@@ -1412,7 +1419,7 @@ def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, num_
batch_size = self.batch_size
if data is None:
data = self.generate_training_data(self.knowledge_base_path, num_lps=num_lps, storage_dir=storage_path)
train_dataset = NCESDataLoader(data, self.instance_embeddings, self.vocab, self.inv_vocab,
train_dataset = NCESDataset(data, self.instance_embeddings, self.vocab, self.inv_vocab,
shuffle_examples=shuffle_examples, max_length=self.max_length,
example_sizes=example_sizes)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers,
@@ -1456,7 +1463,7 @@ def train(self, data: Iterable[List[Tuple]]=None, epochs=50, batch_size=64, num_
batch_size = self.batch_size
if data is None:
data = self.generate_training_data(self.knowledge_base_path, num_lps=num_lps, storage_dir=storage_path)
train_dataset = NCESDataLoader(data, self.instance_embeddings, self.vocab, self.inv_vocab,
train_dataset = NCESDataset(data, self.instance_embeddings, self.vocab, self.inv_vocab,
shuffle_examples=shuffle_examples, max_length=self.max_length,
example_sizes=example_sizes)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers,