-
Notifications
You must be signed in to change notification settings - Fork 0
/
check_consistency.py
100 lines (78 loc) · 3.59 KB
/
check_consistency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# -*- coding: utf-8 -*-
"""
[Martinez-Gil2023d] Framework to Automatically Determine the Quality of Open Data Catalogs, arXiv preprint arXiv:2307.15464, 2023
@author: Jorge Martinez-Gil
"""
import sys
from rdflib import Graph, Namespace, RDF
def check_consistency(rdf_data: str, entity_type: str) -> float:
    """
    Checks if there are inconsistencies in the attribute values for a specific entity type.

    A (subject, predicate) pair is considered inconsistent when the graph holds
    more than one distinct object for it. Pairs are collected over *all*
    entities of the requested type, so the result is always within [0, 100].

    Args:
        rdf_data: A string containing RDF data in Turtle format.
        entity_type: The type of entity to compare ("catalog", "dataset" or "distribution").

    Returns:
        A float representing the percentage of (subject, predicate) pairs that
        have inconsistent attribute values for the specified entity type.
        Returns 0.0 when the graph contains no entity of that type, since there
        is nothing that could be inconsistent.

    Raises:
        ValueError: If entity_type is not one of the supported names.
    """
    graph = Graph()
    # Parsing happens before the entity-type check, so a malformed document is
    # reported ahead of an invalid entity type (same precedence as before).
    graph.parse(data=rdf_data, format="turtle")

    dcat = Namespace("http://www.w3.org/ns/dcat#")
    entity_classes = {
        "catalog": dcat.Catalog,
        "dataset": dcat.Dataset,
        "distribution": dcat.Distribution,
    }
    if entity_type not in entity_classes:
        raise ValueError(f"Invalid entity type: {entity_type}")

    # Map every (subject, predicate) pair, across all matching entities, to the
    # set of distinct objects asserted for it. The mapping must live outside
    # the entity loop; otherwise only the last entity contributes to the total.
    objects_by_pair = {}
    for entity in graph.subjects(RDF.type, entity_classes[entity_type]):
        for subj, pred, obj in graph.triples((entity, None, None)):
            objects_by_pair.setdefault((subj, pred), set()).add(obj)

    # No entities of this type: avoid dividing by zero.
    if not objects_by_pair:
        return 0.0

    inconsistent_pairs = sum(
        1 for objects in objects_by_pair.values() if len(objects) > 1
    )
    return (inconsistent_pairs / len(objects_by_pair)) * 100
"""
This program checks the consistency of a Data Catalog for a specific entity type.
Usage:
python check_consistency.py catalog.ttl entity_type
The function `check_consistency` takes an RDF data string and an entity type as input and returns the percentage of (subject, predicate) pairs that have inconsistent attribute values for the specified entity type.
"""
def main():
    """
    Command-line entry point.

    Expects two positional arguments: the path to a Turtle file and the entity
    type to inspect. Prints the inconsistency percentage on success; on any
    failure prints a message and exits with status 1.
    """

    def _fail(message):
        # Every failure path reports on stdout and terminates with status 1.
        print(message)
        sys.exit(1)

    try:
        if len(sys.argv) < 3:
            _fail("Usage: python check_consistency.py filepath entity_type")

        rdf_data_path, entity_type = sys.argv[1:3]

        # Read the whole catalog into memory; the checker works on a string.
        with open(rdf_data_path, "r", encoding="utf-8") as handle:
            rdf_data = handle.read()

        result = check_consistency(rdf_data, entity_type)
        print(f"The percentage of inconsistencies in {rdf_data_path} for {entity_type} is {result:.2f}%.")
    except FileNotFoundError:
        _fail(f"File not found: {rdf_data_path}")
    except Exception as error:
        # Covers ValueError (invalid entity type) as well as any other error;
        # both are reported with the same message format.
        _fail(f"Error: {error}")
# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()