excelparser¶
Module from parsing an excelfile and creating an ontology from it.
The excelfile is read by pandas and the pandas dataframe should have column names: prefLabel, altLabel, Elucidation, Comments, Examples, subClassOf, Relations.
Note that correct case is mandatory.
ExcelError (EMMOntoPyException)
¶
Raised on errors in Excel file.
Source code in ontopy/excelparser.py
class ExcelError(EMMOntoPyException):
"""Raised on errors in Excel file."""
create_ontology_from_excel(excelpath, concept_sheet_name='Concepts', metadata_sheet_name='Metadata', imports_sheet_name='ImportedOntologies', base_iri='http://emmo.info/emmo/domain/onto#', base_iri_from_metadata=True, imports=None, catalog=None, force=False, input_ontology=None)
¶
Creates an ontology from an Excel-file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
excelpath |
str |
Path to Excel workbook. |
required |
concept_sheet_name |
str |
Name of sheet where concepts are defined. The second row of this sheet should contain column names that are supported. Currently these are 'prefLabel','altLabel', 'Elucidation', 'Comments', 'Examples', 'subClassOf', 'Relations'. Multiple entries are separated with ';'. |
'Concepts' |
metadata_sheet_name |
str |
Name of sheet where metadata are defined.
The first row contains column names 'Metadata name' and 'Value'
Supported 'Metadata names' are: 'Ontology IRI',
'Ontology vesion IRI', 'Ontology version Info', 'Title',
'Abstract', 'License', 'Comment', 'Author', 'Contributor'.
Multiple entries are separated with a semi-colon ( |
'Metadata' |
imports_sheet_name |
str |
Name of sheet where imported ontologies are defined. Column name is 'Imported ontologies'. Fully resolvable URL or path to imported ontologies provided one per row. |
'ImportedOntologies' |
base_iri |
str |
Base IRI of the new ontology. |
'http://emmo.info/emmo/domain/onto#' |
base_iri_from_metadata |
bool |
Whether to use base IRI defined from metadata. |
True |
imports |
list |
List of imported ontologies. |
None |
catalog |
dict |
Imported ontologies with (name, full path) key/value-pairs. |
None |
force |
bool |
Forcibly make an ontology by skipping concepts that are erroneously defined or other errors in the excel sheet. |
False |
input_ontology |
Optional[ontopy.ontology.Ontology] |
Ontology that should be updated. Default is None, which means that a completely new ontology is generated. If an input_ontology to be updated is provided, the metadata sheet in the excel sheet will not be considered. |
None |
Returns:
Type | Description |
---|---|
A tuple with the |
|
Source code in ontopy/excelparser.py
def create_ontology_from_excel( # pylint: disable=too-many-arguments
excelpath: str,
concept_sheet_name: str = "Concepts",
metadata_sheet_name: str = "Metadata",
imports_sheet_name: str = "ImportedOntologies",
base_iri: str = "http://emmo.info/emmo/domain/onto#",
base_iri_from_metadata: bool = True,
imports: list = None,
catalog: dict = None,
force: bool = False,
input_ontology: Union[ontopy.ontology.Ontology, None] = None,
) -> Tuple[ontopy.ontology.Ontology, dict, dict]:
"""
Creates an ontology from an Excel-file.
Arguments:
excelpath: Path to Excel workbook.
concept_sheet_name: Name of sheet where concepts are defined.
The second row of this sheet should contain column names that are
supported. Currently these are 'prefLabel','altLabel',
'Elucidation', 'Comments', 'Examples', 'subClassOf', 'Relations'.
Multiple entries are separated with ';'.
metadata_sheet_name: Name of sheet where metadata are defined.
The first row contains column names 'Metadata name' and 'Value'
Supported 'Metadata names' are: 'Ontology IRI',
'Ontology vesion IRI', 'Ontology version Info', 'Title',
'Abstract', 'License', 'Comment', 'Author', 'Contributor'.
Multiple entries are separated with a semi-colon (`;`).
imports_sheet_name: Name of sheet where imported ontologies are
defined.
Column name is 'Imported ontologies'.
Fully resolvable URL or path to imported ontologies provided one
per row.
base_iri: Base IRI of the new ontology.
base_iri_from_metadata: Whether to use base IRI defined from metadata.
imports: List of imported ontologies.
catalog: Imported ontologies with (name, full path) key/value-pairs.
force: Forcibly make an ontology by skipping concepts
that are erroneously defined or other errors in the excel sheet.
input_ontology: Ontology that should be updated.
Default is None,
which means that a completely new ontology is generated.
If an input_ontology to be updated is provided,
the metadata sheet in the excel sheet will not be considered.
Returns:
A tuple with the:
* created ontology
* associated catalog of ontology names and resolvable path as dict
* a dictionary with lists of concepts that raise errors, with the
following keys:
- "already_defined": These are concepts that are already in the
ontology, because they were already added in a
previous line of the excelfile/pandas dataframe, or because
it is already defined in an imported ontology with the same
base_iri as the newly created ontology.
- "in_imported_ontologies": Concepts that are defined in the
excel, but already exist in the imported ontologies.
- "wrongly_defined": Concepts that are given an invalid
prefLabel (e.g. with a space in the name).
- "missing_parents": Concepts that are missing parents.
These concepts are added directly under owl:Thing.
- "invalid_parents": Concepts with invalidly defined parents.
These concepts are added directly under owl:Thing.
- "nonadded_concepts": List of all concepts that are not added,
either because the prefLabel is invalid, or because the
concept has already been added once or already exists in an
imported ontology.
"""
web_protocol = "http://", "https://", "ftp://"
def _relative_to_absolute_paths(path):
if isinstance(path, str):
if not path.startswith(web_protocol):
path = os.path.dirname(excelpath) + "/" + str(path)
return path
try:
imports = pd.read_excel(
excelpath, sheet_name=imports_sheet_name, skiprows=[1]
)
except ValueError:
imports = pd.DataFrame()
else:
# Strip leading and trailing white spaces in paths
imports.replace(r"^\s+", "", regex=True).replace(
r"\s+$", "", regex=True
)
# Set empty strings to nan
imports = imports.replace(r"^\s*$", np.nan, regex=True)
if "Imported ontologies" in imports.columns:
imports["Imported ontologies"] = imports[
"Imported ontologies"
].apply(_relative_to_absolute_paths)
# Read datafile TODO: Some magic to identify the header row
conceptdata = pd.read_excel(
excelpath, sheet_name=concept_sheet_name, skiprows=[0, 2]
)
metadata = pd.read_excel(excelpath, sheet_name=metadata_sheet_name)
return create_ontology_from_pandas(
data=conceptdata,
metadata=metadata,
imports=imports,
base_iri=base_iri,
base_iri_from_metadata=base_iri_from_metadata,
catalog=catalog,
force=force,
input_ontology=input_ontology,
)
create_ontology_from_pandas(data, metadata, imports, base_iri='http://emmo.info/emmo/domain/onto#', base_iri_from_metadata=True, catalog=None, force=False, input_ontology=None)
¶
Create an ontology from a pandas DataFrame.
Check 'create_ontology_from_excel' for complete documentation.
Source code in ontopy/excelparser.py
def create_ontology_from_pandas( # pylint:disable=too-many-locals,too-many-branches,too-many-statements,too-many-arguments
data: pd.DataFrame,
metadata: pd.DataFrame,
imports: pd.DataFrame,
base_iri: str = "http://emmo.info/emmo/domain/onto#",
base_iri_from_metadata: bool = True,
catalog: dict = None,
force: bool = False,
input_ontology: Union[ontopy.ontology.Ontology, None] = None,
) -> Tuple[ontopy.ontology.Ontology, dict]:
"""
Create an ontology from a pandas DataFrame.
Check 'create_ontology_from_excel' for complete documentation.
"""
# Remove lines with empty prefLabel
data = data[data["prefLabel"].notna()]
# Convert all data to string, remove spaces, and finally remove
# additional rows with empty prefLabel.
data = data.astype(str)
data["prefLabel"] = data["prefLabel"].str.strip()
data = data[data["prefLabel"].str.len() > 0]
data.reset_index(drop=True, inplace=True)
if input_ontology:
onto = input_ontology
catalog = {}
else: # Create new ontology
onto, catalog = get_metadata_from_dataframe(
metadata, base_iri, imports=imports
)
# Set given or default base_iri if base_iri_from_metadata is False.
if not base_iri_from_metadata:
onto.base_iri = base_iri
labels = set(data["prefLabel"])
for altlabel in data["altLabel"].str.strip():
if not altlabel == "nan":
labels.update(altlabel.split(";"))
# Dictionary with lists of concepts that raise errors
concepts_with_errors = {
"already_defined": [],
"in_imported_ontologies": [],
"wrongly_defined": [],
"missing_parents": [],
"invalid_parents": [],
"nonadded_concepts": [],
"errors_in_properties": [],
}
onto.sync_python_names()
with onto:
remaining_rows = set(range(len(data)))
all_added_rows = []
while remaining_rows:
added_rows = set()
for index in remaining_rows:
row = data.loc[index]
name = row["prefLabel"]
try:
onto.get_by_label(name)
if onto.base_iri in [
a.namespace.base_iri
for a in onto.get_by_label_all(name)
]:
if not force:
raise ExcelError(
f'Concept "{name}" already in ontology'
)
warnings.warn(
f'Ignoring concept "{name}" since it is already in '
"the ontology."
)
concepts_with_errors["already_defined"].append(name)
continue
concepts_with_errors["in_imported_ontologies"].append(name)
except (ValueError, TypeError) as err:
warnings.warn(
f'Ignoring concept "{name}". '
f'The following error was raised: "{err}"'
)
concepts_with_errors["wrongly_defined"].append(name)
continue
except NoSuchLabelError:
pass
if row["subClassOf"] == "nan":
if not force:
raise ExcelError(f"{row[0]} has no subClassOf")
parent_names = [] # Should be "owl:Thing"
concepts_with_errors["missing_parents"].append(name)
else:
parent_names = str(row["subClassOf"]).split(";")
parents = []
invalid_parent = False
for parent_name in parent_names:
try:
parent = onto.get_by_label(parent_name.strip())
except (NoSuchLabelError, ValueError) as exc:
if parent_name not in labels:
if force:
warnings.warn(
f'Invalid parents for "{name}": '
f'"{parent_name}".'
)
concepts_with_errors["invalid_parents"].append(
name
)
break
raise ExcelError(
f'Invalid parents for "{name}": {exc}\n'
"Have you forgotten an imported ontology?"
) from exc
invalid_parent = True
break
else:
parents.append(parent)
if invalid_parent:
continue
if not parents:
parents = [owlready2.Thing]
concept = onto.new_entity(name, parents)
added_rows.add(index)
# Add elucidation
try:
_add_literal(
row,
concept.elucidation,
"Elucidation",
only_one=True,
)
except AttributeError as err:
if force:
_add_literal(
row,
concept.comment,
"Elucidation",
only_one=True,
)
warnings.warn("Elucidation added as comment.")
else:
raise ExcelError(
f"Not able to add elucidations. {err}."
) from err
# Add examples
try:
_add_literal(
row, concept.example, "Examples", expected=False
)
except AttributeError:
if force:
warnings.warn(
"Not able to add examples. "
"Did you forget to import an ontology?."
)
# Add comments
_add_literal(row, concept.comment, "Comments", expected=False)
# Add altLabels
try:
_add_literal(
row, concept.altLabel, "altLabel", expected=False
)
except AttributeError as err:
if force is True:
_add_literal(
row,
concept.label,
"altLabel",
expected=False,
)
warnings.warn("altLabel added as rdfs.label.")
else:
raise ExcelError(
f"Not able to add altLabels. " f"{err}."
) from err
remaining_rows.difference_update(added_rows)
# Detect infinite loop...
if not added_rows and remaining_rows:
unadded = [data.loc[i].prefLabel for i in remaining_rows]
if force is True:
warnings.warn(
f"Not able to add the following concepts: {unadded}."
" Will continue without these."
)
remaining_rows = False
concepts_with_errors["nonadded_concepts"] = unadded
else:
raise ExcelError(
f"Not able to add the following concepts: {unadded}."
)
all_added_rows.extend(added_rows)
# Add properties in a second loop
for index in all_added_rows:
row = data.loc[index]
properties = row["Relations"]
if properties == "nan":
properties = None
if isinstance(properties, str):
try:
concept = onto.get_by_label(row["prefLabel"].strip())
except NoSuchLabelError:
pass
props = properties.split(";")
for prop in props:
try:
concept.is_a.append(evaluate(onto, prop.strip()))
except pyparsing.ParseException as exc:
warnings.warn(
f"Error in Property assignment for: '{concept}'. "
f"Property to be Evaluated: '{prop}'. "
f"{exc}"
)
concepts_with_errors["errors_in_properties"].append(name)
except NoSuchLabelError as exc:
msg = (
f"Error in Property assignment for: {concept}. "
f"Property to be Evaluated: {prop}. "
f"{exc}"
)
if force is True:
warnings.warn(msg)
concepts_with_errors["errors_in_properties"].append(
name
)
else:
raise ExcelError(msg) from exc
# Synchronise Python attributes to ontology
onto.sync_attributes(
name_policy="uuid", name_prefix="EMMO_", class_docstring="elucidation"
)
onto.dir_label = False
concepts_with_errors = {
key: set(value) for key, value in concepts_with_errors.items()
}
return onto, catalog, concepts_with_errors
english(string)
¶
Returns string
as an English location string.
Source code in ontopy/excelparser.py
def english(string):
"""Returns `string` as an English location string."""
return owlready2.locstr(string, lang="en")
get_metadata_from_dataframe(metadata, base_iri, base_iri_from_metadata=True, imports=None, catalog=None)
¶
Create ontology with metadata from pd.DataFrame
Source code in ontopy/excelparser.py
def get_metadata_from_dataframe( # pylint: disable=too-many-locals,too-many-branches,too-many-statements
metadata: pd.DataFrame,
base_iri: str,
base_iri_from_metadata: bool = True,
imports: pd.DataFrame = None,
catalog: dict = None,
) -> Tuple[ontopy.ontology.Ontology, dict]:
"""Create ontology with metadata from pd.DataFrame"""
# base_iri from metadata if it exists and base_iri_from_metadata
if base_iri_from_metadata:
try:
base_iris = _parse_literal(metadata, "Ontology IRI", metadata=True)
if len(base_iris) > 1:
warnings.warn(
"More than one Ontology IRI given. The first was chosen."
)
base_iri = base_iris[0] + "#"
except (TypeError, ValueError, AttributeError, IndexError):
pass
# Create new ontology
onto = get_ontology(base_iri)
# Add imported ontologies
catalog = {} if catalog is None else catalog
locations = set()
for _, row in imports.iterrows():
# for location in imports:
location = row["Imported ontologies"]
if not pd.isna(location) and location not in locations:
imported = onto.world.get_ontology(location).load()
onto.imported_ontologies.append(imported)
catalog[imported.base_iri.rstrip("#/")] = location
try:
cat = read_catalog(location.rsplit("/", 1)[0])
catalog.update(cat)
except ReadCatalogError:
warnings.warn(f"Catalog for {imported} not found.")
locations.add(location)
# set defined prefix
if not pd.isna(row["prefix"]):
# set prefix for all ontologies with same 'base_iri_root'
if not pd.isna(row["base_iri_root"]):
onto.set_common_prefix(
iri_base=row["base_iri_root"], prefix=row["prefix"]
)
# If base_root not given, set prefix only to top ontology
else:
imported.prefix = row["prefix"]
with onto:
# Add title
try:
_add_literal(
metadata,
onto.metadata.title,
"Title",
metadata=True,
only_one=True,
)
except AttributeError:
pass
# Add license
try:
_add_literal(
metadata, onto.metadata.license, "License", metadata=True
)
except AttributeError:
pass
# Add authors/creators
try:
_add_literal(
metadata, onto.metadata.creator, "Author", metadata=True
)
except AttributeError:
pass
# Add contributors
try:
_add_literal(
metadata,
onto.metadata.contributor,
"Contributor",
metadata=True,
)
except AttributeError:
pass
# Add versionInfo
try:
_add_literal(
metadata,
onto.metadata.versionInfo,
"Ontology version Info",
metadata=True,
only_one=True,
)
except AttributeError:
pass
return onto, catalog