From 1cfee42c4792b9a2102fd6478ac9ac928b9c543b Mon Sep 17 00:00:00 2001 From: Francesco Date: Fri, 8 Apr 2022 11:47:57 +0200 Subject: [PATCH] Prepare: getting named entities of different types, without duplicate names --- prepare/src/main/Main.scala | 19 +++++++++++++------ prepare/src/main/NamedEntity.scala | 4 ++-- prepare/src/main/NamedEntitySet.scala | 16 ++++++++++++++++ 3 files changed, 31 insertions(+), 8 deletions(-) create mode 100644 prepare/src/main/NamedEntitySet.scala diff --git a/prepare/src/main/Main.scala b/prepare/src/main/Main.scala index a8ba710..f5ab08d 100644 --- a/prepare/src/main/Main.scala +++ b/prepare/src/main/Main.scala @@ -5,7 +5,6 @@ import org.apache.jena.reasoner.ReasonerRegistry import java.io.InputStream object Main extends App { - { println("Loading knowledge graph...") val kg: InfModel = ModelFactory.createInfModel(ReasonerRegistry.getTransitiveReasoner, ModelFactory.createDefaultModel) @@ -17,8 +16,12 @@ object Main extends App { }) kg.read(mondialDataset, null, "TTL"); printf("Loaded %s triples\n", kg.listStatements().toList.size) - val countries = getMondialNamedEntities(kg, "Country") - printf("There are %d countries in the dataset\n", countries.size) + val guessableEntities = NamedEntitySet() ++ + getMondialNamedEntities(kg, "Country") ++ + getMondialNamedEntities(kg, "City") ++ + getMondialNamedEntities(kg, "Continent") ++ + getMondialNamedEntities(kg, "Sea") + printf("Found %d guessable named entities\n", guessableEntities.set.size) } def getMondialNamedEntities(kg: Model, prefixedType: String): List[NamedEntity] = { @@ -27,16 +30,20 @@ object Main extends App { val MONDIAL_PREFIX = "http://www.semwebtech.org/mondial/10/meta#" val fullType = MONDIAL_PREFIX + prefixedType; val nameProperty = MONDIAL_PREFIX + "name" - val queryString = String.format("SELECT ?entity ?name WHERE { ?entity a <%s> ; <%s> ?name. }", fullType, nameProperty) + val queryString = s"SELECT ?entity ?name WHERE { ?entity a <${fullType}> ; <${nameProperty}> ?name. }" val resultSet = QueryExecutionFactory.create(queryString, kg).execSelect() while (resultSet.hasNext) { val solution = resultSet.next() result = result :+ new NamedEntity( solution.getLiteral("?name").getString, - solution.getResource("?entity") + solution.getResource("?entity"), + prefixedType ) } + if (result.isEmpty) { + System.err.printf("Error: found no named entities of type %s\n", prefixedType) + System.exit(1) + } result } - } diff --git a/prepare/src/main/NamedEntity.scala b/prepare/src/main/NamedEntity.scala index dbdb7cc..91937c4 100644 --- a/prepare/src/main/NamedEntity.scala +++ b/prepare/src/main/NamedEntity.scala @@ -1,5 +1,5 @@ import org.apache.jena.rdf.model.Resource -class NamedEntity(val Name: String, val entity: Resource) { - +class NamedEntity(val Name: String, val entity: Resource, val readableType: String) { } + diff --git a/prepare/src/main/NamedEntitySet.scala b/prepare/src/main/NamedEntitySet.scala new file mode 100644 index 0000000..f4bd03c --- /dev/null +++ b/prepare/src/main/NamedEntitySet.scala @@ -0,0 +1,16 @@ +class NamedEntitySet(val set: Set[NamedEntity] = Set()) { + + def ++(iterable: Iterable[NamedEntity]): NamedEntitySet = { + val nameSet = set.map(e => e.Name) + val duplicateNameEntity = iterable.find(e => nameSet.contains(e.Name)) + if (duplicateNameEntity.nonEmpty) { + System.err.printf("Error: multiple entities exist with name %s\n", duplicateNameEntity.get.Name) + System.exit(1) + } + NamedEntitySet(set ++ iterable) + } +} + +object NamedEntitySet { + def apply(set: Set[NamedEntity] = Set()): NamedEntitySet = new NamedEntitySet(set) +}