GuessNN/prepare/src/main/guessNNprepare/mains/ExtractEntities.scala

78 lines
2.7 KiB
Scala

package guessNNprepare.mains
import guessNNprepare.NamedEntity
import net.sourceforge.argparse4j.impl.Arguments
import net.sourceforge.argparse4j.inf.{ArgumentParser, Namespace}
import org.apache.jena.query.QueryExecutionFactory
import org.apache.jena.rdf.model.{InfModel, Model, ModelFactory}
import org.apache.jena.reasoner.ReasonerRegistry
import org.json.JSONArray
import java.io.{FileWriter, InputStream}
/**
 * Command that loads the Mondial dataset bundled on the classpath into a Jena
 * inference model, collects the named entities of a fixed set of Mondial types
 * and saves them as a JSON array.
 */
object ExtractEntities extends MainCommand {
  // Scoped imports: only the JSON-writing code below needs these.
  import java.nio.charset.StandardCharsets
  import java.nio.file.{Files, Paths}

  override def description: String = "Save the list of guessable entities to a json file"

  /**
   * Registers the single positional argument `json_file` (the output path).
   *
   * @param parser the argument parser to configure
   */
  override def addCliArgs(parser: ArgumentParser): Unit = {
    parser.description(description)
    parser
      .addArgument("json_file")
      .`type`(Arguments.fileType().verifyCanWriteParent())
      .help("Where the guessable entities will be saved")
  }

  /**
   * Loads the knowledge graph, extracts the guessable entities and writes them
   * to the file given by the `json_file` argument.
   *
   * Terminates the JVM with exit code 1 when the dataset resource cannot be
   * opened, or when one of the queried types yields no entity.
   *
   * @param ARGS parsed command-line arguments; must contain `json_file`
   */
  override def execute(ARGS: Namespace): Unit = {
    val jsonFilePath = ARGS.getString("json_file")

    println("Loading knowledge graph...")
    val kg: InfModel = ModelFactory.createInfModel(
      ReasonerRegistry.getTransitiveReasoner,
      ModelFactory.createDefaultModel
    )
    // getSystemResourceAsStream returns null when the resource is missing;
    // sys.exit has type Nothing, so no null placeholder is required.
    val mondialDataset: InputStream =
      Option(ClassLoader.getSystemResourceAsStream("mondial_2022_04_04.n3")).getOrElse {
        System.err.println("Impossible to open dataset")
        sys.exit(1)
      }
    try {
      kg.read(mondialDataset, null, "TTL")
    } finally {
      // Release the classpath stream even if parsing fails.
      mondialDataset.close()
    }
    printf("Loaded %s triples\n", kg.listStatements().toList.size)

    // Types are queried in this order; the first type with no entity aborts the run.
    val guessableEntities: List[NamedEntity] =
      List("Country", "City", "Continent", "Sea")
        .flatMap(entityType => getMondialNamedEntities(kg, entityType))
        .distinct
    printf("Found %d guessable named entities\n", guessableEntities.size)

    println(s"Writing entities to ${jsonFilePath}")
    val jsonEntities: JSONArray = new JSONArray()
    guessableEntities.foreach(e => jsonEntities.put(e.json))
    // Write explicitly as UTF-8 (instead of the platform default charset) and
    // always close the writer, which also flushes it.
    val writer = Files.newBufferedWriter(Paths.get(jsonFilePath), StandardCharsets.UTF_8)
    try {
      jsonEntities.write(writer)
    } finally {
      writer.close()
    }
    println("Done.")
  }

  /**
   * Queries the model for every resource of the given Mondial type that has a
   * Mondial `name` property.
   *
   * Terminates the JVM with exit code 1 when the query yields no result.
   *
   * @param kg           the model to query
   * @param prefixedType local name of the type inside the Mondial meta namespace,
   *                     e.g. "Country"
   * @return one [[NamedEntity]] per (entity, name) solution, in result-set order
   */
  def getMondialNamedEntities(kg: Model, prefixedType: String): List[NamedEntity] = {
    //noinspection HttpUrlsUsage
    val MONDIAL_PREFIX = "http://www.semwebtech.org/mondial/10/meta#"
    val fullType = MONDIAL_PREFIX + prefixedType
    val nameProperty = MONDIAL_PREFIX + "name"
    val queryString = s"SELECT ?entity ?name WHERE { ?entity a <${fullType}> ; <${nameProperty}> ?name. }"

    val queryExecution = QueryExecutionFactory.create(queryString, kg)
    val result: List[NamedEntity] =
      try {
        val resultSet = queryExecution.execSelect()
        // A builder gives linear-time accumulation; the result set must be
        // fully consumed before the query execution is closed.
        val entities = List.newBuilder[NamedEntity]
        while (resultSet.hasNext) {
          val solution = resultSet.next()
          entities += new NamedEntity(
            solution.getResource("?entity"),
            solution.getLiteral("?name").getString,
            prefixedType
          )
        }
        entities.result()
      } finally {
        queryExecution.close()
      }

    if (result.isEmpty) {
      System.err.printf("Error: found no named entities of type %s\n", prefixedType)
      System.exit(1)
    }
    result
  }
}