From 7d3a6deaab1d1389239da21ca9baf0a62e775ad3 Mon Sep 17 00:00:00 2001
From: Marvin Elsen
Date: Mon, 14 Oct 2024 19:14:04 +0200
Subject: [PATCH] Import tatoeba sentences into database

---
 .../willow/database/CreateDatabase.kt         | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/src/main/kotlin/com/marvinelsen/willow/database/CreateDatabase.kt b/src/main/kotlin/com/marvinelsen/willow/database/CreateDatabase.kt
index 647589e..9fc0384 100644
--- a/src/main/kotlin/com/marvinelsen/willow/database/CreateDatabase.kt
+++ b/src/main/kotlin/com/marvinelsen/willow/database/CreateDatabase.kt
@@ -11,6 +11,7 @@ import com.marvinelsen.crossstraits.api.CrossStraitsDefinition
 import com.marvinelsen.crossstraits.api.CrossStraitsParser
 import com.marvinelsen.moedict.api.MoedictDefinition
 import com.marvinelsen.moedict.api.MoedictParser
+import com.marvinelsen.tatoeba.api.TatoebaParser
 import kotlinx.serialization.builtins.ListSerializer
 import kotlinx.serialization.builtins.serializer
 import kotlinx.serialization.json.Json
@@ -172,10 +173,57 @@ fun main() {
 
     createCrossStraitsEntries(connection)
     createMoedictEntries(connection)
+    createTatoebaSentences(connection)
 
     connection.close()
 }
 
+/**
+ * Imports the bundled Tatoeba sentences into the `sentence` table.
+ *
+ * Reads the gzipped `/cmn_sentences.tsv.gz` classpath resource through [TatoebaParser], batches one
+ * insert per sentence (traditional form, simplified form, length of the source text) and commits
+ * after the batch has been executed. The statement uses `INSERT OR IGNORE`.
+ *
+ * A sentence whose script conversion throws is reported on stderr and skipped. It is never added
+ * to the batch, so no row is written with parameters left over from the previous sentence.
+ *
+ * @param connection open database connection; it is committed but not closed here.
+ * @throws IllegalStateException if the sentence resource is missing from the classpath.
+ */
+fun createTatoebaSentences(connection: Connection) {
+    val resourceStream = checkNotNull(object {}.javaClass.getResourceAsStream("/cmn_sentences.tsv.gz")) {
+        "Resource /cmn_sentences.tsv.gz not found on the classpath"
+    }
+
+    // Both the stream and the statement are released even when parsing or inserting fails.
+    GZIPInputStream(resourceStream).use { sentencesStream ->
+        connection.prepareStatement(
+            "INSERT OR IGNORE INTO sentence(traditional, simplified, character_count) VALUES(?,?,?)"
+        ).use { insertStatement ->
+            TatoebaParser.instance.parse(sentencesStream).forEach { sentence ->
+                val converted = try {
+                    ZhConverterUtil.toTraditional(sentence.simplified) to
+                        ZhConverterUtil.toSimple(sentence.simplified)
+                } catch (exception: Exception) {
+                    System.err.println("Skipping Tatoeba sentence, conversion failed: ${exception.message}")
+                    null
+                }
+
+                if (converted != null) {
+                    insertStatement.setString(1, converted.first)
+                    insertStatement.setString(2, converted.second)
+                    insertStatement.setInt(3, sentence.simplified.length)
+                    insertStatement.addBatch()
+                }
+            }
+
+            insertStatement.executeBatch()
+            connection.commit()
+        }
+    }
+}
+
 private fun createCrossStraitsEntries(connection: Connection) {
     val insertStatement =
         connection.prepareStatement(