Import Tatoeba sentences into the database

This commit is contained in:
Marvin Elsen 2024-10-14 19:14:04 +02:00
parent a30da84be2
commit 7d3a6deaab
Signed by: marvinelsen
GPG Key ID: 820672408CC318C2

View File

@ -11,6 +11,7 @@ import com.marvinelsen.crossstraits.api.CrossStraitsDefinition
import com.marvinelsen.crossstraits.api.CrossStraitsParser
import com.marvinelsen.moedict.api.MoedictDefinition
import com.marvinelsen.moedict.api.MoedictParser
import com.marvinelsen.tatoeba.api.TatoebaParser
import kotlinx.serialization.builtins.ListSerializer
import kotlinx.serialization.builtins.serializer
import kotlinx.serialization.json.Json
@ -172,10 +173,38 @@ fun main() {
createCrossStraitsEntries(connection)
createMoedictEntries(connection)
createTatoebaSentences(connection)
connection.close()
}
/**
 * Reads the bundled `/cmn_sentences.tsv.gz` classpath resource, parses it with [TatoebaParser],
 * and batch-inserts one row per sentence into the `sentence` table.
 *
 * For every parsed sentence the `traditional` and `simplified` columns are filled from
 * `ZhConverterUtil.toTraditional` / `ZhConverterUtil.toSimple` applied to `sentence.simplified`,
 * and `character_count` is the length of `sentence.simplified`.
 *
 * A sentence whose conversion or parameter binding fails is skipped entirely: it is never added
 * to the batch, so it cannot be inserted with stale or partially bound parameters. The number of
 * skipped sentences is reported on standard error once the batch has been built.
 *
 * The batch is executed and committed on [connection]; the connection itself is left open for
 * the caller. The resource stream and the prepared statement are closed on every exit path.
 *
 * @param connection open JDBC connection to the target database.
 * @throws IllegalStateException if the sentence resource is not on the classpath.
 */
fun createTatoebaSentences(connection: Connection) {
    val resourcePath = "/cmn_sentences.tsv.gz"
    val resourceStream = checkNotNull(object {}.javaClass.getResourceAsStream(resourcePath)) {
        "Classpath resource $resourcePath not found"
    }

    // The stream stays open for the whole import because the parser may read it lazily
    // while the sentences are being iterated.
    GZIPInputStream(resourceStream).use { sentenceStream ->
        val tatoebaSentences = TatoebaParser.instance.parse(sentenceStream)

        connection.prepareStatement(
            "INSERT OR IGNORE INTO sentence(traditional, simplified, character_count) VALUES(?,?,?)"
        ).use { insertStatement ->
            var skippedSentences = 0

            tatoebaSentences.forEach { sentence ->
                try {
                    // Convert first so a conversion failure cannot leave the statement
                    // with only some of its parameters updated.
                    val traditional = ZhConverterUtil.toTraditional(sentence.simplified)
                    val simplified = ZhConverterUtil.toSimple(sentence.simplified)

                    insertStatement.setString(1, traditional)
                    insertStatement.setString(2, simplified)
                    // NOTE(review): String.length counts UTF-16 code units, so characters outside
                    // the Basic Multilingual Plane count as 2 - confirm this is the intended count.
                    insertStatement.setInt(3, sentence.simplified.length)

                    // Only queue the row once every parameter has been bound successfully.
                    insertStatement.addBatch()
                } catch (_: Exception) {
                    // Best effort: drop this sentence and keep importing the rest.
                    skippedSentences++
                }
            }

            insertStatement.executeBatch()
            connection.commit()

            if (skippedSentences > 0) {
                System.err.println("Skipped $skippedSentences Tatoeba sentence(s) that could not be converted or bound")
            }
        }
    }
}
private fun createCrossStraitsEntries(connection: Connection) {
val insertStatement =
connection.prepareStatement(