Import tatoeba sentences into database
This commit is contained in:
parent
a30da84be2
commit
7d3a6deaab
@ -11,6 +11,7 @@ import com.marvinelsen.crossstraits.api.CrossStraitsDefinition
|
||||
import com.marvinelsen.crossstraits.api.CrossStraitsParser
|
||||
import com.marvinelsen.moedict.api.MoedictDefinition
|
||||
import com.marvinelsen.moedict.api.MoedictParser
|
||||
import com.marvinelsen.tatoeba.api.TatoebaParser
|
||||
import kotlinx.serialization.builtins.ListSerializer
|
||||
import kotlinx.serialization.builtins.serializer
|
||||
import kotlinx.serialization.json.Json
|
||||
@ -172,10 +173,38 @@ fun main() {
|
||||
|
||||
createCrossStraitsEntries(connection)
|
||||
createMoedictEntries(connection)
|
||||
createTatoebaSentences(connection)
|
||||
|
||||
connection.close()
|
||||
}
|
||||
|
||||
/**
 * Imports the bundled Tatoeba sentences into the `sentence` table.
 *
 * Reads the gzip-compressed classpath resource `/cmn_sentences.tsv.gz`, parses it with
 * [TatoebaParser], and batch-inserts one row per sentence. Each sentence text is run through
 * both `ZhConverterUtil.toTraditional` and `ZhConverterUtil.toSimple`, and its length is stored
 * as `character_count`. Rows are written with `INSERT OR IGNORE`, then the batch is executed
 * and the connection is committed.
 *
 * The import is best-effort per sentence: a sentence whose conversion or parameter binding
 * fails is skipped (and reported on stderr) instead of being batched with stale or partially
 * bound parameters.
 *
 * @param connection open JDBC connection to the target database; it is committed but not
 *   closed by this function.
 * @throws IllegalStateException if the sentence resource is missing from the classpath.
 */
fun createTatoebaSentences(connection: Connection) {
    val resourceStream = checkNotNull(object {}.javaClass.getResourceAsStream("/cmn_sentences.tsv.gz")) {
        "Resource /cmn_sentences.tsv.gz not found on the classpath"
    }

    // The stream stays open for the whole import in case the parser yields sentences lazily.
    GZIPInputStream(resourceStream).use { sentenceStream ->
        val tatoebaSentences = TatoebaParser.instance.parse(sentenceStream)

        connection.prepareStatement(
            "INSERT OR IGNORE INTO sentence(traditional, simplified, character_count) VALUES(?,?,?)"
        ).use { insertStatement ->
            var skippedCount = 0

            for (sentence in tatoebaSentences) {
                try {
                    // Convert before binding so a conversion failure leaves no half-bound row.
                    val traditional = ZhConverterUtil.toTraditional(sentence.simplified)
                    val simplified = ZhConverterUtil.toSimple(sentence.simplified)

                    insertStatement.setString(1, traditional)
                    insertStatement.setString(2, simplified)
                    // NOTE(review): String.length counts UTF-16 code units, so characters outside
                    // the BMP count as two — confirm this matches how character_count is consumed.
                    insertStatement.setInt(3, sentence.simplified.length)

                    // Only queue the row once every parameter has been bound successfully.
                    insertStatement.addBatch()
                } catch (e: Exception) {
                    skippedCount++
                    System.err.println("Skipping Tatoeba sentence that could not be prepared: ${e.message}")
                }
            }

            insertStatement.executeBatch()
            connection.commit()

            if (skippedCount > 0) {
                System.err.println("Skipped $skippedCount Tatoeba sentence(s) during import")
            }
        }
    }
}
|
||||
|
||||
private fun createCrossStraitsEntries(connection: Connection) {
|
||||
val insertStatement =
|
||||
connection.prepareStatement(
|
||||
|
Loading…
Reference in New Issue
Block a user