Create moedict entries

This commit is contained in:
Marvin Elsen 2024-10-09 22:58:12 +02:00
parent bd352760a1
commit 7d52fde387
Signed by: marvinelsen
GPG Key ID: 820672408CC318C2

View File

@ -1,13 +1,16 @@
@file:Suppress("MagicNumber", "LongMethod", "MaximumLineLength", "MaxLineLength") @file:Suppress("MagicNumber", "LongMethod", "MaximumLineLength", "MaxLineLength", "NestedBlockDepth")
package com.marvinelsen.willow.database package com.marvinelsen.willow.database
import com.github.houbb.opencc4j.util.ZhConverterUtil
import com.marvinelsen.cedict.api.CedictParser import com.marvinelsen.cedict.api.CedictParser
import com.marvinelsen.chinese.transliteration.api.PinyinSyllable import com.marvinelsen.chinese.transliteration.api.PinyinSyllable
import com.marvinelsen.chinese.transliteration.api.TransliterationSystem import com.marvinelsen.chinese.transliteration.api.TransliterationSystem
import com.marvinelsen.chinese.transliteration.api.Zhuyin import com.marvinelsen.chinese.transliteration.api.Zhuyin
import com.marvinelsen.crossstraits.api.CrossStraitsDefinition import com.marvinelsen.crossstraits.api.CrossStraitsDefinition
import com.marvinelsen.crossstraits.api.CrossStraitsParser import com.marvinelsen.crossstraits.api.CrossStraitsParser
import com.marvinelsen.moedict.api.MoedictDefinition
import com.marvinelsen.moedict.api.MoedictParser
import kotlinx.serialization.builtins.ListSerializer import kotlinx.serialization.builtins.ListSerializer
import kotlinx.serialization.builtins.serializer import kotlinx.serialization.builtins.serializer
import kotlinx.serialization.json.Json import kotlinx.serialization.json.Json
@ -134,6 +137,7 @@ fun main() {
statement.close() statement.close()
createCrossStraitsEntries(connection) createCrossStraitsEntries(connection)
createMoedictEntries(connection)
connection.close() connection.close()
} }
@ -238,6 +242,132 @@ private fun createCrossStraitsEntries(connection: Connection) {
statement.close() statement.close()
} }
/**
 * Merges the bundled MOE dictionary (`/moedict.json.gz`) into the `entry` table.
 *
 * Titles starting with `{` and heteronyms without zhuyin are skipped. For every remaining
 * heteronym the zhuyin is converted to pinyin syllables and the table is searched for a row
 * with the same traditional headword and tone-numbered pinyin:
 *  - if such a row exists, only its `moe_definitions` column is updated;
 *  - otherwise a new row is queued for a batched `INSERT OR IGNORE`, with empty JSON lists
 *    for the CEDICT and Cross-Straits definition columns.
 *
 * The batch is executed and the connection committed once all entries have been processed.
 * All statements and the resource stream are closed even when processing fails.
 *
 * @param connection open JDBC connection to the dictionary database; it is committed but
 * not closed by this function.
 * @throws IllegalStateException if the `/moedict.json.gz` resource cannot be found.
 */
private fun createMoedictEntries(connection: Connection) {
    val resource = checkNotNull(object {}.javaClass.getResourceAsStream("/moedict.json.gz")) {
        "Resource /moedict.json.gz not found on the classpath"
    }
    val insertStatement =
        connection.prepareStatement(
            "INSERT OR IGNORE INTO entry(traditional, simplified, pinyin_with_tone_marks, pinyin_with_tone_numbers, zhuyin, searchable_pinyin, searchable_pinyin_with_tone_numbers, cedict_definitions, cross_straits_definitions, moe_definitions, character_count) VALUES(?,?,?,?,?,?,?,?,?,?,?)"
        )
    val updateStatement = connection.prepareStatement(
        "UPDATE entry SET moe_definitions = ? WHERE id = ?"
    )
    // Parameterised lookup: headwords or readings containing quotes can no longer break the SQL.
    val lookupStatement = connection.prepareStatement(
        "SELECT id FROM entry WHERE traditional = ? AND pinyin_with_tone_numbers = ?"
    )
    val input = GZIPInputStream(resource)
    // Loop-invariant JSON for the two definition columns MOE-only rows do not have.
    val noDefinitions = Json.encodeToString(ListSerializer(String.serializer()), emptyList())

    try {
        MoedictParser.instance.parse(input)
            .filterNot { it.title.startsWith('{') }
            .forEach { entry ->
                val traditional = entry.title.sanitizeMoeTitle()

                entry.heteronyms
                    .filterNot { it.zhuyin.isNullOrBlank() }
                    .forEach { heteronym ->
                        // Null/blank zhuyin was filtered out above, so orEmpty() never yields "".
                        val pronunciation = heteronym.zhuyin.orEmpty()
                            .sanitizeMoeZhuyin()
                            .split(Zhuyin.SEPARATOR)
                            .filterNot { it.isBlank() }
                            .map { PinyinSyllable.fromZhuyin(it) }
                        val pinyinWithToneNumbers = pronunciation.joinToString(separator = " ") {
                            it.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS)
                        }

                        lookupStatement.setString(1, traditional)
                        lookupStatement.setString(2, pinyinWithToneNumbers)
                        val existingId = lookupStatement.executeQuery().use { rs: ResultSet ->
                            if (rs.next()) rs.getInt(1) else null
                        }

                        if (existingId != null) {
                            updateStatement.setString(
                                1,
                                Json.encodeToString(
                                    ListSerializer(MoedictDefinition.serializer()),
                                    heteronym.definitions
                                )
                            )
                            updateStatement.setInt(2, existingId)
                            updateStatement.executeUpdate()
                        } else {
                            try {
                                insertStatement.setString(1, traditional)
                                insertStatement.setString(2, ZhConverterUtil.toSimple(traditional))
                                insertStatement.setString(
                                    3,
                                    pronunciation.joinToString(separator = " ") {
                                        it.format(TransliterationSystem.PINYIN_WITH_TONE_MARKS)
                                    }
                                )
                                insertStatement.setString(4, pinyinWithToneNumbers)
                                insertStatement.setString(
                                    5,
                                    pronunciation.joinToString(separator = Zhuyin.SEPARATOR) {
                                        it.format(TransliterationSystem.ZHUYIN)
                                    }
                                )
                                insertStatement.setString(
                                    6,
                                    pronunciation.joinToString(separator = "") { it.toSearchablePinyin() }
                                )
                                insertStatement.setString(
                                    7,
                                    pronunciation.joinToString(separator = "") {
                                        it.toSearchablePinyinWithToneNumbers()
                                    }
                                )
                                insertStatement.setString(8, noDefinitions)
                                insertStatement.setString(9, noDefinitions)
                                insertStatement.setString(
                                    10,
                                    Json.encodeToString(
                                        ListSerializer(MoedictDefinition.serializer()),
                                        heteronym.definitions
                                    )
                                )
                                // NOTE(review): this counts the raw title, not the sanitised
                                // `traditional` value stored in column 1 — confirm this is intended.
                                insertStatement.setInt(11, entry.title.length)
                                // Queue the row only once every parameter has been bound.
                                insertStatement.addBatch()
                            } catch (e: Exception) {
                                // Best effort: skip this row, but never queue stale or partially
                                // bound parameters, and leave a trace of what was dropped.
                                insertStatement.clearParameters()
                                System.err.println(
                                    "Skipping moedict entry '$traditional' ($pinyinWithToneNumbers): $e"
                                )
                            }
                        }
                    }
            }

        insertStatement.executeBatch()
        connection.commit()
    } finally {
        input.close()
        lookupStatement.close()
        updateStatement.close()
        insertStatement.close()
    }
}
/**
 * Cleans a raw MOE zhuyin string so that it can be split on [Zhuyin.SEPARATOR].
 *
 * The steps are applied in this order:
 *  1. keep only the text before "(變)";
 *  2. keep only the text after "(又音)", then after "(語音)" (the string is left unchanged
 *     when a marker is absent);
 *  3. remove every match of the two regular expressions below;
 *  4. replace "ㄉㄨㄜˇ" with "ㄉㄨㄣˇ";
 *  5. replace the literal in the second-to-last call with the separator;
 *  6. replace every run of whitespace with the separator.
 *
 * NOTE(review): as displayed, the first regex is a bare `.*` (which would erase the whole
 * line) and the search string in step 5 is empty (which would insert a separator between
 * every character). Presumably both literals contain characters that are not rendered
 * here — verify against the file's raw bytes before editing these lines.
 */
private fun String.sanitizeMoeZhuyin() = this
.substringBefore("(變)")
.substringAfter("(又音)")
.substringAfter("(語音)")
.replace(""".*""".toRegex(), "")
// Drops a parenthesised span; greedy, so it runs from the first "(" to the last ")" on a line.
.replace("""\(.*\)""".toRegex(), "")
// NOTE(review): presumably corrects a misspelled syllable in the source data — confirm.
.replace("ㄉㄨㄜˇ", "ㄉㄨㄣˇ")
.replace("", "${Zhuyin.SEPARATOR}")
.replace("""\s+""".toRegex(), Zhuyin.SEPARATOR)
/**
 * Cleans a raw MOE entry title: removes every match of the first regex, then removes
 * everything from the first "(" to the end of the line.
 *
 * NOTE(review): as displayed, the first regex is a bare `.*`, which would erase the whole
 * title. Presumably the literal contains characters that are not rendered here — verify
 * against the file's raw bytes before editing this line.
 */
private fun String.sanitizeMoeTitle() = this
.replace(""".*""".toRegex(), "")
.replace("""\(.*""".toRegex(), "")
private fun PinyinSyllable.toSearchablePinyinWithToneNumbers() = private fun PinyinSyllable.toSearchablePinyinWithToneNumbers() =
this.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS).lowercase() this.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS).lowercase()