Create moedict entries
This commit is contained in:
parent
bd352760a1
commit
7d52fde387
@ -1,13 +1,16 @@
|
||||
@file:Suppress("MagicNumber", "LongMethod", "MaximumLineLength", "MaxLineLength")
|
||||
@file:Suppress("MagicNumber", "LongMethod", "MaximumLineLength", "MaxLineLength", "NestedBlockDepth")
|
||||
|
||||
package com.marvinelsen.willow.database
|
||||
|
||||
import com.github.houbb.opencc4j.util.ZhConverterUtil
|
||||
import com.marvinelsen.cedict.api.CedictParser
|
||||
import com.marvinelsen.chinese.transliteration.api.PinyinSyllable
|
||||
import com.marvinelsen.chinese.transliteration.api.TransliterationSystem
|
||||
import com.marvinelsen.chinese.transliteration.api.Zhuyin
|
||||
import com.marvinelsen.crossstraits.api.CrossStraitsDefinition
|
||||
import com.marvinelsen.crossstraits.api.CrossStraitsParser
|
||||
import com.marvinelsen.moedict.api.MoedictDefinition
|
||||
import com.marvinelsen.moedict.api.MoedictParser
|
||||
import kotlinx.serialization.builtins.ListSerializer
|
||||
import kotlinx.serialization.builtins.serializer
|
||||
import kotlinx.serialization.json.Json
|
||||
@ -134,6 +137,7 @@ fun main() {
|
||||
statement.close()
|
||||
|
||||
createCrossStraitsEntries(connection)
|
||||
createMoedictEntries(connection)
|
||||
|
||||
connection.close()
|
||||
}
|
||||
@ -238,6 +242,132 @@ private fun createCrossStraitsEntries(connection: Connection) {
|
||||
statement.close()
|
||||
}
|
||||
|
||||
/**
 * Imports the MOE dictionary (`/moedict.json.gz` on the classpath) into the `entry` table.
 *
 * Entries whose title starts with `{` are skipped, as are heteronyms without a zhuyin reading. For every
 * remaining heteronym the table is searched for a row with the same traditional headword and the same
 * pinyin-with-tone-numbers reading:
 *  - if such a row exists, only its `moe_definitions` column is overwritten (executed immediately);
 *  - otherwise a new row is queued whose CEDICT and Cross-Straits definition columns are empty JSON lists.
 *
 * Queued rows are written in a single batch with `INSERT OR IGNORE`, after which the connection is committed.
 * A heteronym whose column values cannot be derived is reported on stderr and skipped; it never contributes
 * a half-populated row to the batch.
 *
 * @param connection open connection to the dictionary database; it is committed but not closed here.
 * @throws IllegalStateException if the moedict resource is missing from the classpath.
 */
@Suppress("TooGenericExceptionCaught")
private fun createMoedictEntries(connection: Connection) {
    val moedictStream = checkNotNull(object {}.javaClass.getResourceAsStream("/moedict.json.gz")) {
        "Resource /moedict.json.gz was not found on the classpath"
    }
    val moedictEntries = MoedictParser.instance.parse(GZIPInputStream(moedictStream))

    // New rows have no CEDICT / Cross-Straits definitions; both columns store the same empty JSON list.
    val noDefinitions = Json.encodeToString(ListSerializer(String.serializer()), emptyList())

    // Bound parameters instead of string interpolation: headwords containing a quote no longer break the query.
    val selectStatement = connection.prepareStatement(
        "SELECT id FROM entry WHERE traditional = ? AND pinyin_with_tone_numbers = ?"
    )
    val updateStatement = connection.prepareStatement(
        "UPDATE entry SET moe_definitions = ? WHERE id = ?"
    )
    val insertStatement = connection.prepareStatement(
        "INSERT OR IGNORE INTO entry(traditional, simplified, pinyin_with_tone_marks, pinyin_with_tone_numbers, zhuyin, searchable_pinyin, searchable_pinyin_with_tone_numbers, cedict_definitions, cross_straits_definitions, moe_definitions, character_count) VALUES(?,?,?,?,?,?,?,?,?,?,?)"
    )

    try {
        for (entry in moedictEntries) {
            if (entry.title.startsWith('{')) continue
            val traditional = entry.title.sanitizeMoeTitle()

            for (heteronym in entry.heteronyms) {
                val zhuyin = heteronym.zhuyin
                if (zhuyin.isNullOrBlank()) continue

                val pronunciation = zhuyin
                    .sanitizeMoeZhuyin()
                    .split(Zhuyin.SEPARATOR)
                    .filterNot { it.isBlank() }
                    .map { PinyinSyllable.fromZhuyin(it) }
                val pinyinWithToneNumbers = pronunciation.joinToString(separator = " ") {
                    it.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS)
                }

                selectStatement.setString(1, traditional)
                selectStatement.setString(2, pinyinWithToneNumbers)
                val existingId = selectStatement.executeQuery().use { rows ->
                    if (rows.next()) rows.getInt(1) else null
                }

                if (existingId != null) {
                    updateStatement.setString(
                        1,
                        Json.encodeToString(ListSerializer(MoedictDefinition.serializer()), heteronym.definitions)
                    )
                    updateStatement.setInt(2, existingId)
                    updateStatement.executeUpdate()
                    continue
                }

                // Derive every text column BEFORE touching the statement, so a failed conversion cannot
                // leave parameters from the previous row behind and queue a corrupt row.
                val textColumns = try {
                    listOf(
                        traditional,
                        ZhConverterUtil.toSimple(traditional),
                        pronunciation.joinToString(separator = " ") {
                            it.format(TransliterationSystem.PINYIN_WITH_TONE_MARKS)
                        },
                        pinyinWithToneNumbers,
                        pronunciation.joinToString(separator = Zhuyin.SEPARATOR) {
                            it.format(TransliterationSystem.ZHUYIN)
                        },
                        pronunciation.joinToString(separator = "") { it.toSearchablePinyin() },
                        pronunciation.joinToString(separator = "") { it.toSearchablePinyinWithToneNumbers() },
                        noDefinitions,
                        noDefinitions,
                        Json.encodeToString(ListSerializer(MoedictDefinition.serializer()), heteronym.definitions),
                    )
                } catch (e: Exception) {
                    // Best-effort import: report and skip this reading instead of aborting the whole run.
                    System.err.println("Skipping moedict entry '$traditional' ($pinyinWithToneNumbers): $e")
                    continue
                }

                // Columns 1..10 are text, in the order of the INSERT column list.
                textColumns.forEachIndexed { index, value -> insertStatement.setString(index + 1, value) }
                // NOTE(review): this counts the raw title, including any parenthesised text that
                // sanitizeMoeTitle() strips from `traditional` - confirm which length is intended.
                insertStatement.setInt(11, entry.title.length)
                insertStatement.addBatch()
            }
        }

        insertStatement.executeBatch()
        connection.commit()
    } finally {
        insertStatement.close()
        updateStatement.close()
        selectStatement.close()
    }
}
|
||||
|
||||
// Parenthesised annotations, full-width and ASCII. Greedy on purpose: everything from the first opening
// to the last closing parenthesis of the same kind is dropped.
private val moeZhuyinFullwidthParenthetical = Regex("""\uFF08.*\uFF09""")
private val moeZhuyinAsciiParenthetical = Regex("""\(.*\)""")
private val moeZhuyinWhitespace = Regex("""\s+""")

/**
 * Reduces a raw moedict zhuyin string to syllables joined by [Zhuyin.SEPARATOR].
 *
 * Steps, in order:
 *  1. keep only the text before a "(變)" marker;
 *  2. keep only the text after a "(又音)" marker, then after a "(語音)" marker;
 *  3. delete any remaining parenthesised annotation (full-width, then ASCII parentheses);
 *  4. rewrite "ㄉㄨㄜˇ" as "ㄉㄨㄣˇ" (presumably a typo in the source data - TODO confirm);
 *  5. put a separator in front of every "ㄦ";
 *  6. collapse each run of whitespace into one separator.
 *
 * The first annotation pattern used to be the bare group `(.*)`, which matches the entire input and
 * therefore always produced an empty reading; it is now an explicit full-width parenthesis match.
 *
 * NOTE(review): the three markers above are spelled with ASCII parentheses. If the data uses full-width
 * parentheses they never match and step 3 removes them as ordinary annotations - verify against the data.
 */
private fun String.sanitizeMoeZhuyin() = this
    .substringBefore("(變)")
    .substringAfter("(又音)")
    .substringAfter("(語音)")
    .replace(moeZhuyinFullwidthParenthetical, "")
    .replace(moeZhuyinAsciiParenthetical, "")
    .replace("ㄉㄨㄜˇ", "ㄉㄨㄣˇ")
    .replace("ㄦ", "${Zhuyin.SEPARATOR}ㄦ")
    .replace(moeZhuyinWhitespace, Zhuyin.SEPARATOR)
|
||||
|
||||
// Parenthesised annotations, full-width and ASCII. Greedy on purpose: everything from the first opening
// to the last closing parenthesis of the same kind is dropped.
private val moeTitleFullwidthParenthetical = Regex("""\uFF08.*\uFF09""")
private val moeTitleAsciiParenthetical = Regex("""\(.*\)""")

/**
 * Strips parenthesised annotations from a moedict title, leaving the bare headword.
 *
 * Full-width parentheses are handled first, then ASCII ones. Text without parentheses is returned
 * unchanged.
 *
 * The previous patterns were `(.*)`, a capture group that matches the whole title and erased it, and
 * `\(.*)`, which has an unmatched `)` and is rejected by the regex compiler.
 */
private fun String.sanitizeMoeTitle() = this
    .replace(moeTitleFullwidthParenthetical, "")
    .replace(moeTitleAsciiParenthetical, "")
|
||||
|
||||
/**
 * Formats this syllable as pinyin with tone numbers and lower-cases the result.
 */
private fun PinyinSyllable.toSearchablePinyinWithToneNumbers(): String {
    val numberedPinyin = format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS)
    return numberedPinyin.lowercase()
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user