Create moedict entries
This commit is contained in:
parent
bd352760a1
commit
7d52fde387
@ -1,13 +1,16 @@
|
|||||||
@file:Suppress("MagicNumber", "LongMethod", "MaximumLineLength", "MaxLineLength")
|
@file:Suppress("MagicNumber", "LongMethod", "MaximumLineLength", "MaxLineLength", "NestedBlockDepth")
|
||||||
|
|
||||||
package com.marvinelsen.willow.database
|
package com.marvinelsen.willow.database
|
||||||
|
|
||||||
|
import com.github.houbb.opencc4j.util.ZhConverterUtil
|
||||||
import com.marvinelsen.cedict.api.CedictParser
|
import com.marvinelsen.cedict.api.CedictParser
|
||||||
import com.marvinelsen.chinese.transliteration.api.PinyinSyllable
|
import com.marvinelsen.chinese.transliteration.api.PinyinSyllable
|
||||||
import com.marvinelsen.chinese.transliteration.api.TransliterationSystem
|
import com.marvinelsen.chinese.transliteration.api.TransliterationSystem
|
||||||
import com.marvinelsen.chinese.transliteration.api.Zhuyin
|
import com.marvinelsen.chinese.transliteration.api.Zhuyin
|
||||||
import com.marvinelsen.crossstraits.api.CrossStraitsDefinition
|
import com.marvinelsen.crossstraits.api.CrossStraitsDefinition
|
||||||
import com.marvinelsen.crossstraits.api.CrossStraitsParser
|
import com.marvinelsen.crossstraits.api.CrossStraitsParser
|
||||||
|
import com.marvinelsen.moedict.api.MoedictDefinition
|
||||||
|
import com.marvinelsen.moedict.api.MoedictParser
|
||||||
import kotlinx.serialization.builtins.ListSerializer
|
import kotlinx.serialization.builtins.ListSerializer
|
||||||
import kotlinx.serialization.builtins.serializer
|
import kotlinx.serialization.builtins.serializer
|
||||||
import kotlinx.serialization.json.Json
|
import kotlinx.serialization.json.Json
|
||||||
@ -134,6 +137,7 @@ fun main() {
|
|||||||
statement.close()
|
statement.close()
|
||||||
|
|
||||||
createCrossStraitsEntries(connection)
|
createCrossStraitsEntries(connection)
|
||||||
|
createMoedictEntries(connection)
|
||||||
|
|
||||||
connection.close()
|
connection.close()
|
||||||
}
|
}
|
||||||
@ -238,6 +242,132 @@ private fun createCrossStraitsEntries(connection: Connection) {
|
|||||||
statement.close()
|
statement.close()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Merges the bundled MOE dictionary (`/moedict.json.gz`) into the `entry` table.
 *
 * Titles starting with `{` are skipped, as are heteronyms without a zhuyin reading. For each remaining
 * heteronym the reading is converted to pinyin syllables and the table is searched for a row with the
 * same `traditional` and `pinyin_with_tone_numbers`:
 *  - if such a row exists, only its `moe_definitions` column is overwritten;
 *  - otherwise a new row is queued with empty CEDICT and Cross-Straits definition lists.
 *
 * Updates are executed immediately. Inserts are batched and executed once at the end, followed by a
 * commit, so a queued row is not yet visible to the lookup of later heteronyms; the statement is
 * `INSERT OR IGNORE`.
 *
 * A heteronym whose derived column values cannot be computed is reported on stderr and skipped; it is
 * never queued with partially bound parameters.
 *
 * @param connection open database connection; it is committed but not closed here.
 * @throws IllegalStateException if the dictionary resource is missing from the classpath.
 */
private fun createMoedictEntries(connection: Connection) {
    val moedictResource = checkNotNull(object {}.javaClass.getResourceAsStream("/moedict.json.gz")) {
        "Resource /moedict.json.gz is missing from the classpath"
    }
    // Serialized once: every inserted MOE row starts with empty CEDICT and Cross-Straits lists.
    val emptyDefinitions = Json.encodeToString(ListSerializer(String.serializer()), emptyList())

    // Parameterized lookup: headwords and readings are data and must never be spliced into SQL text.
    val selectStatement = connection.prepareStatement(
        "SELECT id FROM entry WHERE traditional = ? AND pinyin_with_tone_numbers = ?"
    )
    val updateStatement = connection.prepareStatement(
        "UPDATE entry SET moe_definitions = ? WHERE id = ?"
    )
    val insertStatement = connection.prepareStatement(
        "INSERT OR IGNORE INTO entry(traditional, simplified, pinyin_with_tone_marks, pinyin_with_tone_numbers, zhuyin, searchable_pinyin, searchable_pinyin_with_tone_numbers, cedict_definitions, cross_straits_definitions, moe_definitions, character_count) VALUES(?,?,?,?,?,?,?,?,?,?,?)"
    )

    try {
        GZIPInputStream(moedictResource).use { moedictStream ->
            MoedictParser.instance.parse(moedictStream)
                .filterNot { it.title.startsWith('{') }
                .forEach { entry ->
                    val traditional = entry.title.sanitizeMoeTitle()

                    entry.heteronyms.forEach heteronyms@{ heteronym ->
                        val zhuyin = heteronym.zhuyin
                        if (zhuyin.isNullOrBlank()) return@heteronyms

                        val pronunciation = zhuyin
                            .sanitizeMoeZhuyin()
                            .split(Zhuyin.SEPARATOR)
                            .filterNot { it.isBlank() }
                            .map { PinyinSyllable.fromZhuyin(it) }
                        val pinyinWithToneNumbers = pronunciation.joinToString(separator = " ") {
                            it.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS)
                        }
                        val moeDefinitions = Json.encodeToString(
                            ListSerializer(MoedictDefinition.serializer()),
                            heteronym.definitions
                        )

                        selectStatement.setString(1, traditional)
                        selectStatement.setString(2, pinyinWithToneNumbers)
                        // The result set is closed before any write is issued.
                        val existingId = selectStatement.executeQuery().use { rows ->
                            if (rows.next()) rows.getInt(1) else null
                        }

                        if (existingId != null) {
                            updateStatement.setString(1, moeDefinitions)
                            updateStatement.setInt(2, existingId)
                            updateStatement.executeUpdate()
                            return@heteronyms
                        }

                        // Compute every derived value before touching the statement, so a conversion
                        // failure cannot leave the batch with parameters of a previous row.
                        val textColumns = try {
                            listOf(
                                traditional,
                                ZhConverterUtil.toSimple(traditional),
                                pronunciation.joinToString(separator = " ") {
                                    it.format(TransliterationSystem.PINYIN_WITH_TONE_MARKS)
                                },
                                pinyinWithToneNumbers,
                                pronunciation.joinToString(separator = Zhuyin.SEPARATOR) {
                                    it.format(TransliterationSystem.ZHUYIN)
                                },
                                pronunciation.joinToString(separator = "") { it.toSearchablePinyin() },
                                pronunciation.joinToString(separator = "") {
                                    it.toSearchablePinyinWithToneNumbers()
                                },
                                emptyDefinitions,
                                emptyDefinitions,
                                moeDefinitions,
                            )
                        } catch (expected: Exception) {
                            // Best effort, as before: a row that cannot be converted is dropped, but
                            // it is now reported instead of being silently half-inserted.
                            System.err.println(
                                "Skipping MOE entry '$traditional' [$pinyinWithToneNumbers]: $expected"
                            )
                            return@heteronyms
                        }

                        textColumns.forEachIndexed { index, value ->
                            insertStatement.setString(index + 1, value)
                        }
                        // Count the stored headword, not the raw title that may still carry a
                        // parenthesized annotation.
                        insertStatement.setInt(11, traditional.length)
                        insertStatement.addBatch()
                    }
                }
        }

        insertStatement.executeBatch()
        connection.commit()
    } finally {
        insertStatement.close()
        updateStatement.close()
        selectStatement.close()
    }
}
|
||||||
|
|
||||||
|
/** Matches a parenthesized span, from the first `(` to the last `)` on a line. */
private val moeZhuyinParenthesized = Regex("""\(.*\)""")

/** Matches any run of whitespace. */
private val moeZhuyinWhitespace = Regex("""\s+""")

/**
 * Normalizes a raw MOE zhuyin reading into syllables separated by [Zhuyin.SEPARATOR].
 *
 * Steps, in order:
 *  1. fullwidth parentheses (U+FF08 / U+FF09) are folded to ASCII so that the markers below are
 *     recognized in either form;
 *  2. everything from the `(變)` marker onwards is dropped;
 *  3. everything up to and including the `(又音)` and `(語音)` markers is dropped;
 *  4. any remaining parenthesized annotation is removed;
 *  5. the syllable `ㄉㄨㄜˇ` is rewritten to `ㄉㄨㄣˇ` (presumably a typo in the source data; confirm
 *     against the dictionary);
 *  6. `ㄦ` is split off as its own syllable and whitespace runs collapse to one separator.
 *
 * The result may start with, or contain, empty segments once split; callers filter blank segments.
 */
private fun String.sanitizeMoeZhuyin() = this
    // An unescaped "(.*)" is a capture group matching the whole string and would erase the
    // reading; normalizing first lets a single escaped pattern handle both parenthesis forms.
    .replace('\uFF08', '(')
    .replace('\uFF09', ')')
    .substringBefore("(變)")
    .substringAfter("(又音)")
    .substringAfter("(語音)")
    .replace(moeZhuyinParenthesized, "")
    .replace("ㄉㄨㄜˇ", "ㄉㄨㄣˇ")
    .replace("ㄦ", "${Zhuyin.SEPARATOR}ㄦ")
    .replace(moeZhuyinWhitespace, Zhuyin.SEPARATOR)
|
||||||
|
|
||||||
|
/**
 * Matches a parenthesized annotation, opened by `(` or fullwidth U+FF08 and closed by `)` or
 * fullwidth U+FF09. The match is greedy: it runs from the first opener to the last closer on a line.
 */
private val moeTitleParenthesized = Regex("""[(\uFF08].*[)\uFF09]""")

/**
 * Strips a parenthesized annotation from a MOE title, leaving the bare headword.
 *
 * Both ASCII and fullwidth parentheses are recognized. Titles without a complete opener/closer pair
 * are returned unchanged.
 */
private fun String.sanitizeMoeTitle() =
    // The parentheses must be matched as literals: an unescaped "(.*)" is a capture group that
    // erases the entire title, and "\(.*)" does not compile (unmatched closing parenthesis).
    this.replace(moeTitleParenthesized, "")
|
||||||
|
|
||||||
private fun PinyinSyllable.toSearchablePinyinWithToneNumbers() =
|
private fun PinyinSyllable.toSearchablePinyinWithToneNumbers() =
|
||||||
this.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS).lowercase()
|
this.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS).lowercase()
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user