diff --git a/src/main/kotlin/com/marvinelsen/willow/database/CreateDatabase.kt b/src/main/kotlin/com/marvinelsen/willow/database/CreateDatabase.kt
index a22cee4..be2db23 100644
--- a/src/main/kotlin/com/marvinelsen/willow/database/CreateDatabase.kt
+++ b/src/main/kotlin/com/marvinelsen/willow/database/CreateDatabase.kt
@@ -1,13 +1,16 @@
-@file:Suppress("MagicNumber", "LongMethod", "MaximumLineLength", "MaxLineLength")
+@file:Suppress("MagicNumber", "LongMethod", "MaximumLineLength", "MaxLineLength", "NestedBlockDepth")
 
 package com.marvinelsen.willow.database
 
+import com.github.houbb.opencc4j.util.ZhConverterUtil
 import com.marvinelsen.cedict.api.CedictParser
 import com.marvinelsen.chinese.transliteration.api.PinyinSyllable
 import com.marvinelsen.chinese.transliteration.api.TransliterationSystem
 import com.marvinelsen.chinese.transliteration.api.Zhuyin
 import com.marvinelsen.crossstraits.api.CrossStraitsDefinition
 import com.marvinelsen.crossstraits.api.CrossStraitsParser
+import com.marvinelsen.moedict.api.MoedictDefinition
+import com.marvinelsen.moedict.api.MoedictParser
 import kotlinx.serialization.builtins.ListSerializer
 import kotlinx.serialization.builtins.serializer
 import kotlinx.serialization.json.Json
@@ -134,6 +137,7 @@ fun main() {
     statement.close()
 
     createCrossStraitsEntries(connection)
+    createMoedictEntries(connection)
     connection.close()
 }
 
@@ -238,6 +242,149 @@ private fun createCrossStraitsEntries(connection: Connection) {
     statement.close()
 }
 
+/**
+ * Imports the bundled MOE dictionary (`/moedict.json.gz`) into the `entry` table.
+ *
+ * Every heteronym with a non-blank zhuyin reading is looked up by traditional headword and pinyin
+ * with tone numbers. A matching row gets its `moe_definitions` column overwritten; otherwise a new
+ * row with empty CEDICT and Cross-Straits definitions is queued, and all queued rows are inserted
+ * in one batch at the end. Entries whose title starts with `{` are skipped.
+ */
+@Suppress("TooGenericExceptionCaught")
+private fun createMoedictEntries(connection: Connection) {
+    val moedictResource = checkNotNull(object {}.javaClass.getResourceAsStream("/moedict.json.gz")) {
+        "Resource /moedict.json.gz is missing from the classpath"
+    }
+    val moedictEntries = MoedictParser.instance.parse(GZIPInputStream(moedictResource))
+    val noDefinitions = Json.encodeToString(ListSerializer(String.serializer()), emptyList())
+
+    // Bound parameters instead of string interpolation, so a quote in a headword cannot break the query.
+    val selectStatement =
+        connection.prepareStatement("SELECT id FROM entry WHERE traditional = ? AND pinyin_with_tone_numbers = ?")
+    val updateStatement = connection.prepareStatement("UPDATE entry SET moe_definitions = ? WHERE id = ?")
+    val insertStatement =
+        connection.prepareStatement(
+            "INSERT OR IGNORE INTO entry(traditional, simplified, pinyin_with_tone_marks, pinyin_with_tone_numbers, zhuyin, searchable_pinyin, searchable_pinyin_with_tone_numbers, cedict_definitions, cross_straits_definitions, moe_definitions, character_count) VALUES(?,?,?,?,?,?,?,?,?,?,?)"
+        )
+
+    try {
+        for (entry in moedictEntries) {
+            if (entry.title.startsWith('{')) continue
+            val traditional = entry.title.sanitizeMoeTitle()
+
+            for (heteronym in entry.heteronyms) {
+                val zhuyin = heteronym.zhuyin
+                if (zhuyin.isNullOrBlank()) continue
+
+                val pronunciation = zhuyin
+                    .sanitizeMoeZhuyin()
+                    .split(Zhuyin.SEPARATOR)
+                    .filterNot { it.isBlank() }
+                    .map { PinyinSyllable.fromZhuyin(it) }
+                val pinyinWithToneNumbers = pronunciation.joinToString(separator = " ") {
+                    it.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS)
+                }
+
+                selectStatement.setString(1, traditional)
+                selectStatement.setString(2, pinyinWithToneNumbers)
+                val existingId = selectStatement.executeQuery().use { rows ->
+                    if (rows.next()) rows.getInt(1) else null
+                }
+
+                if (existingId != null) {
+                    updateStatement.setString(
+                        1,
+                        Json.encodeToString(ListSerializer(MoedictDefinition.serializer()), heteronym.definitions)
+                    )
+                    updateStatement.setInt(2, existingId)
+                    updateStatement.executeUpdate()
+                    continue
+                }
+
+                try {
+                    insertStatement.setString(1, traditional)
+                    insertStatement.setString(2, ZhConverterUtil.toSimple(traditional))
+                    insertStatement.setString(
+                        3,
+                        pronunciation.joinToString(separator = " ") {
+                            it.format(TransliterationSystem.PINYIN_WITH_TONE_MARKS)
+                        }
+                    )
+                    insertStatement.setString(4, pinyinWithToneNumbers)
+                    insertStatement.setString(
+                        5,
+                        pronunciation.joinToString(separator = Zhuyin.SEPARATOR) {
+                            it.format(TransliterationSystem.ZHUYIN)
+                        }
+                    )
+                    insertStatement.setString(
+                        6,
+                        pronunciation.joinToString(separator = "") { it.toSearchablePinyin() }
+                    )
+                    insertStatement.setString(
+                        7,
+                        pronunciation.joinToString(separator = "") { it.toSearchablePinyinWithToneNumbers() }
+                    )
+                    insertStatement.setString(8, noDefinitions)
+                    insertStatement.setString(9, noDefinitions)
+                    insertStatement.setString(
+                        10,
+                        Json.encodeToString(ListSerializer(MoedictDefinition.serializer()), heteronym.definitions)
+                    )
+                    // Count the stored headword rather than the raw title with its annotations.
+                    insertStatement.setInt(11, traditional.length)
+                    // Queue the row only once every parameter has been bound.
+                    insertStatement.addBatch()
+                } catch (e: Exception) {
+                    // Best-effort import: drop the half-bound row, report it and carry on.
+                    insertStatement.clearParameters()
+                    System.err.println("Skipping MOE entry '$traditional' ($pinyinWithToneNumbers): $e")
+                }
+            }
+        }
+
+        insertStatement.executeBatch()
+        connection.commit()
+    } finally {
+        insertStatement.close()
+        updateStatement.close()
+        selectStatement.close()
+    }
+}
+
+/** Matches a parenthesised annotation. Greedy: spans from the first `(` to the last `)`. */
+private val PARENTHESIZED_ANNOTATION = """\(.*\)""".toRegex()
+
+/** Matches a run of whitespace. */
+private val WHITESPACE_RUN = """\s+""".toRegex()
+
+/** Replaces the full-width parentheses U+FF08 and U+FF09 with ASCII parentheses. */
+private fun String.normalizeParentheses() = this
+    .replace('\uFF08', '(')
+    .replace('\uFF09', ')')
+
+/**
+ * Strips MOE annotations from a raw zhuyin reading and separates its syllables with [Zhuyin.SEPARATOR].
+ *
+ * Keeps the text before a `(變)` marker and after `(又音)` / `(語音)` markers, removes any remaining
+ * parenthesised annotation, rewrites `ㄉㄨㄜˇ` to `ㄉㄨㄣˇ`, puts a separator in front of every `ㄦ` and
+ * turns whitespace runs into a separator. Full-width and ASCII parentheses are treated alike.
+ */
+private fun String.sanitizeMoeZhuyin() = this
+    .normalizeParentheses()
+    .substringBefore("(變)")
+    .substringAfter("(又音)")
+    .substringAfter("(語音)")
+    .replace(PARENTHESIZED_ANNOTATION, "")
+    .replace("ㄉㄨㄜˇ", "ㄉㄨㄣˇ")
+    .replace("ㄦ", "${Zhuyin.SEPARATOR}ㄦ")
+    .replace(WHITESPACE_RUN, Zhuyin.SEPARATOR)
+
+/** Removes parenthesised annotations, in full-width or ASCII parentheses, from a MOE entry title. */
+private fun String.sanitizeMoeTitle() = this
+    .normalizeParentheses()
+    .replace(PARENTHESIZED_ANNOTATION, "")
+
 private fun PinyinSyllable.toSearchablePinyinWithToneNumbers() =
     this.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS).lowercase()
 