Implement Pinyin search

This commit is contained in:
Marvin Elsen 2024-09-20 20:10:12 +02:00
parent 3a159fba76
commit a65c1db941
Signed by: marvinelsen
GPG Key ID: 820672408CC318C2
2 changed files with 52 additions and 3 deletions

View File

@ -27,6 +27,8 @@ fun main() {
pinyin_with_tone_marks TEXT NOT NULL, pinyin_with_tone_marks TEXT NOT NULL,
pinyin_with_tone_numbers TEXT NOT NULL, pinyin_with_tone_numbers TEXT NOT NULL,
zhuyin TEXT NOT NULL, zhuyin TEXT NOT NULL,
searchable_pinyin TEXT NOT NULL,
searchable_pinyin_with_tone_numbers TEXT NOT NULL,
definitions JSON NOT NULL, definitions JSON NOT NULL,
character_count INTEGER NOT NULL, character_count INTEGER NOT NULL,
CONSTRAINT character_count_gte CHECK(character_count > 0) CONSTRAINT character_count_gte CHECK(character_count > 0)
@ -36,6 +38,10 @@ fun main() {
statement.executeUpdate("CREATE INDEX IF NOT EXISTS idx_cedict_traditional ON cedict (traditional)") statement.executeUpdate("CREATE INDEX IF NOT EXISTS idx_cedict_traditional ON cedict (traditional)")
statement.executeUpdate("CREATE INDEX IF NOT EXISTS idx_cedict_simplified ON cedict (simplified)") statement.executeUpdate("CREATE INDEX IF NOT EXISTS idx_cedict_simplified ON cedict (simplified)")
statement.executeUpdate("CREATE INDEX IF NOT EXISTS idx_cedict_character_count ON cedict (character_count)") statement.executeUpdate("CREATE INDEX IF NOT EXISTS idx_cedict_character_count ON cedict (character_count)")
statement.executeUpdate("CREATE INDEX IF NOT EXISTS idx_cedict_searchable_pinyin ON cedict (searchable_pinyin)")
statement.executeUpdate(
"CREATE INDEX IF NOT EXISTS idx_cedict_searchable_pinyin_with_tone_numbers ON cedict (searchable_pinyin_with_tone_numbers)"
)
val cedictParser = CedictParser.instance val cedictParser = CedictParser.instance
val cedictEntries = val cedictEntries =
@ -45,7 +51,7 @@ fun main() {
val insertStatement = val insertStatement =
connection.prepareStatement( connection.prepareStatement(
"INSERT OR IGNORE INTO cedict(traditional, simplified, pinyin_with_tone_marks, pinyin_with_tone_numbers, zhuyin, definitions, character_count) VALUES(?,?,?,?,?,?,?)" "INSERT OR IGNORE INTO cedict(traditional, simplified, pinyin_with_tone_marks, pinyin_with_tone_numbers, zhuyin, searchable_pinyin, searchable_pinyin_with_tone_numbers, definitions, character_count) VALUES(?,?,?,?,?,?,?,?,?)"
) )
for (entry in cedictEntries) { for (entry in cedictEntries) {
try { try {
@ -71,12 +77,31 @@ fun main() {
) )
insertStatement.setString( insertStatement.setString(
6, 6,
entry.pinyinSyllables.joinToString(
separator = ""
) {
it
.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS)
.lowercase()
.replace("""\d""".toRegex(), "")
}
)
insertStatement.setString(
7,
entry.pinyinSyllables.joinToString(
separator = ""
) {
it.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS).lowercase()
}
)
insertStatement.setString(
8,
Json.encodeToString( Json.encodeToString(
ListSerializer(ListSerializer(String.serializer())), ListSerializer(ListSerializer(String.serializer())),
entry.definitions.map { it.glosses } entry.definitions.map { it.glosses }
) )
) )
insertStatement.setInt(7, entry.traditional.length) insertStatement.setInt(9, entry.traditional.length)
} catch (_: Exception) { } catch (_: Exception) {
// no-op // no-op
} }

View File

@ -6,6 +6,7 @@ import java.sql.PreparedStatement
import java.sql.ResultSet import java.sql.ResultSet
class SqliteDictionary(private val connection: Connection) : Dictionary { class SqliteDictionary(private val connection: Connection) : Dictionary {
/** Matches one or more consecutive whitespace characters; used to strip whitespace from pinyin queries. */
private val whitespaceRegex = Regex("""\s+""")
private val searchSimplifiedPreparedStatement: PreparedStatement by lazy { private val searchSimplifiedPreparedStatement: PreparedStatement by lazy {
connection.prepareStatement( connection.prepareStatement(
@ -29,6 +30,18 @@ class SqliteDictionary(private val connection: Connection) : Dictionary {
) )
} }
/**
 * Lazily prepared statement for pinyin lookups against the `cedict` table.
 *
 * Takes two positional parameters, both GLOB patterns: the first is matched against
 * `searchable_pinyin`, the second against `searchable_pinyin_with_tone_numbers`.
 * A row is returned when either column matches; rows are ordered by ascending
 * `character_count`.
 */
private val searchPinyinPreparedStatement: PreparedStatement by lazy {
    val pinyinSearchSql =
        """
        SELECT traditional, simplified, pinyin_with_tone_marks, pinyin_with_tone_numbers, zhuyin, definitions
        FROM cedict
        WHERE searchable_pinyin GLOB ?
        OR searchable_pinyin_with_tone_numbers GLOB ?
        ORDER BY character_count ASC
        """.trimIndent()
    connection.prepareStatement(pinyinSearchSql)
}
private val findWordsContaining: PreparedStatement by lazy { private val findWordsContaining: PreparedStatement by lazy {
connection.prepareStatement( connection.prepareStatement(
""" """
@ -41,7 +54,7 @@ class SqliteDictionary(private val connection: Connection) : Dictionary {
} }
override fun search(query: String, searchMode: SearchMode) = when (searchMode) { override fun search(query: String, searchMode: SearchMode) = when (searchMode) {
SearchMode.PINYIN -> TODO() SearchMode.PINYIN -> searchPinyin(query)
SearchMode.SIMPLIFIED -> searchSimplified(query) SearchMode.SIMPLIFIED -> searchSimplified(query)
SearchMode.TRADITIONAL -> searchTraditional(query) SearchMode.TRADITIONAL -> searchTraditional(query)
SearchMode.ENGLISH -> TODO() SearchMode.ENGLISH -> TODO()
@ -71,6 +84,17 @@ class SqliteDictionary(private val connection: Connection) : Dictionary {
return resultSet.toListOfDictionaryEntries() return resultSet.toListOfDictionaryEntries()
} }
/**
 * Looks up dictionary entries whose pinyin starts with [query].
 *
 * The query is lowercased and all whitespace is removed, then a trailing `*` is appended
 * to form a GLOB prefix pattern. The same pattern is bound to both parameters of
 * [searchPinyinPreparedStatement].
 *
 * NOTE(review): GLOB metacharacters in [query] (`*`, `?`, `[`) are passed through unescaped,
 * and an empty or whitespace-only query becomes the pattern `*` — confirm both are intended.
 *
 * @param query raw user input, with or without tone numbers
 * @return the rows produced by the statement, converted via `toListOfDictionaryEntries()`
 */
private fun searchPinyin(query: String): List<DictionaryEntry> {
    val prefixPattern = whitespaceRegex.replace(query.lowercase(), "") + "*"
    val statement = searchPinyinPreparedStatement
    // Both GLOB placeholders receive the identical prefix pattern.
    for (parameterIndex in 1..2) {
        statement.setString(parameterIndex, prefixPattern)
    }
    return statement.executeQuery().toListOfDictionaryEntries()
}
private fun searchTraditional(query: String): List<DictionaryEntry> { private fun searchTraditional(query: String): List<DictionaryEntry> {
searchTraditionalPreparedStatement.setString(1, "$query*") searchTraditionalPreparedStatement.setString(1, "$query*")