Implement Pinyin search

This commit is contained in:
Marvin Elsen 2024-09-20 20:10:12 +02:00
parent 3a159fba76
commit a65c1db941
Signed by: marvinelsen
GPG Key ID: 820672408CC318C2
2 changed files with 52 additions and 3 deletions

View File

@ -27,6 +27,8 @@ fun main() {
pinyin_with_tone_marks TEXT NOT NULL,
pinyin_with_tone_numbers TEXT NOT NULL,
zhuyin TEXT NOT NULL,
searchable_pinyin TEXT NOT NULL,
searchable_pinyin_with_tone_numbers TEXT NOT NULL,
definitions JSON NOT NULL,
character_count INTEGER NOT NULL,
CONSTRAINT character_count_gte CHECK(character_count > 0)
@ -36,6 +38,10 @@ fun main() {
statement.executeUpdate("CREATE INDEX IF NOT EXISTS idx_cedict_traditional ON cedict (traditional)")
statement.executeUpdate("CREATE INDEX IF NOT EXISTS idx_cedict_simplified ON cedict (simplified)")
statement.executeUpdate("CREATE INDEX IF NOT EXISTS idx_cedict_character_count ON cedict (character_count)")
statement.executeUpdate("CREATE INDEX IF NOT EXISTS idx_cedict_searchable_pinyin ON cedict (searchable_pinyin)")
statement.executeUpdate(
"CREATE INDEX IF NOT EXISTS idx_cedict_searchable_pinyin_with_tone_numbers ON cedict (searchable_pinyin_with_tone_numbers)"
)
val cedictParser = CedictParser.instance
val cedictEntries =
@ -45,7 +51,7 @@ fun main() {
val insertStatement =
connection.prepareStatement(
"INSERT OR IGNORE INTO cedict(traditional, simplified, pinyin_with_tone_marks, pinyin_with_tone_numbers, zhuyin, definitions, character_count) VALUES(?,?,?,?,?,?,?)"
"INSERT OR IGNORE INTO cedict(traditional, simplified, pinyin_with_tone_marks, pinyin_with_tone_numbers, zhuyin, searchable_pinyin, searchable_pinyin_with_tone_numbers, definitions, character_count) VALUES(?,?,?,?,?,?,?,?,?)"
)
for (entry in cedictEntries) {
try {
@ -71,12 +77,31 @@ fun main() {
)
insertStatement.setString(
6,
entry.pinyinSyllables.joinToString(
separator = ""
) {
it
.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS)
.lowercase()
.replace("""\d""".toRegex(), "")
}
)
insertStatement.setString(
7,
entry.pinyinSyllables.joinToString(
separator = ""
) {
it.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS).lowercase()
}
)
insertStatement.setString(
8,
Json.encodeToString(
ListSerializer(ListSerializer(String.serializer())),
entry.definitions.map { it.glosses }
)
)
insertStatement.setInt(7, entry.traditional.length)
insertStatement.setInt(9, entry.traditional.length)
} catch (_: Exception) {
// no-op
}

View File

@ -6,6 +6,7 @@ import java.sql.PreparedStatement
import java.sql.ResultSet
class SqliteDictionary(private val connection: Connection) : Dictionary {
// Matches one or more consecutive whitespace characters; searchPinyin uses it to strip whitespace from queries.
private val whitespaceRegex = """\s+""".toRegex()
private val searchSimplifiedPreparedStatement: PreparedStatement by lazy {
connection.prepareStatement(
@ -29,6 +30,18 @@ class SqliteDictionary(private val connection: Connection) : Dictionary {
)
}
/**
 * Statement backing Pinyin search, prepared on first access (`by lazy`).
 *
 * Selects entries from `cedict` where either `searchable_pinyin` or
 * `searchable_pinyin_with_tone_numbers` matches a GLOB pattern, ordered by
 * `character_count` ascending. It has two positional parameters, one per
 * GLOB comparison; callers must bind both before executing.
 */
private val searchPinyinPreparedStatement: PreparedStatement by lazy {
connection.prepareStatement(
"""
SELECT traditional, simplified, pinyin_with_tone_marks, pinyin_with_tone_numbers, zhuyin, definitions
FROM cedict
WHERE searchable_pinyin GLOB ?
OR searchable_pinyin_with_tone_numbers GLOB ?
ORDER BY character_count ASC
""".trimIndent()
)
}
private val findWordsContaining: PreparedStatement by lazy {
connection.prepareStatement(
"""
@ -41,7 +54,7 @@ class SqliteDictionary(private val connection: Connection) : Dictionary {
}
override fun search(query: String, searchMode: SearchMode) = when (searchMode) {
SearchMode.PINYIN -> TODO()
SearchMode.PINYIN -> searchPinyin(query)
SearchMode.SIMPLIFIED -> searchSimplified(query)
SearchMode.TRADITIONAL -> searchTraditional(query)
SearchMode.ENGLISH -> TODO()
@ -71,6 +84,17 @@ class SqliteDictionary(private val connection: Connection) : Dictionary {
return resultSet.toListOfDictionaryEntries()
}
/**
 * Runs the Pinyin search statement for [query].
 *
 * The query is lowercased and stripped of all whitespace, then `*` is appended and the
 * resulting GLOB pattern is bound to both parameters of [searchPinyinPreparedStatement],
 * so it is compared against the toneless and the tone-numbered searchable column alike.
 *
 * @param query raw search text.
 * @return the statement's rows converted with `toListOfDictionaryEntries()`.
 */
private fun searchPinyin(query: String): List<DictionaryEntry> {
    // NOTE(review): GLOB metacharacters in the query (`*`, `?`, `[`) are passed through unescaped —
    // confirm whether wildcard input is intended.
    val prefixPattern = whitespaceRegex.replace(query.lowercase(), "") + "*"

    val statement = searchPinyinPreparedStatement
    statement.setString(1, prefixPattern)
    statement.setString(2, prefixPattern)

    return statement.executeQuery().toListOfDictionaryEntries()
}
private fun searchTraditional(query: String): List<DictionaryEntry> {
searchTraditionalPreparedStatement.setString(1, "$query*")