chinese-transliteration/src/main/kotlin/com/marvinelsen/chinese/transliteration/api/PinyinSyllable.kt
2024-10-07 20:19:42 +02:00

120 lines
5.3 KiB
Kotlin

package com.marvinelsen.chinese.transliteration.api
import java.io.InputStream
/**
 * A single syllable stored as its toneless Pinyin spelling plus a [Tone].
 *
 * The primary constructor performs no validation and keeps the spelling in whatever letter case
 * it was given. Validation against the bundled Pinyin/Zhuyin table happens in the factory
 * functions of the companion object and again (case-insensitively) whenever [format] is called.
 *
 * @property pinyinSyllableWithoutTone the Pinyin spelling without any tone number or tone mark.
 * @property tone the tone of the syllable.
 */
@Suppress("MagicNumber", "MaximumLineLength", "MaxLineLength")
data class PinyinSyllable(
    val pinyinSyllableWithoutTone: String,
    val tone: Tone,
) {
    /**
     * Renders this syllable in the requested [transliterationSystem].
     *
     * @throws IllegalStateException if [pinyinSyllableWithoutTone] is not a key of the bundled
     * transcription table (compared in lower case), or if no vowel can be found to carry a tone mark.
     */
    fun format(transliterationSystem: TransliterationSystem): String = when (transliterationSystem) {
        TransliterationSystem.ZHUYIN -> formatToZhuyin()
        TransliterationSystem.PINYIN_WITH_TONE_NUMBERS -> formatToPinyinWithToneNumbers()
        TransliterationSystem.PINYIN_WITH_TONE_MARKS -> formatToPinyinWithToneMarks()
    }

    /** Looks up the Zhuyin spelling and attaches the tone mark on the side the tone requires. */
    private fun formatToZhuyin(): String {
        val zhuyinSyllable = pinyinToZhuyin[pinyinSyllableWithoutTone.lowercase()]
            ?: error("$pinyinSyllableWithoutTone is not a valid Pinyin syllable")
        val zhuyinToneMark = tone.format(TransliterationSystem.ZHUYIN)

        return when (tone) {
            // Tones one to four put their mark after the syllable ...
            Tone.FIRST, Tone.SECOND, Tone.THIRD, Tone.FORTH -> zhuyinSyllable + zhuyinToneMark
            // ... while the fifth tone puts it in front.
            Tone.FIFTH -> zhuyinToneMark + zhuyinSyllable
        }
    }

    /** Appends the tone in its tone-number form to the (case-preserved) toneless spelling. */
    private fun formatToPinyinWithToneNumbers(): String {
        checkIsKnownPinyinSyllable()
        return pinyinSyllableWithoutTone + tone.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS)
    }

    /**
     * Inserts the tone mark directly after the vowel that carries it.
     *
     * The ASCII stand-ins `v` and `u:` are rewritten to `ü` first; upper-case `V` / `U:` become `Ü`
     * so that the letter case of the stored spelling is preserved.
     */
    private fun formatToPinyinWithToneMarks(): String {
        checkIsKnownPinyinSyllable()

        // A fifth-tone "r" is returned as is; it has no vowel that could take a mark.
        if (pinyinSyllableWithoutTone.lowercase() == "r" && tone == Tone.FIFTH) {
            return pinyinSyllableWithoutTone
        }

        val sanitizedPinyinSyllableWithoutTone = pinyinSyllableWithoutTone
            .replace("v", "ü")
            .replace("V", "Ü")
            .replace("u:", "ü")
            .replace("U:", "Ü")

        // The vowel search runs on the lowercased spelling so that it is case-insensitive throughout.
        val vowelIndex = toneMarkVowelIndexOrNull(sanitizedPinyinSyllableWithoutTone.lowercase())
            ?: error("No vowel found in Pinyin syllable '$sanitizedPinyinSyllableWithoutTone'")

        return buildString {
            append(sanitizedPinyinSyllableWithoutTone)
            // The mark goes right behind the vowel (presumably a combining diacritic; see Tone.format).
            insert(vowelIndex + 1, tone.format(TransliterationSystem.PINYIN_WITH_TONE_MARKS))
        }
    }

    /** Fails with [IllegalStateException] unless the lowercased spelling is a key of the transcription table. */
    private fun checkIsKnownPinyinSyllable() {
        check(pinyinSyllableWithoutTone.lowercase() in pinyinToZhuyin) {
            "'$pinyinSyllableWithoutTone' is not a valid Pinyin syllable."
        }
    }

    companion object {
        private const val TRANSCRIPTIONS_RESOURCE = "/pinyin_zhuyin_transcriptions.tsv"

        /** Toneless Pinyin spelling to toneless Zhuyin spelling, as read from the bundled TSV resource. */
        private val pinyinToZhuyin: Map<String, String> = parseTranscriptions(
            checkNotNull(this::class.java.getResourceAsStream(TRANSCRIPTIONS_RESOURCE)) {
                "Classpath resource '$TRANSCRIPTIONS_RESOURCE' not found"
            }
        )

        /** Inverse of [pinyinToZhuyin]; if several Pinyin spellings share a Zhuyin spelling the last one wins. */
        private val zhuyinToPinyin: Map<String, String> =
            pinyinToZhuyin.entries.associate { it.value to it.key }

        private val zhuyinToneMarkRegex = """[ˊˇˋ˙]""".toRegex()

        /**
         * Returns `true` if [pinyinSyllable] ends in a tone digit from 1 to 5 and the part in front of
         * that digit is, ignoring case, a key of the transcription table. An empty string is not valid.
         */
        fun isValidPinyinWithToneNumberSyllable(pinyinSyllable: String): Boolean {
            val toneDigit = pinyinSyllable.lastOrNull() ?: return false
            return toneDigit.isDigit() &&
                toneDigit.digitToInt() in 1..5 &&
                pinyinSyllable.dropLast(1).lowercase() in pinyinToZhuyin
        }

        /**
         * Parses a syllable written as Pinyin followed by a tone number, e.g. `ma3`.
         *
         * The letter case of the spelling is kept as given.
         *
         * @throws IllegalArgumentException if the input is empty, does not end in a digit from 1 to 5,
         * or the part in front of the digit is not a known Pinyin syllable.
         */
        fun fromPinyinWithToneNumber(pinyinWithToneNumber: String): PinyinSyllable {
            // Guard first: taking the last character or the prefix of an empty string would throw
            // an unrelated exception type instead of IllegalArgumentException.
            require(pinyinWithToneNumber.isNotEmpty()) {
                "'$pinyinWithToneNumber' is not a valid Pinyin with tone number syllable. Expected a non-empty string."
            }

            val pinyinWithoutNumber = pinyinWithToneNumber.dropLast(1)
            val lastCharacter = pinyinWithToneNumber.last()

            require(lastCharacter.isDigit()) {
                "'$pinyinWithToneNumber' is not a valid Pinyin with tone number syllable. Expected the last character to be a digit, but was '$lastCharacter'"
            }
            require(lastCharacter.digitToInt() in 1..5) {
                "'$pinyinWithToneNumber' is not a valid Pinyin with tone number syllable. Expected the tone number 'n' to be in range 1 <= n <= 5, but was '$lastCharacter'"
            }
            require(pinyinWithoutNumber.lowercase() in pinyinToZhuyin) {
                "'$pinyinWithoutNumber' is not a valid Pinyin syllable."
            }

            return PinyinSyllable(
                pinyinSyllableWithoutTone = pinyinWithoutNumber,
                tone = Tone.fromDigit(lastCharacter),
            )
        }

        /**
         * Parses a Zhuyin syllable with an optional tone mark.
         *
         * The tone is taken from the last character, otherwise from the first character, and
         * defaults to [Tone.FIRST] when neither is a tone mark.
         *
         * @throws IllegalArgumentException if the input without its tone marks is not a known Zhuyin syllable.
         */
        fun fromZhuyin(zhuyin: String): PinyinSyllable {
            val zhuyinWithoutToneMark = zhuyin.replace(zhuyinToneMarkRegex, "")
            val pinyin = requireNotNull(zhuyinToPinyin[zhuyinWithoutToneMark]) {
                "'$zhuyin' is not a valid Zhuyin syllable."
            }
            val tone = Tone.fromZhuyinToneMarkOrNull(zhuyin.last())
                ?: Tone.fromZhuyinToneMarkOrNull(zhuyin.first())
                ?: Tone.FIRST

            return PinyinSyllable(pinyin, tone)
        }

        /**
         * Reads tab-separated lines into a map from the first column to the second column.
         *
         * Blank lines are skipped; any other line with fewer than two columns is reported with its content.
         */
        private fun parseTranscriptions(inputStream: InputStream): Map<String, String> =
            inputStream.bufferedReader().useLines { lines ->
                lines
                    .filter { it.isNotBlank() }
                    .associate { line ->
                        val columns = line.split('\t')
                        check(columns.size >= 2) {
                            "Malformed transcription line, expected two tab-separated columns: '$line'"
                        }
                        columns[0] to columns[1]
                    }
            }

        /**
         * Returns the index of the vowel that carries the tone mark in [lowercaseSyllable], or `null`
         * if it contains none of `a o e i u ü`.
         *
         * Priority is `a`, then `o`, then `e`. Otherwise an `i` takes the mark unless it is directly
         * followed by `u` (as in `iu`), in which case that `u` does. Otherwise `u`, then `ü`.
         */
        private fun toneMarkVowelIndexOrNull(lowercaseSyllable: String): Int? {
            val indexOfI = lowercaseSyllable.lastIndexOf('i')
            return when {
                'a' in lowercaseSyllable -> lowercaseSyllable.lastIndexOf('a')
                'o' in lowercaseSyllable -> lowercaseSyllable.lastIndexOf('o')
                'e' in lowercaseSyllable -> lowercaseSyllable.lastIndexOf('e')
                indexOfI >= 0 ->
                    if (lowercaseSyllable.getOrNull(indexOfI + 1) == 'u') indexOfI + 1 else indexOfI
                'u' in lowercaseSyllable -> lowercaseSyllable.lastIndexOf('u')
                'ü' in lowercaseSyllable -> lowercaseSyllable.lastIndexOf('ü')
                else -> null
            }
        }
    }
}