120 lines
5.3 KiB
Kotlin
120 lines
5.3 KiB
Kotlin
package com.marvinelsen.chinese.transliteration.api
|
|
|
|
import java.io.InputStream
|
|
|
|
/**
 * A single Pinyin syllable, stored as its toneless spelling plus its [Tone].
 *
 * Instances are normally created through [fromPinyinWithToneNumber] or [fromZhuyin], which validate
 * the syllable against the bundled Pinyin/Zhuyin transcription table. The primary constructor is
 * public and performs no validation, so the formatting functions re-check the syllable and throw
 * [IllegalStateException] if it is not in the table.
 *
 * All table lookups are done on the lowercased syllable; the original casing is kept for output.
 *
 * @property pinyinSyllableWithoutTone the Pinyin spelling without any tone number or tone mark.
 * @property tone the tone of the syllable.
 */
@Suppress("MagicNumber", "MaximumLineLength", "MaxLineLength")
data class PinyinSyllable(
    val pinyinSyllableWithoutTone: String,
    val tone: Tone,
) {
    companion object {
        /** Lowercase toneless Pinyin -> toneless Zhuyin, loaded once from the bundled TSV resource. */
        private val pinyinToZhuyin: Map<String, String> = parseTranscriptions(
            checkNotNull(PinyinSyllable::class.java.getResourceAsStream("/pinyin_zhuyin_transcriptions.tsv")) {
                "Resource '/pinyin_zhuyin_transcriptions.tsv' was not found on the classpath."
            }
        )

        /**
         * Reverse lookup, toneless Zhuyin -> Pinyin. If several Pinyin spellings share one Zhuyin
         * transcription, the one that appears last in the table wins.
         */
        private val zhuyinToPinyin: Map<String, String> = pinyinToZhuyin.entries.associate { it.value to it.key }

        /** Matches the Zhuyin tone marks that are stripped before the table lookup. */
        private val zhuyinToneMarkRegex = """[ˊˇˋ˙]""".toRegex()

        /** Tone numbers accepted at the end of a "Pinyin with tone number" syllable. */
        private val validToneNumbers = 1..5

        /**
         * Returns `true` if [pinyinSyllable] is a known Pinyin syllable (case-insensitive) followed by
         * a tone digit in the range 1..5. Returns `false` for any other input, including the empty string.
         */
        fun isValidPinyinWithToneNumberSyllable(pinyinSyllable: String): Boolean {
            // An empty string has no tone digit at all; report it as invalid instead of throwing.
            val toneCharacter = pinyinSyllable.lastOrNull() ?: return false

            // NOTE(review): isDigit() accepts any Unicode decimal digit; whether Tone.fromDigit copes with
            // non-ASCII digits is not visible from this file.
            return toneCharacter.isDigit() &&
                toneCharacter.digitToInt() in validToneNumbers &&
                pinyinSyllable.dropLast(1).lowercase() in pinyinToZhuyin
        }

        /**
         * Parses a syllable written as Pinyin followed by a tone number, e.g. `"ni3"`.
         *
         * @param pinyinWithToneNumber the syllable text; its last character must be the tone digit.
         * @return the parsed syllable, keeping the casing of the input spelling.
         * @throws IllegalArgumentException if the input is empty, does not end in a digit, the digit is
         * outside 1..5, or the spelling is not in the transcription table.
         */
        fun fromPinyinWithToneNumber(pinyinWithToneNumber: String): PinyinSyllable {
            // Validate emptiness first so callers get IllegalArgumentException like every other failure here.
            require(pinyinWithToneNumber.isNotEmpty()) {
                "'$pinyinWithToneNumber' is not a valid Pinyin with tone number syllable. Expected a syllable followed by a tone number, but the input was empty"
            }

            val pinyinWithoutNumber = pinyinWithToneNumber.dropLast(1)
            val lastCharacter = pinyinWithToneNumber.last()

            require(lastCharacter.isDigit()) {
                "'$pinyinWithToneNumber' is not a valid Pinyin with tone number syllable. Expected the last character to be a digit, but was '$lastCharacter'"
            }
            require(lastCharacter.digitToInt() in validToneNumbers) {
                "'$pinyinWithToneNumber' is not a valid Pinyin with tone number syllable. Expected the tone number 'n' to be in range 1 <= n <= 5, but was '$lastCharacter'"
            }
            require(pinyinWithoutNumber.lowercase() in pinyinToZhuyin) {
                "'$pinyinWithoutNumber' is not a valid Pinyin syllable."
            }

            return PinyinSyllable(
                pinyinSyllableWithoutTone = pinyinWithoutNumber,
                tone = Tone.fromDigit(lastCharacter),
            )
        }

        /**
         * Parses a Zhuyin syllable, optionally carrying a tone mark.
         *
         * The tone is taken from the last character if that is a tone mark, otherwise from the first
         * character; if neither is a tone mark, [Tone.FIRST] is used.
         *
         * @throws IllegalArgumentException if the syllable, with tone marks removed, is not in the
         * transcription table.
         */
        fun fromZhuyin(zhuyin: String): PinyinSyllable {
            val zhuyinWithoutToneMark = zhuyin.replace(zhuyinToneMarkRegex, "")

            // Single lookup doubles as the validity check (same exception type and message as before).
            val pinyin = requireNotNull(zhuyinToPinyin[zhuyinWithoutToneMark]) {
                "'$zhuyin' is not a valid Zhuyin syllable."
            }

            return PinyinSyllable(
                pinyin,
                Tone.fromZhuyinToneMarkOrNull(zhuyin.last())
                    ?: Tone.fromZhuyinToneMarkOrNull(zhuyin.first())
                    ?: Tone.FIRST
            )
        }

        /**
         * Reads the tab-separated transcription table: column 0 is the key, column 1 the value.
         * Blank lines are skipped; any other line with fewer than two columns is reported with its content.
         * The stream is closed when reading finishes.
         */
        private fun parseTranscriptions(inputStream: InputStream): Map<String, String> =
            inputStream.bufferedReader().useLines { lines ->
                lines
                    .filter { it.isNotBlank() }
                    .associate { line ->
                        val columns = line.split('\t')
                        check(columns.size >= 2) {
                            "Malformed transcription line, expected two tab-separated columns: '$line'"
                        }
                        columns[0] to columns[1]
                    }
            }
    }

    /**
     * Renders this syllable in the requested [transliterationSystem].
     *
     * @throws IllegalStateException if [pinyinSyllableWithoutTone] is not in the transcription table.
     */
    fun format(transliterationSystem: TransliterationSystem) = when (transliterationSystem) {
        TransliterationSystem.ZHUYIN -> formatToZhuyin()
        TransliterationSystem.PINYIN_WITH_TONE_NUMBERS -> formatToPinyinWithToneNumbers()
        TransliterationSystem.PINYIN_WITH_TONE_MARKS -> formatToPinyinWithToneMarks()
    }

    /** Fails with [IllegalStateException] when this instance holds a spelling that is not in the table. */
    private fun checkIsKnownPinyinSyllable() =
        check(pinyinSyllableWithoutTone.lowercase() in pinyinToZhuyin) {
            "'$pinyinSyllableWithoutTone' is not a valid Pinyin syllable."
        }

    /** Looks up the Zhuyin transcription and attaches the tone mark. */
    private fun formatToZhuyin(): String {
        val zhuyinSyllable = pinyinToZhuyin[pinyinSyllableWithoutTone.lowercase()]
            ?: error("$pinyinSyllableWithoutTone is not a valid Pinyin syllable")
        val zhuyinToneMark = tone.format(TransliterationSystem.ZHUYIN)

        return when (tone) {
            // Tones one to four place their mark after the syllable ...
            Tone.FIRST, Tone.SECOND, Tone.THIRD, Tone.FORTH -> zhuyinSyllable + zhuyinToneMark
            // ... while the fifth tone's mark is placed in front of it.
            Tone.FIFTH -> zhuyinToneMark + zhuyinSyllable
        }
    }

    /** Appends the tone's number representation to the stored spelling. */
    private fun formatToPinyinWithToneNumbers(): String {
        checkIsKnownPinyinSyllable()

        return pinyinSyllableWithoutTone + tone.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS)
    }

    /**
     * Renders the syllable with its tone mark inserted directly after the vowel that carries the tone.
     *
     * `v` and `u:` (in either case) are rewritten to `ü`/`Ü` first. The carrying vowel is chosen by
     * priority `a`, `o`, `e`; then `i`, unless it is immediately followed by `u`, in which case the `u`
     * carries the mark; then `u`; then `ü`. Matching is case-insensitive.
     *
     * @throws IllegalStateException if the spelling is not in the table or contains no vowel.
     */
    private fun formatToPinyinWithToneMarks(): String {
        checkIsKnownPinyinSyllable()

        // A bare "r" in the fifth tone has no vowel to carry a mark and is returned unchanged.
        if (pinyinSyllableWithoutTone.lowercase() == "r" && tone == Tone.FIFTH) {
            return pinyinSyllableWithoutTone
        }

        // Replace each casing separately so the casing of the original spelling is preserved.
        val sanitizedPinyinSyllableWithoutTone = pinyinSyllableWithoutTone
            .replace("v", "ü")
            .replace("V", "Ü")
            .replace("u:", "ü")
            .replace("U:", "Ü")

        // All vowel searches run on the lowercased form so that upper- and mixed-case input behaves
        // exactly like lowercase input. lastIndexOf picks the last occurrence of a repeated character.
        val lowered = sanitizedPinyinSyllableWithoutTone.lowercase()
        val vowelIndex = when {
            'a' in lowered -> lowered.lastIndexOf('a')
            'o' in lowered -> lowered.lastIndexOf('o')
            'e' in lowered -> lowered.lastIndexOf('e')
            'i' in lowered -> {
                val indexOfI = lowered.lastIndexOf('i')
                // In "iu" the mark belongs on the 'u'.
                if (lowered.getOrNull(indexOfI + 1) == 'u') lowered.lastIndexOf('u') else indexOfI
            }
            'u' in lowered -> lowered.lastIndexOf('u')
            'ü' in lowered -> lowered.lastIndexOf('ü')
            else -> error("No vowel found in Pinyin syllable '$sanitizedPinyinSyllableWithoutTone'")
        }

        return buildString {
            append(sanitizedPinyinSyllableWithoutTone)
            // The mark goes right after the carrying vowel (presumably a combining diacritic; see Tone.format).
            insert(vowelIndex + 1, tone.format(TransliterationSystem.PINYIN_WITH_TONE_MARKS))
        }
    }
}
|