Compare commits

...

1 Commits

Author SHA1 Message Date
1eb9fb1d56
refactor: refactor everything 2025-05-06 22:21:54 +02:00
33 changed files with 614 additions and 391 deletions

View File

@ -31,7 +31,7 @@ publishing {
publications {
create<MavenPublication>("maven") {
groupId = project.group as String
artifactId = "chinese-transliteration"
artifactId = "chinese-phonetics"
version = project.version as String
from(components["java"])

View File

@ -1 +1 @@
rootProject.name = "chinese-transliteration"
rootProject.name = "chinese-phonetics"

View File

@ -0,0 +1,76 @@
package com.marvinelsen.chinese.phonetics
import com.marvinelsen.chinese.phonetics.internal.syllable.formatting.PinyinMarkSyllableFormatter
import com.marvinelsen.chinese.phonetics.internal.syllable.formatting.PinyinNumberSyllableFormatter
import com.marvinelsen.chinese.phonetics.internal.syllable.formatting.SyllableFormatter
import com.marvinelsen.chinese.phonetics.internal.syllable.formatting.ZhuyinSyllableFormatter
import com.marvinelsen.chinese.phonetics.internal.syllable.parsing.PinyinNumberSyllableParser
import com.marvinelsen.chinese.phonetics.internal.syllable.parsing.SyllableParser
import com.marvinelsen.chinese.phonetics.internal.syllable.parsing.ZhuyinSyllableParser
/**
 * Public entry point for extracting tones and converting between Zhuyin and Pinyin notations.
 *
 * Functions that accept a "sequence" expect syllables separated by whitespace; leading and
 * trailing whitespace is ignored. Converted sequences are joined with a single space.
 */
object ChinesePhonetics {
    /** Separator pattern between syllables; compiled once instead of on every call. */
    private val whitespaceRegex = Regex("\\s+")

    /** Returns the tone of one numbered-Pinyin syllable, or `null` if it cannot be parsed. */
    fun getToneFromNumberedPinyin(numberedPinyin: String): Tone? =
        PinyinNumberSyllableParser.parseOrNull(numberedPinyin)?.tone

    /** Returns the tone of one Zhuyin syllable, or `null` if it cannot be parsed. */
    fun getToneFromZhuyin(zhuyin: String): Tone? =
        ZhuyinSyllableParser.parseOrNull(zhuyin)?.tone

    /**
     * Returns one tone per whitespace-separated numbered-Pinyin syllable.
     * Syllables that cannot be parsed yield a `null` entry at their position.
     */
    fun getTonesFromNumberedPinyinSequence(numberedPinyinSequence: String): List<Tone?> =
        splitIntoSyllables(numberedPinyinSequence).map { getToneFromNumberedPinyin(it) }

    /**
     * Returns one tone per whitespace-separated Zhuyin syllable.
     * Syllables that cannot be parsed yield a `null` entry at their position.
     */
    fun getTonesFromZhuyinSequence(zhuyinSequence: String): List<Tone?> =
        splitIntoSyllables(zhuyinSequence).map { getToneFromZhuyin(it) }

    /**
     * Converts a Zhuyin sequence to Pinyin with tone numbers.
     *
     * @param strict when `true`, an unparseable syllable makes the parser throw; when `false`,
     * such a syllable is copied to the output unchanged.
     */
    fun zhuyinToPinyinWithNumbers(zhuyin: String, strict: Boolean = true): String = convertSyllableSequence(
        input = zhuyin,
        parser = ZhuyinSyllableParser,
        formatter = PinyinNumberSyllableFormatter,
        strict = strict,
    )

    /**
     * Converts a Zhuyin sequence to Pinyin with tone marks.
     *
     * @param strict when `true`, an unparseable syllable makes the parser throw; when `false`,
     * such a syllable is copied to the output unchanged.
     */
    fun zhuyinToPinyinWithToneMarks(zhuyin: String, strict: Boolean = true): String = convertSyllableSequence(
        input = zhuyin,
        parser = ZhuyinSyllableParser,
        formatter = PinyinMarkSyllableFormatter,
        strict = strict,
    )

    /**
     * Converts a numbered-Pinyin sequence to Zhuyin.
     *
     * @param strict when `true`, an unparseable syllable makes the parser throw; when `false`,
     * such a syllable is copied to the output unchanged.
     */
    fun pinyinWithNumbersToZhuyin(pinyinWithNumbers: String, strict: Boolean = true): String =
        convertSyllableSequence(
            input = pinyinWithNumbers,
            parser = PinyinNumberSyllableParser,
            formatter = ZhuyinSyllableFormatter,
            strict = strict,
        )

    /**
     * Converts a numbered-Pinyin sequence to Pinyin with tone marks.
     *
     * @param strict when `true`, an unparseable syllable makes the parser throw; when `false`,
     * such a syllable is copied to the output unchanged.
     */
    fun pinyinWithNumbersToToneMarks(pinyinWithNumbers: String, strict: Boolean = true): String =
        convertSyllableSequence(
            input = pinyinWithNumbers,
            parser = PinyinNumberSyllableParser,
            formatter = PinyinMarkSyllableFormatter,
            strict = strict,
        )

    /** Splits [sequence] on runs of whitespace, dropping empty fragments. */
    private fun splitIntoSyllables(sequence: String): List<String> =
        sequence.trim().split(whitespaceRegex).filter { it.isNotEmpty() }

    /**
     * Parses every syllable of [input] with [parser] and renders it with [formatter].
     * An input without any syllable produces an empty string.
     */
    private fun convertSyllableSequence(
        input: String,
        parser: SyllableParser,
        formatter: SyllableFormatter,
        strict: Boolean,
    ): String = splitIntoSyllables(input).joinToString(" ") { part ->
        if (strict) {
            formatter.format(parser.parse(part))
        } else {
            // Lenient mode: keep whatever could not be parsed exactly as it was written.
            parser.parseOrNull(part)?.let { formatter.format(it) } ?: part
        }
    }
}

View File

@ -0,0 +1,38 @@
package com.marvinelsen.chinese.phonetics
import com.marvinelsen.chinese.phonetics.internal.tone.formatting.PinyinNumberToneFormatter
import com.marvinelsen.chinese.phonetics.internal.tone.formatting.PinyinToneFormatter
import com.marvinelsen.chinese.phonetics.internal.tone.formatting.ZhuyinToneFormatter
import com.marvinelsen.chinese.phonetics.internal.tone.parsing.DigitToneParser
import com.marvinelsen.chinese.phonetics.internal.tone.parsing.IntToneParser
import com.marvinelsen.chinese.phonetics.internal.tone.parsing.PinyinToneParser
import com.marvinelsen.chinese.phonetics.internal.tone.parsing.ZhuyinToneParser
/**
 * The five tones a syllable can carry, with factory functions for the supported notations.
 *
 * Each `fromX` function throws when the input is not a valid tone; the matching
 * `fromXOrNull` function returns `null` instead.
 */
enum class Tone {
    FIRST,
    SECOND,
    THIRD,
    FOURTH,
    FIFTH;

    companion object {
        /** Tone for a tone number; throws if [toneNumber] is not accepted by the parser. */
        fun fromInt(toneNumber: Int): Tone = IntToneParser.parse(toneNumber)

        /** Tone for a tone number, or `null` if [toneNumber] is not a valid tone. */
        fun fromIntOrNull(toneNumber: Int): Tone? = IntToneParser.parseOrNull(toneNumber)

        /** Tone for a digit character; throws if [digit] is not accepted by the parser. */
        fun fromDigit(digit: Char): Tone = DigitToneParser.parse(digit)

        /** Tone for a digit character, or `null` if [digit] is not a valid tone. */
        fun fromDigitOrNull(digit: Char): Tone? = DigitToneParser.parseOrNull(digit)

        /** Tone for a Pinyin tone mark; throws if [pinyinTone] is not accepted by the parser. */
        fun fromPinyinTone(pinyinTone: Char): Tone = PinyinToneParser.parse(pinyinTone)

        /** Tone for a Pinyin tone mark, or `null` if [pinyinTone] is not a valid mark. */
        fun fromPinyinToneOrNull(pinyinTone: Char): Tone? = PinyinToneParser.parseOrNull(pinyinTone)

        /** Tone for a Zhuyin tone mark; throws if [zhuyinTone] is not accepted by the parser. */
        fun fromZhuyinTone(zhuyinTone: Char): Tone = ZhuyinToneParser.parse(zhuyinTone)

        /** Tone for a Zhuyin tone mark, or `null` if [zhuyinTone] is not a valid mark. */
        fun fromZhuyinToneOrNull(zhuyinTone: Char): Tone? = ZhuyinToneParser.parseOrNull(zhuyinTone)
    }
}
/** Returns the tone number of this tone: 1 for [Tone.FIRST] through 5 for [Tone.FIFTH]. */
@Suppress("MagicNumber")
fun Tone.toInt(): Int {
    return when (this) {
        Tone.FIRST -> 1
        Tone.SECOND -> 2
        Tone.THIRD -> 3
        Tone.FOURTH -> 4
        Tone.FIFTH -> 5
    }
}
/** Renders this tone as the digit string produced by [PinyinNumberToneFormatter]. */
fun Tone.toPinyinNumber(): String {
    return PinyinNumberToneFormatter.format(this)
}

/** Renders this tone as the Pinyin tone mark produced by [PinyinToneFormatter]. */
fun Tone.toPinyinTone(): String {
    return PinyinToneFormatter.format(this)
}

/** Renders this tone as the Zhuyin tone mark produced by [ZhuyinToneFormatter]. */
fun Tone.toZhuyinTone(): String {
    return ZhuyinToneFormatter.format(this)
}

View File

@ -0,0 +1,5 @@
package com.marvinelsen.chinese.phonetics
/** Constants related to Zhuyin text. */
object Zhuyin {
    /** Separator string exposed for Zhuyin text: a single space. */
    const val SEPARATOR: String = " "
}

View File

@ -0,0 +1,3 @@
package com.marvinelsen.chinese.phonetics.exceptions
/**
 * Signals that a string could not be parsed as a syllable.
 *
 * Extends [IllegalArgumentException], so handlers written for that type also catch it.
 */
class InvalidSyllableInputException(message: String) : IllegalArgumentException(message)

View File

@ -0,0 +1,3 @@
package com.marvinelsen.chinese.phonetics.exceptions
/**
 * Signals that a value could not be parsed as a tone.
 *
 * Extends [IllegalArgumentException], so handlers written for that type also catch it.
 */
class InvalidToneInputException(message: String) : IllegalArgumentException(message)

View File

@ -0,0 +1,8 @@
package com.marvinelsen.chinese.phonetics.internal.syllable
import com.marvinelsen.chinese.phonetics.Tone
/**
 * A single syllable, stored as its toneless Pinyin spelling plus its tone.
 *
 * @property basePinyin the Pinyin spelling without any tone number or tone mark
 * @property tone the tone carried by the syllable
 */
internal data class Syllable(
val basePinyin: String,
val tone: Tone
)

View File

@ -0,0 +1,36 @@
package com.marvinelsen.chinese.phonetics.internal.syllable
import java.io.IOException
/**
 * Loads the Pinyin-Zhuyin transcription table from the classpath and exposes it in both
 * directions. Intended for internal use by parsers and formatters.
 *
 * The table is read once, when this object is first initialized.
 */
internal object TranscriptionDataRepository {
    private const val TRANSCRIPTION_RESOURCE_PATH = "/pinyin_zhuyin_transcriptions.tsv"

    /** Every data line must provide at least a Pinyin column and a Zhuyin column. */
    private const val REQUIRED_COLUMN_COUNT = 2

    /** Toneless Pinyin syllable -> Zhuyin spelling, in file order. */
    val pinyinToZhuyin: Map<String, String> = loadTranscriptions()

    /**
     * Zhuyin spelling -> toneless Pinyin syllable.
     *
     * Several Pinyin spellings can share one Zhuyin spelling. The first spelling listed in the
     * table wins, so a later alternative spelling cannot shadow an earlier one.
     * NOTE(review): assumes the table lists the preferred spelling first - confirm against the TSV.
     */
    val zhuyinToPinyin: Map<String, String> = buildMap {
        pinyinToZhuyin.forEach { (pinyin, zhuyin) -> getOrPut(zhuyin) { pinyin } }
    }

    /** Returns `true` if [zhuyin] (without tone mark) is present in the table. */
    fun isValidZhuyin(zhuyin: String): Boolean = zhuyin in zhuyinToPinyin

    /** Returns `true` if [pinyin] (without tone) is present in the table. */
    fun isValidPinyin(pinyin: String): Boolean = pinyin in pinyinToZhuyin

    /** Lowercases [pinyin] and rewrites the `v` and `u:` spellings to `ü`. */
    fun normalize(pinyin: String): String = pinyin.lowercase()
        .replace("v", "ü")
        .replace("u:", "ü")

    /**
     * Reads the tab-separated resource into a map of first column to second column.
     * Blank lines are skipped; a line with fewer than two columns fails with a descriptive error
     * instead of an index-out-of-bounds exception.
     *
     * @throws IllegalStateException if the resource is missing or a line is malformed
     * @throws IOException if reading the resource fails
     */
    private fun loadTranscriptions(): Map<String, String> {
        val inputStream = this::class.java.getResourceAsStream(TRANSCRIPTION_RESOURCE_PATH)
            ?: error("Cannot find transcription resource: $TRANSCRIPTION_RESOURCE_PATH")

        return try {
            inputStream.bufferedReader().useLines { lines ->
                lines.filter { it.isNotBlank() }
                    .associate { line ->
                        val columns = line.split('\t')
                        check(columns.size >= REQUIRED_COLUMN_COUNT) {
                            "Malformed line in $TRANSCRIPTION_RESOURCE_PATH: '$line'"
                        }
                        columns[0] to columns[1]
                    }
            }
        } catch (e: IOException) {
            throw IOException("Failed to load transcription data from $TRANSCRIPTION_RESOURCE_PATH", e)
        }
    }
}

View File

@ -0,0 +1,38 @@
package com.marvinelsen.chinese.phonetics.internal.syllable.formatting
import com.marvinelsen.chinese.phonetics.Tone
import com.marvinelsen.chinese.phonetics.internal.syllable.Syllable
import com.marvinelsen.chinese.phonetics.toPinyinTone
/**
 * Formats a [Syllable] as Pinyin with a combining tone mark inserted after the letter that
 * carries the tone. Fifth-tone syllables are returned without any mark.
 */
internal data object PinyinMarkSyllableFormatter : SyllableFormatter {
    /** Vowels that always take the tone mark when present, in descending priority. */
    private const val DOMINANT_VOWELS = "aoe"

    /** Consonants that carry the tone mark in syllables without any vowel (e.g. "m", "ng"). */
    private const val SYLLABIC_NASALS = "mn"

    /**
     * Returns the index of the letter in [pinyin] that receives the tone mark.
     *
     * Priority: `a`, `o`, `e`; then `i`/`u` (in "iu" the mark goes on the `u`); then `ü`.
     * Syllables without any vowel fall back to their first `m` or `n`, so that syllabic nasals
     * such as "m" can be formatted instead of failing.
     */
    private fun findVowelIndexForToneMark(pinyin: String): Int {
        val dominantVowel = DOMINANT_VOWELS.firstOrNull { it in pinyin }
        if (dominantVowel != null) return pinyin.lastIndexOf(dominantVowel)

        val indexOfI = pinyin.lastIndexOf('i')
        val indexOfU = pinyin.lastIndexOf('u')
        val indexOfUmlaut = pinyin.lastIndexOf('ü')
        val indexOfNasal = pinyin.indexOfFirst { it in SYLLABIC_NASALS }

        return when {
            indexOfI >= 0 -> if (pinyin.getOrNull(indexOfI + 1) == 'u') indexOfU else indexOfI
            indexOfU >= 0 -> indexOfU
            indexOfUmlaut >= 0 -> indexOfUmlaut
            indexOfNasal >= 0 -> indexOfNasal
            else -> error("No vowel found in Pinyin syllable '$pinyin'")
        }
    }

    /**
     * Returns the base Pinyin of [syllable] with its tone mark inserted.
     *
     * @throws IllegalStateException if the base Pinyin has neither a vowel nor a syllabic nasal
     * and the tone is not [Tone.FIFTH]
     */
    override fun format(syllable: Syllable): String {
        if (syllable.tone == Tone.FIFTH) return syllable.basePinyin

        val markedLetterIndex = findVowelIndexForToneMark(syllable.basePinyin)
        // The tone mark is a combining character, so it is placed directly after its letter.
        return StringBuilder(syllable.basePinyin)
            .insert(markedLetterIndex + 1, syllable.tone.toPinyinTone())
            .toString()
    }
}

View File

@ -0,0 +1,8 @@
package com.marvinelsen.chinese.phonetics.internal.syllable.formatting
import com.marvinelsen.chinese.phonetics.internal.syllable.Syllable
import com.marvinelsen.chinese.phonetics.toPinyinNumber
/** Formats a [Syllable] as its base Pinyin immediately followed by the tone number. */
internal data object PinyinNumberSyllableFormatter : SyllableFormatter {
    override fun format(syllable: Syllable): String {
        val toneNumber = syllable.tone.toPinyinNumber()
        return "${syllable.basePinyin}$toneNumber"
    }
}

View File

@ -0,0 +1,7 @@
package com.marvinelsen.chinese.phonetics.internal.syllable.formatting
import com.marvinelsen.chinese.phonetics.internal.syllable.Syllable
/** Contract for formatters that render a [Syllable] as text in one notation. */
internal sealed interface SyllableFormatter {
/** Returns the textual form of [syllable] in this formatter's notation. */
fun format(syllable: Syllable): String
}

View File

@ -0,0 +1,18 @@
package com.marvinelsen.chinese.phonetics.internal.syllable.formatting
import com.marvinelsen.chinese.phonetics.Tone
import com.marvinelsen.chinese.phonetics.internal.syllable.Syllable
import com.marvinelsen.chinese.phonetics.internal.syllable.TranscriptionDataRepository
import com.marvinelsen.chinese.phonetics.toZhuyinTone
/**
 * Formats a [Syllable] as Zhuyin. The fifth-tone mark is written before the syllable;
 * every other tone mark is written after it.
 */
internal data object ZhuyinSyllableFormatter : SyllableFormatter {
    /**
     * Returns the Zhuyin spelling of [syllable] including its tone mark.
     *
     * @throws IllegalStateException if the base Pinyin is not in the transcription table
     */
    override fun format(syllable: Syllable): String {
        // Fail with a message naming the offending syllable rather than a bare NullPointerException.
        val zhuyinBase = checkNotNull(TranscriptionDataRepository.pinyinToZhuyin[syllable.basePinyin]) {
            "'${syllable.basePinyin}' is not a valid Pinyin syllable"
        }
        val zhuyinToneMark = syllable.tone.toZhuyinTone()

        return when (syllable.tone) {
            Tone.FIFTH -> zhuyinToneMark + zhuyinBase
            Tone.FIRST, Tone.SECOND, Tone.THIRD, Tone.FOURTH -> zhuyinBase + zhuyinToneMark
        }
    }
}

View File

@ -0,0 +1,25 @@
package com.marvinelsen.chinese.phonetics.internal.syllable.parsing
import com.marvinelsen.chinese.phonetics.Tone
import com.marvinelsen.chinese.phonetics.internal.syllable.Syllable
import com.marvinelsen.chinese.phonetics.internal.syllable.TranscriptionDataRepository
/**
 * Parses one Pinyin syllable written with a trailing tone digit, e.g. "sheng1".
 * The spelling is normalized before it is looked up in the transcription table.
 */
internal data object PinyinNumberSyllableParser : SyllableParser {
    /**
     * Returns the parsed syllable, or `null` when [input] is empty, does not end in a valid
     * tone digit, or its spelling is not a known Pinyin syllable.
     */
    @Suppress("ReturnCount", "MagicNumber")
    override fun parseOrNull(input: String): Syllable? {
        val toneDigit = input.lastOrNull()?.takeIf { it.isDigit() } ?: return null
        val parsedTone = Tone.fromDigitOrNull(toneDigit) ?: return null

        val spelling = TranscriptionDataRepository.normalize(input.dropLast(1))
        return spelling
            .takeIf { TranscriptionDataRepository.isValidPinyin(it) }
            ?.let { Syllable(basePinyin = it, tone = parsedTone) }
    }
}

View File

@ -0,0 +1,10 @@
package com.marvinelsen.chinese.phonetics.internal.syllable.parsing
import com.marvinelsen.chinese.phonetics.exceptions.InvalidSyllableInputException
import com.marvinelsen.chinese.phonetics.internal.syllable.Syllable
/** Contract for parsers that turn the textual form of one syllable into a [Syllable]. */
internal sealed interface SyllableParser {
/** Parses [input], returning `null` when it is not a valid syllable for this parser. */
fun parseOrNull(input: String): Syllable?
/**
 * Parses [input] via [parseOrNull].
 *
 * @throws InvalidSyllableInputException if [parseOrNull] returns `null`
 */
fun parse(input: String) = parseOrNull(input)
?: throw InvalidSyllableInputException("Invalid input for syllable parsing: '$input'")
}

View File

@ -0,0 +1,20 @@
package com.marvinelsen.chinese.phonetics.internal.syllable.parsing
import com.marvinelsen.chinese.phonetics.Tone
import com.marvinelsen.chinese.phonetics.internal.syllable.Syllable
import com.marvinelsen.chinese.phonetics.internal.syllable.TranscriptionDataRepository
/**
 * Parses one Zhuyin syllable with an optional tone mark at its end or its start.
 * A syllable without a tone mark is read as [Tone.FIRST].
 */
internal data object ZhuyinSyllableParser : SyllableParser {
    /** Characters recognized as Zhuyin tone marks. */
    private const val TONE_MARKS = "ˊˇˋ˙¯"

    /**
     * Returns the parsed syllable, or `null` when [input] is malformed.
     *
     * Input is rejected when it contains more than one tone mark, when its tone mark is neither
     * the first nor the last character, or when the remaining letters are not a known Zhuyin
     * syllable. A trailing mark takes precedence over a leading one.
     */
    @Suppress("ReturnCount")
    override fun parseOrNull(input: String): Syllable? {
        val toneMarkCount = input.count { it in TONE_MARKS }
        if (toneMarkCount > 1) return null

        val trailingTone = input.lastOrNull()?.let { Tone.fromZhuyinToneOrNull(it) }
        val leadingTone = input.firstOrNull()?.let { Tone.fromZhuyinToneOrNull(it) }
        val markedTone = trailingTone ?: leadingTone
        // A mark is present but sits inside the syllable: previously this silently became FIRST.
        if (toneMarkCount == 1 && markedTone == null) return null

        val zhuyinWithoutToneMark = input.filterNot { it in TONE_MARKS }
        val basePinyin = TranscriptionDataRepository.zhuyinToPinyin[zhuyinWithoutToneMark] ?: return null

        return Syllable(
            basePinyin = basePinyin,
            tone = markedTone ?: Tone.FIRST,
        )
    }
}

View File

@ -0,0 +1,13 @@
package com.marvinelsen.chinese.phonetics.internal.tone.formatting
import com.marvinelsen.chinese.phonetics.Tone
/** Renders a [Tone] as its Pinyin tone digit, "1" through "5". */
internal data object PinyinNumberToneFormatter : ToneFormatter {
    override fun format(tone: Tone): String {
        return when (tone) {
            Tone.FIRST -> "1"
            Tone.SECOND -> "2"
            Tone.THIRD -> "3"
            Tone.FOURTH -> "4"
            Tone.FIFTH -> "5"
        }
    }
}

View File

@ -0,0 +1,13 @@
package com.marvinelsen.chinese.phonetics.internal.tone.formatting
import com.marvinelsen.chinese.phonetics.Tone
/**
 * Renders a [Tone] as a Unicode combining mark for Pinyin.
 * The fifth tone is rendered as an empty string.
 */
internal data object PinyinToneFormatter : ToneFormatter {
    override fun format(tone: Tone): String {
        return when (tone) {
            Tone.FIRST -> "\u0304"
            Tone.SECOND -> "\u0301"
            Tone.THIRD -> "\u030C"
            Tone.FOURTH -> "\u0300"
            Tone.FIFTH -> ""
        }
    }
}

View File

@ -0,0 +1,7 @@
package com.marvinelsen.chinese.phonetics.internal.tone.formatting
import com.marvinelsen.chinese.phonetics.Tone
/** Contract for formatters that render a [Tone] as text in one notation. */
internal sealed interface ToneFormatter {
/** Returns the textual form of [tone] in this formatter's notation. */
fun format(tone: Tone): String
}

View File

@ -0,0 +1,13 @@
package com.marvinelsen.chinese.phonetics.internal.tone.formatting
import com.marvinelsen.chinese.phonetics.Tone
/**
 * Renders a [Tone] as its Zhuyin tone mark.
 * The first tone is rendered as an empty string.
 */
internal data object ZhuyinToneFormatter : ToneFormatter {
    override fun format(tone: Tone): String {
        return when (tone) {
            Tone.FIRST -> ""
            Tone.SECOND -> "ˊ"
            Tone.THIRD -> "ˇ"
            Tone.FOURTH -> "ˋ"
            Tone.FIFTH -> "˙"
        }
    }
}

View File

@ -0,0 +1,14 @@
package com.marvinelsen.chinese.phonetics.internal.tone.parsing
import com.marvinelsen.chinese.phonetics.Tone
/** Parses the digit characters '1' through '5' into a [Tone]. */
internal data object DigitToneParser : ToneParser<Char> {
    /** Lookup table from tone digit to tone. */
    private val tonesByDigit = mapOf(
        '1' to Tone.FIRST,
        '2' to Tone.SECOND,
        '3' to Tone.THIRD,
        '4' to Tone.FOURTH,
        '5' to Tone.FIFTH,
    )

    /** Returns the tone for [input], or `null` for any other character. */
    override fun parseOrNull(input: Char): Tone? = tonesByDigit[input]
}

View File

@ -0,0 +1,15 @@
package com.marvinelsen.chinese.phonetics.internal.tone.parsing
import com.marvinelsen.chinese.phonetics.Tone
/** Parses the integers 1 through 5 into a [Tone]. */
@Suppress("MagicNumber")
internal data object IntToneParser : ToneParser<Int> {
    /** Lookup table from tone number to tone. */
    private val tonesByNumber = mapOf(
        1 to Tone.FIRST,
        2 to Tone.SECOND,
        3 to Tone.THIRD,
        4 to Tone.FOURTH,
        5 to Tone.FIFTH,
    )

    /** Returns the tone for [input], or `null` for any other number. */
    override fun parseOrNull(input: Int): Tone? = tonesByNumber[input]
}

View File

@ -0,0 +1,13 @@
package com.marvinelsen.chinese.phonetics.internal.tone.parsing
import com.marvinelsen.chinese.phonetics.Tone
/**
 * Parses a Unicode combining tone mark into a [Tone].
 * Only the marks of the first four tones are recognized; no character maps to the fifth tone.
 */
internal data object PinyinToneParser : ToneParser<Char> {
    /** Returns the tone for [input], or `null` if it is not one of the four recognized marks. */
    override fun parseOrNull(input: Char): Tone? {
        return when (input) {
            '\u0304' -> Tone.FIRST
            '\u0301' -> Tone.SECOND
            '\u030C' -> Tone.THIRD
            '\u0300' -> Tone.FOURTH
            else -> null
        }
    }
}

View File

@ -0,0 +1,10 @@
package com.marvinelsen.chinese.phonetics.internal.tone.parsing
import com.marvinelsen.chinese.phonetics.Tone
import com.marvinelsen.chinese.phonetics.exceptions.InvalidToneInputException
/** Contract for parsers that turn a value of type [T] into a [Tone]. */
internal sealed interface ToneParser<T> {
/** Parses [input], returning `null` when it does not denote a tone for this parser. */
fun parseOrNull(input: T): Tone?
/**
 * Parses [input] via [parseOrNull].
 *
 * @throws InvalidToneInputException if [parseOrNull] returns `null`
 */
fun parse(input: T) = parseOrNull(input)
?: throw InvalidToneInputException("Invalid input for tone parsing: '$input'")
}

View File

@ -0,0 +1,14 @@
package com.marvinelsen.chinese.phonetics.internal.tone.parsing
import com.marvinelsen.chinese.phonetics.Tone
/** Parses a Zhuyin tone mark character into a [Tone]. */
internal data object ZhuyinToneParser : ToneParser<Char> {
    /** Returns the tone for [input], or `null` if it is not a recognized Zhuyin tone mark. */
    override fun parseOrNull(input: Char): Tone? {
        return when (input) {
            '¯' -> Tone.FIRST
            'ˊ' -> Tone.SECOND
            'ˇ' -> Tone.THIRD
            'ˋ' -> Tone.FOURTH
            '˙' -> Tone.FIFTH
            else -> null
        }
    }
}

View File

@ -1,119 +0,0 @@
package com.marvinelsen.chinese.transliteration.api
import java.io.InputStream
@Suppress("MagicNumber", "MaximumLineLength", "MaxLineLength")
data class PinyinSyllable(
val pinyinSyllableWithoutTone: String,
val tone: Tone,
) {
companion object {
private val pinyinToZhuyin = parseTranscriptions(
this::class.java.getResourceAsStream("/pinyin_zhuyin_transcriptions.tsv")!!
)
private val zhuyinToPinyin = pinyinToZhuyin.entries.associate { it.value to it.key }
private val zhuyinToneMarkRegex = """[ˊˇˋ˙]""".toRegex()
fun isValidPinyinWithToneNumberSyllable(pinyinSyllable: String) =
pinyinSyllable.last().isDigit() && pinyinSyllable.last().digitToInt() in 1..5 && pinyinSyllable
.substring(0, pinyinSyllable.lastIndex)
.lowercase() in pinyinToZhuyin
fun fromPinyinWithToneNumber(pinyinWithToneNumber: String): PinyinSyllable {
val pinyinWithoutNumber = pinyinWithToneNumber.substring(0, pinyinWithToneNumber.lastIndex)
val lastCharacter = pinyinWithToneNumber.last()
require(lastCharacter.isDigit()) {
"'$pinyinWithToneNumber' is not a valid Pinyin with tone number syllable. Expected the last character to be a digit, but was '${pinyinWithToneNumber.last()}'"
}
require(lastCharacter.digitToInt() in 1..5) {
"'$pinyinWithToneNumber' is not a valid Pinyin with tone number syllable. Expected the tone number 'n' to be in range 1 <= n <= 5, but was '${pinyinWithToneNumber.last()}'"
}
require(
pinyinWithoutNumber.lowercase() in pinyinToZhuyin
) { "'$pinyinWithoutNumber' is not a valid Pinyin syllable." }
return PinyinSyllable(
pinyinSyllableWithoutTone = pinyinWithoutNumber,
tone = Tone.fromDigit(lastCharacter)
)
}
fun fromZhuyin(zhuyin: String): PinyinSyllable {
val zhuyinWithoutToneMark = zhuyin.replace(zhuyinToneMarkRegex, "")
require(zhuyinWithoutToneMark in zhuyinToPinyin) { "'$zhuyin' is not a valid Zhuyin syllable." }
return PinyinSyllable(
zhuyinToPinyin[zhuyinWithoutToneMark]!!,
Tone.fromZhuyinToneMarkOrNull(zhuyin.last()) ?: Tone.fromZhuyinToneMarkOrNull(zhuyin.first())
?: Tone.FIRST
)
}
private fun parseTranscriptions(inputStream: InputStream) =
inputStream.bufferedReader().useLines { lines ->
lines.map { it.split('\t') }
.associate { it[0] to it[1] }
}
}
fun format(transliterationSystem: TransliterationSystem) = when (transliterationSystem) {
TransliterationSystem.ZHUYIN -> formatToZhuyin()
TransliterationSystem.PINYIN_WITH_TONE_NUMBERS -> formatToPinyinWithToneNumbers()
TransliterationSystem.PINYIN_WITH_TONE_MARKS -> formatToPinyinWithToneMarks()
}
private fun formatToZhuyin(): String {
val zhuyinSyllable = pinyinToZhuyin[pinyinSyllableWithoutTone.lowercase()]
?: error("$pinyinSyllableWithoutTone is not a valid Pinyin syllable")
val zhuyinToneMark = tone.format(TransliterationSystem.ZHUYIN)
return when (tone) {
Tone.FIRST, Tone.SECOND, Tone.THIRD, Tone.FORTH -> zhuyinSyllable + zhuyinToneMark
Tone.FIFTH -> zhuyinToneMark + zhuyinSyllable
}
}
private fun formatToPinyinWithToneNumbers(): String {
check(pinyinSyllableWithoutTone.lowercase() in pinyinToZhuyin) {
"'$pinyinSyllableWithoutTone is not a valid Pinyin syllable."
}
return pinyinSyllableWithoutTone + tone.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS)
}
private fun formatToPinyinWithToneMarks(): String {
check(pinyinSyllableWithoutTone.lowercase() in pinyinToZhuyin) {
"'$pinyinSyllableWithoutTone is not a valid Pinyin syllable."
}
if (pinyinSyllableWithoutTone.lowercase() == "r" && tone == Tone.FIFTH) {
return pinyinSyllableWithoutTone
}
val sanitizedPinyinSyllableWithoutTone = pinyinSyllableWithoutTone.replace("v", "ü").replace("u:", "ü")
val characterToIndex = sanitizedPinyinSyllableWithoutTone.lowercase().withIndex().associate { it.value to it.index }
val vowelIndex = when {
'a' in characterToIndex -> characterToIndex['a']!!
'o' in characterToIndex -> characterToIndex['o']!!
'e' in characterToIndex -> characterToIndex['e']!!
'i' in characterToIndex ->
if (sanitizedPinyinSyllableWithoutTone.elementAtOrNull(characterToIndex['i']!! + 1) == 'u') {
characterToIndex['u']!!
} else {
characterToIndex['i']!!
}
'u' in characterToIndex -> characterToIndex['u']!!
'ü' in characterToIndex -> characterToIndex['ü']!!
else -> error("No vowel found in Pinyin syllable '$sanitizedPinyinSyllableWithoutTone'")
}
return buildString {
append(sanitizedPinyinSyllableWithoutTone)
insert(vowelIndex + 1, tone.format(TransliterationSystem.PINYIN_WITH_TONE_MARKS))
}
}
}

View File

@ -1,81 +0,0 @@
package com.marvinelsen.chinese.transliteration.api
@Suppress("MagicNumber")
enum class Tone {
FIRST, SECOND, THIRD, FORTH, FIFTH;
companion object {
fun fromInt(number: Int) =
fromIntOrNull(number) ?: throw IllegalArgumentException("Number $number is not a valid tone")
fun fromIntOrNull(number: Int) = when (number) {
1 -> FIRST
2 -> SECOND
3 -> THIRD
4 -> FORTH
5 -> FIFTH
else -> null
}
fun fromDigit(digit: Char) =
fromDigitOrNull(digit) ?: throw IllegalArgumentException("Digit $digit is not a valid tone")
fun fromDigitOrNull(digit: Char) = when (digit) {
'1' -> FIRST
'2' -> SECOND
'3' -> THIRD
'4' -> FORTH
'5' -> FIFTH
else -> null
}
fun fromZhuyinToneMark(zhuyinToneMark: Char) = fromZhuyinToneMarkOrNull(zhuyinToneMark)
?: throw IllegalArgumentException("Invalid zhuyin tone mark '$zhuyinToneMark'")
fun fromZhuyinToneMarkOrNull(zhuyinToneMark: Char) = when (zhuyinToneMark) {
'ˊ' -> SECOND
'ˇ' -> THIRD
'ˋ' -> FORTH
'˙' -> FIFTH
else -> null
}
}
fun toInt() = when (this) {
FIRST -> 1
SECOND -> 2
THIRD -> 3
FORTH -> 4
FIFTH -> 5
}
fun format(transliterationSystem: TransliterationSystem) = when (transliterationSystem) {
TransliterationSystem.ZHUYIN -> formatToZhuyin()
TransliterationSystem.PINYIN_WITH_TONE_NUMBERS -> formatToPinyinWithToneNumbers()
TransliterationSystem.PINYIN_WITH_TONE_MARKS -> formatToPinyinWithToneMarks()
}
private fun formatToPinyinWithToneNumbers() = when (this) {
FIRST -> "1"
SECOND -> "2"
THIRD -> "3"
FORTH -> "4"
FIFTH -> "5"
}
private fun formatToPinyinWithToneMarks() = when (this) {
FIRST -> "\u0304"
SECOND -> "\u0301"
THIRD -> "\u030C"
FORTH -> "\u0300"
FIFTH -> ""
}
private fun formatToZhuyin() = when (this) {
FIRST -> ""
SECOND -> "ˊ"
THIRD -> "ˇ"
FORTH -> "ˋ"
FIFTH -> "˙"
}
}

View File

@ -1,5 +0,0 @@
package com.marvinelsen.chinese.transliteration.api
enum class TransliterationSystem {
ZHUYIN, PINYIN_WITH_TONE_NUMBERS, PINYIN_WITH_TONE_MARKS
}

View File

@ -1,5 +0,0 @@
package com.marvinelsen.chinese.transliteration.api
object Zhuyin {
const val SEPARATOR = " "
}

View File

@ -195,17 +195,9 @@ lu ㄌㄨ
luan ㄌㄨㄢ
lun ㄌㄨㄣ
luo ㄌㄨㄛ
lu: ㄌㄩ
lv ㄌㄩ
lü ㄌㄩ
lu:e ㄌㄩㄝ
lve ㄌㄩㄝ
lüe ㄌㄩㄝ
lu:n ㄌㄩㄣ
lvn ㄌㄩㄣ
lün ㄌㄩㄣ
lu:an ㄌㄩㄢ
lvan ㄌㄩㄢ
lüan ㄌㄩㄢ
m ㄇ
ma ㄇㄚ
@ -251,11 +243,7 @@ nu ㄋㄨ
nuan ㄋㄨㄢ
nun ㄋㄨㄣ
nuo ㄋㄨㄛ
nu: ㄋㄩ
nv ㄋㄩ
nü ㄋㄩ
nu:e ㄋㄩㄝ
nve ㄋㄩㄝ
nüe ㄋㄩㄝ
o ㄛ
ou ㄡ

1 a
195 luan ㄌㄨㄢ
196 lun ㄌㄨㄣ
197 luo ㄌㄨㄛ
lu: ㄌㄩ
lv ㄌㄩ
198 lü ㄌㄩ
lu:e ㄌㄩㄝ
lve ㄌㄩㄝ
199 lüe ㄌㄩㄝ
lu:n ㄌㄩㄣ
lvn ㄌㄩㄣ
200 lün ㄌㄩㄣ
lu:an ㄌㄩㄢ
lvan ㄌㄩㄢ
201 lüan ㄌㄩㄢ
202 m
203 ma ㄇㄚ
243 nuan ㄋㄨㄢ
244 nun ㄋㄨㄣ
245 nuo ㄋㄨㄛ
nu: ㄋㄩ
nv ㄋㄩ
246 nü ㄋㄩ
nu:e ㄋㄩㄝ
nve ㄋㄩㄝ
247 nüe ㄋㄩㄝ
248 o
249 ou

View File

@ -0,0 +1,113 @@
package com.marvinelsen.chinese.phonetics
import com.marvinelsen.chinese.phonetics.internal.syllable.Syllable
import com.marvinelsen.chinese.phonetics.internal.syllable.formatting.PinyinMarkSyllableFormatter
import com.marvinelsen.chinese.phonetics.internal.syllable.formatting.PinyinNumberSyllableFormatter
import com.marvinelsen.chinese.phonetics.internal.syllable.formatting.ZhuyinSyllableFormatter
import com.marvinelsen.chinese.phonetics.internal.syllable.parsing.PinyinNumberSyllableParser
import com.marvinelsen.chinese.phonetics.internal.syllable.parsing.ZhuyinSyllableParser
import io.kotest.core.spec.style.ShouldSpec
import io.kotest.datatest.withData
import io.kotest.matchers.shouldBe
/**
 * Data-driven tests for the syllable parsers and formatters.
 *
 * Expected tone-mark output spells the combining mark as a `\u` escape directly after the
 * letter it follows, because the formatter inserts a combining character rather than
 * producing a precomposed letter.
 */
class ChinesePhoneticsTest : ShouldSpec({
    context("from pinyin with tone numbers") {
        withData(
            nameFn = { "'${it.first}' -> ${it.second}" },
            "sheng1" to Syllable("sheng", Tone.FIRST),
            "zhi2" to Syllable("zhi", Tone.SECOND),
            "ka3" to Syllable("ka", Tone.THIRD),
            "yao4" to Syllable("yao", Tone.FOURTH),
            "me5" to Syllable("me", Tone.FIFTH),
            "Me5" to Syllable("me", Tone.FIFTH),
            // All three spellings of the umlaut normalize to the same base syllable.
            "nv3" to Syllable("nü", Tone.THIRD),
            "nü3" to Syllable("nü", Tone.THIRD),
            "nu:3" to Syllable("nü", Tone.THIRD),
            "r5" to Syllable("r", Tone.FIFTH),
            "R5" to Syllable("r", Tone.FIFTH),
            "er2" to Syllable("er", Tone.SECOND),
            "Er2" to Syllable("er", Tone.SECOND),
        ) { (pinyinWithNumber, expectedSyllable) ->
            PinyinNumberSyllableParser.parse(pinyinWithNumber) shouldBe expectedSyllable
        }
    }

    context("from zhuyin") {
        withData(
            nameFn = { "'${it.first}' -> ${it.second}" },
            "ㄕㄥ" to Syllable("sheng", Tone.FIRST),
            "ㄓˊ" to Syllable("zhi", Tone.SECOND),
            "ㄎㄚˇ" to Syllable("ka", Tone.THIRD),
            "ㄧㄠˋ" to Syllable("yao", Tone.FOURTH),
            "ㄇㄜ˙" to Syllable("me", Tone.FIFTH),
            "˙ㄇㄜ" to Syllable("me", Tone.FIFTH),
            "ㄋㄩˇ" to Syllable("nü", Tone.THIRD),
        ) { (zhuyin, expectedSyllable) ->
            ZhuyinSyllableParser.parse(zhuyin) shouldBe expectedSyllable
        }
    }

    context("from invalid pinyin with tone numbers") {
        withData(
            nameFn = { "'$it' -> null" },
            "",
            "sheng",
            "sheng0",
            "sheng6",
            "xyz1",
        ) { invalidPinyin ->
            PinyinNumberSyllableParser.parseOrNull(invalidPinyin) shouldBe null
        }
    }

    context("from invalid zhuyin") {
        withData(
            nameFn = { "'$it' -> null" },
            "abc",
            "sheng1",
        ) { invalidZhuyin ->
            ZhuyinSyllableParser.parseOrNull(invalidZhuyin) shouldBe null
        }
    }

    context("format to zhuyin") {
        withData(
            nameFn = { "${it.first} -> ${it.second}" },
            Syllable("sheng", Tone.FIRST) to "ㄕㄥ",
            Syllable("zhi", Tone.SECOND) to "ㄓˊ",
            Syllable("ka", Tone.THIRD) to "ㄎㄚˇ",
            Syllable("yao", Tone.FOURTH) to "ㄧㄠˋ",
            Syllable("me", Tone.FIFTH) to "˙ㄇㄜ",
            Syllable("nü", Tone.THIRD) to "ㄋㄩˇ",
            Syllable("r", Tone.FIFTH) to "˙ㄦ",
            Syllable("er", Tone.SECOND) to "ㄦˊ",
        ) { (syllable, expectedZhuyin) ->
            ZhuyinSyllableFormatter.format(syllable) shouldBe expectedZhuyin
        }
    }

    context("format to pinyin with tone numbers") {
        withData(
            nameFn = { "${it.first} -> ${it.second}" },
            Syllable("sheng", Tone.FIRST) to "sheng1",
            Syllable("zhi", Tone.SECOND) to "zhi2",
            Syllable("ka", Tone.THIRD) to "ka3",
            Syllable("yao", Tone.FOURTH) to "yao4",
            Syllable("me", Tone.FIFTH) to "me5",
            Syllable("nü", Tone.THIRD) to "nü3",
            Syllable("r", Tone.FIFTH) to "r5",
            Syllable("er", Tone.SECOND) to "er2",
        ) { (syllable, expectedPinyinWithToneNumbers) ->
            PinyinNumberSyllableFormatter.format(syllable) shouldBe expectedPinyinWithToneNumbers
        }
    }

    context("format to pinyin with tone marks") {
        withData(
            nameFn = { "${it.first} -> ${it.second}" },
            Syllable("sheng", Tone.FIRST) to "she\u0304ng",
            Syllable("zhi", Tone.SECOND) to "zhi\u0301",
            Syllable("ka", Tone.THIRD) to "ka\u030C",
            Syllable("yao", Tone.FOURTH) to "ya\u0300o",
            Syllable("me", Tone.FIFTH) to "me",
            Syllable("zhui", Tone.FIRST) to "zhui\u0304",
            Syllable("liu", Tone.FIRST) to "liu\u0304",
            Syllable("nü", Tone.THIRD) to "nü\u030C",
            Syllable("r", Tone.FIFTH) to "r",
            Syllable("er", Tone.SECOND) to "e\u0301r",
        ) { (syllable, expectedPinyinWithToneMarks) ->
            PinyinMarkSyllableFormatter.format(syllable) shouldBe expectedPinyinWithToneMarks
        }
    }

    context("format to zhuyin with invalid pinyin syllable") {
    }

    context("format to pinyin with tone diacritics with invalid pinyin syllable") {
    }

    context("format to pinyin with tone numbers with invalid pinyin syllable") {
    }
})

View File

@ -1,5 +1,6 @@
package com.marvinelsen.chinese.transliteration.api
package com.marvinelsen.chinese.phonetics
import com.marvinelsen.chinese.phonetics.exceptions.InvalidToneInputException
import io.kotest.assertions.throwables.shouldThrow
import io.kotest.core.spec.style.ShouldSpec
import io.kotest.datatest.withData
@ -13,7 +14,7 @@ class ToneTest : ShouldSpec({
'1' to Tone.FIRST,
'2' to Tone.SECOND,
'3' to Tone.THIRD,
'4' to Tone.FORTH,
'4' to Tone.FOURTH,
'5' to Tone.FIFTH,
) { (digit, expectedTone) ->
Tone.fromDigit(digit) shouldBe expectedTone
@ -26,7 +27,7 @@ class ToneTest : ShouldSpec({
1 to Tone.FIRST,
2 to Tone.SECOND,
3 to Tone.THIRD,
4 to Tone.FORTH,
4 to Tone.FOURTH,
5 to Tone.FIFTH,
) { (number, expectedTone) ->
Tone.fromInt(number) shouldBe expectedTone
@ -36,12 +37,25 @@ class ToneTest : ShouldSpec({
context("convert correctly from Zhuyin tone mark") {
withData(
nameFn = { "'${it.first}' -> ${it.second}" },
'¯' to Tone.FIRST,
'ˊ' to Tone.SECOND,
'ˇ' to Tone.THIRD,
'ˋ' to Tone.FORTH,
'ˋ' to Tone.FOURTH,
'˙' to Tone.FIFTH,
) { (zhuyinToneMark, expectedTone) ->
Tone.fromZhuyinToneMark(zhuyinToneMark) shouldBe expectedTone
Tone.fromZhuyinTone(zhuyinToneMark) shouldBe expectedTone
}
}
context("convert correctly from Pinyin tone mark") {
withData(
nameFn = { "'${it.first}' -> ${it.second}" },
'\u0304' to Tone.FIRST,
'\u0301' to Tone.SECOND,
'\u030C' to Tone.THIRD,
'\u0300' to Tone.FOURTH,
) { (pinyinToneMark, expectedTone) ->
Tone.fromPinyinTone(pinyinToneMark) shouldBe expectedTone
}
}
@ -51,13 +65,52 @@ class ToneTest : ShouldSpec({
Tone.FIRST to 1,
Tone.SECOND to 2,
Tone.THIRD to 3,
Tone.FORTH to 4,
Tone.FOURTH to 4,
Tone.FIFTH to 5,
) { (tone, expectedInteger) ->
tone.toInt() shouldBe expectedInteger
}
}
context("convert correctly to Zhuyin tone mark") {
withData(
nameFn = { "${it.first} -> '${it.second}'" },
Tone.FIRST to "",
Tone.SECOND to "ˊ",
Tone.THIRD to "ˇ",
Tone.FOURTH to "ˋ",
Tone.FIFTH to "˙",
) { (tone, zhuyinTone) ->
tone.toZhuyinTone() shouldBe zhuyinTone
}
}
context("convert correctly to Pinyin tone mark") {
withData(
nameFn = { "${it.first} -> '${it.second}'" },
Tone.FIRST to "\u0304",
Tone.SECOND to "\u0301",
Tone.THIRD to "\u030C",
Tone.FOURTH to "\u0300",
Tone.FIFTH to "",
) { (tone, pinyinTone) ->
tone.toPinyinTone() shouldBe pinyinTone
}
}
context("convert correctly to Pinyin number") {
withData(
nameFn = { "${it.first} -> '${it.second}'" },
Tone.FIRST to "1",
Tone.SECOND to "2",
Tone.THIRD to "3",
Tone.FOURTH to "4",
Tone.FIFTH to "5",
) { (tone, pinyinNumber) ->
tone.toPinyinNumber() shouldBe pinyinNumber
}
}
context("return null when converting from invalid digit ") {
withData(
nameFn = { "'$it' -> null" },
@ -74,7 +127,7 @@ class ToneTest : ShouldSpec({
context("return null when converting from invalid int ") {
withData(
nameFn = { "'$it' -> throws exception" },
nameFn = { "'$it' -> null" },
0,
6,
-1,
@ -87,7 +140,7 @@ class ToneTest : ShouldSpec({
context("return null when converting from invalid Zhuyin tone mark ") {
withData(
nameFn = { "'$it' -> throws exception" },
nameFn = { "'$it' -> null" },
'0',
'6',
'a',
@ -95,7 +148,21 @@ class ToneTest : ShouldSpec({
'$',
'*',
) { invalidZhuyinToneMark ->
Tone.fromZhuyinToneMarkOrNull(invalidZhuyinToneMark).shouldBeNull()
Tone.fromZhuyinToneOrNull(invalidZhuyinToneMark).shouldBeNull()
}
}
context("return null when converting from invalid Pinyin tone mark ") {
withData(
nameFn = { "'$it' -> null" },
'0',
'6',
'a',
'z',
'$',
'*',
) { invalidPinyinToneMark ->
Tone.fromPinyinToneOrNull(invalidPinyinToneMark).shouldBeNull()
}
}
@ -109,7 +176,7 @@ class ToneTest : ShouldSpec({
'$',
'*',
) { invalidDigit ->
shouldThrow<IllegalArgumentException> {
shouldThrow<InvalidToneInputException> {
Tone.fromDigit(invalidDigit)
}
}
@ -124,7 +191,7 @@ class ToneTest : ShouldSpec({
Int.MAX_VALUE,
Int.MIN_VALUE,
) { invalidNumber ->
shouldThrow<IllegalArgumentException> {
shouldThrow<InvalidToneInputException> {
Tone.fromInt(invalidNumber)
}
}
@ -140,48 +207,25 @@ class ToneTest : ShouldSpec({
'$',
'*',
) { invalidZhuyinToneMark ->
shouldThrow<IllegalArgumentException> {
Tone.fromZhuyinToneMark(invalidZhuyinToneMark)
shouldThrow<InvalidToneInputException> {
Tone.fromZhuyinTone(invalidZhuyinToneMark)
}
}
}
context("format to Zhuyin correctly") {
context("throw exception when converting from invalid Pinyin tone mark ") {
withData(
nameFn = { "${it.first} -> '${it.second}'" },
Tone.FIRST to "",
Tone.SECOND to "ˊ",
Tone.THIRD to "ˇ",
Tone.FORTH to "ˋ",
Tone.FIFTH to "˙",
) { (tone, expectedZhuyinToneMark) ->
tone.format(TransliterationSystem.ZHUYIN) shouldBe expectedZhuyinToneMark
}
}
context("format to Pinyin with tone numbers correctly") {
withData(
nameFn = { "${it.first} -> '${it.second}'" },
Tone.FIRST to "1",
Tone.SECOND to "2",
Tone.THIRD to "3",
Tone.FORTH to "4",
Tone.FIFTH to "5",
) { (tone, expectedNumber) ->
tone.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS) shouldBe expectedNumber
}
}
context("format to Pinyin with tone marks correctly") {
withData(
nameFn = { "${it.first} -> '${it.second}'" },
Tone.FIRST to "\u0304",
Tone.SECOND to "\u0301",
Tone.THIRD to "\u030C",
Tone.FORTH to "\u0300",
Tone.FIFTH to "",
) { (tone, expectedAccent) ->
tone.format(TransliterationSystem.PINYIN_WITH_TONE_MARKS) shouldBe expectedAccent
nameFn = { "'$it' -> throws exception" },
'0',
'6',
'a',
'z',
'$',
'*',
) { invalidPinyinToneMark ->
shouldThrow<InvalidToneInputException> {
Tone.fromPinyinTone(invalidPinyinToneMark)
}
}
}
})

View File

@ -1,119 +0,0 @@
package com.marvinelsen.chinese.transliteration.api
import io.kotest.core.spec.style.ShouldSpec
import io.kotest.datatest.withData
import io.kotest.matchers.shouldBe
class PinyinSyllableTest : ShouldSpec({
context("from pinyin with tone numbers") {
withData(
nameFn = { "'${it.first}' -> ${it.second}" },
"sheng1" to PinyinSyllable("sheng", Tone.FIRST),
"zhi2" to PinyinSyllable("zhi", Tone.SECOND),
"ka3" to PinyinSyllable("ka", Tone.THIRD),
"yao4" to PinyinSyllable("yao", Tone.FORTH),
"me5" to PinyinSyllable("me", Tone.FIFTH),
"Me5" to PinyinSyllable("Me", Tone.FIFTH),
"nv3" to PinyinSyllable("nv", Tone.THIRD),
"nü3" to PinyinSyllable("", Tone.THIRD),
"nu:3" to PinyinSyllable("nu:", Tone.THIRD),
"r5" to PinyinSyllable("r", Tone.FIFTH),
"R5" to PinyinSyllable("R", Tone.FIFTH),
"er2" to PinyinSyllable("er", Tone.SECOND),
"Er2" to PinyinSyllable("Er", Tone.SECOND),
) { (pinyinWithNumber, expectedSyllable) ->
PinyinSyllable.fromPinyinWithToneNumber(pinyinWithNumber) shouldBe expectedSyllable
}
}
context("from zhuyin") {
withData(
nameFn = { "'${it.first}' -> ${it.second}" },
"ㄕㄥ" to PinyinSyllable("sheng", Tone.FIRST),
"ㄓˊ" to PinyinSyllable("zhi", Tone.SECOND),
"ㄎㄚˇ" to PinyinSyllable("ka", Tone.THIRD),
"ㄧㄠˋ" to PinyinSyllable("yao", Tone.FORTH),
"ㄇㄜ˙" to PinyinSyllable("me", Tone.FIFTH),
"˙ㄇㄜ" to PinyinSyllable("me", Tone.FIFTH),
"ㄋㄩˇ" to PinyinSyllable("", Tone.THIRD),
) { (zhuyin, expectedSyllable) ->
PinyinSyllable.fromZhuyin(zhuyin) shouldBe expectedSyllable
}
}
context("from invalid pinyin with tone numbers") {
}
context("from invalid zhuyin") {
}
context("format to zhuyin") {
withData(
nameFn = { "${it.first.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS)} -> ${it.second}" },
PinyinSyllable("sheng", Tone.FIRST) to "ㄕㄥ",
PinyinSyllable("zhi", Tone.SECOND) to "ㄓˊ",
PinyinSyllable("ka", Tone.THIRD) to "ㄎㄚˇ",
PinyinSyllable("yao", Tone.FORTH) to "ㄧㄠˋ",
PinyinSyllable("me", Tone.FIFTH) to "˙ㄇㄜ",
PinyinSyllable("", Tone.THIRD) to "ㄋㄩˇ",
PinyinSyllable("nu:", Tone.THIRD) to "ㄋㄩˇ",
PinyinSyllable("nv", Tone.THIRD) to "ㄋㄩˇ",
PinyinSyllable("r", Tone.FIFTH) to "˙ㄦ",
PinyinSyllable("R", Tone.FIFTH) to "˙ㄦ",
PinyinSyllable("er", Tone.SECOND) to "ㄦˊ",
PinyinSyllable("Er", Tone.SECOND) to "ㄦˊ",
) { (syllable, expectedZhuyin) ->
syllable.format(TransliterationSystem.ZHUYIN) shouldBe expectedZhuyin
}
}
context("format to pinyin with tone numbers") {
withData(
nameFn = { "${it.first.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS)} -> ${it.second}" },
PinyinSyllable("sheng", Tone.FIRST) to "sheng1",
PinyinSyllable("zhi", Tone.SECOND) to "zhi2",
PinyinSyllable("ka", Tone.THIRD) to "ka3",
PinyinSyllable("yao", Tone.FORTH) to "yao4",
PinyinSyllable("me", Tone.FIFTH) to "me5",
PinyinSyllable("", Tone.THIRD) to "nü3",
PinyinSyllable("nu:", Tone.THIRD) to "nu:3",
PinyinSyllable("nv", Tone.THIRD) to "nv3",
PinyinSyllable("r", Tone.FIFTH) to "r5",
PinyinSyllable("R", Tone.FIFTH) to "R5",
PinyinSyllable("er", Tone.SECOND) to "er2",
PinyinSyllable("Er", Tone.SECOND) to "Er2",
) { (syllable, expectedPinyinWithToneNumbers) ->
syllable.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS) shouldBe expectedPinyinWithToneNumbers
}
}
context("format to pinyin with tone marks") {
withData(
nameFn = { "${it.first.format(TransliterationSystem.PINYIN_WITH_TONE_NUMBERS)} -> ${it.second}" },
PinyinSyllable("sheng", Tone.FIRST) to "shēng",
PinyinSyllable("zhi", Tone.SECOND) to "zhí",
PinyinSyllable("ka", Tone.THIRD) to "kǎ",
PinyinSyllable("yao", Tone.FORTH) to "yào",
PinyinSyllable("me", Tone.FIFTH) to "me",
PinyinSyllable("zhui", Tone.FIRST) to "zhuī",
PinyinSyllable("liu", Tone.FIRST) to "liū",
PinyinSyllable("", Tone.THIRD) to "nǚ",
PinyinSyllable("nu:", Tone.THIRD) to "nǚ",
PinyinSyllable("nv", Tone.THIRD) to "nǚ",
PinyinSyllable("r", Tone.FIFTH) to "r",
PinyinSyllable("er", Tone.SECOND) to "ér",
PinyinSyllable("Er", Tone.SECOND) to "Ér",
) { (syllable, expectedPinyinWithToneMarks) ->
syllable.format(TransliterationSystem.PINYIN_WITH_TONE_MARKS) shouldBe expectedPinyinWithToneMarks
}
}
context("format to zhuyin with invalid pinyin syllable") {
}
context("format to pinyin with tone diacritics with invalid pinyin syllable") {
}
context("format to pinyin with tone numbers with invalid pinyin syllable") {
}
})