golangの日記

Go言語を中心にプログラミングについてのブログ

Golang unicodeパッケージ 文字の種類判別

golang.png


Unicodeのコードポイントで文字の種別を判別できる unicodeパッケージ の使い方。





目次



文字の種類の判別


unicodeパッケージに定義してある関数・変数を使って判別すると、Exampleを見るとわかる通り、判定結果が部分集合的に重なっていたりする。例えば、改行("\n")は unicode.IsControl と unicode.IsSpace のどちらでも true になる。
その他にも unicode.Hiragana には 🈀 のような意図しないかもしれないものまで含まれる。なので自分が意図する正確な判別をしたい場合は RangeTable を作る必要がありそう。

package main

import (
    "fmt"
    "unicode"
)

// Range tables that match only the ASCII letters. Each table holds a single
// Range16 whose Hi is not above unicode.MaxLatin1 (0x00FF), so LatinOffset is 1.
var (
    // AlphabetUpperCase matches U+0041 through U+005A ('A' to 'Z').
    AlphabetUpperCase = &unicode.RangeTable{
        R16:         []unicode.Range16{{Lo: 'A', Hi: 'Z', Stride: 1}},
        LatinOffset: 1,
    }

    // AlphabetLowerCase matches U+0061 through U+007A ('a' to 'z').
    AlphabetLowerCase = &unicode.RangeTable{
        R16:         []unicode.Range16{{Lo: 'a', Hi: 'z', Stride: 1}},
        LatinOffset: 1,
    }
)

// main walks a sample string rune by rune and prints one line for each
// category the rune belongs to: the Katakana, Hiragana and Han script tables
// from the unicode package, and the custom ASCII alphabet tables.
// Each line shows the code point in decimal, in U+XXXX form, and as a character.
func main() {
    const sample = "aAあア亜"
    for _, ch := range sample {
        text := string(ch)
        if unicode.In(ch, unicode.Katakana) {
            fmt.Printf("%8d %-8U %s Katakana\n", ch, ch, text)
        }
        if unicode.In(ch, unicode.Hiragana) {
            fmt.Printf("%8d %-8U %s Hiragana\n", ch, ch, text)
        }
        if unicode.In(ch, unicode.Han) {
            fmt.Printf("%8d %-8U %s Han\n", ch, ch, text)
        }
        // Upper and lower case share one label, so either table is enough.
        if unicode.Is(AlphabetUpperCase, ch) || unicode.Is(AlphabetLowerCase, ch) {
            fmt.Printf("%8d %-8U %s Alphabet\n", ch, ch, text)
        }
    }
    // Output:
    //       97 U+0061   a Alphabet
    //       65 U+0041   A Alphabet
    //    12354 U+3042   あ Hiragana
    //    12450 U+30A2   ア Katakana
    //    20124 U+4E9C   亜 Han
}



RangeTable の使い方


  • Range16 は uint16 なので 0xFFFF まで。それ以上のコードポイントは Range32 を使う。
  • Stride はPythonのスライスのステップみたいなやつ。連番なら1。規則性があって1つおき(2つごと)に取りたい場合は2にする。
  • LatinOffset は R16 の Range16 のうち、Hi が unicode.MaxLatin1 (0x00FF) 以下のやつの数。
// Range16 describes a run of 16-bit code points (at most 0xFFFF).
// The run covers Lo through Hi inclusive, visiting every Stride-th code point.
type Range16 struct {
    // Lo is the first code point of the range.
    Lo     uint16
    // Hi is the last code point of the range (inclusive).
    Hi     uint16
    // Stride is the step between members; 1 means consecutive code points.
    Stride uint16
}

// Range32 is the 32-bit counterpart of Range16, used for code points that do
// not fit in a uint16 (above 0xFFFF).
// The run covers Lo through Hi inclusive, visiting every Stride-th code point.
type Range32 struct {
    // Lo is the first code point of the range.
    Lo     uint32
    // Hi is the last code point of the range (inclusive).
    Hi     uint32
    // Stride is the step between members; 1 means consecutive code points.
    Stride uint32
}

// RangeTable defines a set of code points as a list of ranges, split by
// width: 16-bit ranges in R16 and 32-bit ranges in R32.
type RangeTable struct {
    // R16 holds the ranges whose code points fit in a uint16.
    R16         []Range16
    // R32 holds the ranges for code points above 0xFFFF.
    R32         []Range32
    // LatinOffset is the number of entries in R16 with Hi <= unicode.MaxLatin1 (0x00FF).
    LatinOffset int
}


unicode.Hiragana に何が含まれるか出力。

package main

import (
    "fmt"
    "unicode"
)

// output prints every code point contained in table, one per line, in the
// form "<decimal> <U+XXXX> <character>". The R16 ranges are printed first,
// then the R32 ranges. Code points that unicode.IsPrint rejects are shown as
// "Unprintable!" instead of the raw character.
//
// A Stride of 0 or 1 is treated as 1.
func output(table *unicode.RangeTable) {
    for _, v := range table.R16 {
        printRange(uint32(v.Lo), uint32(v.Hi), uint32(v.Stride))
    }
    for _, v := range table.R32 {
        printRange(v.Lo, v.Hi, v.Stride)
    }
}

// printRange prints the code points lo, lo+stride, ... up to and including hi.
//
// The counter is a uint64 rather than the range's own uint16/uint32 type:
// with a same-width counter, a range whose Hi is the maximum value of the
// type (for example 0xFFFE-0xFFFF in unicode.Noncharacter_Code_Point) can
// never fail the "<= hi" test, because the increment wraps around to 0 and
// the loop runs forever.
func printRange(lo, hi, stride uint32) {
    step := uint64(1)
    if stride > 1 {
        step = uint64(stride)
    }
    for cp := uint64(lo); cp <= uint64(hi); cp += step {
        r := rune(cp)
        s := string(r)
        if !unicode.IsPrint(r) {
            s = "Unprintable!"
        }
        fmt.Printf("%8d %-8U %s\n", cp, cp, s)
    }
}

// main prints every code point contained in unicode.Hiragana via output.
func main() {
    output(unicode.Hiragana)
    // Besides あ through ん, characters such as 江 and 🈀 are printed as well.
    // NOTE(review): 江 presumably stands for a kana-variant glyph from the
    // table's 32-bit ranges rather than the Han character itself — confirm.
}



ウィキペディアの平仮名_(Unicodeのブロック)をみると 0x3041 (ぁ) から 0x3093 (ん) までで良さそう。

package main

import (
    "fmt"
    "unicode"
)

// hiragana matches only the code points U+3041 (ぁ) through U+3093 (ん).
// It is a narrower alternative to unicode.Hiragana, which also contains
// characters outside that run. The range lies above unicode.MaxLatin1, so
// LatinOffset stays at its zero value.
var hiragana = &unicode.RangeTable{
    R16: []unicode.Range16{{Lo: 0x3041, Hi: 0x3093, Stride: 1}},
}

// main filters a sample string through the custom hiragana table and prints
// only the runes that the table accepts.
func main() {
    const sample = "abcあん🈀"
    for _, ch := range sample {
        if !unicode.Is(hiragana, ch) {
            continue
        }
        fmt.Println("hiragana: ", string(ch))
    }
    // Output:
    // hiragana:  あ
    // hiragana:  ん
}



判別2

unicodeパッケージに定義してある関数・変数をフル活用して分類する。

package main

import (
    "fmt"
    "strings"
    "unicode"
)

// Categorizer pairs a label with a way of testing whether a rune belongs to
// the labelled category. The test is either a predicate function (f) or a
// range table (t); a nil field is skipped when the entry is evaluated.
type Categorizer struct {
    // s is the label reported when the rune matches.
    s string
    // f, when non-nil, is called with the rune and reports membership.
    f func(rune) bool
    // t, when non-nil, is checked with unicode.Is.
    t *unicode.RangeTable
}

var (
    // categorizer lists every check applied to a rune, in the order in which
    // matching labels are reported. Each entry sets either f (a predicate
    // from the unicode package) or t (a range table from the unicode package).
    categorizer = []Categorizer{
        // General category predicates.
        {s: "Control", f: unicode.IsControl},
        {s: "Space", f: unicode.IsSpace},
        {s: "Graphic", f: unicode.IsGraphic},
        {s: "Letter", f: unicode.IsLetter},
        {s: "Title", f: unicode.IsTitle},
        {s: "Mark", f: unicode.IsMark},
        {s: "Symbolic", f: unicode.IsSymbol},
        {s: "Punctuation", f: unicode.IsPunct},
        {s: "Digit", f: unicode.IsDigit},

        // Script tables. Labels use '-' where the Go identifier uses '_'.
        {s: "Adlam", t: unicode.Adlam},
        {s: "Ahom", t: unicode.Ahom},
        {s: "Anatolian-Hieroglyphs", t: unicode.Anatolian_Hieroglyphs},
        {s: "Arabic", t: unicode.Arabic},
        {s: "Armenian", t: unicode.Armenian},
        {s: "Avestan", t: unicode.Avestan},
        {s: "Balinese", t: unicode.Balinese},
        {s: "Bamum", t: unicode.Bamum},
        {s: "Bassa-Vah", t: unicode.Bassa_Vah},
        {s: "Batak", t: unicode.Batak},
        {s: "Bengali", t: unicode.Bengali},
        {s: "Bhaiksuki", t: unicode.Bhaiksuki},
        {s: "Bopomofo", t: unicode.Bopomofo},
        {s: "Brahmi", t: unicode.Brahmi},
        {s: "Braille", t: unicode.Braille},
        {s: "Buginese", t: unicode.Buginese},
        {s: "Buhid", t: unicode.Buhid},
        {s: "Canadian-Aboriginal", t: unicode.Canadian_Aboriginal},
        {s: "Carian", t: unicode.Carian},
        {s: "Caucasian-Albanian", t: unicode.Caucasian_Albanian},
        {s: "Chakma", t: unicode.Chakma},
        {s: "Cham", t: unicode.Cham},
        {s: "Cherokee", t: unicode.Cherokee},
        {s: "Chorasmian", t: unicode.Chorasmian},
        {s: "Common", t: unicode.Common},
        {s: "Coptic", t: unicode.Coptic},
        {s: "Cuneiform", t: unicode.Cuneiform},
        {s: "Cypriot", t: unicode.Cypriot},
        {s: "Cyrillic", t: unicode.Cyrillic},
        {s: "Deseret", t: unicode.Deseret},
        {s: "Devanagari", t: unicode.Devanagari},
        {s: "Dives-Akuru", t: unicode.Dives_Akuru},
        {s: "Dogra", t: unicode.Dogra},
        {s: "Duployan", t: unicode.Duployan},
        {s: "Egyptian-Hieroglyphs", t: unicode.Egyptian_Hieroglyphs},
        {s: "Elbasan", t: unicode.Elbasan},
        {s: "Elymaic", t: unicode.Elymaic},
        {s: "Ethiopic", t: unicode.Ethiopic},
        {s: "Georgian", t: unicode.Georgian},
        {s: "Glagolitic", t: unicode.Glagolitic},
        {s: "Gothic", t: unicode.Gothic},
        {s: "Grantha", t: unicode.Grantha},
        {s: "Greek", t: unicode.Greek},
        {s: "Gujarati", t: unicode.Gujarati},
        {s: "Gunjala-Gondi", t: unicode.Gunjala_Gondi},
        {s: "Gurmukhi", t: unicode.Gurmukhi},
        {s: "Han", t: unicode.Han},
        {s: "Hangul", t: unicode.Hangul},
        {s: "Hanifi-Rohingya", t: unicode.Hanifi_Rohingya},
        {s: "Hanunoo", t: unicode.Hanunoo},
        {s: "Hatran", t: unicode.Hatran},
        {s: "Hebrew", t: unicode.Hebrew},
        {s: "Hiragana", t: unicode.Hiragana},
        {s: "Imperial-Aramaic", t: unicode.Imperial_Aramaic},
        {s: "Inherited", t: unicode.Inherited},
        {s: "Inscriptional-Pahlavi", t: unicode.Inscriptional_Pahlavi},
        {s: "Inscriptional-Parthian", t: unicode.Inscriptional_Parthian},
        {s: "Javanese", t: unicode.Javanese},
        {s: "Kaithi", t: unicode.Kaithi},
        {s: "Kannada", t: unicode.Kannada},
        {s: "Katakana", t: unicode.Katakana},
        {s: "Kayah-Li", t: unicode.Kayah_Li},
        {s: "Kharoshthi", t: unicode.Kharoshthi},
        {s: "Khitan-Small-Script", t: unicode.Khitan_Small_Script},
        {s: "Khmer", t: unicode.Khmer},
        {s: "Khojki", t: unicode.Khojki},
        {s: "Khudawadi", t: unicode.Khudawadi},
        {s: "Lao", t: unicode.Lao},
        {s: "Latin", t: unicode.Latin},
        {s: "Lepcha", t: unicode.Lepcha},
        {s: "Limbu", t: unicode.Limbu},
        {s: "Linear-A", t: unicode.Linear_A},
        {s: "Linear-B", t: unicode.Linear_B},
        {s: "Lisu", t: unicode.Lisu},
        {s: "Lycian", t: unicode.Lycian},
        {s: "Lydian", t: unicode.Lydian},
        {s: "Mahajani", t: unicode.Mahajani},
        {s: "Makasar", t: unicode.Makasar},
        {s: "Malayalam", t: unicode.Malayalam},
        {s: "Mandaic", t: unicode.Mandaic},
        {s: "Manichaean", t: unicode.Manichaean},
        {s: "Marchen", t: unicode.Marchen},
        {s: "Masaram-Gondi", t: unicode.Masaram_Gondi},
        {s: "Medefaidrin", t: unicode.Medefaidrin},
        {s: "Meetei-Mayek", t: unicode.Meetei_Mayek},
        {s: "Mende-Kikakui", t: unicode.Mende_Kikakui},
        {s: "Meroitic-Cursive", t: unicode.Meroitic_Cursive},
        {s: "Meroitic-Hieroglyphs", t: unicode.Meroitic_Hieroglyphs},
        {s: "Miao", t: unicode.Miao},
        {s: "Modi", t: unicode.Modi},
        {s: "Mongolian", t: unicode.Mongolian},
        {s: "Mro", t: unicode.Mro},
        {s: "Multani", t: unicode.Multani},
        {s: "Myanmar", t: unicode.Myanmar},
        {s: "Nabataean", t: unicode.Nabataean},
        {s: "Nandinagari", t: unicode.Nandinagari},
        {s: "New-Tai-Lue", t: unicode.New_Tai_Lue},
        {s: "Newa", t: unicode.Newa},
        {s: "Nko", t: unicode.Nko},
        {s: "Nushu", t: unicode.Nushu},
        {s: "Nyiakeng-Puachue-Hmong", t: unicode.Nyiakeng_Puachue_Hmong},
        {s: "Ogham", t: unicode.Ogham},
        {s: "Ol-Chiki", t: unicode.Ol_Chiki},
        {s: "Old-Hungarian", t: unicode.Old_Hungarian},
        {s: "Old-Italic", t: unicode.Old_Italic},
        {s: "Old-North-Arabian", t: unicode.Old_North_Arabian},
        {s: "Old-Permic", t: unicode.Old_Permic},
        {s: "Old-Persian", t: unicode.Old_Persian},
        {s: "Old-Sogdian", t: unicode.Old_Sogdian},
        {s: "Old-South-Arabian", t: unicode.Old_South_Arabian},
        {s: "Old-Turkic", t: unicode.Old_Turkic},
        {s: "Oriya", t: unicode.Oriya},
        {s: "Osage", t: unicode.Osage},
        {s: "Osmanya", t: unicode.Osmanya},
        {s: "Pahawh-Hmong", t: unicode.Pahawh_Hmong},
        {s: "Palmyrene", t: unicode.Palmyrene},
        {s: "Pau-Cin-Hau", t: unicode.Pau_Cin_Hau},
        {s: "Phags-Pa", t: unicode.Phags_Pa},
        {s: "Phoenician", t: unicode.Phoenician},
        {s: "Psalter-Pahlavi", t: unicode.Psalter_Pahlavi},
        {s: "Rejang", t: unicode.Rejang},
        {s: "Runic", t: unicode.Runic},
        {s: "Samaritan", t: unicode.Samaritan},
        {s: "Saurashtra", t: unicode.Saurashtra},
        {s: "Sharada", t: unicode.Sharada},
        {s: "Shavian", t: unicode.Shavian},
        {s: "Siddham", t: unicode.Siddham},
        {s: "SignWriting", t: unicode.SignWriting},
        {s: "Sinhala", t: unicode.Sinhala},
        {s: "Sogdian", t: unicode.Sogdian},
        {s: "Sora-Sompeng", t: unicode.Sora_Sompeng},
        {s: "Soyombo", t: unicode.Soyombo},
        {s: "Sundanese", t: unicode.Sundanese},
        {s: "Syloti-Nagri", t: unicode.Syloti_Nagri},
        {s: "Syriac", t: unicode.Syriac},
        {s: "Tagalog", t: unicode.Tagalog},
        {s: "Tagbanwa", t: unicode.Tagbanwa},
        {s: "Tai-Le", t: unicode.Tai_Le},
        {s: "Tai-Tham", t: unicode.Tai_Tham},
        {s: "Tai-Viet", t: unicode.Tai_Viet},
        {s: "Takri", t: unicode.Takri},
        {s: "Tamil", t: unicode.Tamil},
        {s: "Tangut", t: unicode.Tangut},
        {s: "Telugu", t: unicode.Telugu},
        {s: "Thaana", t: unicode.Thaana},
        {s: "Thai", t: unicode.Thai},
        {s: "Tibetan", t: unicode.Tibetan},
        {s: "Tifinagh", t: unicode.Tifinagh},
        {s: "Tirhuta", t: unicode.Tirhuta},
        {s: "Ugaritic", t: unicode.Ugaritic},
        {s: "Vai", t: unicode.Vai},
        {s: "Wancho", t: unicode.Wancho},
        {s: "Warang-Citi", t: unicode.Warang_Citi},
        {s: "Yezidi", t: unicode.Yezidi},
        {s: "Yi", t: unicode.Yi},
        {s: "Zanabazar-Square", t: unicode.Zanabazar_Square},

        // Letter case predicates.
        {s: "Lowercase", f: unicode.IsLower},
        {s: "Uppercase", f: unicode.IsUpper},

        // Property tables. Labels use '-' where the Go identifier uses '_'.
        {s: "ASCII-Hex-Digit", t: unicode.ASCII_Hex_Digit},
        {s: "Bidi-Control", t: unicode.Bidi_Control},
        {s: "Dash", t: unicode.Dash},
        {s: "Deprecated", t: unicode.Deprecated},
        {s: "Diacritic", t: unicode.Diacritic},
        {s: "Extender", t: unicode.Extender},
        {s: "Hex-Digit", t: unicode.Hex_Digit},
        {s: "Hyphen", t: unicode.Hyphen},
        {s: "IDS-Binary-Operator", t: unicode.IDS_Binary_Operator},
        {s: "IDS-Trinary-Operator", t: unicode.IDS_Trinary_Operator},
        {s: "Ideographic", t: unicode.Ideographic},
        {s: "Join-Control", t: unicode.Join_Control},
        {s: "Logical-Order-Exception", t: unicode.Logical_Order_Exception},
        {s: "Noncharacter-Code-Point", t: unicode.Noncharacter_Code_Point},
        {s: "Other-Alphabetic", t: unicode.Other_Alphabetic},
        {s: "Other-Default-Ignorable-Code-Point", t: unicode.Other_Default_Ignorable_Code_Point},
        {s: "Other-Grapheme-Extend", t: unicode.Other_Grapheme_Extend},
        {s: "Other-ID-Continue", t: unicode.Other_ID_Continue},
        {s: "Other-ID-Start", t: unicode.Other_ID_Start},
        {s: "Other-Lowercase", t: unicode.Other_Lowercase},
        {s: "Other-Math", t: unicode.Other_Math},
        {s: "Other-Uppercase", t: unicode.Other_Uppercase},
        {s: "Pattern-Syntax", t: unicode.Pattern_Syntax},
        {s: "Pattern-White-Space", t: unicode.Pattern_White_Space},
        {s: "Prepended-Concatenation-Mark", t: unicode.Prepended_Concatenation_Mark},
        {s: "Quotation-Mark", t: unicode.Quotation_Mark},
        {s: "Radical", t: unicode.Radical},
        {s: "Regional-Indicator", t: unicode.Regional_Indicator},
        {s: "STerm", t: unicode.STerm},
        {s: "Sentence-Terminal", t: unicode.Sentence_Terminal},
        {s: "Soft-Dotted", t: unicode.Soft_Dotted},
        {s: "Terminal-Punctuation", t: unicode.Terminal_Punctuation},
        {s: "Unified-Ideograph", t: unicode.Unified_Ideograph},
        {s: "Variation-Selector", t: unicode.Variation_Selector},
        {s: "White-Space", t: unicode.White_Space},
    }
)

// categorize returns the labels of every categorizer entry that r matches,
// joined by single spaces in table order. An entry matches through its
// predicate (f) and/or its range table (t); nil fields are ignored. The
// result is the empty string when nothing matches.
func categorize(r rune) string {
    var labels []string
    for i := range categorizer {
        entry := &categorizer[i]
        if entry.f != nil && entry.f(r) {
            labels = append(labels, entry.s)
        }
        if entry.t == nil {
            continue
        }
        if unicode.Is(entry.t, r) {
            labels = append(labels, entry.s)
        }
    }
    return strings.Join(labels, " ")
}

// main prints each rune of a mixed-script sample string followed by the
// space-separated category labels that categorize reports for it.
func main() {
    const sample = "!1aAあア啊아อาآه😀"

    for _, ch := range sample {
        labels := categorize(ch)
        fmt.Printf("%s %s\n", string(ch), labels)
    }
    // Output as recorded by the author:
    // ! Graphic Punctuation Common Pattern-Syntax STerm Sentence-Terminal Terminal-Punctuation
    // 1 Graphic Digit Common ASCII-Hex-Digit Hex-Digit
    // a Graphic Letter Latin Lowercase ASCII-Hex-Digit Hex-Digit
    // A Graphic Letter Latin Uppercase ASCII-Hex-Digit Hex-Digit
    // あ Graphic Letter Hiragana
    // ア Graphic Letter Katakana
    // 啊 Graphic Letter Han Ideographic Unified-Ideograph
    // 아 Graphic Letter Hangul
    // อ Graphic Letter Thai
    // า Graphic Letter Thai
    // آ Graphic Letter Arabic
    // ه Graphic Letter Arabic
    // 😀 Graphic Symbolic Common
}