kanjidic.mx raw
1 package ingest
2
3 import "io"
4
// KanjiEntry holds the data collected for a single kanji from one
// KANJIDIC2 <character> element.
type KanjiEntry struct {
	// Literal is the kanji character itself.
	Literal string
	// OnYomi lists the ON (Chinese-derived) readings.
	OnYomi []string
	// KunYomi lists the KUN (native Japanese) readings.
	KunYomi []string
	// Meanings lists the English meanings.
	Meanings []string
}
12
13 // ParseKanjidic reads KANJIDIC2 XML from r and calls emit for each character.
14 func ParseKanjidic(r io.Reader, emit func(KanjiEntry)) {
15 sc := NewXMLScanner(r)
16 var ev XMLEvent
17
18 var (
19 entry KanjiEntry
20 inChar bool
21 inLiteral bool
22 inReading bool
23 inMeaning bool
24 readingType string
25 meaningIsEng bool
26 )
27
28 for sc.Next(&ev) {
29 switch ev.Kind {
30 case XMLStart:
31 switch ev.Name {
32 case "character":
33 entry = KanjiEntry{}
34 inChar = true
35 case "literal":
36 if inChar {
37 inLiteral = true
38 }
39 case "reading":
40 if inChar {
41 inReading = true
42 readingType = attrValue(ev.Attrs, "r_type")
43 }
44 case "meaning":
45 if inChar {
46 lang := attrValue(ev.Attrs, "m_lang")
47 meaningIsEng = lang == "" // no m_lang attr means English
48 inMeaning = true
49 }
50 }
51
52 case XMLEnd:
53 switch ev.Name {
54 case "character":
55 if inChar && entry.Literal != "" {
56 emit(entry)
57 }
58 entry = KanjiEntry{}
59 inChar = false
60 case "literal":
61 inLiteral = false
62 case "reading":
63 inReading = false
64 readingType = ""
65 case "meaning":
66 inMeaning = false
67 meaningIsEng = false
68 }
69
70 case XMLText:
71 switch {
72 case inLiteral && inChar:
73 entry.Literal = ev.Text
74 case inReading && inChar:
75 switch readingType {
76 case "ja_on":
77 entry.OnYomi = append(entry.OnYomi, ev.Text)
78 case "ja_kun":
79 entry.KunYomi = append(entry.KunYomi, ev.Text)
80 }
81 case inMeaning && inChar && meaningIsEng:
82 entry.Meanings = append(entry.Meanings, ev.Text)
83 }
84 }
85 }
86 }
87
// attrValue extracts the value of the attribute called name from a raw
// attribute string such as `r_type="ja_on" on_type="kan"`.
//
//	attrValue(`r_type="ja_on"`, "r_type") → "ja_on"
//
// The string is scanned one attribute at a time, so name must match a whole
// attribute name: looking up "type" does not match `r_type="..."`, and text
// inside another attribute's quoted value is never mistaken for a name.
// Values may use single or double quotes, and whitespace around '=' is
// allowed. It returns "" when name is empty or the attribute is not found.
// A value whose closing quote is missing extends to the end of attrs.
func attrValue(attrs, name string) string {
	if name == "" {
		return ""
	}

	isSpace := func(c byte) bool {
		return c == ' ' || c == '\t' || c == '\n' || c == '\r'
	}

	i := 0
	skipSpace := func() {
		for i < len(attrs) && isSpace(attrs[i]) {
			i++
		}
	}

	for i < len(attrs) {
		skipSpace()

		// Attribute name: everything up to '=' or whitespace.
		keyStart := i
		for i < len(attrs) && attrs[i] != '=' && !isSpace(attrs[i]) {
			i++
		}
		key := attrs[keyStart:i]

		skipSpace()
		if i >= len(attrs) || attrs[i] != '=' {
			// A bare token without a value; move on to the next one.
			continue
		}
		i++ // consume '='
		skipSpace()

		if i >= len(attrs) {
			break
		}
		quote := attrs[i]
		if quote != '"' && quote != '\'' {
			// Unquoted values are not valid XML; rescan from here.
			continue
		}
		i++ // consume opening quote

		valStart := i
		for i < len(attrs) && attrs[i] != quote {
			i++
		}
		val := attrs[valStart:i]
		if i < len(attrs) {
			i++ // consume closing quote
		}

		if key == name {
			return val
		}
	}
	return ""
}
110