package ingest

import "io"

// KanjiEntry holds the fields collected from one KANJIDIC2 <character>
// element.
type KanjiEntry struct {
	Literal  string   // the kanji character (text of <literal>)
	OnYomi   []string // ON readings (Chinese-derived), r_type "ja_on"
	KunYomi  []string // KUN readings (native Japanese), r_type "ja_kun"
	Meanings []string // English meanings
}

// ParseKanjidic streams KANJIDIC2 XML from r and hands every completed
// <character> element to emit as a KanjiEntry.
//
// An entry is emitted when its </character> tag is seen, and only if a
// non-empty literal was collected for it. Readings whose r_type is neither
// "ja_on" nor "ja_kun" are dropped, as are meanings that carry a non-empty
// m_lang attribute. Parsing ends when the scanner's Next reports false; no
// error is returned to the caller.
func ParseKanjidic(r io.Reader, emit func(KanjiEntry)) {
	scanner := NewXMLScanner(r)

	var (
		event       XMLEvent
		cur         KanjiEntry // entry being assembled
		withinChar  bool       // between <character> and </character>
		withinLit   bool       // inside <literal>
		withinRead  bool       // inside <reading>
		withinMean  bool       // inside <meaning>
		readKind    string     // r_type of the open <reading>
		englishMean bool       // open <meaning> has no m_lang value
	)

	for scanner.Next(&event) {
		switch event.Kind {
		case XMLStart:
			// A new <character> always starts a fresh entry.
			if event.Name == "character" {
				cur, withinChar = KanjiEntry{}, true
				continue
			}
			// Every other element is only tracked inside a <character>.
			if !withinChar {
				continue
			}
			switch event.Name {
			case "literal":
				withinLit = true
			case "reading":
				withinRead = true
				readKind = attrValue(event.Attrs, "r_type")
			case "meaning":
				// no m_lang attr means English
				englishMean = attrValue(event.Attrs, "m_lang") == ""
				withinMean = true
			}

		case XMLEnd:
			switch event.Name {
			case "character":
				if withinChar && cur.Literal != "" {
					emit(cur)
				}
				cur, withinChar = KanjiEntry{}, false
			case "literal":
				withinLit = false
			case "reading":
				withinRead, readKind = false, ""
			case "meaning":
				withinMean, englishMean = false, false
			}

		case XMLText:
			// Text outside a <character> is never recorded.
			if !withinChar {
				continue
			}
			// Precedence is literal, then reading, then meaning.
			if withinLit {
				cur.Literal = event.Text
			} else if withinRead {
				if readKind == "ja_on" {
					cur.OnYomi = append(cur.OnYomi, event.Text)
				} else if readKind == "ja_kun" {
					cur.KunYomi = append(cur.KunYomi, event.Text)
				}
			} else if withinMean && englishMean {
				cur.Meanings = append(cur.Meanings, event.Text)
			}
		}
	}
}

// attrValue extracts the value of a named attribute from a raw attr string.
// e.g.
// attrValue(`r_type="ja_on"`, "r_type") → "ja_on"
//
// attrValue looks in attrs for name immediately followed by '=' and a value
// wrapped in single or double quotes, and returns the text between the
// quotes. It returns "" when name is empty, when the attribute is absent, or
// when its value is not quoted. If the closing quote is missing, the rest of
// attrs is returned.
//
// name must match a whole attribute name: a match is only accepted at the
// start of attrs, after whitespace, or after the closing quote of a previous
// value, so "type" does not match `r_type="..."`. Quoted values of other
// attributes are skipped, so text inside them is never taken for an
// attribute. Whitespace around '=' is not supported.
func attrValue(attrs, name string) string {
	if name == "" {
		return ""
	}
	needle := name + "="

	// atNameStart reports whether position i may begin an attribute name.
	atNameStart := true
	for i := 0; i < len(attrs); i++ {
		c := attrs[i]
		switch {
		case c == '"' || c == '\'':
			// Value of some other attribute: jump to its closing quote so
			// its content is not scanned; the loop's i++ steps past it.
			i = closingQuoteIndex(attrs, i+1, c)
			atNameStart = true
		case c == ' ' || c == '\t' || c == '\n' || c == '\r':
			atNameStart = true
		case atNameStart && len(attrs)-i > len(needle) && attrs[i:i+len(needle)] == needle:
			// The length guard guarantees one byte after the '='.
			open := i + len(needle)
			if q := attrs[open]; q == '"' || q == '\'' {
				return attrs[open+1 : closingQuoteIndex(attrs, open+1, q)]
			}
			// Unquoted value: not supported, keep scanning.
			atNameStart = false
		default:
			atNameStart = false
		}
	}
	return ""
}

// closingQuoteIndex returns the index of the first byte equal to q in s at or
// after from, or len(s) if there is none.
func closingQuoteIndex(s string, from int, q byte) int {
	for from < len(s) && s[from] != q {
		from++
	}
	return from
}