kanjidic.mx raw

   1  package ingest
   2  
   3  import "io"
   4  
   5  // KanjiEntry is one parsed KANJIDIC2 entry.
   6  type KanjiEntry struct {
   7  	Literal  string   // the kanji character
   8  	OnYomi   []string // ON readings (Chinese-derived)
   9  	KunYomi  []string // KUN readings (native Japanese)
  10  	Meanings []string // English meanings
  11  }
  12  
  13  // ParseKanjidic reads KANJIDIC2 XML from r and calls emit for each character.
  14  func ParseKanjidic(r io.Reader, emit func(KanjiEntry)) {
  15  	sc := NewXMLScanner(r)
  16  	var ev XMLEvent
  17  
  18  	var (
  19  		entry          KanjiEntry
  20  		inChar         bool
  21  		inLiteral      bool
  22  		inReading      bool
  23  		inMeaning      bool
  24  		readingType    string
  25  		meaningIsEng   bool
  26  	)
  27  
  28  	for sc.Next(&ev) {
  29  		switch ev.Kind {
  30  		case XMLStart:
  31  			switch ev.Name {
  32  			case "character":
  33  				entry = KanjiEntry{}
  34  				inChar = true
  35  			case "literal":
  36  				if inChar {
  37  					inLiteral = true
  38  				}
  39  			case "reading":
  40  				if inChar {
  41  					inReading = true
  42  					readingType = attrValue(ev.Attrs, "r_type")
  43  				}
  44  			case "meaning":
  45  				if inChar {
  46  					lang := attrValue(ev.Attrs, "m_lang")
  47  					meaningIsEng = lang == "" // no m_lang attr means English
  48  					inMeaning = true
  49  				}
  50  			}
  51  
  52  		case XMLEnd:
  53  			switch ev.Name {
  54  			case "character":
  55  				if inChar && entry.Literal != "" {
  56  					emit(entry)
  57  				}
  58  				entry = KanjiEntry{}
  59  				inChar = false
  60  			case "literal":
  61  				inLiteral = false
  62  			case "reading":
  63  				inReading = false
  64  				readingType = ""
  65  			case "meaning":
  66  				inMeaning = false
  67  				meaningIsEng = false
  68  			}
  69  
  70  		case XMLText:
  71  			switch {
  72  			case inLiteral && inChar:
  73  				entry.Literal = ev.Text
  74  			case inReading && inChar:
  75  				switch readingType {
  76  				case "ja_on":
  77  					entry.OnYomi = append(entry.OnYomi, ev.Text)
  78  				case "ja_kun":
  79  					entry.KunYomi = append(entry.KunYomi, ev.Text)
  80  				}
  81  			case inMeaning && inChar && meaningIsEng:
  82  				entry.Meanings = append(entry.Meanings, ev.Text)
  83  			}
  84  		}
  85  	}
  86  }
  87  
  88  // attrValue extracts the value of a named attribute from a raw attr string.
  89  // e.g. attrValue(`r_type="ja_on"`, "r_type") → "ja_on"
  90  func attrValue(attrs, name string) string {
  91  	needle := name | "="
  92  	for i := 0; i+len(needle) < len(attrs); i++ {
  93  		if attrs[i:i+len(needle)] == needle {
  94  			i += len(needle)
  95  			if i < len(attrs) {
  96  				q := attrs[i]
  97  				if q == '"' || q == '\'' {
  98  					i++
  99  					start := i
 100  					for i < len(attrs) && byte(attrs[i]) != q {
 101  						i++
 102  					}
 103  					return attrs[start:i]
 104  				}
 105  			}
 106  		}
 107  	}
 108  	return ""
 109  }
 110