package ingest

import (
	"bytes"
	"io"
	"strconv"
)

// Event kinds stored in XMLEvent.Kind.
const (
	XMLStart = 1
	XMLEnd   = 2
	XMLText  = 3
)

const (
	// xmlReadChunk is the minimum free space refill keeps available for each
	// read from the underlying reader.
	xmlReadChunk = 8192

	// xmlMaxEmptyReads bounds consecutive reads that return (0, nil) before
	// the scanner gives up with io.ErrNoProgress.
	xmlMaxEmptyReads = 100
)

// XMLEvent is one item produced by XMLScanner.Next. Next overwrites every
// field on each call, so fields that do not apply to Kind are empty.
type XMLEvent struct {
	Kind  int
	Name  string
	Attrs string // raw attribute string for start tags
	Text  string // text content for text events
}

// XMLScanner is a minimal, non-validating streaming XML scanner.
//
// It uses a sliding window buffer and accumulates token content in local
// slices, so the buffer can be refilled mid-scan without losing data.
// Comments, processing instructions and <!...> declarations (including a
// DOCTYPE internal subset) are skipped. CDATA sections are reported as text.
// A self-closing tag is reported as a start event followed by an end event.
type XMLScanner struct {
	r   io.Reader
	buf []byte
	pos int
	eof bool
	err error // first non-EOF read error, reported by Err

	// pendingEnd holds the name of a self-closing tag whose end event has
	// not been delivered yet; it is only meaningful when hasPendingEnd is set.
	pendingEnd    string
	hasPendingEnd bool
}

// NewXMLScanner returns a scanner reading from r. It performs one initial
// read from r to prime the buffer.
func NewXMLScanner(r io.Reader) *XMLScanner {
	s := &XMLScanner{r: r}
	s.refill()
	return s
}

// Err returns the first error other than io.EOF encountered while reading
// from the underlying reader, or nil. Check it after Next returns false to
// tell a clean end of input from a failed read.
func (s *XMLScanner) Err() error {
	return s.err
}

// refill compacts the buffer and appends more input from the reader. When it
// returns, either at least one new byte is buffered or s.eof is set.
func (s *XMLScanner) refill() {
	// Compact: move unconsumed bytes to front.
	if s.pos > 0 {
		n := copy(s.buf, s.buf[s.pos:])
		s.buf = s.buf[:n]
		s.pos = 0
	}
	if s.eof {
		return
	}
	// Read straight into spare capacity rather than through a temporary.
	if cap(s.buf)-len(s.buf) < xmlReadChunk {
		grown := make([]byte, len(s.buf), len(s.buf)+2*xmlReadChunk)
		copy(grown, s.buf)
		s.buf = grown
	}
	for tries := 0; tries < xmlMaxEmptyReads; tries++ {
		dst := s.buf[len(s.buf):cap(s.buf)]
		n, err := s.r.Read(dst)
		if n > 0 && n <= len(dst) {
			s.buf = s.buf[:len(s.buf)+n]
		}
		if err != nil {
			s.eof = true
			if err != io.EOF {
				s.err = err
			}
			return
		}
		if n > 0 {
			return
		}
	}
	// The reader keeps returning (0, nil); stop instead of spinning forever.
	s.eof = true
	s.err = io.ErrNoProgress
}

// avail reports the number of buffered, unconsumed bytes.
func (s *XMLScanner) avail() int {
	return len(s.buf) - s.pos
}

// ensure tries to buffer at least n unconsumed bytes and reports whether it
// succeeded. It may move s.pos, so callers must not cache buffer indices
// across a call.
func (s *XMLScanner) ensure(n int) bool {
	for s.avail() < n && !s.eof {
		s.refill()
	}
	return s.avail() >= n
}

// readByte consumes and returns the next byte; ok is false at end of input.
func (s *XMLScanner) readByte() (byte, bool) {
	if !s.ensure(1) {
		return 0, false
	}
	c := s.buf[s.pos]
	s.pos++
	return c, true
}

// peekByte returns the next byte without consuming it; ok is false at end
// of input.
func (s *XMLScanner) peekByte() (byte, bool) {
	if !s.ensure(1) {
		return 0, false
	}
	return s.buf[s.pos], true
}

// hasPrefix reports whether the unconsumed input starts with p.
func (s *XMLScanner) hasPrefix(p string) bool {
	if !s.ensure(len(p)) {
		return false
	}
	return string(s.buf[s.pos:s.pos+len(p)]) == p
}

// skipPast consumes input through the first occurrence of term. If term
// never occurs, the rest of the input is discarded.
func (s *XMLScanner) skipPast(term string) {
	for s.ensure(len(term)) {
		if string(s.buf[s.pos:s.pos+len(term)]) == term {
			s.pos += len(term)
			return
		}
		s.pos++
	}
	s.pos = len(s.buf)
}

// isXMLSpace reports whether c is one of the four XML whitespace bytes.
func isXMLSpace(c byte) bool {
	return c == ' ' || c == '\t' || c == '\n' || c == '\r'
}

// Next reads the next XML event into ev. It returns false at end of input;
// call Err afterwards to check whether reading stopped because of an error.
func (s *XMLScanner) Next(ev *XMLEvent) bool {
	// Deliver the end event owed by a preceding self-closing tag.
	if s.hasPendingEnd {
		s.hasPendingEnd = false
		*ev = XMLEvent{Kind: XMLEnd, Name: s.pendingEnd}
		return true
	}
	for {
		// Skip leading whitespace between tags.
		for {
			c, ok := s.peekByte()
			if !ok {
				return false
			}
			if !isXMLSpace(c) {
				break
			}
			s.pos++
		}
		if c, _ := s.peekByte(); c != '<' {
			if s.readText(ev) {
				return true
			}
			continue // text was empty after trimming
		}
		s.pos++ // consume '<'
		switch {
		case s.hasPrefix("!--"):
			s.pos += len("!--")
			s.skipPast("-->")
		case s.hasPrefix("![CDATA["):
			s.pos += len("![CDATA[")
			if s.readCDATA(ev) {
				return true
			}
		case s.hasPrefix("?"):
			s.pos++
			s.skipPast("?>")
		case s.hasPrefix("!"):
			s.pos++
			s.skipDecl()
		default:
			return s.readTag(ev)
		}
	}
}

// skipDecl consumes a <!...> declaration whose "<!" has already been read,
// up to and including its closing '>'. A '>' inside a quoted literal, inside
// a comment, or inside a [...] internal subset does not end the declaration.
func (s *XMLScanner) skipDecl() {
	depth := 0
	quote := byte(0)
	for {
		c, ok := s.readByte()
		if !ok {
			return
		}
		switch {
		case quote != 0:
			if c == quote {
				quote = 0
			}
		case c == '"' || c == '\'':
			quote = c
		case c == '<' && s.hasPrefix("!--"):
			// Comments may contain quotes and '>', so skip them whole.
			s.pos += len("!--")
			s.skipPast("-->")
		case c == '[':
			depth++
		case c == ']':
			if depth > 0 {
				depth--
			}
		case c == '>':
			if depth == 0 {
				return
			}
		}
	}
}

// readCDATA reads a CDATA section whose "<![CDATA[" has already been
// consumed and reports its content as a text event. Like readText it trims
// surrounding whitespace and returns false if nothing remains.
func (s *XMLScanner) readCDATA(ev *XMLEvent) bool {
	var out []byte
	for {
		if s.hasPrefix("]]>") {
			s.pos += len("]]>")
			break
		}
		c, ok := s.readByte()
		if !ok {
			break
		}
		out = append(out, c)
	}
	return setTextEvent(ev, out)
}

// readTag reads a start or end tag whose '<' has already been consumed.
// It returns false only if the input ends immediately after '<'. A tag cut
// short by end of input is still reported with whatever was read.
func (s *XMLScanner) readTag(ev *XMLEvent) bool {
	c, ok := s.peekByte()
	if !ok {
		return false
	}
	if c == '/' {
		// End tag.
		s.pos++
		name := s.readName()
		s.consumeUntil('>')
		*ev = XMLEvent{Kind: XMLEnd, Name: name}
		return true
	}

	// Start tag.
	name := s.readName()
	raw, closed := s.scanAttrs()
	raw = bytes.TrimSpace(raw)
	// scanAttrs only reports closed when it stopped at '>' outside quotes,
	// so a trailing '/' here is the self-closing marker, not attribute data.
	if closed && len(raw) > 0 && raw[len(raw)-1] == '/' {
		raw = bytes.TrimSpace(raw[:len(raw)-1])
		s.pendingEnd = name
		s.hasPendingEnd = true
	}
	s.consumeUntil('>')
	*ev = XMLEvent{Kind: XMLStart, Name: name, Attrs: string(raw)}
	return true
}

// readName consumes and returns a tag name, stopping before whitespace,
// '>', '/' or '='.
func (s *XMLScanner) readName() string {
	var out []byte
	for {
		c, ok := s.peekByte()
		if !ok {
			break
		}
		if isXMLSpace(c) || c == '>' || c == '/' || c == '=' {
			break
		}
		out = append(out, c)
		s.pos++
	}
	return string(out)
}

// scanAttrs consumes everything up to, but not including, the '>' that ends
// the current tag, ignoring any '>' inside a quoted attribute value. closed
// is false if the input ended before that '>' was found.
func (s *XMLScanner) scanAttrs() (raw []byte, closed bool) {
	quote := byte(0)
	for {
		c, ok := s.peekByte()
		if !ok {
			return raw, false
		}
		switch {
		case quote != 0:
			if c == quote {
				quote = 0
			}
		case c == '"' || c == '\'':
			quote = c
		case c == '>':
			return raw, true
		}
		raw = append(raw, c)
		s.pos++
	}
}

// readRawAttrs returns the whitespace-trimmed raw attribute text of the
// current tag, leaving the closing '>' unconsumed.
func (s *XMLScanner) readRawAttrs() string {
	raw, _ := s.scanAttrs()
	return string(bytes.TrimSpace(raw))
}

// readText reads character data up to the next '<' or end of input,
// expanding entity references, and stores it in ev as a text event. It
// returns false, leaving ev untouched, when the text is empty after
// trimming surrounding whitespace.
func (s *XMLScanner) readText(ev *XMLEvent) bool {
	var out []byte
	for {
		c, ok := s.peekByte()
		if !ok || c == '<' {
			break
		}
		s.pos++
		if c == '&' {
			out = append(out, s.readEntityRef()...)
			continue
		}
		out = append(out, c)
	}
	return setTextEvent(ev, out)
}

// setTextEvent stores raw, trimmed of surrounding whitespace, in ev as a
// text event. It returns false and leaves ev untouched if nothing remains.
func setTextEvent(ev *XMLEvent, raw []byte) bool {
	text := string(bytes.TrimSpace(raw))
	if text == "" {
		return false
	}
	*ev = XMLEvent{Kind: XMLText, Text: text}
	return true
}

// readEntityRef reads the name between & (already consumed) and ; and
// returns the resolved bytes. Standard XML entities and numeric character
// references are expanded.
// JMdict POS entity names (e.g. "n", "v1", "adj-i") are returned bare
// as the string between & and ;, which is what the POS table is keyed on.
// If whitespace, '<', another '&' or end of input appears before the ';',
// the '&' is treated as literal text and nothing beyond the name is
// consumed, so a stray ampersand cannot swallow the markup after it.
func (s *XMLScanner) readEntityRef() []byte {
	var name []byte
	for {
		c, ok := s.peekByte()
		if !ok || c == '<' || c == '&' || isXMLSpace(c) {
			return append([]byte{'&'}, name...)
		}
		s.pos++
		if c == ';' {
			break
		}
		name = append(name, c)
	}
	switch string(name) {
	case "amp":
		return []byte("&")
	case "lt":
		return []byte("<")
	case "gt":
		return []byte(">")
	case "quot":
		return []byte("\"")
	case "apos":
		return []byte("'")
	}
	if decoded, ok := xmlCharRef(name); ok {
		return decoded
	}
	// JMdict POS entity or other: return bare name.
	return name
}

// xmlCharRef decodes a numeric character reference body such as "#12354" or
// "#x3042" into UTF-8. ok is false if name is not a numeric reference or
// does not denote a non-zero, non-surrogate Unicode code point.
func xmlCharRef(name []byte) (decoded []byte, ok bool) {
	if len(name) < 2 || name[0] != '#' {
		return nil, false
	}
	digits, base := name[1:], 10
	if digits[0] == 'x' {
		digits, base = digits[1:], 16
	}
	v, err := strconv.ParseUint(string(digits), base, 32)
	if err != nil || v == 0 || v > 0x10FFFF || (v >= 0xD800 && v <= 0xDFFF) {
		return nil, false
	}
	return []byte(string(rune(v))), true
}

// consumeUntil consumes input up to and including the first occurrence of
// stop, or to end of input if stop never occurs.
func (s *XMLScanner) consumeUntil(stop byte) {
	for {
		c, ok := s.readByte()
		if !ok || c == stop {
			return
		}
	}
}