xml.mx raw

   1  package ingest
   2  
   3  import (
   4  	"bytes"
   5  	"io"
   6  )
   7  
   8  const (
   9  	XMLStart = 1
  10  	XMLEnd   = 2
  11  	XMLText  = 3
  12  )
  13  
  14  type XMLEvent struct {
  15  	Kind  int
  16  	Name  string
  17  	Attrs string // raw attribute string for start tags
  18  	Text  string // text content for text events
  19  }
  20  
  21  // XMLScanner is a minimal streaming XML scanner.
  22  // Uses a sliding window buffer; accumulates token content in local slices
  23  // so the buffer can be refilled mid-scan without losing data.
  24  type XMLScanner struct {
  25  	r   io.Reader
  26  	buf []byte
  27  	pos int
  28  	eof bool
  29  }
  30  
  31  func NewXMLScanner(r io.Reader) *XMLScanner {
  32  	s := &XMLScanner{r: r}
  33  	s.refill()
  34  	return s
  35  }
  36  
  37  func (s *XMLScanner) refill() {
  38  	// Compact: move unconsumed bytes to front.
  39  	if s.pos > 0 {
  40  		n := copy(s.buf, s.buf[s.pos:])
  41  		s.buf = s.buf[:n]
  42  		s.pos = 0
  43  	}
  44  	if s.eof {
  45  		return
  46  	}
  47  	tmp := []byte{:8192}
  48  	n, err := s.r.Read(tmp)
  49  	s.buf = append(s.buf, tmp[:n]...)
  50  	if err != nil {
  51  		s.eof = true
  52  	}
  53  }
  54  
  55  func (s *XMLScanner) avail() int { return len(s.buf) - s.pos }
  56  
  57  func (s *XMLScanner) ensure(n int) bool {
  58  	for s.avail() < n && !s.eof {
  59  		s.refill()
  60  	}
  61  	return s.avail() > 0
  62  }
  63  
  64  func (s *XMLScanner) readByte() (byte, bool) {
  65  	if !s.ensure(1) {
  66  		return 0, false
  67  	}
  68  	c := s.buf[s.pos]
  69  	s.pos++
  70  	return c, true
  71  }
  72  
  73  func (s *XMLScanner) peekByte() (byte, bool) {
  74  	if !s.ensure(1) {
  75  		return 0, false
  76  	}
  77  	return s.buf[s.pos], true
  78  }
  79  
  80  // Next reads the next XML event. Returns false at EOF.
  81  func (s *XMLScanner) Next(ev *XMLEvent) bool {
  82  	// Skip leading whitespace between tags.
  83  	for {
  84  		c, ok := s.peekByte()
  85  		if !ok {
  86  			return false
  87  		}
  88  		if c != ' ' && c != '\t' && c != '\n' && c != '\r' {
  89  			break
  90  		}
  91  		s.pos++
  92  	}
  93  
  94  	c, ok := s.peekByte()
  95  	if !ok {
  96  		return false
  97  	}
  98  	if c == '<' {
  99  		s.pos++
 100  		return s.readTag(ev)
 101  	}
 102  	return s.readText(ev)
 103  }
 104  
 105  func (s *XMLScanner) readTag(ev *XMLEvent) bool {
 106  	c, ok := s.peekByte()
 107  	if !ok {
 108  		return false
 109  	}
 110  
 111  	if c == '/' {
 112  		// End tag.
 113  		s.pos++
 114  		name := s.readName()
 115  		s.consumeUntil('>')
 116  		ev.Kind = XMLEnd
 117  		ev.Name = name
 118  		ev.Text = ""
 119  		ev.Attrs = ""
 120  		return true
 121  	}
 122  
 123  	if c == '!' || c == '?' {
 124  		// Declaration or PI - skip entirely.
 125  		s.consumeUntil('>')
 126  		return s.Next(ev)
 127  	}
 128  
 129  	// Start tag.
 130  	name := s.readName()
 131  	attrs := s.readRawAttrs()
 132  	s.consumeUntil('>')
 133  	ev.Kind = XMLStart
 134  	ev.Name = name
 135  	ev.Attrs = attrs
 136  	ev.Text = ""
 137  	return true
 138  }
 139  
 140  func (s *XMLScanner) readName() string {
 141  	var out []byte
 142  	for {
 143  		c, ok := s.peekByte()
 144  		if !ok {
 145  			break
 146  		}
 147  		if c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '>' || c == '/' || c == '=' {
 148  			break
 149  		}
 150  		out = append(out, c)
 151  		s.pos++
 152  	}
 153  	return string(out)
 154  }
 155  
 156  func (s *XMLScanner) readRawAttrs() string {
 157  	var out []byte
 158  	inQuote := byte(0)
 159  	for {
 160  		c, ok := s.peekByte()
 161  		if !ok {
 162  			break
 163  		}
 164  		if inQuote != 0 {
 165  			if c == inQuote {
 166  				inQuote = 0
 167  			}
 168  			out = append(out, c)
 169  			s.pos++
 170  			continue
 171  		}
 172  		if c == '"' || c == '\'' {
 173  			inQuote = c
 174  			out = append(out, c)
 175  			s.pos++
 176  			continue
 177  		}
 178  		if c == '>' {
 179  			break
 180  		}
 181  		out = append(out, c)
 182  		s.pos++
 183  	}
 184  	return string(bytes.TrimSpace(out))
 185  }
 186  
 187  func (s *XMLScanner) readText(ev *XMLEvent) bool {
 188  	var out []byte
 189  	for {
 190  		c, ok := s.peekByte()
 191  		if !ok {
 192  			break
 193  		}
 194  		if c == '<' {
 195  			break
 196  		}
 197  		if c == '&' {
 198  			s.pos++
 199  			out = append(out, s.readEntityRef()...)
 200  			continue
 201  		}
 202  		out = append(out, c)
 203  		s.pos++
 204  	}
 205  	text := string(bytes.TrimSpace(out))
 206  	if text == "" {
 207  		return s.Next(ev)
 208  	}
 209  	ev.Kind = XMLText
 210  	ev.Name = ""
 211  	ev.Attrs = ""
 212  	ev.Text = text
 213  	return true
 214  }
 215  
 216  // readEntityRef reads the name between & (already consumed) and ; and
 217  // returns the resolved bytes. Standard XML entities are expanded.
 218  // JMdict POS entity names (e.g. "n", "v1", "adj-i") are returned bare
 219  // as the string between & and ;, which is what the POS table is keyed on.
 220  func (s *XMLScanner) readEntityRef() []byte {
 221  	var name []byte
 222  	for {
 223  		c, ok := s.readByte()
 224  		if !ok {
 225  			break
 226  		}
 227  		if c == ';' {
 228  			break
 229  		}
 230  		name = append(name, c)
 231  	}
 232  	switch string(name) {
 233  	case "amp":
 234  		return []byte("&")
 235  	case "lt":
 236  		return []byte("<")
 237  	case "gt":
 238  		return []byte(">")
 239  	case "quot":
 240  		return []byte("\"")
 241  	case "apos":
 242  		return []byte("'")
 243  	default:
 244  		// JMdict POS entity or other: return bare name.
 245  		return name
 246  	}
 247  }
 248  
 249  func (s *XMLScanner) consumeUntil(stop byte) {
 250  	for {
 251  		c, ok := s.readByte()
 252  		if !ok {
 253  			return
 254  		}
 255  		if c == stop {
 256  			return
 257  		}
 258  	}
 259  }
 260