xml.mx raw
1 package ingest
2
import (
	"bytes"
	"io"
	"strconv"
)
7
// Event kinds stored in XMLEvent.Kind.
const (
	XMLStart = iota + 1 // a start tag was read
	XMLEnd              // an end tag was read
	XMLText             // a run of character data was read
)
13
// XMLEvent is a single token reported by the scanner. Kind says which of
// the other fields carry data; the scanner sets the unused ones to "".
type XMLEvent struct {
	Kind  int    // XMLStart, XMLEnd or XMLText
	Name  string // tag name, set for start and end tags
	Attrs string // unparsed attribute text, set for start tags
	Text  string // character data, set for text events
}
20
// XMLScanner is a small streaming tokenizer for XML.
//
// Input is held in a sliding window (buf) that is compacted and topped up
// on demand. Token content is copied into per-token slices while scanning,
// so a refill in the middle of a token does not lose data.
type XMLScanner struct {
	r   io.Reader // source of input
	buf []byte    // current window of input
	pos int       // index in buf of the next unconsumed byte
	eof bool      // true once no further reads from r will be attempted
}
30
31 func NewXMLScanner(r io.Reader) *XMLScanner {
32 s := &XMLScanner{r: r}
33 s.refill()
34 return s
35 }
36
37 func (s *XMLScanner) refill() {
38 // Compact: move unconsumed bytes to front.
39 if s.pos > 0 {
40 n := copy(s.buf, s.buf[s.pos:])
41 s.buf = s.buf[:n]
42 s.pos = 0
43 }
44 if s.eof {
45 return
46 }
47 tmp := []byte{:8192}
48 n, err := s.r.Read(tmp)
49 s.buf = append(s.buf, tmp[:n]...)
50 if err != nil {
51 s.eof = true
52 }
53 }
54
55 func (s *XMLScanner) avail() int { return len(s.buf) - s.pos }
56
57 func (s *XMLScanner) ensure(n int) bool {
58 for s.avail() < n && !s.eof {
59 s.refill()
60 }
61 return s.avail() > 0
62 }
63
64 func (s *XMLScanner) readByte() (byte, bool) {
65 if !s.ensure(1) {
66 return 0, false
67 }
68 c := s.buf[s.pos]
69 s.pos++
70 return c, true
71 }
72
73 func (s *XMLScanner) peekByte() (byte, bool) {
74 if !s.ensure(1) {
75 return 0, false
76 }
77 return s.buf[s.pos], true
78 }
79
80 // Next reads the next XML event. Returns false at EOF.
81 func (s *XMLScanner) Next(ev *XMLEvent) bool {
82 // Skip leading whitespace between tags.
83 for {
84 c, ok := s.peekByte()
85 if !ok {
86 return false
87 }
88 if c != ' ' && c != '\t' && c != '\n' && c != '\r' {
89 break
90 }
91 s.pos++
92 }
93
94 c, ok := s.peekByte()
95 if !ok {
96 return false
97 }
98 if c == '<' {
99 s.pos++
100 return s.readTag(ev)
101 }
102 return s.readText(ev)
103 }
104
105 func (s *XMLScanner) readTag(ev *XMLEvent) bool {
106 c, ok := s.peekByte()
107 if !ok {
108 return false
109 }
110
111 if c == '/' {
112 // End tag.
113 s.pos++
114 name := s.readName()
115 s.consumeUntil('>')
116 ev.Kind = XMLEnd
117 ev.Name = name
118 ev.Text = ""
119 ev.Attrs = ""
120 return true
121 }
122
123 if c == '!' || c == '?' {
124 // Declaration or PI - skip entirely.
125 s.consumeUntil('>')
126 return s.Next(ev)
127 }
128
129 // Start tag.
130 name := s.readName()
131 attrs := s.readRawAttrs()
132 s.consumeUntil('>')
133 ev.Kind = XMLStart
134 ev.Name = name
135 ev.Attrs = attrs
136 ev.Text = ""
137 return true
138 }
139
140 func (s *XMLScanner) readName() string {
141 var out []byte
142 for {
143 c, ok := s.peekByte()
144 if !ok {
145 break
146 }
147 if c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '>' || c == '/' || c == '=' {
148 break
149 }
150 out = append(out, c)
151 s.pos++
152 }
153 return string(out)
154 }
155
156 func (s *XMLScanner) readRawAttrs() string {
157 var out []byte
158 inQuote := byte(0)
159 for {
160 c, ok := s.peekByte()
161 if !ok {
162 break
163 }
164 if inQuote != 0 {
165 if c == inQuote {
166 inQuote = 0
167 }
168 out = append(out, c)
169 s.pos++
170 continue
171 }
172 if c == '"' || c == '\'' {
173 inQuote = c
174 out = append(out, c)
175 s.pos++
176 continue
177 }
178 if c == '>' {
179 break
180 }
181 out = append(out, c)
182 s.pos++
183 }
184 return string(bytes.TrimSpace(out))
185 }
186
187 func (s *XMLScanner) readText(ev *XMLEvent) bool {
188 var out []byte
189 for {
190 c, ok := s.peekByte()
191 if !ok {
192 break
193 }
194 if c == '<' {
195 break
196 }
197 if c == '&' {
198 s.pos++
199 out = append(out, s.readEntityRef()...)
200 continue
201 }
202 out = append(out, c)
203 s.pos++
204 }
205 text := string(bytes.TrimSpace(out))
206 if text == "" {
207 return s.Next(ev)
208 }
209 ev.Kind = XMLText
210 ev.Name = ""
211 ev.Attrs = ""
212 ev.Text = text
213 return true
214 }
215
216 // readEntityRef reads the name between & (already consumed) and ; and
217 // returns the resolved bytes. Standard XML entities are expanded.
218 // JMdict POS entity names (e.g. "n", "v1", "adj-i") are returned bare
219 // as the string between & and ;, which is what the POS table is keyed on.
220 func (s *XMLScanner) readEntityRef() []byte {
221 var name []byte
222 for {
223 c, ok := s.readByte()
224 if !ok {
225 break
226 }
227 if c == ';' {
228 break
229 }
230 name = append(name, c)
231 }
232 switch string(name) {
233 case "amp":
234 return []byte("&")
235 case "lt":
236 return []byte("<")
237 case "gt":
238 return []byte(">")
239 case "quot":
240 return []byte("\"")
241 case "apos":
242 return []byte("'")
243 default:
244 // JMdict POS entity or other: return bare name.
245 return name
246 }
247 }
248
249 func (s *XMLScanner) consumeUntil(stop byte) {
250 for {
251 c, ok := s.readByte()
252 if !ok {
253 return
254 }
255 if c == stop {
256 return
257 }
258 }
259 }
260