offload_linux.go raw
1 /* SPDX-License-Identifier: MIT
2 *
3 * Copyright (C) 2017-2025 WireGuard LLC. All Rights Reserved.
4 */
5
6 package tun
7
8 import (
9 "bytes"
10 "encoding/binary"
11 "errors"
12 "io"
13 "unsafe"
14
15 "golang.org/x/sys/unix"
16 "golang.zx2c4.com/wireguard/conn"
17 )
18
// tcpFlagsOffset is the index of the flags byte relative to the start of a
// TCP header.
const tcpFlagsOffset = 13

// TCP flag bits inspected or modified by the GRO/GSO code in this file.
const (
	tcpFlagFIN uint8 = 1 << 0
	tcpFlagPSH uint8 = 1 << 3
	tcpFlagACK uint8 = 1 << 4
)
26
// virtioNetHdr mirrors the kernel's virtio_net_hdr, which is declared in
// include/uapi/linux/virtio_net.h. encode and decode copy the struct's raw
// memory, so the order and width of the fields must not be altered.
type virtioNetHdr struct {
	// flags holds VIRTIO_NET_HDR_F_* bits.
	flags uint8
	// gsoType holds one of the VIRTIO_NET_HDR_GSO_* values.
	gsoType uint8
	// hdrLen is the combined IP + transport header length.
	hdrLen uint16
	// gsoSize is the per-segment payload size.
	gsoSize uint16
	// csumStart is where transport checksumming starts; this file sets it to
	// the IP header length.
	csumStart uint16
	// csumOffset locates the checksum field relative to csumStart.
	csumOffset uint16
}

// decode fills v from the first virtioNetHdrLen bytes of b. It returns
// io.ErrShortBuffer when b is too small and leaves v untouched in that case.
func (v *virtioNetHdr) decode(b []byte) error {
	if len(b) < virtioNetHdrLen {
		return io.ErrShortBuffer
	}
	raw := unsafe.Slice((*byte)(unsafe.Pointer(v)), virtioNetHdrLen)
	copy(raw, b[:virtioNetHdrLen])
	return nil
}

// encode writes the in-memory representation of v into the first
// virtioNetHdrLen bytes of b. It returns io.ErrShortBuffer when b is too
// small, in which case nothing is written.
func (v *virtioNetHdr) encode(b []byte) error {
	if len(b) < virtioNetHdrLen {
		return io.ErrShortBuffer
	}
	raw := unsafe.Slice((*byte)(unsafe.Pointer(v)), virtioNetHdrLen)
	copy(b[:virtioNetHdrLen], raw)
	return nil
}

// virtioNetHdrLen is the size in bytes of virtioNetHdr, which matches the C
// ABI of its kernel counterpart -- sizeof(virtio_net_hdr).
const virtioNetHdrLen = int(unsafe.Sizeof(virtioNetHdr{}))
59
// tcpFlowKey identifies a TCP flow; it is the map key of tcpGROTable.
type tcpFlowKey struct {
	// srcAddr and dstAddr hold the IP addresses. IPv4 addresses fill only
	// the leading 4 bytes.
	srcAddr [16]byte
	dstAddr [16]byte
	srcPort uint16
	dstPort uint16
	// rxAck is the acknowledgment number. Packets with differing ack values
	// should not be coalesced, so they are treated as separate flows.
	rxAck uint32
	isV6  bool
}
67
68 // tcpGROTable holds flow and coalescing information for the purposes of TCP GRO.
69 type tcpGROTable struct {
70 itemsByFlow map[tcpFlowKey][]tcpGROItem
71 itemsPool [][]tcpGROItem
72 }
73
74 func newTCPGROTable() *tcpGROTable {
75 t := &tcpGROTable{
76 itemsByFlow: make(map[tcpFlowKey][]tcpGROItem, conn.IdealBatchSize),
77 itemsPool: make([][]tcpGROItem, conn.IdealBatchSize),
78 }
79 for i := range t.itemsPool {
80 t.itemsPool[i] = make([]tcpGROItem, 0, conn.IdealBatchSize)
81 }
82 return t
83 }
84
85 func newTCPFlowKey(pkt []byte, srcAddrOffset, dstAddrOffset, tcphOffset int) tcpFlowKey {
86 key := tcpFlowKey{}
87 addrSize := dstAddrOffset - srcAddrOffset
88 copy(key.srcAddr[:], pkt[srcAddrOffset:dstAddrOffset])
89 copy(key.dstAddr[:], pkt[dstAddrOffset:dstAddrOffset+addrSize])
90 key.srcPort = binary.BigEndian.Uint16(pkt[tcphOffset:])
91 key.dstPort = binary.BigEndian.Uint16(pkt[tcphOffset+2:])
92 key.rxAck = binary.BigEndian.Uint32(pkt[tcphOffset+8:])
93 key.isV6 = addrSize == 16
94 return key
95 }
96
97 // lookupOrInsert looks up a flow for the provided packet and metadata,
98 // returning the packets found for the flow, or inserting a new one if none
99 // is found.
100 func (t *tcpGROTable) lookupOrInsert(pkt []byte, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex int) ([]tcpGROItem, bool) {
101 key := newTCPFlowKey(pkt, srcAddrOffset, dstAddrOffset, tcphOffset)
102 items, ok := t.itemsByFlow[key]
103 if ok {
104 return items, ok
105 }
106 // TODO: insert() performs another map lookup. This could be rearranged to avoid.
107 t.insert(pkt, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex)
108 return nil, false
109 }
110
111 // insert an item in the table for the provided packet and packet metadata.
112 func (t *tcpGROTable) insert(pkt []byte, srcAddrOffset, dstAddrOffset, tcphOffset, tcphLen, bufsIndex int) {
113 key := newTCPFlowKey(pkt, srcAddrOffset, dstAddrOffset, tcphOffset)
114 item := tcpGROItem{
115 key: key,
116 bufsIndex: uint16(bufsIndex),
117 gsoSize: uint16(len(pkt[tcphOffset+tcphLen:])),
118 iphLen: uint8(tcphOffset),
119 tcphLen: uint8(tcphLen),
120 sentSeq: binary.BigEndian.Uint32(pkt[tcphOffset+4:]),
121 pshSet: pkt[tcphOffset+tcpFlagsOffset]&tcpFlagPSH != 0,
122 }
123 items, ok := t.itemsByFlow[key]
124 if !ok {
125 items = t.newItems()
126 }
127 items = append(items, item)
128 t.itemsByFlow[key] = items
129 }
130
131 func (t *tcpGROTable) updateAt(item tcpGROItem, i int) {
132 items, _ := t.itemsByFlow[item.key]
133 items[i] = item
134 }
135
136 func (t *tcpGROTable) deleteAt(key tcpFlowKey, i int) {
137 items, _ := t.itemsByFlow[key]
138 items = append(items[:i], items[i+1:]...)
139 t.itemsByFlow[key] = items
140 }
141
142 // tcpGROItem represents bookkeeping data for a TCP packet during the lifetime
143 // of a GRO evaluation across a vector of packets.
144 type tcpGROItem struct {
145 key tcpFlowKey
146 sentSeq uint32 // the sequence number
147 bufsIndex uint16 // the index into the original bufs slice
148 numMerged uint16 // the number of packets merged into this item
149 gsoSize uint16 // payload size
150 iphLen uint8 // ip header len
151 tcphLen uint8 // tcp header len
152 pshSet bool // psh flag is set
153 }
154
155 func (t *tcpGROTable) newItems() []tcpGROItem {
156 var items []tcpGROItem
157 items, t.itemsPool = t.itemsPool[len(t.itemsPool)-1], t.itemsPool[:len(t.itemsPool)-1]
158 return items
159 }
160
161 func (t *tcpGROTable) reset() {
162 for k, items := range t.itemsByFlow {
163 items = items[:0]
164 t.itemsPool = append(t.itemsPool, items)
165 delete(t.itemsByFlow, k)
166 }
167 }
168
// udpFlowKey identifies a UDP flow; it is the map key of udpGROTable.
type udpFlowKey struct {
	// srcAddr and dstAddr hold the IP addresses. IPv4 addresses fill only
	// the leading 4 bytes.
	srcAddr [16]byte
	dstAddr [16]byte
	srcPort uint16
	dstPort uint16
	isV6    bool
}
175
176 // udpGROTable holds flow and coalescing information for the purposes of UDP GRO.
177 type udpGROTable struct {
178 itemsByFlow map[udpFlowKey][]udpGROItem
179 itemsPool [][]udpGROItem
180 }
181
182 func newUDPGROTable() *udpGROTable {
183 u := &udpGROTable{
184 itemsByFlow: make(map[udpFlowKey][]udpGROItem, conn.IdealBatchSize),
185 itemsPool: make([][]udpGROItem, conn.IdealBatchSize),
186 }
187 for i := range u.itemsPool {
188 u.itemsPool[i] = make([]udpGROItem, 0, conn.IdealBatchSize)
189 }
190 return u
191 }
192
193 func newUDPFlowKey(pkt []byte, srcAddrOffset, dstAddrOffset, udphOffset int) udpFlowKey {
194 key := udpFlowKey{}
195 addrSize := dstAddrOffset - srcAddrOffset
196 copy(key.srcAddr[:], pkt[srcAddrOffset:dstAddrOffset])
197 copy(key.dstAddr[:], pkt[dstAddrOffset:dstAddrOffset+addrSize])
198 key.srcPort = binary.BigEndian.Uint16(pkt[udphOffset:])
199 key.dstPort = binary.BigEndian.Uint16(pkt[udphOffset+2:])
200 key.isV6 = addrSize == 16
201 return key
202 }
203
204 // lookupOrInsert looks up a flow for the provided packet and metadata,
205 // returning the packets found for the flow, or inserting a new one if none
206 // is found.
207 func (u *udpGROTable) lookupOrInsert(pkt []byte, srcAddrOffset, dstAddrOffset, udphOffset, bufsIndex int) ([]udpGROItem, bool) {
208 key := newUDPFlowKey(pkt, srcAddrOffset, dstAddrOffset, udphOffset)
209 items, ok := u.itemsByFlow[key]
210 if ok {
211 return items, ok
212 }
213 // TODO: insert() performs another map lookup. This could be rearranged to avoid.
214 u.insert(pkt, srcAddrOffset, dstAddrOffset, udphOffset, bufsIndex, false)
215 return nil, false
216 }
217
218 // insert an item in the table for the provided packet and packet metadata.
219 func (u *udpGROTable) insert(pkt []byte, srcAddrOffset, dstAddrOffset, udphOffset, bufsIndex int, cSumKnownInvalid bool) {
220 key := newUDPFlowKey(pkt, srcAddrOffset, dstAddrOffset, udphOffset)
221 item := udpGROItem{
222 key: key,
223 bufsIndex: uint16(bufsIndex),
224 gsoSize: uint16(len(pkt[udphOffset+udphLen:])),
225 iphLen: uint8(udphOffset),
226 cSumKnownInvalid: cSumKnownInvalid,
227 }
228 items, ok := u.itemsByFlow[key]
229 if !ok {
230 items = u.newItems()
231 }
232 items = append(items, item)
233 u.itemsByFlow[key] = items
234 }
235
236 func (u *udpGROTable) updateAt(item udpGROItem, i int) {
237 items, _ := u.itemsByFlow[item.key]
238 items[i] = item
239 }
240
241 // udpGROItem represents bookkeeping data for a UDP packet during the lifetime
242 // of a GRO evaluation across a vector of packets.
243 type udpGROItem struct {
244 key udpFlowKey
245 bufsIndex uint16 // the index into the original bufs slice
246 numMerged uint16 // the number of packets merged into this item
247 gsoSize uint16 // payload size
248 iphLen uint8 // ip header len
249 cSumKnownInvalid bool // UDP header checksum validity; a false value DOES NOT imply valid, just unknown.
250 }
251
252 func (u *udpGROTable) newItems() []udpGROItem {
253 var items []udpGROItem
254 items, u.itemsPool = u.itemsPool[len(u.itemsPool)-1], u.itemsPool[:len(u.itemsPool)-1]
255 return items
256 }
257
258 func (u *udpGROTable) reset() {
259 for k, items := range u.itemsByFlow {
260 items = items[:0]
261 u.itemsPool = append(u.itemsPool, items)
262 delete(u.itemsByFlow, k)
263 }
264 }
265
// canCoalesce is the verdict of a coalescing candidacy check between two
// packets: whether they can be merged and, if so, on which side.
type canCoalesce int

const (
	// coalescePrepend (-1): the packet goes in front of the tracked item.
	coalescePrepend canCoalesce = iota - 1
	// coalesceUnavailable (0): the packets cannot be coalesced.
	coalesceUnavailable
	// coalesceAppend (1): the packet goes after the tracked item.
	coalesceAppend
)
275
// ipHeadersCanCoalesce reports whether the IP headers at the front of pktA and
// pktB are compatible for merging as part of a GRO operation. Packets shorter
// than 9 bytes never qualify. The IP version is taken from pktA.
func ipHeadersCanCoalesce(pktA, pktB []byte) bool {
	const minLen = 9
	if len(pktA) < minLen || len(pktB) < minLen {
		return false
	}
	if version := pktA[0] >> 4; version == 6 {
		// Byte 0 and the high nibble of byte 1 carry the version and the
		// Traffic class; byte 7 is the Hop limit. All must be equal.
		sameTrafficClass := pktA[0] == pktB[0] && pktA[1]>>4 == pktB[1]>>4
		sameHopLimit := pktA[7] == pktB[7]
		return sameTrafficClass && sameHopLimit
	}
	// IPv4: ToS, the DF/reserved bits and the TTL must all be equal. MF is
	// checked further up the stack.
	sameToS := pktA[1] == pktB[1]
	sameFlagBits := pktA[6]>>5 == pktB[6]>>5
	sameTTL := pktA[8] == pktB[8]
	return sameToS && sameFlagBits && sameTTL
}
309
310 // udpPacketsCanCoalesce evaluates if pkt can be coalesced with the packet
311 // described by item. iphLen and gsoSize describe pkt. bufs is the vector of
312 // packets involved in the current GRO evaluation. bufsOffset is the offset at
313 // which packet data begins within bufs.
314 func udpPacketsCanCoalesce(pkt []byte, iphLen uint8, gsoSize uint16, item udpGROItem, bufs [][]byte, bufsOffset int) canCoalesce {
315 pktTarget := bufs[item.bufsIndex][bufsOffset:]
316 if !ipHeadersCanCoalesce(pkt, pktTarget) {
317 return coalesceUnavailable
318 }
319 if len(pktTarget[iphLen+udphLen:])%int(item.gsoSize) != 0 {
320 // A smaller than gsoSize packet has been appended previously.
321 // Nothing can come after a smaller packet on the end.
322 return coalesceUnavailable
323 }
324 if gsoSize > item.gsoSize {
325 // We cannot have a larger packet following a smaller one.
326 return coalesceUnavailable
327 }
328 return coalesceAppend
329 }
330
331 // tcpPacketsCanCoalesce evaluates if pkt can be coalesced with the packet
332 // described by item. This function makes considerations that match the kernel's
333 // GRO self tests, which can be found in tools/testing/selftests/net/gro.c.
334 func tcpPacketsCanCoalesce(pkt []byte, iphLen, tcphLen uint8, seq uint32, pshSet bool, gsoSize uint16, item tcpGROItem, bufs [][]byte, bufsOffset int) canCoalesce {
335 pktTarget := bufs[item.bufsIndex][bufsOffset:]
336 if tcphLen != item.tcphLen {
337 // cannot coalesce with unequal tcp options len
338 return coalesceUnavailable
339 }
340 if tcphLen > 20 {
341 if !bytes.Equal(pkt[iphLen+20:iphLen+tcphLen], pktTarget[item.iphLen+20:iphLen+tcphLen]) {
342 // cannot coalesce with unequal tcp options
343 return coalesceUnavailable
344 }
345 }
346 if !ipHeadersCanCoalesce(pkt, pktTarget) {
347 return coalesceUnavailable
348 }
349 // seq adjacency
350 lhsLen := item.gsoSize
351 lhsLen += item.numMerged * item.gsoSize
352 if seq == item.sentSeq+uint32(lhsLen) { // pkt aligns following item from a seq num perspective
353 if item.pshSet {
354 // We cannot append to a segment that has the PSH flag set, PSH
355 // can only be set on the final segment in a reassembled group.
356 return coalesceUnavailable
357 }
358 if len(pktTarget[iphLen+tcphLen:])%int(item.gsoSize) != 0 {
359 // A smaller than gsoSize packet has been appended previously.
360 // Nothing can come after a smaller packet on the end.
361 return coalesceUnavailable
362 }
363 if gsoSize > item.gsoSize {
364 // We cannot have a larger packet following a smaller one.
365 return coalesceUnavailable
366 }
367 return coalesceAppend
368 } else if seq+uint32(gsoSize) == item.sentSeq { // pkt aligns in front of item from a seq num perspective
369 if pshSet {
370 // We cannot prepend with a segment that has the PSH flag set, PSH
371 // can only be set on the final segment in a reassembled group.
372 return coalesceUnavailable
373 }
374 if gsoSize < item.gsoSize {
375 // We cannot have a larger packet following a smaller one.
376 return coalesceUnavailable
377 }
378 if gsoSize > item.gsoSize && item.numMerged > 0 {
379 // There's at least one previous merge, and we're larger than all
380 // previous. This would put multiple smaller packets on the end.
381 return coalesceUnavailable
382 }
383 return coalescePrepend
384 }
385 return coalesceUnavailable
386 }
387
388 func checksumValid(pkt []byte, iphLen, proto uint8, isV6 bool) bool {
389 srcAddrAt := ipv4SrcAddrOffset
390 addrSize := 4
391 if isV6 {
392 srcAddrAt = ipv6SrcAddrOffset
393 addrSize = 16
394 }
395 lenForPseudo := uint16(len(pkt) - int(iphLen))
396 cSum := pseudoHeaderChecksumNoFold(proto, pkt[srcAddrAt:srcAddrAt+addrSize], pkt[srcAddrAt+addrSize:srcAddrAt+addrSize*2], lenForPseudo)
397 return ^checksum(pkt[iphLen:], cSum) == 0
398 }
399
// coalesceResult is the outcome of an attempt to coalesce two packets.
type coalesceResult int

const (
	// coalesceInsufficientCap: the head buffer lacks capacity for the merge.
	coalesceInsufficientCap coalesceResult = iota
	// coalescePSHEnding: the packet to prepend has PSH set.
	coalescePSHEnding
	// coalesceItemInvalidCSum: the tracked item's checksum is invalid.
	coalesceItemInvalidCSum
	// coalescePktInvalidCSum: the candidate packet's checksum is invalid.
	coalescePktInvalidCSum
	// coalesceSuccess: the packets were merged.
	coalesceSuccess
)
411
412 // coalesceUDPPackets attempts to coalesce pkt with the packet described by
413 // item, and returns the outcome.
414 func coalesceUDPPackets(pkt []byte, item *udpGROItem, bufs [][]byte, bufsOffset int, isV6 bool) coalesceResult {
415 pktHead := bufs[item.bufsIndex][bufsOffset:] // the packet that will end up at the front
416 headersLen := item.iphLen + udphLen
417 coalescedLen := len(bufs[item.bufsIndex][bufsOffset:]) + len(pkt) - int(headersLen)
418
419 if cap(pktHead)-bufsOffset < coalescedLen {
420 // We don't want to allocate a new underlying array if capacity is
421 // too small.
422 return coalesceInsufficientCap
423 }
424 if item.numMerged == 0 {
425 if item.cSumKnownInvalid || !checksumValid(bufs[item.bufsIndex][bufsOffset:], item.iphLen, unix.IPPROTO_UDP, isV6) {
426 return coalesceItemInvalidCSum
427 }
428 }
429 if !checksumValid(pkt, item.iphLen, unix.IPPROTO_UDP, isV6) {
430 return coalescePktInvalidCSum
431 }
432 extendBy := len(pkt) - int(headersLen)
433 bufs[item.bufsIndex] = append(bufs[item.bufsIndex], make([]byte, extendBy)...)
434 copy(bufs[item.bufsIndex][bufsOffset+len(pktHead):], pkt[headersLen:])
435
436 item.numMerged++
437 return coalesceSuccess
438 }
439
440 // coalesceTCPPackets attempts to coalesce pkt with the packet described by
441 // item, and returns the outcome. This function may swap bufs elements in the
442 // event of a prepend as item's bufs index is already being tracked for writing
443 // to a Device.
444 func coalesceTCPPackets(mode canCoalesce, pkt []byte, pktBuffsIndex int, gsoSize uint16, seq uint32, pshSet bool, item *tcpGROItem, bufs [][]byte, bufsOffset int, isV6 bool) coalesceResult {
445 var pktHead []byte // the packet that will end up at the front
446 headersLen := item.iphLen + item.tcphLen
447 coalescedLen := len(bufs[item.bufsIndex][bufsOffset:]) + len(pkt) - int(headersLen)
448
449 // Copy data
450 if mode == coalescePrepend {
451 pktHead = pkt
452 if cap(pkt)-bufsOffset < coalescedLen {
453 // We don't want to allocate a new underlying array if capacity is
454 // too small.
455 return coalesceInsufficientCap
456 }
457 if pshSet {
458 return coalescePSHEnding
459 }
460 if item.numMerged == 0 {
461 if !checksumValid(bufs[item.bufsIndex][bufsOffset:], item.iphLen, unix.IPPROTO_TCP, isV6) {
462 return coalesceItemInvalidCSum
463 }
464 }
465 if !checksumValid(pkt, item.iphLen, unix.IPPROTO_TCP, isV6) {
466 return coalescePktInvalidCSum
467 }
468 item.sentSeq = seq
469 extendBy := coalescedLen - len(pktHead)
470 bufs[pktBuffsIndex] = append(bufs[pktBuffsIndex], make([]byte, extendBy)...)
471 copy(bufs[pktBuffsIndex][bufsOffset+len(pkt):], bufs[item.bufsIndex][bufsOffset+int(headersLen):])
472 // Flip the slice headers in bufs as part of prepend. The index of item
473 // is already being tracked for writing.
474 bufs[item.bufsIndex], bufs[pktBuffsIndex] = bufs[pktBuffsIndex], bufs[item.bufsIndex]
475 } else {
476 pktHead = bufs[item.bufsIndex][bufsOffset:]
477 if cap(pktHead)-bufsOffset < coalescedLen {
478 // We don't want to allocate a new underlying array if capacity is
479 // too small.
480 return coalesceInsufficientCap
481 }
482 if item.numMerged == 0 {
483 if !checksumValid(bufs[item.bufsIndex][bufsOffset:], item.iphLen, unix.IPPROTO_TCP, isV6) {
484 return coalesceItemInvalidCSum
485 }
486 }
487 if !checksumValid(pkt, item.iphLen, unix.IPPROTO_TCP, isV6) {
488 return coalescePktInvalidCSum
489 }
490 if pshSet {
491 // We are appending a segment with PSH set.
492 item.pshSet = pshSet
493 pktHead[item.iphLen+tcpFlagsOffset] |= tcpFlagPSH
494 }
495 extendBy := len(pkt) - int(headersLen)
496 bufs[item.bufsIndex] = append(bufs[item.bufsIndex], make([]byte, extendBy)...)
497 copy(bufs[item.bufsIndex][bufsOffset+len(pktHead):], pkt[headersLen:])
498 }
499
500 if gsoSize > item.gsoSize {
501 item.gsoSize = gsoSize
502 }
503
504 item.numMerged++
505 return coalesceSuccess
506 }
507
const (
	// ipv4FlagMoreFragments is the MF bit within byte 6 of an IPv4 header.
	ipv4FlagMoreFragments uint8 = 0x20

	// ipv4SrcAddrOffset is the offset of the source address in an IPv4
	// header.
	ipv4SrcAddrOffset = 12
	// ipv6SrcAddrOffset is the offset of the source address in an IPv6
	// header.
	ipv6SrcAddrOffset = 8
	// maxUint16 is the largest value representable in 16 bits.
	maxUint16 = 0xFFFF
)
517
// groResult is the outcome of evaluating a single packet for GRO.
type groResult int

const (
	// groResultNoop: no action was taken for the packet.
	groResultNoop groResult = iota
	// groResultTableInsert: the packet was inserted into the table.
	groResultTableInsert
	// groResultCoalesced: the packet was coalesced with another packet.
	groResultCoalesced
)
525
526 // tcpGRO evaluates the TCP packet at pktI in bufs for coalescing with
527 // existing packets tracked in table. It returns a groResultNoop when no
528 // action was taken, groResultTableInsert when the evaluated packet was
529 // inserted into table, and groResultCoalesced when the evaluated packet was
530 // coalesced with another packet in table.
531 func tcpGRO(bufs [][]byte, offset int, pktI int, table *tcpGROTable, isV6 bool) groResult {
532 pkt := bufs[pktI][offset:]
533 if len(pkt) > maxUint16 {
534 // A valid IPv4 or IPv6 packet will never exceed this.
535 return groResultNoop
536 }
537 iphLen := int((pkt[0] & 0x0F) * 4)
538 if isV6 {
539 iphLen = 40
540 ipv6HPayloadLen := int(binary.BigEndian.Uint16(pkt[4:]))
541 if ipv6HPayloadLen != len(pkt)-iphLen {
542 return groResultNoop
543 }
544 } else {
545 totalLen := int(binary.BigEndian.Uint16(pkt[2:]))
546 if totalLen != len(pkt) {
547 return groResultNoop
548 }
549 }
550 if len(pkt) < iphLen {
551 return groResultNoop
552 }
553 tcphLen := int((pkt[iphLen+12] >> 4) * 4)
554 if tcphLen < 20 || tcphLen > 60 {
555 return groResultNoop
556 }
557 if len(pkt) < iphLen+tcphLen {
558 return groResultNoop
559 }
560 if !isV6 {
561 if pkt[6]&ipv4FlagMoreFragments != 0 || pkt[6]<<3 != 0 || pkt[7] != 0 {
562 // no GRO support for fragmented segments for now
563 return groResultNoop
564 }
565 }
566 tcpFlags := pkt[iphLen+tcpFlagsOffset]
567 var pshSet bool
568 // not a candidate if any non-ACK flags (except PSH+ACK) are set
569 if tcpFlags != tcpFlagACK {
570 if pkt[iphLen+tcpFlagsOffset] != tcpFlagACK|tcpFlagPSH {
571 return groResultNoop
572 }
573 pshSet = true
574 }
575 gsoSize := uint16(len(pkt) - tcphLen - iphLen)
576 // not a candidate if payload len is 0
577 if gsoSize < 1 {
578 return groResultNoop
579 }
580 seq := binary.BigEndian.Uint32(pkt[iphLen+4:])
581 srcAddrOffset := ipv4SrcAddrOffset
582 addrLen := 4
583 if isV6 {
584 srcAddrOffset = ipv6SrcAddrOffset
585 addrLen = 16
586 }
587 items, existing := table.lookupOrInsert(pkt, srcAddrOffset, srcAddrOffset+addrLen, iphLen, tcphLen, pktI)
588 if !existing {
589 return groResultTableInsert
590 }
591 for i := len(items) - 1; i >= 0; i-- {
592 // In the best case of packets arriving in order iterating in reverse is
593 // more efficient if there are multiple items for a given flow. This
594 // also enables a natural table.deleteAt() in the
595 // coalesceItemInvalidCSum case without the need for index tracking.
596 // This algorithm makes a best effort to coalesce in the event of
597 // unordered packets, where pkt may land anywhere in items from a
598 // sequence number perspective, however once an item is inserted into
599 // the table it is never compared across other items later.
600 item := items[i]
601 can := tcpPacketsCanCoalesce(pkt, uint8(iphLen), uint8(tcphLen), seq, pshSet, gsoSize, item, bufs, offset)
602 if can != coalesceUnavailable {
603 result := coalesceTCPPackets(can, pkt, pktI, gsoSize, seq, pshSet, &item, bufs, offset, isV6)
604 switch result {
605 case coalesceSuccess:
606 table.updateAt(item, i)
607 return groResultCoalesced
608 case coalesceItemInvalidCSum:
609 // delete the item with an invalid csum
610 table.deleteAt(item.key, i)
611 case coalescePktInvalidCSum:
612 // no point in inserting an item that we can't coalesce
613 return groResultNoop
614 default:
615 }
616 }
617 }
618 // failed to coalesce with any other packets; store the item in the flow
619 table.insert(pkt, srcAddrOffset, srcAddrOffset+addrLen, iphLen, tcphLen, pktI)
620 return groResultTableInsert
621 }
622
623 // applyTCPCoalesceAccounting updates bufs to account for coalescing based on the
624 // metadata found in table.
625 func applyTCPCoalesceAccounting(bufs [][]byte, offset int, table *tcpGROTable) error {
626 for _, items := range table.itemsByFlow {
627 for _, item := range items {
628 if item.numMerged > 0 {
629 hdr := virtioNetHdr{
630 flags: unix.VIRTIO_NET_HDR_F_NEEDS_CSUM, // this turns into CHECKSUM_PARTIAL in the skb
631 hdrLen: uint16(item.iphLen + item.tcphLen),
632 gsoSize: item.gsoSize,
633 csumStart: uint16(item.iphLen),
634 csumOffset: 16,
635 }
636 pkt := bufs[item.bufsIndex][offset:]
637
638 // Recalculate the total len (IPv4) or payload len (IPv6).
639 // Recalculate the (IPv4) header checksum.
640 if item.key.isV6 {
641 hdr.gsoType = unix.VIRTIO_NET_HDR_GSO_TCPV6
642 binary.BigEndian.PutUint16(pkt[4:], uint16(len(pkt))-uint16(item.iphLen)) // set new IPv6 header payload len
643 } else {
644 hdr.gsoType = unix.VIRTIO_NET_HDR_GSO_TCPV4
645 pkt[10], pkt[11] = 0, 0
646 binary.BigEndian.PutUint16(pkt[2:], uint16(len(pkt))) // set new total length
647 iphCSum := ^checksum(pkt[:item.iphLen], 0) // compute IPv4 header checksum
648 binary.BigEndian.PutUint16(pkt[10:], iphCSum) // set IPv4 header checksum field
649 }
650 err := hdr.encode(bufs[item.bufsIndex][offset-virtioNetHdrLen:])
651 if err != nil {
652 return err
653 }
654
655 // Calculate the pseudo header checksum and place it at the TCP
656 // checksum offset. Downstream checksum offloading will combine
657 // this with computation of the tcp header and payload checksum.
658 addrLen := 4
659 addrOffset := ipv4SrcAddrOffset
660 if item.key.isV6 {
661 addrLen = 16
662 addrOffset = ipv6SrcAddrOffset
663 }
664 srcAddrAt := offset + addrOffset
665 srcAddr := bufs[item.bufsIndex][srcAddrAt : srcAddrAt+addrLen]
666 dstAddr := bufs[item.bufsIndex][srcAddrAt+addrLen : srcAddrAt+addrLen*2]
667 psum := pseudoHeaderChecksumNoFold(unix.IPPROTO_TCP, srcAddr, dstAddr, uint16(len(pkt)-int(item.iphLen)))
668 binary.BigEndian.PutUint16(pkt[hdr.csumStart+hdr.csumOffset:], checksum([]byte{}, psum))
669 } else {
670 hdr := virtioNetHdr{}
671 err := hdr.encode(bufs[item.bufsIndex][offset-virtioNetHdrLen:])
672 if err != nil {
673 return err
674 }
675 }
676 }
677 }
678 return nil
679 }
680
681 // applyUDPCoalesceAccounting updates bufs to account for coalescing based on the
682 // metadata found in table.
683 func applyUDPCoalesceAccounting(bufs [][]byte, offset int, table *udpGROTable) error {
684 for _, items := range table.itemsByFlow {
685 for _, item := range items {
686 if item.numMerged > 0 {
687 hdr := virtioNetHdr{
688 flags: unix.VIRTIO_NET_HDR_F_NEEDS_CSUM, // this turns into CHECKSUM_PARTIAL in the skb
689 hdrLen: uint16(item.iphLen + udphLen),
690 gsoSize: item.gsoSize,
691 csumStart: uint16(item.iphLen),
692 csumOffset: 6,
693 }
694 pkt := bufs[item.bufsIndex][offset:]
695
696 // Recalculate the total len (IPv4) or payload len (IPv6).
697 // Recalculate the (IPv4) header checksum.
698 hdr.gsoType = unix.VIRTIO_NET_HDR_GSO_UDP_L4
699 if item.key.isV6 {
700 binary.BigEndian.PutUint16(pkt[4:], uint16(len(pkt))-uint16(item.iphLen)) // set new IPv6 header payload len
701 } else {
702 pkt[10], pkt[11] = 0, 0
703 binary.BigEndian.PutUint16(pkt[2:], uint16(len(pkt))) // set new total length
704 iphCSum := ^checksum(pkt[:item.iphLen], 0) // compute IPv4 header checksum
705 binary.BigEndian.PutUint16(pkt[10:], iphCSum) // set IPv4 header checksum field
706 }
707 err := hdr.encode(bufs[item.bufsIndex][offset-virtioNetHdrLen:])
708 if err != nil {
709 return err
710 }
711
712 // Recalculate the UDP len field value
713 binary.BigEndian.PutUint16(pkt[item.iphLen+4:], uint16(len(pkt[item.iphLen:])))
714
715 // Calculate the pseudo header checksum and place it at the UDP
716 // checksum offset. Downstream checksum offloading will combine
717 // this with computation of the udp header and payload checksum.
718 addrLen := 4
719 addrOffset := ipv4SrcAddrOffset
720 if item.key.isV6 {
721 addrLen = 16
722 addrOffset = ipv6SrcAddrOffset
723 }
724 srcAddrAt := offset + addrOffset
725 srcAddr := bufs[item.bufsIndex][srcAddrAt : srcAddrAt+addrLen]
726 dstAddr := bufs[item.bufsIndex][srcAddrAt+addrLen : srcAddrAt+addrLen*2]
727 psum := pseudoHeaderChecksumNoFold(unix.IPPROTO_UDP, srcAddr, dstAddr, uint16(len(pkt)-int(item.iphLen)))
728 binary.BigEndian.PutUint16(pkt[hdr.csumStart+hdr.csumOffset:], checksum([]byte{}, psum))
729 } else {
730 hdr := virtioNetHdr{}
731 err := hdr.encode(bufs[item.bufsIndex][offset-virtioNetHdrLen:])
732 if err != nil {
733 return err
734 }
735 }
736 }
737 }
738 return nil
739 }
740
// groCandidateType classifies a packet by the kind of GRO it is eligible for.
type groCandidateType uint8

const (
	// notGROCandidate: the packet is not eligible for GRO.
	notGROCandidate groCandidateType = iota
	// tcp4GROCandidate: TCP over IPv4.
	tcp4GROCandidate
	// tcp6GROCandidate: TCP over IPv6.
	tcp6GROCandidate
	// udp4GROCandidate: UDP over IPv4.
	udp4GROCandidate
	// udp6GROCandidate: UDP over IPv6.
	udp6GROCandidate
)
750
751 func packetIsGROCandidate(b []byte, canUDPGRO bool) groCandidateType {
752 if len(b) < 28 {
753 return notGROCandidate
754 }
755 if b[0]>>4 == 4 {
756 if b[0]&0x0F != 5 {
757 // IPv4 packets w/IP options do not coalesce
758 return notGROCandidate
759 }
760 if b[9] == unix.IPPROTO_TCP && len(b) >= 40 {
761 return tcp4GROCandidate
762 }
763 if b[9] == unix.IPPROTO_UDP && canUDPGRO {
764 return udp4GROCandidate
765 }
766 } else if b[0]>>4 == 6 {
767 if b[6] == unix.IPPROTO_TCP && len(b) >= 60 {
768 return tcp6GROCandidate
769 }
770 if b[6] == unix.IPPROTO_UDP && len(b) >= 48 && canUDPGRO {
771 return udp6GROCandidate
772 }
773 }
774 return notGROCandidate
775 }
776
// udphLen is the length in bytes of a UDP header.
const udphLen = 8
780
781 // udpGRO evaluates the UDP packet at pktI in bufs for coalescing with
782 // existing packets tracked in table. It returns a groResultNoop when no
783 // action was taken, groResultTableInsert when the evaluated packet was
784 // inserted into table, and groResultCoalesced when the evaluated packet was
785 // coalesced with another packet in table.
786 func udpGRO(bufs [][]byte, offset int, pktI int, table *udpGROTable, isV6 bool) groResult {
787 pkt := bufs[pktI][offset:]
788 if len(pkt) > maxUint16 {
789 // A valid IPv4 or IPv6 packet will never exceed this.
790 return groResultNoop
791 }
792 iphLen := int((pkt[0] & 0x0F) * 4)
793 if isV6 {
794 iphLen = 40
795 ipv6HPayloadLen := int(binary.BigEndian.Uint16(pkt[4:]))
796 if ipv6HPayloadLen != len(pkt)-iphLen {
797 return groResultNoop
798 }
799 } else {
800 totalLen := int(binary.BigEndian.Uint16(pkt[2:]))
801 if totalLen != len(pkt) {
802 return groResultNoop
803 }
804 }
805 if len(pkt) < iphLen {
806 return groResultNoop
807 }
808 if len(pkt) < iphLen+udphLen {
809 return groResultNoop
810 }
811 if !isV6 {
812 if pkt[6]&ipv4FlagMoreFragments != 0 || pkt[6]<<3 != 0 || pkt[7] != 0 {
813 // no GRO support for fragmented segments for now
814 return groResultNoop
815 }
816 }
817 gsoSize := uint16(len(pkt) - udphLen - iphLen)
818 // not a candidate if payload len is 0
819 if gsoSize < 1 {
820 return groResultNoop
821 }
822 srcAddrOffset := ipv4SrcAddrOffset
823 addrLen := 4
824 if isV6 {
825 srcAddrOffset = ipv6SrcAddrOffset
826 addrLen = 16
827 }
828 items, existing := table.lookupOrInsert(pkt, srcAddrOffset, srcAddrOffset+addrLen, iphLen, pktI)
829 if !existing {
830 return groResultTableInsert
831 }
832 // With UDP we only check the last item, otherwise we could reorder packets
833 // for a given flow. We must also always insert a new item, or successfully
834 // coalesce with an existing item, for the same reason.
835 item := items[len(items)-1]
836 can := udpPacketsCanCoalesce(pkt, uint8(iphLen), gsoSize, item, bufs, offset)
837 var pktCSumKnownInvalid bool
838 if can == coalesceAppend {
839 result := coalesceUDPPackets(pkt, &item, bufs, offset, isV6)
840 switch result {
841 case coalesceSuccess:
842 table.updateAt(item, len(items)-1)
843 return groResultCoalesced
844 case coalesceItemInvalidCSum:
845 // If the existing item has an invalid csum we take no action. A new
846 // item will be stored after it, and the existing item will never be
847 // revisited as part of future coalescing candidacy checks.
848 case coalescePktInvalidCSum:
849 // We must insert a new item, but we also mark it as invalid csum
850 // to prevent a repeat checksum validation.
851 pktCSumKnownInvalid = true
852 default:
853 }
854 }
855 // failed to coalesce with any other packets; store the item in the flow
856 table.insert(pkt, srcAddrOffset, srcAddrOffset+addrLen, iphLen, pktI, pktCSumKnownInvalid)
857 return groResultTableInsert
858 }
859
860 // handleGRO evaluates bufs for GRO, and writes the indices of the resulting
861 // packets into toWrite. toWrite, tcpTable, and udpTable should initially be
862 // empty (but non-nil), and are passed in to save allocs as the caller may reset
863 // and recycle them across vectors of packets. canUDPGRO indicates if UDP GRO is
864 // supported.
865 func handleGRO(bufs [][]byte, offset int, tcpTable *tcpGROTable, udpTable *udpGROTable, canUDPGRO bool, toWrite *[]int) error {
866 for i := range bufs {
867 if offset < virtioNetHdrLen || offset > len(bufs[i])-1 {
868 return errors.New("invalid offset")
869 }
870 var result groResult
871 switch packetIsGROCandidate(bufs[i][offset:], canUDPGRO) {
872 case tcp4GROCandidate:
873 result = tcpGRO(bufs, offset, i, tcpTable, false)
874 case tcp6GROCandidate:
875 result = tcpGRO(bufs, offset, i, tcpTable, true)
876 case udp4GROCandidate:
877 result = udpGRO(bufs, offset, i, udpTable, false)
878 case udp6GROCandidate:
879 result = udpGRO(bufs, offset, i, udpTable, true)
880 }
881 switch result {
882 case groResultNoop:
883 hdr := virtioNetHdr{}
884 err := hdr.encode(bufs[i][offset-virtioNetHdrLen:])
885 if err != nil {
886 return err
887 }
888 fallthrough
889 case groResultTableInsert:
890 *toWrite = append(*toWrite, i)
891 }
892 }
893 errTCP := applyTCPCoalesceAccounting(bufs, offset, tcpTable)
894 errUDP := applyUDPCoalesceAccounting(bufs, offset, udpTable)
895 return errors.Join(errTCP, errUDP)
896 }
897
898 // gsoSplit splits packets from in into outBuffs, writing the size of each
899 // element into sizes. It returns the number of buffers populated, and/or an
900 // error.
901 func gsoSplit(in []byte, hdr virtioNetHdr, outBuffs [][]byte, sizes []int, outOffset int, isV6 bool) (int, error) {
902 iphLen := int(hdr.csumStart)
903 srcAddrOffset := ipv6SrcAddrOffset
904 addrLen := 16
905 if !isV6 {
906 in[10], in[11] = 0, 0 // clear ipv4 header checksum
907 srcAddrOffset = ipv4SrcAddrOffset
908 addrLen = 4
909 }
910 transportCsumAt := int(hdr.csumStart + hdr.csumOffset)
911 in[transportCsumAt], in[transportCsumAt+1] = 0, 0 // clear tcp/udp checksum
912 var firstTCPSeqNum uint32
913 var protocol uint8
914 if hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_TCPV4 || hdr.gsoType == unix.VIRTIO_NET_HDR_GSO_TCPV6 {
915 protocol = unix.IPPROTO_TCP
916 firstTCPSeqNum = binary.BigEndian.Uint32(in[hdr.csumStart+4:])
917 } else {
918 protocol = unix.IPPROTO_UDP
919 }
920 nextSegmentDataAt := int(hdr.hdrLen)
921 i := 0
922 for ; nextSegmentDataAt < len(in); i++ {
923 if i == len(outBuffs) {
924 return i - 1, ErrTooManySegments
925 }
926 nextSegmentEnd := nextSegmentDataAt + int(hdr.gsoSize)
927 if nextSegmentEnd > len(in) {
928 nextSegmentEnd = len(in)
929 }
930 segmentDataLen := nextSegmentEnd - nextSegmentDataAt
931 totalLen := int(hdr.hdrLen) + segmentDataLen
932 sizes[i] = totalLen
933 out := outBuffs[i][outOffset:]
934
935 copy(out, in[:iphLen])
936 if !isV6 {
937 // For IPv4 we are responsible for incrementing the ID field,
938 // updating the total len field, and recalculating the header
939 // checksum.
940 if i > 0 {
941 id := binary.BigEndian.Uint16(out[4:])
942 id += uint16(i)
943 binary.BigEndian.PutUint16(out[4:], id)
944 }
945 binary.BigEndian.PutUint16(out[2:], uint16(totalLen))
946 ipv4CSum := ^checksum(out[:iphLen], 0)
947 binary.BigEndian.PutUint16(out[10:], ipv4CSum)
948 } else {
949 // For IPv6 we are responsible for updating the payload length field.
950 binary.BigEndian.PutUint16(out[4:], uint16(totalLen-iphLen))
951 }
952
953 // copy transport header
954 copy(out[hdr.csumStart:hdr.hdrLen], in[hdr.csumStart:hdr.hdrLen])
955
956 if protocol == unix.IPPROTO_TCP {
957 // set TCP seq and adjust TCP flags
958 tcpSeq := firstTCPSeqNum + uint32(hdr.gsoSize*uint16(i))
959 binary.BigEndian.PutUint32(out[hdr.csumStart+4:], tcpSeq)
960 if nextSegmentEnd != len(in) {
961 // FIN and PSH should only be set on last segment
962 clearFlags := tcpFlagFIN | tcpFlagPSH
963 out[hdr.csumStart+tcpFlagsOffset] &^= clearFlags
964 }
965 } else {
966 // set UDP header len
967 binary.BigEndian.PutUint16(out[hdr.csumStart+4:], uint16(segmentDataLen)+(hdr.hdrLen-hdr.csumStart))
968 }
969
970 // payload
971 copy(out[hdr.hdrLen:], in[nextSegmentDataAt:nextSegmentEnd])
972
973 // transport checksum
974 transportHeaderLen := int(hdr.hdrLen - hdr.csumStart)
975 lenForPseudo := uint16(transportHeaderLen + segmentDataLen)
976 transportCSumNoFold := pseudoHeaderChecksumNoFold(protocol, in[srcAddrOffset:srcAddrOffset+addrLen], in[srcAddrOffset+addrLen:srcAddrOffset+addrLen*2], lenForPseudo)
977 transportCSum := ^checksum(out[hdr.csumStart:totalLen], transportCSumNoFold)
978 binary.BigEndian.PutUint16(out[hdr.csumStart+hdr.csumOffset:], transportCSum)
979
980 nextSegmentDataAt += int(hdr.gsoSize)
981 }
982 return i, nil
983 }
984
985 func gsoNoneChecksum(in []byte, cSumStart, cSumOffset uint16) error {
986 cSumAt := cSumStart + cSumOffset
987 // The initial value at the checksum offset should be summed with the
988 // checksum we compute. This is typically the pseudo-header checksum.
989 initial := binary.BigEndian.Uint16(in[cSumAt:])
990 in[cSumAt], in[cSumAt+1] = 0, 0
991 binary.BigEndian.PutUint16(in[cSumAt:], ^checksum(in[cSumStart:], uint64(initial)))
992 return nil
993 }
994