// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ipv4

import (
	"fmt"
	"sync/atomic"
	"time"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/network/internal/ip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

const (
	// igmpV1PresentDefault is the initial state for igmpV1Present in the
	// igmpState. As per RFC 2236 page 9, "No IGMPv1 Router Present ... is
	// the initial state."
	igmpV1PresentDefault = 0

	// v1RouterPresentTimeout from RFC 2236 Section 8.11, Page 18.
	// See note on igmpState.igmpV1Present for more detail.
	v1RouterPresentTimeout = 400 * time.Second

	// v1MaxRespTime from RFC 2236 Section 4, Page 5. "The IGMPv1 router
	// will send General Queries with the Max Response Time set to 0. This MUST
	// be interpreted as a value of 100 (10 seconds)."
	//
	// Note that the Max Response Time field is a value in units of deciseconds.
	v1MaxRespTime = 10 * time.Second

	// UnsolicitedReportIntervalMax is the maximum delay between sending
	// unsolicited IGMP reports.
	//
	// Obtained from RFC 2236 Section 8.10, Page 19.
	UnsolicitedReportIntervalMax = 10 * time.Second
)

// IGMPOptions holds options for IGMP.
type IGMPOptions struct {
	// Enabled indicates whether IGMP will be performed.
	//
	// When enabled, IGMP may transmit IGMP report and leave messages when
	// joining and leaving multicast groups respectively, and handle incoming
	// IGMP packets.
	//
	// This field is ignored and is always assumed to be false for interfaces
	// without neighbouring nodes (e.g. loopback).
	Enabled bool
}

var _ ip.MulticastGroupProtocol = (*igmpState)(nil)

// igmpState is the per-interface IGMP state.
//
// igmpState.init() MUST be called after creating an IGMP state.
type igmpState struct {
	// The IPv4 endpoint this igmpState is for.
	ep *endpoint

	genericMulticastProtocol ip.GenericMulticastProtocolState

	// igmpV1Present is for maintaining compatibility with IGMPv1 Routers, from
	// RFC 2236 Section 4 Page 6: "The IGMPv1 router expects Version 1
	// Membership Reports in response to its Queries, and will not pay
	// attention to Version 2 Membership Reports. Therefore, a state variable
	// MUST be kept for each interface, describing whether the multicast
	// Querier on that interface is running IGMPv1 or IGMPv2. This variable
	// MUST be based upon whether or not an IGMPv1 query was heard in the last
	// [Version 1 Router Present Timeout] seconds".
	//
	// Must be accessed with atomic operations. Holds a value of 1 when true, 0
	// when false.
	igmpV1Present uint32

	// igmpV1Job is scheduled when this interface receives an IGMPv1 style
	// message; upon expiration the igmpV1Present flag is cleared.
	// igmpV1Job may not be nil once igmpState is initialized.
	igmpV1Job *tcpip.Job
}

// Enabled implements ip.MulticastGroupProtocol.
func (igmp *igmpState) Enabled() bool {
	// No need to perform IGMP on loopback interfaces since they don't have
	// neighbouring nodes.
	return igmp.ep.protocol.options.IGMP.Enabled && !igmp.ep.nic.IsLoopback() && igmp.ep.Enabled()
}

// SendReport implements ip.MulticastGroupProtocol.
//
// Precondition: igmp.ep.mu must be read locked.
func (igmp *igmpState) SendReport(groupAddress tcpip.Address) (bool, tcpip.Error) {
	igmpType := header.IGMPv2MembershipReport
	if igmp.v1Present() {
		igmpType = header.IGMPv1MembershipReport
	}
	return igmp.writePacket(groupAddress, groupAddress, igmpType)
}

// SendLeave implements ip.MulticastGroupProtocol.
//
// Precondition: igmp.ep.mu must be read locked.
func (igmp *igmpState) SendLeave(groupAddress tcpip.Address) tcpip.Error {
	// As per RFC 2236 Section 6, Page 8: "If the interface state says the
	// Querier is running IGMPv1, this action SHOULD be skipped. If the flag
	// saying we were the last host to report is cleared, this action MAY be
	// skipped."
	if igmp.v1Present() {
		return nil
	}
	_, err := igmp.writePacket(header.IPv4AllRoutersGroup, groupAddress, header.IGMPLeaveGroup)
	return err
}

// ShouldPerformProtocol implements ip.MulticastGroupProtocol.
func (igmp *igmpState) ShouldPerformProtocol(groupAddress tcpip.Address) bool {
	// As per RFC 2236 section 6 page 10,
	//
	//   The all-systems group (address 224.0.0.1) is handled as a special
	//   case. The host starts in Idle Member state for that group on every
	//   interface, never transitions to another state, and never sends a
	//   report for that group.
	return groupAddress != header.IPv4AllSystems
}

// init sets up an igmpState struct, and is required to be called before using
// a new igmpState.
//
// Must only be called once for the lifetime of igmp.
func (igmp *igmpState) init(ep *endpoint) {
	igmp.ep = ep
	igmp.genericMulticastProtocol.Init(&ep.mu.RWMutex, ip.GenericMulticastProtocolOptions{
		Rand:                      ep.protocol.stack.Rand(),
		Clock:                     ep.protocol.stack.Clock(),
		Protocol:                  igmp,
		MaxUnsolicitedReportDelay: UnsolicitedReportIntervalMax,
	})
	igmp.igmpV1Present = igmpV1PresentDefault
	igmp.igmpV1Job = ep.protocol.stack.NewJob(&ep.mu, func() {
		igmp.setV1Present(false)
	})
}

// Precondition: igmp.ep.mu must be locked.
func (igmp *igmpState) isSourceIPValidLocked(src tcpip.Address, messageType header.IGMPType) bool {
	if messageType == header.IGMPMembershipQuery {
		// RFC 2236 does not require the IGMP implementation to check the source IP
		// for Membership Query messages.
		return true
	}

	// As per RFC 2236 section 10,
	//
	//   Ignore the Report if you cannot identify the source address of the
	//   packet as belonging to a subnet assigned to the interface on which the
	//   packet was received.
	//
	//   Ignore the Leave message if you cannot identify the source address of
	//   the packet as belonging to a subnet assigned to the interface on which
	//   the packet was received.
	//
	// Note: this rule applies to both V1 and V2 Membership Reports.
	var isSourceIPValid bool
	igmp.ep.mu.addressableEndpointState.ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) bool {
		if subnet := addressEndpoint.Subnet(); subnet.Contains(src) {
			isSourceIPValid = true
			return false
		}
		return true
	})
	return isSourceIPValid
}

// Precondition: igmp.ep.mu must be locked.
func (igmp *igmpState) isPacketValidLocked(pkt *stack.PacketBuffer, messageType header.IGMPType, hasRouterAlertOption bool) bool {
	// We can safely assume that the IP header is valid if we got this far.
	iph := header.IPv4(pkt.NetworkHeader().View())

	// As per RFC 2236 section 2,
	//
	//   All IGMP messages described in this document are sent with IP TTL 1, and
	//   contain the IP Router Alert option [RFC 2113] in their IP header.
	if !hasRouterAlertOption || iph.TTL() != header.IGMPTTL {
		return false
	}

	return igmp.isSourceIPValidLocked(iph.SourceAddress(), messageType)
}

// handleIGMP handles an IGMP packet.
//
// Precondition: igmp.ep.mu must be locked.
func (igmp *igmpState) handleIGMP(pkt *stack.PacketBuffer, hasRouterAlertOption bool) {
	received := igmp.ep.stats.igmp.packetsReceived
	headerView, ok := pkt.Data().PullUp(header.IGMPMinimumSize)
	if !ok {
		received.invalid.Increment()
		return
	}
	h := header.IGMP(headerView)

	// As per RFC 1071 section 1.3,
	//
	//   To check a checksum, the 1's complement sum is computed over the
	//   same set of octets, including the checksum field. If the result
	//   is all 1 bits (-0 in 1's complement arithmetic), the check
	//   succeeds.
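	//
	// For illustration, with a hypothetical message of two 16-bit words: if
	// the data word is 0x1234, the stored checksum is ^0x1234 = 0xedcb, and
	// the verification sum is 0x1234 + 0xedcb = 0xffff, so the check passes.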
	if pkt.Data().AsRange().Checksum() != 0xFFFF {
		received.checksumErrors.Increment()
		return
	}

	isValid := func(minimumSize int) bool {
		return len(headerView) >= minimumSize && igmp.isPacketValidLocked(pkt, h.Type(), hasRouterAlertOption)
	}

	switch h.Type() {
	case header.IGMPMembershipQuery:
		received.membershipQuery.Increment()
		if !isValid(header.IGMPQueryMinimumSize) {
			received.invalid.Increment()
			return
		}
		igmp.handleMembershipQuery(h.GroupAddress(), h.MaxRespTime())
	case header.IGMPv1MembershipReport:
		received.v1MembershipReport.Increment()
		if !isValid(header.IGMPReportMinimumSize) {
			received.invalid.Increment()
			return
		}
		igmp.handleMembershipReport(h.GroupAddress())
	case header.IGMPv2MembershipReport:
		received.v2MembershipReport.Increment()
		if !isValid(header.IGMPReportMinimumSize) {
			received.invalid.Increment()
			return
		}
		igmp.handleMembershipReport(h.GroupAddress())
	case header.IGMPLeaveGroup:
		received.leaveGroup.Increment()
		if !isValid(header.IGMPLeaveMessageMinimumSize) {
			received.invalid.Increment()
			return
		}
		// As per RFC 2236 Section 6, Page 7: "IGMP messages other than Query or
		// Report, are ignored in all states"
	default:
		// As per RFC 2236 Section 2.1 Page 3: "Unrecognized message types should
		// be silently ignored. New message types may be used by newer versions of
		// IGMP, by multicast routing protocols, or other uses."
		received.unrecognized.Increment()
	}
}

func (igmp *igmpState) v1Present() bool {
	return atomic.LoadUint32(&igmp.igmpV1Present) == 1
}

func (igmp *igmpState) setV1Present(v bool) {
	if v {
		atomic.StoreUint32(&igmp.igmpV1Present, 1)
	} else {
		atomic.StoreUint32(&igmp.igmpV1Present, 0)
	}
}

func (igmp *igmpState) resetV1Present() {
	igmp.igmpV1Job.Cancel()
	igmp.setV1Present(false)
}

// handleMembershipQuery handles a membership query.
//
// Precondition: igmp.ep.mu must be locked.
func (igmp *igmpState) handleMembershipQuery(groupAddress tcpip.Address, maxRespTime time.Duration) {
	// As per RFC 2236 Section 6, Page 10: If the maximum response time is zero
	// then change the state to note that an IGMPv1 router is present and
	// schedule the query received Job.
	if maxRespTime == 0 && igmp.Enabled() {
		igmp.igmpV1Job.Cancel()
		igmp.igmpV1Job.Schedule(v1RouterPresentTimeout)
		igmp.setV1Present(true)
		maxRespTime = v1MaxRespTime
	}
	igmp.genericMulticastProtocol.HandleQueryLocked(groupAddress, maxRespTime)
}

// handleMembershipReport handles a membership report.
//
// Precondition: igmp.ep.mu must be locked.
func (igmp *igmpState) handleMembershipReport(groupAddress tcpip.Address) {
	igmp.genericMulticastProtocol.HandleReportLocked(groupAddress)
}

// writePacket assembles and sends an IGMP packet.
//
// Precondition: igmp.ep.mu must be read locked.
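//
// As per RFC 2236 section 2, the IGMPv2 message written here is 8 octets on
// the wire: Type (1 octet), Max Resp Time (1 octet), Checksum (2 octets) and
// Group Address (4 octets), which is what header.IGMPReportMinimumSize is
// expected to cover.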
func (igmp *igmpState) writePacket(destAddress tcpip.Address, groupAddress tcpip.Address, igmpType header.IGMPType) (bool, tcpip.Error) {
	igmpData := header.IGMP(buffer.NewView(header.IGMPReportMinimumSize))
	igmpData.SetType(igmpType)
	igmpData.SetGroupAddress(groupAddress)
	igmpData.SetChecksum(header.IGMPCalculateChecksum(igmpData))

	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
		ReserveHeaderBytes: int(igmp.ep.MaxHeaderLength()),
		Data:               buffer.View(igmpData).ToVectorisedView(),
	})

	addressEndpoint := igmp.ep.acquireOutgoingPrimaryAddressRLocked(destAddress, false /* allowExpired */)
	if addressEndpoint == nil {
		return false, nil
	}
	localAddr := addressEndpoint.AddressWithPrefix().Address
	addressEndpoint.DecRef()
	addressEndpoint = nil
	if err := igmp.ep.addIPHeader(localAddr, destAddress, pkt, stack.NetworkHeaderParams{
		Protocol: header.IGMPProtocolNumber,
		TTL:      header.IGMPTTL,
		TOS:      stack.DefaultTOS,
	}, header.IPv4OptionsSerializer{
		&header.IPv4SerializableRouterAlertOption{},
	}); err != nil {
		panic(fmt.Sprintf("failed to add IP header: %s", err))
	}

	sentStats := igmp.ep.stats.igmp.packetsSent
	if err := igmp.ep.nic.WritePacketToRemote(header.EthernetAddressFromMulticastIPv4Address(destAddress), ProtocolNumber, pkt); err != nil {
		sentStats.dropped.Increment()
		return false, err
	}
	switch igmpType {
	case header.IGMPv1MembershipReport:
		sentStats.v1MembershipReport.Increment()
	case header.IGMPv2MembershipReport:
		sentStats.v2MembershipReport.Increment()
	case header.IGMPLeaveGroup:
		sentStats.leaveGroup.Increment()
	default:
		panic(fmt.Sprintf("unrecognized igmp type = %d", igmpType))
	}
	return true, nil
}

// joinGroup handles adding a new group to the membership map, setting up the
// IGMP state for the group, and sending and scheduling the required
// messages.
//
// If the group already exists in the membership map, returns
// *tcpip.ErrDuplicateAddress.
//
// Precondition: igmp.ep.mu must be locked.
func (igmp *igmpState) joinGroup(groupAddress tcpip.Address) {
	igmp.genericMulticastProtocol.JoinGroupLocked(groupAddress)
}

// isInGroup returns true if the specified group has been joined locally.
//
// Precondition: igmp.ep.mu must be read locked.
func (igmp *igmpState) isInGroup(groupAddress tcpip.Address) bool {
	return igmp.genericMulticastProtocol.IsLocallyJoinedRLocked(groupAddress)
}

// leaveGroup handles removing the group from the membership map, cancels any
// delay timers associated with that group, and sends the Leave Group message
// if required.
//
// Precondition: igmp.ep.mu must be locked.
func (igmp *igmpState) leaveGroup(groupAddress tcpip.Address) tcpip.Error {
	// LeaveGroup returns false only if the group was not joined.
	if igmp.genericMulticastProtocol.LeaveGroupLocked(groupAddress) {
		return nil
	}

	return &tcpip.ErrBadLocalAddress{}
}

// softLeaveAll leaves all groups from the perspective of IGMP, but remains
// joined locally.
//
// Precondition: igmp.ep.mu must be locked.
func (igmp *igmpState) softLeaveAll() {
	igmp.genericMulticastProtocol.MakeAllNonMemberLocked()
}

// initializeAll attempts to initialize the IGMP state for each group that has
// been joined locally.
//
// Precondition: igmp.ep.mu must be locked.
func (igmp *igmpState) initializeAll() {
	igmp.genericMulticastProtocol.InitializeGroupsLocked()
}

// sendQueuedReports attempts to send any reports that are queued for sending.
//
// Precondition: igmp.ep.mu must be locked.
func (igmp *igmpState) sendQueuedReports() {
	igmp.genericMulticastProtocol.SendQueuedReportsLocked()
}
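// For reference only: IGMP is typically switched on through this package's
// protocol options when the stack is constructed. A minimal sketch, assuming
// the Options type and NewProtocolWithOptions factory defined elsewhere in
// this package:
//
//	s := stack.New(stack.Options{
//		NetworkProtocols: []stack.NetworkProtocolFactory{
//			ipv4.NewProtocolWithOptions(ipv4.Options{
//				IGMP: ipv4.IGMPOptions{Enabled: true},
//			}),
//		},
//	})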
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pipe

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// This file contains types enabling the pipe package to be used with the vfs
// package.

// VFSPipe represents the actual pipe, analogous to an inode. VFSPipes should
// not be copied.
//
// +stateify savable
type VFSPipe struct {
	// mu protects the fields below.
	mu sync.Mutex `state:"nosave"`

	// pipe is the underlying pipe.
	pipe Pipe

	// Channels for synchronizing the creation of new readers and writers
	// of this fifo. See waitFor and newHandleLocked.
	//
	// These are not saved/restored because all waiters are unblocked on
	// save, and either automatically restart (via ERESTARTSYS) or return
	// EINTR on resume. On restarts via ERESTARTSYS, the appropriate
	// channel will be recreated.
	rWakeup chan struct{} `state:"nosave"`
	wWakeup chan struct{} `state:"nosave"`
}

// NewVFSPipe returns an initialized VFSPipe.
func NewVFSPipe(isNamed bool, sizeBytes int64) *VFSPipe {
	var vp VFSPipe
	initPipe(&vp.pipe, isNamed, sizeBytes)
	return &vp
}

// ReaderWriterPair returns read-only and write-only FDs for vp.
//
// Preconditions: statusFlags should not contain an open access mode.
func (vp *VFSPipe) ReaderWriterPair(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32) (*vfs.FileDescription, *vfs.FileDescription, error) {
	// Connected pipes share the same locks.
	locks := &vfs.FileLocks{}
	r, err := vp.newFD(mnt, vfsd, linux.O_RDONLY|statusFlags, locks)
	if err != nil {
		return nil, nil, err
	}
	w, err := vp.newFD(mnt, vfsd, linux.O_WRONLY|statusFlags, locks)
	if err != nil {
		r.DecRef(ctx)
		return nil, nil, err
	}
	return r, w, nil
}

// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (*VFSPipe) Allocate(context.Context, uint64, uint64, uint64) error {
	return linuxerr.ESPIPE
}

// Open opens the pipe represented by vp.
func (vp *VFSPipe) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
	vp.mu.Lock()
	defer vp.mu.Unlock()

	readable := vfs.MayReadFileWithOpenFlags(statusFlags)
	writable := vfs.MayWriteFileWithOpenFlags(statusFlags)
	if !readable && !writable {
		return nil, linuxerr.EINVAL
	}

	fd, err := vp.newFD(mnt, vfsd, statusFlags, locks)
	if err != nil {
		return nil, err
	}

	// Named pipes have special blocking semantics during open:
	//
	// "Normally, opening the FIFO blocks until the other end is opened also. A
	// process can open a FIFO in nonblocking mode. In this case, opening for
	// read-only will succeed even if no-one has opened on the write side yet,
	// opening for write-only will fail with ENXIO (no such device or address)
	// unless the other end has already been opened. Under Linux, opening a
	// FIFO for read and write will succeed both in blocking and nonblocking
	// mode. POSIX leaves this behavior undefined. This can be used to open a
	// FIFO for writing while there are no readers available." - fifo(7)
	switch {
	case readable && writable:
		// Pipes opened for read-write always succeed without blocking.
		newHandleLocked(&vp.rWakeup)
		newHandleLocked(&vp.wWakeup)

	case readable:
		newHandleLocked(&vp.rWakeup)
		// If this pipe is being opened as blocking and there's no
		// writer, we have to wait for a writer to open the other end.
		if vp.pipe.isNamed && statusFlags&linux.O_NONBLOCK == 0 && !vp.pipe.HasWriters() && !waitFor(&vp.mu, &vp.wWakeup, ctx) {
			fd.DecRef(ctx)
			return nil, syserror.EINTR
		}

	case writable:
		newHandleLocked(&vp.wWakeup)

		if vp.pipe.isNamed && !vp.pipe.HasReaders() {
			// Non-blocking, write-only opens fail with ENXIO when the read
			// side isn't open yet.
			if statusFlags&linux.O_NONBLOCK != 0 {
				fd.DecRef(ctx)
				return nil, linuxerr.ENXIO
			}
			// Wait for a reader to open the other end.
			if !waitFor(&vp.mu, &vp.rWakeup, ctx) {
				fd.DecRef(ctx)
				return nil, syserror.EINTR
			}
		}

	default:
		panic("invalid pipe flags: must be readable, writable, or both")
	}

	return fd, nil
}

// Preconditions: vp.mu must be held.
func (vp *VFSPipe) newFD(mnt *vfs.Mount, vfsd *vfs.Dentry, statusFlags uint32, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
	fd := &VFSPipeFD{
		pipe: &vp.pipe,
	}
	fd.LockFD.Init(locks)
	if err := fd.vfsfd.Init(fd, statusFlags, mnt, vfsd, &vfs.FileDescriptionOptions{
		DenyPRead:         true,
		DenyPWrite:        true,
		UseDentryMetadata: true,
	}); err != nil {
		return nil, err
	}

	switch {
	case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable():
		vp.pipe.rOpen()
		vp.pipe.wOpen()
	case fd.vfsfd.IsReadable():
		vp.pipe.rOpen()
	case fd.vfsfd.IsWritable():
		vp.pipe.wOpen()
	default:
		panic("invalid pipe flags: must be readable, writable, or both")
	}

	return &fd.vfsfd, nil
}

// VFSPipeFD implements vfs.FileDescriptionImpl for pipes. It also implements
// non-atomic usermem.IO methods, allowing it to be passed as usermem.IO to
// other FileDescriptions for splice(2) and tee(2).
//
// +stateify savable
type VFSPipeFD struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.LockFD

	pipe *Pipe
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *VFSPipeFD) Release(context.Context) {
	var event waiter.EventMask
	if fd.vfsfd.IsReadable() {
		fd.pipe.rClose()
		event |= waiter.WritableEvents
	}
	if fd.vfsfd.IsWritable() {
		fd.pipe.wClose()
		event |= waiter.ReadableEvents | waiter.EventHUp
	}
	if event == 0 {
		panic("invalid pipe flags: must be readable, writable, or both")
	}

	fd.pipe.Notify(event)
}

// Readiness implements waiter.Waitable.Readiness.
func (fd *VFSPipeFD) Readiness(mask waiter.EventMask) waiter.EventMask {
	switch {
	case fd.vfsfd.IsReadable() && fd.vfsfd.IsWritable():
		return fd.pipe.rwReadiness()
	case fd.vfsfd.IsReadable():
		return fd.pipe.rReadiness()
	case fd.vfsfd.IsWritable():
		return fd.pipe.wReadiness()
	default:
		panic("pipe FD is neither readable nor writable")
	}
}

// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (fd *VFSPipeFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
	return linuxerr.ESPIPE
}

// EventRegister implements waiter.Waitable.EventRegister.
func (fd *VFSPipeFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
	fd.pipe.EventRegister(e, mask)
}

// EventUnregister implements waiter.Waitable.EventUnregister.
func (fd *VFSPipeFD) EventUnregister(e *waiter.Entry) {
	fd.pipe.EventUnregister(e)
}

// Read implements vfs.FileDescriptionImpl.Read.
func (fd *VFSPipeFD) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
	return fd.pipe.Read(ctx, dst)
}

// Write implements vfs.FileDescriptionImpl.Write.
func (fd *VFSPipeFD) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
	return fd.pipe.Write(ctx, src)
}

// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
func (fd *VFSPipeFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	return fd.pipe.Ioctl(ctx, uio, args)
}

// PipeSize implements fcntl(F_GETPIPE_SZ).
func (fd *VFSPipeFD) PipeSize() int64 {
	// Inline Pipe.FifoSize() since we don't have a fs.File.
	fd.pipe.mu.Lock()
	defer fd.pipe.mu.Unlock()
	return fd.pipe.max
}

// SetPipeSize implements fcntl(F_SETPIPE_SZ).
func (fd *VFSPipeFD) SetPipeSize(size int64) (int64, error) {
	return fd.pipe.SetFifoSize(size)
}

// SpliceToNonPipe performs a splice operation from fd to a non-pipe file.
func (fd *VFSPipeFD) SpliceToNonPipe(ctx context.Context, out *vfs.FileDescription, off, count int64) (int64, error) {
	fd.pipe.mu.Lock()

	// Cap the sequence at number of bytes actually available.
	if count > fd.pipe.size {
		count = fd.pipe.size
	}
	src := usermem.IOSequence{
		IO:    fd,
		Addrs: hostarch.AddrRangeSeqOf(hostarch.AddrRange{0, hostarch.Addr(count)}),
	}

	var (
		n   int64
		err error
	)
	if off == -1 {
		n, err = out.Write(ctx, src, vfs.WriteOptions{})
	} else {
		n, err = out.PWrite(ctx, src, off, vfs.WriteOptions{})
	}
	if n > 0 {
		fd.pipe.consumeLocked(n)
	}

	fd.pipe.mu.Unlock()

	if n > 0 {
		fd.pipe.Notify(waiter.WritableEvents)
	}
	return n, err
}

// SpliceFromNonPipe performs a splice operation from a non-pipe file to fd.
func (fd *VFSPipeFD) SpliceFromNonPipe(ctx context.Context, in *vfs.FileDescription, off, count int64) (int64, error) {
	dst := usermem.IOSequence{
		IO:    fd,
		Addrs: hostarch.AddrRangeSeqOf(hostarch.AddrRange{0, hostarch.Addr(count)}),
	}

	var (
		n   int64
		err error
	)
	fd.pipe.mu.Lock()
	if off == -1 {
		n, err = in.Read(ctx, dst, vfs.ReadOptions{})
	} else {
		n, err = in.PRead(ctx, dst, off, vfs.ReadOptions{})
	}
	fd.pipe.mu.Unlock()

	if n > 0 {
		fd.pipe.Notify(waiter.ReadableEvents)
	}
	return n, err
}

// CopyIn implements usermem.IO.CopyIn. Note that it is the caller's
// responsibility to call fd.pipe.consumeLocked() and
// fd.pipe.Notify(waiter.WritableEvents) after the read is completed.
//
// Preconditions: fd.pipe.mu must be locked.
func (fd *VFSPipeFD) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
	n, err := fd.pipe.peekLocked(int64(len(dst)), func(srcs safemem.BlockSeq) (uint64, error) {
		return safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), srcs)
	})
	return int(n), err
}

// CopyOut implements usermem.IO.CopyOut. Note that it is the caller's
// responsibility to call fd.pipe.Notify(waiter.ReadableEvents) after the write
// is completed.
//
// Preconditions: fd.pipe.mu must be locked.
func (fd *VFSPipeFD) CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts usermem.IOOpts) (int, error) {
	n, err := fd.pipe.writeLocked(int64(len(src)), func(dsts safemem.BlockSeq) (uint64, error) {
		return safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src)))
	})
	return int(n), err
}

// ZeroOut implements usermem.IO.ZeroOut.
//
// Preconditions: fd.pipe.mu must be locked.
func (fd *VFSPipeFD) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
	n, err := fd.pipe.writeLocked(toZero, func(dsts safemem.BlockSeq) (uint64, error) {
		return safemem.ZeroSeq(dsts)
	})
	return n, err
}

// CopyInTo implements usermem.IO.CopyInTo. Note that it is the caller's
// responsibility to call fd.pipe.consumeLocked() and
// fd.pipe.Notify(waiter.WritableEvents) after the read is completed.
//
// Preconditions: fd.pipe.mu must be locked.
func (fd *VFSPipeFD) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
	return fd.pipe.peekLocked(ars.NumBytes(), func(srcs safemem.BlockSeq) (uint64, error) {
		return dst.WriteFromBlocks(srcs)
	})
}

// CopyOutFrom implements usermem.IO.CopyOutFrom. Note that it is the caller's
// responsibility to call fd.pipe.Notify(waiter.ReadableEvents) after the write
// is completed.
//
// Preconditions: fd.pipe.mu must be locked.
func (fd *VFSPipeFD) CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
	return fd.pipe.writeLocked(ars.NumBytes(), func(dsts safemem.BlockSeq) (uint64, error) {
		return src.ReadToBlocks(dsts)
	})
}

// SwapUint32 implements usermem.IO.SwapUint32.
func (fd *VFSPipeFD) SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts usermem.IOOpts) (uint32, error) {
	// How did a pipe get passed as the virtual address space to futex(2)?
	panic("VFSPipeFD.SwapUint32 called unexpectedly")
}

// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32.
func (fd *VFSPipeFD) CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) {
	panic("VFSPipeFD.CompareAndSwapUint32 called unexpectedly")
}

// LoadUint32 implements usermem.IO.LoadUint32.
func (fd *VFSPipeFD) LoadUint32(ctx context.Context, addr hostarch.Addr, opts usermem.IOOpts) (uint32, error) {
	panic("VFSPipeFD.LoadUint32 called unexpectedly")
}

// Splice reads up to count bytes from src and writes them to dst. It returns
// the number of bytes moved.
//
// Preconditions: count > 0.
func Splice(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
	return spliceOrTee(ctx, dst, src, count, true /* removeFromSrc */)
}

// Tee reads up to count bytes from src and writes them to dst, without
// removing the read bytes from src. It returns the number of bytes copied.
//
// Preconditions: count > 0.
func Tee(ctx context.Context, dst, src *VFSPipeFD, count int64) (int64, error) {
	return spliceOrTee(ctx, dst, src, count, false /* removeFromSrc */)
}

// Preconditions: count > 0.
func spliceOrTee(ctx context.Context, dst, src *VFSPipeFD, count int64, removeFromSrc bool) (int64, error) {
	if dst.pipe == src.pipe {
		return 0, linuxerr.EINVAL
	}

	lockTwoPipes(dst.pipe, src.pipe)
	n, err := dst.pipe.writeLocked(count, func(dsts safemem.BlockSeq) (uint64, error) {
		n, err := src.pipe.peekLocked(int64(dsts.NumBytes()), func(srcs safemem.BlockSeq) (uint64, error) {
			return safemem.CopySeq(dsts, srcs)
		})
		if n > 0 && removeFromSrc {
			src.pipe.consumeLocked(n)
		}
		return uint64(n), err
	})
	dst.pipe.mu.Unlock()
	src.pipe.mu.Unlock()

	if n > 0 {
		dst.pipe.Notify(waiter.ReadableEvents)
		if removeFromSrc {
			src.pipe.Notify(waiter.WritableEvents)
		}
	}
	return n, err
}
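// Note: lockTwoPipes (defined elsewhere in this package) is assumed to
// acquire the two pipe mutexes in a consistent (e.g. address-based) order, so
// that two concurrent splices between the same pair of pipes cannot take the
// locks in opposite orders and deadlock.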
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package ipv4 contains the implementation of the ipv4 network protocol.
package ipv4

import (
	"fmt"
	"math"
	"reflect"
	"sync/atomic"
	"time"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/header/parse"
	"gvisor.dev/gvisor/pkg/tcpip/network/hash"
	"gvisor.dev/gvisor/pkg/tcpip/network/internal/fragmentation"
	"gvisor.dev/gvisor/pkg/tcpip/network/internal/ip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

const (
	// ReassembleTimeout is the time a packet stays in the reassembly
	// system before being evicted.
	// As per RFC 791 section 3.2:
	//   The current recommendation for the initial timer setting is 15 seconds.
	//   This may be changed as experience with this protocol accumulates.
	//
	// Considering that it is an old recommendation, we use the same reassembly
	// timeout that linux defines, which is 30 seconds:
	// https://github.com/torvalds/linux/blob/47ec5303d73ea344e84f46660fff693c57641386/include/net/ip.h#L138
	ReassembleTimeout = 30 * time.Second

	// ProtocolNumber is the ipv4 protocol number.
	ProtocolNumber = header.IPv4ProtocolNumber

	// MaxTotalSize is maximum size that can be encoded in the 16-bit
	// TotalLength field of the ipv4 header.
	MaxTotalSize = 0xffff

	// DefaultTTL is the default time-to-live value for this endpoint.
	DefaultTTL = 64

	// buckets is the number of identifier buckets.
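	// Each datagram's ID is drawn from one of these independently incremented
	// counters; the counter is chosen by hashing the datagram's (source,
	// destination, protocol) tuple (see hashRoute), so unrelated flows advance
	// separate ID sequences.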
	buckets = 2048

	// The size of a fragment block, in bytes, as per RFC 791 section 3.1,
	// page 14.
	fragmentblockSize = 8
)

const (
	forwardingDisabled = 0
	forwardingEnabled  = 1
)

var ipv4BroadcastAddr = header.IPv4Broadcast.WithPrefix()

var _ stack.LinkResolvableNetworkEndpoint = (*endpoint)(nil)
var _ stack.ForwardingNetworkEndpoint = (*endpoint)(nil)
var _ stack.GroupAddressableEndpoint = (*endpoint)(nil)
var _ stack.AddressableEndpoint = (*endpoint)(nil)
var _ stack.NetworkEndpoint = (*endpoint)(nil)

type endpoint struct {
	nic        stack.NetworkInterface
	dispatcher stack.TransportDispatcher
	protocol   *protocol
	stats      sharedStats

	// enabled is set to 1 when the endpoint is enabled and 0 when it is
	// disabled.
	//
	// Must be accessed using atomic operations.
	enabled uint32

	// forwarding is set to forwardingEnabled when the endpoint has forwarding
	// enabled and forwardingDisabled when it is disabled.
	//
	// Must be accessed using atomic operations.
	forwarding uint32

	mu struct {
		sync.RWMutex

		addressableEndpointState stack.AddressableEndpointState
		igmp                     igmpState
	}
}

// HandleLinkResolutionFailure implements stack.LinkResolvableNetworkEndpoint.
func (e *endpoint) HandleLinkResolutionFailure(pkt *stack.PacketBuffer) {
	// If we are operating as a router, return an ICMP error to the original
	// packet's sender.
	if pkt.NetworkPacketInfo.IsForwardedPacket {
		// TODO(gvisor.dev/issue/6005): Propagate asynchronously generated ICMP
		// errors to local endpoints.
		e.protocol.returnError(&icmpReasonHostUnreachable{}, pkt)
		e.stats.ip.Forwarding.Errors.Increment()
		e.stats.ip.Forwarding.HostUnreachable.Increment()
		return
	}
	// handleControl expects the entire offending packet to be in the packet
	// buffer's data field.
	pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{
		Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()),
	})
	pkt.NICID = e.nic.ID()
	pkt.NetworkProtocolNumber = ProtocolNumber
	// Use the same control type as an ICMPv4 destination host unreachable error
	// since the host is considered unreachable if we cannot resolve the link
	// address to the next hop.
	e.handleControl(&icmpv4DestinationHostUnreachableSockError{}, pkt)
}

// NewEndpoint creates a new ipv4 endpoint.
func (p *protocol) NewEndpoint(nic stack.NetworkInterface, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint {
	e := &endpoint{
		nic:        nic,
		dispatcher: dispatcher,
		protocol:   p,
	}
	e.mu.Lock()
	e.mu.addressableEndpointState.Init(e)
	e.mu.igmp.init(e)
	e.mu.Unlock()

	tcpip.InitStatCounters(reflect.ValueOf(&e.stats.localStats).Elem())

	stackStats := p.stack.Stats()
	e.stats.ip.Init(&e.stats.localStats.IP, &stackStats.IP)
	e.stats.icmp.init(&e.stats.localStats.ICMP, &stackStats.ICMP.V4)
	e.stats.igmp.init(&e.stats.localStats.IGMP, &stackStats.IGMP)

	p.mu.Lock()
	p.mu.eps[nic.ID()] = e
	p.mu.Unlock()

	return e
}

func (p *protocol) findEndpointWithAddress(addr tcpip.Address) *endpoint {
	p.mu.RLock()
	defer p.mu.RUnlock()

	for _, e := range p.mu.eps {
		if addressEndpoint := e.AcquireAssignedAddress(addr, false /* allowTemp */, stack.NeverPrimaryEndpoint); addressEndpoint != nil {
			addressEndpoint.DecRef()
			return e
		}
	}

	return nil
}

func (p *protocol) forgetEndpoint(nicID tcpip.NICID) {
	p.mu.Lock()
	defer p.mu.Unlock()
	delete(p.mu.eps, nicID)
}

// Forwarding implements stack.ForwardingNetworkEndpoint.
func (e *endpoint) Forwarding() bool {
	return atomic.LoadUint32(&e.forwarding) == forwardingEnabled
}

// setForwarding sets the forwarding status for the endpoint.
//
// Returns true if the forwarding status was updated.
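//
// For example, calling setForwarding(true) when forwarding is already enabled
// swaps forwardingEnabled for forwardingEnabled and returns false, letting
// SetForwarding below skip the multicast group join/leave work.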
func (e *endpoint) setForwarding(v bool) bool {
	forwarding := uint32(forwardingDisabled)
	if v {
		forwarding = forwardingEnabled
	}

	return atomic.SwapUint32(&e.forwarding, forwarding) != forwarding
}

// SetForwarding implements stack.ForwardingNetworkEndpoint.
func (e *endpoint) SetForwarding(forwarding bool) {
	e.mu.Lock()
	defer e.mu.Unlock()

	if !e.setForwarding(forwarding) {
		return
	}

	if forwarding {
		// There does not seem to be an RFC requirement for a node to join the all
		// routers multicast address but
		// https://www.iana.org/assignments/multicast-addresses/multicast-addresses.xhtml
		// specifies the address as a group for all routers on a subnet so we join
		// the group here.
		if err := e.joinGroupLocked(header.IPv4AllRoutersGroup); err != nil {
			// joinGroupLocked only returns an error if the group address is not a
			// valid IPv4 multicast address.
			panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv4AllRoutersGroup, err))
		}

		return
	}

	switch err := e.leaveGroupLocked(header.IPv4AllRoutersGroup).(type) {
	case nil:
	case *tcpip.ErrBadLocalAddress:
		// The endpoint may have already left the multicast group.
	default:
		panic(fmt.Sprintf("e.leaveGroupLocked(%s): %s", header.IPv4AllRoutersGroup, err))
	}
}

// Enable implements stack.NetworkEndpoint.
func (e *endpoint) Enable() tcpip.Error {
	e.mu.Lock()
	defer e.mu.Unlock()

	// If the NIC is not enabled, the endpoint can't do anything meaningful so
	// don't enable the endpoint.
	if !e.nic.Enabled() {
		return &tcpip.ErrNotPermitted{}
	}

	// If the endpoint is already enabled, there is nothing for it to do.
	if !e.setEnabled(true) {
		return nil
	}

	// Create an endpoint to receive broadcast packets on this interface.
	ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(ipv4BroadcastAddr, stack.NeverPrimaryEndpoint, stack.AddressConfigStatic, false /* deprecated */)
	if err != nil {
		return err
	}
	// We have no need for the address endpoint.
	ep.DecRef()

	// Groups may have been joined while the endpoint was disabled, or the
	// endpoint may have left groups from the perspective of IGMP when the
	// endpoint was disabled. Either way, we need to let routers know to
	// send us multicast traffic.
	e.mu.igmp.initializeAll()

	// As per RFC 1122 section 3.3.7, all hosts should join the all-hosts
	// multicast group. Note, the IANA calls the all-hosts multicast group the
	// all-systems multicast group.
	if err := e.joinGroupLocked(header.IPv4AllSystems); err != nil {
		// joinGroupLocked only returns an error if the group address is not a valid
		// IPv4 multicast address.
		panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv4AllSystems, err))
	}

	return nil
}

// Enabled implements stack.NetworkEndpoint.
func (e *endpoint) Enabled() bool {
	return e.nic.Enabled() && e.isEnabled()
}

// isEnabled returns true if the endpoint is enabled, regardless of the
// enabled status of the NIC.
func (e *endpoint) isEnabled() bool {
	return atomic.LoadUint32(&e.enabled) == 1
}

// setEnabled sets the enabled status for the endpoint.
//
// Returns true if the enabled status was updated.
func (e *endpoint) setEnabled(v bool) bool {
	if v {
		return atomic.SwapUint32(&e.enabled, 1) == 0
	}
	return atomic.SwapUint32(&e.enabled, 0) == 1
}

// Disable implements stack.NetworkEndpoint.
func (e *endpoint) Disable() {
	e.mu.Lock()
	defer e.mu.Unlock()
	e.disableLocked()
}

func (e *endpoint) disableLocked() {
	if !e.isEnabled() {
		return
	}

	// The endpoint may have already left the multicast group.
	switch err := e.leaveGroupLocked(header.IPv4AllSystems).(type) {
	case nil, *tcpip.ErrBadLocalAddress:
	default:
		panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv4AllSystems, err))
	}

	// Leave groups from the perspective of IGMP so that routers know that
	// we are no longer interested in the group.
	e.mu.igmp.softLeaveAll()

	// The address may have already been removed.
	switch err := e.mu.addressableEndpointState.RemovePermanentAddress(ipv4BroadcastAddr.Address); err.(type) {
	case nil, *tcpip.ErrBadLocalAddress:
	default:
		panic(fmt.Sprintf("unexpected error when removing address = %s: %s", ipv4BroadcastAddr.Address, err))
	}

	// Reset the IGMP V1 present flag.
	//
	// If the node comes back up on the same network, it will re-learn that it
	// needs to perform IGMPv1.
	e.mu.igmp.resetV1Present()

	if !e.setEnabled(false) {
		panic("should have only done work to disable the endpoint if it was enabled")
	}
}

// DefaultTTL is the default time-to-live value for this endpoint.
func (e *endpoint) DefaultTTL() uint8 {
	return e.protocol.DefaultTTL()
}

// MTU implements stack.NetworkEndpoint. It returns the link-layer MTU minus the
// network layer max header length.
func (e *endpoint) MTU() uint32 {
	networkMTU, err := calculateNetworkMTU(e.nic.MTU(), header.IPv4MinimumSize)
	if err != nil {
		return 0
	}
	return networkMTU
}

// MaxHeaderLength returns the maximum length needed by ipv4 headers (and
// underlying protocols).
func (e *endpoint) MaxHeaderLength() uint16 {
	return e.nic.MaxHeaderLength() + header.IPv4MaximumHeaderSize
}

// NetworkProtocolNumber implements stack.NetworkEndpoint.
func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
	return e.protocol.Number()
}

func (e *endpoint) addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, options header.IPv4OptionsSerializer) tcpip.Error {
	hdrLen := header.IPv4MinimumSize
	var optLen int
	if options != nil {
		optLen = int(options.Length())
	}
	hdrLen += optLen
	if hdrLen > header.IPv4MaximumHeaderSize {
		return &tcpip.ErrMessageTooLong{}
	}
	ipH := header.IPv4(pkt.NetworkHeader().Push(hdrLen))
	length := pkt.Size()
	if length > math.MaxUint16 {
		return &tcpip.ErrMessageTooLong{}
	}
	// RFC 6864 section 4.3 mandates uniqueness of ID values for non-atomic
	// datagrams. Since the DF bit is never being set here, all datagrams
	// are non-atomic and need an ID.
	id := atomic.AddUint32(&e.protocol.ids[hashRoute(srcAddr, dstAddr, params.Protocol, e.protocol.hashIV)%buckets], 1)
	ipH.Encode(&header.IPv4Fields{
		TotalLength: uint16(length),
		ID:          uint16(id),
		TTL:         params.TTL,
		TOS:         params.TOS,
		Protocol:    uint8(params.Protocol),
		SrcAddr:     srcAddr,
		DstAddr:     dstAddr,
		Options:     options,
	})
	ipH.SetChecksum(^ipH.CalculateChecksum())
	pkt.NetworkProtocolNumber = ProtocolNumber
	return nil
}

// handleFragments fragments pkt and calls the handler function on each
// fragment. It returns the number of fragments handled and the number of
// fragments left to be processed. The IP header must already be present in the
// original packet.
func (e *endpoint) handleFragments(_ *stack.Route, networkMTU uint32, pkt *stack.PacketBuffer, handler func(*stack.PacketBuffer) tcpip.Error) (int, int, tcpip.Error) {
	// Round the MTU down to align to 8 bytes.
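	// For example, a typical Ethernet network MTU of 1500 bytes rounds down to
	// 1500 &^ 7 = 1496 bytes of payload per fragment, a multiple of the 8-byte
	// fragment block size.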
	fragmentPayloadSize := networkMTU &^ 7
	networkHeader := header.IPv4(pkt.NetworkHeader().View())
	pf := fragmentation.MakePacketFragmenter(pkt, fragmentPayloadSize, pkt.AvailableHeaderBytes()+len(networkHeader))

	var n int
	for {
		fragPkt, more := buildNextFragment(&pf, networkHeader)
		if err := handler(fragPkt); err != nil {
			return n, pf.RemainingFragmentCount() + 1, err
		}
		n++
		if !more {
			return n, pf.RemainingFragmentCount(), nil
		}
	}
}

// WritePacket writes a packet to the given destination address and protocol.
func (e *endpoint) WritePacket(r *stack.Route, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) tcpip.Error {
	if err := e.addIPHeader(r.LocalAddress(), r.RemoteAddress(), pkt, params, nil /* options */); err != nil {
		return err
	}

	// iptables filtering. All packets that reach here are locally
	// generated.
	outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
	if ok := e.protocol.stack.IPTables().Check(stack.Output, pkt, r, "" /* preroutingAddr */, "" /* inNicName */, outNicName); !ok {
		// iptables is telling us to drop the packet.
		e.stats.ip.IPTablesOutputDropped.Increment()
		return nil
	}

	// If the packet is manipulated as per NAT Output rules, handle packet
	// based on destination address and do not send the packet to link
	// layer.
	//
	// We should do this for every packet, rather than only NATted packets, but
	// removing this check short circuits broadcasts before they are sent out to
	// other hosts.
	if pkt.NatDone {
		netHeader := header.IPv4(pkt.NetworkHeader().View())
		if ep := e.protocol.findEndpointWithAddress(netHeader.DestinationAddress()); ep != nil {
			// Since we rewrote the packet but it is being routed back to us, we
			// can safely assume the checksum is valid.
			ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */)
			return nil
		}
	}

	return e.writePacket(r, pkt, false /* headerIncluded */)
}

func (e *endpoint) writePacket(r *stack.Route, pkt *stack.PacketBuffer, headerIncluded bool) tcpip.Error {
	if r.Loop()&stack.PacketLoop != 0 {
		// If the packet was generated by the stack (not a raw/packet endpoint
		// where a packet may be written with the header included), then we can
		// safely assume the checksum is valid.
		e.handleLocalPacket(pkt, !headerIncluded /* canSkipRXChecksum */)
	}
	if r.Loop()&stack.PacketOut == 0 {
		return nil
	}

	// Postrouting NAT can only change the source address, and does not alter the
	// route or outgoing interface of the packet.
	outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
	if ok := e.protocol.stack.IPTables().Check(stack.Postrouting, pkt, r, "" /* preroutingAddr */, "" /* inNicName */, outNicName); !ok {
		// iptables is telling us to drop the packet.
		e.stats.ip.IPTablesPostroutingDropped.Increment()
		return nil
	}

	stats := e.stats.ip

	networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size()))
	if err != nil {
		stats.OutgoingPacketErrors.Increment()
		return err
	}

	if packetMustBeFragmented(pkt, networkMTU) {
		h := header.IPv4(pkt.NetworkHeader().View())
		if h.Flags()&header.IPv4FlagDontFragment != 0 && pkt.NetworkPacketInfo.IsForwardedPacket {
			// TODO(gvisor.dev/issue/5919): Handle error condition in which DontFragment
			// is set but the packet must be fragmented for the non-forwarding case.
			return &tcpip.ErrMessageTooLong{}
		}
		sent, remain, err := e.handleFragments(r, networkMTU, pkt, func(fragPkt *stack.PacketBuffer) tcpip.Error {
			// TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each
			// fragment one by one using WritePacket() (current strategy) or if we
			// want to create a PacketBufferList from the fragments and feed it to
			// WritePackets(). It'll be faster but cost more memory.
			return e.nic.WritePacket(r, ProtocolNumber, fragPkt)
		})
		stats.PacketsSent.IncrementBy(uint64(sent))
		stats.OutgoingPacketErrors.IncrementBy(uint64(remain))
		return err
	}

	if err := e.nic.WritePacket(r, ProtocolNumber, pkt); err != nil {
		stats.OutgoingPacketErrors.Increment()
		return err
	}
	stats.PacketsSent.Increment()
	return nil
}

// WritePackets implements stack.NetworkEndpoint.
func (e *endpoint) WritePackets(r *stack.Route, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, tcpip.Error) {
	if r.Loop()&stack.PacketLoop != 0 {
		panic("multiple packets in local loop")
	}
	if r.Loop()&stack.PacketOut == 0 {
		return pkts.Len(), nil
	}

	stats := e.stats.ip

	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
		if err := e.addIPHeader(r.LocalAddress(), r.RemoteAddress(), pkt, params, nil /* options */); err != nil {
			return 0, err
		}

		networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size()))
		if err != nil {
			stats.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len()))
			return 0, err
		}

		if packetMustBeFragmented(pkt, networkMTU) {
			// Keep track of the packet that is about to be fragmented so it can be
			// removed once the fragmentation is done.
			originalPkt := pkt
			if _, _, err := e.handleFragments(r, networkMTU, pkt, func(fragPkt *stack.PacketBuffer) tcpip.Error {
				// Modify the packet list in place with the new fragments.
				pkts.InsertAfter(pkt, fragPkt)
				pkt = fragPkt
				return nil
			}); err != nil {
				panic(fmt.Sprintf("e.handleFragments(_, _, %d, _, _) = %s", networkMTU, err))
			}
			// Remove the packet that was just fragmented and process the rest.
			pkts.Remove(originalPkt)
		}
	}

	outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID())
	// iptables filtering. All packets that reach here are locally
	// generated.
	outputDropped, natPkts := e.protocol.stack.IPTables().CheckPackets(stack.Output, pkts, r, "" /* inNicName */, outNicName)
	stats.IPTablesOutputDropped.IncrementBy(uint64(len(outputDropped)))
	for pkt := range outputDropped {
		pkts.Remove(pkt)
	}

	// The NAT-ed packets may now be destined for us.
	locallyDelivered := 0
	for pkt := range natPkts {
		ep := e.protocol.findEndpointWithAddress(header.IPv4(pkt.NetworkHeader().View()).DestinationAddress())
		if ep == nil {
			// The NAT-ed packet is still destined for some remote node.
			continue
		}

		// Do not send the locally destined packet out the NIC.
		pkts.Remove(pkt)

		// Deliver the packet locally.
		ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */)
		locallyDelivered++
	}

	// We ignore the list of NAT-ed packets here because Postrouting NAT can only
	// change the source address, and does not alter the route or outgoing
	// interface of the packet.
	postroutingDropped, _ := e.protocol.stack.IPTables().CheckPackets(stack.Postrouting, pkts, r, "" /* inNicName */, outNicName)
	stats.IPTablesPostroutingDropped.IncrementBy(uint64(len(postroutingDropped)))
	for pkt := range postroutingDropped {
		pkts.Remove(pkt)
	}

	// The rest of the packets can be delivered to the NIC as a batch.
pktsLen := pkts.Len() written, err := e.nic.WritePackets(r, pkts, ProtocolNumber) stats.PacketsSent.IncrementBy(uint64(written)) stats.OutgoingPacketErrors.IncrementBy(uint64(pktsLen - written)) // Dropped packets aren't errors, so include them in the return value. return locallyDelivered + written + len(outputDropped) + len(postroutingDropped), err } // WriteHeaderIncludedPacket implements stack.NetworkEndpoint. func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) tcpip.Error { // The packet already has an IP header, but there are a few required // checks. h, ok := pkt.Data().PullUp(header.IPv4MinimumSize) if !ok { return &tcpip.ErrMalformedHeader{} } hdrLen := header.IPv4(h).HeaderLength() if hdrLen < header.IPv4MinimumSize { return &tcpip.ErrMalformedHeader{} } h, ok = pkt.Data().PullUp(int(hdrLen)) if !ok { return &tcpip.ErrMalformedHeader{} } ipH := header.IPv4(h) // Always set the total length. pktSize := pkt.Data().Size() ipH.SetTotalLength(uint16(pktSize)) // Set the source address when zero. if ipH.SourceAddress() == header.IPv4Any { ipH.SetSourceAddress(r.LocalAddress()) } // Set the packet ID when zero. if ipH.ID() == 0 { // RFC 6864 section 4.3 mandates uniqueness of ID values for // non-atomic datagrams, so assign an ID to all such datagrams // according to the definition given in RFC 6864 section 4. if ipH.Flags()&header.IPv4FlagDontFragment == 0 || ipH.Flags()&header.IPv4FlagMoreFragments != 0 || ipH.FragmentOffset() > 0 { ipH.SetID(uint16(atomic.AddUint32(&e.protocol.ids[hashRoute(r.LocalAddress(), r.RemoteAddress(), 0 /* protocol */, e.protocol.hashIV)%buckets], 1))) } } // Always set the checksum. ipH.SetChecksum(0) ipH.SetChecksum(^ipH.CalculateChecksum()) // Populate the packet buffer's network header and don't allow an invalid // packet to be sent. // // Note that parsing only makes sure that the packet is well formed as per the // wire format. We also want to check if the header's fields are valid before // sending the packet. if !parse.IPv4(pkt) || !header.IPv4(pkt.NetworkHeader().View()).IsValid(pktSize) { return &tcpip.ErrMalformedHeader{} } return e.writePacket(r, pkt, true /* headerIncluded */) } // forwardPacket attempts to forward a packet to its final destination. func (e *endpoint) forwardPacket(pkt *stack.PacketBuffer) ip.ForwardingError { h := header.IPv4(pkt.NetworkHeader().View()) dstAddr := h.DestinationAddress() // As per RFC 3927 section 7, // // A router MUST NOT forward a packet with an IPv4 Link-Local source or // destination address, irrespective of the router's default route // configuration or routes obtained from dynamic routing protocols. // // A router which receives a packet with an IPv4 Link-Local source or // destination address MUST NOT forward the packet. This prevents // forwarding of packets back onto the network segment from which they // originated, or to any other segment. if header.IsV4LinkLocalUnicastAddress(h.SourceAddress()) { return &ip.ErrLinkLocalSourceAddress{} } if header.IsV4LinkLocalUnicastAddress(dstAddr) || header.IsV4LinkLocalMulticastAddress(dstAddr) { return &ip.ErrLinkLocalDestinationAddress{} } ttl := h.TTL() if ttl == 0 { // As per RFC 792 page 6, Time Exceeded Message, // // If the gateway processing a datagram finds the time to live field // is zero it must discard the datagram. The gateway may also notify // the source host via the time exceeded message. 
// // We return the original error rather than the result of returning // the ICMP packet because the original error is more relevant to // the caller. _ = e.protocol.returnError(&icmpReasonTTLExceeded{}, pkt) return &ip.ErrTTLExceeded{} } if opts := h.Options(); len(opts) != 0 { newOpts, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageForward{}) if optProblem != nil { if optProblem.NeedICMP { _ = e.protocol.returnError(&icmpReasonParamProblem{ pointer: optProblem.Pointer, forwarding: true, }, pkt) } return &ip.ErrParameterProblem{} } copied := copy(opts, newOpts) if copied != len(newOpts) { panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOpts))) } // Since in forwarding we handle all options, including copying those we // do not recognise, the options region should remain the same size which // simplifies processing. As we MAY receive a packet with a lot of padded // bytes after the "end of options list" byte, make sure we copy // them as the legal padding value (0). for i := copied; i < len(opts); i++ { // Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero". opts[i] = byte(header.IPv4OptionListEndType) } } stk := e.protocol.stack // Check if the destination is owned by the stack. if ep := e.protocol.findEndpointWithAddress(dstAddr); ep != nil { inNicName := stk.FindNICNameFromID(e.nic.ID()) outNicName := stk.FindNICNameFromID(ep.nic.ID()) if ok := stk.IPTables().Check(stack.Forward, pkt, nil, "" /* preroutingAddr */, inNicName, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesForwardDropped.Increment() return nil } // The packet originally arrived on e so provide its NIC as the input NIC. ep.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */) return nil } r, err := stk.FindRoute(0, "", dstAddr, ProtocolNumber, false /* multicastLoop */) switch err.(type) { case nil: case *tcpip.ErrNoRoute, *tcpip.ErrNetworkUnreachable: // We return the original error rather than the result of returning // the ICMP packet because the original error is more relevant to // the caller. _ = e.protocol.returnError(&icmpReasonNetworkUnreachable{}, pkt) return &ip.ErrNoRoute{} default: return &ip.ErrOther{Err: err} } defer r.Release() inNicName := stk.FindNICNameFromID(e.nic.ID()) outNicName := stk.FindNICNameFromID(r.NICID()) if ok := stk.IPTables().Check(stack.Forward, pkt, nil, "" /* preroutingAddr */, inNicName, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesForwardDropped.Increment() return nil } // We need to do a deep copy of the IP packet because // WriteHeaderIncludedPacket takes ownership of the packet buffer, but we do // not own it. newHdr := header.IPv4(stack.PayloadSince(pkt.NetworkHeader())) // As per RFC 791 page 30, Time to Live, // // This field must be decreased at each point that the internet header // is processed to reflect the time spent processing the datagram. // Even if no local information is available on the time actually // spent, the field must be decremented by 1. newHdr.SetTTL(ttl - 1) switch err := r.WriteHeaderIncludedPacket(stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(r.MaxHeaderLength()), Data: buffer.View(newHdr).ToVectorisedView(), IsForwardedPacket: true, })); err.(type) { case nil: return nil case *tcpip.ErrMessageTooLong: // As per RFC 792, page 4, Destination Unreachable: // // Another case is when a datagram must be fragmented to be forwarded by a // gateway yet the Don't Fragment flag is on. 
In this case the gateway must // discard the datagram and may return a destination unreachable message. // // WriteHeaderIncludedPacket checks for the presence of the Don't Fragment bit // while sending the packet and returns this error iff fragmentation is // necessary and the bit is also set. _ = e.protocol.returnError(&icmpReasonFragmentationNeeded{}, pkt) return &ip.ErrMessageTooLong{} default: return &ip.ErrOther{Err: err} } } // HandlePacket is called by the link layer when new ipv4 packets arrive for // this endpoint. func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { stats := e.stats.ip stats.PacketsReceived.Increment() if !e.isEnabled() { stats.DisabledPacketsReceived.Increment() return } h, ok := e.protocol.parseAndValidate(pkt) if !ok { stats.MalformedPacketsReceived.Increment() return } if !e.nic.IsLoopback() { if !e.protocol.options.AllowExternalLoopbackTraffic { if header.IsV4LoopbackAddress(h.SourceAddress()) { stats.InvalidSourceAddressesReceived.Increment() return } if header.IsV4LoopbackAddress(h.DestinationAddress()) { stats.InvalidDestinationAddressesReceived.Increment() return } } if e.protocol.stack.HandleLocal() { addressEndpoint := e.AcquireAssignedAddress(header.IPv4(pkt.NetworkHeader().View()).SourceAddress(), e.nic.Promiscuous(), stack.CanBePrimaryEndpoint) if addressEndpoint != nil { addressEndpoint.DecRef() // The source address is one of our own, so we never should have gotten // a packet like this unless HandleLocal is false or our NIC is the // loopback interface. stats.InvalidSourceAddressesReceived.Increment() return } } // Loopback traffic skips the prerouting chain. inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) if ok := e.protocol.stack.IPTables().Check(stack.Prerouting, pkt, nil, e.MainAddress().Address, inNicName, "" /* outNicName */); !ok { // iptables is telling us to drop the packet. stats.IPTablesPreroutingDropped.Increment() return } } e.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */) } // handleLocalPacket is like HandlePacket except it does not perform the // prerouting iptables hook or check for loopback traffic that originated from // outside of the netstack (i.e. martian loopback packets). func (e *endpoint) handleLocalPacket(pkt *stack.PacketBuffer, canSkipRXChecksum bool) { stats := e.stats.ip stats.PacketsReceived.Increment() pkt = pkt.CloneToInbound() pkt.RXTransportChecksumValidated = canSkipRXChecksum h, ok := e.protocol.parseAndValidate(pkt) if !ok { stats.MalformedPacketsReceived.Increment() return } e.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */) } func (e *endpoint) handleValidatedPacket(h header.IPv4, pkt *stack.PacketBuffer, inNICName string) { // Raw socket packets are delivered based solely on the transport protocol // number. We only require that the packet be valid IPv4, and that they not // be fragmented. if !h.More() && h.FragmentOffset() == 0 { e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt) } pkt.NICID = e.nic.ID() stats := e.stats stats.ip.ValidPacketsReceived.Increment() srcAddr := h.SourceAddress() dstAddr := h.DestinationAddress() // As per RFC 1122 section 3.2.1.3: // When a host sends any datagram, the IP source address MUST // be one of its own IP addresses (but not a broadcast or // multicast address). if srcAddr == header.IPv4Broadcast || header.IsV4MulticastAddress(srcAddr) { stats.ip.InvalidSourceAddressesReceived.Increment() return } // Make sure the source address is not a subnet-local broadcast address. 
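// AcquireAssignedAddress also matches an assigned subnet's broadcast // address, so check whether the returned endpoint's subnet considers the // source a broadcast address.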
if addressEndpoint := e.AcquireAssignedAddress(srcAddr, false /* createTemp */, stack.NeverPrimaryEndpoint); addressEndpoint != nil { subnet := addressEndpoint.Subnet() addressEndpoint.DecRef() if subnet.IsBroadcast(srcAddr) { stats.ip.InvalidSourceAddressesReceived.Increment() return } } // Before we do any processing, note if the packet was received as some // sort of broadcast. The destination address should be an address we own // or a group we joined. if addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint); addressEndpoint != nil { subnet := addressEndpoint.AddressWithPrefix().Subnet() addressEndpoint.DecRef() pkt.NetworkPacketInfo.LocalAddressBroadcast = subnet.IsBroadcast(dstAddr) || dstAddr == header.IPv4Broadcast } else if !e.IsInGroup(dstAddr) { if !e.Forwarding() { stats.ip.InvalidDestinationAddressesReceived.Increment() return } switch err := e.forwardPacket(pkt); err.(type) { case nil: return case *ip.ErrLinkLocalSourceAddress: stats.ip.Forwarding.LinkLocalSource.Increment() case *ip.ErrLinkLocalDestinationAddress: stats.ip.Forwarding.LinkLocalDestination.Increment() case *ip.ErrTTLExceeded: stats.ip.Forwarding.ExhaustedTTL.Increment() case *ip.ErrNoRoute: stats.ip.Forwarding.Unrouteable.Increment() case *ip.ErrParameterProblem: stats.ip.MalformedPacketsReceived.Increment() case *ip.ErrMessageTooLong: stats.ip.Forwarding.PacketTooBig.Increment() default: panic(fmt.Sprintf("unexpected error %s while trying to forward packet: %#v", err, pkt)) } stats.ip.Forwarding.Errors.Increment() return } // iptables filtering. All packets that reach here are intended for // this machine and will not be forwarded. if ok := e.protocol.stack.IPTables().Check(stack.Input, pkt, nil, "" /* preroutingAddr */, inNICName, "" /* outNicName */); !ok { // iptables is telling us to drop the packet. stats.ip.IPTablesInputDropped.Increment() return } if h.More() || h.FragmentOffset() != 0 { if pkt.Data().Size()+pkt.TransportHeader().View().Size() == 0 { // Drop the packet as it's marked as a fragment but has // no payload. stats.ip.MalformedPacketsReceived.Increment() stats.ip.MalformedFragmentsReceived.Increment() return } if opts := h.Options(); len(opts) != 0 { // If there are options we need to check them before we do assembly // or we could be assembling errant packets. However we do not change the // options as that could lead to double processing later. if _, _, optProblem := e.processIPOptions(pkt, opts, &optionUsageVerify{}); optProblem != nil { if optProblem.NeedICMP { _ = e.protocol.returnError(&icmpReasonParamProblem{ pointer: optProblem.Pointer, }, pkt) e.stats.ip.MalformedPacketsReceived.Increment() } return } } // The packet is a fragment, let's try to reassemble it. start := h.FragmentOffset() // Drop the fragment if the size of the reassembled payload would exceed the // maximum payload size. // // Note that this addition doesn't overflow even on 32bit architecture // because pkt.Data().Size() should not exceed 65535 (the max IP datagram // size). Otherwise the packet would've been rejected as invalid before // reaching here. if int(start)+pkt.Data().Size() > header.IPv4MaximumPayloadSize { stats.ip.MalformedPacketsReceived.Increment() stats.ip.MalformedFragmentsReceived.Increment() return } proto := h.Protocol() resPkt, _, ready, err := e.protocol.fragmentation.Process( // As per RFC 791 section 2.3, the identification value is unique // for a source-destination pair and protocol. 
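// The first and last byte offsets of this fragment within the full // datagram are both inclusive, hence the -1 in the last-offset argument // below.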
fragmentation.FragmentID{ Source: h.SourceAddress(), Destination: h.DestinationAddress(), ID: uint32(h.ID()), Protocol: proto, }, start, start+uint16(pkt.Data().Size())-1, h.More(), proto, pkt, ) if err != nil { stats.ip.MalformedPacketsReceived.Increment() stats.ip.MalformedFragmentsReceived.Increment() return } if !ready { return } pkt = resPkt h = header.IPv4(pkt.NetworkHeader().View()) // The reassembler doesn't take care of fixing up the header, so we need // to do it here. h.SetTotalLength(uint16(pkt.Data().Size() + len(h))) h.SetFlagsFragmentOffset(0, 0) // Now that the packet is reassembled, it can be sent to raw sockets. e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt) } stats.ip.PacketsDelivered.Increment() p := h.TransportProtocol() if p == header.ICMPv4ProtocolNumber { // TODO(gvisor.dev/issues/3810): when we sort out ICMP and transport // headers, the setting of the transport number here should be // unnecessary and removed. pkt.TransportProtocolNumber = p e.handleICMP(pkt) return } // ICMP processes options itself, so handle options here for all remaining // destinations. var hasRouterAlertOption bool if opts := h.Options(); len(opts) != 0 { newOpts, processedOpts, optProblem := e.processIPOptions(pkt, opts, &optionUsageReceive{}) if optProblem != nil { if optProblem.NeedICMP { _ = e.protocol.returnError(&icmpReasonParamProblem{ pointer: optProblem.Pointer, }, pkt) stats.ip.MalformedPacketsReceived.Increment() } return } hasRouterAlertOption = processedOpts.routerAlert copied := copy(opts, newOpts) if copied != len(newOpts) { panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOpts))) } for i := copied; i < len(opts); i++ { // Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero". opts[i] = byte(header.IPv4OptionListEndType) } } if p == header.IGMPProtocolNumber { e.mu.Lock() e.mu.igmp.handleIGMP(pkt, hasRouterAlertOption) e.mu.Unlock() return } switch res := e.dispatcher.DeliverTransportPacket(p, pkt); res { case stack.TransportPacketHandled: case stack.TransportPacketDestinationPortUnreachable: // As per RFC 1122 Section 3.2.2.1, a host SHOULD generate Destination // Unreachable messages with code: // 3 (Port Unreachable), when the designated transport protocol // (e.g., UDP) is unable to demultiplex the datagram but has no // protocol mechanism to inform the sender. _ = e.protocol.returnError(&icmpReasonPortUnreachable{}, pkt) case stack.TransportPacketProtocolUnreachable: // As per RFC 1122 Section 3.2.2.1, // a host SHOULD generate Destination Unreachable messages with code: // 2 (Protocol Unreachable), when the designated transport protocol // is not supported. _ = e.protocol.returnError(&icmpReasonProtoUnreachable{}, pkt) default: panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res)) } } // Close cleans up resources associated with the endpoint. func (e *endpoint) Close() { e.mu.Lock() e.disableLocked() e.mu.addressableEndpointState.Cleanup() e.mu.Unlock() e.protocol.forgetEndpoint(e.nic.ID()) } // AddAndAcquirePermanentAddress implements stack.AddressableEndpoint.
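// // On success, any queued IGMP reports are sent now that the endpoint has // an assigned address.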
func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() ep, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, peb, configType, deprecated) if err == nil { e.mu.igmp.sendQueuedReports() } return ep, err } // RemovePermanentAddress implements stack.AddressableEndpoint. func (e *endpoint) RemovePermanentAddress(addr tcpip.Address) tcpip.Error { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.RemovePermanentAddress(addr) } // MainAddress implements stack.AddressableEndpoint. func (e *endpoint) MainAddress() tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.MainAddress() } // AcquireAssignedAddress implements stack.AddressableEndpoint. func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint { e.mu.RLock() defer e.mu.RUnlock() loopback := e.nic.IsLoopback() return e.mu.addressableEndpointState.AcquireAssignedAddressOrMatching(localAddr, func(addressEndpoint stack.AddressEndpoint) bool { subnet := addressEndpoint.Subnet() // IPv4 has a notion of a subnet broadcast address and considers the // loopback interface bound to an address's whole subnet (on Linux). return subnet.IsBroadcast(localAddr) || (loopback && subnet.Contains(localAddr)) }, allowTemp, tempPEB) } // AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint. func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint { e.mu.RLock() defer e.mu.RUnlock() return e.acquireOutgoingPrimaryAddressRLocked(remoteAddr, allowExpired) } // acquireOutgoingPrimaryAddressRLocked is like AcquireOutgoingPrimaryAddress // but with locking requirements. // // Precondition: e.mu must be read locked. func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint { return e.mu.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, allowExpired) } // PrimaryAddresses implements stack.AddressableEndpoint. func (e *endpoint) PrimaryAddresses() []tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.PrimaryAddresses() } // PermanentAddresses implements stack.AddressableEndpoint. func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.PermanentAddresses() } // JoinGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) JoinGroup(addr tcpip.Address) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() return e.joinGroupLocked(addr) } // joinGroupLocked is like JoinGroup but with locking requirements. // // Precondition: e.mu must be locked. func (e *endpoint) joinGroupLocked(addr tcpip.Address) tcpip.Error { if !header.IsV4MulticastAddress(addr) { return &tcpip.ErrBadAddress{} } e.mu.igmp.joinGroup(addr) return nil } // LeaveGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) LeaveGroup(addr tcpip.Address) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() return e.leaveGroupLocked(addr) } // leaveGroupLocked is like LeaveGroup but with locking requirements. // // Precondition: e.mu must be locked.
func (e *endpoint) leaveGroupLocked(addr tcpip.Address) tcpip.Error { return e.mu.igmp.leaveGroup(addr) } // IsInGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) IsInGroup(addr tcpip.Address) bool { e.mu.RLock() defer e.mu.RUnlock() return e.mu.igmp.isInGroup(addr) } // Stats implements stack.NetworkEndpoint. func (e *endpoint) Stats() stack.NetworkEndpointStats { return &e.stats.localStats } var _ stack.NetworkProtocol = (*protocol)(nil) var _ fragmentation.TimeoutHandler = (*protocol)(nil) type protocol struct { stack *stack.Stack mu struct { sync.RWMutex // eps is keyed by NICID to allow protocol methods to retrieve an endpoint // when handling a packet, by looking at which NIC handled the packet. eps map[tcpip.NICID]*endpoint } // defaultTTL is the current default TTL for the protocol. Only the // uint8 portion of it is meaningful. // // Must be accessed using atomic operations. defaultTTL uint32 ids []uint32 hashIV uint32 fragmentation *fragmentation.Fragmentation options Options } // Number returns the ipv4 protocol number. func (p *protocol) Number() tcpip.NetworkProtocolNumber { return ProtocolNumber } // MinimumPacketSize returns the minimum valid ipv4 packet size. func (p *protocol) MinimumPacketSize() int { return header.IPv4MinimumSize } // DefaultPrefixLen returns the IPv4 default prefix length. func (p *protocol) DefaultPrefixLen() int { return header.IPv4AddressSize * 8 } // ParseAddresses implements stack.NetworkProtocol. func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { h := header.IPv4(v) return h.SourceAddress(), h.DestinationAddress() } // SetOption implements stack.NetworkProtocol. func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) tcpip.Error { switch v := option.(type) { case *tcpip.DefaultTTLOption: p.SetDefaultTTL(uint8(*v)) return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // Option implements stack.NetworkProtocol. func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) tcpip.Error { switch v := option.(type) { case *tcpip.DefaultTTLOption: *v = tcpip.DefaultTTLOption(p.DefaultTTL()) return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // SetDefaultTTL sets the default TTL for endpoints created with this protocol. func (p *protocol) SetDefaultTTL(ttl uint8) { atomic.StoreUint32(&p.defaultTTL, uint32(ttl)) } // DefaultTTL returns the default TTL for endpoints created with this protocol. func (p *protocol) DefaultTTL() uint8 { return uint8(atomic.LoadUint32(&p.defaultTTL)) } // Close implements stack.NetworkProtocol. func (*protocol) Close() {} // Wait implements stack.NetworkProtocol. func (*protocol) Wait() {} // parseAndValidate parses the packet (including its transport layer header) and // returns the parsed IP header. // // Returns true if the IP header was successfully parsed. func (p *protocol) parseAndValidate(pkt *stack.PacketBuffer) (header.IPv4, bool) { transProtoNum, hasTransportHdr, ok := p.Parse(pkt) if !ok { return nil, false } h := header.IPv4(pkt.NetworkHeader().View()) // Do not include the link header's size when calculating the size of the IP // packet.
if !h.IsValid(pkt.Size() - pkt.LinkHeader().View().Size()) { return nil, false } if !h.IsChecksumValid() { return nil, false } if hasTransportHdr { switch err := p.stack.ParsePacketBufferTransport(transProtoNum, pkt); err { case stack.ParsedOK: case stack.UnknownTransportProtocol, stack.TransportLayerParseError: // The transport layer will handle unknown protocols and transport layer // parsing errors. default: panic(fmt.Sprintf("unexpected error parsing transport header = %d", err)) } } return h, true } // Parse implements stack.NetworkProtocol. func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { if ok := parse.IPv4(pkt); !ok { return 0, false, false } ipHdr := header.IPv4(pkt.NetworkHeader().View()) return ipHdr.TransportProtocol(), !ipHdr.More() && ipHdr.FragmentOffset() == 0, true } // calculateNetworkMTU calculates the network-layer payload MTU based on the // link-layer payload mtu. func calculateNetworkMTU(linkMTU, networkHeaderSize uint32) (uint32, tcpip.Error) { if linkMTU < header.IPv4MinimumMTU { return 0, &tcpip.ErrInvalidEndpointState{} } // As per RFC 791 section 3.1, an IPv4 header cannot exceed 60 bytes in // length: // The maximal internet header is 60 octets, and a typical internet header // is 20 octets, allowing a margin for headers of higher level protocols. if networkHeaderSize > header.IPv4MaximumHeaderSize { return 0, &tcpip.ErrMalformedHeader{} } networkMTU := linkMTU if networkMTU > MaxTotalSize { networkMTU = MaxTotalSize } return networkMTU - networkHeaderSize, nil } func packetMustBeFragmented(pkt *stack.PacketBuffer, networkMTU uint32) bool { payload := pkt.TransportHeader().View().Size() + pkt.Data().Size() return pkt.GSOOptions.Type == stack.GSONone && uint32(payload) > networkMTU } // addressToUint32 translates an IPv4 address into its little endian uint32 // representation. // // This function does the same thing as binary.LittleEndian.Uint32 but operates // on a tcpip.Address (a string) without the need to convert it to a byte slice, // which would cause an allocation. func addressToUint32(addr tcpip.Address) uint32 { _ = addr[3] // bounds check hint to compiler return uint32(addr[0]) | uint32(addr[1])<<8 | uint32(addr[2])<<16 | uint32(addr[3])<<24 } // hashRoute calculates a hash value for the given source/destination pair using // the addresses, transport protocol number and a 32-bit number to generate the // hash. func hashRoute(srcAddr, dstAddr tcpip.Address, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 { a := addressToUint32(srcAddr) b := addressToUint32(dstAddr) return hash.Hash3Words(a, b, uint32(protocol), hashIV) } // Options holds options to configure a new protocol. type Options struct { // IGMP holds options for IGMP. IGMP IGMPOptions // AllowExternalLoopbackTraffic indicates that inbound loopback packets (i.e. // martian loopback packets) should be accepted. AllowExternalLoopbackTraffic bool } // NewProtocolWithOptions returns an IPv4 network protocol. func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory { ids := make([]uint32, buckets) // Randomly initialize hashIV and the ids. 
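// One random 32-bit value is generated per ID bucket, plus one extra // value to seed hashIV.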
r := hash.RandN32(1 + buckets) for i := range ids { ids[i] = r[i] } hashIV := r[buckets] return func(s *stack.Stack) stack.NetworkProtocol { p := &protocol{ stack: s, ids: ids, hashIV: hashIV, defaultTTL: DefaultTTL, options: opts, } p.fragmentation = fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p) p.mu.eps = make(map[tcpip.NICID]*endpoint) return p } } // NewProtocol is equivalent to NewProtocolWithOptions with an empty Options. func NewProtocol(s *stack.Stack) stack.NetworkProtocol { return NewProtocolWithOptions(Options{})(s) } func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeader header.IPv4) (*stack.PacketBuffer, bool) { fragPkt, offset, copied, more := pf.BuildNextFragment() fragPkt.NetworkProtocolNumber = ProtocolNumber originalIPHeaderLength := len(originalIPHeader) nextFragIPHeader := header.IPv4(fragPkt.NetworkHeader().Push(originalIPHeaderLength)) if copied := copy(nextFragIPHeader, originalIPHeader); copied != len(originalIPHeader) { panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got = %d, want = %d", copied, originalIPHeaderLength)) } flags := originalIPHeader.Flags() if more { flags |= header.IPv4FlagMoreFragments } nextFragIPHeader.SetFlagsFragmentOffset(flags, uint16(offset)) nextFragIPHeader.SetTotalLength(uint16(nextFragIPHeader.HeaderLength()) + uint16(copied)) nextFragIPHeader.SetChecksum(0) nextFragIPHeader.SetChecksum(^nextFragIPHeader.CalculateChecksum()) return fragPkt, more } // optionAction describes possible actions that may be taken on an option // while processing it. type optionAction uint8 const ( // optionRemove says that the option should not be in the output option set. optionRemove optionAction = iota // optionProcess says that the option should be fully processed. optionProcess // optionVerify says the option should be checked and passed unchanged. optionVerify // optionPass says to pass the output set without checking. optionPass ) // optionActions lists what to do for each option in a given scenario. type optionActions struct { // timestamp controls what to do with a Timestamp option. timestamp optionAction // recordRoute controls what to do with a Record Route option. recordRoute optionAction // routerAlert controls what to do with a Router Alert option. routerAlert optionAction // unknown controls what to do with an unknown option. unknown optionAction } // optionsUsage specifies the ways options may be operated upon for a given // scenario during packet processing. type optionsUsage interface { actions() optionActions } // optionUsageVerify implements optionsUsage for when we just want to check // fragments. Don't change anything, just check and reject if bad. No // replacement options are generated. type optionUsageVerify struct{} // actions implements optionsUsage. func (*optionUsageVerify) actions() optionActions { return optionActions{ timestamp: optionVerify, recordRoute: optionVerify, routerAlert: optionVerify, unknown: optionRemove, } } // optionUsageReceive implements optionsUsage for packets we will pass // to the transport layer (with the exception of Echo requests). type optionUsageReceive struct{} // actions implements optionsUsage.
func (*optionUsageReceive) actions() optionActions { return optionActions{ timestamp: optionProcess, recordRoute: optionProcess, routerAlert: optionVerify, unknown: optionPass, } } // optionUsageForward implements optionsUsage for packets about to be forwarded. // All options are passed on regardless of whether we recognise them, however // we do process the Timestamp and Record Route options. type optionUsageForward struct{} // actions implements optionsUsage. func (*optionUsageForward) actions() optionActions { return optionActions{ timestamp: optionProcess, recordRoute: optionProcess, routerAlert: optionVerify, unknown: optionPass, } } // optionUsageEcho implements optionsUsage for echo packet processing. // Only Timestamp and RecordRoute are processed and sent back. type optionUsageEcho struct{} // actions implements optionsUsage. func (*optionUsageEcho) actions() optionActions { return optionActions{ timestamp: optionProcess, recordRoute: optionProcess, routerAlert: optionVerify, unknown: optionRemove, } } // handleTimestamp does any required processing on a Timestamp option // in place. func handleTimestamp(tsOpt header.IPv4OptionTimestamp, localAddress tcpip.Address, clock tcpip.Clock, usage optionsUsage) *header.IPv4OptParameterProblem { flags := tsOpt.Flags() var entrySize uint8 switch flags { case header.IPv4OptionTimestampOnlyFlag: entrySize = header.IPv4OptionTimestampSize case header.IPv4OptionTimestampWithIPFlag, header.IPv4OptionTimestampWithPredefinedIPFlag: entrySize = header.IPv4OptionTimestampWithAddrSize default: return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptTSOFLWAndFLGOffset, NeedICMP: true, } } pointer := tsOpt.Pointer() // RFC 791 page 22 states: "The smallest legal value is 5." // Since the pointer is 1 based, and the header is 4 bytes long, the // pointer must point beyond the header; therefore 4 or less is bad. if pointer <= header.IPv4OptionTimestampHdrLength { return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptTSPointerOffset, NeedICMP: true, } } // To simplify processing below, base further work on the array of timestamps // beyond the header, rather than on the whole option. Also, to aid // calculations, set 'nextSlot' to be 0 based as in the packet it is 1 based. nextSlot := pointer - (header.IPv4OptionTimestampHdrLength + 1) optLen := tsOpt.Size() dataLength := optLen - header.IPv4OptionTimestampHdrLength // In the section below, we verify the pointer, length and overflow counter // fields of the option. The distinction is in which byte you return as being // in error in the ICMP packet. Offsets 1 (length), 2 (pointer), // or 3 (overflow counter). // // The following RFC sections cover this section: // // RFC 791 (page 22): // If there is some room but not enough room for a full timestamp // to be inserted, or the overflow count itself overflows, the // original datagram is considered to be in error and is discarded. // In either case an ICMP parameter problem message may be sent to // the source host [3]. // // You can get this situation in two ways: first, if the data area is not // a multiple of the entry size, or second, if the pointer is not at a // multiple of the entry size. The wording of the RFC suggests that // this is not an error until you actually run out of space. if pointer > optLen { // RFC 791 (page 22) says we should switch to using the overflow count.
// If the timestamp data area is already full (the pointer exceeds // the length) the datagram is forwarded without inserting the // timestamp, but the overflow count is incremented by one. if flags == header.IPv4OptionTimestampWithPredefinedIPFlag { // By definition we have nothing to do. return nil } if tsOpt.IncOverflow() != 0 { return nil } // The overflow count is also full. return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptTSOFLWAndFLGOffset, NeedICMP: true, } } if nextSlot+entrySize > dataLength { // The data area isn't full but there isn't room for a new entry. // Either Length or Pointer could be bad. if false { // We must select Pointer for Linux compatibility, even if // only the length is bad. // The Linux code is at (in October 2020) // https://github.com/torvalds/linux/blob/bbf5c979011a099af5dc76498918ed7df445635b/net/ipv4/ip_options.c#L367-L370 // if (optptr[2]+3 > optlen) { // pp_ptr = optptr + 2; // goto error; // } // which doesn't distinguish between which of optptr[2] or optlen // is wrong, but just arbitrarily decides on optptr+2. if dataLength%entrySize != 0 { // The Data section size should be a multiple of the expected // timestamp entry size. return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptionLengthOffset, NeedICMP: false, } } // If the size is OK, the pointer must be corrupted. } return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptTSPointerOffset, NeedICMP: true, } } if usage.actions().timestamp == optionProcess { tsOpt.UpdateTimestamp(localAddress, clock) } return nil } // handleRecordRoute checks and processes a Record route option. It is much // like the timestamp type 1 option, but without timestamps. The passed in // address is stored in the option in the correct spot if possible. func handleRecordRoute(rrOpt header.IPv4OptionRecordRoute, localAddress tcpip.Address, usage optionsUsage) *header.IPv4OptParameterProblem { optlen := rrOpt.Size() if optlen < header.IPv4AddressSize+header.IPv4OptionRecordRouteHdrLength { return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptionLengthOffset, NeedICMP: true, } } pointer := rrOpt.Pointer() // RFC 791 page 20 states: // The pointer is relative to this option, and the // smallest legal value for the pointer is 4. // Since the pointer is 1 based, and the header is 3 bytes long the // pointer must point beyond the header therefore 3 or less is bad. if pointer <= header.IPv4OptionRecordRouteHdrLength { return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptRRPointerOffset, NeedICMP: true, } } // RFC 791 page 21 says // If the route data area is already full (the pointer exceeds the // length) the datagram is forwarded without inserting the address // into the recorded route. If there is some room but not enough // room for a full address to be inserted, the original datagram is // considered to be in error and is discarded. In either case an // ICMP parameter problem message may be sent to the source // host. // The use of the words "In either case" suggests that a 'full' RR option // could generate an ICMP at every hop after it fills up. We chose to not // do this (as do most implementations). It is probable that the inclusion // of these words is a copy/paste error from the timestamp option where // there are two failure reasons given. if pointer > optlen { return nil } // The data area isn't full but there isn't room for a new entry. // Either Length or Pointer could be bad. We must select Pointer for Linux // compatibility, even if only the length is bad. NB. 
pointer is 1 based. if pointer+header.IPv4AddressSize > optlen+1 { if false { // This is what we would do if we were not being Linux compatible. // Check for bad pointer or length value. Must be a multiple of 4 after // accounting for the 3 byte header and not within that header. // RFC 791, page 20 says: // The pointer is relative to this option, and the // smallest legal value for the pointer is 4. // // A recorded route is composed of a series of internet addresses. // Each internet address is 32 bits or 4 octets. // Linux skips this test so we must too. See Linux code at: // https://github.com/torvalds/linux/blob/bbf5c979011a099af5dc76498918ed7df445635b/net/ipv4/ip_options.c#L338-L341 // if (optptr[2]+3 > optlen) { // pp_ptr = optptr + 2; // goto error; // } if (optlen-header.IPv4OptionRecordRouteHdrLength)%header.IPv4AddressSize != 0 { // Length is bad, not on integral number of slots. return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptionLengthOffset, NeedICMP: true, } } // If not length, the fault must be with the pointer. } return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptRRPointerOffset, NeedICMP: true, } } if usage.actions().recordRoute == optionVerify { return nil } rrOpt.StoreAddress(localAddress) return nil } // handleRouterAlert performs sanity checks on a Router Alert option. func handleRouterAlert(raOpt header.IPv4OptionRouterAlert) *header.IPv4OptParameterProblem { // Only the zero value is acceptable, as per RFC 2113, section 2.1: // Value: A two octet code with the following values: // 0 - Router shall examine packet // 1-65535 - Reserved if raOpt.Value() != header.IPv4OptionRouterAlertValue { return &header.IPv4OptParameterProblem{ Pointer: header.IPv4OptionRouterAlertValueOffset, NeedICMP: true, } } return nil } type optionTracker struct { timestamp bool recordRoute bool routerAlert bool } // processIPOptions parses the IPv4 options and produces a new set of options // suitable for use in the next step of packet processing as informed by usage. // The original will not be touched. // // If there were no errors during parsing, the new set of options is returned as // a new buffer. func (e *endpoint) processIPOptions(pkt *stack.PacketBuffer, opts header.IPv4Options, usage optionsUsage) (header.IPv4Options, optionTracker, *header.IPv4OptParameterProblem) { stats := e.stats.ip optIter := opts.MakeIterator() // Except NOP, each option must only appear at most once (RFC 791 section 3.1, // at the definition of every type). // Keep track of each option we find to enable duplicate option detection. var seenOptions [math.MaxUint8 + 1]bool // TODO(https://gvisor.dev/issue/4586): This will need tweaking when we start // really forwarding packets as we may need to get two addresses, for rx and // tx interfaces. We will also have to take usage into account. 
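// The Timestamp and Record Route options need an address to write into // the packet. Use this endpoint's main address, falling back to the // packet's destination address when none is assigned (broadcast and // multicast destinations are rejected below).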
localAddress := e.MainAddress().Address if len(localAddress) == 0 { h := header.IPv4(pkt.NetworkHeader().View()) dstAddr := h.DestinationAddress() if pkt.NetworkPacketInfo.LocalAddressBroadcast || header.IsV4MulticastAddress(dstAddr) { return nil, optionTracker{}, &header.IPv4OptParameterProblem{ NeedICMP: false, } } localAddress = dstAddr } var optionsProcessed optionTracker for { option, done, optProblem := optIter.Next() if done || optProblem != nil { return optIter.Finalize(), optionsProcessed, optProblem } optType := option.Type() if optType == header.IPv4OptionNOPType { optIter.PushNOPOrEnd(optType) continue } if optType == header.IPv4OptionListEndType { optIter.PushNOPOrEnd(optType) return optIter.Finalize(), optionsProcessed, nil } // check for repeating options (multiple NOPs are OK) if seenOptions[optType] { return nil, optionTracker{}, &header.IPv4OptParameterProblem{ Pointer: optIter.ErrCursor, NeedICMP: true, } } seenOptions[optType] = true optLen, optProblem := func() (int, *header.IPv4OptParameterProblem) { switch option := option.(type) { case *header.IPv4OptionTimestamp: stats.OptionTimestampReceived.Increment() optionsProcessed.timestamp = true if usage.actions().timestamp != optionRemove { clock := e.protocol.stack.Clock() newBuffer := optIter.InitReplacement(option) optProblem := handleTimestamp(header.IPv4OptionTimestamp(newBuffer), localAddress, clock, usage) return len(newBuffer), optProblem } case *header.IPv4OptionRecordRoute: stats.OptionRecordRouteReceived.Increment() optionsProcessed.recordRoute = true if usage.actions().recordRoute != optionRemove { newBuffer := optIter.InitReplacement(option) optProblem := handleRecordRoute(header.IPv4OptionRecordRoute(newBuffer), localAddress, usage) return len(newBuffer), optProblem } case *header.IPv4OptionRouterAlert: stats.OptionRouterAlertReceived.Increment() optionsProcessed.routerAlert = true if usage.actions().routerAlert != optionRemove { newBuffer := optIter.InitReplacement(option) optProblem := handleRouterAlert(header.IPv4OptionRouterAlert(newBuffer)) return len(newBuffer), optProblem } default: stats.OptionUnknownReceived.Increment() if usage.actions().unknown == optionPass { return len(optIter.InitReplacement(option)), nil } } return 0, nil }() if optProblem != nil { optProblem.Pointer += optIter.ErrCursor return nil, optionTracker{}, optProblem } optIter.ConsumeBuffer(optLen) } }
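// ExampleNewProtocol is a minimal sketch (not part of the upstream file)
// showing how this protocol factory is registered with a stack. The
// stack.Options fields used here are assumed from gVisor's public stack API.
func ExampleNewProtocol() {
	// Create a stack with IPv4 as its only network protocol; NICs and
	// addresses would be added afterwards.
	s := stack.New(stack.Options{
		NetworkProtocols: []stack.NetworkProtocolFactory{NewProtocol},
	})
	_ = s
}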
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
package header import ( "encoding/binary" "github.com/google/btree" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/seqnum" ) // These constants are the offsets of the respective fields in the TCP header. const ( TCPSrcPortOffset = 0 TCPDstPortOffset = 2 TCPSeqNumOffset = 4 TCPAckNumOffset = 8 TCPDataOffset = 12 TCPFlagsOffset = 13 TCPWinSizeOffset = 14 TCPChecksumOffset = 16 TCPUrgentPtrOffset = 18 ) const ( // MaxWndScale is the maximum allowed window scaling, as described in // RFC 1323, section 2.3, page 11. MaxWndScale = 14 // TCPMaxSACKBlocks is the maximum number of SACK blocks that can // be encoded in a TCP option field. TCPMaxSACKBlocks = 4 ) // TCPFlags is the dedicated type for TCP flags. type TCPFlags uint8 // Intersects returns true iff there are flags common to both f and o. func (f TCPFlags) Intersects(o TCPFlags) bool { return f&o != 0 } // Contains returns true iff all the flags in o are contained within f. func (f TCPFlags) Contains(o TCPFlags) bool { return f&o == o } // String implements Stringer.String. func (f TCPFlags) String() string { flagsStr := []byte("FSRPAU") for i := range flagsStr { if f&(1<<uint(i)) == 0 { flagsStr[i] = ' ' } } return string(flagsStr) } // Flags that may be set in a TCP segment. const ( TCPFlagFin TCPFlags = 1 << iota TCPFlagSyn TCPFlagRst TCPFlagPsh TCPFlagAck TCPFlagUrg ) // Options that may be present in a TCP segment. const ( TCPOptionEOL = 0 TCPOptionNOP = 1 TCPOptionMSS = 2 TCPOptionWS = 3 TCPOptionTS = 8 TCPOptionSACKPermitted = 4 TCPOptionSACK = 5 ) // Option Lengths. const ( TCPOptionMSSLength = 4 TCPOptionTSLength = 10 TCPOptionWSLength = 3 TCPOptionSackPermittedLength = 2 ) // TCPFields contains the fields of a TCP packet. It is used to describe the // fields of a packet that needs to be encoded. type TCPFields struct { // SrcPort is the "source port" field of a TCP packet. SrcPort uint16 // DstPort is the "destination port" field of a TCP packet. DstPort uint16 // SeqNum is the "sequence number" field of a TCP packet. SeqNum uint32 // AckNum is the "acknowledgement number" field of a TCP packet. AckNum uint32 // DataOffset is the "data offset" field of a TCP packet. It is the length of // the TCP header in bytes. DataOffset uint8 // Flags is the "flags" field of a TCP packet. Flags TCPFlags // WindowSize is the "window size" field of a TCP packet. WindowSize uint16 // Checksum is the "checksum" field of a TCP packet. Checksum uint16 // UrgentPointer is the "urgent pointer" field of a TCP packet. UrgentPointer uint16 } // TCPSynOptions is used to return the parsed TCP options in a SYN // segment. type TCPSynOptions struct { // MSS is the maximum segment size provided by the peer in the SYN. MSS uint16 // WS is the window scale option provided by the peer in the SYN. // // Set to -1 if no window scale option was provided. WS int // TS is true if the timestamp option was provided in the syn/syn-ack. TS bool // TSVal is the value of the TSVal field in the timestamp option. TSVal uint32 // TSEcr is the value of the TSEcr field in the timestamp option. TSEcr uint32 // SACKPermitted is true if the SACK option was provided in the SYN/SYN-ACK. SACKPermitted bool } // SACKBlock represents a single contiguous SACK block. // // +stateify savable type SACKBlock struct { // Start indicates the lowest sequence number in the block. Start seqnum.Value // End indicates the sequence number immediately following the last // sequence number of this block. End seqnum.Value } // Less returns true if r.Start < b.Start.
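// // It implements btree.Item so that SACK blocks can be kept ordered in a // btree.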
func (r SACKBlock) Less(b btree.Item) bool { return r.Start.LessThan(b.(SACKBlock).Start) } // Contains returns true if b is completely contained in r. func (r SACKBlock) Contains(b SACKBlock) bool { return r.Start.LessThanEq(b.Start) && b.End.LessThanEq(r.End) } // TCPOptions are used to parse and cache the TCP segment options for a non // syn/syn-ack segment. // // +stateify savable type TCPOptions struct { // TS is true if the TimeStamp option is enabled. TS bool // TSVal is the value in the TSVal field of the segment. TSVal uint32 // TSEcr is the value in the TSEcr field of the segment. TSEcr uint32 // SACKBlocks are the SACK blocks specified in the segment. SACKBlocks []SACKBlock } // TCP represents a TCP header stored in a byte array. type TCP []byte const ( // TCPMinimumSize is the minimum size of a valid TCP packet. TCPMinimumSize = 20 // TCPOptionsMaximumSize is the maximum size of TCP options. TCPOptionsMaximumSize = 40 // TCPHeaderMaximumSize is the maximum header size of a TCP packet. TCPHeaderMaximumSize = TCPMinimumSize + TCPOptionsMaximumSize // TCPProtocolNumber is TCP's transport protocol number. TCPProtocolNumber tcpip.TransportProtocolNumber = 6 // TCPMinimumMSS is the minimum acceptable value for MSS. This is the // same as the value TCP_MIN_MSS defined in net/tcp.h. TCPMinimumMSS = IPv4MaximumHeaderSize + TCPHeaderMaximumSize + MinIPFragmentPayloadSize - IPv4MinimumSize - TCPMinimumSize // TCPMaximumMSS is the maximum acceptable value for MSS. TCPMaximumMSS = 0xffff // TCPDefaultMSS is the MSS value that should be used if an MSS option // is not received from the peer. It's also the value returned by the // TCP_MAXSEG option for a socket in an unconnected state. // // Per RFC 1122, page 85: "If an MSS option is not received at // connection setup, TCP MUST assume a default send MSS of 536." TCPDefaultMSS = 536 ) // SourcePort returns the "source port" field of the TCP header. func (b TCP) SourcePort() uint16 { return binary.BigEndian.Uint16(b[TCPSrcPortOffset:]) } // DestinationPort returns the "destination port" field of the TCP header. func (b TCP) DestinationPort() uint16 { return binary.BigEndian.Uint16(b[TCPDstPortOffset:]) } // SequenceNumber returns the "sequence number" field of the TCP header. func (b TCP) SequenceNumber() uint32 { return binary.BigEndian.Uint32(b[TCPSeqNumOffset:]) } // AckNumber returns the "ack number" field of the TCP header. func (b TCP) AckNumber() uint32 { return binary.BigEndian.Uint32(b[TCPAckNumOffset:]) } // DataOffset returns the "data offset" field of the TCP header. The return // value is the length of the TCP header in bytes. func (b TCP) DataOffset() uint8 { return (b[TCPDataOffset] >> 4) * 4 } // Payload returns the data in the TCP packet. func (b TCP) Payload() []byte { return b[b.DataOffset():] } // Flags returns the flags field of the TCP header. func (b TCP) Flags() TCPFlags { return TCPFlags(b[TCPFlagsOffset]) } // WindowSize returns the "window size" field of the TCP header. func (b TCP) WindowSize() uint16 { return binary.BigEndian.Uint16(b[TCPWinSizeOffset:]) } // Checksum returns the "checksum" field of the TCP header. func (b TCP) Checksum() uint16 { return binary.BigEndian.Uint16(b[TCPChecksumOffset:]) } // UrgentPointer returns the "urgent pointer" field of the TCP header. func (b TCP) UrgentPointer() uint16 { return binary.BigEndian.Uint16(b[TCPUrgentPtrOffset:]) } // SetSourcePort sets the "source port" field of the TCP header.
func (b TCP) SetSourcePort(port uint16) { binary.BigEndian.PutUint16(b[TCPSrcPortOffset:], port) } // SetDestinationPort sets the "destination port" field of the TCP header. func (b TCP) SetDestinationPort(port uint16) { binary.BigEndian.PutUint16(b[TCPDstPortOffset:], port) } // SetChecksum sets the checksum field of the TCP header. func (b TCP) SetChecksum(checksum uint16) { binary.BigEndian.PutUint16(b[TCPChecksumOffset:], checksum) } // SetDataOffset sets the data offset field of the TCP header. headerLen should // be the length of the TCP header in bytes. func (b TCP) SetDataOffset(headerLen uint8) { b[TCPDataOffset] = (headerLen / 4) << 4 } // SetSequenceNumber sets the sequence number field of the TCP header. func (b TCP) SetSequenceNumber(seqNum uint32) { binary.BigEndian.PutUint32(b[TCPSeqNumOffset:], seqNum) } // SetAckNumber sets the ack number field of the TCP header. func (b TCP) SetAckNumber(ackNum uint32) { binary.BigEndian.PutUint32(b[TCPAckNumOffset:], ackNum) } // SetFlags sets the flags field of the TCP header. func (b TCP) SetFlags(flags uint8) { b[TCPFlagsOffset] = flags } // SetWindowSize sets the window size field of the TCP header. func (b TCP) SetWindowSize(rcvwnd uint16) { binary.BigEndian.PutUint16(b[TCPWinSizeOffset:], rcvwnd) } // SetUrgentPoiner sets the urgent pointer field of the TCP header. func (b TCP) SetUrgentPoiner(urgentPointer uint16) { binary.BigEndian.PutUint16(b[TCPUrgentPtrOffset:], urgentPointer) } // CalculateChecksum calculates the checksum of the TCP segment. // partialChecksum is the checksum of the network-layer pseudo-header // and the checksum of the segment data. func (b TCP) CalculateChecksum(partialChecksum uint16) uint16 { // Calculate the rest of the checksum. return Checksum(b[:b.DataOffset()], partialChecksum) } // IsChecksumValid returns true iff the TCP header's checksum is valid. func (b TCP) IsChecksumValid(src, dst tcpip.Address, payloadChecksum, payloadLength uint16) bool { xsum := PseudoHeaderChecksum(TCPProtocolNumber, src, dst, uint16(b.DataOffset())+payloadLength) xsum = ChecksumCombine(xsum, payloadChecksum) return b.CalculateChecksum(xsum) == 0xffff } // Options returns a slice that holds the unparsed TCP options in the segment. func (b TCP) Options() []byte { return b[TCPMinimumSize:b.DataOffset()] } // ParsedOptions returns a TCPOptions structure which parses and caches the TCP // option values in the TCP segment. NOTE: Invoking this function repeatedly is // expensive as it reparses the options on each invocation. func (b TCP) ParsedOptions() TCPOptions { return ParseTCPOptions(b.Options()) } func (b TCP) encodeSubset(seq, ack uint32, flags TCPFlags, rcvwnd uint16) { binary.BigEndian.PutUint32(b[TCPSeqNumOffset:], seq) binary.BigEndian.PutUint32(b[TCPAckNumOffset:], ack) b[TCPFlagsOffset] = uint8(flags) binary.BigEndian.PutUint16(b[TCPWinSizeOffset:], rcvwnd) } // Encode encodes all the fields of the TCP header. func (b TCP) Encode(t *TCPFields) { b.encodeSubset(t.SeqNum, t.AckNum, t.Flags, t.WindowSize) binary.BigEndian.PutUint16(b[TCPSrcPortOffset:], t.SrcPort) binary.BigEndian.PutUint16(b[TCPDstPortOffset:], t.DstPort) b[TCPDataOffset] = (t.DataOffset / 4) << 4 binary.BigEndian.PutUint16(b[TCPChecksumOffset:], t.Checksum) binary.BigEndian.PutUint16(b[TCPUrgentPtrOffset:], t.UrgentPointer) } // EncodePartial updates a subset of the fields of the TCP header. It is useful // in cases when similar segments are produced.
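// // Only the sequence number, ack number, flags and window size are // re-encoded; the checksum is updated incrementally starting from // partialChecksum instead of being recomputed over the whole segment.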
func (b TCP) EncodePartial(partialChecksum, length uint16, seqnum, acknum uint32, flags TCPFlags, rcvwnd uint16) { // Add the total length and "flags" field contributions to the checksum. // We don't use the flags field directly from the header because it's a // one-byte field with an odd offset, so it would be accounted for // incorrectly by the Checksum routine. tmp := make([]byte, 4) binary.BigEndian.PutUint16(tmp, length) binary.BigEndian.PutUint16(tmp[2:], uint16(flags)) checksum := Checksum(tmp, partialChecksum) // Encode the passed-in fields. b.encodeSubset(seqnum, acknum, flags, rcvwnd) // Add the contributions of the passed-in fields to the checksum. checksum = Checksum(b[TCPSeqNumOffset:TCPSeqNumOffset+8], checksum) checksum = Checksum(b[TCPWinSizeOffset:TCPWinSizeOffset+2], checksum) // Encode the checksum. b.SetChecksum(^checksum) } // SetSourcePortWithChecksumUpdate implements ChecksummableTransport. func (b TCP) SetSourcePortWithChecksumUpdate(new uint16) { old := b.SourcePort() b.SetSourcePort(new) b.SetChecksum(^checksumUpdate2ByteAlignedUint16(^b.Checksum(), old, new)) } // SetDestinationPortWithChecksumUpdate implements ChecksummableTransport. func (b TCP) SetDestinationPortWithChecksumUpdate(new uint16) { old := b.DestinationPort() b.SetDestinationPort(new) b.SetChecksum(^checksumUpdate2ByteAlignedUint16(^b.Checksum(), old, new)) } // UpdateChecksumPseudoHeaderAddress implements ChecksummableTransport. func (b TCP) UpdateChecksumPseudoHeaderAddress(old, new tcpip.Address, fullChecksum bool) { xsum := b.Checksum() if fullChecksum { xsum = ^xsum } xsum = checksumUpdate2ByteAlignedAddress(xsum, old, new) if fullChecksum { xsum = ^xsum } b.SetChecksum(xsum) } // ParseSynOptions parses the options received in a SYN segment and returns the // relevant ones. opts should point to the option part of the TCP header. func ParseSynOptions(opts []byte, isAck bool) TCPSynOptions { limit := len(opts) synOpts := TCPSynOptions{ // Per RFC 1122, page 85: "If an MSS option is not received at // connection setup, TCP MUST assume a default send MSS of 536." MSS: TCPDefaultMSS, // If no window scale option is specified, WS in options is // returned as -1; this is because the absence of the option // indicates that we cannot use window scaling on the // receive end either. WS: -1, } for i := 0; i < limit; { switch opts[i] { case TCPOptionEOL: i = limit case TCPOptionNOP: i++ case TCPOptionMSS: if i+4 > limit || opts[i+1] != 4 { return synOpts } mss := uint16(opts[i+2])<<8 | uint16(opts[i+3]) if mss == 0 { return synOpts } synOpts.MSS = mss i += 4 case TCPOptionWS: if i+3 > limit || opts[i+1] != 3 { return synOpts } ws := int(opts[i+2]) if ws > MaxWndScale { ws = MaxWndScale } synOpts.WS = ws i += 3 case TCPOptionTS: if i+10 > limit || opts[i+1] != 10 { return synOpts } synOpts.TSVal = binary.BigEndian.Uint32(opts[i+2:]) if isAck { // If the segment is a SYN-ACK then store the Timestamp Echo Reply // in the segment. synOpts.TSEcr = binary.BigEndian.Uint32(opts[i+6:]) } synOpts.TS = true i += 10 case TCPOptionSACKPermitted: if i+2 > limit || opts[i+1] != 2 { return synOpts } synOpts.SACKPermitted = true i += 2 default: // We don't recognize this option, just skip over it. if i+2 > limit { return synOpts } l := int(opts[i+1]) // If the length is incorrect or if l+i overflows the // total options length then stop parsing and return the options seen // so far.
if l < 2 || i+l > limit { return synOpts } i += l } } return synOpts } // ParseTCPOptions extracts and stores all known options in the provided byte // slice in a TCPOptions structure. func ParseTCPOptions(b []byte) TCPOptions { opts := TCPOptions{} limit := len(b) for i := 0; i < limit; { switch b[i] { case TCPOptionEOL: i = limit case TCPOptionNOP: i++ case TCPOptionTS: if i+10 > limit || (b[i+1] != 10) { return opts } opts.TS = true opts.TSVal = binary.BigEndian.Uint32(b[i+2:]) opts.TSEcr = binary.BigEndian.Uint32(b[i+6:]) i += 10 case TCPOptionSACK: if i+2 > limit { // Malformed SACK block, just return and stop parsing. return opts } sackOptionLen := int(b[i+1]) if i+sackOptionLen > limit || (sackOptionLen-2)%8 != 0 { // Malformed SACK block, just return and stop parsing. return opts } numBlocks := (sackOptionLen - 2) / 8 opts.SACKBlocks = []SACKBlock{} for j := 0; j < numBlocks; j++ { start := binary.BigEndian.Uint32(b[i+2+j*8:]) end := binary.BigEndian.Uint32(b[i+2+j*8+4:]) opts.SACKBlocks = append(opts.SACKBlocks, SACKBlock{ Start: seqnum.Value(start), End: seqnum.Value(end), }) } i += sackOptionLen default: // We don't recognize this option, just skip over it. if i+2 > limit { return opts } l := int(b[i+1]) // If the length is incorrect or if l+i overflows the total // options length then stop parsing and return the options // seen so far. if l < 2 || i+l > limit { return opts } i += l } } return opts } // EncodeMSSOption encodes the MSS TCP option with the provided MSS values in // the supplied buffer. If the provided buffer is not large enough then it just // returns without encoding anything. It returns the number of bytes written to // the provided buffer. func EncodeMSSOption(mss uint32, b []byte) int { if len(b) < TCPOptionMSSLength { return 0 } b[0], b[1], b[2], b[3] = TCPOptionMSS, TCPOptionMSSLength, byte(mss>>8), byte(mss) return TCPOptionMSSLength } // EncodeWSOption encodes the WS TCP option with the WS value in the // provided buffer. If the provided buffer is not large enough then it just // returns without encoding anything. It returns the number of bytes written to // the provided buffer. func EncodeWSOption(ws int, b []byte) int { if len(b) < TCPOptionWSLength { return 0 } b[0], b[1], b[2] = TCPOptionWS, TCPOptionWSLength, uint8(ws) return int(b[1]) } // EncodeTSOption encodes the provided tsVal and tsEcr values as a TCP timestamp // option into the provided buffer. If the buffer is smaller than expected it // just returns without encoding anything. It returns the number of bytes // written to the provided buffer. func EncodeTSOption(tsVal, tsEcr uint32, b []byte) int { if len(b) < TCPOptionTSLength { return 0 } b[0], b[1] = TCPOptionTS, TCPOptionTSLength binary.BigEndian.PutUint32(b[2:], tsVal) binary.BigEndian.PutUint32(b[6:], tsEcr) return int(b[1]) } // EncodeSACKPermittedOption encodes a SACKPermitted option into the provided // buffer. If the buffer is smaller than required it just returns without // encoding anything. It returns the number of bytes written to the provided // buffer. func EncodeSACKPermittedOption(b []byte) int { if len(b) < TCPOptionSackPermittedLength { return 0 } b[0], b[1] = TCPOptionSACKPermitted, TCPOptionSackPermittedLength return int(b[1]) } // EncodeSACKBlocks encodes the provided SACK blocks as a TCP SACK option block // in the provided slice. It tries to fit in as many blocks as possible based on // the number of bytes available in the provided buffer. It returns the number // of bytes written to the provided buffer. 
func EncodeSACKBlocks(sackBlocks []SACKBlock, b []byte) int { if len(sackBlocks) == 0 { return 0 } l := len(sackBlocks) if l > TCPMaxSACKBlocks { l = TCPMaxSACKBlocks } if ll := (len(b) - 2) / 8; ll < l { l = ll } if l == 0 { // There is not enough space in the provided buffer to add // any SACK blocks. return 0 } b[0] = TCPOptionSACK b[1] = byte(l*8 + 2) for i := 0; i < l; i++ { binary.BigEndian.PutUint32(b[i*8+2:], uint32(sackBlocks[i].Start)) binary.BigEndian.PutUint32(b[i*8+6:], uint32(sackBlocks[i].End)) } return int(b[1]) } // EncodeNOP adds an explicit NOP to the option list. func EncodeNOP(b []byte) int { if len(b) == 0 { return 0 } b[0] = TCPOptionNOP return 1 } // AddTCPOptionPadding adds the required number of TCPOptionNOP to quad align // the option buffer. It adds padding bytes after the offset specified and // returns the number of padding bytes added. The passed in options slice // must have space for the padding bytes. func AddTCPOptionPadding(options []byte, offset int) int { paddingToAdd := -offset & 3 // Now add any padding bytes that might be required to quad align the // options. for i := offset; i < offset+paddingToAdd; i++ { options[i] = TCPOptionNOP } return paddingToAdd } // Acceptable checks if a segment that starts at segSeq and has length segLen is // "acceptable" for arriving in a receive window that starts at rcvNxt and ends // before rcvAcc, according to the tables on pages 26 and 69 of RFC 793. func Acceptable(segSeq seqnum.Value, segLen seqnum.Size, rcvNxt, rcvAcc seqnum.Value) bool { if rcvNxt == rcvAcc { return segLen == 0 && segSeq == rcvNxt } if segLen == 0 { // rcvWnd is incremented by 1 because that is Linux's behavior despite the // RFC. return segSeq.InRange(rcvNxt, rcvAcc.Add(1)) } // Page 70 of RFC 793 allows packets that can be made "acceptable" by trimming // the payload, so we'll accept any payload that overlaps the receive window. // segSeq < rcvAcc would be more correct according to the RFC, but Linux uses // segSeq <= rcvAcc, and we keep the same behavior as Linux. return rcvNxt.LessThan(segSeq.Add(segLen)) && segSeq.LessThanEq(rcvAcc) }
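// The encode-then-checksum flow above composes as follows. A minimal standalone
// sketch, not part of the package: it uses only the exported APIs shown in this
// file (TCPFields, Encode, PseudoHeaderChecksum, CalculateChecksum,
// IsChecksumValid); the addresses and ports are arbitrary illustrative values.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
)

func main() {
	buf := make([]byte, header.TCPMinimumSize)
	tcp := header.TCP(buf)
	tcp.Encode(&header.TCPFields{
		SrcPort:    12345,
		DstPort:    80,
		SeqNum:     1,
		DataOffset: header.TCPMinimumSize,
		Flags:      header.TCPFlagSyn,
		WindowSize: 65535,
	})

	src := tcpip.Address("\x0a\x00\x00\x01") // 10.0.0.1
	dst := tcpip.Address("\x0a\x00\x00\x02") // 10.0.0.2

	// The pseudo-header checksum covers the protocol, addresses and TCP
	// length; the header's contribution is folded in and the one's
	// complement is stored in the checksum field.
	xsum := header.PseudoHeaderChecksum(header.TCPProtocolNumber, src, dst, uint16(len(buf)))
	tcp.SetChecksum(^tcp.CalculateChecksum(xsum))

	// With no payload, payloadChecksum and payloadLength are both zero.
	fmt.Println(tcp.IsChecksumValid(src, dst, 0, 0)) // true
}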
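// Option encoding and SYN-option parsing round-trip in the same way. A short
// sketch under the same assumptions as above; the 4-byte MSS option needs no
// NOP padding here.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip/header"
)

func main() {
	// EncodeMSSOption writes kind, length and the 16-bit MSS (4 bytes total).
	opts := make([]byte, header.TCPOptionMSSLength)
	n := header.EncodeMSSOption(1460, opts)

	// ParseSynOptions recovers the value; isAck is false for a plain SYN.
	parsed := header.ParseSynOptions(opts[:n], false)
	fmt.Println(parsed.MSS) // 1460
}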
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package route provides a NETLINK_ROUTE socket protocol. package route import ( "bytes" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/netlink" "gvisor.dev/gvisor/pkg/syserr" ) // commandKind describes the operational class of a message type. // // The route message types use the lower 2 bits of the type to describe class of command. 
type commandKind int const ( kindNew commandKind = 0x0 kindDel commandKind = 0x1 kindGet commandKind = 0x2 kindSet commandKind = 0x3 ) func typeKind(typ uint16) commandKind { return commandKind(typ & 0x3) } // Protocol implements netlink.Protocol. // // +stateify savable type Protocol struct{} var _ netlink.Protocol = (*Protocol)(nil) // NewProtocol creates a NETLINK_ROUTE netlink.Protocol. func NewProtocol(t *kernel.Task) (netlink.Protocol, *syserr.Error) { return &Protocol{}, nil } // Protocol implements netlink.Protocol.Protocol. func (p *Protocol) Protocol() int { return linux.NETLINK_ROUTE } // CanSend implements netlink.Protocol.CanSend. func (p *Protocol) CanSend() bool { return true } // dumpLinks handles RTM_GETLINK dump requests. func (p *Protocol) dumpLinks(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { // NLM_F_DUMP + RTM_GETLINK messages are supposed to include an // ifinfomsg. However, Linux <3.9 only checked for rtgenmsg, and some // userspace applications (including glibc) still include rtgenmsg. // Linux has a workaround based on the total message length. // // We don't bother to check for either, since we don't support any // extra attributes that may be included anyway. // // The message may also contain netlink attribute IFLA_EXT_MASK, which // we don't support. // The RTM_GETLINK dump response is a set of messages each containing // an InterfaceInfoMessage followed by a set of netlink attributes. // We always send back an NLMSG_DONE. ms.Multi = true stack := inet.StackFromContext(ctx) if stack == nil { // No network devices. return nil } for idx, i := range stack.Interfaces() { addNewLinkMessage(ms, idx, i) } return nil } // getLink handles RTM_GETLINK requests. func (p *Protocol) getLink(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { stack := inet.StackFromContext(ctx) if stack == nil { // No network devices. return nil } // Parse message. var ifi linux.InterfaceInfoMessage attrs, ok := msg.GetData(&ifi) if !ok { return syserr.ErrInvalidArgument } // Parse attributes. var byName []byte for !attrs.Empty() { ahdr, value, rest, ok := attrs.ParseFirst() if !ok { return syserr.ErrInvalidArgument } attrs = rest switch ahdr.Type { case linux.IFLA_IFNAME: if len(value) < 1 { return syserr.ErrInvalidArgument } byName = value[:len(value)-1] // TODO(gvisor.dev/issue/578): Support IFLA_EXT_MASK. } } found := false for idx, i := range stack.Interfaces() { switch { case ifi.Index > 0: if idx != ifi.Index { continue } case byName != nil: if string(byName) != i.Name { continue } default: // Criteria not specified. return syserr.ErrInvalidArgument } addNewLinkMessage(ms, idx, i) found = true break } if !found { return syserr.ErrNoDevice } return nil } // addNewLinkMessage appends RTM_NEWLINK message for the given interface into // the message set. func addNewLinkMessage(ms *netlink.MessageSet, idx int32, i inet.Interface) { m := ms.AddMessage(linux.NetlinkMessageHeader{ Type: linux.RTM_NEWLINK, }) m.Put(&linux.InterfaceInfoMessage{ Family: linux.AF_UNSPEC, Type: i.DeviceType, Index: idx, Flags: i.Flags, }) m.PutAttrString(linux.IFLA_IFNAME, i.Name) m.PutAttr(linux.IFLA_MTU, primitive.AllocateUint32(i.MTU)) mac := make([]byte, 6) brd := mac if len(i.Addr) > 0 { mac = i.Addr brd = bytes.Repeat([]byte{0xff}, len(i.Addr)) } m.PutAttr(linux.IFLA_ADDRESS, primitive.AsByteSlice(mac)) m.PutAttr(linux.IFLA_BROADCAST, primitive.AsByteSlice(brd)) // TODO(gvisor.dev/issue/578): There are many more attributes. 
} // dumpAddrs handles RTM_GETADDR dump requests. func (p *Protocol) dumpAddrs(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { // RTM_GETADDR dump requests need not contain anything more than the // netlink header and 1 byte protocol family common to all // NETLINK_ROUTE requests. // // TODO(b/68878065): Filter output by passed protocol family. // The RTM_GETADDR dump response is a set of RTM_NEWADDR messages each // containing an InterfaceAddrMessage followed by a set of netlink // attributes. // We always send back an NLMSG_DONE. ms.Multi = true stack := inet.StackFromContext(ctx) if stack == nil { // No network devices. return nil } for id, as := range stack.InterfaceAddrs() { for _, a := range as { m := ms.AddMessage(linux.NetlinkMessageHeader{ Type: linux.RTM_NEWADDR, }) m.Put(&linux.InterfaceAddrMessage{ Family: a.Family, PrefixLen: a.PrefixLen, Index: uint32(id), }) addr := primitive.ByteSlice([]byte(a.Addr)) m.PutAttr(linux.IFA_LOCAL, &addr) m.PutAttr(linux.IFA_ADDRESS, &addr) // TODO(gvisor.dev/issue/578): There are many more attributes. } } return nil } // commonPrefixLen reports the length, in bits, of the longest common prefix // of a and b. This is a simplified version of Golang's // src/net/addrselect.go. func commonPrefixLen(a, b []byte) (cpl int) { for len(a) > 0 { if a[0] == b[0] { cpl += 8 a = a[1:] b = b[1:] continue } bits := 8 ab, bb := a[0], b[0] for { ab >>= 1 bb >>= 1 bits-- if ab == bb { cpl += bits return } } } return } // fillRoute returns the matching Route using the longest prefix match (LPM) // algorithm. Refer to Linux's net/ipv4/route.c:rt_fill_info(). func fillRoute(routes []inet.Route, addr []byte) (inet.Route, *syserr.Error) { family := uint8(linux.AF_INET) if len(addr) != 4 { family = linux.AF_INET6 } idx := -1 // Index of the Route rule to be returned. idxDef := -1 // Index of the default route rule. prefix := 0 // Current longest prefix. for i, route := range routes { if route.Family != family { continue } if len(route.GatewayAddr) > 0 && route.DstLen == 0 { idxDef = i continue } cpl := commonPrefixLen(addr, route.DstAddr) if cpl < int(route.DstLen) { continue } cpl = int(route.DstLen) if cpl > prefix { idx = i prefix = cpl } } if idx == -1 { idx = idxDef } if idx == -1 { return inet.Route{}, syserr.ErrNoRoute } route := routes[idx] if family == linux.AF_INET { route.DstLen = 32 } else { route.DstLen = 128 } route.DstAddr = addr route.Flags |= linux.RTM_F_CLONED // This route is cloned. return route, nil } // parseForDestination parses a message as format of RouteMessage-RtAttr-dst. func parseForDestination(msg *netlink.Message) ([]byte, *syserr.Error) { var rtMsg linux.RouteMessage attrs, ok := msg.GetData(&rtMsg) if !ok { return nil, syserr.ErrInvalidArgument } // iproute2 added the RTM_F_LOOKUP_TABLE flag in version v4.4.0. See // commit bc234301af12. Note we don't check this flag for backward // compatibility. if rtMsg.Flags != 0 && rtMsg.Flags != linux.RTM_F_LOOKUP_TABLE { return nil, syserr.ErrNotSupported } // Expect the first attribute to be RTA_DST. if hdr, value, _, ok := attrs.ParseFirst(); ok && hdr.Type == linux.RTA_DST { return value, nil } return nil, syserr.ErrInvalidArgument } // dumpRoutes handles RTM_GETROUTE requests. func (p *Protocol) dumpRoutes(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { // RTM_GETROUTE dump requests need not contain anything more than the // netlink header and 1 byte protocol family common to all // NETLINK_ROUTE requests. stack := inet.StackFromContext(ctx) if stack == nil { // No network routes. 
return nil } hdr := msg.Header() routeTables := stack.RouteTable() if hdr.Flags == linux.NLM_F_REQUEST { dst, err := parseForDestination(msg) if err != nil { return err } route, err := fillRoute(routeTables, dst) if err != nil { // TODO(gvisor.dev/issue/1237): return NLMSG_ERROR with ENETUNREACH. return syserr.ErrNotSupported } routeTables = append([]inet.Route{}, route) } else if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP { // We always send back an NLMSG_DONE. ms.Multi = true } else { // TODO(b/68878065): Only above cases are supported. return syserr.ErrNotSupported } for _, rt := range routeTables { m := ms.AddMessage(linux.NetlinkMessageHeader{ Type: linux.RTM_NEWROUTE, }) m.Put(&linux.RouteMessage{ Family: rt.Family, DstLen: rt.DstLen, SrcLen: rt.SrcLen, TOS: rt.TOS, // Always return the main table since we don't have multiple // routing tables. Table: linux.RT_TABLE_MAIN, Protocol: rt.Protocol, Scope: rt.Scope, Type: rt.Type, Flags: rt.Flags, }) if rt.DstLen > 0 { m.PutAttr(linux.RTA_DST, primitive.AsByteSlice(rt.DstAddr)) } if rt.SrcLen > 0 { m.PutAttr(linux.RTA_SRC, primitive.AsByteSlice(rt.SrcAddr)) } if rt.OutputInterface != 0 { m.PutAttr(linux.RTA_OIF, primitive.AllocateInt32(rt.OutputInterface)) } if len(rt.GatewayAddr) > 0 { m.PutAttr(linux.RTA_GATEWAY, primitive.AsByteSlice(rt.GatewayAddr)) } // TODO(gvisor.dev/issue/578): There are many more attributes. } return nil } // newAddr handles RTM_NEWADDR requests. func (p *Protocol) newAddr(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { stack := inet.StackFromContext(ctx) if stack == nil { // No network stack. return syserr.ErrProtocolNotSupported } var ifa linux.InterfaceAddrMessage attrs, ok := msg.GetData(&ifa) if !ok { return syserr.ErrInvalidArgument } for !attrs.Empty() { ahdr, value, rest, ok := attrs.ParseFirst() if !ok { return syserr.ErrInvalidArgument } attrs = rest // NOTE: A netlink message will contain multiple header attributes. // Both the IFA_ADDRESS and IFA_LOCAL attributes are typically sent // with IFA_ADDRESS being a prefix address and IFA_LOCAL being the // local interface address. We add the local interface address here // and ignore the IFA_ADDRESS. switch ahdr.Type { case linux.IFA_LOCAL: err := stack.AddInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{ Family: ifa.Family, PrefixLen: ifa.PrefixLen, Flags: ifa.Flags, Addr: value, }) if err == unix.EEXIST { flags := msg.Header().Flags if flags&linux.NLM_F_EXCL != 0 { return syserr.ErrExists } } else if err != nil { return syserr.ErrInvalidArgument } case linux.IFA_ADDRESS: default: return syserr.ErrNotSupported } } return nil } // delAddr handles RTM_DELADDR requests. func (p *Protocol) delAddr(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { stack := inet.StackFromContext(ctx) if stack == nil { // No network stack. return syserr.ErrProtocolNotSupported } var ifa linux.InterfaceAddrMessage attrs, ok := msg.GetData(&ifa) if !ok { return syserr.ErrInvalidArgument } for !attrs.Empty() { ahdr, value, rest, ok := attrs.ParseFirst() if !ok { return syserr.ErrInvalidArgument } attrs = rest // NOTE: A netlink message will contain multiple header attributes. // Both the IFA_ADDRESS and IFA_LOCAL attributes are typically sent // with IFA_ADDRESS being a prefix address and IFA_LOCAL being the // local interface address. We use the local interface address to // remove the address and ignore the IFA_ADDRESS. 
switch ahdr.Type { case linux.IFA_LOCAL: err := stack.RemoveInterfaceAddr(int32(ifa.Index), inet.InterfaceAddr{ Family: ifa.Family, PrefixLen: ifa.PrefixLen, Flags: ifa.Flags, Addr: value, }) if err != nil { return syserr.ErrBadLocalAddress } case linux.IFA_ADDRESS: default: return syserr.ErrNotSupported } } return nil } // ProcessMessage implements netlink.Protocol.ProcessMessage. func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { hdr := msg.Header() // All messages start with a 1 byte protocol family. var family primitive.Uint8 if _, ok := msg.GetData(&family); !ok { // Linux ignores messages missing the protocol family. See // net/core/rtnetlink.c:rtnetlink_rcv_msg. return nil } // Non-GET message types require CAP_NET_ADMIN. if typeKind(hdr.Type) != kindGet { creds := auth.CredentialsFromContext(ctx) if !creds.HasCapability(linux.CAP_NET_ADMIN) { return syserr.ErrPermissionDenied } } if hdr.Flags&linux.NLM_F_DUMP == linux.NLM_F_DUMP { // TODO(b/68878065): Only the dump variant of the types below are // supported. switch hdr.Type { case linux.RTM_GETLINK: return p.dumpLinks(ctx, msg, ms) case linux.RTM_GETADDR: return p.dumpAddrs(ctx, msg, ms) case linux.RTM_GETROUTE: return p.dumpRoutes(ctx, msg, ms) default: return syserr.ErrNotSupported } } else if hdr.Flags&linux.NLM_F_REQUEST == linux.NLM_F_REQUEST { switch hdr.Type { case linux.RTM_GETLINK: return p.getLink(ctx, msg, ms) case linux.RTM_GETROUTE: return p.dumpRoutes(ctx, msg, ms) case linux.RTM_NEWADDR: return p.newAddr(ctx, msg, ms) case linux.RTM_DELADDR: return p.delAddr(ctx, msg, ms) default: return syserr.ErrNotSupported } } return syserr.ErrNotSupported } // init registers the NETLINK_ROUTE provider. func init() { netlink.RegisterProvider(linux.NETLINK_ROUTE, NewProtocol) }
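// The 2-bit command-class encoding that typeKind relies on can be checked
// against a few well-known RTM_* values. A standalone sketch; the constants
// below mirror <linux/rtnetlink.h> and are restated here so the snippet is
// self-contained.
package main

import "fmt"

const (
	rtmNewLink = 16 // RTM_NEWLINK
	rtmDelLink = 17 // RTM_DELLINK
	rtmGetLink = 18 // RTM_GETLINK
	rtmSetLink = 19 // RTM_SETLINK
	rtmGetAddr = 22 // RTM_GETADDR
)

func main() {
	names := []string{"New", "Del", "Get", "Set"}
	for _, typ := range []uint16{rtmNewLink, rtmDelLink, rtmGetLink, rtmSetLink, rtmGetAddr} {
		// The low two bits select the class, matching kindNew..kindSet.
		fmt.Printf("type %d -> kind%s\n", typ, names[typ&0x3])
	}
}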
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package control provides internal representations of socket control // messages. 
package control import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) const maxInt = int(^uint(0) >> 1) // SCMCredentials represents a SCM_CREDENTIALS socket control message. type SCMCredentials interface { transport.CredentialsControlMessage // Credentials returns properly namespaced values for the sender's pid, uid // and gid. Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) } // LINT.IfChange // SCMRights represents a SCM_RIGHTS socket control message. type SCMRights interface { transport.RightsControlMessage // Files returns up to max RightsFiles. // // Returned files are consumed and ownership is transferred to the caller. // Subsequent calls to Files will return the next files. Files(ctx context.Context, max int) (rf RightsFiles, truncated bool) } // RightsFiles represents a SCM_RIGHTS socket control message. A reference is // maintained for each fs.File and is released either when an FD is created or // when the Release method is called. // // +stateify savable type RightsFiles []*fs.File // NewSCMRights creates a new SCM_RIGHTS socket control message representation // using local sentry FDs. func NewSCMRights(t *kernel.Task, fds []int32) (SCMRights, error) { files := make(RightsFiles, 0, len(fds)) for _, fd := range fds { file := t.GetFile(fd) if file == nil { files.Release(t) return nil, linuxerr.EBADF } files = append(files, file) } return &files, nil } // Files implements SCMRights.Files. func (fs *RightsFiles) Files(ctx context.Context, max int) (RightsFiles, bool) { n := max var trunc bool if l := len(*fs); n > l { n = l } else if n < l { trunc = true } rf := (*fs)[:n] *fs = (*fs)[n:] return rf, trunc } // Clone implements transport.RightsControlMessage.Clone. func (fs *RightsFiles) Clone() transport.RightsControlMessage { nfs := append(RightsFiles(nil), *fs...) for _, nf := range nfs { nf.IncRef() } return &nfs } // Release implements transport.RightsControlMessage.Release. func (fs *RightsFiles) Release(ctx context.Context) { for _, f := range *fs { f.DecRef(ctx) } *fs = nil } // rightsFDs gets up to the specified maximum number of FDs. func rightsFDs(t *kernel.Task, rights SCMRights, cloexec bool, max int) ([]int32, bool) { files, trunc := rights.Files(t, max) fds := make([]int32, 0, len(files)) for i := 0; i < max && len(files) > 0; i++ { fd, err := t.NewFDFrom(0, files[0], kernel.FDFlags{ CloseOnExec: cloexec, }) files[0].DecRef(t) files = files[1:] if err != nil { t.Warningf("Error inserting FD: %v", err) // This is what Linux does. break } fds = append(fds, int32(fd)) } return fds, trunc } // PackRights packs as many FDs as will fit into the unused capacity of buf. func PackRights(t *kernel.Task, rights SCMRights, cloexec bool, buf []byte, flags int) ([]byte, int) { maxFDs := (cap(buf) - len(buf) - linux.SizeOfControlMessageHeader) / 4 // Linux does not return any FDs if none fit. 
if maxFDs <= 0 { flags |= linux.MSG_CTRUNC return buf, flags } fds, trunc := rightsFDs(t, rights, cloexec, maxFDs) if trunc { flags |= linux.MSG_CTRUNC } align := t.Arch().Width() return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds) } // LINT.ThenChange(./control_vfs2.go) // scmCredentials represents an SCM_CREDENTIALS socket control message. // // +stateify savable type scmCredentials struct { t *kernel.Task kuid auth.KUID kgid auth.KGID } // NewSCMCredentials creates a new SCM_CREDENTIALS socket control message // representation. func NewSCMCredentials(t *kernel.Task, cred linux.ControlMessageCredentials) (SCMCredentials, error) { tcred := t.Credentials() kuid, err := tcred.UseUID(auth.UID(cred.UID)) if err != nil { return nil, err } kgid, err := tcred.UseGID(auth.GID(cred.GID)) if err != nil { return nil, err } if kernel.ThreadID(cred.PID) != t.ThreadGroup().ID() && !t.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.PIDNamespace().UserNamespace()) { return nil, linuxerr.EPERM } return &scmCredentials{t, kuid, kgid}, nil } // Equals implements transport.CredentialsControlMessage.Equals. func (c *scmCredentials) Equals(oc transport.CredentialsControlMessage) bool { if oc, _ := oc.(*scmCredentials); oc != nil && *c == *oc { return true } return false } func putUint64(buf []byte, n uint64) []byte { hostarch.ByteOrder.PutUint64(buf[len(buf):len(buf)+8], n) return buf[:len(buf)+8] } func putUint32(buf []byte, n uint32) []byte { hostarch.ByteOrder.PutUint32(buf[len(buf):len(buf)+4], n) return buf[:len(buf)+4] } // putCmsg writes a control message header and as much data as will fit into // the unused capacity of a buffer. func putCmsg(buf []byte, flags int, msgType uint32, align uint, data []int32) ([]byte, int) { space := bits.AlignDown(cap(buf)-len(buf), 4) // We can't write to space that doesn't exist, so if we are going to align // the available space, we must align down. // // align must be >= 4 and each data int32 is 4 bytes. The length of the // header is already aligned, so if we align to the width of the data there // are two cases: // 1. The aligned length is less than the length of the header. The // unaligned length was also less than the length of the header, so we // can't write anything. // 2. The aligned length is greater than or equal to the length of the // header. We can write the header plus zero or more bytes of data. We can't // write a partial int32, so the length of the message will be // min(aligned length, header + data). if space < linux.SizeOfControlMessageHeader { flags |= linux.MSG_CTRUNC return buf, flags } length := 4*len(data) + linux.SizeOfControlMessageHeader if length > space { length = space } buf = putUint64(buf, uint64(length)) buf = putUint32(buf, linux.SOL_SOCKET) buf = putUint32(buf, msgType) for _, d := range data { if len(buf)+4 > cap(buf) { flags |= linux.MSG_CTRUNC break } buf = putUint32(buf, uint32(d)) } return alignSlice(buf, align), flags } func putCmsgStruct(buf []byte, msgLevel, msgType uint32, align uint, data marshal.Marshallable) []byte { if cap(buf)-len(buf) < linux.SizeOfControlMessageHeader { return buf } ob := buf buf = putUint64(buf, uint64(linux.SizeOfControlMessageHeader)) buf = putUint32(buf, msgLevel) buf = putUint32(buf, msgType) hdrBuf := buf buf = append(buf, marshal.Marshal(data)...) // If the control message data brought us over capacity, omit it. if cap(buf) != cap(ob) { return hdrBuf } // Update control message length to include data. 
putUint64(ob, uint64(len(buf)-len(ob))) return alignSlice(buf, align) } // Credentials implements SCMCredentials.Credentials. func (c *scmCredentials) Credentials(t *kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) { // "When a process's user and group IDs are passed over a UNIX domain // socket to a process in a different user namespace (see the description // of SCM_CREDENTIALS in unix(7)), they are translated into the // corresponding values as per the receiving process's user and group ID // mappings." - user_namespaces(7) pid := t.PIDNamespace().IDOfTask(c.t) uid := c.kuid.In(t.UserNamespace()).OrOverflow() gid := c.kgid.In(t.UserNamespace()).OrOverflow() return pid, uid, gid } // PackCredentials packs the credentials in the control message (or default // credentials if none) into a buffer. func PackCredentials(t *kernel.Task, creds SCMCredentials, buf []byte, flags int) ([]byte, int) { align := t.Arch().Width() // Default credentials if none are available. pid := kernel.ThreadID(0) uid := auth.UID(auth.NobodyKUID) gid := auth.GID(auth.NobodyKGID) if creds != nil { pid, uid, gid = creds.Credentials(t) } c := []int32{int32(pid), int32(uid), int32(gid)} return putCmsg(buf, flags, linux.SCM_CREDENTIALS, align, c) } // alignSlice extends a slice's length (up to the capacity) to align it. func alignSlice(buf []byte, align uint) []byte { aligned := bits.AlignUp(len(buf), align) if aligned > cap(buf) { // Linux allows unaligned data if there isn't room for alignment. // Since there isn't room for alignment, there isn't room for any // additional messages either. return buf } return buf[:aligned] } // PackTimestamp packs a SO_TIMESTAMP socket control message. func PackTimestamp(t *kernel.Task, timestamp int64, buf []byte) []byte { timestampP := linux.NsecToTimeval(timestamp) return putCmsgStruct( buf, linux.SOL_SOCKET, linux.SO_TIMESTAMP, t.Arch().Width(), &timestampP, ) } // PackInq packs a TCP_INQ socket control message. func PackInq(t *kernel.Task, inq int32, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_TCP, linux.TCP_INQ, t.Arch().Width(), primitive.AllocateInt32(inq), ) } // PackTOS packs an IP_TOS socket control message. func PackTOS(t *kernel.Task, tos uint8, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_IP, linux.IP_TOS, t.Arch().Width(), primitive.AllocateUint8(tos), ) } // PackTClass packs an IPV6_TCLASS socket control message. func PackTClass(t *kernel.Task, tClass uint32, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_IPV6, linux.IPV6_TCLASS, t.Arch().Width(), primitive.AllocateUint32(tClass), ) } // PackIPPacketInfo packs an IP_PKTINFO socket control message. func PackIPPacketInfo(t *kernel.Task, packetInfo *linux.ControlMessageIPPacketInfo, buf []byte) []byte { return putCmsgStruct( buf, linux.SOL_IP, linux.IP_PKTINFO, t.Arch().Width(), packetInfo, ) } // PackOriginalDstAddress packs an IP_RECVORIGINALDSTADDR socket control message. func PackOriginalDstAddress(t *kernel.Task, originalDstAddress linux.SockAddr, buf []byte) []byte { var level uint32 var optType uint32 switch originalDstAddress.(type) { case *linux.SockAddrInet: level = linux.SOL_IP optType = linux.IP_RECVORIGDSTADDR case *linux.SockAddrInet6: level = linux.SOL_IPV6 optType = linux.IPV6_RECVORIGDSTADDR default: panic("invalid address type, must be an IP address for IP_RECVORIGINALDSTADDR cmsg") } return putCmsgStruct( buf, level, optType, t.Arch().Width(), originalDstAddress) } // PackSockExtendedErr packs an IP*_RECVERR socket control message. 
func PackSockExtendedErr(t *kernel.Task, sockErr linux.SockErrCMsg, buf []byte) []byte { return putCmsgStruct( buf, sockErr.CMsgLevel(), sockErr.CMsgType(), t.Arch().Width(), sockErr, ) } // PackControlMessages packs control messages into the given buffer. // // We skip control messages specific to Unix domain sockets. // // Note that some control messages may be truncated if they do not fit under // the capacity of buf. func PackControlMessages(t *kernel.Task, cmsgs socket.ControlMessages, buf []byte) []byte { if cmsgs.IP.HasTimestamp { buf = PackTimestamp(t, cmsgs.IP.Timestamp, buf) } if cmsgs.IP.HasInq { // In Linux, TCP_CM_INQ is added after SO_TIMESTAMP. buf = PackInq(t, cmsgs.IP.Inq, buf) } if cmsgs.IP.HasTOS { buf = PackTOS(t, cmsgs.IP.TOS, buf) } if cmsgs.IP.HasTClass { buf = PackTClass(t, cmsgs.IP.TClass, buf) } if cmsgs.IP.HasIPPacketInfo { buf = PackIPPacketInfo(t, &cmsgs.IP.PacketInfo, buf) } if cmsgs.IP.OriginalDstAddress != nil { buf = PackOriginalDstAddress(t, cmsgs.IP.OriginalDstAddress, buf) } if cmsgs.IP.SockErr != nil { buf = PackSockExtendedErr(t, cmsgs.IP.SockErr, buf) } return buf } // cmsgSpace is equivalent to CMSG_SPACE in Linux. func cmsgSpace(t *kernel.Task, dataLen int) int { return linux.SizeOfControlMessageHeader + bits.AlignUp(dataLen, t.Arch().Width()) } // CmsgsSpace returns the number of bytes needed to fit the control messages // represented in cmsgs. func CmsgsSpace(t *kernel.Task, cmsgs socket.ControlMessages) int { space := 0 if cmsgs.IP.HasTimestamp { space += cmsgSpace(t, linux.SizeOfTimeval) } if cmsgs.IP.HasInq { space += cmsgSpace(t, linux.SizeOfControlMessageInq) } if cmsgs.IP.HasTOS { space += cmsgSpace(t, linux.SizeOfControlMessageTOS) } if cmsgs.IP.HasTClass { space += cmsgSpace(t, linux.SizeOfControlMessageTClass) } if cmsgs.IP.HasIPPacketInfo { space += cmsgSpace(t, linux.SizeOfControlMessageIPPacketInfo) } if cmsgs.IP.OriginalDstAddress != nil { space += cmsgSpace(t, cmsgs.IP.OriginalDstAddress.SizeBytes()) } if cmsgs.IP.SockErr != nil { space += cmsgSpace(t, cmsgs.IP.SockErr.SizeBytes()) } return space } // Parse parses a raw socket control message into portable objects. 
func Parse(t *kernel.Task, socketOrEndpoint interface{}, buf []byte, width uint) (socket.ControlMessages, error) { var ( cmsgs socket.ControlMessages fds linux.ControlMessageRights ) for i := 0; i < len(buf); { if i+linux.SizeOfControlMessageHeader > len(buf) { return cmsgs, linuxerr.EINVAL } var h linux.ControlMessageHeader h.UnmarshalUnsafe(buf[i : i+linux.SizeOfControlMessageHeader]) if h.Length < uint64(linux.SizeOfControlMessageHeader) { return socket.ControlMessages{}, linuxerr.EINVAL } if h.Length > uint64(len(buf)-i) { return socket.ControlMessages{}, linuxerr.EINVAL } i += linux.SizeOfControlMessageHeader length := int(h.Length) - linux.SizeOfControlMessageHeader switch h.Level { case linux.SOL_SOCKET: switch h.Type { case linux.SCM_RIGHTS: rightsSize := bits.AlignDown(length, linux.SizeOfControlMessageRight) numRights := rightsSize / linux.SizeOfControlMessageRight if len(fds)+numRights > linux.SCM_MAX_FD { return socket.ControlMessages{}, linuxerr.EINVAL } for j := i; j < i+rightsSize; j += linux.SizeOfControlMessageRight { fds = append(fds, int32(hostarch.ByteOrder.Uint32(buf[j:j+linux.SizeOfControlMessageRight]))) } i += bits.AlignUp(length, width) case linux.SCM_CREDENTIALS: if length < linux.SizeOfControlMessageCredentials { return socket.ControlMessages{}, linuxerr.EINVAL } var creds linux.ControlMessageCredentials creds.UnmarshalUnsafe(buf[i : i+linux.SizeOfControlMessageCredentials]) scmCreds, err := NewSCMCredentials(t, creds) if err != nil { return socket.ControlMessages{}, err } cmsgs.Unix.Credentials = scmCreds i += bits.AlignUp(length, width) case linux.SO_TIMESTAMP: if length < linux.SizeOfTimeval { return socket.ControlMessages{}, linuxerr.EINVAL } var ts linux.Timeval ts.UnmarshalUnsafe(buf[i : i+linux.SizeOfTimeval]) cmsgs.IP.Timestamp = ts.ToNsecCapped() cmsgs.IP.HasTimestamp = true i += bits.AlignUp(length, width) default: // Unknown message type. 
return socket.ControlMessages{}, linuxerr.EINVAL } case linux.SOL_IP: switch h.Type { case linux.IP_TOS: if length < linux.SizeOfControlMessageTOS { return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.HasTOS = true var tos primitive.Uint8 tos.UnmarshalUnsafe(buf[i : i+linux.SizeOfControlMessageTOS]) cmsgs.IP.TOS = uint8(tos) i += bits.AlignUp(length, width) case linux.IP_PKTINFO: if length < linux.SizeOfControlMessageIPPacketInfo { return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.HasIPPacketInfo = true var packetInfo linux.ControlMessageIPPacketInfo packetInfo.UnmarshalUnsafe(buf[i : i+linux.SizeOfControlMessageIPPacketInfo]) cmsgs.IP.PacketInfo = packetInfo i += bits.AlignUp(length, width) case linux.IP_RECVORIGDSTADDR: var addr linux.SockAddrInet if length < addr.SizeBytes() { return socket.ControlMessages{}, linuxerr.EINVAL } addr.UnmarshalUnsafe(buf[i : i+addr.SizeBytes()]) cmsgs.IP.OriginalDstAddress = &addr i += bits.AlignUp(length, width) case linux.IP_RECVERR: var errCmsg linux.SockErrCMsgIPv4 if length < errCmsg.SizeBytes() { return socket.ControlMessages{}, linuxerr.EINVAL } errCmsg.UnmarshalBytes(buf[i : i+errCmsg.SizeBytes()]) cmsgs.IP.SockErr = &errCmsg i += bits.AlignUp(length, width) default: return socket.ControlMessages{}, linuxerr.EINVAL } case linux.SOL_IPV6: switch h.Type { case linux.IPV6_TCLASS: if length < linux.SizeOfControlMessageTClass { return socket.ControlMessages{}, linuxerr.EINVAL } cmsgs.IP.HasTClass = true var tclass primitive.Uint32 tclass.UnmarshalUnsafe(buf[i : i+linux.SizeOfControlMessageTClass]) cmsgs.IP.TClass = uint32(tclass) i += bits.AlignUp(length, width) case linux.IPV6_RECVORIGDSTADDR: var addr linux.SockAddrInet6 if length < addr.SizeBytes() { return socket.ControlMessages{}, linuxerr.EINVAL } addr.UnmarshalUnsafe(buf[i : i+addr.SizeBytes()]) cmsgs.IP.OriginalDstAddress = &addr i += bits.AlignUp(length, width) case linux.IPV6_RECVERR: var errCmsg linux.SockErrCMsgIPv6 if length < errCmsg.SizeBytes() { return socket.ControlMessages{}, linuxerr.EINVAL } errCmsg.UnmarshalBytes(buf[i : i+errCmsg.SizeBytes()]) cmsgs.IP.SockErr = &errCmsg i += bits.AlignUp(length, width) default: return socket.ControlMessages{}, linuxerr.EINVAL } default: return socket.ControlMessages{}, linuxerr.EINVAL } } if cmsgs.Unix.Credentials == nil { cmsgs.Unix.Credentials = makeCreds(t, socketOrEndpoint) } if len(fds) > 0 { if kernel.VFS2Enabled { rights, err := NewSCMRightsVFS2(t, fds) if err != nil { return socket.ControlMessages{}, err } cmsgs.Unix.Rights = rights } else { rights, err := NewSCMRights(t, fds) if err != nil { return socket.ControlMessages{}, err } cmsgs.Unix.Rights = rights } } return cmsgs, nil } func makeCreds(t *kernel.Task, socketOrEndpoint interface{}) SCMCredentials { if t == nil || socketOrEndpoint == nil { return nil } if cr, ok := socketOrEndpoint.(transport.Credentialer); ok && (cr.Passcred() || cr.ConnectedPasscred()) { return MakeCreds(t) } return nil } // MakeCreds creates default SCMCredentials. func MakeCreds(t *kernel.Task) SCMCredentials { if t == nil { return nil } tcred := t.Credentials() return &scmCredentials{t, tcred.EffectiveKUID, tcred.EffectiveKGID} } // LINT.IfChange // New creates default control messages if needed. func New(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRights) transport.ControlMessages { return transport.ControlMessages{ Credentials: makeCreds(t, socketOrEndpoint), Rights: rights, } } // LINT.ThenChange(./control_vfs2.go)
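// The cmsg size arithmetic used by cmsgSpace and putCmsg above can be seen in
// isolation. A standalone sketch: the header size and 8-byte alignment below
// are assumptions for a 64-bit Linux target, restated locally so the snippet
// compiles on its own.
package main

import "fmt"

// sizeOfControlMessageHeader is struct cmsghdr on 64-bit Linux (assumption).
const sizeOfControlMessageHeader = 16

// alignUp rounds n up to the next multiple of align (align a power of two).
func alignUp(n, align int) int { return (n + align - 1) &^ (align - 1) }

func main() {
	// Space for a 3-FD SCM_RIGHTS message: a fixed header plus 3*4 data
	// bytes, with the data aligned up to the 8-byte word size:
	// 16 + alignUp(12, 8) = 16 + 16 = 32.
	dataLen := 3 * 4
	fmt.Println(sizeOfControlMessageHeader + alignUp(dataLen, 8)) // 32
}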
// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroupfs import ( "bytes" "fmt" "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/usage" ) // +stateify savable type memoryController struct { controllerCommon limitBytes int64 } var _ controller = (*memoryController)(nil) func newMemoryController(fs *filesystem, defaults map[string]int64) *memoryController { c := &memoryController{ // Linux sets this to (PAGE_COUNTER_MAX * PAGE_SIZE) by default, which // is ~ 2**63 on a 64-bit system. So essentially, infinity. The exact // value isn't very important. limitBytes: math.MaxInt64, } if val, ok := defaults["memory.limit_in_bytes"]; ok { c.limitBytes = val delete(defaults, "memory.limit_in_bytes") } c.controllerCommon.init(controllerMemory, fs) return c } // AddControlFiles implements controller.AddControlFiles. func (c *memoryController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) { contents["memory.usage_in_bytes"] = c.fs.newControllerFile(ctx, creds, &memoryUsageInBytesData{}) contents["memory.limit_in_bytes"] = c.fs.newStaticControllerFile(ctx, creds, linux.FileMode(0644), fmt.Sprintf("%d\n", c.limitBytes)) } // +stateify savable type memoryUsageInBytesData struct{} // Generate implements vfs.DynamicBytesSource.Generate. func (d *memoryUsageInBytesData) Generate(ctx context.Context, buf *bytes.Buffer) error { // TODO(b/183151557): This is a giant hack, we're using system-wide // accounting since we know there is only one cgroup. k := kernel.KernelFromContext(ctx) mf := k.MemoryFile() mf.UpdateUsage() _, totalBytes := usage.MemoryAccounting.Copy() fmt.Fprintf(buf, "%d\n", totalBytes) return nil }
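// The defaults-map pattern in newMemoryController (consult a mount-option map
// for an override, then delete the consumed key so leftovers can be rejected
// later) is easy to see standalone. An illustrative sketch; the map contents
// are arbitrary.
package main

import (
	"fmt"
	"math"
)

func main() {
	defaults := map[string]int64{"memory.limit_in_bytes": 1 << 30}

	// Effectively unlimited by default, as in Linux.
	limit := int64(math.MaxInt64)
	if val, ok := defaults["memory.limit_in_bytes"]; ok {
		limit = val
		delete(defaults, "memory.limit_in_bytes") // mark the option as consumed
	}
	fmt.Println(limit, len(defaults)) // 1073741824 0
}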
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package inet import ( "gvisor.dev/gvisor/pkg/context" ) // contextID is the inet package's type for context.Context.Value keys. type contextID int const ( // CtxStack is a Context.Value key for a network stack. CtxStack contextID = iota ) // StackFromContext returns the network stack associated with ctx. func StackFromContext(ctx context.Context) Stack { if v := ctx.Value(CtxStack); v != nil { return v.(Stack) } return nil }
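// The key-based lookup that StackFromContext performs follows the usual
// context.Value pattern: a private key type prevents collisions, and a nil
// result means no stack was attached. A hedged standalone sketch; miniCtx and
// stubStack are illustrative stand-ins, not real gVisor types.
package main

import "fmt"

type contextID int

const ctxStack contextID = iota

// miniCtx stands in for the Value method of a context.Context.
type miniCtx map[interface{}]interface{}

func (c miniCtx) Value(key interface{}) interface{} { return c[key] }

type stack interface{ Name() string }

type stubStack struct{}

func (stubStack) Name() string { return "stub" }

func stackFromContext(ctx miniCtx) stack {
	if v := ctx.Value(ctxStack); v != nil {
		return v.(stack)
	}
	return nil // no network stack attached to this context
}

func main() {
	ctx := miniCtx{ctxStack: stubStack{}}
	fmt.Println(stackFromContext(ctx).Name())        // stub
	fmt.Println(stackFromContext(miniCtx{}) == nil)  // true
}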
// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroupfs import ( "bytes" "fmt" "sort" "strconv" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/usermem" ) // controllerCommon implements kernel.CgroupController. // // Must call init before use. // // +stateify savable type controllerCommon struct { ty kernel.CgroupControllerType fs *filesystem } func (c *controllerCommon) init(ty kernel.CgroupControllerType, fs *filesystem) { c.ty = ty c.fs = fs } // Type implements kernel.CgroupController.Type. func (c *controllerCommon) Type() kernel.CgroupControllerType { return kernel.CgroupControllerType(c.ty) } // HierarchyID implements kernel.CgroupController.HierarchyID. func (c *controllerCommon) HierarchyID() uint32 { return c.fs.hierarchyID } // NumCgroups implements kernel.CgroupController.NumCgroups. func (c *controllerCommon) NumCgroups() uint64 { return atomic.LoadUint64(&c.fs.numCgroups) } // Enabled implements kernel.CgroupController.Enabled. // // Controllers are currently always enabled. func (c *controllerCommon) Enabled() bool { return true } // RootCgroup implements kernel.CgroupController.RootCgroup. func (c *controllerCommon) RootCgroup() kernel.Cgroup { return c.fs.rootCgroup() } // controller is an interface for common functionality related to all cgroups. // It is an extension of the public cgroup interface, containing cgroup // functionality private to cgroupfs. type controller interface { kernel.CgroupController // AddControlFiles should extend the contents map with inodes representing // control files defined by this controller. AddControlFiles(ctx context.Context, creds *auth.Credentials, c *cgroupInode, contents map[string]kernfs.Inode) } // cgroupInode implements kernel.CgroupImpl and kernfs.Inode. 
// // +stateify savable type cgroupInode struct { dir fs *filesystem // ts is the list of tasks in this cgroup. The kernel is responsible for // removing tasks from this list before they're destroyed, so any tasks on // this list are always valid. // // ts, and cgroup membership in general is protected by fs.tasksMu. ts map[*kernel.Task]struct{} } var _ kernel.CgroupImpl = (*cgroupInode)(nil) func (fs *filesystem) newCgroupInode(ctx context.Context, creds *auth.Credentials) kernfs.Inode { c := &cgroupInode{ fs: fs, ts: make(map[*kernel.Task]struct{}), } contents := make(map[string]kernfs.Inode) contents["cgroup.procs"] = fs.newControllerFile(ctx, creds, &cgroupProcsData{c}) contents["tasks"] = fs.newControllerFile(ctx, creds, &tasksData{c}) for _, ctl := range fs.controllers { ctl.AddControlFiles(ctx, creds, c, contents) } c.dir.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|linux.FileMode(0555)) c.dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) c.dir.InitRefs() c.dir.IncLinks(c.dir.OrderedChildren.Populate(contents)) atomic.AddUint64(&fs.numCgroups, 1) return c } func (c *cgroupInode) HierarchyID() uint32 { return c.fs.hierarchyID } // Controllers implements kernel.CgroupImpl.Controllers. func (c *cgroupInode) Controllers() []kernel.CgroupController { return c.fs.kcontrollers } // tasks returns a snapshot of the tasks inside the cgroup. func (c *cgroupInode) tasks() []*kernel.Task { c.fs.tasksMu.RLock() defer c.fs.tasksMu.RUnlock() ts := make([]*kernel.Task, 0, len(c.ts)) for t := range c.ts { ts = append(ts, t) } return ts } // Enter implements kernel.CgroupImpl.Enter. func (c *cgroupInode) Enter(t *kernel.Task) { c.fs.tasksMu.Lock() c.ts[t] = struct{}{} c.fs.tasksMu.Unlock() } // Leave implements kernel.CgroupImpl.Leave. func (c *cgroupInode) Leave(t *kernel.Task) { c.fs.tasksMu.Lock() delete(c.ts, t) c.fs.tasksMu.Unlock() } func sortTIDs(tids []kernel.ThreadID) { sort.Slice(tids, func(i, j int) bool { return tids[i] < tids[j] }) } // +stateify savable type cgroupProcsData struct { *cgroupInode } // Generate implements vfs.DynamicBytesSource.Generate. func (d *cgroupProcsData) Generate(ctx context.Context, buf *bytes.Buffer) error { t := kernel.TaskFromContext(ctx) currPidns := t.ThreadGroup().PIDNamespace() pgids := make(map[kernel.ThreadID]struct{}) for _, task := range d.tasks() { // Map dedups pgid, since iterating over all tasks produces multiple // entries for the group leaders. if pgid := currPidns.IDOfThreadGroup(task.ThreadGroup()); pgid != 0 { pgids[pgid] = struct{}{} } } pgidList := make([]kernel.ThreadID, 0, len(pgids)) for pgid, _ := range pgids { pgidList = append(pgidList, pgid) } sortTIDs(pgidList) for _, pgid := range pgidList { fmt.Fprintf(buf, "%d\n", pgid) } return nil } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *cgroupProcsData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { // TODO(b/183137098): Payload is the pid for a process to add to this cgroup. return src.NumBytes(), nil } // +stateify savable type tasksData struct { *cgroupInode } // Generate implements vfs.DynamicBytesSource.Generate. 
func (d *tasksData) Generate(ctx context.Context, buf *bytes.Buffer) error { t := kernel.TaskFromContext(ctx) currPidns := t.ThreadGroup().PIDNamespace() var pids []kernel.ThreadID for _, task := range d.tasks() { if pid := currPidns.IDOfTask(task); pid != 0 { pids = append(pids, pid) } } sortTIDs(pids) for _, pid := range pids { fmt.Fprintf(buf, "%d\n", pid) } return nil } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *tasksData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { // TODO(b/183137098): Payload is the pid for a process to add to this cgroup. return src.NumBytes(), nil } // parseInt64FromString interprets src as a string encoding an int64 value, and // returns the parsed value. func parseInt64FromString(ctx context.Context, src usermem.IOSequence, offset int64) (val, len int64, err error) { const maxInt64StrLen = 20 // i.e. len(fmt.Sprintf("%d", math.MinInt64)) == 20 t := kernel.TaskFromContext(ctx) src = src.DropFirst64(offset) buf := t.CopyScratchBuffer(maxInt64StrLen) n, err := src.CopyIn(ctx, buf) if err != nil { return 0, int64(n), err } buf = buf[:n] val, err = strconv.ParseInt(string(buf), 10, 64) if err != nil { // Note: This also handles zero-len writes if offset is beyond the end // of src, or src is empty. ctx.Warningf("cgroupfs.parseInt64FromString: failed to parse %q: %v", string(buf), err) return 0, int64(n), linuxerr.EINVAL } return val, int64(n), nil }
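// The dedup-and-sort performed by cgroupProcsData.Generate (a set keyed by
// thread-group ID, emitted in ascending order, one per line) can be shown
// standalone. An illustrative sketch; the IDs are arbitrary, with repeats
// standing in for the multiple tasks of each thread group.
package main

import (
	"fmt"
	"sort"
)

func main() {
	ids := []int32{7, 3, 7, 1, 3} // one entry per task; leaders repeat

	// The map dedups, just as pgids does in Generate.
	set := make(map[int32]struct{})
	for _, id := range ids {
		set[id] = struct{}{}
	}
	list := make([]int32, 0, len(set))
	for id := range set {
		list = append(list, id)
	}
	sort.Slice(list, func(i, j int) bool { return list[i] < list[j] })
	for _, id := range list {
		fmt.Printf("%d\n", id) // 1, 3, 7
	}
}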
// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroupfs import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // +stateify savable type cpusetController struct { controllerCommon } var _ controller = (*cpusetController)(nil) func newCPUSetController(fs *filesystem) *cpusetController { c := &cpusetController{} c.controllerCommon.init(controllerCPUSet, fs) return c } // AddControlFiles implements controller.AddControlFiles. func (c *cpusetController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) { // This controller is currently intentionally empty. }
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package buffer const ( // embeddedCount is the number of buffer structures embedded in the pool. It // is also the allocation size for overflow allocations. embeddedCount = 8 // defaultBufferSize is the default size for each underlying storage buffer. // // It is slightly less than two pages. This is done intentionally to ensure // that the buffer object aligns with runtime internals. This two page size // will effectively minimize internal fragmentation, but still have a large // enough chunk to limit excessive segmentation. defaultBufferSize = 8144 ) // pool allocates buffers. // // It contains an embedded buffer storage for the fast path when the number of // buffers needed is small. // // +stateify savable type pool struct { bufferSize int avail []buffer `state:"nosave"` embeddedStorage [embeddedCount]buffer `state:"wait"` } // get gets a new buffer from p. func (p *pool) get() *buffer { buf := p.getNoInit() buf.init(p.bufferSize) return buf } // getNoInit gets a new buffer from p without initializing it. func (p *pool) getNoInit() *buffer { if p.avail == nil { p.avail = p.embeddedStorage[:] } if len(p.avail) == 0 { p.avail = make([]buffer, embeddedCount) } if p.bufferSize <= 0 { p.bufferSize = defaultBufferSize } buf := &p.avail[0] p.avail = p.avail[1:] return buf } // put releases buf. func (p *pool) put(buf *buffer) { // Remove reference to the underlying storage, allowing it to be garbage // collected. buf.data = nil buf.Reset() } // setBufferSize sets the size of underlying storage buffer for future // allocations. It can be called at any time. func (p *pool) setBufferSize(size int) { p.bufferSize = size } // afterLoad is invoked by stateify. func (p *pool) afterLoad() { // S/R does not save subslice into embeddedStorage correctly. Restore // available portion of embeddedStorage manually. Restore as nil if none used. for i := len(p.embeddedStorage); i > 0; i-- { if p.embeddedStorage[i-1].data != nil { p.avail = p.embeddedStorage[i:] break } } }
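// The pool's allocation pattern (serve the first embeddedCount buffers from
// the embedded array, then fall back to fresh heap slices of the same size)
// can be reduced to a few lines. A standalone sketch with stub types; the
// real pool additionally initializes buffers and supports save/restore.
package main

import "fmt"

const embeddedCount = 8

type buffer struct{ data []byte }

type pool struct {
	avail    []buffer
	embedded [embeddedCount]buffer
}

func (p *pool) get() *buffer {
	if p.avail == nil {
		p.avail = p.embedded[:] // fast path: hand out embedded storage first
	}
	if len(p.avail) == 0 {
		p.avail = make([]buffer, embeddedCount) // overflow allocation
	}
	buf := &p.avail[0]
	p.avail = p.avail[1:]
	return buf
}

func main() {
	var p pool
	first := p.get()
	fmt.Println(first == &p.embedded[0]) // true: served from the embedded array
}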
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/mm"
)

// Prctl implements linux syscall prctl(2).
// It has a list of subfunctions which operate on the process, selected by the
// first argument; the interpretation of the remaining arguments depends on
// the chosen subfunction.
func Prctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	option := args[0].Int()

	switch option {
	case linux.PR_SET_PDEATHSIG:
		sig := linux.Signal(args[1].Int())
		if sig != 0 && !sig.IsValid() {
			return 0, nil, linuxerr.EINVAL
		}
		t.SetParentDeathSignal(sig)
		return 0, nil, nil

	case linux.PR_GET_PDEATHSIG:
		_, err := primitive.CopyInt32Out(t, args[1].Pointer(), int32(t.ParentDeathSignal()))
		return 0, nil, err

	case linux.PR_GET_DUMPABLE:
		d := t.MemoryManager().Dumpability()
		switch d {
		case mm.NotDumpable:
			return linux.SUID_DUMP_DISABLE, nil, nil
		case mm.UserDumpable:
			return linux.SUID_DUMP_USER, nil, nil
		case mm.RootDumpable:
			return linux.SUID_DUMP_ROOT, nil, nil
		default:
			panic(fmt.Sprintf("Unknown dumpability %v", d))
		}

	case linux.PR_SET_DUMPABLE:
		var d mm.Dumpability
		switch args[1].Int() {
		case linux.SUID_DUMP_DISABLE:
			d = mm.NotDumpable
		case linux.SUID_DUMP_USER:
			d = mm.UserDumpable
		default:
			// N.B. Userspace may not pass SUID_DUMP_ROOT.
			return 0, nil, linuxerr.EINVAL
		}
		t.MemoryManager().SetDumpability(d)
		return 0, nil, nil

	case linux.PR_GET_KEEPCAPS:
		if t.Credentials().KeepCaps {
			return 1, nil, nil
		}
		return 0, nil, nil

	case linux.PR_SET_KEEPCAPS:
		val := args[1].Int()
		// prctl(2): arg2 must be either 0 (permitted capabilities are cleared)
		// or 1 (permitted capabilities are kept).
		if val == 0 {
			t.SetKeepCaps(false)
		} else if val == 1 {
			t.SetKeepCaps(true)
		} else {
			return 0, nil, linuxerr.EINVAL
		}
		return 0, nil, nil

	case linux.PR_SET_NAME:
		addr := args[1].Pointer()
		name, err := t.CopyInString(addr, linux.TASK_COMM_LEN-1)
		if err != nil && !linuxerr.Equals(linuxerr.ENAMETOOLONG, err) {
			return 0, nil, err
		}
		t.SetName(name)

	case linux.PR_GET_NAME:
		addr := args[1].Pointer()
		buf := t.CopyScratchBuffer(linux.TASK_COMM_LEN)
		// n is the length of the copied-out name; avoid shadowing the len
		// builtin.
		n := copy(buf, t.Name())
		if n < linux.TASK_COMM_LEN {
			buf[n] = 0
			n++
		}
		_, err := t.CopyOutBytes(addr, buf[:n])
		if err != nil {
			return 0, nil, err
		}

	case linux.PR_SET_MM:
		if !t.HasCapability(linux.CAP_SYS_RESOURCE) {
			return 0, nil, linuxerr.EPERM
		}

		switch args[1].Int() {
		case linux.PR_SET_MM_EXE_FILE:
			fd := args[2].Int()

			file := t.GetFile(fd)
			if file == nil {
				return 0, nil, linuxerr.EBADF
			}
			defer file.DecRef(t)

			// Are they trying to set the exe to a non-file?
			if !fs.IsFile(file.Dirent.Inode.StableAttr) {
				return 0, nil, linuxerr.EBADF
			}

			// Set the underlying executable.
			t.MemoryManager().SetExecutable(t, fsbridge.NewFSFile(file))

		case linux.PR_SET_MM_AUXV,
			linux.PR_SET_MM_START_CODE,
			linux.PR_SET_MM_END_CODE,
			linux.PR_SET_MM_START_DATA,
			linux.PR_SET_MM_END_DATA,
			linux.PR_SET_MM_START_STACK,
			linux.PR_SET_MM_START_BRK,
			linux.PR_SET_MM_BRK,
			linux.PR_SET_MM_ARG_START,
			linux.PR_SET_MM_ARG_END,
			linux.PR_SET_MM_ENV_START,
			linux.PR_SET_MM_ENV_END:

			t.Kernel().EmitUnimplementedEvent(t)
			fallthrough
		default:
			return 0, nil, linuxerr.EINVAL
		}

	case linux.PR_SET_NO_NEW_PRIVS:
		if args[1].Int() != 1 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 {
			return 0, nil, linuxerr.EINVAL
		}
		// PR_SET_NO_NEW_PRIVS is assumed to always be set.
		// See kernel.Task.updateCredsForExecLocked.
		return 0, nil, nil

	case linux.PR_GET_NO_NEW_PRIVS:
		if args[1].Int() != 0 || args[2].Int() != 0 || args[3].Int() != 0 || args[4].Int() != 0 {
			return 0, nil, linuxerr.EINVAL
		}
		return 1, nil, nil

	case linux.PR_SET_PTRACER:
		pid := args[1].Int()
		switch pid {
		case 0:
			t.ClearYAMAException()
			return 0, nil, nil
		case linux.PR_SET_PTRACER_ANY:
			t.SetYAMAException(nil)
			return 0, nil, nil
		default:
			tracer := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid))
			if tracer == nil {
				return 0, nil, linuxerr.EINVAL
			}
			t.SetYAMAException(tracer)
			return 0, nil, nil
		}

	case linux.PR_SET_SECCOMP:
		if args[1].Int() != linux.SECCOMP_MODE_FILTER {
			// Unsupported mode.
			return 0, nil, linuxerr.EINVAL
		}
		return 0, nil, seccomp(t, linux.SECCOMP_SET_MODE_FILTER, 0, args[2].Pointer())

	case linux.PR_GET_SECCOMP:
		return uintptr(t.SeccompMode()), nil, nil

	case linux.PR_CAPBSET_READ:
		cp := linux.Capability(args[1].Uint64())
		if !cp.Ok() {
			return 0, nil, linuxerr.EINVAL
		}
		var rv uintptr
		if auth.CapabilitySetOf(cp)&t.Credentials().BoundingCaps != 0 {
			rv = 1
		}
		return rv, nil, nil

	case linux.PR_CAPBSET_DROP:
		cp := linux.Capability(args[1].Uint64())
		if !cp.Ok() {
			return 0, nil, linuxerr.EINVAL
		}
		return 0, nil, t.DropBoundingCapability(cp)

	case linux.PR_GET_TIMING,
		linux.PR_SET_TIMING,
		linux.PR_GET_TSC,
		linux.PR_SET_TSC,
		linux.PR_TASK_PERF_EVENTS_DISABLE,
		linux.PR_TASK_PERF_EVENTS_ENABLE,
		linux.PR_GET_TIMERSLACK,
		linux.PR_SET_TIMERSLACK,
		linux.PR_MCE_KILL,
		linux.PR_MCE_KILL_GET,
		linux.PR_GET_TID_ADDRESS,
		linux.PR_SET_CHILD_SUBREAPER,
		linux.PR_GET_CHILD_SUBREAPER,
		linux.PR_GET_THP_DISABLE,
		linux.PR_SET_THP_DISABLE,
		linux.PR_MPX_ENABLE_MANAGEMENT,
		linux.PR_MPX_DISABLE_MANAGEMENT:

		t.Kernel().EmitUnimplementedEvent(t)
		fallthrough
	default:
		return 0, nil, linuxerr.EINVAL
	}

	return 0, nil, nil
}
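// Editorial note: the snippet below is an illustrative sketch added for this
// edition; it is not part of the gVisor sources. It shows what a guest
// program exercising the PR_SET_NAME/PR_GET_NAME paths above might look like,
// via golang.org/x/sys/unix (Linux only). Passing Go pointers as uintptr is
// the usual pattern for these raw prctl wrappers, but it is unsafe-package
// territory; treat this as a sketch, not production code.
package main

import (
	"bytes"
	"fmt"
	"unsafe"

	"golang.org/x/sys/unix"
)

func main() {
	// Names longer than TASK_COMM_LEN-1 (15) bytes are truncated, matching
	// the CopyInString(addr, linux.TASK_COMM_LEN-1) call in Prctl above.
	name := []byte("demo-task-name\x00")
	if err := unix.Prctl(unix.PR_SET_NAME, uintptr(unsafe.Pointer(&name[0])), 0, 0, 0); err != nil {
		panic(err)
	}

	buf := make([]byte, 16) // TASK_COMM_LEN
	if err := unix.Prctl(unix.PR_GET_NAME, uintptr(unsafe.Pointer(&buf[0])), 0, 0, 0); err != nil {
		panic(err)
	}
	fmt.Println(string(buf[:bytes.IndexByte(buf, 0)])) // demo-task-name
}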
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs2

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// Mount implements Linux syscall mount(2).
func Mount(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	sourceAddr := args[0].Pointer()
	targetAddr := args[1].Pointer()
	typeAddr := args[2].Pointer()
	flags := args[3].Uint64()
	dataAddr := args[4].Pointer()

	// For null-terminated strings related to mount(2), Linux copies in at most
	// a page worth of data. See fs/namespace.c:copy_mount_string().
	fsType, err := t.CopyInString(typeAddr, hostarch.PageSize)
	if err != nil {
		return 0, nil, err
	}
	source, err := t.CopyInString(sourceAddr, hostarch.PageSize)
	if err != nil {
		return 0, nil, err
	}

	targetPath, err := copyInPath(t, targetAddr)
	if err != nil {
		return 0, nil, err
	}

	data := ""
	if dataAddr != 0 {
		// In Linux, a full page is always copied in regardless of null
		// character placement, and the address is passed to each file system.
		// Most file systems always treat this data as a string, though, and so
		// do all of the ones we implement.
		data, err = t.CopyInString(dataAddr, hostarch.PageSize)
		if err != nil {
			return 0, nil, err
		}
	}

	// Ignore magic value that was required before Linux 2.4.
	if flags&linux.MS_MGC_MSK == linux.MS_MGC_VAL {
		flags = flags &^ linux.MS_MGC_MSK
	}

	// Must have CAP_SYS_ADMIN in the current mount namespace's associated user
	// namespace.
	creds := t.Credentials()
	if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) {
		return 0, nil, linuxerr.EPERM
	}

	const unsupportedOps = linux.MS_REMOUNT | linux.MS_BIND |
		linux.MS_SHARED | linux.MS_PRIVATE | linux.MS_SLAVE |
		linux.MS_UNBINDABLE | linux.MS_MOVE

	// Silently allow MS_NOSUID, since we don't implement set-id bits
	// anyway.
	const unsupportedFlags = linux.MS_NODIRATIME | linux.MS_STRICTATIME

	// Linux just allows passing any flags to mount(2) - it won't fail when
	// unknown or unsupported flags are passed. Since we don't implement
	// everything, we fail explicitly on flags that are unimplemented.
if flags&(unsupportedOps|unsupportedFlags) != 0 { return 0, nil, linuxerr.EINVAL } var opts vfs.MountOptions if flags&linux.MS_NOATIME == linux.MS_NOATIME { opts.Flags.NoATime = true } if flags&linux.MS_NOEXEC == linux.MS_NOEXEC { opts.Flags.NoExec = true } if flags&linux.MS_NODEV == linux.MS_NODEV { opts.Flags.NoDev = true } if flags&linux.MS_NOSUID == linux.MS_NOSUID { opts.Flags.NoSUID = true } if flags&linux.MS_RDONLY == linux.MS_RDONLY { opts.ReadOnly = true } opts.GetFilesystemOptions.Data = data target, err := getTaskPathOperation(t, linux.AT_FDCWD, targetPath, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return 0, nil, err } defer target.Release(t) _, err = t.Kernel().VFS().MountAt(t, creds, source, &target.pop, fsType, &opts) return 0, nil, err } // Umount2 implements Linux syscall umount2(2). func Umount2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() flags := args[1].Int() // Must have CAP_SYS_ADMIN in the mount namespace's associated user // namespace. // // Currently, this is always the init task's user namespace. creds := t.Credentials() if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, t.MountNamespaceVFS2().Owner) { return 0, nil, linuxerr.EPERM } const unsupported = linux.MNT_FORCE | linux.MNT_EXPIRE if flags&unsupported != 0 { return 0, nil, linuxerr.EINVAL } path, err := copyInPath(t, addr) if err != nil { return 0, nil, err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) opts := vfs.UmountOptions{ Flags: uint32(flags), } return 0, nil, t.Kernel().VFS().UmountAt(t, creds, &tpop.pop, &opts) }
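// Editorial note: the snippet below is an illustrative sketch added for this
// edition; it is not part of the gVisor sources. It shows the userspace side
// of the Mount/Umount2 handlers above: the MS_* flags map onto
// opts.Flags.NoSUID/NoDev/NoExec, and the data string travels in
// GetFilesystemOptions.Data. Requires CAP_SYS_ADMIN; the target path is
// invented.
package main

import "golang.org/x/sys/unix"

func main() {
	if err := unix.Mount("tmpfs", "/mnt/demo", "tmpfs",
		unix.MS_NOSUID|unix.MS_NODEV|unix.MS_NOEXEC, "mode=0755"); err != nil {
		panic(err)
	}
	// Plain unmount; MNT_FORCE and MNT_EXPIRE would be rejected by the
	// Umount2 handler above.
	if err := unix.Unmount("/mnt/demo", 0); err != nil {
		panic(err)
	}
}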
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"time"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/syserror"
)

const nsecPerSec = int64(time.Second)

// Getitimer implements linux syscall getitimer(2).
func Getitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	if t.Arch().Width() != 8 {
		// Definition of linux.ItimerVal assumes 64-bit architecture.
		return 0, nil, syserror.ENOSYS
	}
	timerID := args[0].Int()
	addr := args[1].Pointer()

	olditv, err := t.Getitimer(timerID)
	if err != nil {
		return 0, nil, err
	}
	// A NULL address is allowed, in which case no copy out takes place.
	if addr == 0 {
		return 0, nil, nil
	}
	_, err = olditv.CopyOut(t, addr)
	return 0, nil, err
}

// Setitimer implements linux syscall setitimer(2).
func Setitimer(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	if t.Arch().Width() != 8 {
		// Definition of linux.ItimerVal assumes 64-bit architecture.
		return 0, nil, syserror.ENOSYS
	}
	timerID := args[0].Int()
	newAddr := args[1].Pointer()
	oldAddr := args[2].Pointer()

	var newitv linux.ItimerVal
	// A NULL address is allowed because Linux allows
	// setitimer(which, NULL, &old_value) which disables the timer. There is a
	// KERN_WARN message saying this misfeature will be removed. However, that
	// hasn't happened as of 3.19, so we continue to support it.
	if newAddr != 0 {
		if _, err := newitv.CopyIn(t, newAddr); err != nil {
			return 0, nil, err
		}
	}
	olditv, err := t.Setitimer(timerID, newitv)
	if err != nil {
		return 0, nil, err
	}
	// A NULL address is allowed, in which case no copy out takes place.
	if oldAddr == 0 {
		return 0, nil, nil
	}
	_, err = olditv.CopyOut(t, oldAddr)
	return 0, nil, err
}

// Alarm implements linux syscall alarm(2).
func Alarm(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { duration := time.Duration(args[0].Uint()) * time.Second olditv, err := t.Setitimer(linux.ITIMER_REAL, linux.ItimerVal{ Value: linux.DurationToTimeval(duration), }) if err != nil { return 0, nil, err } olddur := olditv.Value.ToDuration() secs := olddur.Round(time.Second).Nanoseconds() / nsecPerSec if secs == 0 && olddur != 0 { // We can't return 0 if an alarm was previously scheduled. secs = 1 } return uintptr(secs), nil, nil } // TimerCreate implements linux syscall timer_create(2). func TimerCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { clockID := args[0].Int() sevp := args[1].Pointer() timerIDp := args[2].Pointer() c, err := getClock(t, clockID) if err != nil { return 0, nil, err } var sev *linux.Sigevent if sevp != 0 { sev = &linux.Sigevent{} if _, err = sev.CopyIn(t, sevp); err != nil { return 0, nil, err } } id, err := t.IntervalTimerCreate(c, sev) if err != nil { return 0, nil, err } if _, err := id.CopyOut(t, timerIDp); err != nil { t.IntervalTimerDelete(id) return 0, nil, err } return 0, nil, nil } // TimerSettime implements linux syscall timer_settime(2). func TimerSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { timerID := linux.TimerID(args[0].Value) flags := args[1].Int() newValAddr := args[2].Pointer() oldValAddr := args[3].Pointer() var newVal linux.Itimerspec if _, err := newVal.CopyIn(t, newValAddr); err != nil { return 0, nil, err } oldVal, err := t.IntervalTimerSettime(timerID, newVal, flags&linux.TIMER_ABSTIME != 0) if err != nil { return 0, nil, err } if oldValAddr != 0 { _, err = oldVal.CopyOut(t, oldValAddr) return 0, nil, err } return 0, nil, nil } // TimerGettime implements linux syscall timer_gettime(2). func TimerGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { timerID := linux.TimerID(args[0].Value) curValAddr := args[1].Pointer() curVal, err := t.IntervalTimerGettime(timerID) if err != nil { return 0, nil, err } _, err = curVal.CopyOut(t, curValAddr) return 0, nil, err } // TimerGetoverrun implements linux syscall timer_getoverrun(2). func TimerGetoverrun(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { timerID := linux.TimerID(args[0].Value) o, err := t.IntervalTimerGetoverrun(timerID) if err != nil { return 0, nil, err } return uintptr(o), nil, nil } // TimerDelete implements linux syscall timer_delete(2). func TimerDelete(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { timerID := linux.TimerID(args[0].Value) return 0, nil, t.IntervalTimerDelete(timerID) }
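// Editorial note: the snippet below is an illustrative sketch added for this
// edition; it is not part of the gVisor sources. It isolates the rounding
// rule Alarm above applies to the previous timer value: alarm(2) must never
// report 0 while an alarm is still pending, so sub-second remainders round
// up to 1 second.
package main

import (
	"fmt"
	"time"
)

func remainingSeconds(old time.Duration) int64 {
	secs := old.Round(time.Second).Nanoseconds() / int64(time.Second)
	if secs == 0 && old != 0 {
		// We can't return 0 if an alarm was previously scheduled.
		secs = 1
	}
	return secs
}

func main() {
	fmt.Println(remainingSeconds(0))                       // 0: nothing pending
	fmt.Println(remainingSeconds(300 * time.Millisecond))  // 1: rounds up, not 0
	fmt.Println(remainingSeconds(1700 * time.Millisecond)) // 2: nearest second
}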
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs2

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
)

// TimerfdCreate implements Linux syscall timerfd_create(2).
func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	clockID := args[0].Int()
	flags := args[1].Int()

	if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	// Timerfds aren't writable per se (their implementation of Write just
	// returns EINVAL), but they are "opened for writing", which is necessary
	// to actually reach said implementation of Write.
	fileFlags := uint32(linux.O_RDWR)
	if flags&linux.TFD_NONBLOCK != 0 {
		fileFlags |= linux.O_NONBLOCK
	}

	var clock ktime.Clock
	switch clockID {
	case linux.CLOCK_REALTIME:
		clock = t.Kernel().RealtimeClock()
	case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME:
		clock = t.Kernel().MonotonicClock()
	default:
		return 0, nil, linuxerr.EINVAL
	}
	vfsObj := t.Kernel().VFS()
	file, err := timerfd.New(t, vfsObj, clock, fileFlags)
	if err != nil {
		return 0, nil, err
	}
	defer file.DecRef(t)
	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
		CloseOnExec: flags&linux.TFD_CLOEXEC != 0,
	})
	if err != nil {
		return 0, nil, err
	}
	return uintptr(fd), nil, nil
}

// TimerfdSettime implements Linux syscall timerfd_settime(2).
func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	flags := args[1].Int()
	newValAddr := args[2].Pointer()
	oldValAddr := args[3].Pointer()

	if flags&^(linux.TFD_TIMER_ABSTIME) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	file := t.GetFileVFS2(fd)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	tfd, ok := file.Impl().(*timerfd.TimerFileDescription)
	if !ok {
		return 0, nil, linuxerr.EINVAL
	}

	var newVal linux.Itimerspec
	if _, err := newVal.CopyIn(t, newValAddr); err != nil {
		return 0, nil, err
	}
	newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tfd.Clock())
	if err != nil {
		return 0, nil, err
	}
	tm, oldS := tfd.SetTime(newS)
	if oldValAddr != 0 {
		oldVal := ktime.ItimerspecFromSetting(tm, oldS)
		if _, err := oldVal.CopyOut(t, oldValAddr); err != nil {
			return 0, nil, err
		}
	}
	return 0, nil, nil
}

// TimerfdGettime implements Linux syscall timerfd_gettime(2).
func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() curValAddr := args[1].Pointer() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) tfd, ok := file.Impl().(*timerfd.TimerFileDescription) if !ok { return 0, nil, linuxerr.EINVAL } tm, s := tfd.GetTime() curVal := ktime.ItimerspecFromSetting(tm, s) _, err := curVal.CopyOut(t, curValAddr) return 0, nil, err }
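// Editorial note: the snippet below is an illustrative sketch added for this
// edition; it is not part of the gVisor sources. It exercises the
// TimerfdCreate/TimerfdSettime/TimerfdGettime handlers above from the guest
// side, via golang.org/x/sys/unix (Linux only).
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	fd, err := unix.TimerfdCreate(unix.CLOCK_MONOTONIC, unix.TFD_CLOEXEC)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fd)

	// One-shot timer, 500ms from now. Without TFD_TIMER_ABSTIME the value is
	// relative, mirroring the flags check in TimerfdSettime above.
	newVal := unix.ItimerSpec{Value: unix.Timespec{Nsec: 500_000_000}}
	if err := unix.TimerfdSettime(fd, 0, &newVal, nil); err != nil {
		panic(err)
	}

	var cur unix.ItimerSpec
	if err := unix.TimerfdGettime(fd, &cur); err != nil {
		panic(err)
	}
	fmt.Printf("remaining: %d.%09ds\n", cur.Value.Sec, cur.Value.Nsec)
}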
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package safemem

import (
	"bytes"
	"fmt"
	"unsafe"

	"golang.org/x/sys/unix"

	"gvisor.dev/gvisor/pkg/gohacks"
)

// A BlockSeq represents a sequence of Blocks, each of which has non-zero
// length.
//
// BlockSeqs are immutable and may be copied by value. The zero value of
// BlockSeq represents an empty sequence.
type BlockSeq struct {
	// If length is 0, then the BlockSeq is empty. Invariants: data == 0;
	// offset == 0; limit == 0.
	//
	// If length is -1, then the BlockSeq represents the single Block{data,
	// limit, false}. Invariants: offset == 0; limit > 0; limit does not
	// overflow the range of an int.
	//
	// If length is -2, then the BlockSeq represents the single Block{data,
	// limit, true}. Invariants: offset == 0; limit > 0; limit does not
	// overflow the range of an int.
	//
	// Otherwise, length >= 2, and the BlockSeq represents the `length` Blocks
	// in the array of Blocks starting at address `data`, starting at `offset`
	// bytes into the first Block and limited to the following `limit` bytes.
	// Invariants: data != 0; offset < len(data[0]); limit > 0; offset+limit <=
	// the combined length of all Blocks in the array; the first Block in the
	// array has non-zero length.
	//
	// length is never 1; sequences consisting of a single Block are always
	// stored inline (with length < 0).
	data   unsafe.Pointer
	length int
	offset int
	limit  uint64
}

// BlockSeqOf returns a BlockSeq representing the single Block b.
func BlockSeqOf(b Block) BlockSeq { if b.length == 0 { return BlockSeq{} } bs := BlockSeq{ data: b.start, length: -1, limit: uint64(b.length), } if b.needSafecopy { bs.length = -2 } return bs } // BlockSeqFromSlice returns a BlockSeq representing all Blocks in slice. // If slice contains Blocks with zero length, BlockSeq will skip them during // iteration. // // Whether the returned BlockSeq shares memory with slice is unspecified; // clients should avoid mutating slices passed to BlockSeqFromSlice. // // Preconditions: The combined length of all Blocks in slice <= math.MaxUint64. func BlockSeqFromSlice(slice []Block) BlockSeq { slice = skipEmpty(slice) var limit uint64 for _, b := range slice { sum := limit + uint64(b.Len()) if sum < limit { panic("BlockSeq length overflows uint64") } limit = sum } return blockSeqFromSliceLimited(slice, limit) } // Preconditions: // * The combined length of all Blocks in slice <= limit. // * If len(slice) != 0, the first Block in slice has non-zero length and // limit > 0. func blockSeqFromSliceLimited(slice []Block, limit uint64) BlockSeq { switch len(slice) { case 0: return BlockSeq{} case 1: return BlockSeqOf(slice[0].TakeFirst64(limit)) default: return BlockSeq{ data: unsafe.Pointer(&slice[0]), length: len(slice), limit: limit, } } } func skipEmpty(slice []Block) []Block { for i, b := range slice { if b.Len() != 0 { return slice[i:] } } return nil } // IsEmpty returns true if bs contains no Blocks. // // Invariants: bs.IsEmpty() == (bs.NumBlocks() == 0) == (bs.NumBytes() == 0). // (Of these, prefer to use bs.IsEmpty().) func (bs BlockSeq) IsEmpty() bool { return bs.length == 0 } // NumBlocks returns the number of Blocks in bs. func (bs BlockSeq) NumBlocks() int { // In general, we have to count: if bs represents a windowed slice then the // slice may contain Blocks with zero length, and bs.length may be larger // than the actual number of Blocks due to bs.limit. var n int for !bs.IsEmpty() { n++ bs = bs.Tail() } return n } // NumBytes returns the sum of Block.Len() for all Blocks in bs. func (bs BlockSeq) NumBytes() uint64 { return bs.limit } // Head returns the first Block in bs. // // Preconditions: !bs.IsEmpty(). func (bs BlockSeq) Head() Block { if bs.length == 0 { panic("empty BlockSeq") } if bs.length < 0 { return bs.internalBlock() } return (*Block)(bs.data).DropFirst(bs.offset).TakeFirst64(bs.limit) } // Preconditions: bs.length < 0. func (bs BlockSeq) internalBlock() Block { return Block{ start: bs.data, length: int(bs.limit), needSafecopy: bs.length == -2, } } // Tail returns a BlockSeq consisting of all Blocks in bs after the first. // // Preconditions: !bs.IsEmpty(). func (bs BlockSeq) Tail() BlockSeq { if bs.length == 0 { panic("empty BlockSeq") } if bs.length < 0 { return BlockSeq{} } head := (*Block)(bs.data).DropFirst(bs.offset) headLen := uint64(head.Len()) if headLen >= bs.limit { // The head Block exhausts the limit, so the tail is empty. return BlockSeq{} } var extSlice []Block extSliceHdr := (*gohacks.SliceHeader)(unsafe.Pointer(&extSlice)) extSliceHdr.Data = bs.data extSliceHdr.Len = bs.length extSliceHdr.Cap = bs.length tailSlice := skipEmpty(extSlice[1:]) tailLimit := bs.limit - headLen return blockSeqFromSliceLimited(tailSlice, tailLimit) } // DropFirst returns a BlockSeq equivalent to bs, but with the first n bytes // omitted. If n > bs.NumBytes(), DropFirst returns an empty BlockSeq. // // Preconditions: n >= 0. 
func (bs BlockSeq) DropFirst(n int) BlockSeq {
	if n < 0 {
		panic(fmt.Sprintf("invalid n: %d", n))
	}
	return bs.DropFirst64(uint64(n))
}

// DropFirst64 is equivalent to DropFirst but takes a uint64.
func (bs BlockSeq) DropFirst64(n uint64) BlockSeq {
	if n >= bs.limit {
		return BlockSeq{}
	}
	for {
		// Calling bs.Head() here is surprisingly expensive, so inline getting
		// the head's length.
		var headLen uint64
		if bs.length < 0 {
			headLen = bs.limit
		} else {
			headLen = uint64((*Block)(bs.data).Len() - bs.offset)
		}
		if n < headLen {
			// Dropping ends partway through the head Block.
			if bs.length < 0 {
				return BlockSeqOf(bs.internalBlock().DropFirst64(n))
			}
			bs.offset += int(n)
			bs.limit -= n
			return bs
		}
		n -= headLen
		bs = bs.Tail()
	}
}

// TakeFirst returns a BlockSeq equivalent to the first n bytes of bs. If n >
// bs.NumBytes(), TakeFirst returns a BlockSeq equivalent to bs.
//
// Preconditions: n >= 0.
func (bs BlockSeq) TakeFirst(n int) BlockSeq {
	if n < 0 {
		panic(fmt.Sprintf("invalid n: %d", n))
	}
	return bs.TakeFirst64(uint64(n))
}

// TakeFirst64 is equivalent to TakeFirst but takes a uint64.
func (bs BlockSeq) TakeFirst64(n uint64) BlockSeq {
	if n == 0 {
		return BlockSeq{}
	}
	if bs.limit > n {
		bs.limit = n
	}
	return bs
}

// String implements fmt.Stringer.String.
func (bs BlockSeq) String() string {
	var buf bytes.Buffer
	buf.WriteByte('[')
	var sep string
	for !bs.IsEmpty() {
		buf.WriteString(sep)
		sep = " "
		buf.WriteString(bs.Head().String())
		bs = bs.Tail()
	}
	buf.WriteByte(']')
	return buf.String()
}

// CopySeq copies srcs.NumBytes() or dsts.NumBytes() bytes, whichever is less,
// from srcs to dsts and returns the number of bytes copied.
//
// If srcs and dsts overlap, the data stored in dsts is unspecified.
func CopySeq(dsts, srcs BlockSeq) (uint64, error) {
	var done uint64
	for !dsts.IsEmpty() && !srcs.IsEmpty() {
		dst := dsts.Head()
		src := srcs.Head()
		n, err := Copy(dst, src)
		done += uint64(n)
		if err != nil {
			return done, err
		}
		dsts = dsts.DropFirst(n)
		srcs = srcs.DropFirst(n)
	}
	return done, nil
}

// ZeroSeq sets all bytes in dsts to 0 and returns the number of bytes zeroed.
func ZeroSeq(dsts BlockSeq) (uint64, error) {
	var done uint64
	for !dsts.IsEmpty() {
		n, err := Zero(dsts.Head())
		done += uint64(n)
		if err != nil {
			return done, err
		}
		dsts = dsts.DropFirst(n)
	}
	return done, nil
}

// IovecsFromBlockSeq returns a []unix.Iovec representing seq.
func IovecsFromBlockSeq(bs BlockSeq) []unix.Iovec {
	iovs := make([]unix.Iovec, 0, bs.NumBlocks())
	for ; !bs.IsEmpty(); bs = bs.Tail() {
		b := bs.Head()
		iovs = append(iovs, unix.Iovec{
			Base: &b.ToSlice()[0],
			Len:  uint64(b.Len()),
		})
		// We don't need to care about b.NeedSafecopy(), because the host
		// kernel will handle such address ranges just fine (by returning
		// EFAULT).
	}
	return iovs
}
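// Editorial note: the snippet below is an illustrative sketch added for this
// edition; it is not part of the gVisor sources. It models BlockSeq's
// DropFirst windowing on a plain [][]byte, without the unsafe inline-block
// encoding, to make the head/tail arithmetic above easier to follow.
package main

import "fmt"

type seq struct{ blocks [][]byte }

// dropFirst mirrors BlockSeq.DropFirst64: whole head blocks are consumed
// until the drop ends partway through a block, which is then truncated.
func (s seq) dropFirst(n int) seq {
	blocks := s.blocks
	for len(blocks) > 0 && n > 0 {
		head := blocks[0]
		if n < len(head) {
			// Dropping ends partway through the head block.
			return seq{append([][]byte{head[n:]}, blocks[1:]...)}
		}
		n -= len(head)
		blocks = blocks[1:]
	}
	return seq{blocks}
}

func (s seq) numBytes() (n int) {
	for _, b := range s.blocks {
		n += len(b)
	}
	return n
}

func main() {
	s := seq{blocks: [][]byte{[]byte("hello"), []byte("world")}}
	fmt.Println(s.dropFirst(7).numBytes()) // 3: "rld" remains
}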
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package devtmpfs provides an implementation of /dev based on tmpfs,
// analogous to Linux's devtmpfs.
package devtmpfs

import (
	"fmt"
	"path"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
)

// Name is the default filesystem name.
const Name = "devtmpfs"

// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
type FilesystemType struct {
	initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
	initErr  error

	// fs is the tmpfs filesystem that backs all mounts of this FilesystemType.
	// root is fs' root. fs and root are immutable.
	fs   *vfs.Filesystem
	root *vfs.Dentry
}

// Name implements vfs.FilesystemType.Name.
func (*FilesystemType) Name() string {
	return Name
}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
	fst.initOnce.Do(func() {
		fs, root, err := tmpfs.FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "" /* source */, vfs.GetFilesystemOptions{
			Data: "mode=0755", // opts from drivers/base/devtmpfs.c:devtmpfs_init()
		})
		if err != nil {
			fst.initErr = err
			return
		}
		fst.fs = fs
		fst.root = root
	})
	if fst.initErr != nil {
		return nil, nil, fst.initErr
	}

	fst.fs.IncRef()
	fst.root.IncRef()
	return fst.fs, fst.root, nil
}

// Release implements vfs.FilesystemType.Release.
func (fst *FilesystemType) Release(ctx context.Context) {
	if fst.fs != nil {
		// Release the original reference obtained when creating the
		// filesystem.
		fst.root.DecRef(ctx)
		fst.fs.DecRef(ctx)
	}
}

// Accessor allows devices to create device special files in devtmpfs.
type Accessor struct { vfsObj *vfs.VirtualFilesystem mntns *vfs.MountNamespace root vfs.VirtualDentry creds *auth.Credentials } // NewAccessor returns an Accessor that supports creation of device special // files in the devtmpfs instance registered with name fsTypeName in vfsObj. func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) { mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.MountOptions{}) if err != nil { return nil, err } // Pass a reference on root to the Accessor. root := mntns.Root() root.IncRef() return &Accessor{ vfsObj: vfsObj, mntns: mntns, root: root, creds: creds, }, nil } // Release must be called when a is no longer in use. func (a *Accessor) Release(ctx context.Context) { a.root.DecRef(ctx) a.mntns.DecRef(ctx) } // accessorContext implements context.Context by extending an existing // context.Context with an Accessor's values for VFS-relevant state. type accessorContext struct { context.Context a *Accessor } func (a *Accessor) wrapContext(ctx context.Context) *accessorContext { return &accessorContext{ Context: ctx, a: a, } } // Value implements context.Context.Value. func (ac *accessorContext) Value(key interface{}) interface{} { switch key { case vfs.CtxMountNamespace: ac.a.mntns.IncRef() return ac.a.mntns case vfs.CtxRoot: ac.a.root.IncRef() return ac.a.root default: return ac.Context.Value(key) } } func (a *Accessor) pathOperationAt(pathname string) *vfs.PathOperation { return &vfs.PathOperation{ Root: a.root, Start: a.root, Path: fspath.Parse(pathname), } } // CreateDeviceFile creates a device special file at the given pathname in the // devtmpfs instance accessed by the Accessor. func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind vfs.DeviceKind, major, minor uint32, perms uint16) error { actx := a.wrapContext(ctx) mode := (linux.FileMode)(perms) switch kind { case vfs.BlockDevice: mode |= linux.S_IFBLK case vfs.CharDevice: mode |= linux.S_IFCHR default: panic(fmt.Sprintf("invalid vfs.DeviceKind: %v", kind)) } // Create any parent directories. See // devtmpfs.c:handle_create()=>path_create(). parent := path.Dir(pathname) if err := a.vfsObj.MkdirAllAt(ctx, parent, a.root, a.creds, &vfs.MkdirOptions{ Mode: 0755, }); err != nil { return fmt.Errorf("failed to create device parent directory %q: %v", parent, err) } // NOTE: Linux's devtmpfs refuses to automatically delete files it didn't // create, which it recognizes by storing a pointer to the kdevtmpfs struct // thread in struct inode::i_private. Accessor doesn't yet support deletion // of files at all, and probably won't as long as we don't need to support // kernel modules, so this is moot for now. return a.vfsObj.MknodAt(actx, a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{ Mode: mode, DevMajor: major, DevMinor: minor, }) } // UserspaceInit creates symbolic links and mount points in the devtmpfs // instance accessed by the Accessor that are created by userspace in Linux. It // does not create mounts. func (a *Accessor) UserspaceInit(ctx context.Context) error { actx := a.wrapContext(ctx) // Initialize symlinks. for _, symlink := range []struct { source string target string }{ // systemd: src/shared/dev-setup.c:dev_setup() {source: "fd", target: "/proc/self/fd"}, {source: "stdin", target: "/proc/self/fd/0"}, {source: "stdout", target: "/proc/self/fd/1"}, {source: "stderr", target: "/proc/self/fd/2"}, // /proc/kcore is not implemented. 
// Linux implements /dev/ptmx as a device node, but advises // container implementations to create /dev/ptmx as a symlink // to pts/ptmx (Documentation/filesystems/devpts.txt). Systemd // follows this advice (src/nspawn/nspawn.c:setup_pts()), while // LXC tries to create a bind mount and falls back to a symlink // (src/lxc/conf.c:lxc_setup_devpts()). {source: "ptmx", target: "pts/ptmx"}, } { if err := a.vfsObj.SymlinkAt(actx, a.creds, a.pathOperationAt(symlink.source), symlink.target); err != nil { return fmt.Errorf("failed to create symlink %q => %q: %v", symlink.source, symlink.target, err) } } // systemd: src/core/mount-setup.c:mount_table for _, dir := range []string{ "shm", "pts", } { if err := a.vfsObj.MkdirAt(actx, a.creds, a.pathOperationAt(dir), &vfs.MkdirOptions{ // systemd: src/core/mount-setup.c:mount_one() Mode: 0755, }); err != nil { return fmt.Errorf("failed to create directory %q: %v", dir, err) } } return nil }
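// Editorial note: the snippet below is an illustrative sketch added for this
// edition; it is not part of the gVisor sources. It shows typical usage of
// the Accessor API defined above; the caller is assumed to already hold a
// *vfs.VirtualFilesystem and root credentials, and the package name "demo"
// is invented.
package demo

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// registerNullDevice creates /dev/null (char 1:3, mode 0666) in the devtmpfs
// instance registered under the "devtmpfs" filesystem type.
func registerNullDevice(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) error {
	a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
	if err != nil {
		return err
	}
	// Release drops the Accessor's references on its mount namespace and
	// root once the device files have been created.
	defer a.Release(ctx)
	return a.CreateDeviceFile(ctx, "null", vfs.CharDevice, 1, 3, 0666)
}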
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package auth

import (
	"math"
)

// UID is a user ID in an unspecified user namespace.
//
// +marshal
type UID uint32

// GID is a group ID in an unspecified user namespace.
//
// +marshal slice:GIDSlice
type GID uint32

// In the root user namespace, user/group IDs have a 1-to-1 relationship with
// the users/groups they represent. In other user namespaces, this is not the
// case; for example, two different unmapped users may both "have" the overflow
// UID. This means that it is generally only valid to compare user and group
// IDs in the root user namespace. We assign distinct types, KUID/KGID, to such
// IDs to emphasize this distinction. ("k" is for "key", as in "unique key".
// Linux also uses the prefix "k", but I think they mean "kernel".)

// KUID is a user ID in the root user namespace.
type KUID uint32

// KGID is a group ID in the root user namespace.
type KGID uint32

const (
	// NoID is uint32(-1). -1 is consistently used as a special value, in Linux
	// and by extension in the auth package, to mean "no ID":
	//
	// - ID mapping returns -1 if the ID is not mapped.
	//
	// - Most set*id() syscalls accept -1 to mean "do not change this ID".
	NoID = math.MaxUint32

	// OverflowUID is the default value of /proc/sys/kernel/overflowuid. The
	// "overflow UID" is usually [1] used when translating a user ID between
	// namespaces fails because the ID is not mapped. (We don't implement this
	// file, so the overflow UID is constant.)
	//
	// [1] "There is one notable case where unmapped user and group IDs are not
	// converted to the corresponding overflow ID value. When viewing a uid_map
	// or gid_map file in which there is no mapping for the second field, that
	// field is displayed as 4294967295 (-1 as an unsigned integer);" -
	// user_namespaces(7)
	OverflowUID = UID(65534)

	// OverflowGID is the group equivalent to OverflowUID.
	OverflowGID = GID(65534)

	// NobodyKUID is the user ID usually reserved for the least privileged user
	// "nobody".
	NobodyKUID = KUID(65534)

	// NobodyKGID is the group equivalent to NobodyKUID.
	NobodyKGID = KGID(65534)

	// RootKUID is the user ID usually used for the most privileged user "root".
	RootKUID = KUID(0)

	// RootKGID is the group equivalent to RootKUID.
	RootKGID = KGID(0)

	// RootUID is the root user.
	RootUID = UID(0)

	// RootGID is the root group.
	RootGID = GID(0)
)

// Ok returns true if uid is not -1.
func (uid UID) Ok() bool {
	return uid != NoID
}

// Ok returns true if gid is not -1.
func (gid GID) Ok() bool { return gid != NoID } // Ok returns true if kuid is not -1. func (kuid KUID) Ok() bool { return kuid != NoID } // Ok returns true if kgid is not -1. func (kgid KGID) Ok() bool { return kgid != NoID } // OrOverflow returns uid if it is valid and the overflow UID otherwise. func (uid UID) OrOverflow() UID { if uid.Ok() { return uid } return OverflowUID } // OrOverflow returns gid if it is valid and the overflow GID otherwise. func (gid GID) OrOverflow() GID { if gid.Ok() { return gid } return OverflowGID } // In translates kuid into user namespace ns. If kuid is not mapped in ns, In // returns NoID. func (kuid KUID) In(ns *UserNamespace) UID { return ns.MapFromKUID(kuid) } // In translates kgid into user namespace ns. If kgid is not mapped in ns, In // returns NoID. func (kgid KGID) In(ns *UserNamespace) GID { return ns.MapFromKGID(kgid) }
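// Editorial note: the snippet below is an illustrative sketch added for this
// edition; it is not part of the gVisor sources. It reimplements the
// NoID/OrOverflow semantics defined above so the behavior can be run
// standalone: unmapped IDs collapse to the "nobody" overflow UID.
package main

import (
	"fmt"
	"math"
)

type UID uint32

const (
	NoID        = math.MaxUint32
	OverflowUID = UID(65534)
)

// Ok returns true if uid is not -1, as in the auth package above.
func (uid UID) Ok() bool { return uid != NoID }

// OrOverflow returns uid if it is valid and the overflow UID otherwise.
func (uid UID) OrOverflow() UID {
	if uid.Ok() {
		return uid
	}
	return OverflowUID
}

func main() {
	var unmapped UID = NoID
	fmt.Println(unmapped.OrOverflow())  // 65534: the "nobody" overflow UID
	fmt.Println(UID(1000).OrOverflow()) // 1000: mapped IDs pass through
}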
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mm import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/syserror" ) // Preconditions: // * mm.mappingMu must be locked for writing. // * opts must be valid as defined by the checks in MMap. func (mm *MemoryManager) createVMALocked(ctx context.Context, opts memmap.MMapOpts) (vmaIterator, hostarch.AddrRange, error) { if opts.MaxPerms != opts.MaxPerms.Effective() { panic(fmt.Sprintf("Non-effective MaxPerms %s cannot be enforced", opts.MaxPerms)) } // Find a usable range. addr, err := mm.findAvailableLocked(opts.Length, findAvailableOpts{ Addr: opts.Addr, Fixed: opts.Fixed, Unmap: opts.Unmap, Map32Bit: opts.Map32Bit, }) if err != nil { // Can't force without opts.Unmap and opts.Fixed. if opts.Force && opts.Unmap && opts.Fixed { addr = opts.Addr } else { return vmaIterator{}, hostarch.AddrRange{}, err } } ar, _ := addr.ToRange(opts.Length) // Check against RLIMIT_AS. newUsageAS := mm.usageAS + opts.Length if opts.Unmap { newUsageAS -= uint64(mm.vmas.SpanRange(ar)) } if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { return vmaIterator{}, hostarch.AddrRange{}, syserror.ENOMEM } if opts.MLockMode != memmap.MLockNone { // Check against RLIMIT_MEMLOCK. if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if mlockLimit == 0 { return vmaIterator{}, hostarch.AddrRange{}, linuxerr.EPERM } newLockedAS := mm.lockedAS + opts.Length if opts.Unmap { newLockedAS -= mm.mlockedBytesRangeLocked(ar) } if newLockedAS > mlockLimit { return vmaIterator{}, hostarch.AddrRange{}, linuxerr.EAGAIN } } } // Remove overwritten mappings. This ordering is consistent with Linux: // compare Linux's mm/mmap.c:mmap_region() => do_munmap(), // file->f_op->mmap(). var vgap vmaGapIterator if opts.Unmap { vgap = mm.unmapLocked(ctx, ar) } else { vgap = mm.vmas.FindGap(ar.Start) } // Inform the Mappable, if any, of the new mapping. if opts.Mappable != nil { // The expression for writable is vma.canWriteMappableLocked(), but we // don't yet have a vma. if err := opts.Mappable.AddMapping(ctx, mm, ar, opts.Offset, !opts.Private && opts.MaxPerms.Write); err != nil { return vmaIterator{}, hostarch.AddrRange{}, err } } // Take a reference on opts.MappingIdentity before inserting the vma since // vma merging can drop the reference. if opts.MappingIdentity != nil { opts.MappingIdentity.IncRef() } // Finally insert the vma. v := vma{ mappable: opts.Mappable, off: opts.Offset, realPerms: opts.Perms, effectivePerms: opts.Perms.Effective(), maxPerms: opts.MaxPerms, private: opts.Private, growsDown: opts.GrowsDown, mlockMode: opts.MLockMode, numaPolicy: linux.MPOL_DEFAULT, id: opts.MappingIdentity, hint: opts.Hint, } vseg := mm.vmas.Insert(vgap, ar, v) mm.usageAS += opts.Length if v.isPrivateDataLocked() { mm.dataAS += opts.Length } if opts.MLockMode != memmap.MLockNone { mm.lockedAS += opts.Length } return vseg, ar, nil } type findAvailableOpts struct { // These fields are equivalent to those in memmap.MMapOpts, except that: // // - Addr must be page-aligned. // // - Unmap allows existing guard pages in the returned range. 
Addr hostarch.Addr Fixed bool Unmap bool Map32Bit bool } // map32Start/End are the bounds to which MAP_32BIT mappings are constrained, // and are equivalent to Linux's MAP32_BASE and MAP32_MAX respectively. const ( map32Start = 0x40000000 map32End = 0x80000000 ) // findAvailableLocked finds an allocatable range. // // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) findAvailableLocked(length uint64, opts findAvailableOpts) (hostarch.Addr, error) { if opts.Fixed { opts.Map32Bit = false } allowedAR := mm.applicationAddrRange() if opts.Map32Bit { allowedAR = allowedAR.Intersect(hostarch.AddrRange{map32Start, map32End}) } // Does the provided suggestion work? if ar, ok := opts.Addr.ToRange(length); ok { if allowedAR.IsSupersetOf(ar) { if opts.Unmap { return ar.Start, nil } // Check for the presence of an existing vma or guard page. if vgap := mm.vmas.FindGap(ar.Start); vgap.Ok() && vgap.availableRange().IsSupersetOf(ar) { return ar.Start, nil } } } // Fixed mappings accept only the requested address. if opts.Fixed { return 0, syserror.ENOMEM } // Prefer hugepage alignment if a hugepage or more is requested. alignment := uint64(hostarch.PageSize) if length >= hostarch.HugePageSize { alignment = hostarch.HugePageSize } if opts.Map32Bit { return mm.findLowestAvailableLocked(length, alignment, allowedAR) } if mm.layout.DefaultDirection == arch.MmapBottomUp { return mm.findLowestAvailableLocked(length, alignment, hostarch.AddrRange{mm.layout.BottomUpBase, mm.layout.MaxAddr}) } return mm.findHighestAvailableLocked(length, alignment, hostarch.AddrRange{mm.layout.MinAddr, mm.layout.TopDownBase}) } func (mm *MemoryManager) applicationAddrRange() hostarch.AddrRange { return hostarch.AddrRange{mm.layout.MinAddr, mm.layout.MaxAddr} } // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) findLowestAvailableLocked(length, alignment uint64, bounds hostarch.AddrRange) (hostarch.Addr, error) { for gap := mm.vmas.LowerBoundGap(bounds.Start); gap.Ok() && gap.Start() < bounds.End; gap = gap.NextLargeEnoughGap(hostarch.Addr(length)) { if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { // Can we shift up to match the alignment? if offset := uint64(gr.Start) % alignment; offset != 0 { if uint64(gr.Length()) >= length+alignment-offset { // Yes, we're aligned. return gr.Start + hostarch.Addr(alignment-offset), nil } } // Either aligned perfectly, or can't align it. return gr.Start, nil } } return 0, syserror.ENOMEM } // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) findHighestAvailableLocked(length, alignment uint64, bounds hostarch.AddrRange) (hostarch.Addr, error) { for gap := mm.vmas.UpperBoundGap(bounds.End); gap.Ok() && gap.End() > bounds.Start; gap = gap.PrevLargeEnoughGap(hostarch.Addr(length)) { if gr := gap.availableRange().Intersect(bounds); uint64(gr.Length()) >= length { // Can we shift down to match the alignment? start := gr.End - hostarch.Addr(length) if offset := uint64(start) % alignment; offset != 0 { if gr.Start <= start-hostarch.Addr(offset) { // Yes, we're aligned. return start - hostarch.Addr(offset), nil } } // Either aligned perfectly, or can't align it. return start, nil } } return 0, syserror.ENOMEM } // Preconditions: mm.mappingMu must be locked. 
func (mm *MemoryManager) mlockedBytesRangeLocked(ar hostarch.AddrRange) uint64 {
	var total uint64
	for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() {
		if vseg.ValuePtr().mlockMode != memmap.MLockNone {
			total += uint64(vseg.Range().Intersect(ar).Length())
		}
	}
	return total
}

// getVMAsLocked ensures that vmas exist for all addresses in ar, and support
// access of type (at, ignorePermissions). It returns:
//
// - An iterator to the vma containing ar.Start. If no vma contains ar.Start,
// the iterator is unspecified.
//
// - An iterator to the gap after the last vma containing an address in ar. If
// vmas exist for no addresses in ar, the iterator is to a gap that begins
// before ar.Start.
//
// - An error that is non-nil if vmas exist for only a subset of ar.
//
// Preconditions:
// * mm.mappingMu must be locked for reading; it may be temporarily unlocked.
// * ar.Length() != 0.
func (mm *MemoryManager) getVMAsLocked(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool) (vmaIterator, vmaGapIterator, error) {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	// Inline mm.vmas.LowerBoundSegment so that we have the preceding gap if
	// !vbegin.Ok().
	vbegin, vgap := mm.vmas.Find(ar.Start)
	if !vbegin.Ok() {
		vbegin = vgap.NextSegment()
		// vseg.Ok() is checked before entering the following loop.
	} else {
		vgap = vbegin.PrevGap()
	}

	addr := ar.Start
	vseg := vbegin
	for vseg.Ok() {
		// Loop invariants: vgap = vseg.PrevGap(); addr < vseg.End().
		vma := vseg.ValuePtr()
		if addr < vseg.Start() {
			// TODO(jamieliu): Implement vma.growsDown here.
			return vbegin, vgap, linuxerr.EFAULT
		}

		perms := vma.effectivePerms
		if ignorePermissions {
			perms = vma.maxPerms
		}
		if !perms.SupersetOf(at) {
			return vbegin, vgap, linuxerr.EPERM
		}

		addr = vseg.End()
		vgap = vseg.NextGap()
		if addr >= ar.End {
			return vbegin, vgap, nil
		}
		vseg = vgap.NextSegment()
	}

	// Ran out of vmas before ar.End.
	return vbegin, vgap, linuxerr.EFAULT
}

// getVecVMAsLocked ensures that vmas exist for all addresses in ars, and
// support access of type (at, ignorePermissions). It returns the subset of
// ars for which vmas exist. If this is not equal to ars, it returns a non-nil
// error explaining why.
//
// Preconditions: mm.mappingMu must be locked for reading; it may be
// temporarily unlocked.
//
// Postconditions: ars is not mutated.
func (mm *MemoryManager) getVecVMAsLocked(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool) (hostarch.AddrRangeSeq, error) {
	for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() {
		ar := arsit.Head()
		if ar.Length() == 0 {
			continue
		}
		if _, vend, err := mm.getVMAsLocked(ctx, ar, at, ignorePermissions); err != nil {
			return truncatedAddrRangeSeq(ars, arsit, vend.Start()), err
		}
	}
	return ars, nil
}

// vma extension will not shrink the number of unmapped bytes between the start
// of a growsDown vma and the end of its predecessor non-growsDown vma below
// guardBytes.
//
// guardBytes is equivalent to Linux's stack_guard_gap after upstream
// 1be7107fbe18 "mm: larger stack guard gap, between vmas".
const guardBytes = 256 * hostarch.PageSize

// unmapLocked unmaps all addresses in ar and returns the resulting gap in
// mm.vmas.
//
// Preconditions:
// * mm.mappingMu must be locked for writing.
// * ar.Length() != 0.
// * ar must be page-aligned.
func (mm *MemoryManager) unmapLocked(ctx context.Context, ar hostarch.AddrRange) vmaGapIterator { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } // AddressSpace mappings and pmas must be invalidated before // mm.removeVMAsLocked() => memmap.Mappable.RemoveMapping(). mm.Invalidate(ar, memmap.InvalidateOpts{InvalidatePrivate: true}) return mm.removeVMAsLocked(ctx, ar) } // removeVMAsLocked removes vmas for addresses in ar and returns the resulting // gap in mm.vmas. It does not remove pmas or AddressSpace mappings; clients // must do so before calling removeVMAsLocked. // // Preconditions: // * mm.mappingMu must be locked for writing. // * ar.Length() != 0. // * ar must be page-aligned. func (mm *MemoryManager) removeVMAsLocked(ctx context.Context, ar hostarch.AddrRange) vmaGapIterator { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } vseg, vgap := mm.vmas.Find(ar.Start) if vgap.Ok() { vseg = vgap.NextSegment() } for vseg.Ok() && vseg.Start() < ar.End { vseg = mm.vmas.Isolate(vseg, ar) vmaAR := vseg.Range() vma := vseg.ValuePtr() if vma.mappable != nil { vma.mappable.RemoveMapping(ctx, mm, vmaAR, vma.off, vma.canWriteMappableLocked()) } if vma.id != nil { vma.id.DecRef(ctx) } mm.usageAS -= uint64(vmaAR.Length()) if vma.isPrivateDataLocked() { mm.dataAS -= uint64(vmaAR.Length()) } if vma.mlockMode != memmap.MLockNone { mm.lockedAS -= uint64(vmaAR.Length()) } vgap = mm.vmas.Remove(vseg) vseg = vgap.NextSegment() } return vgap } // canWriteMappableLocked returns true if it is possible for vma.mappable to be // written to via this vma, i.e. if it is possible that // vma.mappable.Translate(at.Write=true) may be called as a result of this vma. // This includes via I/O with usermem.IOOpts.IgnorePermissions = true, such as // PTRACE_POKEDATA. // // canWriteMappableLocked is equivalent to Linux's VM_SHARED. // // Preconditions: mm.mappingMu must be locked. func (vma *vma) canWriteMappableLocked() bool { return !vma.private && vma.maxPerms.Write } // isPrivateDataLocked identify the data segments - private, writable, not stack // // Preconditions: mm.mappingMu must be locked. func (vma *vma) isPrivateDataLocked() bool { return vma.realPerms.Write && vma.private && !vma.growsDown } // vmaSetFunctions implements segment.Functions for vmaSet. 
type vmaSetFunctions struct{} func (vmaSetFunctions) MinKey() hostarch.Addr { return 0 } func (vmaSetFunctions) MaxKey() hostarch.Addr { return ^hostarch.Addr(0) } func (vmaSetFunctions) ClearValue(vma *vma) { vma.mappable = nil vma.id = nil vma.hint = "" } func (vmaSetFunctions) Merge(ar1 hostarch.AddrRange, vma1 vma, ar2 hostarch.AddrRange, vma2 vma) (vma, bool) { if vma1.mappable != vma2.mappable || (vma1.mappable != nil && vma1.off+uint64(ar1.Length()) != vma2.off) || vma1.realPerms != vma2.realPerms || vma1.maxPerms != vma2.maxPerms || vma1.private != vma2.private || vma1.growsDown != vma2.growsDown || vma1.mlockMode != vma2.mlockMode || vma1.numaPolicy != vma2.numaPolicy || vma1.numaNodemask != vma2.numaNodemask || vma1.dontfork != vma2.dontfork || vma1.id != vma2.id || vma1.hint != vma2.hint { return vma{}, false } if vma2.id != nil { vma2.id.DecRef(context.Background()) } return vma1, true } func (vmaSetFunctions) Split(ar hostarch.AddrRange, v vma, split hostarch.Addr) (vma, vma) { v2 := v if v2.mappable != nil { v2.off += uint64(split - ar.Start) } if v2.id != nil { v2.id.IncRef() } return v, v2 } // Preconditions: // * vseg.ValuePtr().mappable != nil. // * vseg.Range().Contains(addr). func (vseg vmaIterator) mappableOffsetAt(addr hostarch.Addr) uint64 { if checkInvariants { if !vseg.Ok() { panic("terminal vma iterator") } if vseg.ValuePtr().mappable == nil { panic("Mappable offset is meaningless for anonymous vma") } if !vseg.Range().Contains(addr) { panic(fmt.Sprintf("addr %v out of bounds %v", addr, vseg.Range())) } } vma := vseg.ValuePtr() vstart := vseg.Start() return vma.off + uint64(addr-vstart) } // Preconditions: vseg.ValuePtr().mappable != nil. func (vseg vmaIterator) mappableRange() memmap.MappableRange { return vseg.mappableRangeOf(vseg.Range()) } // Preconditions: // * vseg.ValuePtr().mappable != nil. // * vseg.Range().IsSupersetOf(ar). // * ar.Length() != 0. func (vseg vmaIterator) mappableRangeOf(ar hostarch.AddrRange) memmap.MappableRange { if checkInvariants { if !vseg.Ok() { panic("terminal vma iterator") } if vseg.ValuePtr().mappable == nil { panic("MappableRange is meaningless for anonymous vma") } if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !vseg.Range().IsSupersetOf(ar) { panic(fmt.Sprintf("ar %v out of bounds %v", ar, vseg.Range())) } } vma := vseg.ValuePtr() vstart := vseg.Start() return memmap.MappableRange{vma.off + uint64(ar.Start-vstart), vma.off + uint64(ar.End-vstart)} } // Preconditions: // * vseg.ValuePtr().mappable != nil. // * vseg.mappableRange().IsSupersetOf(mr). // * mr.Length() != 0. func (vseg vmaIterator) addrRangeOf(mr memmap.MappableRange) hostarch.AddrRange { if checkInvariants { if !vseg.Ok() { panic("terminal vma iterator") } if vseg.ValuePtr().mappable == nil { panic("MappableRange is meaningless for anonymous vma") } if !mr.WellFormed() || mr.Length() == 0 { panic(fmt.Sprintf("invalid mr: %v", mr)) } if !vseg.mappableRange().IsSupersetOf(mr) { panic(fmt.Sprintf("mr %v out of bounds %v", mr, vseg.mappableRange())) } } vma := vseg.ValuePtr() vstart := vseg.Start() return hostarch.AddrRange{vstart + hostarch.Addr(mr.Start-vma.off), vstart + hostarch.Addr(mr.End-vma.off)} } // seekNextLowerBound returns mm.vmas.LowerBoundSegment(addr), but does so by // scanning linearly forward from vseg. // // Preconditions: // * mm.mappingMu must be locked. // * addr >= vseg.Start(). 
func (vseg vmaIterator) seekNextLowerBound(addr hostarch.Addr) vmaIterator {
	if checkInvariants {
		if !vseg.Ok() {
			panic("terminal vma iterator")
		}
		if addr < vseg.Start() {
			panic(fmt.Sprintf("can't seek forward to %#x from %#x", addr, vseg.Start()))
		}
	}
	for vseg.Ok() && addr >= vseg.End() {
		vseg = vseg.NextSegment()
	}
	return vseg
}

// availableRange returns the subset of vgap.Range() in which new vmas may be
// created without MMapOpts.Unmap == true.
func (vgap vmaGapIterator) availableRange() hostarch.AddrRange {
	ar := vgap.Range()
	next := vgap.NextSegment()
	if !next.Ok() || !next.ValuePtr().growsDown {
		return ar
	}
	// Exclude guard pages.
	if ar.Length() < guardBytes {
		return hostarch.AddrRange{ar.Start, ar.Start}
	}
	ar.End -= guardBytes
	return ar
}
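The guard-page arithmetic above is easy to check in isolation. Below is a minimal, self-contained sketch of availableRange's behavior for a gap followed by a growsDown vma; the names and types (plain uint64 bounds in place of vmaGapIterator and hostarch.AddrRange) are illustrative stand-ins, not the gVisor API.

package main

import "fmt"

const pageSize = 4096
const guardBytes = 256 * pageSize // mirrors the constant above: 1MiB on 4KiB pages

// available mirrors vmaGapIterator.availableRange for a gap [start, end):
// if the next vma grows down, the top guardBytes of the gap are reserved.
func available(start, end uint64, nextGrowsDown bool) (uint64, uint64) {
	if !nextGrowsDown {
		return start, end
	}
	if end-start < guardBytes {
		return start, start // the gap is all guard; nothing is allocatable
	}
	return start, end - guardBytes
}

func main() {
	// A 2MiB gap below a stack: only the bottom 1MiB is available.
	s, e := available(0x7f0000000000, 0x7f0000200000, true)
	fmt.Printf("%#x-%#x\n", s, e)
}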
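The alignment shift in findLowestAvailableLocked earlier in this file is similarly self-contained arithmetic. Here is a hedged sketch, with plain uint64s standing in for hostarch.Addr and pickStart as a hypothetical helper, not gVisor code:

package main

import "fmt"

// pickStart mirrors the selection in findLowestAvailableLocked: given a gap
// [gapStart, gapStart+gapLen) known to hold at least length bytes, round the
// start up to alignment if the gap still fits the mapping after the shift;
// otherwise fall back to the unaligned gap start.
func pickStart(gapStart, gapLen, length, alignment uint64) uint64 {
	if offset := gapStart % alignment; offset != 0 && gapLen >= length+alignment-offset {
		return gapStart + alignment - offset
	}
	return gapStart
}

func main() {
	// A 3MiB gap at 0x5ff000 with a 2MiB (hugepage-sized) request: the start
	// is rounded up to the next 2MiB boundary.
	fmt.Printf("%#x\n", pickStart(0x5ff000, 3<<20, 2<<20, 2<<20)) // 0x600000
}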
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"encoding/binary"
	"math/rand"

	"gvisor.dev/gvisor/pkg/sleep"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// epQueue is a queue of endpoints.
type epQueue struct {
	mu   sync.Mutex
	list endpointList
}

// enqueue adds e to the queue if the endpoint is not already on the queue.
func (q *epQueue) enqueue(e *endpoint) {
	q.mu.Lock()
	if e.pendingProcessing {
		q.mu.Unlock()
		return
	}
	q.list.PushBack(e)
	e.pendingProcessing = true
	q.mu.Unlock()
}

// dequeue removes and returns the first element from the queue if available,
// returns nil otherwise.
func (q *epQueue) dequeue() *endpoint {
	q.mu.Lock()
	if e := q.list.Front(); e != nil {
		q.list.Remove(e)
		e.pendingProcessing = false
		q.mu.Unlock()
		return e
	}
	q.mu.Unlock()
	return nil
}

// empty returns true if the queue is empty, false otherwise.
func (q *epQueue) empty() bool {
	q.mu.Lock()
	v := q.list.Empty()
	q.mu.Unlock()
	return v
}

// processor is responsible for processing packets queued to a tcp endpoint.
type processor struct {
	epQ              epQueue
	sleeper          sleep.Sleeper
	newEndpointWaker sleep.Waker
	closeWaker       sleep.Waker
}

func (p *processor) close() {
	p.closeWaker.Assert()
}

func (p *processor) queueEndpoint(ep *endpoint) {
	// Queue an endpoint for processing by the processor goroutine.
	p.epQ.enqueue(ep)
	p.newEndpointWaker.Assert()
}

const (
	newEndpointWaker = 1
	closeWaker       = 2
)

func (p *processor) start(wg *sync.WaitGroup) {
	defer wg.Done()
	defer p.sleeper.Done()

	for {
		if id, _ := p.sleeper.Fetch(true); id == closeWaker {
			break
		}
		for {
			ep := p.epQ.dequeue()
			if ep == nil {
				break
			}
			if ep.segmentQueue.empty() {
				continue
			}

			// If the socket has transitioned out of the connected state then just
			// let the worker handle the packet.
			//
			// NOTE: We read this outside of e.mu lock which means that by the time
			// we get to handleSegments the endpoint may not be in ESTABLISHED. But
			// this should be fine as all normal shutdown states are handled by
			// handleSegments and if the endpoint moves to a CLOSED/ERROR state
			// then handleSegments is a noop.
			if ep.EndpointState() == StateEstablished && ep.mu.TryLock() {
				// If the endpoint is in a connected state then we do direct delivery
				// to ensure low latency and avoid scheduler interactions.
				switch err := ep.handleSegmentsLocked(true /* fastPath */); {
				case err != nil:
					// Send any active resets if required.
					ep.resetConnectionLocked(err)
					fallthrough
				case ep.EndpointState() == StateClose:
					ep.notifyProtocolGoroutine(notifyTickleWorker)
				case !ep.segmentQueue.empty():
					p.epQ.enqueue(ep)
				}
				ep.mu.Unlock() // +checklocksforce
			} else {
				ep.newSegmentWaker.Assert()
			}
		}
	}
}

// dispatcher manages a pool of TCP endpoint processors which are responsible
// for the processing of inbound segments. This fixed pool of processor
// goroutines does full TCP processing. The processor is selected based on the
// hash of the endpoint id to ensure that delivery for the same endpoint
// happens in-order.
type dispatcher struct {
	processors []processor

	// seed is a random secret for a jenkins hash.
	seed uint32
	wg   sync.WaitGroup
}

func (d *dispatcher) init(rng *rand.Rand, nProcessors int) {
	d.close()
	d.wait()
	d.processors = make([]processor, nProcessors)
	d.seed = rng.Uint32()
	for i := range d.processors {
		p := &d.processors[i]
		p.sleeper.AddWaker(&p.newEndpointWaker, newEndpointWaker)
		p.sleeper.AddWaker(&p.closeWaker, closeWaker)
		d.wg.Add(1)
		// NB: sleeper-waker registration must happen synchronously to avoid races
		// with `close`. It's possible to pull all this logic into `start`, but
		// that results in a heap-allocated function literal.
		go p.start(&d.wg)
	}
}

func (d *dispatcher) close() {
	for i := range d.processors {
		d.processors[i].close()
	}
}

func (d *dispatcher) wait() {
	d.wg.Wait()
}

func (d *dispatcher) queuePacket(stackEP stack.TransportEndpoint, id stack.TransportEndpointID, clock tcpip.Clock, pkt *stack.PacketBuffer) {
	ep := stackEP.(*endpoint)

	s := newIncomingSegment(id, clock, pkt)
	if !s.parse(pkt.RXTransportChecksumValidated) {
		ep.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
		ep.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
		s.decRef()
		return
	}

	if !s.csumValid {
		ep.stack.Stats().TCP.ChecksumErrors.Increment()
		ep.stats.ReceiveErrors.ChecksumErrors.Increment()
		s.decRef()
		return
	}

	ep.stack.Stats().TCP.ValidSegmentsReceived.Increment()
	ep.stats.SegmentsReceived.Increment()
	if (s.flags & header.TCPFlagRst) != 0 {
		ep.stack.Stats().TCP.ResetsReceived.Increment()
	}

	if !ep.enqueueSegment(s) {
		s.decRef()
		return
	}

	// For sockets not in established state let the worker goroutine
	// handle the packets.
	if ep.EndpointState() != StateEstablished {
		ep.newSegmentWaker.Assert()
		return
	}

	d.selectProcessor(id).queueEndpoint(ep)
}

func (d *dispatcher) selectProcessor(id stack.TransportEndpointID) *processor {
	var payload [4]byte
	binary.LittleEndian.PutUint16(payload[0:], id.LocalPort)
	binary.LittleEndian.PutUint16(payload[2:], id.RemotePort)

	h := jenkins.Sum32(d.seed)
	h.Write(payload[:])
	h.Write([]byte(id.LocalAddress))
	h.Write([]byte(id.RemoteAddress))

	return &d.processors[h.Sum32()%uint32(len(d.processors))]
}
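The in-order delivery guarantee above rests entirely on selectProcessor being a pure function of the connection 4-tuple. Below is a standalone sketch of the same scheme; hash/fnv from the standard library stands in for the package-internal jenkins hash, and the seed mixing differs slightly from jenkins.Sum32, so treat it as an illustration rather than the netstack implementation.

package main

import (
	"encoding/binary"
	"fmt"
	"hash/fnv"
)

// selectIndex hashes the connection 4-tuple plus a per-dispatcher seed and
// reduces it modulo the processor count, so every segment belonging to one
// connection lands on the same processor goroutine.
func selectIndex(seed uint32, localPort, remotePort uint16, localAddr, remoteAddr []byte, nProcessors int) int {
	var payload [8]byte
	binary.LittleEndian.PutUint32(payload[0:], seed)
	binary.LittleEndian.PutUint16(payload[4:], localPort)
	binary.LittleEndian.PutUint16(payload[6:], remotePort)

	h := fnv.New32a()
	h.Write(payload[:])
	h.Write(localAddr)
	h.Write(remoteAddr)
	return int(h.Sum32() % uint32(nProcessors))
}

func main() {
	a := []byte{10, 0, 0, 1}
	b := []byte{10, 0, 0, 2}
	// The same 4-tuple and seed always select the same processor.
	fmt.Println(selectIndex(42, 443, 51514, a, b, 8) == selectIndex(42, 443, 51514, a, b, 8))
}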
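The other load-bearing detail in the dispatcher above is the pendingProcessing flag in epQueue: an endpoint appears on a processor's queue at most once no matter how many segments arrive for it. A toy version of that idempotent queue, using a map where epQueue uses an intrusive list and a flag on the endpoint itself (names here are illustrative only):

package main

import (
	"fmt"
	"sync"
)

// dedupQueue sketches epQueue: push is a no-op while an item is already
// queued, so a flood of enqueue calls for one endpoint costs one entry.
type dedupQueue struct {
	mu      sync.Mutex
	items   []int
	pending map[int]bool
}

func newDedupQueue() *dedupQueue { return &dedupQueue{pending: make(map[int]bool)} }

func (q *dedupQueue) push(id int) {
	q.mu.Lock()
	defer q.mu.Unlock()
	if q.pending[id] { // epQueue checks e.pendingProcessing here.
		return
	}
	q.items = append(q.items, id)
	q.pending[id] = true
}

func (q *dedupQueue) pop() (int, bool) {
	q.mu.Lock()
	defer q.mu.Unlock()
	if len(q.items) == 0 {
		return 0, false
	}
	id := q.items[0]
	q.items = q.items[1:]
	q.pending[id] = false // cleared on dequeue, as in epQueue.dequeue.
	return id, true
}

func main() {
	q := newDedupQueue()
	q.push(7)
	q.push(7) // duplicate, dropped
	q.push(9)
	for id, ok := q.pop(); ok; id, ok = q.pop() {
		fmt.Println(id)
	}
}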
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "fmt" "math" "sort" "time" "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( // MinRTO is the minimum allowed value for the retransmit timeout. MinRTO = 200 * time.Millisecond // MaxRTO is the maximum allowed value for the retransmit timeout. MaxRTO = 120 * time.Second // InitialCwnd is the initial congestion window. InitialCwnd = 10 // nDupAckThreshold is the number of duplicate ACK's required // before fast-retransmit is entered. nDupAckThreshold = 3 // MaxRetries is the maximum number of probe retries sender does // before timing out the connection. // Linux default TCP_RETR2, net.ipv4.tcp_retries2. MaxRetries = 15 ) // congestionControl is an interface that must be implemented by any supported // congestion control algorithm. type congestionControl interface { // HandleLossDetected is invoked when the loss is detected by RACK or // sender.dupAckCount >= nDupAckThreshold just before entering fast // retransmit. HandleLossDetected() // HandleRTOExpired is invoked when the retransmit timer expires. HandleRTOExpired() // Update is invoked when processing inbound acks. It's passed the // number of packet's that were acked by the most recent cumulative // acknowledgement. Update(packetsAcked int) // PostRecovery is invoked when the sender is exiting a fast retransmit/ // recovery phase. This provides congestion control algorithms a way // to adjust their state when exiting recovery. PostRecovery() } // lossRecovery is an interface that must be implemented by any supported // loss recovery algorithm. type lossRecovery interface { // DoRecovery is invoked when loss is detected and segments need // to be retransmitted. The cumulative or selective ACK is passed along // with the flag which identifies whether the connection entered fast // retransmit with this ACK and to retransmit the first unacknowledged // segment. DoRecovery(rcvdSeg *segment, fastRetransmit bool) } // sender holds the state necessary to send TCP segments. // // +stateify savable type sender struct { stack.TCPSenderState ep *endpoint // lr is the loss recovery algorithm used by the sender. lr lossRecovery // firstRetransmittedSegXmitTime is the original transmit time of // the first segment that was retransmitted due to RTO expiration. firstRetransmittedSegXmitTime tcpip.MonotonicTime // zeroWindowProbing is set if the sender is currently probing // for zero receive window. zeroWindowProbing bool `state:"nosave"` // unackZeroWindowProbes is the number of unacknowledged zero // window probes. unackZeroWindowProbes uint32 `state:"nosave"` writeNext *segment writeList segmentList resendTimer timer `state:"nosave"` resendWaker sleep.Waker `state:"nosave"` // rtt.TCPRTTState.SRTT and rtt.TCPRTTState.RTTVar are the "smoothed // round-trip time", and "round-trip time variation", as defined in // section 2 of RFC 6298. rtt rtt // minRTO is the minimum permitted value for sender.rto. 
minRTO time.Duration // maxRTO is the maximum permitted value for sender.rto. maxRTO time.Duration // maxRetries is the maximum permitted retransmissions. maxRetries uint32 // gso is set if generic segmentation offload is enabled. gso bool // state is the current state of congestion control for this endpoint. state tcpip.CongestionControlState // cc is the congestion control algorithm in use for this sender. cc congestionControl // rc has the fields needed for implementing RACK loss detection // algorithm. rc rackControl // reorderTimer is the timer used to retransmit the segments after RACK // detects them as lost. reorderTimer timer `state:"nosave"` reorderWaker sleep.Waker `state:"nosave"` // probeTimer and probeWaker are used to schedule PTO for RACK TLP algorithm. probeTimer timer `state:"nosave"` probeWaker sleep.Waker `state:"nosave"` } // rtt is a synchronization wrapper used to appease stateify. See the comment // in sender, where it is used. // // +stateify savable type rtt struct { sync.Mutex `state:"nosave"` stack.TCPRTTState } func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender { // The sender MUST reduce the TCP data length to account for any IP or // TCP options that it is including in the packets that it sends. // See: https://tools.ietf.org/html/rfc6691#section-2 maxPayloadSize := int(mss) - ep.maxOptionSize() s := &sender{ ep: ep, TCPSenderState: stack.TCPSenderState{ SndWnd: sndWnd, SndUna: iss + 1, SndNxt: iss + 1, RTTMeasureSeqNum: iss + 1, LastSendTime: ep.stack.Clock().NowMonotonic(), MaxPayloadSize: maxPayloadSize, MaxSentAck: irs + 1, FastRecovery: stack.TCPFastRecoveryState{ // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1. Last: iss, HighRxt: iss, RescueRxt: iss, }, RTO: 1 * time.Second, }, gso: ep.gso.Type != stack.GSONone, } if s.gso { s.ep.gso.MSS = uint16(maxPayloadSize) } s.cc = s.initCongestionControl(ep.cc) s.lr = s.initLossRecovery() s.rc.init(s, iss) // A negative sndWndScale means that no scaling is in use, otherwise we // store the scaling value. if sndWndScale > 0 { s.SndWndScale = uint8(sndWndScale) } s.resendTimer.init(s.ep.stack.Clock(), &s.resendWaker) s.reorderTimer.init(s.ep.stack.Clock(), &s.reorderWaker) s.probeTimer.init(s.ep.stack.Clock(), &s.probeWaker) s.updateMaxPayloadSize(int(ep.route.MTU()), 0) // Initialize SACK Scoreboard after updating max payload size as we use // the maxPayloadSize as the smss when determining if a segment is lost // etc. s.ep.scoreboard = NewSACKScoreboard(uint16(s.MaxPayloadSize), iss) // Get Stack wide config. var minRTO tcpip.TCPMinRTOOption if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil { panic(fmt.Sprintf("unable to get minRTO from stack: %s", err)) } s.minRTO = time.Duration(minRTO) var maxRTO tcpip.TCPMaxRTOOption if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil { panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err)) } s.maxRTO = time.Duration(maxRTO) var maxRetries tcpip.TCPMaxRetriesOption if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil { panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err)) } s.maxRetries = uint32(maxRetries) return s } // initCongestionControl initializes the specified congestion control module and // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to // their initial values. 
func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl { s.SndCwnd = InitialCwnd // Set sndSsthresh to the maximum int value, which depends on the // platform. s.Ssthresh = int(^uint(0) >> 1) switch congestionControlName { case ccCubic: return newCubicCC(s) case ccReno: fallthrough default: return newRenoCC(s) } } // initLossRecovery initiates the loss recovery algorithm for the sender. func (s *sender) initLossRecovery() lossRecovery { if s.ep.SACKPermitted { return newSACKRecovery(s) } return newRenoRecovery(s) } // updateMaxPayloadSize updates the maximum payload size based on the given // MTU. If this is in response to "packet too big" control packets (indicated // by the count argument), it also reduces the number of outstanding packets and // attempts to retransmit the first packet above the MTU size. func (s *sender) updateMaxPayloadSize(mtu, count int) { m := mtu - header.TCPMinimumSize m -= s.ep.maxOptionSize() // We don't adjust up for now. if m >= s.MaxPayloadSize { return } // Make sure we can transmit at least one byte. if m <= 0 { m = 1 } oldMSS := s.MaxPayloadSize s.MaxPayloadSize = m if s.gso { s.ep.gso.MSS = uint16(m) } if count == 0 { // updateMaxPayloadSize is also called when the sender is created. // and there is no data to send in such cases. Return immediately. return } // Update the scoreboard's smss to reflect the new lowered // maxPayloadSize. s.ep.scoreboard.smss = uint16(m) s.Outstanding -= count if s.Outstanding < 0 { s.Outstanding = 0 } // Rewind writeNext to the first segment exceeding the MTU. Do nothing // if it is already before such a packet. nextSeg := s.writeNext for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { if seg == s.writeNext { // We got to writeNext before we could find a segment // exceeding the MTU. break } if nextSeg == s.writeNext && seg.data.Size() > m { // We found a segment exceeding the MTU. Rewind // writeNext and try to retransmit it. nextSeg = seg } if s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { // Update sackedOut for new maximum payload size. s.SackedOut -= s.pCount(seg, oldMSS) s.SackedOut += s.pCount(seg, s.MaxPayloadSize) } } // Since we likely reduced the number of outstanding packets, we may be // ready to send some more. s.writeNext = nextSeg s.sendData() } // sendAck sends an ACK segment. func (s *sender) sendAck() { s.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, s.SndNxt) } // updateRTO updates the retransmit timeout when a new roud-trip time is // available. This is done in accordance with section 2 of RFC 6298. func (s *sender) updateRTO(rtt time.Duration) { s.rtt.Lock() if !s.rtt.TCPRTTState.SRTTInited { s.rtt.TCPRTTState.RTTVar = rtt / 2 s.rtt.TCPRTTState.SRTT = rtt s.rtt.TCPRTTState.SRTTInited = true } else { diff := s.rtt.TCPRTTState.SRTT - rtt if diff < 0 { diff = -diff } // Use RFC6298 standard algorithm to update TCPRTTState.RTTVar and TCPRTTState.SRTT when // no timestamps are available. if !s.ep.SendTSOk { s.rtt.TCPRTTState.RTTVar = (3*s.rtt.TCPRTTState.RTTVar + diff) / 4 s.rtt.TCPRTTState.SRTT = (7*s.rtt.TCPRTTState.SRTT + rtt) / 8 } else { // When we are taking RTT measurements of every ACK then // we need to use a modified method as specified in // https://tools.ietf.org/html/rfc7323#appendix-G if s.Outstanding == 0 { s.rtt.Unlock() return } // Netstack measures congestion window/inflight all in // terms of packets and not bytes. This is similar to // how linux also does cwnd and inflight. 
In practice // this approximation works as expected. expectedSamples := math.Ceil(float64(s.Outstanding) / 2) // alpha & beta values are the original values as recommended in // https://tools.ietf.org/html/rfc6298#section-2.3. const alpha = 0.125 const beta = 0.25 alphaPrime := alpha / expectedSamples betaPrime := beta / expectedSamples rttVar := (1-betaPrime)*s.rtt.TCPRTTState.RTTVar.Seconds() + betaPrime*diff.Seconds() srtt := (1-alphaPrime)*s.rtt.TCPRTTState.SRTT.Seconds() + alphaPrime*rtt.Seconds() s.rtt.TCPRTTState.RTTVar = time.Duration(rttVar * float64(time.Second)) s.rtt.TCPRTTState.SRTT = time.Duration(srtt * float64(time.Second)) } } s.RTO = s.rtt.TCPRTTState.SRTT + 4*s.rtt.TCPRTTState.RTTVar s.rtt.Unlock() if s.RTO < s.minRTO { s.RTO = s.minRTO } } // resendSegment resends the first unacknowledged segment. func (s *sender) resendSegment() { // Don't use any segments we already sent to measure RTT as they may // have been affected by packets being lost. s.RTTMeasureSeqNum = s.SndNxt // Resend the segment. if seg := s.writeList.Front(); seg != nil { if seg.data.Size() > s.MaxPayloadSize { s.splitSeg(seg, s.MaxPayloadSize) } // See: RFC 6675 section 5 Step 4.3 // // To prevent retransmission, set both the HighRXT and RescueRXT // to the highest sequence number in the retransmitted segment. s.FastRecovery.HighRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1 s.FastRecovery.RescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1 s.sendSegment(seg) s.ep.stack.Stats().TCP.FastRetransmit.Increment() s.ep.stats.SendErrors.FastRetransmit.Increment() // Run SetPipe() as per RFC 6675 section 5 Step 4.4 s.SetPipe() } } // retransmitTimerExpired is called when the retransmit timer expires, and // unacknowledged segments are assumed lost, and thus need to be resent. // Returns true if the connection is still usable, or false if the connection // is deemed lost. func (s *sender) retransmitTimerExpired() bool { // Check if the timer actually expired or if it's a spurious wake due // to a previously orphaned runtime timer. if !s.resendTimer.checkExpiration() { return true } // TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases // when writeList is empty. Remove this once we have a proper fix for this // issue. if s.writeList.Front() == nil { return true } s.ep.stack.Stats().TCP.Timeouts.Increment() s.ep.stats.SendErrors.Timeouts.Increment() // Set TLPRxtOut to false according to // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1. s.rc.tlpRxtOut = false // Give up if we've waited more than a minute since the last resend or // if a user time out is set and we have exceeded the user specified // timeout since the first retransmission. uto := s.ep.userTimeout if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) { // We store the original xmitTime of the segment that we are // about to retransmit as the retransmission time. This is // required as by the time the retransmitTimer has expired the // segment has already been sent and unacked for the RTO at the // time the segment was sent. s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime } elapsed := s.ep.stack.Clock().NowMonotonic().Sub(s.firstRetransmittedSegXmitTime) remaining := s.maxRTO if uto != 0 { // Cap to the user specified timeout if one is specified. remaining = uto - elapsed } // Always honor the user-timeout irrespective of whether the zero // window probes were acknowledged. 
// net/ipv4/tcp_timer.c::tcp_probe_timer() if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries { return false } // Set new timeout. The timer will be restarted by the call to sendData // below. s.RTO *= 2 // Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5 if s.RTO > s.maxRTO { s.RTO = s.maxRTO } // Cap RTO to remaining time. if s.RTO > remaining { s.RTO = remaining } // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4. // // Retransmit timeouts: // After a retransmit timeout, record the highest sequence number // transmitted in the variable recover, and exit the fast recovery // procedure if applicable. s.FastRecovery.Last = s.SndNxt - 1 if s.FastRecovery.Active { // We were attempting fast recovery but were not successful. // Leave the state. We don't need to update ssthresh because it // has already been updated when entered fast-recovery. s.leaveRecovery() } s.state = tcpip.RTORecovery s.cc.HandleRTOExpired() // Mark the next segment to be sent as the first unacknowledged one and // start sending again. Set the number of outstanding packets to 0 so // that we'll be able to retransmit. // // We'll keep on transmitting (or retransmitting) as we get acks for // the data we transmit. s.Outstanding = 0 // Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1 // // In order to avoid memory deadlocks, the TCP receiver is allowed to // discard data that has already been selectively acknowledged. As a // result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK // information gathered from a receiver upon a retransmission timeout // (RTO) "since the timeout might indicate that the data receiver has // reneged." Additionally, a TCP sender MUST "ignore prior SACK // information in determining which data to retransmit." // // NOTE: We take the stricter interpretation and just expunge all // information as we lack more rigorous checks to validate if the SACK // information is usable after an RTO. s.ep.scoreboard.Reset() s.writeNext = s.writeList.Front() // RFC 1122 4.2.2.17: Start sending zero window probes when we still see a // zero receive window after retransmission interval and we have data to // send. if s.zeroWindowProbing { s.sendZeroWindowProbe() // RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed // indefinitely. As long as the receiving TCP continues to send // acknowledgments in response to the probe segments, the sending TCP // MUST allow the connection to stay open. return true } seg := s.writeNext // RFC 1122 4.2.3.5: Close the connection when the number of // retransmissions for this segment is beyond a limit. if seg != nil && seg.xmitCount > s.maxRetries { return false } s.sendData() return true } // pCount returns the number of packets in the segment. Due to GSO, a segment // can be composed of multiple packets. func (s *sender) pCount(seg *segment, maxPayloadSize int) int { size := seg.data.Size() if size == 0 { return 1 } return (size-1)/maxPayloadSize + 1 } // splitSeg splits a given segment at the size specified and inserts the // remainder as a new segment after the current one in the write list. func (s *sender) splitSeg(seg *segment, size int) { if seg.data.Size() <= size { return } // Split this segment up. nSeg := seg.clone() nSeg.data.TrimFront(size) nSeg.sequenceNumber.UpdateForward(seqnum.Size(size)) s.writeList.InsertAfter(seg, nSeg) // The segment being split does not carry PUSH flag because it is // followed by the newly split segment. 
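// (Illustrative numbers, not from the original source: splitting a 3000-byte
// segment at size=1460 leaves 1460 bytes in seg, inserts a new 1540-byte
// segment after it, and advances the new segment's sequence number by 1460.)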
// RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered // segment (i.e., when there is no more queued data to be sent). // Linux removes PSH flag only when the segment is being split over MSS // and retains it when we are splitting the segment over lack of sender // window space. // ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point() // ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test() if seg.data.Size() > s.MaxPayloadSize { seg.flags ^= header.TCPFlagPsh } seg.data.CapLength(size) } // NextSeg implements the RFC6675 NextSeg() operation. // // NextSeg starts scanning the writeList starting from nextSegHint and returns // the hint to be passed on the next call to NextSeg. This is required to avoid // iterating the write list repeatedly when NextSeg is invoked in a loop during // recovery. The returned hint will be nil if there are no more segments that // can match rules defined by NextSeg operation in RFC6675. // // rescueRtx will be true only if nextSeg is a rescue retransmission as // described by Step 4) of the NextSeg algorithm. func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) { var s3 *segment var s4 *segment // Step 1. for seg := nextSegHint; seg != nil; seg = seg.Next() { // Stop iteration if we hit a segment that has never been // transmitted (i.e. either it has no assigned sequence number // or if it does have one, it's >= the next sequence number // to be sent [i.e. >= s.sndNxt]). if !s.isAssignedSequenceNumber(seg) || s.SndNxt.LessThanEq(seg.sequenceNumber) { hint = nil break } segSeq := seg.sequenceNumber if smss := s.ep.scoreboard.SMSS(); seg.data.Size() > int(smss) { s.splitSeg(seg, int(smss)) } // See RFC 6675 Section 4 // // 1. If there exists a smallest unSACKED sequence number // 'S2' that meets the following 3 criteria for determinig // loss, the sequence range of one segment of up to SMSS // octects starting with S2 MUST be returned. if !s.ep.scoreboard.IsSACKED(header.SACKBlock{Start: segSeq, End: segSeq.Add(1)}) { // NextSeg(): // // (1.a) S2 is greater than HighRxt // (1.b) S2 is less than highest octect covered by // any received SACK. if s.FastRecovery.HighRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) { // NextSeg(): // (1.c) IsLost(S2) returns true. if s.ep.scoreboard.IsLost(segSeq) { return seg, seg.Next(), false } // NextSeg(): // // (3): If the conditions for rules (1) and (2) // fail, but there exists an unSACKed sequence // number S3 that meets the criteria for // detecting loss given in steps 1.a and 1.b // above (specifically excluding (1.c)) then one // segment of upto SMSS octets starting with S3 // SHOULD be returned. if s3 == nil { s3 = seg hint = seg.Next() } } // NextSeg(): // // (4) If the conditions for (1), (2) and (3) fail, // but there exists outstanding unSACKED data, we // provide the opportunity for a single "rescue" // retransmission per entry into loss recovery. If // HighACK is greater than RescueRxt (or RescueRxt // is undefined), then one segment of upto SMSS // octects that MUST include the highest outstanding // unSACKed sequence number SHOULD be returned, and // RescueRxt set to RecoveryPoint. HighRxt MUST NOT // be updated. if s.FastRecovery.RescueRxt.LessThan(s.SndUna - 1) { if s4 != nil { if s4.sequenceNumber.LessThan(segSeq) { s4 = seg } } else { s4 = seg } } } } // If we got here then no segment matched step (1). 
// Step (2): "If no sequence number 'S2' per rule (1) // exists but there exists available unsent data and the // receiver's advertised window allows, the sequence // range of one segment of up to SMSS octets of // previously unsent data starting with sequence number // HighData+1 MUST be returned." for seg := s.writeNext; seg != nil; seg = seg.Next() { if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.SndNxt) { continue } // We do not split the segment here to <= smss as it has // potentially not been assigned a sequence number yet. return seg, nil, false } if s3 != nil { return s3, hint, false } return s4, nil, true } // maybeSendSegment tries to send the specified segment and either coalesces // other segments into this one or splits the specified segment based on the // lower of the specified limit value or the receivers window size specified by // end. func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) { // We abuse the flags field to determine if we have already // assigned a sequence number to this segment. if !s.isAssignedSequenceNumber(seg) { // Merge segments if allowed. if seg.data.Size() != 0 { available := int(s.SndNxt.Size(end)) if available > limit { available = limit } // nextTooBig indicates that the next segment was too // large to entirely fit in the current segment. It // would be possible to split the next segment and merge // the portion that fits, but unexpectedly splitting // segments can have user visible side-effects which can // break applications. For example, RFC 7766 section 8 // says that the length and data of a DNS response // should be sent in the same TCP segment to avoid // triggering bugs in poorly written DNS // implementations. var nextTooBig bool for nSeg := seg.Next(); nSeg != nil && nSeg.data.Size() != 0; nSeg = seg.Next() { if seg.data.Size()+nSeg.data.Size() > available { nextTooBig = true break } seg.merge(nSeg) s.writeList.Remove(nSeg) nSeg.decRef() } if !nextTooBig && seg.data.Size() < available { // Segment is not full. if s.Outstanding > 0 && s.ep.ops.GetDelayOption() { // Nagle's algorithm. From Wikipedia: // Nagle's algorithm works by // combining a number of small // outgoing messages and sending them // all at once. Specifically, as long // as there is a sent packet for which // the sender has received no // acknowledgment, the sender should // keep buffering its output until it // has a full packet's worth of // output, thus allowing output to be // sent all at once. return false } // With TCP_CORK, hold back until minimum of the available // send space and MSS. // TODO(gvisor.dev/issue/2833): Drain the held segments after a // timeout. if seg.data.Size() < s.MaxPayloadSize && s.ep.ops.GetCorkOption() { return false } } } // Assign flags. We don't do it above so that we can merge // additional data if Nagle holds the segment. seg.sequenceNumber = s.SndNxt seg.flags = header.TCPFlagAck | header.TCPFlagPsh } var segEnd seqnum.Value if seg.data.Size() == 0 { if s.writeList.Back() != seg { panic("FIN segments must be the final segment in the write list.") } seg.flags = header.TCPFlagAck | header.TCPFlagFin segEnd = seg.sequenceNumber.Add(1) // Update the state to reflect that we have now // queued a FIN. switch s.ep.EndpointState() { case StateCloseWait: s.ep.setEndpointState(StateLastAck) default: s.ep.setEndpointState(StateFinWait1) } } else { // We're sending a non-FIN segment. 
if seg.flags&header.TCPFlagFin != 0 { panic("Netstack queues FIN segments without data.") } if !seg.sequenceNumber.LessThan(end) { return false } available := int(seg.sequenceNumber.Size(end)) if available == 0 { return false } // If the whole segment or at least 1MSS sized segment cannot // be accomodated in the receiver advertized window, skip // splitting and sending of the segment. ref: // net/ipv4/tcp_output.c::tcp_snd_wnd_test() // // Linux checks this for all segment transmits not triggered by // a probe timer. On this condition, it defers the segment split // and transmit to a short probe timer. // // ref: include/net/tcp.h::tcp_check_probe_timer() // ref: net/ipv4/tcp_output.c::tcp_write_wakeup() // // Instead of defining a new transmit timer, we attempt to split // the segment right here if there are no pending segments. If // there are pending segments, segment transmits are deferred to // the retransmit timer handler. if s.SndUna != s.SndNxt { switch { case available >= seg.data.Size(): // OK to send, the whole segments fits in the // receiver's advertised window. case available >= s.MaxPayloadSize: // OK to send, at least 1 MSS sized segment fits // in the receiver's advertised window. default: return false } } // The segment size limit is computed as a function of sender // congestion window and MSS. When sender congestion window is > // 1, this limit can be larger than MSS. Ensure that the // currently available send space is not greater than minimum of // this limit and MSS. if available > limit { available = limit } // If GSO is not in use then cap available to // maxPayloadSize. When GSO is in use the gVisor GSO logic or // the host GSO logic will cap the segment to the correct size. if s.ep.gso.Type == stack.GSONone && available > s.MaxPayloadSize { available = s.MaxPayloadSize } if seg.data.Size() > available { s.splitSeg(seg, available) } segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) } s.sendSegment(seg) // Update sndNxt if we actually sent new data (as opposed to // retransmitting some previously sent data). if s.SndNxt.LessThan(segEnd) { s.SndNxt = segEnd } return true } func (s *sender) sendZeroWindowProbe() { ack, win := s.ep.rcv.getSendParams() s.unackZeroWindowProbes++ // Send a zero window probe with sequence number pointing to // the last acknowledged byte. s.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, s.SndUna-1, ack, win) // Rearm the timer to continue probing. s.resendTimer.enable(s.RTO) } func (s *sender) enableZeroWindowProbing() { s.zeroWindowProbing = true // We piggyback the probing on the retransmit timer with the // current retranmission interval, as we may start probing while // segment retransmissions. if s.firstRetransmittedSegXmitTime == (tcpip.MonotonicTime{}) { s.firstRetransmittedSegXmitTime = s.ep.stack.Clock().NowMonotonic() } s.resendTimer.enable(s.RTO) } func (s *sender) disableZeroWindowProbing() { s.zeroWindowProbing = false s.unackZeroWindowProbes = 0 s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{} s.resendTimer.disable() } func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) { if dataSent { // We sent data, so we should stop the keepalive timer to ensure // that no keepalives are sent while there is pending data. s.ep.disableKeepaliveTimer() } // If the sender has advertized zero receive window and we have // data to be sent out, start zero window probing to query the // the remote for it's receive window size. 
if s.writeNext != nil && s.SndWnd == 0 { s.enableZeroWindowProbing() } // If we have no more pending data, start the keepalive timer. if s.SndUna == s.SndNxt { s.ep.resetKeepaliveTimer(false) } else { // Enable timers if we have pending data. if shouldScheduleProbe && s.shouldSchedulePTO() { // Schedule PTO after transmitting new data that wasn't itself a TLP probe. s.schedulePTO() } else if !s.resendTimer.enabled() { s.probeTimer.disable() if s.Outstanding > 0 { // Enable the resend timer if it's not enabled yet and there is // outstanding data. s.resendTimer.enable(s.RTO) } } } } // sendData sends new data segments. It is called when data becomes available or // when the send window opens up. func (s *sender) sendData() { limit := s.MaxPayloadSize if s.gso { limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize) } end := s.SndUna.Add(s.SndWnd) // Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10. // "A TCP SHOULD set cwnd to no more than RW before beginning // transmission if the TCP has not sent data in the interval exceeding // the retrasmission timeout." if !s.FastRecovery.Active && s.state != tcpip.RTORecovery && s.ep.stack.Clock().NowMonotonic().Sub(s.LastSendTime) > s.RTO { if s.SndCwnd > InitialCwnd { s.SndCwnd = InitialCwnd } } var dataSent bool for seg := s.writeNext; seg != nil && s.Outstanding < s.SndCwnd; seg = seg.Next() { cwndLimit := (s.SndCwnd - s.Outstanding) * s.MaxPayloadSize if cwndLimit < limit { limit = cwndLimit } if s.isAssignedSequenceNumber(seg) && s.ep.SACKPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { // Move writeNext along so that we don't try and scan data that // has already been SACKED. s.writeNext = seg.Next() continue } if sent := s.maybeSendSegment(seg, limit, end); !sent { break } dataSent = true s.Outstanding += s.pCount(seg, s.MaxPayloadSize) s.writeNext = seg.Next() } s.postXmit(dataSent, true /* shouldScheduleProbe */) } func (s *sender) enterRecovery() { s.FastRecovery.Active = true // Save state to reflect we're now in fast recovery. // // See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3. // We inflate the cwnd by 3 to account for the 3 packets which triggered // the 3 duplicate ACKs and are now not in flight. s.SndCwnd = s.Ssthresh + 3 s.SackedOut = 0 s.DupAckCount = 0 s.FastRecovery.First = s.SndUna s.FastRecovery.Last = s.SndNxt - 1 s.FastRecovery.MaxCwnd = s.SndCwnd + s.Outstanding s.FastRecovery.HighRxt = s.SndUna s.FastRecovery.RescueRxt = s.SndUna if s.ep.SACKPermitted { s.state = tcpip.SACKRecovery s.ep.stack.Stats().TCP.SACKRecovery.Increment() // Set TLPRxtOut to false according to // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1. if s.rc.tlpRxtOut { // The tail loss probe triggered recovery. s.ep.stack.Stats().TCP.TLPRecovery.Increment() } s.rc.tlpRxtOut = false return } s.state = tcpip.FastRecovery s.ep.stack.Stats().TCP.FastRecovery.Increment() } func (s *sender) leaveRecovery() { s.FastRecovery.Active = false s.FastRecovery.MaxCwnd = 0 s.DupAckCount = 0 // Deflate cwnd. It had been artificially inflated when new dups arrived. s.SndCwnd = s.Ssthresh s.cc.PostRecovery() } // isAssignedSequenceNumber relies on the fact that we only set flags once a // sequencenumber is assigned and that is only done right before we send the // segment. As a result any segment that has a non-zero flag has a valid // sequence number assigned to it. 
func (s *sender) isAssignedSequenceNumber(seg *segment) bool { return seg.flags != 0 } // SetPipe implements the SetPipe() function described in RFC6675. Netstack // maintains the congestion window in number of packets and not bytes, so // SetPipe() here measures number of outstanding packets rather than actual // outstanding bytes in the network. func (s *sender) SetPipe() { // If SACK isn't permitted or it is permitted but recovery is not active // then ignore pipe calculations. if !s.ep.SACKPermitted || !s.FastRecovery.Active { return } pipe := 0 smss := seqnum.Size(s.ep.scoreboard.SMSS()) for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() { // With GSO each segment can be much larger than SMSS. So check the segment // in SMSS sized ranges. segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.data.Size())) for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) { endSeq := startSeq.Add(smss) if segEnd.LessThan(endSeq) { endSeq = segEnd } sb := header.SACKBlock{Start: startSeq, End: endSeq} // SetPipe(): // // After initializing pipe to zero, the following steps are // taken for each octet 'S1' in the sequence space between // HighACK and HighData that has not been SACKed: if !s1.sequenceNumber.LessThan(s.SndNxt) { break } if s.ep.scoreboard.IsSACKED(sb) { continue } // SetPipe(): // // (a) If IsLost(S1) returns false, Pipe is incremened by 1. // // NOTE: here we mark the whole segment as lost. We do not try // and test every byte in our write buffer as we maintain our // pipe in terms of oustanding packets and not bytes. if !s.ep.scoreboard.IsRangeLost(sb) { pipe++ } // SetPipe(): // (b) If S1 <= HighRxt, Pipe is incremented by 1. if s1.sequenceNumber.LessThanEq(s.FastRecovery.HighRxt) { pipe++ } } } s.Outstanding = pipe } // shouldEnterRecovery returns true if the sender should enter fast recovery // based on dupAck count and sack scoreboard. // See RFC 6675 section 5. func (s *sender) shouldEnterRecovery() bool { return s.DupAckCount >= nDupAckThreshold || (s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.SndUna)) } // detectLoss is called when an ack is received and returns whether a loss is // detected. It manages the state related to duplicate acks and determines if // a retransmit is needed according to the rules in RFC 6582 (NewReno). func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) { // We're not in fast recovery yet. // If RACK is enabled and there is no reordering we should honor the // three duplicate ACK rule to enter recovery. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4 if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { if s.rc.Reord { return false } } if !s.isDupAck(seg) { s.DupAckCount = 0 return false } s.DupAckCount++ // Do not enter fast recovery until we reach nDupAckThreshold or the // first unacknowledged byte is considered lost as per SACK scoreboard. if !s.shouldEnterRecovery() { // RFC 6675 Step 3. s.FastRecovery.HighRxt = s.SndUna - 1 // Do run SetPipe() to calculate the outstanding segments. s.SetPipe() s.state = tcpip.Disorder return false } // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2 // // We only do the check here, the incrementing of last to the highest // sequence number transmitted till now is done when enterRecovery // is invoked. 
// // Note that we only enter recovery when at least one more byte of data // beyond s.fr.last (the highest byte that was outstanding when fast // retransmit was last entered) is acked. if !s.FastRecovery.Last.LessThan(seg.ackNumber - 1) { s.DupAckCount = 0 return false } s.cc.HandleLossDetected() s.enterRecovery() return true } // isDupAck determines if seg is a duplicate ack as defined in // https://tools.ietf.org/html/rfc5681#section-2. func (s *sender) isDupAck(seg *segment) bool { // A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883] // can leverage the SACK information to determine when an incoming ACK is a // "duplicate" (e.g., if the ACK contains previously unknown SACK // information). if s.ep.SACKPermitted && !seg.hasNewSACKInfo { return false } // (a) The receiver of the ACK has outstanding data. return s.SndUna != s.SndNxt && // (b) The incoming acknowledgment carries no data. seg.logicalLen() == 0 && // (c) The SYN and FIN bits are both off. !seg.flags.Intersects(header.TCPFlagFin|header.TCPFlagSyn) && // (d) the ACK number is equal to the greatest acknowledgment received on // the given connection (TCP.UNA from RFC793). seg.ackNumber == s.SndUna && // (e) the advertised window in the incoming acknowledgment equals the // advertised window in the last incoming acknowledgment. s.SndWnd == seg.window } // Iterate the writeList and update RACK for each segment which is newly acked // either cumulatively or selectively. Loop through the segments which are // sacked, and update the RACK related variables and check for reordering. // // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 // steps 2 and 3. func (s *sender) walkSACK(rcvdSeg *segment) { s.rc.setDSACKSeen(false) // Look for DSACK block. idx := 0 n := len(rcvdSeg.parsedOptions.SACKBlocks) if checkDSACK(rcvdSeg) { s.rc.setDSACKSeen(true) idx = 1 n-- } if n == 0 { return } // Sort the SACK blocks. The first block is the most recent unacked // block. The following blocks can be in arbitrary order. sackBlocks := make([]header.SACKBlock, n) copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:]) sort.Slice(sackBlocks, func(i, j int) bool { return sackBlocks[j].Start.LessThan(sackBlocks[i].Start) }) seg := s.writeList.Front() for _, sb := range sackBlocks { for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 { if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked { s.rc.update(seg, rcvdSeg) s.rc.detectReorder(seg) seg.acked = true s.SackedOut += s.pCount(seg, s.MaxPayloadSize) } seg = seg.Next() } } } // checkDSACK checks if a DSACK is reported. func checkDSACK(rcvdSeg *segment) bool { n := len(rcvdSeg.parsedOptions.SACKBlocks) if n == 0 { return false } sb := rcvdSeg.parsedOptions.SACKBlocks[0] // Check if SACK block is invalid. if sb.End.LessThan(sb.Start) { return false } // See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in // at most one SACK block. DSACK is detected in the below two cases: // * If the SACK sequence space is less than this cumulative ACK, it is // an indication that the segment identified by the SACK block has // been received more than once by the receiver. // * If the sequence space in the first SACK block is greater than the // cumulative ACK, then the sender next compares the sequence space // in the first SACK block with the sequence space in the second SACK // block, if there is one. This comparison can determine if the first // SACK block is reporting duplicate data that lies above the // cumulative ACK. 
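// (Illustrative example, not from the original source: with cumulative ACK
// 1000, a first SACK block [400, 700) lies below the ACK and is reported as
// a DSACK; with first block [1200, 1400) and second block [1000, 2000), the
// second fully covers the first, so the first is again a DSACK.)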
if sb.Start.LessThan(rcvdSeg.ackNumber) { return true } if n > 1 { sb1 := rcvdSeg.parsedOptions.SACKBlocks[1] if sb1.End.LessThan(sb1.Start) { return false } // If the first SACK block is fully covered by second SACK // block, then the first block is a DSACK block. if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) { return true } } return false } // handleRcvdSegment is called when a segment is received; it is responsible for // updating the send-related state. func (s *sender) handleRcvdSegment(rcvdSeg *segment) { // Check if we can extract an RTT measurement from this ack. if !rcvdSeg.parsedOptions.TS && s.RTTMeasureSeqNum.LessThan(rcvdSeg.ackNumber) { s.updateRTO(s.ep.stack.Clock().NowMonotonic().Sub(s.RTTMeasureTime)) s.RTTMeasureSeqNum = s.SndNxt } // Update Timestamp if required. See RFC7323, section-4.3. if s.ep.SendTSOk && rcvdSeg.parsedOptions.TS { s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.MaxSentAck, rcvdSeg.sequenceNumber) } // Insert SACKBlock information into our scoreboard. if s.ep.SACKPermitted { for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { // Only insert the SACK block if the following holds // true: // * SACK block acks data after the ack number in the // current segment. // * SACK block represents a sequence // between sndUna and sndNxt (i.e. data that is // currently unacked and in-flight). // * SACK block that has not been SACKed already. // // NOTE: This check specifically excludes DSACK blocks // which have start/end before sndUna and are used to // indicate spurious retransmissions. if rcvdSeg.ackNumber.LessThan(sb.Start) && s.SndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.SndNxt) && !s.ep.scoreboard.IsSACKED(sb) { s.ep.scoreboard.Insert(sb) rcvdSeg.hasNewSACKInfo = true } } // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08 // section-7.2 // * Step 2: Update RACK stats. // If the ACK is not ignored as invalid, update the RACK.rtt // to be the RTT sample calculated using this ACK, and // continue. If this ACK or SACK was for the most recently // sent packet, then record the RACK.xmit_ts timestamp and // RACK.end_seq sequence implied by this ACK. // * Step 3: Detect packet reordering. // If the ACK selectively or cumulatively acknowledges an // unacknowledged and also never retransmitted sequence below // RACK.fack, then the corresponding packet has been // reordered and RACK.reord is set to TRUE. if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { s.walkSACK(rcvdSeg) } s.SetPipe() } ack := rcvdSeg.ackNumber fastRetransmit := false // Do not leave fast recovery if the ACK is out of range. if s.FastRecovery.Active { // Leave fast recovery if it acknowledges all the data covered by // this fast recovery session. if (ack-1).InRange(s.SndUna, s.SndNxt) && s.FastRecovery.Last.LessThan(ack) { s.leaveRecovery() } } else { // Detect loss by counting the duplicates and enter recovery. fastRetransmit = s.detectLoss(rcvdSeg) } // See if TLP based recovery was successful. if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { s.detectTLPRecovery(ack, rcvdSeg) } // Stash away the current window size. s.SndWnd = rcvdSeg.window // Disable zero window probing if remote advertises a non-zero receive // window. This can be with an ACK to the zero window probe (where the // ack number refers to the already acknowledged byte) OR to any previously // unacknowledged segment.
if s.zeroWindowProbing && rcvdSeg.window > 0 && (ack == s.SndUna || (ack-1).InRange(s.SndUna, s.SndNxt)) { s.disableZeroWindowProbing() } // On receiving the ACK for the zero window probe, account for it and // skip trying to send any segment as we are still probing for // receive window to become non-zero. if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.SndUna { s.unackZeroWindowProbes-- return } // Ignore ack if it doesn't acknowledge any new data. if (ack - 1).InRange(s.SndUna, s.SndNxt) { s.DupAckCount = 0 // See : https://tools.ietf.org/html/rfc1323#section-3.3. // Specifically we should only update the RTO using TSEcr if the // following condition holds: // // A TSecr value received in a segment is used to update the // averaged RTT measurement only if the segment acknowledges // some new data, i.e., only if it advances the left edge of // the send window. if s.ep.SendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 { // TSVal/Ecr values sent by Netstack are at a millisecond // granularity. elapsed := time.Duration(s.ep.timestamp()-rcvdSeg.parsedOptions.TSEcr) * time.Millisecond s.updateRTO(elapsed) } if s.shouldSchedulePTO() { // Schedule PTO upon receiving an ACK that cumulatively acknowledges data. // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. s.schedulePTO() } else { // When an ack is received we must rearm the timer. // RFC 6298 5.3 s.probeTimer.disable() s.resendTimer.enable(s.RTO) } // Remove all acknowledged data from the write list. acked := s.SndUna.Size(ack) s.SndUna = ack // The remote ACK-ing at least 1 byte is an indication that we have a // full-duplex connection to the remote as the only way we will receive an // ACK is if the remote received data that we previously sent. // // As of writing, linux seems to only confirm a route as reachable when // forward progress is made which is indicated by an ACK that removes data // from the retransmit queue. if acked > 0 { s.ep.route.ConfirmReachable() } ackLeft := acked originalOutstanding := s.Outstanding for ackLeft > 0 { // We use logicalLen here because we can have FIN // segments (which are always at the end of list) that // have no data, but do consume a sequence number. seg := s.writeList.Front() datalen := seg.logicalLen() if datalen > ackLeft { prevCount := s.pCount(seg, s.MaxPayloadSize) seg.data.TrimFront(int(ackLeft)) seg.sequenceNumber.UpdateForward(ackLeft) s.Outstanding -= prevCount - s.pCount(seg, s.MaxPayloadSize) break } if s.writeNext == seg { s.writeNext = seg.Next() } // Update the RACK fields if SACK is enabled. if s.ep.SACKPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { s.rc.update(seg, rcvdSeg) s.rc.detectReorder(seg) } s.writeList.Remove(seg) // If SACK is enabled then only reduce outstanding if // the segment was not previously SACKED as these have // already been accounted for in SetPipe(). if !s.ep.SACKPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) { s.Outstanding -= s.pCount(seg, s.MaxPayloadSize) } else { s.SackedOut -= s.pCount(seg, s.MaxPayloadSize) } seg.decRef() ackLeft -= datalen } // Update the send buffer usage and notify potential waiters. s.ep.updateSndBufferUsage(int(acked)) // Clear SACK information for all acked data. s.ep.scoreboard.Delete(s.SndUna) // If we are not in fast recovery then update the congestion // window based on the number of acknowledged packets. 
if !s.FastRecovery.Active { s.cc.Update(originalOutstanding - s.Outstanding) if s.FastRecovery.Last.LessThan(s.SndUna) { s.state = tcpip.Open // Update RACK when we are exiting fast or RTO // recovery as described in the RFC // draft-ietf-tcpm-rack-08 Section-7.2 Step 4. if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { s.rc.exitRecovery() } s.reorderTimer.disable() } } // It is possible for s.outstanding to drop below zero if we get // a retransmit timeout, reset outstanding to zero, but later // get an ack that covers previously sent data. if s.Outstanding < 0 { s.Outstanding = 0 } s.SetPipe() // If all outstanding data was acknowledged, disable the timer. // RFC 6298 Rule 5.3 if s.SndUna == s.SndNxt { s.Outstanding = 0 // Reset firstRetransmittedSegXmitTime to the zero value. s.firstRetransmittedSegXmitTime = tcpip.MonotonicTime{} s.resendTimer.disable() s.probeTimer.disable() } } if s.ep.SACKPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { // Update RACK reorder window. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 // * Upon receiving an ACK: // * Step 4: Update RACK reordering window s.rc.updateRACKReorderWindow() // After the reorder window is calculated, detect any loss by checking // if the time elapsed after the segments are sent is greater than the // reorder window. if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.FastRecovery.Active { // If any segment is marked as lost by // RACK, enter recovery and retransmit // the lost segments. s.cc.HandleLossDetected() s.enterRecovery() fastRetransmit = true } if s.FastRecovery.Active { s.rc.DoRecovery(nil, fastRetransmit) } } // Now that we've popped all acknowledged data from the retransmit // queue, retransmit if needed. if s.FastRecovery.Active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 { s.lr.DoRecovery(rcvdSeg, fastRetransmit) // When SACK is enabled data sending is governed by steps in // RFC 6675 Section 5 recovery steps A-C. // See: https://tools.ietf.org/html/rfc6675#section-5. if s.ep.SACKPermitted { return } } // Send more data now that some of the pending data has been ack'd, or // that the window opened up, or the congestion window was inflated due // to a duplicate ack during fast recovery. This will also re-enable // the retransmit timer if needed. s.sendData() } // sendSegment sends the specified segment. func (s *sender) sendSegment(seg *segment) tcpip.Error { if seg.xmitCount > 0 { s.ep.stack.Stats().TCP.Retransmits.Increment() s.ep.stats.SendErrors.Retransmits.Increment() if s.SndCwnd < s.Ssthresh { s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment() } } seg.xmitTime = s.ep.stack.Clock().NowMonotonic() seg.xmitCount++ seg.lost = false err := s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber) // Every time a packet containing data is sent (including a // retransmission), if SACK is enabled and we are retransmitting data // then use the conservative timer described in RFC6675 Section 6.0, // otherwise follow the standard timer described in RFC6298 Section 5.1. if err != nil && seg.data.Size() != 0 { if s.FastRecovery.Active && seg.xmitCount > 1 && s.ep.SACKPermitted { s.resendTimer.enable(s.RTO) } else { if !s.resendTimer.enabled() { s.resendTimer.enable(s.RTO) } } } return err } // sendSegmentFromView sends a new segment containing the given payload, flags // and sequence number.
func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags header.TCPFlags, seq seqnum.Value) tcpip.Error { s.LastSendTime = s.ep.stack.Clock().NowMonotonic() if seq == s.RTTMeasureSeqNum { s.RTTMeasureTime = s.LastSendTime } rcvNxt, rcvWnd := s.ep.rcv.getSendParams() // Remember the max sent ack. s.MaxSentAck = rcvNxt return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd) } // maybeSendOutOfWindowAck sends an ACK if we are not being rate limited // currently. func (s *sender) maybeSendOutOfWindowAck(seg *segment) { // Data packets are unlikely to be part of an ACK loop. So always send // an ACK for a packet w/ data. if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() { s.sendAck() } }
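// Example: the two RFC 2883 DSACK tests that checkDSACK above applies,
// restated over plain uint32 sequence numbers. This is an illustrative
// sketch, not part of the sender: the type and function names are
// hypothetical, and plain '<' comparisons are only valid here because the
// sample values do not wrap, whereas the real code must use seqnum.Value's
// wraparound-aware comparisons.
package main

import "fmt"

type sackBlock struct{ start, end uint32 }

// isDSACK reports whether the first SACK block duplicates already-received
// data, per the two cases in RFC 2883 Section 5.
func isDSACK(cumAck uint32, blocks []sackBlock) bool {
	if len(blocks) == 0 {
		return false
	}
	first := blocks[0]
	if first.end < first.start {
		// Invalid block.
		return false
	}
	// Case 1: the first block sits below the cumulative ACK, so the
	// receiver is reporting a segment it has already acknowledged.
	if first.start < cumAck {
		return true
	}
	// Case 2: the first block is fully covered by the second block, so it
	// reports duplicate data above the cumulative ACK.
	if len(blocks) > 1 {
		second := blocks[1]
		if second.start <= first.start && first.end <= second.end {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(isDSACK(1000, []sackBlock{{500, 600}}))                 // true: below cumulative ACK
	fmt.Println(isDSACK(1000, []sackBlock{{1200, 1300}, {1100, 1500}})) // true: nested in second block
	fmt.Println(isDSACK(1000, []sackBlock{{1200, 1300}}))               // false: ordinary SACK block
}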
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/context" ) // AIOCallback is a function that does asynchronous I/O on behalf of a task. type AIOCallback func(context.Context) // QueueAIO queues an AIOCallback which will be run asynchronously. func (t *Task) QueueAIO(cb AIOCallback) { ctx := t.AsyncContext() wg := &t.TaskSet().aioGoroutines wg.Add(1) go func() { cb(ctx) wg.Done() }() }
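// Example: a minimal sketch of the pattern QueueAIO above relies on. Each
// callback is registered with a sync.WaitGroup before its goroutine starts,
// so Add cannot race with a concurrent Wait, and a shutdown path can block
// until every queued callback has finished. The asyncQueue/queue/drain names
// are hypothetical and not part of the kernel package.
package main

import (
	"fmt"
	"sync"
)

type asyncQueue struct{ wg sync.WaitGroup }

// queue runs cb on its own goroutine, tracking it in the wait group.
func (q *asyncQueue) queue(cb func()) {
	q.wg.Add(1) // Before the goroutine starts, so drain cannot miss it.
	go func() {
		defer q.wg.Done()
		cb()
	}()
}

// drain blocks until all queued callbacks have completed.
func (q *asyncQueue) drain() { q.wg.Wait() }

func main() {
	var q asyncQueue
	for i := 0; i < 3; i++ {
		i := i // Capture the loop variable for the closure.
		q.queue(func() { fmt.Println("callback", i) })
	}
	q.drain()
	fmt.Println("all callbacks done")
}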
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/fasync" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) // Close implements Linux syscall close(2). func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() // Note that Remove provides a reference on the file that we may use to // flush. It is still active until we drop the final reference below // (and other reference-holding operations complete).
_, file := t.FDTable().Remove(t, fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) err := file.OnClose(t) return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, err, syserror.EINTR, "close", file) } // Dup implements Linux syscall dup(2). func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) newFD, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) if err != nil { return 0, nil, linuxerr.EMFILE } return uintptr(newFD), nil, nil } // Dup2 implements Linux syscall dup2(2). func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldfd := args[0].Int() newfd := args[1].Int() if oldfd == newfd { // As long as oldfd is valid, dup2() does nothing and returns newfd. file := t.GetFileVFS2(oldfd) if file == nil { return 0, nil, linuxerr.EBADF } file.DecRef(t) return uintptr(newfd), nil, nil } return dup3(t, oldfd, newfd, 0) } // Dup3 implements Linux syscall dup3(2). func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldfd := args[0].Int() newfd := args[1].Int() flags := args[2].Uint() if oldfd == newfd { return 0, nil, linuxerr.EINVAL } return dup3(t, oldfd, newfd, flags) } func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) { if flags&^linux.O_CLOEXEC != 0 { return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(oldfd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) err := t.NewFDAtVFS2(newfd, file, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, }) if err != nil { return 0, nil, err } return uintptr(newfd), nil, nil } // Fcntl implements linux syscall fcntl(2). func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() cmd := args[1].Int() file, flags := t.FDTable().GetVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if file.StatusFlags()&linux.O_PATH != 0 { switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC, linux.F_GETFD, linux.F_SETFD, linux.F_GETFL: // allowed default: return 0, nil, linuxerr.EBADF } } switch cmd { case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC: minfd := args[2].Int() fd, err := t.NewFDFromVFS2(minfd, file, kernel.FDFlags{ CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC, }) if err != nil { return 0, nil, err } return uintptr(fd), nil, nil case linux.F_GETFD: return uintptr(flags.ToLinuxFDFlags()), nil, nil case linux.F_SETFD: flags := args[2].Uint() err := t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{ CloseOnExec: flags&linux.FD_CLOEXEC != 0, }) return 0, nil, err case linux.F_GETFL: return uintptr(file.StatusFlags()), nil, nil case linux.F_SETFL: return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint()) case linux.F_GETOWN: owner, hasOwner := getAsyncOwner(t, file) if !hasOwner { return 0, nil, nil } if owner.Type == linux.F_OWNER_PGRP { return uintptr(-owner.PID), nil, nil } return uintptr(owner.PID), nil, nil case linux.F_SETOWN: who := args[2].Int() ownerType := int32(linux.F_OWNER_PID) if who < 0 { // Check for overflow before flipping the sign. 
if who-1 > who { return 0, nil, linuxerr.EINVAL } ownerType = linux.F_OWNER_PGRP who = -who } return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who) case linux.F_GETOWN_EX: owner, hasOwner := getAsyncOwner(t, file) if !hasOwner { return 0, nil, nil } _, err := owner.CopyOut(t, args[2].Pointer()) return 0, nil, err case linux.F_SETOWN_EX: var owner linux.FOwnerEx _, err := owner.CopyIn(t, args[2].Pointer()) if err != nil { return 0, nil, err } return 0, nil, setAsyncOwner(t, int(fd), file, owner.Type, owner.PID) case linux.F_SETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { return 0, nil, linuxerr.EBADF } n, err := pipefile.SetPipeSize(int64(args[2].Int())) if err != nil { return 0, nil, err } return uintptr(n), nil, nil case linux.F_GETPIPE_SZ: pipefile, ok := file.Impl().(*pipe.VFSPipeFD) if !ok { return 0, nil, linuxerr.EBADF } return uintptr(pipefile.PipeSize()), nil, nil case linux.F_GET_SEALS: val, err := tmpfs.GetSeals(file) return uintptr(val), nil, err case linux.F_ADD_SEALS: if !file.IsWritable() { return 0, nil, linuxerr.EPERM } err := tmpfs.AddSeals(file, args[2].Uint()) return 0, nil, err case linux.F_SETLK: return 0, nil, posixLock(t, args, file, false /* blocking */) case linux.F_SETLKW: return 0, nil, posixLock(t, args, file, true /* blocking */) case linux.F_GETLK: return 0, nil, posixTestLock(t, args, file) case linux.F_GETSIG: a := file.AsyncHandler() if a == nil { // Default behavior aka SIGIO. return 0, nil, nil } return uintptr(a.(*fasync.FileAsync).Signal()), nil, nil case linux.F_SETSIG: a := file.SetAsyncHandler(fasync.NewVFS2(int(fd))).(*fasync.FileAsync) return 0, nil, a.SetSignal(linux.Signal(args[2].Int())) default: // Everything else is not yet supported. return 0, nil, linuxerr.EINVAL } } func getAsyncOwner(t *kernel.Task, fd *vfs.FileDescription) (ownerEx linux.FOwnerEx, hasOwner bool) { a := fd.AsyncHandler() if a == nil { return linux.FOwnerEx{}, false } ot, otg, opg := a.(*fasync.FileAsync).Owner() switch { case ot != nil: return linux.FOwnerEx{ Type: linux.F_OWNER_TID, PID: int32(t.PIDNamespace().IDOfTask(ot)), }, true case otg != nil: return linux.FOwnerEx{ Type: linux.F_OWNER_PID, PID: int32(t.PIDNamespace().IDOfThreadGroup(otg)), }, true case opg != nil: return linux.FOwnerEx{ Type: linux.F_OWNER_PGRP, PID: int32(t.PIDNamespace().IDOfProcessGroup(opg)), }, true default: return linux.FOwnerEx{}, true } } func setAsyncOwner(t *kernel.Task, fd int, file *vfs.FileDescription, ownerType, pid int32) error { switch ownerType { case linux.F_OWNER_TID, linux.F_OWNER_PID, linux.F_OWNER_PGRP: // Acceptable type. default: return linuxerr.EINVAL } a := file.SetAsyncHandler(fasync.NewVFS2(fd)).(*fasync.FileAsync) if pid == 0 { a.ClearOwner() return nil } switch ownerType { case linux.F_OWNER_TID: task := t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) if task == nil { return linuxerr.ESRCH } a.SetOwnerTask(t, task) return nil case linux.F_OWNER_PID: tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(pid)) if tg == nil { return linuxerr.ESRCH } a.SetOwnerThreadGroup(t, tg) return nil case linux.F_OWNER_PGRP: pg := t.PIDNamespace().ProcessGroupWithID(kernel.ProcessGroupID(pid)) if pg == nil { return linuxerr.ESRCH } a.SetOwnerProcessGroup(t, pg) return nil default: return linuxerr.EINVAL } } func posixTestLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription) error { // Copy in the lock request. 
flockAddr := args[2].Pointer() var flock linux.Flock if _, err := flock.CopyIn(t, flockAddr); err != nil { return err } var typ lock.LockType switch flock.Type { case linux.F_RDLCK: typ = lock.ReadLock case linux.F_WRLCK: typ = lock.WriteLock default: return linuxerr.EINVAL } r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence) if err != nil { return err } newFlock, err := file.TestPOSIX(t, t.FDTable(), typ, r) if err != nil { return err } newFlock.PID = translatePID(t.PIDNamespace().Root(), t.PIDNamespace(), newFlock.PID) if _, err = newFlock.CopyOut(t, flockAddr); err != nil { return err } return nil } // translatePID translates a pid from one namespace to another. Note that this // may race with task termination/creation, in which case the original task // corresponding to pid may no longer exist. This is used to implement the // F_GETLK fcntl, which has the same potential race in Linux as well (i.e., // there is no synchronization between retrieving the lock PID and translating // it). See fs/locks.c:posix_lock_to_flock. func translatePID(old, new *kernel.PIDNamespace, pid int32) int32 { return int32(new.IDOfTask(old.TaskWithID(kernel.ThreadID(pid)))) } func posixLock(t *kernel.Task, args arch.SyscallArguments, file *vfs.FileDescription, blocking bool) error { // Copy in the lock request. flockAddr := args[2].Pointer() var flock linux.Flock if _, err := flock.CopyIn(t, flockAddr); err != nil { return err } var blocker lock.Blocker if blocking { blocker = t } r, err := file.ComputeLockRange(t, uint64(flock.Start), uint64(flock.Len), flock.Whence) if err != nil { return err } switch flock.Type { case linux.F_RDLCK: if !file.IsReadable() { return linuxerr.EBADF } return file.LockPOSIX(t, t.FDTable(), int32(t.TGIDInRoot()), lock.ReadLock, r, blocker) case linux.F_WRLCK: if !file.IsWritable() { return linuxerr.EBADF } return file.LockPOSIX(t, t.FDTable(), int32(t.TGIDInRoot()), lock.WriteLock, r, blocker) case linux.F_UNLCK: return file.UnlockPOSIX(t, t.FDTable(), r) default: return linuxerr.EINVAL } } // Fadvise64 implements fadvise64(2). // This implementation currently ignores the provided advice. func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() length := args[2].Int64() advice := args[3].Int() // Note: offset is allowed to be negative. if length < 0 { return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if file.StatusFlags()&linux.O_PATH != 0 { return 0, nil, linuxerr.EBADF } // If the FD refers to a pipe or FIFO, return error. if _, isPipe := file.Impl().(*pipe.VFSPipeFD); isPipe { return 0, nil, linuxerr.ESPIPE } switch advice { case linux.POSIX_FADV_NORMAL: case linux.POSIX_FADV_RANDOM: case linux.POSIX_FADV_SEQUENTIAL: case linux.POSIX_FADV_WILLNEED: case linux.POSIX_FADV_DONTNEED: case linux.POSIX_FADV_NOREUSE: default: return 0, nil, linuxerr.EINVAL } // Sure, whatever. return 0, nil, nil }
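// Example: the overflow guard used by Fcntl's F_SETOWN case above. A negative
// "who" selects a process group, but -math.MinInt32 is not representable in
// an int32, and the "who-1 > who" test is true for exactly that value because
// Go integer arithmetic wraps on overflow. An illustrative sketch:
// pgrpFromWho is a hypothetical helper, not part of the vfs2 package.
package main

import (
	"fmt"
	"math"
)

// pgrpFromWho returns the process-group ID named by a negative who, or
// reports failure when the sign cannot be flipped without overflow.
func pgrpFromWho(who int32) (int32, bool) {
	if who >= 0 {
		return who, true // A plain PID; nothing to flip.
	}
	if who-1 > who { // Wraps, and so is true, only for math.MinInt32.
		return 0, false
	}
	return -who, true
}

func main() {
	fmt.Println(pgrpFromWho(-42))           // 42 true
	fmt.Println(pgrpFromWho(math.MinInt32)) // 0 false
}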
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fs import ( "fmt" "os" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" ) // InodeType enumerates types of Inodes. type InodeType int const ( // RegularFile is a regular file. RegularFile InodeType = iota // SpecialFile is a file that doesn't support SeekEnd. It is used for // things like proc files. SpecialFile // Directory is a directory. Directory // SpecialDirectory is a directory that *does* support SeekEnd. It's // the opposite of the SpecialFile scenario above. It similarly // supports proc files. SpecialDirectory // Symlink is a symbolic link. Symlink // Pipe is a pipe (named or regular). Pipe // Socket is a socket. Socket // CharacterDevice is a character device. CharacterDevice // BlockDevice is a block device. BlockDevice // Anonymous is an anonymous type when none of the above apply.
// Epoll fds and event-driven fds fit this category. Anonymous ) // String returns a human-readable representation of the InodeType. func (n InodeType) String() string { switch n { case RegularFile, SpecialFile: return "file" case Directory, SpecialDirectory: return "directory" case Symlink: return "symlink" case Pipe: return "pipe" case Socket: return "socket" case CharacterDevice: return "character-device" case BlockDevice: return "block-device" case Anonymous: return "anonymous" default: return "unknown" } } // LinuxType returns the linux file type for this inode type. func (n InodeType) LinuxType() uint32 { switch n { case RegularFile, SpecialFile: return linux.ModeRegular case Directory, SpecialDirectory: return linux.ModeDirectory case Symlink: return linux.ModeSymlink case Pipe: return linux.ModeNamedPipe case CharacterDevice: return linux.ModeCharacterDevice case BlockDevice: return linux.ModeBlockDevice case Socket: return linux.ModeSocket default: return 0 } } // ToDirentType converts an InodeType to a linux dirent type field. func ToDirentType(nodeType InodeType) uint8 { switch nodeType { case RegularFile, SpecialFile: return linux.DT_REG case Symlink: return linux.DT_LNK case Directory, SpecialDirectory: return linux.DT_DIR case Pipe: return linux.DT_FIFO case CharacterDevice: return linux.DT_CHR case BlockDevice: return linux.DT_BLK case Socket: return linux.DT_SOCK default: return linux.DT_UNKNOWN } } // ToInodeType converts a linux file type to InodeType. func ToInodeType(linuxFileType linux.FileMode) InodeType { switch linuxFileType { case linux.ModeRegular: return RegularFile case linux.ModeDirectory: return Directory case linux.ModeSymlink: return Symlink case linux.ModeNamedPipe: return Pipe case linux.ModeCharacterDevice: return CharacterDevice case linux.ModeBlockDevice: return BlockDevice case linux.ModeSocket: return Socket default: panic(fmt.Sprintf("unknown file mode: %d", linuxFileType)) } } // StableAttr contains Inode attributes that will be stable throughout the // lifetime of the Inode. // // +stateify savable type StableAttr struct { // Type is the InodeType of an InodeOperations. Type InodeType // DeviceID is the device on which an InodeOperations resides. DeviceID uint64 // InodeID uniquely identifies InodeOperations on its device. InodeID uint64 // BlockSize is the block size of data backing this InodeOperations. BlockSize int64 // DeviceFileMajor is the major device number of this Node, if it is a // device file. DeviceFileMajor uint16 // DeviceFileMinor is the minor device number of this Node, if it is a // device file. DeviceFileMinor uint32 } // IsRegular returns true if StableAttr.Type matches a regular file. func IsRegular(s StableAttr) bool { return s.Type == RegularFile } // IsFile returns true if StableAttr.Type matches any type of file. func IsFile(s StableAttr) bool { return s.Type == RegularFile || s.Type == SpecialFile } // IsDir returns true if StableAttr.Type matches any type of directory. func IsDir(s StableAttr) bool { return s.Type == Directory || s.Type == SpecialDirectory } // IsSymlink returns true if StableAttr.Type matches a symlink. func IsSymlink(s StableAttr) bool { return s.Type == Symlink } // IsPipe returns true if StableAttr.Type matches any type of pipe. func IsPipe(s StableAttr) bool { return s.Type == Pipe } // IsAnonymous returns true if StableAttr.Type matches any type of anonymous. func IsAnonymous(s StableAttr) bool { return s.Type == Anonymous } // IsSocket returns true if StableAttr.Type matches any type of socket.
func IsSocket(s StableAttr) bool { return s.Type == Socket } // IsCharDevice returns true if StableAttr.Type matches a character device. func IsCharDevice(s StableAttr) bool { return s.Type == CharacterDevice } // UnstableAttr contains Inode attributes that may change over the lifetime // of the Inode. // // +stateify savable type UnstableAttr struct { // Size is the file size in bytes. Size int64 // Usage is the actual data usage in bytes. Usage int64 // Perms is the protection (read/write/execute for user/group/other). Perms FilePermissions // Owner describes the ownership of this file. Owner FileOwner // AccessTime is the time of last access AccessTime ktime.Time // ModificationTime is the time of last modification. ModificationTime ktime.Time // StatusChangeTime is the time of last attribute modification. StatusChangeTime ktime.Time // Links is the number of hard links. Links uint64 } // SetOwner sets the owner and group if they are valid. // // This method is NOT thread-safe. Callers must prevent concurrent calls. func (ua *UnstableAttr) SetOwner(ctx context.Context, owner FileOwner) { if owner.UID.Ok() { ua.Owner.UID = owner.UID } if owner.GID.Ok() { ua.Owner.GID = owner.GID } ua.StatusChangeTime = ktime.NowFromContext(ctx) } // SetPermissions sets the permissions. // // This method is NOT thread-safe. Callers must prevent concurrent calls. func (ua *UnstableAttr) SetPermissions(ctx context.Context, p FilePermissions) { ua.Perms = p ua.StatusChangeTime = ktime.NowFromContext(ctx) } // SetTimestamps sets the timestamps according to the TimeSpec. // // This method is NOT thread-safe. Callers must prevent concurrent calls. func (ua *UnstableAttr) SetTimestamps(ctx context.Context, ts TimeSpec) { if ts.ATimeOmit && ts.MTimeOmit { return } now := ktime.NowFromContext(ctx) if !ts.ATimeOmit { if ts.ATimeSetSystemTime { ua.AccessTime = now } else { ua.AccessTime = ts.ATime } } if !ts.MTimeOmit { if ts.MTimeSetSystemTime { ua.ModificationTime = now } else { ua.ModificationTime = ts.MTime } } ua.StatusChangeTime = now } // WithCurrentTime returns u with AccessTime == ModificationTime == current time. func WithCurrentTime(ctx context.Context, u UnstableAttr) UnstableAttr { t := ktime.NowFromContext(ctx) u.AccessTime = t u.ModificationTime = t u.StatusChangeTime = t return u } // AttrMask contains fields to mask StableAttr and UnstableAttr. // // +stateify savable type AttrMask struct { Type bool DeviceID bool InodeID bool BlockSize bool Size bool Usage bool Perms bool UID bool GID bool AccessTime bool ModificationTime bool StatusChangeTime bool Links bool } // Empty returns true if all fields in AttrMask are false. func (a AttrMask) Empty() bool { return a == AttrMask{} } // PermMask are file access permissions. // // +stateify savable type PermMask struct { // Read indicates reading is permitted. Read bool // Write indicates writing is permitted. Write bool // Execute indicates execution is permitted. Execute bool } // OnlyRead returns true when only the read bit is set. func (p PermMask) OnlyRead() bool { return p.Read && !p.Write && !p.Execute } // String implements the fmt.Stringer interface for PermMask. func (p PermMask) String() string { return fmt.Sprintf("PermMask{Read: %v, Write: %v, Execute: %v}", p.Read, p.Write, p.Execute) } // Mode returns the system mode (unix.S_IXOTH, etc.) for these permissions // in the "other" bits. 
func (p PermMask) Mode() (mode os.FileMode) { if p.Read { mode |= unix.S_IROTH } if p.Write { mode |= unix.S_IWOTH } if p.Execute { mode |= unix.S_IXOTH } return } // SupersetOf returns true iff the permissions in p are a superset of the // permissions in other. func (p PermMask) SupersetOf(other PermMask) bool { if !p.Read && other.Read { return false } if !p.Write && other.Write { return false } if !p.Execute && other.Execute { return false } return true } // FilePermissions represents the permissions of a file, with // Read/Write/Execute bits for user, group, and other. // // +stateify savable type FilePermissions struct { User PermMask Group PermMask Other PermMask // Sticky, if set on directories, restricts renaming and deletion of // files in those directories to the directory owner, file owner, or // CAP_FOWNER. The sticky bit is ignored when set on other files. Sticky bool // SetUID executables can call UID-setting syscalls without CAP_SETUID. SetUID bool // SetGID executables can call GID-setting syscalls without CAP_SETGID. SetGID bool } // PermsFromMode takes the Other permissions (last 3 bits) of a FileMode and // returns a set of PermMask. func PermsFromMode(mode linux.FileMode) (perms PermMask) { perms.Read = mode&linux.ModeOtherRead != 0 perms.Write = mode&linux.ModeOtherWrite != 0 perms.Execute = mode&linux.ModeOtherExec != 0 return } // FilePermsFromP9 converts a p9.FileMode to a FilePermissions struct. func FilePermsFromP9(mode p9.FileMode) FilePermissions { return FilePermsFromMode(linux.FileMode(mode)) } // FilePermsFromMode converts a system file mode to a FilePermissions struct. func FilePermsFromMode(mode linux.FileMode) (fp FilePermissions) { perm := mode.Permissions() fp.Other = PermsFromMode(perm) fp.Group = PermsFromMode(perm >> 3) fp.User = PermsFromMode(perm >> 6) fp.Sticky = mode&linux.ModeSticky == linux.ModeSticky fp.SetUID = mode&linux.ModeSetUID == linux.ModeSetUID fp.SetGID = mode&linux.ModeSetGID == linux.ModeSetGID return } // LinuxMode returns the linux mode_t representation of these permissions. func (f FilePermissions) LinuxMode() linux.FileMode { m := linux.FileMode(f.User.Mode()<<6 | f.Group.Mode()<<3 | f.Other.Mode()) if f.SetUID { m |= linux.ModeSetUID } if f.SetGID { m |= linux.ModeSetGID } if f.Sticky { m |= linux.ModeSticky } return m } // OSMode returns the Go runtime's OS independent os.FileMode representation of // these permissions. func (f FilePermissions) OSMode() os.FileMode { m := os.FileMode(f.User.Mode()<<6 | f.Group.Mode()<<3 | f.Other.Mode()) if f.SetUID { m |= os.ModeSetuid } if f.SetGID { m |= os.ModeSetgid } if f.Sticky { m |= os.ModeSticky } return m } // AnyExecute returns true if any of U/G/O have the execute bit set. func (f FilePermissions) AnyExecute() bool { return f.User.Execute || f.Group.Execute || f.Other.Execute } // AnyWrite returns true if any of U/G/O have the write bit set. func (f FilePermissions) AnyWrite() bool { return f.User.Write || f.Group.Write || f.Other.Write } // AnyRead returns true if any of U/G/O have the read bit set. func (f FilePermissions) AnyRead() bool { return f.User.Read || f.Group.Read || f.Other.Read } // HasSetUIDOrGID returns true if either the setuid or setgid bit is set. func (f FilePermissions) HasSetUIDOrGID() bool { return f.SetUID || f.SetGID } // DropSetUIDAndMaybeGID turns off setuid, and turns off setgid if f allows // group execution. 
func (f *FilePermissions) DropSetUIDAndMaybeGID() { f.SetUID = false if f.Group.Execute { f.SetGID = false } } // FileOwner represents ownership of a file. // // +stateify savable type FileOwner struct { UID auth.KUID GID auth.KGID } // RootOwner corresponds to KUID/KGID 0/0. var RootOwner = FileOwner{ UID: auth.RootKUID, GID: auth.RootKGID, }
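// Example: the bit layout behind PermsFromMode, FilePermsFromMode, and
// LinuxMode above. The low nine mode bits decompose into three rwx triples
// (user in bits 8..6, group in 5..3, other in 2..0), and each triple converts
// back with the inverse shifts. A hypothetical sketch with simplified names;
// the real types also carry the setuid/setgid/sticky bits.
package main

import "fmt"

type rwx struct{ read, write, exec bool }

// fromBits extracts one rwx triple from the lowest three bits of mode.
func fromBits(mode uint32) rwx {
	return rwx{mode&0b100 != 0, mode&0b010 != 0, mode&0b001 != 0}
}

// bits reassembles a triple into its three-bit form.
func (p rwx) bits() (b uint32) {
	if p.read {
		b |= 0b100
	}
	if p.write {
		b |= 0b010
	}
	if p.exec {
		b |= 0b001
	}
	return b
}

func main() {
	const mode = 0o755
	user, group, other := fromBits(mode>>6), fromBits(mode>>3), fromBits(mode)
	fmt.Printf("user=%+v group=%+v other=%+v\n", user, group, other)
	// Round trip: reassembling the triples recovers the original mode.
	fmt.Printf("%#o\n", user.bits()<<6|group.bits()<<3|other.bits()) // 0755
}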
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // limitations under the License. package tmpfs import ( "fmt" "io" "math" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) // regularFile is a regular (=S_IFREG) tmpfs file. // // +stateify savable type regularFile struct { inode inode // memFile is a platform.File used to allocate pages to this regularFile. memFile *pgalloc.MemoryFile `state:"nosave"` // memoryUsageKind is the memory accounting category under which pages backing // this regularFile's contents are accounted. memoryUsageKind usage.MemoryKind // mapsMu protects mappings. mapsMu sync.Mutex `state:"nosave"` // mappings tracks mappings of the file into memmap.MappingSpaces. // // Protected by mapsMu. mappings memmap.MappingSet // writableMappingPages tracks how many pages of virtual memory are mapped // as potentially writable from this file. If a page has multiple mappings, // each mapping is counted separately. // // This counter is susceptible to overflow as we can potentially count // mappings from many VMAs. We count pages rather than bytes to slightly // mitigate this. // // Protected by mapsMu. writableMappingPages uint64 // dataMu protects the fields below. dataMu sync.RWMutex `state:"nosave"` // data maps offsets into the file to offsets into memFile that store // the file's data. // // Protected by dataMu. data fsutil.FileRangeSet // seals represents file seals on this inode. // // Protected by dataMu. seals uint32 // size is the size of data. // // Protected by both dataMu and inode.mu; reading it requires holding // either mutex, while writing requires holding both AND using atomics. // Readers that do not require consistency (like Stat) may read the // value atomically without holding either lock. size uint64 } func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) *inode { file := &regularFile{ memFile: fs.mfp.MemoryFile(), memoryUsageKind: usage.Tmpfs, seals: linux.F_SEAL_SEAL, } file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode, parentDir) file.inode.nlink = 1 // from parent directory return &file.inode } // newUnlinkedRegularFileDescription creates a regular file on the tmpfs // filesystem represented by mount and returns an FD representing that file. // The new file is not reachable by path traversal from any other file. // // newUnlinkedRegularFileDescription is analogous to Linux's // mm/shmem.c:__shmem_file_setup(). // // Preconditions: mount must be a tmpfs mount. 
func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) { fs, ok := mount.Filesystem().Impl().(*filesystem) if !ok { panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount") } inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777, nil /* parentDir */) d := fs.newDentry(inode) defer d.DecRef(ctx) d.name = name fd := &regularFileFD{} fd.Init(&inode.locks) flags := uint32(linux.O_RDWR) if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return fd, nil } // NewZeroFile creates a new regular file and file description as for // mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is // initially (implicitly) filled with zeroes. // // Preconditions: mount must be a tmpfs mount. func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) { // Compare mm/shmem.c:shmem_zero_setup(). fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero") if err != nil { return nil, err } rf := fd.inode().impl.(*regularFile) rf.memoryUsageKind = usage.Anonymous rf.size = size return &fd.vfsfd, err } // NewMemfd creates a new regular file and file description as for // memfd_create. // // Preconditions: mount must be a tmpfs mount. func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) { fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name) if err != nil { return nil, err } if allowSeals { fd.inode().impl.(*regularFile).seals = 0 } return &fd.vfsfd, nil } // truncate grows or shrinks the file to the given size. It returns true if the // file size was updated. func (rf *regularFile) truncate(newSize uint64) (bool, error) { rf.inode.mu.Lock() defer rf.inode.mu.Unlock() return rf.truncateLocked(newSize) } // Preconditions: rf.inode.mu must be held. func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) { oldSize := rf.size if newSize == oldSize { // Nothing to do. return false, nil } // Need to hold inode.mu and dataMu while modifying size. rf.dataMu.Lock() if newSize > oldSize { // Can we grow the file? if rf.seals&linux.F_SEAL_GROW != 0 { rf.dataMu.Unlock() return false, linuxerr.EPERM } // We only need to update the file size. atomic.StoreUint64(&rf.size, newSize) rf.dataMu.Unlock() return true, nil } // We are shrinking the file. First check if this is allowed. if rf.seals&linux.F_SEAL_SHRINK != 0 { rf.dataMu.Unlock() return false, linuxerr.EPERM } // Update the file size. atomic.StoreUint64(&rf.size, newSize) rf.dataMu.Unlock() // Invalidate past translations of truncated pages. oldpgend := fs.OffsetPageEnd(int64(oldSize)) newpgend := fs.OffsetPageEnd(int64(newSize)) if newpgend < oldpgend { rf.mapsMu.Lock() rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ // Compare Linux's mm/shmem.c:shmem_setattr() => // mm/memory.c:unmap_mapping_range(evencows=1). InvalidatePrivate: true, }) rf.mapsMu.Unlock() } // We are now guaranteed that there are no translations of truncated pages, // and can remove them. rf.dataMu.Lock() rf.data.Truncate(newSize, rf.memFile) rf.dataMu.Unlock() return true, nil } // AddMapping implements memmap.Mappable.AddMapping. 
func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { rf.mapsMu.Lock() defer rf.mapsMu.Unlock() rf.dataMu.RLock() defer rf.dataMu.RUnlock() // Reject writable mapping if F_SEAL_WRITE is set. if rf.seals&linux.F_SEAL_WRITE != 0 && writable { return linuxerr.EPERM } rf.mappings.AddMapping(ms, ar, offset, writable) if writable { pagesBefore := rf.writableMappingPages // ar is guaranteed to be page aligned per memmap.Mappable. rf.writableMappingPages += uint64(ar.Length() / hostarch.PageSize) if rf.writableMappingPages < pagesBefore { panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages)) } } return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { rf.mapsMu.Lock() defer rf.mapsMu.Unlock() rf.mappings.RemoveMapping(ms, ar, offset, writable) if writable { pagesBefore := rf.writableMappingPages // ar is guaranteed to be page aligned per memmap.Mappable. rf.writableMappingPages -= uint64(ar.Length() / hostarch.PageSize) if rf.writableMappingPages > pagesBefore { panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages)) } } } // CopyMapping implements memmap.Mappable.CopyMapping. func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return rf.AddMapping(ctx, ms, dstAR, offset, writable) } // Translate implements memmap.Mappable.Translate. func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { rf.dataMu.Lock() defer rf.dataMu.Unlock() // Constrain translations to f.attr.Size (rounded up) to prevent // translation to pages that may be concurrently truncated. pgend := fs.OffsetPageEnd(int64(rf.size)) var beyondEOF bool if required.End > pgend { if required.Start >= pgend { return nil, &memmap.BusError{io.EOF} } beyondEOF = true required.End = pgend } if optional.End > pgend { optional.End = pgend } cerr := rf.data.Fill(ctx, required, optional, rf.size, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) { // Newly-allocated pages are zeroed, so we don't need to do anything. return dsts.NumBytes(), nil }) var ts []memmap.Translation var translatedEnd uint64 for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { segMR := seg.Range().Intersect(optional) ts = append(ts, memmap.Translation{ Source: segMR, File: rf.memFile, Offset: seg.FileRangeOf(segMR).Start, Perms: hostarch.AnyAccess, }) translatedEnd = segMR.End } // Don't return the error returned by f.data.Fill if it occurred outside of // required. if translatedEnd < required.End && cerr != nil { return ts, &memmap.BusError{cerr} } if beyondEOF { return ts, &memmap.BusError{io.EOF} } return ts, nil } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (*regularFile) InvalidateUnsavable(context.Context) error { return nil } // +stateify savable type regularFileFD struct { fileDescription // off is the file offset. off is accessed using atomic memory operations. 
// offMu serializes operations that may mutate off. off int64 offMu sync.Mutex `state:"nosave"` } // Release implements vfs.FileDescriptionImpl.Release. func (fd *regularFileFD) Release(context.Context) { // noop } // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { f := fd.inode().impl.(*regularFile) f.inode.mu.Lock() defer f.inode.mu.Unlock() oldSize := f.size size := offset + length if oldSize >= size { return nil } _, err := f.truncateLocked(size) return err } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { start := fsmetric.StartReadWait() defer fsmetric.FinishReadWait(fsmetric.TmpfsReadWait, start) fsmetric.TmpfsReads.Increment() if offset < 0 { return 0, linuxerr.EINVAL } // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since // all state is in-memory. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { return 0, nil } f := fd.inode().impl.(*regularFile) rw := getRegularFileReadWriter(f, offset) n, err := dst.CopyOutFrom(ctx, rw) putRegularFileReadWriter(rw) fd.inode().touchAtime(fd.vfsfd.Mount()) return n, err } // Read implements vfs.FileDescriptionImpl.Read. func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { fd.offMu.Lock() n, err := fd.PRead(ctx, dst, fd.off, opts) fd.off += n fd.offMu.Unlock() return n, err } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, _, err := fd.pwrite(ctx, src, offset, opts) return n, err } // pwrite returns the number of bytes written, final offset and error. The // final offset should be ignored by PWrite. func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { return 0, offset, linuxerr.EINVAL } // Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since // all state is in-memory. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { return 0, offset, linuxerr.EOPNOTSUPP } srclen := src.NumBytes() if srclen == 0 { return 0, offset, nil } f := fd.inode().impl.(*regularFile) f.inode.mu.Lock() defer f.inode.mu.Unlock() // If the file is opened with O_APPEND, update offset to file size. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { // Locking f.inode.mu is sufficient for reading f.size. offset = int64(f.size) } if end := offset + srclen; end < offset { // Overflow. return 0, offset, linuxerr.EINVAL } srclen, err = vfs.CheckLimit(ctx, offset, srclen) if err != nil { return 0, offset, err } src = src.TakeFirst64(srclen) rw := getRegularFileReadWriter(f, offset) n, err := src.CopyInTo(ctx, rw) f.inode.touchCMtimeLocked() for { old := atomic.LoadUint32(&f.inode.mode) new := vfs.ClearSUIDAndSGID(old) if swapped := atomic.CompareAndSwapUint32(&f.inode.mode, old, new); swapped { break } } putRegularFileReadWriter(rw) return n, n + offset, err } // Write implements vfs.FileDescriptionImpl.Write. 
func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.offMu.Lock() n, off, err := fd.pwrite(ctx, src, fd.off, opts) fd.off = off fd.offMu.Unlock() return n, err } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.offMu.Lock() defer fd.offMu.Unlock() switch whence { case linux.SEEK_SET: // use offset as specified case linux.SEEK_CUR: offset += fd.off case linux.SEEK_END: offset += int64(atomic.LoadUint64(&fd.inode().impl.(*regularFile).size)) default: return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } fd.off = offset return offset, nil } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { file := fd.inode().impl.(*regularFile) opts.SentryOwnedContent = true return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts) } // regularFileReadWriter implements safemem.Reader and safemem.Writer. type regularFileReadWriter struct { file *regularFile // Offset into the file to read/write at. Note that this may be // different from the FD offset if PRead/PWrite is used. off uint64 } var regularFileReadWriterPool = sync.Pool{ New: func() interface{} { return &regularFileReadWriter{} }, } func getRegularFileReadWriter(file *regularFile, offset int64) *regularFileReadWriter { rw := regularFileReadWriterPool.Get().(*regularFileReadWriter) rw.file = file rw.off = uint64(offset) return rw } func putRegularFileReadWriter(rw *regularFileReadWriter) { rw.file = nil regularFileReadWriterPool.Put(rw) } // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { rw.file.dataMu.RLock() defer rw.file.dataMu.RUnlock() size := rw.file.size // Compute the range to read (limited by file size and overflow-checked). if rw.off >= size { return 0, io.EOF } end := size if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end { end = rend } var done uint64 seg, gap := rw.file.data.Find(uint64(rw.off)) for rw.off < end { mr := memmap.MappableRange{uint64(rw.off), uint64(end)} switch { case seg.Ok(): // Get internal mappings. ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read) if err != nil { return done, err } // Copy from internal mappings. n, err := safemem.CopySeq(dsts, ims) done += n rw.off += uint64(n) dsts = dsts.DropFirst64(n) if err != nil { return done, err } // Continue. seg, gap = seg.NextNonEmpty() case gap.Ok(): // Tmpfs holes are zero-filled. gapmr := gap.Range().Intersect(mr) dst := dsts.TakeFirst64(gapmr.Length()) n, err := safemem.ZeroSeq(dst) done += n rw.off += uint64(n) dsts = dsts.DropFirst64(n) if err != nil { return done, err } // Continue. seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} } } return done, nil } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. // // Preconditions: rw.file.inode.mu must be held. func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { // Hold dataMu so we can modify size. rw.file.dataMu.Lock() defer rw.file.dataMu.Unlock() // Compute the range to write (overflow-checked). end := rw.off + srcs.NumBytes() if end <= rw.off { end = math.MaxInt64 } // Check if seals prevent either file growth or all writes.
switch { case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed return 0, linuxerr.EPERM case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed // When growth is sealed, Linux effectively allows writes which would // normally grow the file to partially succeed up to the current EOF, // rounded down to the page boundary before the EOF. // // This happens because writes (and thus the growth check) for tmpfs // files proceed page-by-page on Linux, and the final write to the page // containing EOF fails, resulting in a partial write up to the start of // that page. // // To emulate this behaviour, artificially truncate the write to the // start of the page containing the current EOF. // // See Linux, mm/filemap.c:generic_perform_write() and // mm/shmem.c:shmem_write_begin(). if pgstart := uint64(hostarch.Addr(rw.file.size).RoundDown()); end > pgstart { end = pgstart } if end <= rw.off { // Truncation would result in no data being written. return 0, linuxerr.EPERM } } // Page-aligned mr for when we need to allocate memory. RoundUp can't // overflow since end is an int64. pgstartaddr := hostarch.Addr(rw.off).RoundDown() pgendaddr, _ := hostarch.Addr(end).RoundUp() pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)} var ( done uint64 retErr error ) seg, gap := rw.file.data.Find(uint64(rw.off)) for rw.off < end { mr := memmap.MappableRange{uint64(rw.off), uint64(end)} switch { case seg.Ok(): // Get internal mappings. ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Write) if err != nil { retErr = err goto exitLoop } // Copy to internal mappings. n, err := safemem.CopySeq(ims, srcs) done += n rw.off += uint64(n) srcs = srcs.DropFirst64(n) if err != nil { retErr = err goto exitLoop } // Continue. seg, gap = seg.NextNonEmpty() case gap.Ok(): // Allocate memory for the write. gapMR := gap.Range().Intersect(pgMR) fr, err := rw.file.memFile.Allocate(gapMR.Length(), rw.file.memoryUsageKind) if err != nil { retErr = err goto exitLoop } // Write to that memory as usual. seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{} default: panic("unreachable") } } exitLoop: // If the write ends beyond the file's previous size, it causes the // file to grow. if rw.off > rw.file.size { atomic.StoreUint64(&rw.file.size, rw.off) } return done, retErr } // GetSeals returns the current set of seals on a memfd inode. func GetSeals(fd *vfs.FileDescription) (uint32, error) { f, ok := fd.Impl().(*regularFileFD) if !ok { return 0, linuxerr.EINVAL } rf := f.inode().impl.(*regularFile) rf.dataMu.RLock() defer rf.dataMu.RUnlock() return rf.seals, nil } // AddSeals adds new file seals to a memfd inode. func AddSeals(fd *vfs.FileDescription, val uint32) error { f, ok := fd.Impl().(*regularFileFD) if !ok { return linuxerr.EINVAL } rf := f.inode().impl.(*regularFile) rf.mapsMu.Lock() defer rf.mapsMu.Unlock() rf.dataMu.RLock() defer rf.dataMu.RUnlock() if rf.seals&linux.F_SEAL_SEAL != 0 { // Seal applied which prevents addition of any new seals. return linuxerr.EPERM } // F_SEAL_WRITE can only be added if there are no active writable maps. if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 { if rf.writableMappingPages > 0 { return linuxerr.EBUSY } } // Seals can only be added, never removed. rf.seals |= val return nil }
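// The seal checks above mirror Linux memfd semantics, so they can be
// exercised end-to-end from an application with plain fcntl calls. The
// following is a minimal sketch (the helper name demoMemfdSeals is
// hypothetical and not part of this package; it assumes "fmt" and
// "golang.org/x/sys/unix" are imported): it creates a sealable memfd,
// forbids growth with F_SEAL_GROW, and checks that a write starting beyond
// EOF fails with EPERM, matching the F_SEAL_GROW branch in WriteFromBlocks
// above.
func demoMemfdSeals() error {
	fd, err := unix.MemfdCreate("demo", unix.MFD_ALLOW_SEALING)
	if err != nil {
		return err
	}
	defer unix.Close(fd)
	if err := unix.Ftruncate(fd, 4096); err != nil {
		return err
	}
	// Forbid growth. Writes within the current size are still allowed.
	if _, err := unix.FcntlInt(uintptr(fd), unix.F_ADD_SEALS, unix.F_SEAL_GROW); err != nil {
		return err
	}
	// Read the seal set back, as GetSeals above does for sentry-backed files.
	seals, err := unix.FcntlInt(uintptr(fd), unix.F_GET_SEALS, 0)
	if err != nil {
		return err
	}
	fmt.Printf("seals = %#x\n", seals)
	// A write located entirely beyond EOF must now fail outright with EPERM.
	if _, err := unix.Pwrite(fd, []byte("x"), 8192); err != unix.EPERM {
		return fmt.Errorf("expected EPERM, got %v", err)
	}
	return nil
}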
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pipe import ( "io" "math" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/amutex" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // This file contains Pipe file functionality that is tied to neither VFS nor // the old fs architecture. // Release cleans up the pipe's state. func (p *Pipe) Release(context.Context) { p.rClose() p.wClose() // Wake up readers and writers. p.Notify(waiter.ReadableEvents | waiter.WritableEvents) } // Read reads from the Pipe into dst. func (p *Pipe) Read(ctx context.Context, dst usermem.IOSequence) (int64, error) { n, err := dst.CopyOutFrom(ctx, p) if n > 0 { p.Notify(waiter.WritableEvents) } return n, err } // ReadToBlocks implements safemem.Reader.ReadToBlocks for Pipe.Read. func (p *Pipe) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { n, err := p.read(int64(dsts.NumBytes()), func(srcs safemem.BlockSeq) (uint64, error) { return safemem.CopySeq(dsts, srcs) }, true /* removeFromSrc */) return uint64(n), err } func (p *Pipe) read(count int64, f func(srcs safemem.BlockSeq) (uint64, error), removeFromSrc bool) (int64, error) { p.mu.Lock() defer p.mu.Unlock() n, err := p.peekLocked(count, f) if n > 0 && removeFromSrc { p.consumeLocked(n) } return n, err } // WriteTo writes to w from the Pipe. func (p *Pipe) WriteTo(ctx context.Context, w io.Writer, count int64, dup bool) (int64, error) { n, err := p.read(count, func(srcs safemem.BlockSeq) (uint64, error) { return safemem.FromIOWriter{w}.WriteFromBlocks(srcs) }, !dup /* removeFromSrc */) if n > 0 && !dup { p.Notify(waiter.WritableEvents) } return n, err } // Write writes to the Pipe from src. func (p *Pipe) Write(ctx context.Context, src usermem.IOSequence) (int64, error) { n, err := src.CopyInTo(ctx, p) if n > 0 { p.Notify(waiter.ReadableEvents) } if linuxerr.Equals(linuxerr.EPIPE, err) { // If we are returning EPIPE send SIGPIPE to the task.
if sendSig := linux.SignalNoInfoFuncFromContext(ctx); sendSig != nil { sendSig(linux.SIGPIPE) } } return n, err } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks for Pipe.Write. func (p *Pipe) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { n, err := p.write(int64(srcs.NumBytes()), func(dsts safemem.BlockSeq) (uint64, error) { return safemem.CopySeq(dsts, srcs) }) return uint64(n), err } func (p *Pipe) write(count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) { p.mu.Lock() defer p.mu.Unlock() return p.writeLocked(count, f) } // ReadFrom reads from r to the Pipe. func (p *Pipe) ReadFrom(ctx context.Context, r io.Reader, count int64) (int64, error) { n, err := p.write(count, func(dsts safemem.BlockSeq) (uint64, error) { return safemem.FromIOReader{r}.ReadToBlocks(dsts) }) if n > 0 { p.Notify(waiter.ReadableEvents) } return n, err } // Readiness returns the ready events in the underlying pipe. func (p *Pipe) Readiness(mask waiter.EventMask) waiter.EventMask { return p.rwReadiness() & mask } // Ioctl implements ioctls on the Pipe. func (p *Pipe) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { // Switch on ioctl request. switch int(args[1].Int()) { case linux.FIONREAD: v := p.queued() if v > math.MaxInt32 { v = math.MaxInt32 // Silently truncate. } // Copy result to userspace. iocc := usermem.IOCopyContext{ IO: io, Ctx: ctx, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, } _, err := primitive.CopyInt32Out(&iocc, args[2].Pointer(), int32(v)) return 0, err default: return 0, unix.ENOTTY } } // waitFor blocks until a new reader/writer on the underlying pipe is // announced via 'wakeupChan', or until 'sleeper' is cancelled. Any call to this // function will block for either readers or writers, depending on where // 'wakeupChan' points. // // mu must be held by the caller. waitFor returns with mu held, but it will // drop mu before blocking for any reader/writers. // +checklocks:mu func waitFor(mu *sync.Mutex, wakeupChan *chan struct{}, sleeper amutex.Sleeper) bool { // Ideally this function would simply use a condition variable. However, the // wait needs to be interruptible via 'sleeper', so we must synchronize via a // channel. The synchronization below relies on the fact that closing a // channel unblocks all receives on the channel. // Does an appropriate wakeup channel already exist? If not, create a new // one. This is all done under mu to avoid races. if *wakeupChan == nil { *wakeupChan = make(chan struct{}) } // Grab a local reference to the wakeup channel since it may disappear as // soon as we drop mu. wakeup := *wakeupChan // Drop the lock and prepare to sleep. mu.Unlock() cancel := sleeper.SleepStart() // Wait for either a new reader/writer to be signalled via 'wakeup', or // for the sleep to be cancelled. select { case <-wakeup: sleeper.SleepFinish(true) case <-cancel: sleeper.SleepFinish(false) } // Take the lock and check if we were woken. If we were woken and // interrupted, the former takes priority. mu.Lock() select { case <-wakeup: return true default: return false } } // newHandleLocked signals a new pipe reader or writer depending on where // 'wakeupChan' points. This unblocks any corresponding reader or writer // waiting for the other end of the channel to be opened, see waitFor. // // Precondition: the mutex protecting wakeupChan must be held. func newHandleLocked(wakeupChan *chan struct{}) { if *wakeupChan != nil { close(*wakeupChan) *wakeupChan = nil } }
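// waitFor's interruptible sleep is built on the close-to-broadcast idiom:
// closing a channel unblocks every pending receive at once, which is how a
// single close in newHandleLocked wakes all blocked openers. The following is
// a self-contained sketch of the same idiom (the name demoWakeupBroadcast is
// hypothetical and not part of this package; it assumes the stdlib-compatible
// sync.Mutex and sync.WaitGroup from the sync import above).
func demoWakeupBroadcast() {
	var (
		mu     sync.Mutex
		wakeup = make(chan struct{}) // waitFor creates this lazily
		wg     sync.WaitGroup
	)
	for i := 0; i < 3; i++ {
		wg.Add(1)
		// Grab a local reference under the lock, as waitFor does: the shared
		// variable may be reset to nil once the lock is dropped.
		mu.Lock()
		ch := wakeup
		mu.Unlock()
		go func(ch chan struct{}) {
			defer wg.Done()
			<-ch // every waiter blocks on the same channel
		}(ch)
	}
	mu.Lock()
	close(wakeup) // a single close wakes all waiters, as in newHandleLocked
	wakeup = nil  // the next waiter would allocate a fresh channel
	mu.Unlock()
	wg.Wait()
}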
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package lock is the API for POSIX-style advisory regional file locks and // BSD-style full file locks. // // Callers needing to enforce these types of locks, like sys_fcntl, can call // LockRegion and UnlockRegion on a thread-safe set of Locks. Locks are // specific to a unique file (unique device/inode pair) and for this reason // should not be shared between files. // // A Lock has a set of holders identified by UniqueID. Normally this is the // pid of the thread attempting to acquire the lock.
// // Since these are advisory locks, they do not need to be integrated into // Reads/Writes and for this reason there is no way to *check* if a lock is // held. One can only attempt to take a lock or unlock an existing lock. // // A Lock in a set of Locks is typed: it is either a read lock with any number // of readers and no writer, or a write lock with no readers. // // As expected from POSIX, any attempt to acquire a write lock on a file region // when there already exists a write lock held by a different uid will fail. Any // attempt to acquire a write lock on a file region when there is more than one // reader will fail. Any attempt to acquire a read lock on a file region when // there is already a writer will fail. // // In special cases, a read lock may be upgraded to a write lock and a write lock // can be downgraded to a read lock. This can only happen if: // // * read lock upgrade to write lock: There can be only one reader and the reader // must be the same as the requested write lock holder. // // * write lock downgrade to read lock: The writer must be the same as the requested // read lock holder. // // UnlockRegion always succeeds. If LockRegion fails the caller should normally // interpret this as "try again later". package lock import ( "fmt" "math" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) // LockType is a type of regional file lock. type LockType int // UniqueID is a unique identifier of the holder of a regional file lock. type UniqueID interface{} const ( // ReadLock describes a POSIX regional file lock to be taken // read only. There may be multiple of these locks on a single // file region as long as there is no writer lock on the same // region. ReadLock LockType = iota // WriteLock describes a POSIX regional file lock to be taken // write only. There may be only a single holder of this lock // and no read locks. WriteLock ) // LockEOF is the maximal possible end of a regional file lock. // // A BSD-style full file lock can be represented as a regional file lock from // offset 0 to LockEOF. const LockEOF = math.MaxUint64 // OwnerInfo describes the owner of a lock. // // TODO(gvisor.dev/issue/5264): We may need to add other fields in the future // (e.g., Linux's file_lock.fl_flags to support open file-descriptor locks). // // +stateify savable type OwnerInfo struct { // PID is the process ID of the lock owner. PID int32 } // Lock is a regional file lock. It consists of either a single writer // or a set of readers. // // A Lock may be upgraded from a read lock to a write lock only if there // is a single reader and that reader has the same uid as the write lock. // // A Lock may be downgraded from a write lock to a read lock only if // the write lock's uid is the same as the read lock. // // Accesses to Lock are synchronized through the Locks object to which it // belongs. // // +stateify savable type Lock struct { // Readers are the set of read lock holders identified by UniqueID. // If len(Readers) > 0 then Writer must be nil. Readers map[UniqueID]OwnerInfo // Writer holds the writer unique ID. It's nil if there are no writers. Writer UniqueID // WriterInfo describes the writer. It is only meaningful if Writer != nil. WriterInfo OwnerInfo } // Locks is a thread-safe wrapper around a LockSet. // // +stateify savable type Locks struct { // mu protects locks below. mu sync.Mutex `state:"nosave"` // locks is the set of region locks currently held on an Inode.
locks LockSet // blockedQueue is the queue of waiters that are waiting on a lock. blockedQueue waiter.Queue `state:"zerovalue"` } // Blocker is the interface used for blocking locks. Passing a nil Blocker // will be treated as non-blocking. type Blocker interface { Block(C <-chan struct{}) error } const ( // EventMaskAll is the mask we will always use for locks; by using the // same mask all the time we can wake up everyone anytime the lock // changes state. EventMaskAll waiter.EventMask = 0xFFFF ) // LockRegion attempts to acquire a typed lock for the uid on a region // of a file. Returns true if successful in locking the region. If false // is returned, the caller should normally interpret this as "try again later" if // acquiring the lock in a non-blocking mode or "interrupted" if in a blocking mode. // Blocker is the interface used to provide blocking behavior; passing a nil Blocker // results in non-blocking behavior. func (l *Locks) LockRegion(uid UniqueID, ownerPID int32, t LockType, r LockRange, block Blocker) bool { for { l.mu.Lock() // Blocking locks must run in a loop because we'll be woken up whenever an unlock event // happens for this lock. We will then attempt to take the lock again and if it fails // continue blocking. res := l.locks.lock(uid, ownerPID, t, r) if !res && block != nil { e, ch := waiter.NewChannelEntry(nil) l.blockedQueue.EventRegister(&e, EventMaskAll) l.mu.Unlock() if err := block.Block(ch); err != nil { // We were interrupted; the caller can translate this to EINTR if applicable. l.blockedQueue.EventUnregister(&e) return false } l.blockedQueue.EventUnregister(&e) continue // Try again now that someone has unlocked. } l.mu.Unlock() return res } } // LockRegionVFS1 is a wrapper around LockRegion for VFS1, which does not implement // F_GETLK (and does not care about storing PIDs as a result). // // TODO(gvisor.dev/issue/1624): Delete. func (l *Locks) LockRegionVFS1(uid UniqueID, t LockType, r LockRange, block Blocker) bool { return l.LockRegion(uid, 0 /* ownerPID */, t, r, block) } // UnlockRegion attempts to release a lock for the uid on a region of a file. // This operation is always successful, even if there did not exist a lock on // the requested region held by uid in the first place. func (l *Locks) UnlockRegion(uid UniqueID, r LockRange) { l.mu.Lock() defer l.mu.Unlock() l.locks.unlock(uid, r) // Now that we've released the lock, we need to wake up any waiters. l.blockedQueue.Notify(EventMaskAll) } // makeLock returns a new typed Lock that has either uid as its only reader // or uid as its only writer. func makeLock(uid UniqueID, ownerPID int32, t LockType) Lock { value := Lock{Readers: make(map[UniqueID]OwnerInfo)} switch t { case ReadLock: value.Readers[uid] = OwnerInfo{PID: ownerPID} case WriteLock: value.Writer = uid value.WriterInfo = OwnerInfo{PID: ownerPID} default: panic(fmt.Sprintf("makeLock: invalid lock type %d", t)) } return value } // isHeld returns true if uid is a holder of Lock. func (l Lock) isHeld(uid UniqueID) bool { if _, ok := l.Readers[uid]; ok { return true } return l.Writer == uid } // lock sets uid as a holder of a typed lock on Lock. // // Preconditions: canLock is true for the range containing this Lock. func (l *Lock) lock(uid UniqueID, ownerPID int32, t LockType) { switch t { case ReadLock: // If we are already a reader, then this is a no-op. if _, ok := l.Readers[uid]; ok { return } // We cannot downgrade a write lock to a read lock unless the // uid is the same.
if l.Writer != nil { if l.Writer != uid { panic(fmt.Sprintf("lock: cannot downgrade write lock to read lock for uid %d, writer is %d", uid, l.Writer)) } // Ensure that there is only one reader if upgrading. l.Readers = make(map[UniqueID]OwnerInfo) // Ensure that there is no longer a writer. l.Writer = nil } l.Readers[uid] = OwnerInfo{PID: ownerPID} return case WriteLock: // If we are already the writer, then this is a no-op. if l.Writer == uid { return } // We can only upgrade a read lock to a write lock if there // is only one reader and that reader has the same uid as // the write lock. if readers := len(l.Readers); readers > 0 { if readers != 1 { panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, too many readers %v", uid, l.Readers)) } if _, ok := l.Readers[uid]; !ok { panic(fmt.Sprintf("lock: cannot upgrade read lock to write lock for uid %d, conflicting reader %v", uid, l.Readers)) } } // Ensure that there is only a writer. l.Readers = make(map[UniqueID]OwnerInfo) l.Writer = uid l.WriterInfo = OwnerInfo{PID: ownerPID} default: panic(fmt.Sprintf("lock: invalid lock type %d", t)) } } // lockable returns true if check returns true for every Lock in LockRange. // Further, check should return true if Lock meets the caller's requirements // for locking Lock. func (l LockSet) lockable(r LockRange, check func(value Lock) bool) bool { // Get our starting point. seg := l.LowerBoundSegment(r.Start) for seg.Ok() && seg.Start() < r.End { // Note that we don't care about overrunning the end of the // last segment because if everything checks out we'll just // split the last segment. if !check(seg.Value()) { return false } // Jump to the next segment, ignoring gaps, for the same // reason we ignored the first gap. seg = seg.NextSegment() } // No conflict, we can get a lock for uid over the entire range. return true } // canLock returns true if uid will be able to take a Lock of type t on the // entire range specified by LockRange. func (l LockSet) canLock(uid UniqueID, t LockType, r LockRange) bool { switch t { case ReadLock: return l.lockable(r, func(value Lock) bool { // If there is no writer, there's no problem adding another reader. if value.Writer == nil { return true } // If there is a writer, then it must be the same uid // in order to downgrade the lock to a read lock. return value.Writer == uid }) case WriteLock: return l.lockable(r, func(value Lock) bool { // If there are only readers. if value.Writer == nil { // Then this uid can only take a write lock if this is a private // upgrade, meaning that the only reader is uid. return value.isOnlyReader(uid) } // If the uid is already a writer on this region, then // adding a write lock would be a no-op. return value.Writer == uid }) default: panic(fmt.Sprintf("canLock: invalid lock type %d", t)) } } func (l *Lock) isOnlyReader(uid UniqueID) bool { if len(l.Readers) != 1 { return false } _, ok := l.Readers[uid] return ok } // lock returns true if uid took a lock of type t on the entire range of // LockRange. // // Preconditions: r.Start <= r.End (will panic otherwise). func (l *LockSet) lock(uid UniqueID, ownerPID int32, t LockType, r LockRange) bool { if r.Start > r.End { panic(fmt.Sprintf("lock: r.Start %d > r.End %d", r.Start, r.End)) } // Don't attempt to insert anything with a range of 0 and treat this // as a successful no-op. if r.Length() == 0 { return true } // Do a first-pass check.
We *could* hold onto the segments we // checked if canLock would return true, but traversing the segment // set should be fast and this keeps things simple. if !l.canLock(uid, t, r) { return false } // Get our starting point. seg, gap := l.Find(r.Start) if gap.Ok() { // Fill in the gap and get the next segment to modify. seg = l.Insert(gap, gap.Range().Intersect(r), makeLock(uid, ownerPID, t)).NextSegment() } else if seg.Start() < r.Start { // Get our first segment to modify. _, seg = l.Split(seg, r.Start) } for seg.Ok() && seg.Start() < r.End { // Split the last one if necessary. if seg.End() > r.End { seg, _ = l.SplitUnchecked(seg, r.End) } // Set the lock on the segment. This is guaranteed to // always be safe, given canLock above. value := seg.ValuePtr() value.lock(uid, ownerPID, t) // Fill subsequent gaps. gap = seg.NextGap() if gr := gap.Range().Intersect(r); gr.Length() > 0 { seg = l.Insert(gap, gr, makeLock(uid, ownerPID, t)).NextSegment() } else { seg = gap.NextSegment() } } return true } // unlock is always successful. If uid has no locks held for the range LockRange, // unlock is a no-op. // // Preconditions: same as lock. func (l *LockSet) unlock(uid UniqueID, r LockRange) { if r.Start > r.End { panic(fmt.Sprintf("unlock: r.Start %d > r.End %d", r.Start, r.End)) } // As in lock, a zero-length range is a no-op. if r.Length() == 0 { return } // Get our starting point. seg := l.LowerBoundSegment(r.Start) for seg.Ok() && seg.Start() < r.End { // If this segment doesn't have a lock from uid then // there is no need to fragment the set with Isolate (below). // In this case just move on to the next segment. if !seg.Value().isHeld(uid) { seg = seg.NextSegment() continue } // Ensure that if we need to unlock a sub-segment that // we don't unlock/remove that entire segment. seg = l.Isolate(seg, r) value := seg.Value() var remove bool if value.Writer == uid { // If we are unlocking a writer, then since there can // only ever be one writer and no readers, then this // lock should always be removed from the set. remove = true } else if _, ok := value.Readers[uid]; ok { // If uid is the last reader, then just remove the entire // segment. if len(value.Readers) == 1 { remove = true } else { // Otherwise we need to remove this reader without // affecting any other segment's readers. To do // this, we need to make a copy of the Readers map // and not add this uid. newValue := Lock{Readers: make(map[UniqueID]OwnerInfo)} for k, v := range value.Readers { if k != uid { newValue.Readers[k] = v } } seg.SetValue(newValue) } } if remove { seg = l.Remove(seg).NextSegment() } else { seg = seg.NextSegment() } } } // ComputeRange takes a positive file offset and computes the start of a LockRange // using start (relative to offset) and the end of the LockRange using length. The // values of start and length may be negative but the resulting LockRange must // preserve that LockRange.Start < LockRange.End and LockRange.Start >= 0. func ComputeRange(start, length, offset int64) (LockRange, error) { offset += start // fcntl(2): "l_start can be a negative number provided the offset // does not lie before the start of the file" if offset < 0 { return LockRange{}, unix.EINVAL } // fcntl(2): Specifying 0 for l_len has the special meaning: lock all // bytes starting at the location specified by l_whence and l_start // through to the end of file, no matter how large the file grows. end := uint64(LockEOF) if length > 0 { // fcntl(2): If l_len is positive, then the range to be locked // covers bytes l_start up to and including l_start+l_len-1.
// // Since LockRange.End is exclusive we need not subtract 1 from length. end = uint64(offset + length) } else if length < 0 { // fcntl(2): If l_len is negative, the interval described by // the lock covers bytes l_start+l_len up to and including l_start-1. // // Since LockRange.End is exclusive we need not subtract 1 from offset. signedEnd := offset // Add to offset using a negative length (subtract). offset += length if offset < 0 { return LockRange{}, unix.EINVAL } if signedEnd < offset { return LockRange{}, unix.EOVERFLOW } // At this point signedEnd cannot be negative, // since we asserted that offset is not negative // and it is not less than offset. end = uint64(signedEnd) } // Offset is guaranteed to be non-negative at this point. return LockRange{Start: uint64(offset), End: end}, nil } // TestRegion checks whether the lock holder identified by uid can hold a lock // of type t on range r. It returns a Flock struct representing this // information as the F_GETLK fcntl does. // // Note that the PID returned in the flock structure is relative to the root PID // namespace. It needs to be converted to the caller's PID namespace before // returning to userspace. // // TODO(gvisor.dev/issue/5264): we don't support OFD locks through fcntl, which // would return a struct with pid = -1. func (l *Locks) TestRegion(ctx context.Context, uid UniqueID, t LockType, r LockRange) linux.Flock { f := linux.Flock{Type: linux.F_UNLCK} switch t { case ReadLock: l.testRegion(r, func(lock Lock, start, length uint64) bool { if lock.Writer == nil || lock.Writer == uid { return true } f.Type = linux.F_WRLCK f.PID = lock.WriterInfo.PID f.Start = int64(start) f.Len = int64(length) return false }) case WriteLock: l.testRegion(r, func(lock Lock, start, length uint64) bool { if lock.Writer == nil { for k, v := range lock.Readers { if k != uid { // Stop at the first conflict detected. f.Type = linux.F_RDLCK f.PID = v.PID f.Start = int64(start) f.Len = int64(length) return false } } return true } if lock.Writer == uid { return true } f.Type = linux.F_WRLCK f.PID = lock.WriterInfo.PID f.Start = int64(start) f.Len = int64(length) return false }) default: panic(fmt.Sprintf("TestRegion: invalid lock type %d", t)) } return f } func (l *Locks) testRegion(r LockRange, check func(lock Lock, start, length uint64) bool) { l.mu.Lock() defer l.mu.Unlock() seg := l.locks.LowerBoundSegment(r.Start) for seg.Ok() && seg.Start() < r.End { lock := seg.Value() if !check(lock, seg.Start(), seg.End()-seg.Start()) { // Stop at the first conflict detected. return } seg = seg.NextSegment() } }
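// The arithmetic in ComputeRange above is easiest to see with concrete
// values. The following is a sketch (a hypothetical helper, not used by the
// package) of how the fcntl(2) rules map l_start/l_len onto LockRange, with
// the results the code above produces:
func exampleComputeRange() {
	// l_start=100, l_len=50 at whence offset 0 locks bytes [100, 150):
	// LockRange.End is exclusive, so no -1 is needed.
	r, _ := ComputeRange(100, 50, 0) // LockRange{Start: 100, End: 150}

	// l_len=0 locks from l_start through EOF, however large the file grows.
	r, _ = ComputeRange(100, 0, 0) // LockRange{Start: 100, End: LockEOF}

	// A negative l_len locks the bytes before l_start: [70, 100).
	r, _ = ComputeRange(100, -30, 0) // LockRange{Start: 70, End: 100}
	_ = r
}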
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "reflect" "sync/atomic" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) type linkResolver struct { resolver LinkAddressResolver neigh neighborCache } func (l *linkResolver) getNeighborLinkAddress(addr, localAddr tcpip.Address, onResolve func(LinkResolutionResult)) (tcpip.LinkAddress, <-chan struct{}, tcpip.Error) { entry, ch, err := l.neigh.entry(addr, localAddr, onResolve) return entry.LinkAddr, ch, err } func (l *linkResolver) confirmReachable(addr tcpip.Address) { l.neigh.handleUpperLevelConfirmation(addr) } var _ NetworkInterface = (*nic)(nil) // nic represents a "network interface card" to which the networking stack is // attached. type nic struct { LinkEndpoint stack *Stack id tcpip.NICID name string context NICContext stats sharedStats // The network endpoints themselves may be modified by calling the interface's // methods, but the map reference and entries must be constant. networkEndpoints map[tcpip.NetworkProtocolNumber]NetworkEndpoint linkAddrResolvers map[tcpip.NetworkProtocolNumber]*linkResolver duplicateAddressDetectors map[tcpip.NetworkProtocolNumber]DuplicateAddressDetector // enabled is set to 1 when the NIC is enabled and 0 when it is disabled. // // Must be accessed using atomic operations. enabled uint32 // linkResQueue holds packets that are waiting for link resolution to // complete. linkResQueue packetsPendingLinkResolution mu struct { sync.RWMutex spoofing bool promiscuous bool // packetEPs is protected by mu, but the contained packetEndpointList are // not. packetEPs map[tcpip.NetworkProtocolNumber]*packetEndpointList } } // makeNICStats initializes the NIC statistics and associates them to the global // NIC statistics. func makeNICStats(global tcpip.NICStats) sharedStats { var stats sharedStats tcpip.InitStatCounters(reflect.ValueOf(&stats.local).Elem()) stats.init(&stats.local, &global) return stats } type packetEndpointList struct { mu sync.RWMutex // eps is protected by mu, but the contained PacketEndpoint values are not.
eps []PacketEndpoint } func (p *packetEndpointList) add(ep PacketEndpoint) { p.mu.Lock() defer p.mu.Unlock() p.eps = append(p.eps, ep) } func (p *packetEndpointList) remove(ep PacketEndpoint) { p.mu.Lock() defer p.mu.Unlock() for i, epOther := range p.eps { if epOther == ep { p.eps = append(p.eps[:i], p.eps[i+1:]...) break } } } // forEach calls fn with each endpoint in p while holding the read lock on p. func (p *packetEndpointList) forEach(fn func(PacketEndpoint)) { p.mu.RLock() defer p.mu.RUnlock() for _, ep := range p.eps { fn(ep) } } // newNIC returns a new NIC using the default NDP configurations from stack. func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICContext) *nic { // TODO(b/141011931): Validate that a LinkEndpoint (ep) is valid. For // example, make sure that the link address it provides is a valid // unicast ethernet address. // TODO(b/143357959): RFC 8200 section 5 requires that IPv6 endpoints // observe an MTU of at least 1280 bytes. Ensure that this requirement // of IPv6 is supported on this endpoint's LinkEndpoint. nic := &nic{ LinkEndpoint: ep, stack: stack, id: id, name: name, context: ctx, stats: makeNICStats(stack.Stats().NICs), networkEndpoints: make(map[tcpip.NetworkProtocolNumber]NetworkEndpoint), linkAddrResolvers: make(map[tcpip.NetworkProtocolNumber]*linkResolver), duplicateAddressDetectors: make(map[tcpip.NetworkProtocolNumber]DuplicateAddressDetector), } nic.linkResQueue.init(nic) nic.mu.packetEPs = make(map[tcpip.NetworkProtocolNumber]*packetEndpointList) resolutionRequired := ep.Capabilities()&CapabilityResolutionRequired != 0 // Register supported packet and network endpoint protocols. for _, netProto := range header.Ethertypes { nic.mu.packetEPs[netProto] = new(packetEndpointList) } for _, netProto := range stack.networkProtocols { netNum := netProto.Number() nic.mu.packetEPs[netNum] = new(packetEndpointList) netEP := netProto.NewEndpoint(nic, nic) nic.networkEndpoints[netNum] = netEP if resolutionRequired { if r, ok := netEP.(LinkAddressResolver); ok { l := &linkResolver{resolver: r} l.neigh.init(nic, r) nic.linkAddrResolvers[r.LinkAddressProtocol()] = l } } if d, ok := netEP.(DuplicateAddressDetector); ok { nic.duplicateAddressDetectors[d.DuplicateAddressProtocol()] = d } } nic.LinkEndpoint.Attach(nic) return nic } func (n *nic) getNetworkEndpoint(proto tcpip.NetworkProtocolNumber) NetworkEndpoint { return n.networkEndpoints[proto] } // Enabled implements NetworkInterface. func (n *nic) Enabled() bool { return atomic.LoadUint32(&n.enabled) == 1 } // setEnabled sets the enabled status for the NIC. // // Returns true if the enabled status was updated. func (n *nic) setEnabled(v bool) bool { if v { return atomic.SwapUint32(&n.enabled, 1) == 0 } return atomic.SwapUint32(&n.enabled, 0) == 1 } // disable disables n. // // It undoes the work done by enable. func (n *nic) disable() { n.mu.Lock() n.disableLocked() n.mu.Unlock() } // disableLocked disables n. // // It undoes the work done by enable. // // n MUST be locked. func (n *nic) disableLocked() { if !n.Enabled() { return } // TODO(gvisor.dev/issue/1491): Should Routes that are currently bound to n be // invalidated? Currently, Routes will continue to work when a NIC is enabled // again, and applications may not know that the underlying NIC was ever // disabled. for _, ep := range n.networkEndpoints { ep.Disable() // Clear the neighbour table (including static entries) as we cannot // guarantee that the current neighbour table will be valid when the NIC is // enabled again.
// // This matches Linux's behaviour at the time of writing: // https://github.com/torvalds/linux/blob/71c061d2443814de15e177489d5cc00a4a253ef3/net/core/neighbour.c#L371 netProto := ep.NetworkProtocolNumber() switch err := n.clearNeighbors(netProto); err.(type) { case nil, *tcpip.ErrNotSupported: default: panic(fmt.Sprintf("n.clearNeighbors(%d): %s", netProto, err)) } } if !n.setEnabled(false) { panic("should have only done work to disable the NIC if it was enabled") } } // enable enables n. // // If the stack has IPv6 enabled, enable will join the IPv6 All-Nodes Multicast // address (ff02::1), start DAD for permanent addresses, and start soliciting // routers if the stack is not operating as a router. If the stack is also // configured to auto-generate a link-local address, one will be generated. func (n *nic) enable() tcpip.Error { n.mu.Lock() defer n.mu.Unlock() if !n.setEnabled(true) { return nil } for _, ep := range n.networkEndpoints { if err := ep.Enable(); err != nil { return err } } return nil } // remove detaches the NIC from the link endpoint and releases network endpoint // resources. This guarantees that no packets pass between this NIC and the // network stack. func (n *nic) remove() tcpip.Error { n.mu.Lock() defer n.mu.Unlock() n.disableLocked() for _, ep := range n.networkEndpoints { ep.Close() } // Detach from link endpoint, so no packet comes in. n.LinkEndpoint.Attach(nil) return nil } // setPromiscuousMode enables or disables promiscuous mode. func (n *nic) setPromiscuousMode(enable bool) { n.mu.Lock() n.mu.promiscuous = enable n.mu.Unlock() } // Promiscuous implements NetworkInterface. func (n *nic) Promiscuous() bool { n.mu.RLock() rv := n.mu.promiscuous n.mu.RUnlock() return rv } // IsLoopback implements NetworkInterface. func (n *nic) IsLoopback() bool { return n.LinkEndpoint.Capabilities()&CapabilityLoopback != 0 } // WritePacket implements NetworkLinkEndpoint. func (n *nic) WritePacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) tcpip.Error { _, err := n.enqueuePacketBuffer(r, protocol, pkt) return err } func (n *nic) writePacketBuffer(r RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt pendingPacketBuffer) (int, tcpip.Error) { switch pkt := pkt.(type) { case *PacketBuffer: if err := n.writePacket(r, protocol, pkt); err != nil { return 0, err } return 1, nil case *PacketBufferList: return n.writePackets(r, protocol, *pkt) default: panic(fmt.Sprintf("unrecognized pending packet buffer type = %T", pkt)) } } func (n *nic) enqueuePacketBuffer(r *Route, protocol tcpip.NetworkProtocolNumber, pkt pendingPacketBuffer) (int, tcpip.Error) { routeInfo, _, err := r.resolvedFields(nil) switch err.(type) { case nil: return n.writePacketBuffer(routeInfo, protocol, pkt) case *tcpip.ErrWouldBlock: // As per relevant RFCs, we should queue packets while we wait for link // resolution to complete. // // RFC 1122 section 2.3.2.2 (for IPv4): // The link layer SHOULD save (rather than discard) at least // one (the latest) packet of each set of packets destined to // the same unresolved IP address, and transmit the saved // packet when the address has been resolved. // // RFC 4861 section 7.2.2 (for IPv6): // While waiting for address resolution to complete, the sender MUST, for // each neighbor, retain a small queue of packets waiting for address // resolution to complete. The queue MUST hold at least one packet, and // MAY contain more. However, the number of queued packets per neighbor // SHOULD be limited to some small value.
When a queue overflows, the new // arrival SHOULD replace the oldest entry. Once address resolution // completes, the node transmits any queued packets. return n.linkResQueue.enqueue(r, protocol, pkt) default: return 0, err } } // WritePacketToRemote implements NetworkInterface. func (n *nic) WritePacketToRemote(remoteLinkAddr tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) tcpip.Error { var r RouteInfo r.NetProto = protocol r.RemoteLinkAddress = remoteLinkAddr return n.writePacket(r, protocol, pkt) } func (n *nic) writePacket(r RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) tcpip.Error { // WritePacket takes ownership of pkt, so calculate numBytes first. numBytes := pkt.Size() pkt.EgressRoute = r pkt.NetworkProtocolNumber = protocol if err := n.LinkEndpoint.WritePacket(r, protocol, pkt); err != nil { return err } n.stats.tx.packets.Increment() n.stats.tx.bytes.IncrementBy(uint64(numBytes)) return nil } // WritePackets implements NetworkLinkEndpoint. func (n *nic) WritePackets(r *Route, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, tcpip.Error) { return n.enqueuePacketBuffer(r, protocol, &pkts) } func (n *nic) writePackets(r RouteInfo, protocol tcpip.NetworkProtocolNumber, pkts PacketBufferList) (int, tcpip.Error) { for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { pkt.EgressRoute = r pkt.NetworkProtocolNumber = protocol } writtenPackets, err := n.LinkEndpoint.WritePackets(r, pkts, protocol) n.stats.tx.packets.IncrementBy(uint64(writtenPackets)) writtenBytes := 0 for i, pb := 0, pkts.Front(); i < writtenPackets && pb != nil; i, pb = i+1, pb.Next() { writtenBytes += pb.Size() } n.stats.tx.bytes.IncrementBy(uint64(writtenBytes)) return writtenPackets, err } // setSpoofing enables or disables address spoofing. func (n *nic) setSpoofing(enable bool) { n.mu.Lock() n.mu.spoofing = enable n.mu.Unlock() } // Spoofing implements NetworkInterface. func (n *nic) Spoofing() bool { n.mu.RLock() defer n.mu.RUnlock() return n.mu.spoofing } // primaryEndpoint returns an address endpoint that can be used to communicate // with remoteAddr. func (n *nic) primaryEndpoint(protocol tcpip.NetworkProtocolNumber, remoteAddr tcpip.Address) AssignableAddressEndpoint { ep, ok := n.networkEndpoints[protocol] if !ok { return nil } addressableEndpoint, ok := ep.(AddressableEndpoint) if !ok { return nil } n.mu.RLock() spoofing := n.mu.spoofing n.mu.RUnlock() return addressableEndpoint.AcquireOutgoingPrimaryAddress(remoteAddr, spoofing) } type getAddressBehaviour int const ( // spoofing indicates that the NIC's spoofing flag should be observed when // getting a NIC's address endpoint. spoofing getAddressBehaviour = iota // promiscuous indicates that the NIC's promiscuous flag should be observed // when getting a NIC's address endpoint. promiscuous ) func (n *nic) getAddress(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) AssignableAddressEndpoint { return n.getAddressOrCreateTemp(protocol, dst, CanBePrimaryEndpoint, promiscuous) } func (n *nic) hasAddress(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { ep := n.getAddressOrCreateTempInner(protocol, addr, false, NeverPrimaryEndpoint) if ep != nil { ep.DecRef() return true } return false } // findEndpoint finds the endpoint, if any, with the given address.
func (n *nic) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) AssignableAddressEndpoint { return n.getAddressOrCreateTemp(protocol, address, peb, spoofing) } // getAddressOrCreateTemp returns the address endpoint for the given protocol // and address. // // If none exists a temporary one may be created if we are in promiscuous mode // or spoofing. Promiscuous mode will only be checked if promiscuous is true. // Similarly, spoofing will only be checked if spoofing is true. // // If the address is the IPv4 broadcast address for an endpoint's network, that // endpoint will be returned. func (n *nic) getAddressOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior, tempRef getAddressBehaviour) AssignableAddressEndpoint { n.mu.RLock() var spoofingOrPromiscuous bool switch tempRef { case spoofing: spoofingOrPromiscuous = n.mu.spoofing case promiscuous: spoofingOrPromiscuous = n.mu.promiscuous } n.mu.RUnlock() return n.getAddressOrCreateTempInner(protocol, address, spoofingOrPromiscuous, peb) } // getAddressOrCreateTempInner is like getAddressOrCreateTemp except a boolean // is passed to indicate whether or not we should generate temporary endpoints. func (n *nic) getAddressOrCreateTempInner(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, createTemp bool, peb PrimaryEndpointBehavior) AssignableAddressEndpoint { ep, ok := n.networkEndpoints[protocol] if !ok { return nil } addressableEndpoint, ok := ep.(AddressableEndpoint) if !ok { return nil } return addressableEndpoint.AcquireAssignedAddress(address, createTemp, peb) } // addAddress adds a new address to n, so that it starts accepting packets // targeted at the given address (and network protocol). func (n *nic) addAddress(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) tcpip.Error { ep, ok := n.networkEndpoints[protocolAddress.Protocol] if !ok { return &tcpip.ErrUnknownProtocol{} } addressableEndpoint, ok := ep.(AddressableEndpoint) if !ok { return &tcpip.ErrNotSupported{} } addressEndpoint, err := addressableEndpoint.AddAndAcquirePermanentAddress(protocolAddress.AddressWithPrefix, peb, AddressConfigStatic, false /* deprecated */) if err == nil { // We have no need for the address endpoint. addressEndpoint.DecRef() } return err } // allPermanentAddresses returns all permanent addresses associated with // this NIC. func (n *nic) allPermanentAddresses() []tcpip.ProtocolAddress { var addrs []tcpip.ProtocolAddress for p, ep := range n.networkEndpoints { addressableEndpoint, ok := ep.(AddressableEndpoint) if !ok { continue } for _, a := range addressableEndpoint.PermanentAddresses() { addrs = append(addrs, tcpip.ProtocolAddress{Protocol: p, AddressWithPrefix: a}) } } return addrs } // primaryAddresses returns the primary addresses associated with this NIC. func (n *nic) primaryAddresses() []tcpip.ProtocolAddress { var addrs []tcpip.ProtocolAddress for p, ep := range n.networkEndpoints { addressableEndpoint, ok := ep.(AddressableEndpoint) if !ok { continue } for _, a := range addressableEndpoint.PrimaryAddresses() { addrs = append(addrs, tcpip.ProtocolAddress{Protocol: p, AddressWithPrefix: a}) } } return addrs } // PrimaryAddress implements NetworkInterface.
func (n *nic) PrimaryAddress(proto tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, tcpip.Error) { ep, ok := n.networkEndpoints[proto] if !ok { return tcpip.AddressWithPrefix{}, &tcpip.ErrUnknownProtocol{} } addressableEndpoint, ok := ep.(AddressableEndpoint) if !ok { return tcpip.AddressWithPrefix{}, &tcpip.ErrNotSupported{} } return addressableEndpoint.MainAddress(), nil } // removeAddress removes an address from n. func (n *nic) removeAddress(addr tcpip.Address) tcpip.Error { for _, ep := range n.networkEndpoints { addressableEndpoint, ok := ep.(AddressableEndpoint) if !ok { continue } switch err := addressableEndpoint.RemovePermanentAddress(addr); err.(type) { case *tcpip.ErrBadLocalAddress: continue default: return err } } return &tcpip.ErrBadLocalAddress{} } func (n *nic) getLinkAddress(addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, onResolve func(LinkResolutionResult)) tcpip.Error { linkRes, ok := n.linkAddrResolvers[protocol] if !ok { return &tcpip.ErrNotSupported{} } if linkAddr, ok := linkRes.resolver.ResolveStaticAddress(addr); ok { onResolve(LinkResolutionResult{LinkAddress: linkAddr, Err: nil}) return nil } _, _, err := linkRes.getNeighborLinkAddress(addr, localAddr, onResolve) return err } func (n *nic) neighbors(protocol tcpip.NetworkProtocolNumber) ([]NeighborEntry, tcpip.Error) { if linkRes, ok := n.linkAddrResolvers[protocol]; ok { return linkRes.neigh.entries(), nil } return nil, &tcpip.ErrNotSupported{} } func (n *nic) addStaticNeighbor(addr tcpip.Address, protocol tcpip.NetworkProtocolNumber, linkAddress tcpip.LinkAddress) tcpip.Error { if linkRes, ok := n.linkAddrResolvers[protocol]; ok { linkRes.neigh.addStaticEntry(addr, linkAddress) return nil } return &tcpip.ErrNotSupported{} } func (n *nic) removeNeighbor(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.Error { if linkRes, ok := n.linkAddrResolvers[protocol]; ok { if !linkRes.neigh.removeEntry(addr) { return &tcpip.ErrBadAddress{} } return nil } return &tcpip.ErrNotSupported{} } func (n *nic) clearNeighbors(protocol tcpip.NetworkProtocolNumber) tcpip.Error { if linkRes, ok := n.linkAddrResolvers[protocol]; ok { linkRes.neigh.clear() return nil } return &tcpip.ErrNotSupported{} } // joinGroup adds a new endpoint for the given multicast address, if none // exists yet. Otherwise it just increments its count. func (n *nic) joinGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.Error { // TODO(b/143102137): When implementing MLD, make sure MLD packets are // not sent unless a valid link-local address is available for use on n // as an MLD packet's source address must be a link-local address as // outlined in RFC 3810 section 5. ep, ok := n.networkEndpoints[protocol] if !ok { return &tcpip.ErrNotSupported{} } gep, ok := ep.(GroupAddressableEndpoint) if !ok { return &tcpip.ErrNotSupported{} } return gep.JoinGroup(addr) } // leaveGroup decrements the count for the given multicast address, and when it // reaches zero removes the endpoint for this address. func (n *nic) leaveGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.Error { ep, ok := n.networkEndpoints[protocol] if !ok { return &tcpip.ErrNotSupported{} } gep, ok := ep.(GroupAddressableEndpoint) if !ok { return &tcpip.ErrNotSupported{} } return gep.LeaveGroup(addr) } // isInGroup returns true if n has joined the multicast group addr. 
func (n *nic) isInGroup(addr tcpip.Address) bool { for _, ep := range n.networkEndpoints { gep, ok := ep.(GroupAddressableEndpoint) if !ok { continue } if gep.IsInGroup(addr) { return true } } return false } // DeliverNetworkPacket finds the appropriate network protocol endpoint and // hands the packet over for further processing. This function is called when // the NIC receives a packet from the link endpoint. // Note that the ownership of the slice backing the packet's data is retained // by the caller. This rule applies only to the slice itself, not to the items // of the slice; the ownership of the items is not retained by the caller. func (n *nic) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { n.mu.RLock() enabled := n.Enabled() // If the NIC is not yet enabled, don't receive any packets. if !enabled { n.mu.RUnlock() n.stats.disabledRx.packets.Increment() n.stats.disabledRx.bytes.IncrementBy(uint64(pkt.Data().Size())) return } n.stats.rx.packets.Increment() n.stats.rx.bytes.IncrementBy(uint64(pkt.Data().Size())) networkEndpoint, ok := n.networkEndpoints[protocol] if !ok { n.mu.RUnlock() n.stats.unknownL3ProtocolRcvdPackets.Increment() return } // If no local link layer address is provided, assume it was sent // directly to this NIC. if local == "" { local = n.LinkEndpoint.LinkAddress() } pkt.RXTransportChecksumValidated = n.LinkEndpoint.Capabilities()&CapabilityRXChecksumOffload != 0 // Are any packet type sockets listening for this network protocol? protoEPs := n.mu.packetEPs[protocol] // Other packet type sockets that are listening for all protocols. anyEPs := n.mu.packetEPs[header.EthernetProtocolAll] n.mu.RUnlock() // Deliver to interested packet endpoints without holding NIC lock. deliverPacketEPs := func(ep PacketEndpoint) { p := pkt.Clone() p.PktType = tcpip.PacketHost ep.HandlePacket(n.id, local, protocol, p) } if protoEPs != nil { protoEPs.forEach(deliverPacketEPs) } if anyEPs != nil { anyEPs.forEach(deliverPacketEPs) } networkEndpoint.HandlePacket(pkt) } // DeliverOutboundPacket implements NetworkDispatcher.DeliverOutboundPacket. func (n *nic) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { n.mu.RLock() // We do not deliver to protocol specific packet endpoints as on Linux // only ETH_P_ALL endpoints get outbound packets. // Add any other packet sockets that may be listening for all protocols. eps := n.mu.packetEPs[header.EthernetProtocolAll] n.mu.RUnlock() eps.forEach(func(ep PacketEndpoint) { p := pkt.Clone() p.PktType = tcpip.PacketOutgoing // Add the link layer header as outgoing packets are intercepted // before the link layer header is created. n.LinkEndpoint.AddHeader(local, remote, protocol, p) ep.HandlePacket(n.id, local, protocol, p) }) } // DeliverTransportPacket delivers the packets to the appropriate transport // protocol endpoint. func (n *nic) DeliverTransportPacket(protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) TransportPacketDisposition { state, ok := n.stack.transportProtocols[protocol] if !ok { n.stats.unknownL4ProtocolRcvdPackets.Increment() return TransportPacketProtocolUnreachable } transProto := state.proto // TransportHeader is empty only when pkt is an ICMP packet or was reassembled // from fragments. if pkt.TransportHeader().View().IsEmpty() { // ICMP packets don't have their TransportHeader fields set yet, parse it // here. See icmp/protocol.go:protocol.Parse for a full explanation.
// DeliverTransportPacket delivers the packets to the appropriate transport
// protocol endpoint.
func (n *nic) DeliverTransportPacket(protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) TransportPacketDisposition {
	state, ok := n.stack.transportProtocols[protocol]
	if !ok {
		n.stats.unknownL4ProtocolRcvdPackets.Increment()
		return TransportPacketProtocolUnreachable
	}

	transProto := state.proto

	// TransportHeader is empty only when pkt is an ICMP packet or was
	// reassembled from fragments.
	if pkt.TransportHeader().View().IsEmpty() {
		// ICMP packets don't have their TransportHeader fields set yet, parse
		// it here. See icmp/protocol.go:protocol.Parse for a full explanation.
		if protocol == header.ICMPv4ProtocolNumber || protocol == header.ICMPv6ProtocolNumber {
			// ICMP packets may be longer, but until icmp.Parse is implemented,
			// here we parse it using the minimum size.
			if _, ok := pkt.TransportHeader().Consume(transProto.MinimumPacketSize()); !ok {
				n.stats.malformedL4RcvdPackets.Increment()
				// We consider a malformed transport packet handled because
				// there is nothing the caller can do.
				return TransportPacketHandled
			}
		} else if !transProto.Parse(pkt) {
			n.stats.malformedL4RcvdPackets.Increment()
			return TransportPacketHandled
		}
	}

	srcPort, dstPort, err := transProto.ParsePorts(pkt.TransportHeader().View())
	if err != nil {
		n.stats.malformedL4RcvdPackets.Increment()
		return TransportPacketHandled
	}

	netProto, ok := n.stack.networkProtocols[pkt.NetworkProtocolNumber]
	if !ok {
		panic(fmt.Sprintf("expected network protocol = %d, have = %#v", pkt.NetworkProtocolNumber, n.stack.networkProtocolNumbers()))
	}

	src, dst := netProto.ParseAddresses(pkt.NetworkHeader().View())
	id := TransportEndpointID{
		LocalPort:     dstPort,
		LocalAddress:  dst,
		RemotePort:    srcPort,
		RemoteAddress: src,
	}
	if n.stack.demux.deliverPacket(protocol, pkt, id) {
		return TransportPacketHandled
	}

	// Try to deliver to per-stack default handler.
	if state.defaultHandler != nil {
		if state.defaultHandler(id, pkt) {
			return TransportPacketHandled
		}
	}

	// We could not find an appropriate destination for this packet so
	// give the protocol specific error handler a chance to handle it.
	// If it doesn't handle it then we should do so.
	switch res := transProto.HandleUnknownDestinationPacket(id, pkt); res {
	case UnknownDestinationPacketMalformed:
		n.stats.malformedL4RcvdPackets.Increment()
		return TransportPacketHandled
	case UnknownDestinationPacketUnhandled:
		return TransportPacketDestinationPortUnreachable
	case UnknownDestinationPacketHandled:
		return TransportPacketHandled
	default:
		panic(fmt.Sprintf("unrecognized result from HandleUnknownDestinationPacket = %d", res))
	}
}

// DeliverTransportError implements TransportDispatcher.
func (n *nic) DeliverTransportError(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, transErr TransportError, pkt *PacketBuffer) {
	state, ok := n.stack.transportProtocols[trans]
	if !ok {
		return
	}

	transProto := state.proto

	// ICMPv4 only guarantees that 8 bytes of the transport protocol will
	// be present in the payload. We know that the ports are within the
	// first 8 bytes for all known transport protocols.
	transHeader, ok := pkt.Data().PullUp(8)
	if !ok {
		return
	}

	srcPort, dstPort, err := transProto.ParsePorts(transHeader)
	if err != nil {
		return
	}

	id := TransportEndpointID{srcPort, local, dstPort, remote}
	if n.stack.demux.deliverError(n, net, trans, transErr, pkt, id) {
		return
	}
}

// DeliverRawPacket implements TransportDispatcher.
func (n *nic) DeliverRawPacket(protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) {
	// For ICMPv4 only we validate the header length for compatibility with
	// raw(7) ICMP_FILTER. The same check is made in Linux here:
	// https://github.com/torvalds/linux/blob/70585216/net/ipv4/raw.c#L189.
	if protocol == header.ICMPv4ProtocolNumber && pkt.TransportHeader().View().Size()+pkt.Data().Size() < header.ICMPv4MinimumSize {
		return
	}
	n.stack.demux.deliverRawPacket(protocol, pkt)
}

// ID implements NetworkInterface.
func (n *nic) ID() tcpip.NICID {
	return n.id
}

// Name implements NetworkInterface.
func (n *nic) Name() string {
	return n.name
}
// nudConfigs gets the NUD configurations for n.
func (n *nic) nudConfigs(protocol tcpip.NetworkProtocolNumber) (NUDConfigurations, tcpip.Error) {
	if linkRes, ok := n.linkAddrResolvers[protocol]; ok {
		return linkRes.neigh.config(), nil
	}

	return NUDConfigurations{}, &tcpip.ErrNotSupported{}
}

// setNUDConfigs sets the NUD configurations for n.
//
// Note: if c contains invalid NUD configuration values, the erroneous values
// are replaced with defaults.
func (n *nic) setNUDConfigs(protocol tcpip.NetworkProtocolNumber, c NUDConfigurations) tcpip.Error {
	if linkRes, ok := n.linkAddrResolvers[protocol]; ok {
		c.resetInvalidFields()
		linkRes.neigh.setConfig(c)
		return nil
	}

	return &tcpip.ErrNotSupported{}
}

func (n *nic) registerPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) tcpip.Error {
	n.mu.Lock()
	defer n.mu.Unlock()

	eps, ok := n.mu.packetEPs[netProto]
	if !ok {
		return &tcpip.ErrNotSupported{}
	}
	eps.add(ep)

	return nil
}

func (n *nic) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) {
	n.mu.Lock()
	defer n.mu.Unlock()

	eps, ok := n.mu.packetEPs[netProto]
	if !ok {
		return
	}
	eps.remove(ep)
}

// isValidForOutgoing returns true if the endpoint can be used to send out a
// packet. It requires the endpoint to not be marked expired (i.e., its
// address has been removed), unless the NIC is in spoofing mode or the
// endpoint is temporary.
func (n *nic) isValidForOutgoing(ep AssignableAddressEndpoint) bool {
	n.mu.RLock()
	spoofing := n.mu.spoofing
	n.mu.RUnlock()
	return n.Enabled() && ep.IsAssigned(spoofing)
}

// HandleNeighborProbe implements NetworkInterface.
func (n *nic) HandleNeighborProbe(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, linkAddr tcpip.LinkAddress) tcpip.Error {
	if l, ok := n.linkAddrResolvers[protocol]; ok {
		l.neigh.handleProbe(addr, linkAddr)
		return nil
	}

	return &tcpip.ErrNotSupported{}
}

// HandleNeighborConfirmation implements NetworkInterface.
func (n *nic) HandleNeighborConfirmation(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, linkAddr tcpip.LinkAddress, flags ReachabilityConfirmationFlags) tcpip.Error {
	if l, ok := n.linkAddrResolvers[protocol]; ok {
		l.neigh.handleConfirmation(addr, linkAddr, flags)
		return nil
	}

	return &tcpip.ErrNotSupported{}
}

// CheckLocalAddress implements NetworkInterface.
func (n *nic) CheckLocalAddress(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) bool {
	if n.Spoofing() {
		return true
	}

	if addressEndpoint := n.getAddressOrCreateTempInner(protocol, addr, false /* createTemp */, NeverPrimaryEndpoint); addressEndpoint != nil {
		addressEndpoint.DecRef()
		return true
	}

	return false
}

func (n *nic) checkDuplicateAddress(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, h DADCompletionHandler) (DADCheckAddressDisposition, tcpip.Error) {
	d, ok := n.duplicateAddressDetectors[protocol]
	if !ok {
		return 0, &tcpip.ErrNotSupported{}
	}

	return d.CheckDuplicateAddress(addr, h), nil
}

func (n *nic) setForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) tcpip.Error {
	ep := n.getNetworkEndpoint(protocol)
	if ep == nil {
		return &tcpip.ErrUnknownProtocol{}
	}

	forwardingEP, ok := ep.(ForwardingNetworkEndpoint)
	if !ok {
		return &tcpip.ErrNotSupported{}
	}

	forwardingEP.SetForwarding(enable)
	return nil
}

func (n *nic) forwarding(protocol tcpip.NetworkProtocolNumber) (bool, tcpip.Error) {
	ep := n.getNetworkEndpoint(protocol)
	if ep == nil {
		return false, &tcpip.ErrUnknownProtocol{}
	}

	forwardingEP, ok := ep.(ForwardingNetworkEndpoint)
	if !ok {
		return false, &tcpip.ErrNotSupported{}
	}

	return forwardingEP.Forwarding(), nil
}
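// Illustrative usage sketch (the helper name is an assumption): setForwarding
// distinguishes an unknown protocol from an endpoint that simply cannot
// forward, so callers can tell the two failure modes apart.
func exampleEnableIPv4Forwarding(n *nic) tcpip.Error {
	// Returns *tcpip.ErrUnknownProtocol if IPv4 isn't registered on this NIC,
	// and *tcpip.ErrNotSupported if the endpoint can't forward.
	return n.setForwarding(header.IPv4ProtocolNumber, true)
}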
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package proc

import (
	"bytes"
	"fmt"
	"io"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// "There is an (arbitrary) limit on the number of lines in the file. As at
// Linux 3.18, the limit is five lines." - user_namespaces(7)
const maxIDMapLines = 5

// getMM gets the kernel task's MemoryManager. No additional reference is
// taken on mm here. This is safe because MemoryManager.destroy is required to
// leave the MemoryManager in a state where it's still usable as a
// DynamicBytesSource.
func getMM(task *kernel.Task) *mm.MemoryManager {
	var tmm *mm.MemoryManager
	task.WithMuLocked(func(t *kernel.Task) {
		if mm := t.MemoryManager(); mm != nil {
			tmm = mm
		}
	})
	return tmm
}
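// Illustrative usage sketch (the caller name is an assumption): getMMIncRef,
// defined below, increments the MemoryManager's user count on success, so it
// must be paired with DecUsers, typically via defer.
func exampleWithTaskMM(ctx context.Context, task *kernel.Task) error {
	m, err := getMMIncRef(task)
	if err != nil {
		return err
	}
	defer m.DecUsers(ctx)
	// ... read from m while it is guaranteed to stay usable ...
	return nil
}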
// getMMIncRef returns t's MemoryManager. If getMMIncRef succeeds, the
// MemoryManager's users count is incremented, and must be decremented by the
// caller when it is no longer in use.
func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) {
	var m *mm.MemoryManager
	task.WithMuLocked(func(t *kernel.Task) {
		m = t.MemoryManager()
	})
	if m == nil || !m.IncUsers() {
		return nil, io.EOF
	}
	return m, nil
}

func checkTaskState(t *kernel.Task) error {
	switch t.ExitState() {
	case kernel.TaskExitZombie:
		return linuxerr.EACCES
	case kernel.TaskExitDead:
		return linuxerr.ESRCH
	}
	return nil
}

type bufferWriter struct {
	buf *bytes.Buffer
}

// WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns
// the number of bytes written. It may return a partial write without an
// error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not return
// a full write with an error (i.e. (srcs.NumBytes(), err) where err != nil).
func (w *bufferWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
	written := srcs.NumBytes()
	for !srcs.IsEmpty() {
		w.buf.Write(srcs.Head().ToSlice())
		srcs = srcs.Tail()
	}
	return written, nil
}

// auxvData implements vfs.DynamicBytesSource for /proc/[pid]/auxv.
//
// +stateify savable
type auxvData struct {
	kernfs.DynamicBytesFile

	task *kernel.Task
}

var _ dynamicInode = (*auxvData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	if d.task.ExitState() == kernel.TaskExitDead {
		return linuxerr.ESRCH
	}
	m, err := getMMIncRef(d.task)
	if err != nil {
		// Return empty file.
		return nil
	}
	defer m.DecUsers(ctx)

	auxv := m.Auxv()
	// Space for buffer with AT_NULL (0) terminator at the end.
	buf.Grow((len(auxv) + 1) * 16)
	for _, e := range auxv {
		var tmp [16]byte
		hostarch.ByteOrder.PutUint64(tmp[:8], e.Key)
		hostarch.ByteOrder.PutUint64(tmp[8:], uint64(e.Value))
		buf.Write(tmp[:])
	}
	var atNull [16]byte
	buf.Write(atNull[:])

	return nil
}

// execArgType enumerates the types of exec arguments that are exposed through
// proc.
type execArgType int

const (
	cmdlineDataArg execArgType = iota
	environDataArg
)

// cmdlineData implements vfs.DynamicBytesSource for /proc/[pid]/cmdline.
//
// +stateify savable
type cmdlineData struct {
	kernfs.DynamicBytesFile

	task *kernel.Task

	// arg is the type of exec argument this file contains.
	arg execArgType
}

var _ dynamicInode = (*cmdlineData)(nil)
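// Illustrative sketch (decodeAuxv is a hypothetical helper, not part of the
// original source): each auxv record emitted above is a pair of native-endian
// 64-bit words (key, value), terminated by an all-zero AT_NULL record.
func decodeAuxv(b []byte) map[uint64]uint64 {
	out := make(map[uint64]uint64)
	for len(b) >= 16 {
		key := hostarch.ByteOrder.Uint64(b[:8])
		if key == 0 { // AT_NULL terminator.
			break
		}
		out[key] = hostarch.ByteOrder.Uint64(b[8:16])
		b = b[16:]
	}
	return out
}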
// Generate implements vfs.DynamicBytesSource.Generate.
func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	if d.task.ExitState() == kernel.TaskExitDead {
		return linuxerr.ESRCH
	}
	m, err := getMMIncRef(d.task)
	if err != nil {
		// Return empty file.
		return nil
	}
	defer m.DecUsers(ctx)

	// Figure out the bounds of the exec arg we are trying to read.
	var ar hostarch.AddrRange
	switch d.arg {
	case cmdlineDataArg:
		ar = hostarch.AddrRange{
			Start: m.ArgvStart(),
			End:   m.ArgvEnd(),
		}
	case environDataArg:
		ar = hostarch.AddrRange{
			Start: m.EnvvStart(),
			End:   m.EnvvEnd(),
		}
	default:
		panic(fmt.Sprintf("unknown exec arg type %v", d.arg))
	}
	if ar.Start == 0 || ar.End == 0 {
		// Don't attempt to read before the start/end are set up.
		return io.EOF
	}

	// N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true
	// until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading
	// cmdline and environment").
	writer := &bufferWriter{buf: buf}
	if n, err := m.CopyInTo(ctx, hostarch.AddrRangeSeqOf(ar), writer, usermem.IOOpts{}); n == 0 || err != nil {
		// Nothing to copy or something went wrong.
		return err
	}

	// On Linux, if the NULL byte at the end of the argument vector has been
	// overwritten, it continues reading the environment vector as part of
	// the argument vector.
	if d.arg == cmdlineDataArg && buf.Bytes()[buf.Len()-1] != 0 {
		if end := bytes.IndexByte(buf.Bytes(), 0); end != -1 {
			// If we found a NULL character somewhere else in argv, truncate
			// the return up to the NULL terminator (including it).
			buf.Truncate(end)
			return nil
		}

		// There is no NULL terminator in the string; continue reading into
		// envp.
		arEnvv := hostarch.AddrRange{
			Start: m.EnvvStart(),
			End:   m.EnvvEnd(),
		}

		// Upstream limits the returned amount to one page of slop:
		// https://elixir.bootlin.com/linux/v4.20/source/fs/proc/base.c#L208
		// We'll return one page total between argv and envp because of the
		// above page restrictions.
		if buf.Len() >= hostarch.PageSize {
			// Returned at least one page already, nothing else to add.
			return nil
		}
		remaining := hostarch.PageSize - buf.Len()
		if int(arEnvv.Length()) > remaining {
			end, ok := arEnvv.Start.AddLength(uint64(remaining))
			if !ok {
				return linuxerr.EFAULT
			}
			arEnvv.End = end
		}
		if _, err := m.CopyInTo(ctx, hostarch.AddrRangeSeqOf(arEnvv), writer, usermem.IOOpts{}); err != nil {
			return err
		}

		// Linux will return envp up to and including the first NULL
		// character, so find it.
		envStart := int(ar.Length())
		if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 {
			buf.Truncate(envStart + nullIdx)
		}
	}

	return nil
}

// +stateify savable
type commInode struct {
	kernfs.DynamicBytesFile

	task *kernel.Task
}

func (fs *filesystem) newComm(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode {
	inode := &commInode{task: task}
	inode.DynamicBytesFile.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm)
	return inode
}

func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
	// This file can always be read or written by members of the same thread
	// group. See fs/proc/base.c:proc_tid_comm_permission.
	//
	// N.B. This check is currently a no-op as we don't yet support writing
	// and this file is world-readable anyway.
	t := kernel.TaskFromContext(ctx)
	if t != nil && t.ThreadGroup() == i.task.ThreadGroup() && !ats.MayExec() {
		return nil
	}

	return i.DynamicBytesFile.CheckPermissions(ctx, creds, ats)
}

// commData implements vfs.DynamicBytesSource for /proc/[pid]/comm.
//
// +stateify savable
type commData struct {
	kernfs.DynamicBytesFile

	task *kernel.Task
}

var _ dynamicInode = (*commData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	buf.WriteString(d.task.Name())
	buf.WriteString("\n")
	return nil
}

// idMapData implements vfs.WritableDynamicBytesSource for
// /proc/[pid]/{gid_map|uid_map}.
//
// +stateify savable
type idMapData struct {
	kernfs.DynamicBytesFile

	task *kernel.Task
	gids bool
}

var _ dynamicInode = (*idMapData)(nil)

// Generate implements vfs.WritableDynamicBytesSource.Generate.
func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	var entries []auth.IDMapEntry
	if d.gids {
		entries = d.task.UserNamespace().GIDMap()
	} else {
		entries = d.task.UserNamespace().UIDMap()
	}
	for _, e := range entries {
		fmt.Fprintf(buf, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length)
	}
	return nil
}
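// Illustrative sketch (the variable and its values are examples, not part of
// the original source): a single map line "0 1000 1" corresponds to this
// entry, mapping ID 0 inside the namespace to ID 1000 in the parent for a
// run of one ID.
var exampleIDMapEntry = auth.IDMapEntry{
	FirstID:       0,    // First ID inside the user namespace.
	FirstParentID: 1000, // First ID in the parent namespace.
	Length:        1,    // Number of consecutive IDs mapped.
}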
// Write implements vfs.WritableDynamicBytesSource.Write.
func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
	// "In addition, the number of bytes written to the file must be less than
	// the system page size, and the write must be performed at the start of
	// the file ..." - user_namespaces(7)
	srclen := src.NumBytes()
	if srclen >= hostarch.PageSize || offset != 0 {
		return 0, linuxerr.EINVAL
	}
	b := make([]byte, srclen)
	if _, err := src.CopyIn(ctx, b); err != nil {
		return 0, err
	}

	// Truncate from the first NULL byte.
	nul := int64(bytes.IndexByte(b, 0))
	if nul == -1 {
		nul = srclen
	}
	b = b[:nul]
	// Remove the last \n.
	if nul >= 1 && b[nul-1] == '\n' {
		b = b[:nul-1]
	}
	lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1)
	if len(lines) > maxIDMapLines {
		return 0, linuxerr.EINVAL
	}

	entries := make([]auth.IDMapEntry, len(lines))
	for i, l := range lines {
		var e auth.IDMapEntry
		_, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length)
		if err != nil {
			return 0, linuxerr.EINVAL
		}
		entries[i] = e
	}
	var err error
	if d.gids {
		err = d.task.UserNamespace().SetGIDMap(ctx, entries)
	} else {
		err = d.task.UserNamespace().SetUIDMap(ctx, entries)
	}
	if err != nil {
		return 0, err
	}

	// On success, Linux's kernel/user_namespace.c:map_write() always returns
	// count, even if fewer bytes were used.
	return int64(srclen), nil
}

var _ kernfs.Inode = (*memInode)(nil)

// memInode implements kernfs.Inode for /proc/[pid]/mem.
//
// +stateify savable
type memInode struct {
	kernfs.InodeAttrs
	kernfs.InodeNoStatFS
	kernfs.InodeNoopRefCount
	kernfs.InodeNotDirectory
	kernfs.InodeNotSymlink

	task  *kernel.Task
	locks vfs.FileLocks
}

func (fs *filesystem) newMemInode(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode {
	// Note: credentials are overridden by taskOwnedInode.
	inode := &memInode{task: task}
	inode.init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm)
	return &taskOwnedInode{Inode: inode, owner: task}
}

func (f *memInode) init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
	if perm&^linux.PermissionsMask != 0 {
		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
	}
	f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
}

// Open implements kernfs.Inode.Open.
func (f *memInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	// TODO(gvisor.dev/issue/260): Add check for PTRACE_MODE_ATTACH_FSCREDS.
	// Permission to read this file is governed by PTRACE_MODE_ATTACH_FSCREDS.
	// Since we don't implement setfsuid/setfsgid we can just use
	// PTRACE_MODE_ATTACH.
	if !kernel.ContextCanTrace(ctx, f.task, true) {
		return nil, linuxerr.EACCES
	}
	if err := checkTaskState(f.task); err != nil {
		return nil, err
	}
	fd := &memFD{}
	if err := fd.Init(rp.Mount(), d, f, opts.Flags); err != nil {
		return nil, err
	}
	return &fd.vfsfd, nil
}

// SetStat implements kernfs.Inode.SetStat.
func (*memInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
	return linuxerr.EPERM
}

var _ vfs.FileDescriptionImpl = (*memFD)(nil)

// memFD implements vfs.FileDescriptionImpl for /proc/[pid]/mem.
//
// +stateify savable
type memFD struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD

	inode *memInode

	// mu guards the fields below.
	mu     sync.Mutex `state:"nosave"`
	offset int64
}

// Init initializes memFD.
func (fd *memFD) Init(m *vfs.Mount, d *kernfs.Dentry, inode *memInode, flags uint32) error {
	fd.LockFD.Init(&inode.locks)
	if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
		return err
	}
	fd.inode = inode
	return nil
}
// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *memFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	fd.mu.Lock()
	defer fd.mu.Unlock()
	switch whence {
	case linux.SEEK_SET:
	case linux.SEEK_CUR:
		offset += fd.offset
	default:
		return 0, linuxerr.EINVAL
	}
	if offset < 0 {
		return 0, linuxerr.EINVAL
	}
	fd.offset = offset
	return offset, nil
}

// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *memFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	if dst.NumBytes() == 0 {
		return 0, nil
	}
	m, err := getMMIncRef(fd.inode.task)
	if err != nil {
		return 0, err
	}
	defer m.DecUsers(ctx)

	// Buffer the read data because of MM locks.
	buf := make([]byte, dst.NumBytes())
	n, readErr := m.CopyIn(ctx, hostarch.Addr(offset), buf, usermem.IOOpts{IgnorePermissions: true})
	if n > 0 {
		if _, err := dst.CopyOut(ctx, buf[:n]); err != nil {
			return 0, linuxerr.EFAULT
		}
		return int64(n), nil
	}
	if readErr != nil {
		return 0, syserror.EIO
	}
	return 0, nil
}

// Read implements vfs.FileDescriptionImpl.Read.
func (fd *memFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	fd.mu.Lock()
	n, err := fd.PRead(ctx, dst, fd.offset, opts)
	fd.offset += n
	fd.mu.Unlock()
	return n, err
}

// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *memFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
	return fd.inode.Stat(ctx, fs, opts)
}

// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *memFD) SetStat(context.Context, vfs.SetStatOptions) error {
	return linuxerr.EPERM
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *memFD) Release(context.Context) {}

// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
//
// +stateify savable
type mapsData struct {
	kernfs.DynamicBytesFile

	task *kernel.Task
}

var _ dynamicInode = (*mapsData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	if mm := getMM(d.task); mm != nil {
		mm.ReadMapsDataInto(ctx, buf)
	}
	return nil
}

// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps.
//
// +stateify savable
type smapsData struct {
	kernfs.DynamicBytesFile

	task *kernel.Task
}

var _ dynamicInode = (*smapsData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	if mm := getMM(d.task); mm != nil {
		mm.ReadSmapsDataInto(ctx, buf)
	}
	return nil
}

// +stateify savable
type taskStatData struct {
	kernfs.DynamicBytesFile

	task *kernel.Task

	// If tgstats is true, accumulate fault stats (not implemented) and CPU
	// time across all tasks in t's thread group.
	tgstats bool

	// pidns is the PID namespace associated with the proc filesystem that
	// includes the file using this statData.
	pidns *kernel.PIDNamespace
}

var _ dynamicInode = (*taskStatData)(nil)
// Generate implements vfs.DynamicBytesSource.Generate.
func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.task))
	fmt.Fprintf(buf, "(%s) ", s.task.Name())
	fmt.Fprintf(buf, "%c ", s.task.StateStatus()[0])
	ppid := kernel.ThreadID(0)
	if parent := s.task.Parent(); parent != nil {
		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
	}
	fmt.Fprintf(buf, "%d ", ppid)
	fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.task.ThreadGroup().ProcessGroup()))
	fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.task.ThreadGroup().Session()))
	fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */)
	fmt.Fprintf(buf, "0 " /* flags */)
	fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
	var cputime usage.CPUStats
	if s.tgstats {
		cputime = s.task.ThreadGroup().CPUStats()
	} else {
		cputime = s.task.CPUStats()
	}
	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
	cputime = s.task.ThreadGroup().JoinedChildCPUStats()
	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
	fmt.Fprintf(buf, "%d %d ", s.task.Priority(), s.task.Niceness())
	fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Count())

	// itrealvalue. Since kernel 2.6.17, this field is no longer
	// maintained, and is hard coded as 0.
	fmt.Fprintf(buf, "0 ")

	// Start time is relative to boot time, expressed in clock ticks.
	fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime())))

	var vss, rss uint64
	s.task.WithMuLocked(func(t *kernel.Task) {
		if mm := t.MemoryManager(); mm != nil {
			vss = mm.VirtualMemorySize()
			rss = mm.ResidentSetSize()
		}
	})
	fmt.Fprintf(buf, "%d %d ", vss, rss/hostarch.PageSize)

	// rsslim.
	fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Limits().Get(limits.Rss).Cur)

	fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
	fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
	fmt.Fprintf(buf, "0 0 " /* nswap cnswap */)
	terminationSignal := linux.Signal(0)
	if s.task == s.task.ThreadGroup().Leader() {
		terminationSignal = s.task.ThreadGroup().TerminationSignal()
	}
	fmt.Fprintf(buf, "%d ", terminationSignal)
	fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */)
	fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */)
	fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */)
	fmt.Fprintf(buf, "0\n" /* exit_code */)

	return nil
}

// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm.
//
// +stateify savable
type statmData struct {
	kernfs.DynamicBytesFile

	task *kernel.Task
}

var _ dynamicInode = (*statmData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	var vss, rss uint64
	s.task.WithMuLocked(func(t *kernel.Task) {
		if mm := t.MemoryManager(); mm != nil {
			vss = mm.VirtualMemorySize()
			rss = mm.ResidentSetSize()
		}
	})

	fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/hostarch.PageSize, rss/hostarch.PageSize)
	return nil
}

// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status.
//
// +stateify savable
type statusData struct {
	kernfs.DynamicBytesFile

	task  *kernel.Task
	pidns *kernel.PIDNamespace
}

var _ dynamicInode = (*statusData)(nil)
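// Illustrative sketch (the helper name is an assumption): statm above reports
// sizes in pages, while status below reports kB; both are derived from the
// same byte counts kept by the MemoryManager.
func exampleStatmFields(vssBytes, rssBytes uint64) (vssPages, rssPages uint64) {
	return vssBytes / hostarch.PageSize, rssBytes / hostarch.PageSize
}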
// Generate implements vfs.DynamicBytesSource.Generate.
func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name())
	fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus())
	fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup()))
	fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task))
	ppid := kernel.ThreadID(0)
	if parent := s.task.Parent(); parent != nil {
		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
	}
	fmt.Fprintf(buf, "PPid:\t%d\n", ppid)
	tpid := kernel.ThreadID(0)
	if tracer := s.task.Tracer(); tracer != nil {
		tpid = s.pidns.IDOfTask(tracer)
	}
	fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid)
	var fds int
	var vss, rss, data uint64
	s.task.WithMuLocked(func(t *kernel.Task) {
		if fdTable := t.FDTable(); fdTable != nil {
			fds = fdTable.CurrentMaxFDs()
		}
		if mm := t.MemoryManager(); mm != nil {
			vss = mm.VirtualMemorySize()
			rss = mm.ResidentSetSize()
			data = mm.VirtualDataSize()
		}
	})
	fmt.Fprintf(buf, "FDSize:\t%d\n", fds)
	fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10)
	fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10)
	fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10)
	fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count())
	creds := s.task.Credentials()
	fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps)
	fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
	fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
	fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
	fmt.Fprintf(buf, "Seccomp:\t%d\n", s.task.SeccompMode())
	// We unconditionally report a single NUMA node. See
	// pkg/sentry/syscalls/linux/sys_mempolicy.go.
	fmt.Fprintf(buf, "Mems_allowed:\t1\n")
	fmt.Fprintf(buf, "Mems_allowed_list:\t0\n")
	return nil
}

// ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider.
type ioUsage interface {
	// IOUsage returns the io usage data.
	IOUsage() *usage.IO
}

// +stateify savable
type ioData struct {
	kernfs.DynamicBytesFile

	ioUsage
}

var _ dynamicInode = (*ioData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	io := usage.IO{}
	io.Accumulate(i.IOUsage())

	fmt.Fprintf(buf, "char: %d\n", io.CharsRead)
	fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten)
	fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls)
	fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls)
	fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead)
	fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten)
	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
	return nil
}

// oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file.
//
// +stateify savable
type oomScoreAdj struct {
	kernfs.DynamicBytesFile

	task *kernel.Task
}

var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error {
	if o.task.ExitState() == kernel.TaskExitDead {
		return linuxerr.ESRCH
	}
	fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj())
	return nil
}
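// Illustrative usage sketch (the helper name and value are assumptions):
// writes parsed by Write below land in kernel.Task.SetOOMScoreAdj, which
// rejects values outside Linux's documented oom_score_adj range of
// [-1000, 1000] (see proc(5)).
func exampleAdjustOOMScore(t *kernel.Task) error {
	return t.SetOOMScoreAdj(-500)
}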
// Write implements vfs.WritableDynamicBytesSource.Write.
func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
	if src.NumBytes() == 0 {
		return 0, nil
	}

	// Limit input size so as not to impact performance if input size is
	// large.
	src = src.TakeFirst(hostarch.PageSize - 1)

	var v int32
	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
	if err != nil {
		return 0, err
	}

	if o.task.ExitState() == kernel.TaskExitDead {
		return 0, linuxerr.ESRCH
	}
	if err := o.task.SetOOMScoreAdj(v); err != nil {
		return 0, err
	}

	return n, nil
}

// exeSymlink is a symlink for the /proc/[pid]/exe file.
//
// +stateify savable
type exeSymlink struct {
	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeSymlink

	fs   *filesystem
	task *kernel.Task
}

var _ kernfs.Inode = (*exeSymlink)(nil)

func (fs *filesystem) newExeSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode {
	inode := &exeSymlink{
		fs:   fs,
		task: task,
	}
	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
	return inode
}

// Readlink implements kernfs.Inode.Readlink.
func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
	exec, _, err := s.Getlink(ctx, nil)
	if err != nil {
		return "", err
	}
	defer s.fs.SafeDecRef(ctx, exec)

	root := vfs.RootFromContext(ctx)
	if !root.Ok() {
		// It could have raced with process deletion.
		return "", linuxerr.ESRCH
	}
	defer s.fs.SafeDecRef(ctx, root)

	vfsObj := exec.Mount().Filesystem().VirtualFilesystem()
	name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec)
	return name, nil
}

// Getlink implements kernfs.Inode.Getlink.
func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
	if !kernel.ContextCanTrace(ctx, s.task, false) {
		return vfs.VirtualDentry{}, "", linuxerr.EACCES
	}
	if err := checkTaskState(s.task); err != nil {
		return vfs.VirtualDentry{}, "", err
	}

	var err error
	var exec fsbridge.File
	s.task.WithMuLocked(func(t *kernel.Task) {
		mm := t.MemoryManager()
		if mm == nil {
			err = linuxerr.EACCES
			return
		}

		// The MemoryManager may be destroyed, in which case
		// MemoryManager.destroy will simply set the executable to nil
		// (with locks held).
		exec = mm.Executable()
		if exec == nil {
			err = linuxerr.ESRCH
		}
	})
	if err != nil {
		return vfs.VirtualDentry{}, "", err
	}
	defer exec.DecRef(ctx)

	vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
	vd.IncRef()
	return vd, "", nil
}

// cwdSymlink is a symlink for the /proc/[pid]/cwd file.
//
// +stateify savable
type cwdSymlink struct {
	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeSymlink

	fs   *filesystem
	task *kernel.Task
}

var _ kernfs.Inode = (*cwdSymlink)(nil)

func (fs *filesystem) newCwdSymlink(ctx context.Context, task *kernel.Task, ino uint64) kernfs.Inode {
	inode := &cwdSymlink{
		fs:   fs,
		task: task,
	}
	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
	return inode
}

// Readlink implements kernfs.Inode.Readlink.
func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
	cwd, _, err := s.Getlink(ctx, nil)
	if err != nil {
		return "", err
	}
	defer s.fs.SafeDecRef(ctx, cwd)

	root := vfs.RootFromContext(ctx)
	if !root.Ok() {
		// It could have raced with process deletion.
		return "", linuxerr.ESRCH
	}
	defer s.fs.SafeDecRef(ctx, root)

	vfsObj := cwd.Mount().Filesystem().VirtualFilesystem()
	name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd)
	return name, nil
}
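// Illustrative usage sketch (the helper name is an assumption): Getlink,
// defined below, transfers a reference on the returned VirtualDentry to the
// caller, which must eventually release it.
func exampleResolveCwd(ctx context.Context, s *cwdSymlink) {
	vd, _, err := s.Getlink(ctx, nil)
	if err != nil {
		return
	}
	defer vd.DecRef(ctx)
	// ... use vd ...
}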
// Getlink implements kernfs.Inode.Getlink.
func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
	if !kernel.ContextCanTrace(ctx, s.task, false) {
		return vfs.VirtualDentry{}, "", linuxerr.EACCES
	}
	if err := checkTaskState(s.task); err != nil {
		return vfs.VirtualDentry{}, "", err
	}
	cwd := s.task.FSContext().WorkingDirectoryVFS2()
	if !cwd.Ok() {
		// It could have raced with process deletion.
		return vfs.VirtualDentry{}, "", linuxerr.ESRCH
	}
	// The reference is transferred to the caller.
	return cwd, "", nil
}

// mountInfoData is used to implement /proc/[pid]/mountinfo.
//
// +stateify savable
type mountInfoData struct {
	kernfs.DynamicBytesFile

	fs   *filesystem
	task *kernel.Task
}

var _ dynamicInode = (*mountInfoData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	var fsctx *kernel.FSContext
	i.task.WithMuLocked(func(t *kernel.Task) {
		fsctx = t.FSContext()
	})
	if fsctx == nil {
		// The task has been destroyed. Nothing to show here.
		return nil
	}
	rootDir := fsctx.RootDirectoryVFS2()
	if !rootDir.Ok() {
		// Root has been destroyed. Don't try to read mounts.
		return nil
	}
	defer i.fs.SafeDecRef(ctx, rootDir)
	i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf)
	return nil
}

// mountsData is used to implement /proc/[pid]/mounts.
//
// +stateify savable
type mountsData struct {
	kernfs.DynamicBytesFile

	fs   *filesystem
	task *kernel.Task
}

var _ dynamicInode = (*mountsData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	var fsctx *kernel.FSContext
	i.task.WithMuLocked(func(t *kernel.Task) {
		fsctx = t.FSContext()
	})
	if fsctx == nil {
		// The task has been destroyed. Nothing to show here.
		return nil
	}
	rootDir := fsctx.RootDirectoryVFS2()
	if !rootDir.Ok() {
		// Root has been destroyed. Don't try to read mounts.
		return nil
	}
	defer i.fs.SafeDecRef(ctx, rootDir)
	i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf)
	return nil
}

// +stateify savable
type namespaceSymlink struct {
	kernfs.StaticSymlink

	task *kernel.Task
}

func (fs *filesystem) newNamespaceSymlink(ctx context.Context, task *kernel.Task, ino uint64, ns string) kernfs.Inode {
	// Namespace symlinks should contain the namespace name and the inode
	// number for the namespace instance, so for example user:[123456]. We
	// currently fake the inode number by sticking the symlink inode in its
	// place.
	target := fmt.Sprintf("%s:[%d]", ns, ino)

	inode := &namespaceSymlink{task: task}
	// Note: credentials are overridden by taskOwnedInode.
	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)

	taskInode := &taskOwnedInode{Inode: inode, owner: task}
	return taskInode
}

// Readlink implements kernfs.Inode.Readlink.
func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
	if err := checkTaskState(s.task); err != nil {
		return "", err
	}
	return s.StaticSymlink.Readlink(ctx, mnt)
}
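// Illustrative sketch (the helper name is an assumption): the symlink target
// produced above follows Linux's "<ns>:[<inode>]" convention, e.g.
// "user:[123456]", except that the bracketed number here is the synthetic
// symlink inode rather than a real namespace identifier.
func exampleNamespaceTarget(ns string, ino uint64) string {
	return fmt.Sprintf("%s:[%d]", ns, ino)
}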
// Getlink implements kernfs.Inode.Getlink.
func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
	if err := checkTaskState(s.task); err != nil {
		return vfs.VirtualDentry{}, "", err
	}

	// Create a synthetic inode to represent the namespace.
	fs := mnt.Filesystem().Impl().(*filesystem)
	nsInode := &namespaceInode{}
	nsInode.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0444)
	dentry := &kernfs.Dentry{}
	dentry.Init(&fs.Filesystem, nsInode)
	vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry())
	// Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1.
	mnt.IncRef()
	return vd, "", nil
}

// namespaceInode is a synthetic inode created to represent a namespace in
// /proc/[pid]/ns/*.
//
// +stateify savable
type namespaceInode struct {
	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeNotDirectory
	kernfs.InodeNotSymlink

	locks vfs.FileLocks
}

var _ kernfs.Inode = (*namespaceInode)(nil)

// Init initializes a namespace inode.
func (i *namespaceInode) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
	if perm&^linux.PermissionsMask != 0 {
		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
	}
	i.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
}

// Open implements kernfs.Inode.Open.
func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	fd := &namespaceFD{inode: i}
	i.IncRef()
	fd.LockFD.Init(&i.locks)
	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
		return nil, err
	}
	return &fd.vfsfd, nil
}

// namespaceFD is a synthetic file that represents a namespace in
// /proc/[pid]/ns/*.
//
// +stateify savable
type namespaceFD struct {
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD

	vfsfd vfs.FileDescription
	inode *namespaceInode
}

var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)

// Stat implements vfs.FileDescriptionImpl.Stat.
func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
	return fd.inode.Stat(ctx, vfs, opts)
}

// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
	creds := auth.CredentialsFromContext(ctx)
	return fd.inode.SetStat(ctx, vfs, creds, opts)
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *namespaceFD) Release(ctx context.Context) {
	fd.inode.DecRef(ctx)
}

// taskCgroupData generates data for /proc/[pid]/cgroup.
//
// +stateify savable
type taskCgroupData struct {
	dynamicBytesFileSetAttr
	task *kernel.Task
}

var _ dynamicInode = (*taskCgroupData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *taskCgroupData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	// When a task is exiting on Linux, its cgroup set is cleared and reset to
	// the initial cgroup set, which is essentially the set of root cgroups.
	// Because of this, the /proc/<pid>/cgroup file is always readable on
	// Linux throughout a task's lifetime.
	//
	// The sentry removes tasks from cgroups during the exit process, but
	// doesn't move them into an initial cgroup set, so partway through task
	// exit this file would show that a task is in no cgroups, which is
	// incorrect. Instead, once a task has left its cgroups, we return an
	// error.
	if d.task.ExitState() >= kernel.TaskExitInitiated {
		return linuxerr.ESRCH
	}
	d.task.GenerateProcTaskCgroup(buf)
	return nil
}
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package lock

import (
	"math"
)

// LockSet maps a set of Locks into a file. The key is the file offset.
type lockSetFunctions struct{}

func (lockSetFunctions) MinKey() uint64 {
	return 0
}

func (lockSetFunctions) MaxKey() uint64 {
	return math.MaxUint64
}

func (lockSetFunctions) ClearValue(l *Lock) {
	*l = Lock{}
}

func (lockSetFunctions) Merge(r1 LockRange, val1 Lock, r2 LockRange, val2 Lock) (Lock, bool) {
	// Merge only if the Readers/Writers are identical.
	if len(val1.Readers) != len(val2.Readers) {
		return Lock{}, false
	}
	for k := range val1.Readers {
		if _, ok := val2.Readers[k]; !ok {
			return Lock{}, false
		}
	}
	if val1.Writer != val2.Writer {
		return Lock{}, false
	}
	return val1, true
}

func (lockSetFunctions) Split(r LockRange, val Lock, split uint64) (Lock, Lock) {
	// Copy the segment so that split segments don't contain map references
	// to other segments.
	val0 := Lock{Readers: make(map[UniqueID]OwnerInfo)}
	for k, v := range val.Readers {
		val0.Readers[k] = v
	}
	val0.Writer = val.Writer
	val0.WriterInfo = val.WriterInfo

	return val, val0
}
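// Illustrative sketch (the helper name is an assumption): Merge only
// coalesces adjacent ranges whose lock state is identical, meaning the same
// reader set and the same writer, so the boundary between a shared and an
// exclusive region is always preserved. Merge ignores the range arguments,
// so zero values suffice here.
func exampleMergeable(a, b Lock) bool {
	_, ok := (lockSetFunctions{}).Merge(LockRange{}, a, LockRange{}, b)
	return ok
}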
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package cgroupfs implements cgroupfs.
//
// A cgroup is a collection of tasks on the system, organized into a tree-like
// structure similar to a filesystem directory tree. In fact, each cgroup is
// represented by a directory on cgroupfs, and is manipulated through control
// files in the directory.
//
// All cgroups on a system are organized into hierarchies. A hierarchy is a
// distinct tree of cgroups, with a common set of controllers. One or more
// cgroupfs mounts may point to each hierarchy. These mounts provide a common
// view into the same tree of cgroups.
//
// A controller (also known as a "resource controller", or a cgroup
// "subsystem") determines the behaviour of each cgroup.
//
// In addition to cgroupfs, the kernel has a cgroup registry that tracks
// system-wide state related to cgroups such as active hierarchies and the
// controllers associated with them.
//
// Since cgroupfs doesn't allow hardlinks, there is a unique mapping between
// cgroupfs dentries and inodes.
//
// # Synchronization
//
// Cgroup hierarchy creation and destruction is protected by
// kernel.CgroupRegistry.mu. Once created, a hierarchy's set of controllers,
// the filesystem associated with it, and the root cgroup for the hierarchy
// are immutable.
//
// Membership of tasks within cgroups is protected by
// cgroupfs.filesystem.tasksMu. Tasks also maintain a set of all cgroups
// they're in, and this list is protected by Task.mu.
//
// Lock order:
//
// kernel.CgroupRegistry.mu
//   cgroupfs.filesystem.mu
//     kernel.TaskSet.mu
//       kernel.Task.mu
//         cgroupfs.filesystem.tasksMu.
package cgroupfs

import (
	"fmt"
	"sort"
	"strconv"
	"strings"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
)

const (
	// Name is the default filesystem name.
	Name = "cgroup"

	readonlyFileMode = linux.FileMode(0444)
	writableFileMode = linux.FileMode(0644)

	defaultMaxCachedDentries = uint64(1000)
)

const (
	controllerCPU     = kernel.CgroupControllerType("cpu")
	controllerCPUAcct = kernel.CgroupControllerType("cpuacct")
	controllerCPUSet  = kernel.CgroupControllerType("cpuset")
	controllerJob     = kernel.CgroupControllerType("job")
	controllerMemory  = kernel.CgroupControllerType("memory")
)

var allControllers = []kernel.CgroupControllerType{
	controllerCPU,
	controllerCPUAcct,
	controllerCPUSet,
	controllerJob,
	controllerMemory,
}

// SupportedMountOptions is the set of supported mount options for cgroupfs.
var SupportedMountOptions = []string{"all", "cpu", "cpuacct", "cpuset", "job", "memory"}

// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
type FilesystemType struct{}

// InternalData contains internal data passed in to the cgroupfs mount via
// vfs.GetFilesystemOptions.InternalData.
//
// +stateify savable
type InternalData struct {
	DefaultControlValues map[string]int64
}

// filesystem implements vfs.FilesystemImpl and kernel.cgroupFS.
//
// +stateify savable
type filesystem struct {
	kernfs.Filesystem
	devMinor uint32

	// hierarchyID is the id the cgroup registry assigns to this hierarchy.
	// Has the value kernel.InvalidCgroupHierarchyID until the FS is fully
	// initialized.
	//
	// hierarchyID is immutable after initialization.
	hierarchyID uint32

	// controllers and kcontrollers are both the list of controllers attached
	// to this cgroupfs. Both lists are the same set of controllers, but
	// typecast to different interfaces for convenience. Both must stay in
	// sync, and are immutable.
	controllers  []controller
	kcontrollers []kernel.CgroupController

	numCgroups uint64 // Protected by atomic ops.

	root *kernfs.Dentry

	// tasksMu serializes task membership changes across all cgroups within a
	// filesystem.
	tasksMu sync.RWMutex `state:"nosave"`
}

// InitializeHierarchyID implements kernel.cgroupFS.InitializeHierarchyID.
func (fs *filesystem) InitializeHierarchyID(hid uint32) {
	fs.hierarchyID = hid
}

// Name implements vfs.FilesystemType.Name.
func (FilesystemType) Name() string {
	return Name
}

// Release implements vfs.FilesystemType.Release.
func (FilesystemType) Release(ctx context.Context) {}
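// Illustrative sketch (the variable and the exact vfs option field names are
// assumptions): the mount data parsed by GetFilesystem below selects which
// controllers attach to the hierarchy; "all", or naming no controller at
// all, attaches every controller. A caller might pass options like this:
var exampleMountOpts = vfs.MountOptions{
	GetFilesystemOptions: vfs.GetFilesystemOptions{
		// Attach the cpu and memory controllers and raise the dentry cache
		// limit above the default of 1000.
		Data: "cpu,memory,dentry_cache_limit=2000",
	},
}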
// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
	devMinor, err := vfsObj.GetAnonBlockDevMinor()
	if err != nil {
		return nil, nil, err
	}

	mopts := vfs.GenericParseMountOptions(opts.Data)
	maxCachedDentries := defaultMaxCachedDentries
	if str, ok := mopts["dentry_cache_limit"]; ok {
		delete(mopts, "dentry_cache_limit")
		maxCachedDentries, err = strconv.ParseUint(str, 10, 64)
		if err != nil {
			ctx.Warningf("cgroupfs.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
			return nil, nil, linuxerr.EINVAL
		}
	}

	var wantControllers []kernel.CgroupControllerType
	if _, ok := mopts["cpu"]; ok {
		delete(mopts, "cpu")
		wantControllers = append(wantControllers, controllerCPU)
	}
	if _, ok := mopts["cpuacct"]; ok {
		delete(mopts, "cpuacct")
		wantControllers = append(wantControllers, controllerCPUAcct)
	}
	if _, ok := mopts["cpuset"]; ok {
		delete(mopts, "cpuset")
		wantControllers = append(wantControllers, controllerCPUSet)
	}
	if _, ok := mopts["job"]; ok {
		delete(mopts, "job")
		wantControllers = append(wantControllers, controllerJob)
	}
	if _, ok := mopts["memory"]; ok {
		delete(mopts, "memory")
		wantControllers = append(wantControllers, controllerMemory)
	}

	if _, ok := mopts["all"]; ok {
		if len(wantControllers) > 0 {
			ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: other controllers specified with all: %v", wantControllers)
			return nil, nil, linuxerr.EINVAL
		}

		delete(mopts, "all")
		wantControllers = allControllers
	}

	if len(wantControllers) == 0 {
		// Specifying no controllers implies all controllers.
		wantControllers = allControllers
	}

	if len(mopts) != 0 {
		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
		return nil, nil, linuxerr.EINVAL
	}

	k := kernel.KernelFromContext(ctx)
	r := k.CgroupRegistry()

	// "It is not possible to mount the same controller against multiple
	// cgroup hierarchies. For example, it is not possible to mount both
	// the cpu and cpuacct controllers against one hierarchy, and to mount
	// the cpu controller alone against another hierarchy." - man cgroups(7)
	//
	// Is there a hierarchy available with all the controllers we want? If so,
	// this mount is a view into the same hierarchy.
	//
	// Note: we're guaranteed to have at least one requested controller, since
	// no explicit controller name implies all controllers.
	if vfsfs := r.FindHierarchy(wantControllers); vfsfs != nil {
		fs := vfsfs.Impl().(*filesystem)
		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: mounting new view to hierarchy %v", fs.hierarchyID)
		fs.root.IncRef()
		return vfsfs, fs.root.VFSDentry(), nil
	}

	// No existing hierarchy with exactly the requested controllers was found,
	// so make a new one. Note that it's possible this mount creation is
	// unsatisfiable, if one or more of the requested controllers are already
	// on existing hierarchies. We'll find out about such collisions when we
	// try to register the new hierarchy later.
	fs := &filesystem{
		devMinor: devMinor,
	}
	fs.MaxCachedDentries = maxCachedDentries
	fs.VFSFilesystem().Init(vfsObj, &fsType, fs)

	var defaults map[string]int64
	if opts.InternalData != nil {
		defaults = opts.InternalData.(*InternalData).DefaultControlValues
		ctx.Debugf("cgroupfs.FilesystemType.GetFilesystem: default control values: %v", defaults)
	}

	for _, ty := range wantControllers {
		var c controller
		switch ty {
		case controllerCPU:
			c = newCPUController(fs, defaults)
		case controllerCPUAcct:
			c = newCPUAcctController(fs)
		case controllerCPUSet:
			c = newCPUSetController(fs)
		case controllerJob:
			c = newJobController(fs)
		case controllerMemory:
			c = newMemoryController(fs, defaults)
		default:
			panic(fmt.Sprintf("Unreachable: unknown cgroup controller %q", ty))
		}
		fs.controllers = append(fs.controllers, c)
	}

	if len(defaults) != 0 {
		// Internal data is always provided at sentry startup and unused
		// values indicate a problem with the sandbox config. Fail fast.
		panic(fmt.Sprintf("cgroupfs.FilesystemType.GetFilesystem: unknown internal mount data: %v", defaults))
	}

	// Controllers usually appear in alphabetical order when displayed. Sort
	// them here now, so the list never needs to be sorted elsewhere.
	sort.Slice(fs.controllers, func(i, j int) bool {
		return fs.controllers[i].Type() < fs.controllers[j].Type()
	})
	fs.kcontrollers = make([]kernel.CgroupController, 0, len(fs.controllers))
	for _, c := range fs.controllers {
		fs.kcontrollers = append(fs.kcontrollers, c)
	}

	root := fs.newCgroupInode(ctx, creds)
	var rootD kernfs.Dentry
	rootD.InitRoot(&fs.Filesystem, root)
	fs.root = &rootD

	// Register controllers. The registry may be modified concurrently, so if
	// we get an error, we raced with someone else who registered the same
	// controllers first.
	if err := r.Register(fs.kcontrollers, fs); err != nil {
		ctx.Infof("cgroupfs.FilesystemType.GetFilesystem: failed to register new hierarchy with controllers %v: %v", wantControllers, err)
		rootD.DecRef(ctx)
		fs.VFSFilesystem().DecRef(ctx)
		return nil, nil, linuxerr.EBUSY
	}

	// Move all existing tasks to the root of the new hierarchy.
	k.PopulateNewCgroupHierarchy(fs.rootCgroup())

	return fs.VFSFilesystem(), rootD.VFSDentry(), nil
}

func (fs *filesystem) rootCgroup() kernel.Cgroup {
	return kernel.Cgroup{
		Dentry:     fs.root,
		CgroupImpl: fs.root.Inode().(kernel.CgroupImpl),
	}
}

// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
	k := kernel.KernelFromContext(ctx)
	r := k.CgroupRegistry()

	if fs.hierarchyID != kernel.InvalidCgroupHierarchyID {
		k.ReleaseCgroupHierarchy(fs.hierarchyID)
		r.Unregister(fs.hierarchyID)
	}

	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
	fs.Filesystem.Release(ctx)
}

// MountOptions implements vfs.FilesystemImpl.MountOptions.
func (fs *filesystem) MountOptions() string {
	var cnames []string
	for _, c := range fs.controllers {
		cnames = append(cnames, string(c.Type()))
	}
	return strings.Join(cnames, ",")
}

// +stateify savable
type implStatFS struct{}

// StatFS implements kernfs.Inode.StatFS.
func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
	return vfs.GenericStatFS(linux.CGROUP_SUPER_MAGIC), nil
}

// dir implements kernfs.Inode for a generic cgroup resource controller
// directory. Specific controllers extend this to add their own functionality.
//
// +stateify savable
type dir struct {
	dirRefs
	kernfs.InodeAlwaysValid
	kernfs.InodeAttrs
	kernfs.InodeNotSymlink
	kernfs.InodeDirectoryNoNewChildren // TODO(b/183137098): Implement mkdir.
	kernfs.OrderedChildren
	implStatFS

	locks vfs.FileLocks
}

// Keep implements kernfs.Inode.Keep.
func (*dir) Keep() bool {
	return true
}

// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be
// changed.
func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
	return linuxerr.EPERM
}

// Open implements kernfs.Inode.Open.
func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
		SeekEnd: kernfs.SeekEndStaticEntries,
	})
	if err != nil {
		return nil, err
	}
	return fd.VFSFileDescription(), nil
}

// DecRef implements kernfs.Inode.DecRef.
func (d *dir) DecRef(ctx context.Context) {
	d.dirRefs.DecRef(func() { d.Destroy(ctx) })
}

// StatFS implements kernfs.Inode.StatFS.
func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
	return vfs.GenericStatFS(linux.CGROUP_SUPER_MAGIC), nil
}

// controllerFile represents a generic control file that appears within a
// cgroup directory.
//
// +stateify savable
type controllerFile struct {
	kernfs.DynamicBytesFile
}

func (fs *filesystem) newControllerFile(ctx context.Context, creds *auth.Credentials, data vfs.DynamicBytesSource) kernfs.Inode {
	f := &controllerFile{}
	f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, readonlyFileMode)
	return f
}

func (fs *filesystem) newControllerWritableFile(ctx context.Context, creds *auth.Credentials, data vfs.WritableDynamicBytesSource) kernfs.Inode {
	f := &controllerFile{}
	f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), data, writableFileMode)
	return f
}

// staticControllerFile represents a generic control file that appears within a
// cgroup directory which always returns the same data when read.
// staticControllerFiles are not writable.
//
// +stateify savable
type staticControllerFile struct {
	kernfs.DynamicBytesFile
	vfs.StaticData
}

// Note: We let the caller provide the mode so that static files may be used to
// fake both readable and writable control files. However, static files are
// effectively readonly, as attempting to write to them will return EIO
// regardless of the mode.
func (fs *filesystem) newStaticControllerFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, data string) kernfs.Inode {
	f := &staticControllerFile{StaticData: vfs.StaticData{Data: data}}
	f.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), f, mode)
	return f
}
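The controller-selection logic in GetFilesystem above follows three rules: explicitly named controllers accumulate, "all" may not be combined with explicit names, and naming no controllers at all implies every controller. The standalone sketch below restates those rules outside the sentry. It is illustrative only; parseWantControllers and the plain string controller names are hypothetical stand-ins for the kernel.CgroupControllerType machinery used above.

package main

import (
	"fmt"
	"strings"
)

// allControllers mirrors the set of controllers cgroupfs supports.
var allControllers = []string{"cpu", "cpuacct", "cpuset", "job", "memory"}

// parseWantControllers applies the same precedence as the mount-option
// parsing above: an explicit list wins, "all" alongside explicit
// controllers is an error, and an empty list implies all controllers.
func parseWantControllers(data string) ([]string, error) {
	var want []string
	sawAll := false
	for _, opt := range strings.Split(data, ",") {
		switch opt {
		case "":
			// Ignore empty tokens, e.g. from a trailing comma.
		case "all":
			sawAll = true
		case "cpu", "cpuacct", "cpuset", "job", "memory":
			want = append(want, opt)
		default:
			return nil, fmt.Errorf("unknown option %q", opt)
		}
	}
	if sawAll {
		if len(want) > 0 {
			return nil, fmt.Errorf("other controllers specified with all: %v", want)
		}
		return allControllers, nil
	}
	if len(want) == 0 {
		// Specifying no controllers implies all controllers.
		return allControllers, nil
	}
	return want, nil
}

func main() {
	for _, data := range []string{"cpu,memory", "all", "", "all,cpu"} {
		want, err := parseWantControllers(data)
		fmt.Printf("%q -> %v, %v\n", data, want, err)
	}
}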
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package header

import (
	"encoding/binary"
	"fmt"
	"time"

	"gvisor.dev/gvisor/pkg/tcpip"
)

// RFC 791 defines the fields of the IPv4 header on page 11 using the following
// diagram: ("Figure 4")
//     0                   1                   2                   3
//     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//    |Version|  IHL  |Type of Service|          Total Length         |
//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//    |         Identification        |Flags|      Fragment Offset    |
//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//    |  Time to Live |    Protocol   |         Header Checksum       |
//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//    |                       Source Address                          |
//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//    |                    Destination Address                        |
//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//    |                    Options                    |    Padding    |
//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
const (
	versIHL = 0
	tos     = 1
	// IPv4TotalLenOffset is the offset of the total length field in the
	// IPv4 header.
	IPv4TotalLenOffset = 2
	id                 = 4
	flagsFO            = 6
	ttl                = 8
	protocol           = 9
	checksum           = 10
	srcAddr            = 12
	dstAddr            = 16
	options            = 20
)

// IPv4Fields contains the fields of an IPv4 packet. It is used to describe the
// fields of a packet that needs to be encoded. The IHL field is not here as
// it is totally defined by the size of the options.
type IPv4Fields struct {
	// TOS is the "type of service" field of an IPv4 packet.
	TOS uint8

	// TotalLength is the "total length" field of an IPv4 packet.
	TotalLength uint16

	// ID is the "identification" field of an IPv4 packet.
	ID uint16

	// Flags is the "flags" field of an IPv4 packet.
	Flags uint8

	// FragmentOffset is the "fragment offset" field of an IPv4 packet.
	FragmentOffset uint16

	// TTL is the "time to live" field of an IPv4 packet.
	TTL uint8

	// Protocol is the "protocol" field of an IPv4 packet.
	Protocol uint8

	// Checksum is the "checksum" field of an IPv4 packet.
	Checksum uint16

	// SrcAddr is the "source ip address" of an IPv4 packet.
	SrcAddr tcpip.Address

	// DstAddr is the "destination ip address" of an IPv4 packet.
	DstAddr tcpip.Address

	// Options must be 40 bytes or less as they must fit along with the
	// rest of the IPv4 header into the maximum size describable in the
	// IHL field. RFC 791 section 3.1 says:
	//     IHL:  4 bits
	//
	//     Internet Header Length is the length of the internet header in 32
	//     bit words, and thus points to the beginning of the data.  Note that
	//     the minimum value for a correct header is 5.
	//
	// That leaves ten 32 bit (4 byte) fields for options. An attempt to encode
	// more will fail.
	Options IPv4OptionsSerializer
}

// IPv4 is an IPv4 header.
// Most of the methods of IPv4 access the underlying slice without
// checking the boundaries and could panic because of 'index out of range'.
// Always call IsValid() to validate an instance of IPv4 before using other
// methods.
type IPv4 []byte

const (
	// IPv4MinimumSize is the minimum size of a valid IPv4 packet;
	// i.e. a packet header with no options.
	IPv4MinimumSize = 20

	// IPv4MaximumHeaderSize is the maximum size of an IPv4 header. Given
	// that there are only 4 bits (max 0xF (15)) to represent the header length
	// in 32-bit (4 byte) units, the header cannot exceed 15*4 = 60 bytes.
	IPv4MaximumHeaderSize = 60

	// IPv4MaximumOptionsSize is the largest size the IPv4 options can be.
	IPv4MaximumOptionsSize = IPv4MaximumHeaderSize - IPv4MinimumSize

	// IPv4MaximumPayloadSize is the maximum size of a valid IPv4 payload.
	//
	// Linux limits this to 65,515 octets (the max IP datagram size - the IPv4
	// header size). But RFC 791 section 3.2 discusses the design of the IPv4
	// fragment "allows 2**13 = 8192 fragments of 8 octets each for a total of
	// 65,536 octets. Note that this is consistent with the datagram total
	// length field (of course, the header is counted in the total length and
	// not in the fragments)."
	IPv4MaximumPayloadSize = 65536

	// MinIPFragmentPayloadSize is the minimum number of payload bytes that
	// the first fragment must carry when an IPv4 packet is fragmented.
	MinIPFragmentPayloadSize = 8

	// IPv4AddressSize is the size, in bytes, of an IPv4 address.
	IPv4AddressSize = 4

	// IPv4ProtocolNumber is IPv4's network protocol number.
	IPv4ProtocolNumber tcpip.NetworkProtocolNumber = 0x0800

	// IPv4Version is the version of the IPv4 protocol.
	IPv4Version = 4

	// IPv4AllSystems is the all systems IPv4 multicast address as per
	// IANA's IPv4 Multicast Address Space Registry. See
	// https://www.iana.org/assignments/multicast-addresses/multicast-addresses.xhtml.
	IPv4AllSystems tcpip.Address = "\xe0\x00\x00\x01"

	// IPv4Broadcast is the broadcast address of the IPv4 protocol.
	IPv4Broadcast tcpip.Address = "\xff\xff\xff\xff"

	// IPv4Any is the non-routable IPv4 "any" meta address.
	IPv4Any tcpip.Address = "\x00\x00\x00\x00"

	// IPv4AllRoutersGroup is a multicast address for all routers.
	IPv4AllRoutersGroup tcpip.Address = "\xe0\x00\x00\x02"

	// IPv4MinimumProcessableDatagramSize is the minimum size of an IP
	// packet that every IPv4 capable host must be able to
	// process/reassemble.
	IPv4MinimumProcessableDatagramSize = 576

	// IPv4MinimumMTU is the minimum MTU required by IPv4, per RFC 791,
	// section 3.2:
	//     Every internet module must be able to forward a datagram of 68
	//     octets without further fragmentation.  This is because an internet
	//     header may be up to 60 octets, and the minimum fragment is 8 octets.
	IPv4MinimumMTU = 68
)

// Flags that may be set in an IPv4 packet.
const (
	IPv4FlagMoreFragments = 1 << iota
	IPv4FlagDontFragment
)

// ipv4LinkLocalUnicastSubnet is the IPv4 link local unicast subnet as defined
// by RFC 3927 section 1.
var ipv4LinkLocalUnicastSubnet = func() tcpip.Subnet {
	subnet, err := tcpip.NewSubnet("\xa9\xfe\x00\x00", "\xff\xff\x00\x00")
	if err != nil {
		panic(err)
	}
	return subnet
}()

// ipv4LinkLocalMulticastSubnet is the IPv4 link local multicast subnet as
// defined by RFC 5771 section 4.
var ipv4LinkLocalMulticastSubnet = func() tcpip.Subnet {
	subnet, err := tcpip.NewSubnet("\xe0\x00\x00\x00", "\xff\xff\xff\x00")
	if err != nil {
		panic(err)
	}
	return subnet
}()

// IPv4EmptySubnet is the empty IPv4 subnet.
var IPv4EmptySubnet = func() tcpip.Subnet {
	subnet, err := tcpip.NewSubnet(IPv4Any, tcpip.AddressMask(IPv4Any))
	if err != nil {
		panic(err)
	}
	return subnet
}()

// IPVersion returns the version of IP used in the given packet. It returns -1
// if the packet is not large enough to contain the version field.
func IPVersion(b []byte) int {
	// Length must be at least offset+length of version field.
	if len(b) < versIHL+1 {
		return -1
	}
	return int(b[versIHL] >> ipVersionShift)
}

// RFC 791 page 11 shows the header length (IHL) is in the lower 4 bits
// of the first byte, and is counted in multiples of 4 bytes.
//
//     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//    |Version|  IHL  |Type of Service|          Total Length         |
//    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
//    (...)
//    Version:  4 bits
//      The Version field indicates the format of the internet header.  This
//      document describes version 4.
//
//    IHL:  4 bits
//      Internet Header Length is the length of the internet header in 32
//      bit words, and thus points to the beginning of the data.  Note that
//      the minimum value for a correct header is 5.
const (
	ipVersionShift = 4
	ipIHLMask      = 0x0f
	IPv4IHLStride  = 4
)

// HeaderLength returns the value of the "header length" field of the IPv4
// header. The length returned is in bytes.
func (b IPv4) HeaderLength() uint8 {
	return (b[versIHL] & ipIHLMask) * IPv4IHLStride
}

// SetHeaderLength sets the value of the "Internet Header Length" field.
func (b IPv4) SetHeaderLength(hdrLen uint8) {
	if hdrLen > IPv4MaximumHeaderSize {
		panic(fmt.Sprintf("got IPv4 Header size = %d, want <= %d", hdrLen, IPv4MaximumHeaderSize))
	}
	b[versIHL] = (IPv4Version << ipVersionShift) | ((hdrLen / IPv4IHLStride) & ipIHLMask)
}

// ID returns the value of the identifier field of the IPv4 header.
func (b IPv4) ID() uint16 {
	return binary.BigEndian.Uint16(b[id:])
}

// Protocol returns the value of the protocol field of the IPv4 header.
func (b IPv4) Protocol() uint8 {
	return b[protocol]
}

// Flags returns the "flags" field of the IPv4 header.
func (b IPv4) Flags() uint8 {
	return uint8(binary.BigEndian.Uint16(b[flagsFO:]) >> 13)
}

// More returns whether the more fragments flag is set.
func (b IPv4) More() bool {
	return b.Flags()&IPv4FlagMoreFragments != 0
}

// TTL returns the "TTL" field of the IPv4 header.
func (b IPv4) TTL() uint8 {
	return b[ttl]
}

// FragmentOffset returns the "fragment offset" field of the IPv4 header.
func (b IPv4) FragmentOffset() uint16 {
	return binary.BigEndian.Uint16(b[flagsFO:]) << 3
}

// TotalLength returns the "total length" field of the IPv4 header.
func (b IPv4) TotalLength() uint16 {
	return binary.BigEndian.Uint16(b[IPv4TotalLenOffset:])
}

// Checksum returns the checksum field of the IPv4 header.
func (b IPv4) Checksum() uint16 {
	return binary.BigEndian.Uint16(b[checksum:])
}

// SourceAddress returns the "source address" field of the IPv4 header.
func (b IPv4) SourceAddress() tcpip.Address {
	return tcpip.Address(b[srcAddr : srcAddr+IPv4AddressSize])
}

// DestinationAddress returns the "destination address" field of the IPv4
// header.
func (b IPv4) DestinationAddress() tcpip.Address {
	return tcpip.Address(b[dstAddr : dstAddr+IPv4AddressSize])
}

// SetSourceAddressWithChecksumUpdate implements ChecksummableNetwork.
func (b IPv4) SetSourceAddressWithChecksumUpdate(new tcpip.Address) {
	b.SetChecksum(^checksumUpdate2ByteAlignedAddress(^b.Checksum(), b.SourceAddress(), new))
	b.SetSourceAddress(new)
}

// SetDestinationAddressWithChecksumUpdate implements ChecksummableNetwork.
func (b IPv4) SetDestinationAddressWithChecksumUpdate(new tcpip.Address) {
	b.SetChecksum(^checksumUpdate2ByteAlignedAddress(^b.Checksum(), b.DestinationAddress(), new))
	b.SetDestinationAddress(new)
}

// padIPv4OptionsLength returns the total length for IPv4 options of the given
// length after applying padding according to RFC 791:
//     The internet header padding is used to ensure that the internet
//     header ends on a 32 bit boundary.
func padIPv4OptionsLength(length uint8) uint8 {
	return (length + IPv4IHLStride - 1) & ^uint8(IPv4IHLStride-1)
}

// IPv4Options is a buffer that holds all the raw IP options.
type IPv4Options []byte

// Options returns a buffer holding the options.
func (b IPv4) Options() IPv4Options {
	hdrLen := b.HeaderLength()
	return IPv4Options(b[options:hdrLen:hdrLen])
}

// TransportProtocol implements Network.TransportProtocol.
func (b IPv4) TransportProtocol() tcpip.TransportProtocolNumber {
	return tcpip.TransportProtocolNumber(b.Protocol())
}

// Payload implements Network.Payload.
func (b IPv4) Payload() []byte {
	return b[b.HeaderLength():][:b.PayloadLength()]
}

// PayloadLength returns the length of the payload portion of the IPv4 packet.
func (b IPv4) PayloadLength() uint16 {
	return b.TotalLength() - uint16(b.HeaderLength())
}

// TOS returns the "type of service" field of the IPv4 header.
func (b IPv4) TOS() (uint8, uint32) {
	return b[tos], 0
}

// SetTOS sets the "type of service" field of the IPv4 header.
func (b IPv4) SetTOS(v uint8, _ uint32) {
	b[tos] = v
}

// SetTTL sets the "Time to Live" field of the IPv4 header.
func (b IPv4) SetTTL(v byte) {
	b[ttl] = v
}

// SetTotalLength sets the "total length" field of the IPv4 header.
func (b IPv4) SetTotalLength(totalLength uint16) {
	binary.BigEndian.PutUint16(b[IPv4TotalLenOffset:], totalLength)
}

// SetChecksum sets the checksum field of the IPv4 header.
func (b IPv4) SetChecksum(v uint16) {
	binary.BigEndian.PutUint16(b[checksum:], v)
}

// SetFlagsFragmentOffset sets the "flags" and "fragment offset" fields of the
// IPv4 header.
func (b IPv4) SetFlagsFragmentOffset(flags uint8, offset uint16) {
	v := (uint16(flags) << 13) | (offset >> 3)
	binary.BigEndian.PutUint16(b[flagsFO:], v)
}

// SetID sets the identification field.
func (b IPv4) SetID(v uint16) {
	binary.BigEndian.PutUint16(b[id:], v)
}

// SetSourceAddress sets the "source address" field of the IPv4 header.
func (b IPv4) SetSourceAddress(addr tcpip.Address) {
	copy(b[srcAddr:srcAddr+IPv4AddressSize], addr)
}

// SetDestinationAddress sets the "destination address" field of the IPv4
// header.
func (b IPv4) SetDestinationAddress(addr tcpip.Address) {
	copy(b[dstAddr:dstAddr+IPv4AddressSize], addr)
}

// CalculateChecksum calculates the checksum of the IPv4 header.
func (b IPv4) CalculateChecksum() uint16 {
	return Checksum(b[:b.HeaderLength()], 0)
}

// Encode encodes all the fields of the IPv4 header.
func (b IPv4) Encode(i *IPv4Fields) {
	// The size of the options defines the size of the whole header and thus the
	// IHL field. Options are rare and this is a heavily used function so it is
	// worth a bit of optimisation here to keep the serializer out of the fast
	// path.
	hdrLen := uint8(IPv4MinimumSize)
	if len(i.Options) != 0 {
		hdrLen += i.Options.Serialize(b[options:])
	}
	if hdrLen > IPv4MaximumHeaderSize {
		panic(fmt.Sprintf("%d is larger than maximum IPv4 header size of %d", hdrLen, IPv4MaximumHeaderSize))
	}
	b.SetHeaderLength(hdrLen)
	b[tos] = i.TOS
	b.SetTotalLength(i.TotalLength)
	binary.BigEndian.PutUint16(b[id:], i.ID)
	b.SetFlagsFragmentOffset(i.Flags, i.FragmentOffset)
	b[ttl] = i.TTL
	b[protocol] = i.Protocol
	b.SetChecksum(i.Checksum)
	copy(b[srcAddr:srcAddr+IPv4AddressSize], i.SrcAddr)
	copy(b[dstAddr:dstAddr+IPv4AddressSize], i.DstAddr)
}

// EncodePartial updates the total length and checksum fields of the IPv4
// header, taking in the partial checksum, which is the checksum of the header
// without the total length and checksum fields. It is useful in cases when
// similar packets are produced.
func (b IPv4) EncodePartial(partialChecksum, totalLength uint16) {
	b.SetTotalLength(totalLength)
	checksum := Checksum(b[IPv4TotalLenOffset:IPv4TotalLenOffset+2], partialChecksum)
	b.SetChecksum(^checksum)
}

// IsValid performs basic validation on the packet.
func (b IPv4) IsValid(pktSize int) bool {
	if len(b) < IPv4MinimumSize {
		return false
	}

	hlen := int(b.HeaderLength())
	tlen := int(b.TotalLength())
	if hlen < IPv4MinimumSize || hlen > tlen || tlen > pktSize {
		return false
	}

	if IPVersion(b) != IPv4Version {
		return false
	}

	return true
}

// IsV4LinkLocalUnicastAddress determines if the provided address is an IPv4
// link-local unicast address.
func IsV4LinkLocalUnicastAddress(addr tcpip.Address) bool {
	return ipv4LinkLocalUnicastSubnet.Contains(addr)
}

// IsV4LinkLocalMulticastAddress determines if the provided address is an IPv4
// link-local multicast address.
func IsV4LinkLocalMulticastAddress(addr tcpip.Address) bool {
	return ipv4LinkLocalMulticastSubnet.Contains(addr)
}

// IsChecksumValid returns true iff the IPv4 header's checksum is valid.
func (b IPv4) IsChecksumValid() bool {
	// There has been some confusion regarding verifying checksums. We need
	// only look for negative 0 (0xffff) as the checksum, as it's not possible
	// to get positive 0 (0) for the checksum. Some bad implementations could
	// produce it when doing entry replacement in the early days of the
	// Internet; however, the lore that one needs to check for both persists.
	//
	// RFC 1624 section 1 describes the source of this confusion as:
	//     [the partial recalculation method described in RFC 1071] computes a
	//     result for certain cases that differs from the one obtained from
	//     scratch (one's complement of one's complement sum of the original
	//     fields).
	//
	// However RFC 1624 section 5 clarifies that if using the verification
	// method "recommended by RFC 1071, it does not matter if an intermediate
	// system generated a -0 instead of +0".
	//
	// RFC 1071 page 1 specifies the verification method as:
	//     (3)  To check a checksum, the 1's complement sum is computed over the
	//          same set of octets, including the checksum field.  If the result
	//          is all 1 bits (-0 in 1's complement arithmetic), the check
	//          succeeds.
	return b.CalculateChecksum() == 0xffff
}

// IsV4MulticastAddress determines if the provided address is an IPv4 multicast
// address (range 224.0.0.0 to 239.255.255.255). The four most significant bits
// will be 1110 = 0xe0.
func IsV4MulticastAddress(addr tcpip.Address) bool {
	if len(addr) != IPv4AddressSize {
		return false
	}
	return (addr[0] & 0xf0) == 0xe0
}

// IsV4LoopbackAddress determines if the provided address is an IPv4 loopback
// address (belongs to 127.0.0.0/8 subnet). See RFC 1122 section 3.2.1.3.
func IsV4LoopbackAddress(addr tcpip.Address) bool {
	if len(addr) != IPv4AddressSize {
		return false
	}
	return addr[0] == 0x7f
}

// ========================= Options ==========================

// An IPv4OptionType can hold the value for the Type in an IPv4 option.
type IPv4OptionType byte

// These constants are needed to identify individual options in the option
// list. RFC 791 (page 31) says "Every internet module must be able to act on
// every option," but this has not generally been adhered to and some options
// have very low rates of support. We do not support options other than those
// shown below.
const (
	// IPv4OptionListEndType is the option type for the End Of Option List
	// option. Anything following is ignored.
	IPv4OptionListEndType IPv4OptionType = 0

	// IPv4OptionNOPType is the No-Operation option. May appear between other
	// options and may appear multiple times.
	IPv4OptionNOPType IPv4OptionType = 1

	// IPv4OptionRouterAlertType is the option type for the Router Alert option,
	// defined in RFC 2113 Section 2.1.
	IPv4OptionRouterAlertType IPv4OptionType = 20 | 0x80

	// IPv4OptionRecordRouteType is used by each router on the path of the
	// packet to record its path. It is carried over to an Echo Reply.
	IPv4OptionRecordRouteType IPv4OptionType = 7

	// IPv4OptionTimestampType is the option type for the Timestamp option.
	IPv4OptionTimestampType IPv4OptionType = 68

	// ipv4OptionTypeOffset is the offset in an option of its type field.
	ipv4OptionTypeOffset = 0

	// IPv4OptionLengthOffset is the offset in an option of its length field.
	IPv4OptionLengthOffset = 1
)

// IPv4OptParameterProblem indicates that a Parameter Problem message
// should be generated, and gives the offset in the current entity
// that should be used in that packet.
type IPv4OptParameterProblem struct {
	Pointer  uint8
	NeedICMP bool
}

// IPv4Option is an interface representing various option types.
type IPv4Option interface {
	// Type returns the type identifier of the option.
	Type() IPv4OptionType

	// Size returns the size of the option in bytes.
	Size() uint8

	// Contents returns a slice holding the contents of the option.
	Contents() []byte
}

var _ IPv4Option = (*IPv4OptionGeneric)(nil)

// IPv4OptionGeneric is an IPv4 Option of unknown type.
type IPv4OptionGeneric []byte

// Type implements IPv4Option.
func (o *IPv4OptionGeneric) Type() IPv4OptionType {
	return IPv4OptionType((*o)[ipv4OptionTypeOffset])
}

// Size implements IPv4Option.
func (o *IPv4OptionGeneric) Size() uint8 { return uint8(len(*o)) }

// Contents implements IPv4Option.
func (o *IPv4OptionGeneric) Contents() []byte { return *o }

// IPv4OptionIterator is an iterator pointing to a specific IP option
// at any point of time. It also holds information about a new options buffer
// that we are building up to hand back to the caller.
// TODO(https://gvisor.dev/issues/5513): Add unit tests for IPv4OptionIterator.
type IPv4OptionIterator struct {
	options IPv4Options
	// ErrCursor is where we are while parsing options. It is exported as any
	// resulting ICMP packet is supposed to have a pointer to the byte within
	// the IP packet where the error was detected.
	ErrCursor     uint8
	nextErrCursor uint8
	newOptions    [IPv4MaximumOptionsSize]byte
	writePoint    int
}

// MakeIterator sets up and returns an iterator of options. It also sets up the
// building of a new option set.
func (o IPv4Options) MakeIterator() IPv4OptionIterator {
	return IPv4OptionIterator{
		options:       o,
		nextErrCursor: IPv4MinimumSize,
	}
}

// InitReplacement copies the option into the new option buffer.
func (i *IPv4OptionIterator) InitReplacement(option IPv4Option) IPv4Options {
	replacementOption := i.RemainingBuffer()[:option.Size()]
	if copied := copy(replacementOption, option.Contents()); copied != len(replacementOption) {
		panic(fmt.Sprintf("copied %d bytes in the replacement option buffer, expected %d bytes", copied, len(replacementOption)))
	}
	return replacementOption
}

// RemainingBuffer returns the remaining (unused) part of the new option
// buffer, into which a new option may be written.
func (i *IPv4OptionIterator) RemainingBuffer() IPv4Options {
	return i.newOptions[i.writePoint:]
}

// ConsumeBuffer marks a portion of the new buffer as used.
func (i *IPv4OptionIterator) ConsumeBuffer(size int) {
	i.writePoint += size
}

// PushNOPOrEnd puts one of the single byte options onto the new options.
// Only values 0 or 1 (ListEnd or NOP) are valid input.
func (i *IPv4OptionIterator) PushNOPOrEnd(val IPv4OptionType) {
	if val > IPv4OptionNOPType {
		panic(fmt.Sprintf("invalid option type %d pushed onto option build buffer", val))
	}
	i.newOptions[i.writePoint] = byte(val)
	i.writePoint++
}

// Finalize returns the completed replacement options buffer padded
// as needed.
func (i *IPv4OptionIterator) Finalize() IPv4Options {
	// RFC 791 page 31 says:
	//     The options might not end on a 32-bit boundary.  The internet header
	//     must be filled out with octets of zeros.  The first of these would
	//     be interpreted as the end-of-options option, and the remainder as
	//     internet header padding.
	// Since the buffer is already zero filled we just need to step the write
	// pointer up to the next multiple of 4.
	options := IPv4Options(i.newOptions[:(i.writePoint+0x3) & ^0x3])
	// Poison the write pointer.
	i.writePoint = len(i.newOptions)
	return options
}

// Next returns the next IP option in the buffer/list of IP options.
// It returns
//   - A slice of bytes holding the next option or nil if there is an error.
//   - A boolean which is true if parsing of all the options is complete.
//     Undefined in the case of error.
//   - An error indication which is non-nil if an error condition was found.
func (i *IPv4OptionIterator) Next() (IPv4Option, bool, *IPv4OptParameterProblem) {
	// The opts slice gets shorter as we process the options. When we have no
	// bytes left we are done.
	if len(i.options) == 0 {
		return nil, true, nil
	}

	i.ErrCursor = i.nextErrCursor

	optType := IPv4OptionType(i.options[ipv4OptionTypeOffset])

	if optType == IPv4OptionNOPType || optType == IPv4OptionListEndType {
		optionBody := i.options[:1]
		i.options = i.options[1:]
		i.nextErrCursor = i.ErrCursor + 1
		retval := IPv4OptionGeneric(optionBody)
		return &retval, false, nil
	}

	// There are no more single byte options defined. All the rest have a
	// length field so we need to sanity check it.
	if len(i.options) == 1 {
		return nil, false, &IPv4OptParameterProblem{
			Pointer:  i.ErrCursor,
			NeedICMP: true,
		}
	}

	optLen := i.options[IPv4OptionLengthOffset]

	if optLen <= IPv4OptionLengthOffset || optLen > uint8(len(i.options)) {
		// The actual error is in the length (2nd byte of the option) but we
		// return the start of the option for compatibility with Linux.
		return nil, false, &IPv4OptParameterProblem{
			Pointer:  i.ErrCursor,
			NeedICMP: true,
		}
	}

	optionBody := i.options[:optLen]
	i.nextErrCursor = i.ErrCursor + optLen
	i.options = i.options[optLen:]

	// Check the length of some option types that we know.
	switch optType {
	case IPv4OptionTimestampType:
		if optLen < IPv4OptionTimestampHdrLength {
			i.ErrCursor++
			return nil, false, &IPv4OptParameterProblem{
				Pointer:  i.ErrCursor,
				NeedICMP: true,
			}
		}
		retval := IPv4OptionTimestamp(optionBody)
		return &retval, false, nil

	case IPv4OptionRecordRouteType:
		if optLen < IPv4OptionRecordRouteHdrLength {
			i.ErrCursor++
			return nil, false, &IPv4OptParameterProblem{
				Pointer:  i.ErrCursor,
				NeedICMP: true,
			}
		}
		retval := IPv4OptionRecordRoute(optionBody)
		return &retval, false, nil

	case IPv4OptionRouterAlertType:
		if optLen != IPv4OptionRouterAlertLength {
			i.ErrCursor++
			return nil, false, &IPv4OptParameterProblem{
				Pointer:  i.ErrCursor,
				NeedICMP: true,
			}
		}
		retval := IPv4OptionRouterAlert(optionBody)
		return &retval, false, nil
	}
	retval := IPv4OptionGeneric(optionBody)
	return &retval, false, nil
}

// IP Timestamp option - RFC 791 page 22.
//   +--------+--------+--------+--------+
//   |01000100| length | pointer|oflw|flg|
//   +--------+--------+--------+--------+
//   |         internet address          |
//   +--------+--------+--------+--------+
//   |             timestamp             |
//   +--------+--------+--------+--------+
//   |                ...                |
//
// Type = 68
//
// The Option Length is the number of octets in the option counting
// the type, length, pointer, and overflow/flag octets (maximum
// length 40).
//
// The Pointer is the number of octets from the beginning of this
// option to the end of timestamps plus one (i.e., it points to the
// octet beginning the space for next timestamp).  The smallest
// legal value is 5.  The timestamp area is full when the pointer
// is greater than the length.
//
// The Overflow (oflw) [4 bits] is the number of IP modules that
// cannot register timestamps due to lack of space.
//
// The Flag (flg) [4 bits] values are
//
//   0 -- time stamps only, stored in consecutive 32-bit words,
//
//   1 -- each timestamp is preceded with internet address of the
//        registering entity,
//
//   3 -- the internet address fields are prespecified.  An IP
//        module only registers its timestamp if it matches its own
//        address with the next specified internet address.
//
// Timestamps are defined in RFC 791 page 22 as milliseconds since midnight UT:
//
//     The Timestamp is a right-justified, 32-bit timestamp in
//     milliseconds since midnight UT.
//     If the time is not available in
//     milliseconds or cannot be provided with respect to midnight UT
//     then any time may be inserted as a timestamp provided the high
//     order bit of the timestamp field is set to one to indicate the
//     use of a non-standard value.

// IPv4OptTSFlags defines the values expected in the Timestamp
// option Flags field.
type IPv4OptTSFlags uint8

// Timestamp option specific related constants.
const (
	// IPv4OptionTimestampHdrLength is the length of the timestamp option header.
	IPv4OptionTimestampHdrLength = 4

	// IPv4OptionTimestampSize is the size of an IP timestamp.
	IPv4OptionTimestampSize = 4

	// IPv4OptionTimestampWithAddrSize is the size of an IP timestamp + Address.
	IPv4OptionTimestampWithAddrSize = IPv4AddressSize + IPv4OptionTimestampSize

	// IPv4OptionTimestampMaxSize is limited by the space available for options.
	IPv4OptionTimestampMaxSize = IPv4MaximumOptionsSize

	// IPv4OptionTimestampOnlyFlag is a flag indicating that only timestamps
	// are present.
	IPv4OptionTimestampOnlyFlag IPv4OptTSFlags = 0

	// IPv4OptionTimestampWithIPFlag is a flag indicating that both timestamps
	// and IP addresses are present.
	IPv4OptionTimestampWithIPFlag IPv4OptTSFlags = 1

	// IPv4OptionTimestampWithPredefinedIPFlag is a flag indicating that
	// predefined IP addresses are present.
	IPv4OptionTimestampWithPredefinedIPFlag IPv4OptTSFlags = 3
)

// ipv4TimestampTime provides the current time as specified in RFC 791.
func ipv4TimestampTime(clock tcpip.Clock) uint32 {
	// Per RFC 791 page 21:
	//     The Timestamp is a right-justified, 32-bit timestamp in
	//     milliseconds since midnight UT.
	now := clock.Now().UTC()
	midnight := now.Truncate(24 * time.Hour)
	return uint32(now.Sub(midnight).Milliseconds())
}

// IP Timestamp option fields.
const (
	// IPv4OptTSPointerOffset is the offset of the Timestamp pointer field.
	IPv4OptTSPointerOffset = 2

	// IPv4OptTSOFLWAndFLGOffset is the offset of the combined Flag and Overflow
	// fields, (each being 4 bits).
	IPv4OptTSOFLWAndFLGOffset = 3
	// These constants define the sub byte fields of the Flag and OverFlow field.
	ipv4OptionTimestampOverflowshift      = 4
	ipv4OptionTimestampFlagsMask     byte = 0x0f
)

var _ IPv4Option = (*IPv4OptionTimestamp)(nil)

// IPv4OptionTimestamp is a Timestamp option from RFC 791.
type IPv4OptionTimestamp []byte

// Type implements IPv4Option.Type().
func (ts *IPv4OptionTimestamp) Type() IPv4OptionType { return IPv4OptionTimestampType }

// Size implements IPv4Option.
func (ts *IPv4OptionTimestamp) Size() uint8 { return uint8(len(*ts)) }

// Contents implements IPv4Option.
func (ts *IPv4OptionTimestamp) Contents() []byte { return *ts }

// Pointer returns the pointer field in the IP Timestamp option.
func (ts *IPv4OptionTimestamp) Pointer() uint8 {
	return (*ts)[IPv4OptTSPointerOffset]
}

// Flags returns the flags field in the IP Timestamp option.
func (ts *IPv4OptionTimestamp) Flags() IPv4OptTSFlags {
	return IPv4OptTSFlags((*ts)[IPv4OptTSOFLWAndFLGOffset] & ipv4OptionTimestampFlagsMask)
}

// Overflow returns the Overflow field in the IP Timestamp option.
func (ts *IPv4OptionTimestamp) Overflow() uint8 {
	return (*ts)[IPv4OptTSOFLWAndFLGOffset] >> ipv4OptionTimestampOverflowshift
}

// IncOverflow increments the Overflow field in the IP Timestamp option. It
// returns the incremented value. If the return value is 0 then the field
// overflowed.
func (ts *IPv4OptionTimestamp) IncOverflow() uint8 {
	(*ts)[IPv4OptTSOFLWAndFLGOffset] += 1 << ipv4OptionTimestampOverflowshift
	return ts.Overflow()
}

// UpdateTimestamp updates the fields of the next free timestamp slot.
func (ts *IPv4OptionTimestamp) UpdateTimestamp(addr tcpip.Address, clock tcpip.Clock) {
	slot := (*ts)[ts.Pointer()-1:]

	switch ts.Flags() {
	case IPv4OptionTimestampOnlyFlag:
		binary.BigEndian.PutUint32(slot, ipv4TimestampTime(clock))
		(*ts)[IPv4OptTSPointerOffset] += IPv4OptionTimestampSize
	case IPv4OptionTimestampWithIPFlag:
		if n := copy(slot, addr); n != IPv4AddressSize {
			panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, IPv4AddressSize))
		}
		binary.BigEndian.PutUint32(slot[IPv4AddressSize:], ipv4TimestampTime(clock))
		(*ts)[IPv4OptTSPointerOffset] += IPv4OptionTimestampWithAddrSize
	case IPv4OptionTimestampWithPredefinedIPFlag:
		if tcpip.Address(slot[:IPv4AddressSize]) == addr {
			binary.BigEndian.PutUint32(slot[IPv4AddressSize:], ipv4TimestampTime(clock))
			(*ts)[IPv4OptTSPointerOffset] += IPv4OptionTimestampWithAddrSize
		}
	}
}

// RecordRoute option specific related constants.
//
// from RFC 791 page 20:
//   Record Route
//
//         +--------+--------+--------+---------//--------+
//         |00000111| length | pointer|     route data    |
//         +--------+--------+--------+---------//--------+
//           Type=7
//
//         The record route option provides a means to record the route of
//         an internet datagram.
//
//         The option begins with the option type code.  The second octet
//         is the option length which includes the option type code and the
//         length octet, the pointer octet, and length-3 octets of route
//         data.  The third octet is the pointer into the route data
//         indicating the octet which begins the next area to store a route
//         address.  The pointer is relative to this option, and the
//         smallest legal value for the pointer is 4.
const (
	// IPv4OptionRecordRouteHdrLength is the length of the Record Route option
	// header.
	IPv4OptionRecordRouteHdrLength = 3

	// IPv4OptRRPointerOffset is the offset to the pointer field in an RR
	// option, which points to the next free slot in the list of addresses.
	IPv4OptRRPointerOffset = 2
)

var _ IPv4Option = (*IPv4OptionRecordRoute)(nil)

// IPv4OptionRecordRoute is an IPv4 RecordRoute option defined by RFC 791.
type IPv4OptionRecordRoute []byte

// Pointer returns the pointer field in the IP RecordRoute option.
func (rr *IPv4OptionRecordRoute) Pointer() uint8 {
	return (*rr)[IPv4OptRRPointerOffset]
}

// StoreAddress stores the given IPv4 address into the next free slot.
func (rr *IPv4OptionRecordRoute) StoreAddress(addr tcpip.Address) {
	start := rr.Pointer() - 1 // A one based number.
	// start and room checked by caller.
	if n := copy((*rr)[start:], addr); n != IPv4AddressSize {
		panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, IPv4AddressSize))
	}
	(*rr)[IPv4OptRRPointerOffset] += IPv4AddressSize
}

// Type implements IPv4Option.
func (rr *IPv4OptionRecordRoute) Type() IPv4OptionType { return IPv4OptionRecordRouteType }

// Size implements IPv4Option.
func (rr *IPv4OptionRecordRoute) Size() uint8 { return uint8(len(*rr)) }

// Contents implements IPv4Option.
func (rr *IPv4OptionRecordRoute) Contents() []byte { return *rr }

// Router Alert option specific related constants.
//
// from RFC 2113 section 2.1:
//
//   +--------+--------+--------+--------+
//   |10010100|00000100|  2 octet value  |
//   +--------+--------+--------+--------+
//
//   Type:
//     Copied flag:  1 (all fragments must carry the option)
//     Option class: 0 (control)
//     Option number: 20 (decimal)
//
//   Length: 4
//
//   Value:  A two octet code with the following values:
//     0 - Router shall examine packet
//     1-65535 - Reserved
const (
	// IPv4OptionRouterAlertLength is the length of a Router Alert option.
	IPv4OptionRouterAlertLength = 4

	// IPv4OptionRouterAlertValue is the only permissible value of the 16 bit
	// payload of the router alert option.
	IPv4OptionRouterAlertValue = 0

	// IPv4OptionRouterAlertValueOffset is the offset for the value of a
	// RouterAlert option.
	IPv4OptionRouterAlertValueOffset = 2
)

var _ IPv4Option = (*IPv4OptionRouterAlert)(nil)

// IPv4OptionRouterAlert is an IPv4 RouterAlert option defined by RFC 2113.
type IPv4OptionRouterAlert []byte

// Type implements IPv4Option.
func (*IPv4OptionRouterAlert) Type() IPv4OptionType { return IPv4OptionRouterAlertType }

// Size implements IPv4Option.
func (ra *IPv4OptionRouterAlert) Size() uint8 { return uint8(len(*ra)) }

// Contents implements IPv4Option.
func (ra *IPv4OptionRouterAlert) Contents() []byte { return *ra }

// Value returns the value of the IPv4OptionRouterAlert.
func (ra *IPv4OptionRouterAlert) Value() uint16 {
	return binary.BigEndian.Uint16(ra.Contents()[IPv4OptionRouterAlertValueOffset:])
}

// IPv4SerializableOption is an interface to represent serializable IPv4 option
// types.
type IPv4SerializableOption interface {
	// optionType returns the type identifier of the option.
	optionType() IPv4OptionType
}

// IPv4SerializableOptionPayload is an interface providing serialization of the
// payload of an IPv4 option.
type IPv4SerializableOptionPayload interface {
	// length returns the size of the payload.
	length() uint8

	// serializeInto serializes the payload into the provided byte buffer.
	//
	// Note, the caller MUST provide a byte buffer with size of at least
	// Length. Implementers of this function may assume that the byte buffer
	// is of sufficient size. serializeInto MUST panic if the provided byte
	// buffer is not of sufficient size.
	//
	// serializeInto will return the number of bytes that was used to
	// serialize the receiver. Implementers must only use the number of
	// bytes required to serialize the receiver. Callers MAY provide a
	// larger buffer than required to serialize into.
	serializeInto(buffer []byte) uint8
}

// IPv4OptionsSerializer is a serializer for IPv4 options.
type IPv4OptionsSerializer []IPv4SerializableOption

// Length returns the total number of bytes required to serialize the options.
func (s IPv4OptionsSerializer) Length() uint8 {
	var total uint8
	for _, opt := range s {
		total++
		if withPayload, ok := opt.(IPv4SerializableOptionPayload); ok {
			// Add 1 to reported length to account for the length byte.
			total += 1 + withPayload.length()
		}
	}

	return padIPv4OptionsLength(total)
}

// Serialize serializes the provided list of IPv4 options into b.
//
// Note, b must be of sufficient size to hold all the options in s. See
// IPv4OptionsSerializer.Length for details on getting the total size
// of a serialized IPv4OptionsSerializer.
//
// Serialize panics if b is not of sufficient size to hold all the options in
// s.
func (s IPv4OptionsSerializer) Serialize(b []byte) uint8 {
	var total uint8
	for _, opt := range s {
		ty := opt.optionType()
		if withPayload, ok := opt.(IPv4SerializableOptionPayload); ok {
			// Serialize first to reduce bounds checks.
			l := 2 + withPayload.serializeInto(b[2:])
			b[0] = byte(ty)
			b[1] = l
			b = b[l:]
			total += l
			continue
		}
		// Options without payload consist only of the type field.
		//
		// NB: Repeating code from the branch above is intentional to minimize
		// bounds checks.
		b[0] = byte(ty)
		b = b[1:]
		total++
	}

	// According to RFC 791:
	//
	//   The internet header padding is used to ensure that the internet
	//   header ends on a 32 bit boundary. The padding is zero.
	padded := padIPv4OptionsLength(total)
	b = b[:padded-total]
	for i := range b {
		b[i] = 0
	}
	return padded
}

var _ IPv4SerializableOptionPayload = (*IPv4SerializableRouterAlertOption)(nil)
var _ IPv4SerializableOption = (*IPv4SerializableRouterAlertOption)(nil)

// IPv4SerializableRouterAlertOption provides serialization of the Router Alert
// IPv4 option according to RFC 2113.
type IPv4SerializableRouterAlertOption struct{}

// optionType implements IPv4SerializableOption.
func (*IPv4SerializableRouterAlertOption) optionType() IPv4OptionType {
	return IPv4OptionRouterAlertType
}

// length implements IPv4SerializableOptionPayload.
func (*IPv4SerializableRouterAlertOption) length() uint8 {
	return IPv4OptionRouterAlertLength - IPv4OptionRouterAlertValueOffset
}

// serializeInto implements IPv4SerializableOptionPayload.
func (o *IPv4SerializableRouterAlertOption) serializeInto(buffer []byte) uint8 {
	binary.BigEndian.PutUint16(buffer, IPv4OptionRouterAlertValue)
	return o.length()
}

var _ IPv4SerializableOption = (*IPv4SerializableNOPOption)(nil)

// IPv4SerializableNOPOption provides serialization for the IPv4 no-op option.
type IPv4SerializableNOPOption struct{}

// optionType implements IPv4SerializableOption.
func (*IPv4SerializableNOPOption) optionType() IPv4OptionType {
	return IPv4OptionNOPType
}

var _ IPv4SerializableOption = (*IPv4SerializableListEndOption)(nil)

// IPv4SerializableListEndOption provides serialization for the IPv4 List End
// option.
type IPv4SerializableListEndOption struct{}

// optionType implements IPv4SerializableOption.
func (*IPv4SerializableListEndOption) optionType() IPv4OptionType {
	return IPv4OptionListEndType
}
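The encoding and checksum methods above compose in a fixed order when building a packet: Encode lays down the fields (including whatever Checksum value was supplied, typically zero), CalculateChecksum sums the header, and SetChecksum stores the one's complement of that sum so that later verification sums to 0xffff. The sketch below is a usage illustration only, not sentry code; the main function, addresses, and TTL are invented example values, and the protocol number 17 (UDP) is just for demonstration.

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
)

func main() {
	// Build a minimal (option-free) IPv4 header in a fresh buffer.
	b := make([]byte, header.IPv4MinimumSize)
	ip := header.IPv4(b)
	ip.Encode(&header.IPv4Fields{
		TotalLength: header.IPv4MinimumSize,
		TTL:         64,
		Protocol:    17,                                 // UDP, for illustration.
		SrcAddr:     tcpip.Address("\x0a\x00\x00\x01"), // 10.0.0.1
		DstAddr:     tcpip.Address("\x0a\x00\x00\x02"), // 10.0.0.2
	})
	// Encode stored the Checksum value we supplied (zero), so compute the
	// header sum and store its one's complement, as a sender would.
	ip.SetChecksum(^ip.CalculateChecksum())
	// Both checks should now pass: the header is well formed and its
	// checksum sums to 0xffff.
	fmt.Println(ip.IsValid(len(b)), ip.IsChecksumValid()) // true true
}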
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package loader

import (
	"bytes"
	"debug/elf"
	"fmt"
	"io"
	"strings"

	"gvisor.dev/gvisor/pkg/abi"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/loader/vdsodata"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

const vdsoPrelink = 0xffffffffff700000

type fileContext struct {
	context.Context
}

func (f *fileContext) Value(key interface{}) interface{} {
	switch key {
	case uniqueid.CtxGlobalUniqueID:
		return uint64(0)
	default:
		return f.Context.Value(key)
	}
}

type byteFullReader struct {
	data []byte
}

func (b *byteFullReader) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) {
	if offset < 0 {
		return 0, linuxerr.EINVAL
	}
	if offset >= int64(len(b.data)) {
		return 0, io.EOF
	}
	n, err := dst.CopyOut(ctx, b.data[offset:])
	return int64(n), err
}

// validateVDSO checks that the VDSO can be loaded by loadVDSO.
//
// VDSOs are special (see below). Since we are going to map the VDSO directly
// rather than using a normal loading process, we require that the PT_LOAD
// segments have the same layout in the ELF as they expect to have in memory.
//
// Namely, this means that we must verify:
// * PT_LOAD file offsets are equivalent to the memory offset from the first
//   segment.
// * No extra zeroed space (memsz) is required.
// * PT_LOAD segments are in order.
// * No two PT_LOAD segments occupy parts of the same page.
// * PT_LOAD segments don't extend beyond the end of the file.
//
// ctx may be nil if f does not need it.
func validateVDSO(ctx context.Context, f fullReader, size uint64) (elfInfo, error) {
	info, err := parseHeader(ctx, f)
	if err != nil {
		log.Infof("Unable to parse VDSO header: %v", err)
		return elfInfo{}, err
	}

	var first *elf.ProgHeader
	var prev *elf.ProgHeader
	var prevEnd hostarch.Addr
	for i, phdr := range info.phdrs {
		if phdr.Type != elf.PT_LOAD {
			continue
		}

		if first == nil {
			first = &info.phdrs[i]
			if phdr.Off != 0 {
				log.Warningf("First PT_LOAD segment has non-zero file offset")
				return elfInfo{}, syserror.ENOEXEC
			}
		}

		memoryOffset := phdr.Vaddr - first.Vaddr
		if memoryOffset != phdr.Off {
			log.Warningf("PT_LOAD segment memory offset %#x != file offset %#x", memoryOffset, phdr.Off)
			return elfInfo{}, syserror.ENOEXEC
		}

		// memsz larger than filesz means that extra zeroed space should be
		// provided at the end of the segment. Since we are mapping the ELF
		// directly, we don't want to just overwrite part of the ELF with
		// zeroes.
		if phdr.Memsz != phdr.Filesz {
			log.Warningf("PT_LOAD segment memsz %#x != filesz %#x", phdr.Memsz, phdr.Filesz)
			return elfInfo{}, syserror.ENOEXEC
		}

		start := hostarch.Addr(memoryOffset)
		end, ok := start.AddLength(phdr.Memsz)
		if !ok {
			log.Warningf("PT_LOAD segment size overflows: %#x + %#x", start, end)
			return elfInfo{}, syserror.ENOEXEC
		}
		if uint64(end) > size {
			log.Warningf("PT_LOAD segment end %#x extends beyond end of file %#x", end, size)
			return elfInfo{}, syserror.ENOEXEC
		}

		if prev != nil {
			if start < prevEnd {
				log.Warningf("PT_LOAD segments out of order")
				return elfInfo{}, syserror.ENOEXEC
			}

			// We mprotect entire pages, so each segment must be in
			// its own page.
			prevEndPage := prevEnd.RoundDown()
			startPage := start.RoundDown()
			if prevEndPage >= startPage {
				log.Warningf("PT_LOAD segments share a page: %#x", prevEndPage)
				return elfInfo{}, syserror.ENOEXEC
			}
		}
		prev = &info.phdrs[i]
		prevEnd = end
	}

	return info, nil
}

// VDSO describes a VDSO.
//
// NOTE(mpratt): to support multiple architectures or operating systems, this
// would need to contain a VDSO for each.
//
// +stateify savable
type VDSO struct {
	// ParamPage is the VDSO parameter page. This page should be updated to
	// inform the VDSO of timekeeping data.
	ParamPage *mm.SpecialMappable

	// vdso is the VDSO ELF itself.
	vdso *mm.SpecialMappable

	// os is the operating system targeted by the VDSO.
	os abi.OS

	// arch is the architecture targeted by the VDSO.
	arch arch.Arch

	// phdrs are the VDSO ELF phdrs.
	phdrs []elf.ProgHeader `state:".([]elfProgHeader)"`
}

// getSymbolValueFromVDSO returns the specific symbol value in vdso.so.
func getSymbolValueFromVDSO(symbol string) (uint64, error) {
	f, err := elf.NewFile(bytes.NewReader(vdsodata.Binary))
	if err != nil {
		return 0, err
	}
	syms, err := f.Symbols()
	if err != nil {
		return 0, err
	}

	for _, sym := range syms {
		if elf.ST_BIND(sym.Info) != elf.STB_LOCAL && sym.Section != elf.SHN_UNDEF {
			if strings.Contains(sym.Name, symbol) {
				return sym.Value, nil
			}
		}
	}
	return 0, fmt.Errorf("no %v in vdso.so", symbol)
}

// PrepareVDSO validates the system VDSO and returns a VDSO, containing the
// param page for updating by the kernel.
func PrepareVDSO(mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
	vdsoFile := &byteFullReader{data: vdsodata.Binary}

	// First make sure the VDSO is valid. vdsoFile does not use ctx, so a
	// nil context can be passed.
	info, err := validateVDSO(nil, vdsoFile, uint64(len(vdsodata.Binary)))
	if err != nil {
		return nil, err
	}

	// Then copy it into a VDSO mapping.
	size, ok := hostarch.Addr(len(vdsodata.Binary)).RoundUp()
	if !ok {
		return nil, fmt.Errorf("VDSO size overflows? %#x", len(vdsodata.Binary))
	}

	mf := mfp.MemoryFile()
	vdso, err := mf.Allocate(uint64(size), usage.System)
	if err != nil {
		return nil, fmt.Errorf("unable to allocate VDSO memory: %v", err)
	}

	ims, err := mf.MapInternal(vdso, hostarch.ReadWrite)
	if err != nil {
		mf.DecRef(vdso)
		return nil, fmt.Errorf("unable to map VDSO memory: %v", err)
	}

	_, err = safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(vdsodata.Binary)))
	if err != nil {
		mf.DecRef(vdso)
		return nil, fmt.Errorf("unable to copy VDSO into memory: %v", err)
	}

	// Finally, allocate a param page for this VDSO.
	paramPage, err := mf.Allocate(hostarch.PageSize, usage.System)
	if err != nil {
		mf.DecRef(vdso)
		return nil, fmt.Errorf("unable to allocate VDSO param page: %v", err)
	}

	return &VDSO{
		ParamPage: mm.NewSpecialMappable("[vvar]", mfp, paramPage),
		// TODO(gvisor.dev/issue/157): Don't advertise the VDSO, as
		// some applications may not be able to handle multiple [vdso]
		// hints.
		vdso:  mm.NewSpecialMappable("", mfp, vdso),
		os:    info.os,
		arch:  info.arch,
		phdrs: info.phdrs,
	}, nil
}

// loadVDSO loads the VDSO into m.
//
// VDSOs are special.
//
// VDSOs are fully position independent. However, instead of loading a VDSO
// like a normal ELF binary, mapping only the PT_LOAD segments, the Linux
// kernel simply directly maps the entire file into process memory, with very
// little real ELF parsing.
//
// NOTE(b/25323870): This means that userspace can, and unfortunately does,
// depend on parts of the ELF that would normally not be mapped. To maintain
// compatibility with such binaries, we load the VDSO much like Linux.
//
// loadVDSO takes a reference on the VDSO and parameter page FrameRegions.
func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF) (hostarch.Addr, error) {
	if v.os != bin.os {
		ctx.Warningf("Binary ELF OS %v and VDSO ELF OS %v differ", bin.os, v.os)
		return 0, syserror.ENOEXEC
	}
	if v.arch != bin.arch {
		ctx.Warningf("Binary ELF arch %v and VDSO ELF arch %v differ", bin.arch, v.arch)
		return 0, syserror.ENOEXEC
	}

	// Reserve address space for the VDSO and its parameter page, which is
	// mapped just before the VDSO.
	mapSize := v.vdso.Length() + v.ParamPage.Length()
	addr, err := m.MMap(ctx, memmap.MMapOpts{
		Length:  mapSize,
		Private: true,
	})
	if err != nil {
		ctx.Infof("Unable to reserve VDSO address space: %v", err)
		return 0, err
	}

	// Now map the param page.
	_, err = m.MMap(ctx, memmap.MMapOpts{
		Length:          v.ParamPage.Length(),
		MappingIdentity: v.ParamPage,
		Mappable:        v.ParamPage,
		Addr:            addr,
		Fixed:           true,
		Unmap:           true,
		Private:         true,
		Perms:           hostarch.Read,
		MaxPerms:        hostarch.Read,
	})
	if err != nil {
		ctx.Infof("Unable to map VDSO param page: %v", err)
		return 0, err
	}

	// Now map the VDSO itself.
	vdsoAddr, ok := addr.AddLength(v.ParamPage.Length())
	if !ok {
		panic(fmt.Sprintf("Part of mapped range overflows? %#x + %#x", addr, v.ParamPage.Length()))
	}
	_, err = m.MMap(ctx, memmap.MMapOpts{
		Length:          v.vdso.Length(),
		MappingIdentity: v.vdso,
		Mappable:        v.vdso,
		Addr:            vdsoAddr,
		Fixed:           true,
		Unmap:           true,
		Private:         true,
		Perms:           hostarch.Read,
		MaxPerms:        hostarch.AnyAccess,
	})
	if err != nil {
		ctx.Infof("Unable to map VDSO: %v", err)
		return 0, err
	}

	vdsoEnd, ok := vdsoAddr.AddLength(v.vdso.Length())
	if !ok {
		panic(fmt.Sprintf("VDSO mapping overflows? %#x + %#x", vdsoAddr, v.vdso.Length()))
	}

	// Set additional protections for the individual segments.
	var first *elf.ProgHeader
	for i, phdr := range v.phdrs {
		if phdr.Type != elf.PT_LOAD {
			continue
		}

		if first == nil {
			first = &v.phdrs[i]
		}

		memoryOffset := phdr.Vaddr - first.Vaddr
		segAddr, ok := vdsoAddr.AddLength(memoryOffset)
		if !ok {
			ctx.Warningf("PT_LOAD segment address overflows: %#x + %#x", segAddr, memoryOffset)
			return 0, syserror.ENOEXEC
		}
		segPage := segAddr.RoundDown()
		segSize := hostarch.Addr(phdr.Memsz)
		segSize, ok = segSize.AddLength(segAddr.PageOffset())
		if !ok {
			ctx.Warningf("PT_LOAD segment memsize %#x + offset %#x overflows", phdr.Memsz, segAddr.PageOffset())
			return 0, syserror.ENOEXEC
		}
		segSize, ok = segSize.RoundUp()
		if !ok {
			ctx.Warningf("PT_LOAD segment size overflows: %#x", phdr.Memsz+segAddr.PageOffset())
			return 0, syserror.ENOEXEC
		}
		segEnd, ok := segPage.AddLength(uint64(segSize))
		if !ok {
			ctx.Warningf("PT_LOAD segment range overflows: %#x + %#x", segAddr, segSize)
			return 0, syserror.ENOEXEC
		}
		if segEnd > vdsoEnd {
			ctx.Warningf("PT_LOAD segment ends beyond VDSO: %#x > %#x", segEnd, vdsoEnd)
			return 0, syserror.ENOEXEC
		}

		perms := progFlagsAsPerms(phdr.Flags)
		if perms != hostarch.Read {
			if err := m.MProtect(segPage, uint64(segSize), perms, false); err != nil {
				ctx.Warningf("Unable to set PT_LOAD segment protections %+v at [%#x, %#x): %v", perms, segAddr, segEnd, err)
				return 0, syserror.ENOEXEC
			}
		}
	}

	return vdsoAddr, nil
}

// Release drops references on mappings held by v.
func (v *VDSO) Release(ctx context.Context) {
	v.ParamPage.DecRef(ctx)
	v.vdso.DecRef(ctx)
}
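One subtle rule in validateVDSO above is the page-overlap check: because loadVDSO applies protections with mprotect at page granularity, no two PT_LOAD segments may touch the same page. The toy sketch below restates just that arithmetic with plain integers; it is illustrative only, and segmentsSharePage, the fixed 4 KiB pageSize, and the example offsets are all invented for demonstration.

package main

import "fmt"

const pageSize = 4096

// roundDown truncates addr to the start of its page.
func roundDown(addr uint64) uint64 { return addr &^ (pageSize - 1) }

// segmentsSharePage reports whether a segment with (exclusive) end prevEnd
// and a following segment starting at start would land on the same page,
// mirroring the prevEndPage >= startPage comparison in validateVDSO.
func segmentsSharePage(prevEnd, start uint64) bool {
	return roundDown(prevEnd) >= roundDown(start)
}

func main() {
	fmt.Println(segmentsSharePage(0x1010, 0x1800)) // true: both on page 0x1000
	fmt.Println(segmentsSharePage(0x1010, 0x2000)) // false: next page
	fmt.Println(segmentsSharePage(0x2000, 0x2000)) // true: conservative boundary case
}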
// Copyright 2018 The gVisor Authors.
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( gocontext "context" "runtime/trace" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) // Task represents a thread of execution in the untrusted app. It // includes registers and any thread-specific state that you would // normally expect. // // Each task is associated with a goroutine, called the task goroutine, that // executes code (application code, system calls, etc.) on behalf of that task. // See Task.run (task_run.go). // // All fields that are "owned by the task goroutine" can only be mutated by the // task goroutine while it is running. The task goroutine does not require // synchronization to read these fields, although it still requires // synchronization as described for those fields to mutate them. // // All fields that are "exclusive to the task goroutine" can only be accessed // by the task goroutine while it is running. The task goroutine does not // require synchronization to read or write these fields. // // +stateify savable type Task struct { taskNode // goid is the task goroutine's ID. goid is owned by the task goroutine, // but since it's used to detect cases where non-task goroutines // incorrectly access state owned by, or exclusive to, the task goroutine, // goid is always accessed using atomic memory operations. goid int64 `state:"nosave"` // runState is what the task goroutine is executing if it is not stopped. // If runState is nil, the task goroutine should exit or has exited. // runState is exclusive to the task goroutine. runState taskRunState // taskWorkCount represents the current size of the task work queue. It is // used to avoid acquiring taskWorkMu when the queue is empty. // // Must be accessed with atomic memory operations. taskWorkCount int32 // taskWorkMu protects taskWork. taskWorkMu sync.Mutex `state:"nosave"` // taskWork is a queue of work to be executed before resuming user execution. // It is similar to the task_work mechanism in Linux. // // taskWork is exclusive to the task goroutine. taskWork []TaskWorker // haveSyscallReturn is true if image.Arch().Return() represents a value // returned by a syscall (or set by ptrace after a syscall). // // haveSyscallReturn is exclusive to the task goroutine. haveSyscallReturn bool // interruptChan is notified whenever the task goroutine is interrupted // (usually by a pending signal). interruptChan is effectively a condition // variable that can be used in select statements.
// // interruptChan is not saved; because saving interrupts all tasks, // interruptChan is always notified after restore (see Task.run). interruptChan chan struct{} `state:"nosave"` // gosched contains the current scheduling state of the task goroutine. // // gosched is protected by goschedSeq. gosched is owned by the task // goroutine. goschedSeq sync.SeqCount `state:"nosave"` gosched TaskGoroutineSchedInfo // yieldCount is the number of times the task goroutine has called // Task.InterruptibleSleepStart, Task.UninterruptibleSleepStart, or // Task.Yield(), voluntarily ceasing execution. // // yieldCount is accessed using atomic memory operations. yieldCount is // owned by the task goroutine. yieldCount uint64 // pendingSignals is the set of pending signals that may be handled only by // this task. // // pendingSignals is protected by (taskNode.)tg.signalHandlers.mu // (hereafter "the signal mutex"); see comment on // ThreadGroup.signalHandlers. pendingSignals pendingSignals // signalMask is the set of signals whose delivery is currently blocked. // // signalMask is accessed using atomic memory operations, and is protected // by the signal mutex (such that reading signalMask is safe if either the // signal mutex is locked or if atomic memory operations are used, while // writing signalMask requires both). signalMask is owned by the task // goroutine. signalMask linux.SignalSet // If the task goroutine is currently executing Task.sigtimedwait, // realSignalMask is the previous value of signalMask, which has temporarily // been replaced by Task.sigtimedwait. Otherwise, realSignalMask is 0. // // realSignalMask is exclusive to the task goroutine. realSignalMask linux.SignalSet // If haveSavedSignalMask is true, savedSignalMask is the signal mask that // should be applied after the task has either delivered one signal to a // user handler or is about to resume execution in the untrusted // application. // // Both haveSavedSignalMask and savedSignalMask are exclusive to the task // goroutine. haveSavedSignalMask bool savedSignalMask linux.SignalSet // signalStack is the alternate signal stack used by signal handlers for // which the SA_ONSTACK flag is set. // // signalStack is exclusive to the task goroutine. signalStack linux.SignalStack // signalQueue is a set of registered waiters for signal-related events. // // signalQueue is protected by the signal mutex. Note that the task does // not implement all queue methods, specifically the readiness checks. // The task only broadcasts a notification on signal delivery. signalQueue waiter.Queue `state:"zerovalue"` // If groupStopPending is true, the task should participate in a group // stop in the interrupt path. // // groupStopPending is analogous to JOBCTL_STOP_PENDING in Linux. // // groupStopPending is protected by the signal mutex. groupStopPending bool // If groupStopAcknowledged is true, the task has already acknowledged that // it is entering the most recent group stop that has been initiated on its // thread group. // // groupStopAcknowledged is analogous to !JOBCTL_STOP_CONSUME in Linux. // // groupStopAcknowledged is protected by the signal mutex. groupStopAcknowledged bool // If trapStopPending is true, the task goroutine should enter a // PTRACE_INTERRUPT-induced stop from the interrupt path. // // trapStopPending is analogous to JOBCTL_TRAP_STOP in Linux, except that // Linux also sets JOBCTL_TRAP_STOP when a ptraced task detects // JOBCTL_STOP_PENDING. // // trapStopPending is protected by the signal mutex.
trapStopPending bool // If trapNotifyPending is true, this task is PTRACE_SEIZEd, and a group // stop has begun or ended since the last time the task entered a // ptrace-stop from the group-stop path. // // trapNotifyPending is analogous to JOBCTL_TRAP_NOTIFY in Linux. // // trapNotifyPending is protected by the signal mutex. trapNotifyPending bool // If stop is not nil, it is the internally-initiated condition that // currently prevents the task goroutine from running. // // stop is protected by the signal mutex. stop TaskStop // stopCount is the number of active external stops (calls to // Task.BeginExternalStop that have not been paired with a call to // Task.EndExternalStop), plus 1 if stop is not nil. Hence stopCount is // non-zero if the task goroutine should stop. // // Mutating stopCount requires both locking the signal mutex and using // atomic memory operations. Reading stopCount requires either locking the // signal mutex or using atomic memory operations. This allows Task.doStop // to require only a single atomic read in the common case where stopCount // is 0. // // stopCount is not saved, because external stops cannot be retained across // a save/restore cycle. (Suppose a sentryctl command issues an external // stop; after a save/restore cycle, the restored sentry has no knowledge // of the pre-save sentryctl command, and the stopped task would remain // stopped forever.) stopCount int32 `state:"nosave"` // endStopCond is signaled when stopCount transitions to 0. The combination // of stopCount and endStopCond effectively form a sync.WaitGroup, but // WaitGroup provides no way to read its counter value. // // Invariant: endStopCond.L is the signal mutex. (This is not racy because // sync.Cond.Wait is the only user of sync.Cond.L; only the task goroutine // calls sync.Cond.Wait; and only the task goroutine can change the // identity of the signal mutex, in Task.finishExec.) endStopCond sync.Cond `state:"nosave"` // exitStatus is the task's exit status. // // exitStatus is protected by the signal mutex. exitStatus linux.WaitStatus // syscallRestartBlock represents a custom restart function to run in // restart_syscall(2) to resume an interrupted syscall. // // syscallRestartBlock is exclusive to the task goroutine. syscallRestartBlock SyscallRestartBlock // p provides the mechanism by which the task runs code in userspace. The p // interface object is immutable. p platform.Context `state:"nosave"` // k is the Kernel that this task belongs to. The k pointer is immutable. k *Kernel // containerID has no equivalent in Linux; it's used by runsc to track all // tasks that belong to a given container since cgroups aren't implemented. // It's inherited by the children, is immutable, and may be empty. // // NOTE: cgroups can be used to track this when implemented. containerID string // mu protects some of the following fields. mu sync.Mutex `state:"nosave"` // image holds task data provided by the ELF loader. // // image is protected by mu, and is owned by the task goroutine. image TaskImage // fsContext is the task's filesystem context. // // fsContext is protected by mu, and is owned by the task goroutine. fsContext *FSContext // fdTable is the task's file descriptor table. // // fdTable is protected by mu, and is owned by the task goroutine. fdTable *FDTable // If vforkParent is not nil, it is the task that created this task with // vfork() or clone(CLONE_VFORK), and should have its vforkStop ended when // this TaskImage is released.
// // vforkParent is protected by the TaskSet mutex. vforkParent *Task // exitState is the task's progress through the exit path. // // exitState is protected by the TaskSet mutex. exitState is owned by the // task goroutine. exitState TaskExitState // exitTracerNotified is true if the exit path has either signaled the // task's tracer to indicate the exit, or determined that no such signal is // needed. exitTracerNotified can only be true if exitState is // TaskExitZombie or TaskExitDead. // // exitTracerNotified is protected by the TaskSet mutex. exitTracerNotified bool // exitTracerAcked is true if exitTracerNotified is true and either the // task's tracer has acknowledged the exit notification, or the exit path // has determined that no such notification is needed. // // exitTracerAcked is protected by the TaskSet mutex. exitTracerAcked bool // exitParentNotified is true if the exit path has either signaled the // task's parent to indicate the exit, or determined that no such signal is // needed. exitParentNotified can only be true if exitState is // TaskExitZombie or TaskExitDead. // // exitParentNotified is protected by the TaskSet mutex. exitParentNotified bool // exitParentAcked is true if exitParentNotified is true and either the // task's parent has acknowledged the exit notification, or the exit path // has determined that no such acknowledgment is needed. // // exitParentAcked is protected by the TaskSet mutex. exitParentAcked bool // goroutineStopped is a WaitGroup whose counter value is 1 when the task // goroutine is running and 0 when the task goroutine is stopped or has // exited. goroutineStopped sync.WaitGroup `state:"nosave"` // ptraceTracer is the task that is ptrace-attached to this one. If // ptraceTracer is nil, this task is not being traced. Note that due to // atomic.Value limitations (atomic.Value.Store(nil) panics), a nil // ptraceTracer is always represented as a typed nil (i.e. (*Task)(nil)). // // ptraceTracer is protected by the TaskSet mutex, and accessed with atomic // operations. This allows paths that wouldn't otherwise lock the TaskSet // mutex, notably the syscall path, to check if ptraceTracer is nil without // additional synchronization. ptraceTracer atomic.Value `state:".(*Task)"` // ptraceTracees is the set of tasks that this task is ptrace-attached to. // // ptraceTracees is protected by the TaskSet mutex. ptraceTracees map[*Task]struct{} // ptraceSeized is true if ptraceTracer attached to this task with // PTRACE_SEIZE. // // ptraceSeized is protected by the TaskSet mutex. ptraceSeized bool // ptraceOpts contains ptrace options explicitly set by the tracer. If // ptraceTracer is nil, ptraceOpts is expected to be the zero value. // // ptraceOpts is protected by the TaskSet mutex. ptraceOpts ptraceOptions // ptraceSyscallMode controls ptrace behavior around syscall entry and // exit. // // ptraceSyscallMode is protected by the TaskSet mutex. ptraceSyscallMode ptraceSyscallMode // If ptraceSinglestep is true, the next time the task executes application // code, single-stepping should be enabled. ptraceSinglestep is stored // independently of the architecture-specific trap flag because tracer // detaching (which can happen concurrently with the tracee's execution if // the tracer exits) must disable single-stepping, and the task's // architectural state is implicitly exclusive to the task goroutine (no // synchronization occurs before passing registers to SwitchToApp). // // ptraceSinglestep is analogous to Linux's TIF_SINGLESTEP. 
// // ptraceSinglestep is protected by the TaskSet mutex. ptraceSinglestep bool // If t is ptrace-stopped, ptraceCode is a ptrace-defined value set at the // time that t entered the ptrace stop, reset to 0 when the tracer // acknowledges the stop with a wait*() syscall. Otherwise, it is the // signal number passed to the ptrace operation that ended the last ptrace // stop on this task. In the latter case, the effect of ptraceCode depends // on the nature of the ptrace stop; signal-delivery-stop uses it to // conditionally override ptraceSiginfo, syscall-entry/exit-stops send the // signal to the task after leaving the stop, and PTRACE_EVENT stops and // traced group stops ignore it entirely. // // Linux contextually stores the equivalent of ptraceCode in // task_struct::exit_code. // // ptraceCode is protected by the TaskSet mutex. ptraceCode int32 // ptraceSiginfo is the value returned to the tracer by // ptrace(PTRACE_GETSIGINFO) and modified by ptrace(PTRACE_SETSIGINFO). // (Despite the name, PTRACE_PEEKSIGINFO is completely unrelated.) // ptraceSiginfo is nil if the task is in a ptraced group-stop (this is // required for PTRACE_GETSIGINFO to return EINVAL during such stops, which // is in turn required to distinguish group stops from other ptrace stops, // per subsection "Group-stop" in ptrace(2)). // // ptraceSiginfo is analogous to Linux's task_struct::last_siginfo. // // ptraceSiginfo is protected by the TaskSet mutex. ptraceSiginfo *linux.SignalInfo // ptraceEventMsg is the value set by PTRACE_EVENT stops and returned to // the tracer by ptrace(PTRACE_GETEVENTMSG). // // ptraceEventMsg is protected by the TaskSet mutex. ptraceEventMsg uint64 // ptraceYAMAExceptionAdded is true if a YAMA exception involving the task has // been added before. This is used during task exit to decide whether we need // to clean up YAMA exceptions. // // ptraceYAMAExceptionAdded is protected by the TaskSet mutex. ptraceYAMAExceptionAdded bool // The struct that holds the IO-related usage. The ioUsage pointer is // immutable. ioUsage *usage.IO // logPrefix is a string containing the task's thread ID in the root PID // namespace, and is prepended to log messages emitted by Task.Infof etc. logPrefix atomic.Value `state:"nosave"` // traceContext and traceTask are both used for tracing, and are // updated along with the logPrefix in updateInfoLocked. // // These are exclusive to the task goroutine. traceContext gocontext.Context `state:"nosave"` traceTask *trace.Task `state:"nosave"` // creds is the task's credentials. // // creds.Load() may be called without synchronization. creds.Store() is // serialized by mu. creds is owned by the task goroutine. All // auth.Credentials objects that creds may point to, or have pointed to // in the past, must be treated as immutable. creds auth.AtomicPtrCredentials // utsns is the task's UTS namespace. // // utsns is protected by mu. utsns is owned by the task goroutine. utsns *UTSNamespace // ipcns is the task's IPC namespace. // // ipcns is protected by mu. ipcns is owned by the task goroutine. ipcns *IPCNamespace // abstractSockets tracks abstract sockets that are in use. // // abstractSockets is protected by mu. abstractSockets *AbstractSocketNamespace // mountNamespaceVFS2 is the task's mount namespace. // // It is protected by mu. It is owned by the task goroutine. mountNamespaceVFS2 *vfs.MountNamespace // parentDeathSignal is sent to this task's thread group when its parent exits. // // parentDeathSignal is protected by mu. 
parentDeathSignal linux.Signal // syscallFilters is all seccomp-bpf syscall filters applicable to the // task, in the order in which they were installed. The type of the atomic // is []bpf.Program. Writing needs to be protected by the signal mutex. // // syscallFilters is owned by the task goroutine. syscallFilters atomic.Value `state:".([]bpf.Program)"` // If cleartid is non-zero, treat it as a pointer to a ThreadID in the // task's virtual address space; when the task exits, set the pointed-to // ThreadID to 0, and wake any futex waiters. // // cleartid is exclusive to the task goroutine. cleartid hostarch.Addr // This is mostly a fake cpumask just for sched_set/getaffinity as we // don't really control the affinity. // // Invariant: allowedCPUMask.Size() == // sched.CPUMaskSize(Kernel.applicationCores). // // allowedCPUMask is protected by mu. allowedCPUMask sched.CPUSet // cpu is the fake cpu number returned by getcpu(2). cpu is ignored // entirely if Kernel.useHostCores is true. // // cpu is accessed using atomic memory operations. cpu int32 // This is used to keep track of changes made to a process' priority/niceness. // It is mostly used to provide some reasonable return value from // getpriority(2) after a call to setpriority(2) has been made. // We currently do not actually modify a process' scheduling priority. // NOTE: This represents the userspace view of priority (nice). // This means that the value should be in the range [-20, 19]. // // niceness is protected by mu. niceness int // This is used to track the numa policy for the current thread. This can be // modified through a set_mempolicy(2) syscall. Since we always report a // single numa node, all policies are no-ops. We only track this information // so that we can return reasonable values if the application calls // get_mempolicy(2) after setting a non-default policy. Note that in the // real syscall, nodemask can be longer than a single unsigned long, but we // always report a single node so never need to save more than a single // bit. // // numaPolicy and numaNodeMask are protected by mu. numaPolicy linux.NumaPolicy numaNodeMask uint64 // netns is the task's network namespace. netns is never nil. // // netns is protected by mu. netns *inet.Namespace // If rseqPreempted is true, before the next call to p.Switch(), // interrupt rseq critical regions as defined by rseqAddr and // tg.oldRSeqCritical and write the task goroutine's CPU number to // rseqAddr/oldRSeqCPUAddr. // // We support two ABIs for restartable sequences: // // 1. The upstream interface added in v4.18, // 2. An "old" interface never merged upstream. In the implementation, // this is referred to as "old rseq". // // rseqPreempted is exclusive to the task goroutine. rseqPreempted bool `state:"nosave"` // rseqCPU is the last CPU number written to rseqAddr/oldRSeqCPUAddr. // // If rseq is unused, rseqCPU is -1 for convenient use in // platform.Context.Switch. // // rseqCPU is exclusive to the task goroutine. rseqCPU int32 // oldRSeqCPUAddr is a pointer to the userspace old rseq CPU variable. // // oldRSeqCPUAddr is exclusive to the task goroutine. oldRSeqCPUAddr hostarch.Addr // rseqAddr is a pointer to the userspace linux.RSeq structure. // // rseqAddr is exclusive to the task goroutine. rseqAddr hostarch.Addr // rseqSignature is the signature that the rseq abort IP must be signed // with. // // rseqSignature is exclusive to the task goroutine. 
rseqSignature uint32 // copyScratchBuffer is a buffer available to CopyIn/CopyOut // implementations that require an intermediate buffer to copy data // into/out of. It prevents these buffers from being allocated/zeroed in // each syscall and eventually garbage collected. // // copyScratchBuffer is exclusive to the task goroutine. copyScratchBuffer [copyScratchBufferLen]byte `state:"nosave"` // blockingTimer is used for blocking timeouts. blockingTimerChan is the // channel that is sent to when blockingTimer fires. // // blockingTimer is exclusive to the task goroutine. blockingTimer *ktime.Timer `state:"nosave"` blockingTimerChan <-chan struct{} `state:"nosave"` // futexWaiter is used for futex(FUTEX_WAIT) syscalls. // // futexWaiter is exclusive to the task goroutine. futexWaiter *futex.Waiter `state:"nosave"` // robustList is a pointer to the head of the task's robust futex // list. robustList hostarch.Addr // startTime is the real time at which the task started. It is set when // a Task is created or invokes execve(2). // // startTime is protected by mu. startTime ktime.Time // kcov is the kcov instance providing code coverage owned by this task. // // kcov is exclusive to the task goroutine. kcov *Kcov // cgroups is the set of cgroups this task belongs to. This may be empty if // no cgroup controllers are enabled. Protected by mu. // // +checklocks:mu cgroups map[Cgroup]struct{} } func (t *Task) savePtraceTracer() *Task { return t.ptraceTracer.Load().(*Task) } func (t *Task) loadPtraceTracer(tracer *Task) { t.ptraceTracer.Store(tracer) } func (t *Task) saveSyscallFilters() []bpf.Program { if f := t.syscallFilters.Load(); f != nil { return f.([]bpf.Program) } return nil } func (t *Task) loadSyscallFilters(filters []bpf.Program) { t.syscallFilters.Store(filters) } // afterLoad is invoked by stateify. func (t *Task) afterLoad() { t.updateInfoLocked() t.interruptChan = make(chan struct{}, 1) t.gosched.State = TaskGoroutineNonexistent if t.stop != nil { t.stopCount = 1 } t.endStopCond.L = &t.tg.signalHandlers.mu t.p = t.k.Platform.NewContext() t.rseqPreempted = true t.futexWaiter = futex.NewWaiter() } // copyScratchBufferLen is the length of Task.copyScratchBuffer. const copyScratchBufferLen = 144 // sizeof(struct stat) // CopyScratchBuffer returns a scratch buffer to be used in CopyIn/CopyOut // functions. It must only be used within those functions and can only be used // by the task goroutine; it exists to improve performance and thus // intentionally lacks any synchronization. // // Callers should pass a constant value as an argument if possible, which will // allow the compiler to inline and optimize out the if statement below. func (t *Task) CopyScratchBuffer(size int) []byte { if size > copyScratchBufferLen { return make([]byte, size) } return t.copyScratchBuffer[:size] } // FutexWaiter returns the Task's futex.Waiter. func (t *Task) FutexWaiter() *futex.Waiter { return t.futexWaiter } // Kernel returns the Kernel containing t. func (t *Task) Kernel() *Kernel { return t.k } // SetClearTID sets t's cleartid. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) SetClearTID(addr hostarch.Addr) { t.cleartid = addr } // SetSyscallRestartBlock sets the restart block for use in // restart_syscall(2). After registering a restart block, a syscall should // return ERESTART_RESTARTBLOCK to request a restart using the block. // // Precondition: The caller must be running on the task goroutine.
func (t *Task) SetSyscallRestartBlock(r SyscallRestartBlock) { t.syscallRestartBlock = r } // SyscallRestartBlock returns the currently registered restart block for use in // restart_syscall(2). This function is *not* idempotent and may be called once // per syscall. This function must not be called if a restart block has not been // registered for the current syscall. // // Precondition: The caller must be running on the task goroutine. func (t *Task) SyscallRestartBlock() SyscallRestartBlock { r := t.syscallRestartBlock // Explicitly set the restart block to nil so that a future syscall can't // accidentally reuse it. t.syscallRestartBlock = nil return r } // IsChrooted returns true if the root directory of t's FSContext is not the // root directory of t's MountNamespace. // // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) IsChrooted() bool { if VFS2Enabled { realRoot := t.mountNamespaceVFS2.Root() root := t.fsContext.RootDirectoryVFS2() defer root.DecRef(t) return root != realRoot } realRoot := t.tg.mounts.Root() defer realRoot.DecRef(t) root := t.fsContext.RootDirectory() if root != nil { defer root.DecRef(t) } return root != realRoot } // TaskImage returns t's TaskImage. // // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. func (t *Task) TaskImage() *TaskImage { return &t.image } // FSContext returns t's FSContext. FSContext does not take an additional // reference on the returned FSContext. // // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. func (t *Task) FSContext() *FSContext { return t.fsContext } // FDTable returns t's FDTable. FDTable does not take an additional reference // on the returned FDTable. // // Precondition: The caller must be running on the task goroutine, or t.mu must // be locked. func (t *Task) FDTable() *FDTable { return t.fdTable } // GetFile is a convenience wrapper for t.FDTable().Get. // // Precondition: same as FDTable.Get. func (t *Task) GetFile(fd int32) *fs.File { f, _ := t.fdTable.Get(fd) return f } // GetFileVFS2 is a convenience wrapper for t.FDTable().GetVFS2. // // Precondition: same as FDTable.Get. func (t *Task) GetFileVFS2(fd int32) *vfs.FileDescription { f, _ := t.fdTable.GetVFS2(fd) return f } // NewFDs is a convenience wrapper for t.FDTable().NewFDs. // // This automatically passes the task as the context. // // Precondition: same as FDTable. func (t *Task) NewFDs(fd int32, files []*fs.File, flags FDFlags) ([]int32, error) { return t.fdTable.NewFDs(t, fd, files, flags) } // NewFDsVFS2 is a convenience wrapper for t.FDTable().NewFDsVFS2. // // This automatically passes the task as the context. // // Precondition: same as FDTable. func (t *Task) NewFDsVFS2(fd int32, files []*vfs.FileDescription, flags FDFlags) ([]int32, error) { return t.fdTable.NewFDsVFS2(t, fd, files, flags) } // NewFDFrom is a convenience wrapper for t.FDTable().NewFDs with a single file. // // This automatically passes the task as the context. // // Precondition: same as FDTable. func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error) { fds, err := t.fdTable.NewFDs(t, fd, []*fs.File{file}, flags) if err != nil { return 0, err } return fds[0], nil } // NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2. // // This automatically passes the task as the context. // // Precondition: same as FDTable.Get.
func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) { return t.fdTable.NewFDVFS2(t, fd, file, flags) } // NewFDAt is a convenience wrapper for t.FDTable().NewFDAt. // // This automatically passes the task as the context. // // Precondition: same as FDTable. func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error { return t.fdTable.NewFDAt(t, fd, file, flags) } // NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2. // // This automatically passes the task as the context. // // Precondition: same as FDTable. func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error { return t.fdTable.NewFDAtVFS2(t, fd, file, flags) } // WithMuLocked executes f with t.mu locked. func (t *Task) WithMuLocked(f func(*Task)) { t.mu.Lock() f(t) t.mu.Unlock() } // MountNamespace returns t's MountNamespace. MountNamespace does not take an // additional reference on the returned MountNamespace. func (t *Task) MountNamespace() *fs.MountNamespace { return t.tg.mounts } // MountNamespaceVFS2 returns t's MountNamespace. A reference is taken on the // returned mount namespace. func (t *Task) MountNamespaceVFS2() *vfs.MountNamespace { t.mu.Lock() defer t.mu.Unlock() return t.mountNamespaceVFS2 } // AbstractSockets returns t's AbstractSocketNamespace. func (t *Task) AbstractSockets() *AbstractSocketNamespace { return t.abstractSockets } // ContainerID returns t's container ID. func (t *Task) ContainerID() string { return t.containerID } // OOMScoreAdj gets the task's thread group's OOM score adjustment. func (t *Task) OOMScoreAdj() int32 { return atomic.LoadInt32(&t.tg.oomScoreAdj) } // SetOOMScoreAdj sets the task's thread group's OOM score adjustment. The // value should be between -1000 and 1000 inclusive. func (t *Task) SetOOMScoreAdj(adj int32) error { if adj > 1000 || adj < -1000 { return linuxerr.EINVAL } atomic.StoreInt32(&t.tg.oomScoreAdj, adj) return nil } // KUID returns t's kuid. func (t *Task) KUID() uint32 { return uint32(t.Credentials().EffectiveKUID) } // KGID returns t's kgid. func (t *Task) KGID() uint32 { return uint32(t.Credentials().EffectiveKGID) } // SetKcov sets the kcov instance associated with t. func (t *Task) SetKcov(k *Kcov) { t.kcov = k } // ResetKcov clears the kcov instance associated with t. func (t *Task) ResetKcov() { if t.kcov != nil { t.kcov.OnTaskExit() t.kcov = nil } }
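// The following is a minimal sketch (not gVisor code) of the locking
// convention documented on fields like signalMask above: the owning goroutine
// writes while holding the lock *and* uses an atomic store, so other
// goroutines may read with either the lock or an atomic load. The task type
// and field names here are illustrative only.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type task struct {
	mu   sync.Mutex // stands in for the signal mutex
	mask uint64     // stands in for signalMask
}

// setMask is the owner's write path: lock plus atomic store.
func (t *task) setMask(m uint64) {
	t.mu.Lock()
	defer t.mu.Unlock()
	atomic.StoreUint64(&t.mask, m)
}

// peekMask may be called by any goroutine without holding the lock.
func (t *task) peekMask() uint64 {
	return atomic.LoadUint64(&t.mask)
}

// maskLocked is the pattern for code that takes the lock; the mutex orders
// it against setMask, so a plain read is safe here.
func (t *task) maskLocked() uint64 {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.mask
}

func main() {
	t := &task{}
	t.setMask(0x42)
	fmt.Printf("atomic read %#x, locked read %#x\n", t.peekMask(), t.maskLocked())
}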
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/ports" ) type protocolIDs struct { network tcpip.NetworkProtocolNumber transport tcpip.TransportProtocolNumber } // transportEndpoints manages all endpoints of a given protocol. It has its own // mutex so as to reduce interference between protocols. type transportEndpoints struct { // mu protects all fields of the transportEndpoints. mu sync.RWMutex endpoints map[TransportEndpointID]*endpointsByNIC // rawEndpoints contains endpoints for raw sockets, which receive all // traffic of a given protocol regardless of port. rawEndpoints []RawTransportEndpoint } // unregisterEndpoint unregisters the endpoint with the given id such that it // won't receive any more packets. func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { eps.mu.Lock() defer eps.mu.Unlock() epsByNIC, ok := eps.endpoints[id] if !ok { return } if !epsByNIC.unregisterEndpoint(bindToDevice, ep, flags) { return } delete(eps.endpoints, id) } func (eps *transportEndpoints) transportEndpoints() []TransportEndpoint { eps.mu.RLock() defer eps.mu.RUnlock() es := make([]TransportEndpoint, 0, len(eps.endpoints)) for _, e := range eps.endpoints { es = append(es, e.transportEndpoints()...) } return es } // iterEndpointsLocked yields all endpointsByNIC in eps that match id, in // descending order of match quality. If a call to yield returns false, // iterEndpointsLocked stops iteration and returns immediately. // // Preconditions: eps.mu must be locked. func (eps *transportEndpoints) iterEndpointsLocked(id TransportEndpointID, yield func(*endpointsByNIC) bool) { // Try to find a match with the id as provided. if ep, ok := eps.endpoints[id]; ok { if !yield(ep) { return } } // Try to find a match with the id minus the local address. nid := id nid.LocalAddress = "" if ep, ok := eps.endpoints[nid]; ok { if !yield(ep) { return } } // Try to find a match with the id minus the remote part. nid.LocalAddress = id.LocalAddress nid.RemoteAddress = "" nid.RemotePort = 0 if ep, ok := eps.endpoints[nid]; ok { if !yield(ep) { return } } // Try to find a match with only the local port. nid.LocalAddress = "" if ep, ok := eps.endpoints[nid]; ok { if !yield(ep) { return } } } // findAllEndpointsLocked returns all endpointsByNIC in eps that match id, in // descending order of match quality. // // Preconditions: eps.mu must be locked. func (eps *transportEndpoints) findAllEndpointsLocked(id TransportEndpointID) []*endpointsByNIC { var matchedEPs []*endpointsByNIC eps.iterEndpointsLocked(id, func(ep *endpointsByNIC) bool { matchedEPs = append(matchedEPs, ep) return true }) return matchedEPs } // findEndpointLocked returns the endpoint that most closely matches the given id. // // Preconditions: eps.mu must be locked. func (eps *transportEndpoints) findEndpointLocked(id TransportEndpointID) *endpointsByNIC { var matchedEP *endpointsByNIC eps.iterEndpointsLocked(id, func(ep *endpointsByNIC) bool { matchedEP = ep return false }) return matchedEP } type endpointsByNIC struct { mu sync.RWMutex endpoints map[tcpip.NICID]*multiPortEndpoint // seed is a random secret for a jenkins hash. 
seed uint32 } func (epsByNIC *endpointsByNIC) transportEndpoints() []TransportEndpoint { epsByNIC.mu.RLock() defer epsByNIC.mu.RUnlock() var eps []TransportEndpoint for _, ep := range epsByNIC.endpoints { eps = append(eps, ep.transportEndpoints()...) } return eps } // handlePacket is called by the stack when new packets arrive at this transport // endpoint. It returns false if the packet could not be matched to any // transport endpoint, true otherwise. func (epsByNIC *endpointsByNIC) handlePacket(id TransportEndpointID, pkt *PacketBuffer) bool { epsByNIC.mu.RLock() mpep, ok := epsByNIC.endpoints[pkt.NICID] if !ok { if mpep, ok = epsByNIC.endpoints[0]; !ok { epsByNIC.mu.RUnlock() // Don't use defer for performance reasons. return false } } // If this is a broadcast or multicast datagram, deliver the datagram to all // endpoints bound to the right device. if isInboundMulticastOrBroadcast(pkt, id.LocalAddress) { mpep.handlePacketAll(id, pkt) epsByNIC.mu.RUnlock() // Don't use defer for performance reasons. return true } // multiPortEndpoints are guaranteed to have at least one element. transEP := selectEndpoint(id, mpep, epsByNIC.seed) if queuedProtocol, mustQueue := mpep.demux.queuedProtocols[protocolIDs{mpep.netProto, mpep.transProto}]; mustQueue { queuedProtocol.QueuePacket(transEP, id, pkt) epsByNIC.mu.RUnlock() return true } transEP.HandlePacket(id, pkt) epsByNIC.mu.RUnlock() // Don't use defer for performance reasons. return true } // handleError delivers an error to the transport endpoint identified by id. func (epsByNIC *endpointsByNIC) handleError(n *nic, id TransportEndpointID, transErr TransportError, pkt *PacketBuffer) { epsByNIC.mu.RLock() defer epsByNIC.mu.RUnlock() mpep, ok := epsByNIC.endpoints[n.ID()] if !ok { mpep, ok = epsByNIC.endpoints[0] } if !ok { return } // TODO(eyalsoha): Why don't we look at id to see if this packet needs to // broadcast like we are doing with handlePacket above? // multiPortEndpoints are guaranteed to have at least one element. selectEndpoint(id, mpep, epsByNIC.seed).HandleError(transErr, pkt) } // registerEndpoint returns nil if it succeeds. It fails and returns an // error if ep already has a conflicting element with the same key. func (epsByNIC *endpointsByNIC) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { epsByNIC.mu.Lock() defer epsByNIC.mu.Unlock() multiPortEp, ok := epsByNIC.endpoints[bindToDevice] if !ok { multiPortEp = &multiPortEndpoint{ demux: d, netProto: netProto, transProto: transProto, } } if err := multiPortEp.singleRegisterEndpoint(t, flags); err != nil { return err } // Only add this newly created multiPortEndpoint if the singleRegisterEndpoint // succeeded. if !ok { epsByNIC.endpoints[bindToDevice] = multiPortEp } return nil } func (epsByNIC *endpointsByNIC) checkEndpoint(flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { epsByNIC.mu.RLock() defer epsByNIC.mu.RUnlock() multiPortEp, ok := epsByNIC.endpoints[bindToDevice] if !ok { return nil } return multiPortEp.singleCheckEndpoint(flags) } // unregisterEndpoint returns true if endpointsByNIC has to be unregistered.
func (epsByNIC *endpointsByNIC) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint, flags ports.Flags) bool { epsByNIC.mu.Lock() defer epsByNIC.mu.Unlock() multiPortEp, ok := epsByNIC.endpoints[bindToDevice] if !ok { return false } if multiPortEp.unregisterEndpoint(t, flags) { delete(epsByNIC.endpoints, bindToDevice) } return len(epsByNIC.endpoints) == 0 } // transportDemuxer demultiplexes packets targeted at a transport endpoint // (i.e., after they've been parsed by the network layer). It does two levels // of demultiplexing: first based on the network and transport protocols, then // based on endpoint IDs. It should only be instantiated via // newTransportDemuxer. type transportDemuxer struct { stack *Stack // protocol is immutable. protocol map[protocolIDs]*transportEndpoints queuedProtocols map[protocolIDs]queuedTransportProtocol } // queuedTransportProtocol, if supported by a protocol implementation, causes // the dispatcher to deliver packets to the QueuePacket method instead of // calling HandlePacket directly on the endpoint. type queuedTransportProtocol interface { QueuePacket(ep TransportEndpoint, id TransportEndpointID, pkt *PacketBuffer) } func newTransportDemuxer(stack *Stack) *transportDemuxer { d := &transportDemuxer{ stack: stack, protocol: make(map[protocolIDs]*transportEndpoints), queuedProtocols: make(map[protocolIDs]queuedTransportProtocol), } // Add each network and transport pair to the demuxer. for netProto := range stack.networkProtocols { for proto := range stack.transportProtocols { protoIDs := protocolIDs{netProto, proto} d.protocol[protoIDs] = &transportEndpoints{ endpoints: make(map[TransportEndpointID]*endpointsByNIC), } qTransProto, isQueued := (stack.transportProtocols[proto].proto).(queuedTransportProtocol) if isQueued { d.queuedProtocols[protoIDs] = qTransProto } } } return d } // registerEndpoint registers the given endpoint with the dispatcher such that // packets that match the endpoint ID are delivered to it. func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { for i, n := range netProtos { if err := d.singleRegisterEndpoint(n, protocol, id, ep, flags, bindToDevice); err != nil { d.unregisterEndpoint(netProtos[:i], protocol, id, ep, flags, bindToDevice) return err } } return nil } // checkEndpoint checks if an endpoint can be registered with the dispatcher. func (d *transportDemuxer) checkEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { for _, n := range netProtos { if err := d.singleCheckEndpoint(n, protocol, id, flags, bindToDevice); err != nil { return err } } return nil } // multiPortEndpoint is a container for TransportEndpoints which are bound to // the same pair of address and port. endpoints always has at least one // element. // // FIXME(gvisor.dev/issue/873): Restore this properly. Currently, we just save // this to ensure that the underlying endpoints get saved/restored, but do not // use the restored copy. // // +stateify savable type multiPortEndpoint struct { mu sync.RWMutex `state:"nosave"` demux *transportDemuxer netProto tcpip.NetworkProtocolNumber transProto tcpip.TransportProtocolNumber // endpoints stores the transport endpoints in the order in which they // were bound. This is required for UDP SO_REUSEADDR.
endpoints []TransportEndpoint flags ports.FlagCounter } func (ep *multiPortEndpoint) transportEndpoints() []TransportEndpoint { ep.mu.RLock() eps := append([]TransportEndpoint(nil), ep.endpoints...) ep.mu.RUnlock() return eps } // reciprocalScale scales a value into range [0, n). // // This is similar to val % n, but faster. // See http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ func reciprocalScale(val, n uint32) uint32 { return uint32((uint64(val) * uint64(n)) >> 32) } // selectEndpoint calculates a hash of destination and source addresses and // ports, then uses it to select a socket. In this case, all packets from one // address will be sent to the same endpoint. func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32) TransportEndpoint { if len(mpep.endpoints) == 1 { return mpep.endpoints[0] } if mpep.flags.SharedFlags().ToFlags().Effective().MostRecent { return mpep.endpoints[len(mpep.endpoints)-1] } payload := []byte{ byte(id.LocalPort), byte(id.LocalPort >> 8), byte(id.RemotePort), byte(id.RemotePort >> 8), } h := jenkins.Sum32(seed) h.Write(payload) h.Write([]byte(id.LocalAddress)) h.Write([]byte(id.RemoteAddress)) hash := h.Sum32() idx := reciprocalScale(hash, uint32(len(mpep.endpoints))) return mpep.endpoints[idx] } func (ep *multiPortEndpoint) handlePacketAll(id TransportEndpointID, pkt *PacketBuffer) { ep.mu.RLock() queuedProtocol, mustQueue := ep.demux.queuedProtocols[protocolIDs{ep.netProto, ep.transProto}] // HandlePacket takes ownership of pkt, so each endpoint needs // its own copy except for the final one. for _, endpoint := range ep.endpoints[:len(ep.endpoints)-1] { if mustQueue { queuedProtocol.QueuePacket(endpoint, id, pkt.Clone()) } else { endpoint.HandlePacket(id, pkt.Clone()) } } if endpoint := ep.endpoints[len(ep.endpoints)-1]; mustQueue { queuedProtocol.QueuePacket(endpoint, id, pkt) } else { endpoint.HandlePacket(id, pkt) } ep.mu.RUnlock() // Don't use defer for performance reasons. } // singleRegisterEndpoint tries to add an endpoint to the multiPortEndpoint // list. The list might be empty already. func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, flags ports.Flags) tcpip.Error { ep.mu.Lock() defer ep.mu.Unlock() bits := flags.Bits() & ports.MultiBindFlagMask if len(ep.endpoints) != 0 { // If it was previously bound, we need to check if we can bind again. if ep.flags.TotalRefs() > 0 && bits&ep.flags.SharedFlags() == 0 { return &tcpip.ErrPortInUse{} } } ep.endpoints = append(ep.endpoints, t) ep.flags.AddRef(bits) return nil } func (ep *multiPortEndpoint) singleCheckEndpoint(flags ports.Flags) tcpip.Error { ep.mu.RLock() defer ep.mu.RUnlock() bits := flags.Bits() & ports.MultiBindFlagMask if len(ep.endpoints) != 0 { // If it was previously bound, we need to check if we can bind again. if ep.flags.TotalRefs() > 0 && bits&ep.flags.SharedFlags() == 0 { return &tcpip.ErrPortInUse{} } } return nil } // unregisterEndpoint returns true if multiPortEndpoint has to be unregistered.
func (ep *multiPortEndpoint) unregisterEndpoint(t TransportEndpoint, flags ports.Flags) bool { ep.mu.Lock() defer ep.mu.Unlock() for i, endpoint := range ep.endpoints { if endpoint == t { copy(ep.endpoints[i:], ep.endpoints[i+1:]) ep.endpoints[len(ep.endpoints)-1] = nil ep.endpoints = ep.endpoints[:len(ep.endpoints)-1] ep.flags.DropRef(flags.Bits() & ports.MultiBindFlagMask) break } } return len(ep.endpoints) == 0 } func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { if id.RemotePort != 0 { // SO_REUSEPORT only applies to bound/listening endpoints. flags.LoadBalanced = false } eps, ok := d.protocol[protocolIDs{netProto, protocol}] if !ok { return &tcpip.ErrUnknownProtocol{} } eps.mu.Lock() defer eps.mu.Unlock() epsByNIC, ok := eps.endpoints[id] if !ok { epsByNIC = &endpointsByNIC{ endpoints: make(map[tcpip.NICID]*multiPortEndpoint), seed: d.stack.Seed(), } } if err := epsByNIC.registerEndpoint(d, netProto, protocol, ep, flags, bindToDevice); err != nil { return err } // Only add this newly created epsByNIC if registerEndpoint succeeded. if !ok { eps.endpoints[id] = epsByNIC } return nil } func (d *transportDemuxer) singleCheckEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error { if id.RemotePort != 0 { // SO_REUSEPORT only applies to bound/listening endpoints. flags.LoadBalanced = false } eps, ok := d.protocol[protocolIDs{netProto, protocol}] if !ok { return &tcpip.ErrUnknownProtocol{} } eps.mu.RLock() defer eps.mu.RUnlock() epsByNIC, ok := eps.endpoints[id] if !ok { return nil } return epsByNIC.checkEndpoint(flags, bindToDevice) } // unregisterEndpoint unregisters the endpoint with the given id such that it // won't receive any more packets. func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { if id.RemotePort != 0 { // SO_REUSEPORT only applies to bound/listening endpoints. flags.LoadBalanced = false } for _, n := range netProtos { if eps, ok := d.protocol[protocolIDs{n, protocol}]; ok { eps.unregisterEndpoint(id, ep, flags, bindToDevice) } } } // deliverPacket attempts to find one or more matching transport endpoints, and // then, if matches are found, delivers the packet to them. Returns true if // the packet no longer needs to be handled. func (d *transportDemuxer) deliverPacket(protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer, id TransportEndpointID) bool { eps, ok := d.protocol[protocolIDs{pkt.NetworkProtocolNumber, protocol}] if !ok { return false } // If the packet is a UDP broadcast or multicast, then find all matching // transport endpoints. if protocol == header.UDPProtocolNumber && isInboundMulticastOrBroadcast(pkt, id.LocalAddress) { eps.mu.RLock() destEPs := eps.findAllEndpointsLocked(id) eps.mu.RUnlock() // Fail if we didn't find at least one matching transport endpoint. if len(destEPs) == 0 { d.stack.stats.UDP.UnknownPortErrors.Increment() return false } // handlePacket takes ownership of pkt, so each endpoint needs its own // copy except for the final one. 
for _, ep := range destEPs[:len(destEPs)-1] { ep.handlePacket(id, pkt.Clone()) } destEPs[len(destEPs)-1].handlePacket(id, pkt) return true } // If the packet is a TCP packet with an unspecified source or non-unicast // destination address, then do nothing further and instruct the caller to do // the same. The network layer handles address validation for specified source // addresses. if protocol == header.TCPProtocolNumber && (!isSpecified(id.LocalAddress) || !isSpecified(id.RemoteAddress) || isInboundMulticastOrBroadcast(pkt, id.LocalAddress)) { // TCP can only be used to communicate between a single source and a // single destination; the addresses must be unicast. d.stack.stats.TCP.InvalidSegmentsReceived.Increment() return true } eps.mu.RLock() ep := eps.findEndpointLocked(id) eps.mu.RUnlock() if ep == nil { if protocol == header.UDPProtocolNumber { d.stack.stats.UDP.UnknownPortErrors.Increment() } return false } return ep.handlePacket(id, pkt) } // deliverRawPacket attempts to deliver the given packet and returns whether it // was delivered successfully. func (d *transportDemuxer) deliverRawPacket(protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) bool { eps, ok := d.protocol[protocolIDs{pkt.NetworkProtocolNumber, protocol}] if !ok { return false } // As in net/ipv4/ip_input.c:ip_local_deliver, attempt to deliver via // raw endpoint first. If there are multiple raw endpoints, they all // receive the packet. eps.mu.RLock() // Copy the list of raw endpoints to avoid packet handling under lock. var rawEPs []RawTransportEndpoint if n := len(eps.rawEndpoints); n != 0 { rawEPs = make([]RawTransportEndpoint, n) if m := copy(rawEPs, eps.rawEndpoints); m != n { panic(fmt.Sprintf("unexpected copy = %d, want %d", m, n)) } } eps.mu.RUnlock() for _, rawEP := range rawEPs { // Each endpoint gets its own copy of the packet for the sake // of save/restore. rawEP.HandlePacket(pkt.Clone()) } return len(rawEPs) != 0 } // deliverError attempts to deliver the given error to the appropriate transport // endpoint. // // Returns true if the error was delivered. func (d *transportDemuxer) deliverError(n *nic, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, transErr TransportError, pkt *PacketBuffer, id TransportEndpointID) bool { eps, ok := d.protocol[protocolIDs{net, trans}] if !ok { return false } eps.mu.RLock() ep := eps.findEndpointLocked(id) eps.mu.RUnlock() if ep == nil { return false } ep.handleError(n, id, transErr, pkt) return true } // findTransportEndpoint finds a single endpoint that most closely matches the provided id. func (d *transportDemuxer) findTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, nicID tcpip.NICID) TransportEndpoint { eps, ok := d.protocol[protocolIDs{netProto, transProto}] if !ok { return nil } eps.mu.RLock() epsByNIC := eps.findEndpointLocked(id) if epsByNIC == nil { eps.mu.RUnlock() return nil } epsByNIC.mu.RLock() eps.mu.RUnlock() mpep, ok := epsByNIC.endpoints[nicID] if !ok { if mpep, ok = epsByNIC.endpoints[0]; !ok { epsByNIC.mu.RUnlock() // Don't use defer for performance reasons. return nil } } ep := selectEndpoint(id, mpep, epsByNIC.seed) epsByNIC.mu.RUnlock() return ep } // registerRawEndpoint registers the given endpoint with the dispatcher such // that packets of the appropriate protocol are delivered to it. A single // packet can be sent to one or more raw endpoints along with a non-raw // endpoint.
func (d *transportDemuxer) registerRawEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) tcpip.Error { eps, ok := d.protocol[protocolIDs{netProto, transProto}] if !ok { return &tcpip.ErrNotSupported{} } eps.mu.Lock() eps.rawEndpoints = append(eps.rawEndpoints, ep) eps.mu.Unlock() return nil } // unregisterRawEndpoint unregisters the raw endpoint for the given transport // protocol such that it won't receive any more packets. func (d *transportDemuxer) unregisterRawEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) { eps, ok := d.protocol[protocolIDs{netProto, transProto}] if !ok { panic(fmt.Errorf("tried to unregister endpoint with unsupported network and transport protocol pair: %d, %d", netProto, transProto)) } eps.mu.Lock() for i, rawEP := range eps.rawEndpoints { if rawEP == ep { lastIdx := len(eps.rawEndpoints) - 1 eps.rawEndpoints[i] = eps.rawEndpoints[lastIdx] eps.rawEndpoints[lastIdx] = nil eps.rawEndpoints = eps.rawEndpoints[:lastIdx] break } } eps.mu.Unlock() } func isInboundMulticastOrBroadcast(pkt *PacketBuffer, localAddr tcpip.Address) bool { return pkt.NetworkPacketInfo.LocalAddressBroadcast || header.IsV4MulticastAddress(localAddr) || header.IsV6MulticastAddress(localAddr) } func isSpecified(addr tcpip.Address) bool { return addr != header.IPv4Any && addr != header.IPv6Any }
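// The following is an illustrative, self-contained sketch (hypothetical
// types, not part of the stack package) of the ownership pattern that
// deliverPacket uses above for UDP broadcast/multicast fan-out: every
// matching endpoint except the last receives a clone of the packet, and the
// final endpoint consumes the original, so each handlePacket call owns its
// argument and no copy is wasted.
package demuxsketch

// packet is a hypothetical stand-in for *PacketBuffer.
type packet struct{ payload []byte }

// clone returns an independent copy of p.
func (p *packet) clone() *packet {
	buf := make([]byte, len(p.payload))
	copy(buf, p.payload)
	return &packet{payload: buf}
}

type endpoint interface {
	// handlePacket takes ownership of pkt.
	handlePacket(pkt *packet)
}

// fanOut mirrors the delivery loop in transportDemuxer.deliverPacket: clone
// for all but the last endpoint, then hand the original to the last one.
func fanOut(eps []endpoint, pkt *packet) bool {
	if len(eps) == 0 {
		return false
	}
	for _, ep := range eps[:len(eps)-1] {
		ep.handlePacket(pkt.clone())
	}
	eps[len(eps)-1].handlePacket(pkt)
	return true
}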
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "fmt" "strings" "gvisor.dev/gvisor/pkg/abi" ) // Constants for open(2). const ( O_ACCMODE = 000000003 O_RDONLY = 000000000 O_WRONLY = 000000001 O_RDWR = 000000002 O_CREAT = 000000100 O_EXCL = 000000200 O_NOCTTY = 000000400 O_TRUNC = 000001000 O_APPEND = 000002000 O_NONBLOCK = 000004000 O_DSYNC = 000010000 O_ASYNC = 000020000 O_NOATIME = 001000000 O_CLOEXEC = 002000000 O_SYNC = 004000000 // __O_SYNC in Linux O_PATH = 010000000 O_TMPFILE = 020000000 // __O_TMPFILE in Linux ) // Constants for fstatat(2). const ( AT_SYMLINK_NOFOLLOW = 0x100 ) // Constants for mount(2). const ( MS_RDONLY = 0x1 MS_NOSUID = 0x2 MS_NODEV = 0x4 MS_NOEXEC = 0x8 MS_SYNCHRONOUS = 0x10 MS_REMOUNT = 0x20 MS_MANDLOCK = 0x40 MS_DIRSYNC = 0x80 MS_NOATIME = 0x400 MS_NODIRATIME = 0x800 MS_BIND = 0x1000 MS_MOVE = 0x2000 MS_REC = 0x4000 MS_POSIXACL = 0x10000 MS_UNBINDABLE = 0x20000 MS_PRIVATE = 0x40000 MS_SLAVE = 0x80000 MS_SHARED = 0x100000 MS_RELATIME = 0x200000 MS_KERNMOUNT = 0x400000 MS_I_VERSION = 0x800000 MS_STRICTATIME = 0x1000000 MS_MGC_VAL = 0xC0ED0000 MS_MGC_MSK = 0xffff0000 ) // Constants for umount2(2). const ( MNT_FORCE = 0x1 MNT_DETACH = 0x2 MNT_EXPIRE = 0x4 UMOUNT_NOFOLLOW = 0x8 ) // Constants for unlinkat(2). const ( AT_REMOVEDIR = 0x200 ) // Constants for linkat(2) and fchownat(2). const ( AT_SYMLINK_FOLLOW = 0x400 AT_EMPTY_PATH = 0x1000 ) // Constants for all file-related ...at(2) syscalls. const ( AT_FDCWD = -100 ) // Special values for the ns field in utimensat(2).
const ( UTIME_NOW = ((1 << 30) - 1) UTIME_OMIT = ((1 << 30) - 2) ) // MaxSymlinkTraversals is the maximum number of links that will be followed by // the kernel to resolve a symlink. const MaxSymlinkTraversals = 40 // Constants for flock(2). const ( LOCK_SH = 1 // shared lock LOCK_EX = 2 // exclusive lock LOCK_NB = 4 // or'd with one of the above to prevent blocking LOCK_UN = 8 // remove lock ) // Values for mode_t. const ( S_IFMT = 0170000 S_IFSOCK = 0140000 S_IFLNK = 0120000 S_IFREG = 0100000 S_IFBLK = 060000 S_IFDIR = 040000 S_IFCHR = 020000 S_IFIFO = 010000 FileTypeMask = S_IFMT ModeSocket = S_IFSOCK ModeSymlink = S_IFLNK ModeRegular = S_IFREG ModeBlockDevice = S_IFBLK ModeDirectory = S_IFDIR ModeCharacterDevice = S_IFCHR ModeNamedPipe = S_IFIFO S_ISUID = 04000 S_ISGID = 02000 S_ISVTX = 01000 ModeSetUID = S_ISUID ModeSetGID = S_ISGID ModeSticky = S_ISVTX ModeUserAll = 0700 ModeUserRead = 0400 ModeUserWrite = 0200 ModeUserExec = 0100 ModeGroupAll = 0070 ModeGroupRead = 0040 ModeGroupWrite = 0020 ModeGroupExec = 0010 ModeOtherAll = 0007 ModeOtherRead = 0004 ModeOtherWrite = 0002 ModeOtherExec = 0001 PermissionsMask = 0777 ) // Values for linux_dirent64.d_type. const ( DT_UNKNOWN = 0 DT_FIFO = 1 DT_CHR = 2 DT_DIR = 4 DT_BLK = 6 DT_REG = 8 DT_LNK = 10 DT_SOCK = 12 DT_WHT = 14 ) // DirentType are the friendly strings for linux_dirent64.d_type. var DirentType = abi.ValueSet{ DT_UNKNOWN: "DT_UNKNOWN", DT_FIFO: "DT_FIFO", DT_CHR: "DT_CHR", DT_DIR: "DT_DIR", DT_BLK: "DT_BLK", DT_REG: "DT_REG", DT_LNK: "DT_LNK", DT_SOCK: "DT_SOCK", DT_WHT: "DT_WHT", } // Values for preadv2/pwritev2. const ( // NOTE(b/120162627): gVisor does not implement the RWF_HIPRI feature, but // the flag is accepted as a valid flag argument for preadv2/pwritev2 and // silently ignored. RWF_HIPRI = 0x00000001 RWF_DSYNC = 0x00000002 RWF_SYNC = 0x00000004 RWF_VALID = RWF_HIPRI | RWF_DSYNC | RWF_SYNC ) // SizeOfStat is the size of a Stat struct. var SizeOfStat = (*Stat)(nil).SizeBytes() // Flags for statx. const ( AT_STATX_SYNC_TYPE = 0x6000 AT_STATX_SYNC_AS_STAT = 0x0000 AT_STATX_FORCE_SYNC = 0x2000 AT_STATX_DONT_SYNC = 0x4000 ) // Mask values for statx. const ( STATX_TYPE = 0x00000001 STATX_MODE = 0x00000002 STATX_NLINK = 0x00000004 STATX_UID = 0x00000008 STATX_GID = 0x00000010 STATX_ATIME = 0x00000020 STATX_MTIME = 0x00000040 STATX_CTIME = 0x00000080 STATX_INO = 0x00000100 STATX_SIZE = 0x00000200 STATX_BLOCKS = 0x00000400 STATX_BASIC_STATS = 0x000007ff STATX_BTIME = 0x00000800 STATX_ALL = 0x00000fff STATX__RESERVED = 0x80000000 ) // Bitmasks for Statx.Attributes and Statx.AttributesMask, from // include/uapi/linux/stat.h. const ( STATX_ATTR_COMPRESSED = 0x00000004 STATX_ATTR_IMMUTABLE = 0x00000010 STATX_ATTR_APPEND = 0x00000020 STATX_ATTR_NODUMP = 0x00000040 STATX_ATTR_ENCRYPTED = 0x00000800 STATX_ATTR_AUTOMOUNT = 0x00001000 ) // Statx represents struct statx. // // +marshal type Statx struct { Mask uint32 Blksize uint32 Attributes uint64 Nlink uint32 UID uint32 GID uint32 Mode uint16 _ uint16 Ino uint64 Size uint64 Blocks uint64 AttributesMask uint64 Atime StatxTimestamp Btime StatxTimestamp Ctime StatxTimestamp Mtime StatxTimestamp RdevMajor uint32 RdevMinor uint32 DevMajor uint32 DevMinor uint32 } // SizeOfStatx is the size of a Statx struct. var SizeOfStatx = (*Statx)(nil).SizeBytes() // FileMode represents a mode_t. type FileMode uint16 // Permissions returns just the permission bits. func (m FileMode) Permissions() FileMode { return m & PermissionsMask } // FileType returns just the file type bits. 
func (m FileMode) FileType() FileMode { return m & FileTypeMask } // ExtraBits returns everything but the file type and permission bits. func (m FileMode) ExtraBits() FileMode { return m &^ (PermissionsMask | FileTypeMask) } // IsDir returns true if file type represents a directory. func (m FileMode) IsDir() bool { return m.FileType() == S_IFDIR } // String returns a string representation of m. func (m FileMode) String() string { var s []string if ft := m.FileType(); ft != 0 { s = append(s, fileType.Parse(uint64(ft))) } if eb := m.ExtraBits(); eb != 0 { s = append(s, modeExtraBits.Parse(uint64(eb))) } s = append(s, fmt.Sprintf("0o%o", m.Permissions())) return strings.Join(s, "|") } // DirentType maps file types to dirent types appropriate for (struct // dirent)::d_type. func (m FileMode) DirentType() uint8 { switch m.FileType() { case ModeSocket: return DT_SOCK case ModeSymlink: return DT_LNK case ModeRegular: return DT_REG case ModeBlockDevice: return DT_BLK case ModeDirectory: return DT_DIR case ModeCharacterDevice: return DT_CHR case ModeNamedPipe: return DT_FIFO default: return DT_UNKNOWN } } var modeExtraBits = abi.FlagSet{ { Flag: ModeSetUID, Name: "S_ISUID", }, { Flag: ModeSetGID, Name: "S_ISGID", }, { Flag: ModeSticky, Name: "S_ISVTX", }, } var fileType = abi.ValueSet{ ModeSocket: "S_IFSOCK", ModeSymlink: "S_IFLNK", ModeRegular: "S_IFREG", ModeBlockDevice: "S_IFBLK", ModeDirectory: "S_IFDIR", ModeCharacterDevice: "S_IFCHR", ModeNamedPipe: "S_IFIFO", } // Constants for memfd_create(2). Source: include/uapi/linux/memfd.h const ( MFD_CLOEXEC = 0x0001 MFD_ALLOW_SEALING = 0x0002 ) // Constants related to file seals. Source: include/uapi/{asm-generic,linux}/fcntl.h const ( F_LINUX_SPECIFIC_BASE = 1024 F_ADD_SEALS = F_LINUX_SPECIFIC_BASE + 9 F_GET_SEALS = F_LINUX_SPECIFIC_BASE + 10 F_SEAL_SEAL = 0x0001 // Prevent further seals from being set. F_SEAL_SHRINK = 0x0002 // Prevent file from shrinking. F_SEAL_GROW = 0x0004 // Prevent file from growing. F_SEAL_WRITE = 0x0008 // Prevent writes. ) // Constants related to fallocate(2). Source: include/uapi/linux/falloc.h const ( FALLOC_FL_KEEP_SIZE = 0x01 FALLOC_FL_PUNCH_HOLE = 0x02 FALLOC_FL_NO_HIDE_STALE = 0x04 FALLOC_FL_COLLAPSE_RANGE = 0x08 FALLOC_FL_ZERO_RANGE = 0x10 FALLOC_FL_INSERT_RANGE = 0x20 FALLOC_FL_UNSHARE_RANGE = 0x40 )
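// A short, hypothetical usage sketch for the FileMode helpers and mode_t
// constants defined above (the import path assumes this package's usual
// location in the gVisor tree). It decomposes a typical regular-file mode,
// 0100644, into its type, permission, and dirent bits.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
)

func main() {
	m := linux.FileMode(0100644) // S_IFREG | rw-r--r--

	fmt.Println(m.IsDir())                      // false
	fmt.Println(m.FileType() == linux.S_IFREG)  // true
	fmt.Printf("perms: %#o\n", m.Permissions()) // perms: 0644
	fmt.Println(m.DirentType() == linux.DT_REG) // true
	fmt.Println(m)                              // S_IFREG|0o644
}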
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) const ( // As NGROUPS_MAX in include/uapi/linux/limits.h. maxNGroups = 65536 ) // Getuid implements the Linux syscall getuid. func Getuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { c := t.Credentials() ruid := c.RealKUID.In(c.UserNamespace).OrOverflow() return uintptr(ruid), nil, nil } // Geteuid implements the Linux syscall geteuid. func Geteuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { c := t.Credentials() euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow() return uintptr(euid), nil, nil } // Getresuid implements the Linux syscall getresuid. func Getresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { ruidAddr := args[0].Pointer() euidAddr := args[1].Pointer() suidAddr := args[2].Pointer() c := t.Credentials() ruid := c.RealKUID.In(c.UserNamespace).OrOverflow() euid := c.EffectiveKUID.In(c.UserNamespace).OrOverflow() suid := c.SavedKUID.In(c.UserNamespace).OrOverflow() if _, err := ruid.CopyOut(t, ruidAddr); err != nil { return 0, nil, err } if _, err := euid.CopyOut(t, euidAddr); err != nil { return 0, nil, err } if _, err := suid.CopyOut(t, suidAddr); err != nil { return 0, nil, err } return 0, nil, nil } // Getgid implements the Linux syscall getgid. func Getgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { c := t.Credentials() rgid := c.RealKGID.In(c.UserNamespace).OrOverflow() return uintptr(rgid), nil, nil } // Getegid implements the Linux syscall getegid. func Getegid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { c := t.Credentials() egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow() return uintptr(egid), nil, nil } // Getresgid implements the Linux syscall getresgid.
func Getresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { rgidAddr := args[0].Pointer() egidAddr := args[1].Pointer() sgidAddr := args[2].Pointer() c := t.Credentials() rgid := c.RealKGID.In(c.UserNamespace).OrOverflow() egid := c.EffectiveKGID.In(c.UserNamespace).OrOverflow() sgid := c.SavedKGID.In(c.UserNamespace).OrOverflow() if _, err := rgid.CopyOut(t, rgidAddr); err != nil { return 0, nil, err } if _, err := egid.CopyOut(t, egidAddr); err != nil { return 0, nil, err } if _, err := sgid.CopyOut(t, sgidAddr); err != nil { return 0, nil, err } return 0, nil, nil } // Setuid implements the Linux syscall setuid. func Setuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { uid := auth.UID(args[0].Int()) return 0, nil, t.SetUID(uid) } // Setreuid implements the Linux syscall setreuid. func Setreuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { ruid := auth.UID(args[0].Int()) euid := auth.UID(args[1].Int()) return 0, nil, t.SetREUID(ruid, euid) } // Setresuid implements the Linux syscall setresuid. func Setresuid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { ruid := auth.UID(args[0].Int()) euid := auth.UID(args[1].Int()) suid := auth.UID(args[2].Int()) return 0, nil, t.SetRESUID(ruid, euid, suid) } // Setgid implements the Linux syscall setgid. func Setgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { gid := auth.GID(args[0].Int()) return 0, nil, t.SetGID(gid) } // Setregid implements the Linux syscall setregid. func Setregid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { rgid := auth.GID(args[0].Int()) egid := auth.GID(args[1].Int()) return 0, nil, t.SetREGID(rgid, egid) } // Setresgid implements the Linux syscall setresgid. func Setresgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { rgid := auth.GID(args[0].Int()) egid := auth.GID(args[1].Int()) sgid := auth.GID(args[2].Int()) return 0, nil, t.SetRESGID(rgid, egid, sgid) } // Getgroups implements the Linux syscall getgroups. func Getgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { size := int(args[0].Int()) if size < 0 { return 0, nil, linuxerr.EINVAL } kgids := t.Credentials().ExtraKGIDs // "If size is zero, list is not modified, but the total number of // supplementary group IDs for the process is returned." - getgroups(2) if size == 0 { return uintptr(len(kgids)), nil, nil } if size < len(kgids) { return 0, nil, linuxerr.EINVAL } gids := make([]auth.GID, len(kgids)) for i, kgid := range kgids { gids[i] = kgid.In(t.UserNamespace()).OrOverflow() } if _, err := auth.CopyGIDSliceOut(t, args[1].Pointer(), gids); err != nil { return 0, nil, err } return uintptr(len(gids)), nil, nil } // Setgroups implements the Linux syscall setgroups. func Setgroups(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { size := args[0].Int() if size < 0 || size > maxNGroups { return 0, nil, linuxerr.EINVAL } if size == 0 { return 0, nil, t.SetExtraGIDs(nil) } gids := make([]auth.GID, size) if _, err := auth.CopyGIDSliceIn(t, args[1].Pointer(), gids); err != nil { return 0, nil, err } return 0, nil, t.SetExtraGIDs(gids) }
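// A self-contained sketch (hypothetical helper, simplified types) of the
// getgroups(2) size contract that Getgroups implements above: size 0 reports
// the count without copying, a positive size smaller than the list is
// EINVAL, and otherwise the GIDs are copied out. Callers typically probe
// with size == 0, allocate, then call again.
package groupsketch

import "errors"

var errInvalid = errors.New("EINVAL")

// getgroups mirrors the size handling in Getgroups above.
func getgroups(size int, out []uint32, groups []uint32) (int, error) {
	switch {
	case size < 0:
		return 0, errInvalid
	case size == 0:
		// The list is not modified; only the count is returned.
		return len(groups), nil
	case size < len(groups):
		return 0, errInvalid
	}
	return copy(out, groups), nil
}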
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package p9 import ( "runtime" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fdchannel" "gvisor.dev/gvisor/pkg/flipcall" "gvisor.dev/gvisor/pkg/log" ) // channelsPerClient is the number of channels to create per client. // // While the client and server will generally agree on this number, in reality // it's completely up to the server. We simply define a minimum of 2, and a // maximum of 4, and select the number of available processors as a tie-breaker. // Note that we don't want the number of channels to be too large, because each // will account for channelSize memory used, which can be large. var channelsPerClient = func() int { n := runtime.NumCPU() if n < 2 { return 2 } if n > 4 { return 4 } return n }() // channelSize is the channel size to create. // // We simply ensure that this is larger than the largest possible message size, // plus the flipcall packet header, plus the two bytes we write below. const channelSize = int(2 + flipcall.PacketHeaderBytes + 2 + maximumLength) // channel is a fast IPC channel. // // The same object is used by both the server and client implementations. In // general, the client will use only the send and recv methods. type channel struct { desc flipcall.PacketWindowDescriptor data flipcall.Endpoint fds fdchannel.Endpoint buf buffer // -- client only -- connected bool active bool // -- server only -- client *fd.FD done chan struct{} } // reset resets the channel buffer. func (ch *channel) reset(sz uint32) { ch.buf.data = ch.data.Data()[:sz] } // service services the channel. func (ch *channel) service(cs *connState) error { rsz, err := ch.data.RecvFirst() if err != nil { return err } for rsz > 0 { m, err := ch.recv(nil, rsz) if err != nil { return err } r := cs.handle(m) msgRegistry.put(m) rsz, err = ch.send(r) if err != nil { return err } } return nil // Done. } // Shutdown shuts down the channel. // // This must be called before Close. func (ch *channel) Shutdown() { ch.data.Shutdown() } // Close closes the channel.
// // This must only be called once, and cannot return an error. Note that // synchronization for this method is provided at a high level, depending on // whether it is the client or server. This cannot be called while there are // active callers in either service or sendRecv. // // Precondition: the channel should be shut down. func (ch *channel) Close() error { // Close all backing transports. ch.fds.Destroy() ch.data.Destroy() if ch.client != nil { ch.client.Close() } return nil } // send sends the given message. // // The return value is the size of the received response. Note that in the // server case, this is the size of the next request. func (ch *channel) send(m message) (uint32, error) { if log.IsLogging(log.Debug) { log.Debugf("send [channel @%p] %s", ch, m.String()) } // Send any file payload. sentFD := false if filer, ok := m.(filer); ok { if f := filer.FilePayload(); f != nil { if err := ch.fds.SendFD(f.FD()); err != nil { return 0, err } f.Close() // Per sendRecvLegacy. sentFD = true // To mark below. } } // Encode the message. // // Note that IPC itself encodes the length of messages, so we don't // need to encode a standard 9P header. We write only the message type. ch.reset(0) ch.buf.WriteMsgType(m.Type()) if sentFD { ch.buf.Write8(1) // Incoming FD. } else { ch.buf.Write8(0) // No incoming FD. } m.encode(&ch.buf) ssz := uint32(len(ch.buf.data)) // Updated below. // Is there a payload? if payloader, ok := m.(payloader); ok { p := payloader.Payload() copy(ch.data.Data()[ssz:], p) ssz += uint32(len(p)) } // Perform the one-shot communication. return ch.data.SendRecv(ssz) } // recv decodes a message that exists on the channel. // // If the passed r is non-nil, then the type must match or an error will be // generated. If the passed r is nil, then a new message will be created and // returned. func (ch *channel) recv(r message, rsz uint32) (message, error) { // Decode the response from the inline buffer. ch.reset(rsz) t := ch.buf.ReadMsgType() hasFD := ch.buf.Read8() != 0 if t == MsgRlerror { // Change the message type. We check for this special case // after decoding below, and transform into an error. r = &Rlerror{} } else if r == nil { nr, err := msgRegistry.get(0, t) if err != nil { return nil, err } r = nr // New message. } else if t != r.Type() { // Not an error and not the expected response; propagate. return nil, &ErrBadResponse{Got: t, Want: r.Type()} } // Is there a payload? Copy from the latter portion. if payloader, ok := r.(payloader); ok { fs := payloader.FixedSize() p := payloader.Payload() payloadData := ch.buf.data[fs:] if len(p) < len(payloadData) { p = make([]byte, len(payloadData)) copy(p, payloadData) payloader.SetPayload(p) } else if n := copy(p, payloadData); n < len(p) { payloader.SetPayload(p[:n]) } ch.buf.data = ch.buf.data[:fs] } r.decode(&ch.buf) if ch.buf.isOverrun() { // Nothing valid was available. log.Debugf("recv [got %d bytes, needed more]", rsz) return nil, ErrNoValidMessage } // Read any FD result. if hasFD { if rfd, err := ch.fds.RecvFDNonblock(); err == nil { f := fd.New(rfd) if filer, ok := r.(filer); ok { // Set the payload. filer.SetFilePayload(f) } else { // Don't want the FD. f.Close() } } else { // The header bit was set but nothing came in. log.Warningf("expected FD, got err: %v", err) } } // Log a message. if log.IsLogging(log.Debug) { log.Debugf("recv [channel @%p] %s", ch, r.String()) } // Convert errors appropriately; see above. if rlerr, ok := r.(*Rlerror); ok { return r, unix.Errno(rlerr.Error) } return r, nil }
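// A stand-alone sketch (hypothetical types) of the inline framing that
// channel.send and channel.recv use above: flipcall already carries the
// message length, so only a one-byte message type and a one-byte "FD
// follows" flag precede the 9P message body.
package framesketch

type msgType uint8 // stand-in for the 9P message type

// frame prepends the two header bytes that send writes.
func frame(t msgType, hasFD bool, body []byte) []byte {
	buf := make([]byte, 0, 2+len(body))
	buf = append(buf, byte(t))
	if hasFD {
		buf = append(buf, 1)
	} else {
		buf = append(buf, 0)
	}
	return append(buf, body...)
}

// parse undoes frame, mirroring recv's header reads.
func parse(data []byte) (t msgType, hasFD bool, body []byte, ok bool) {
	if len(data) < 2 {
		return 0, false, nil, false
	}
	return msgType(data[0]), data[1] != 0, data[2:], true
}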
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "bytes" "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // The following design pattern is strongly recommended for filesystem // implementations to adopt: // - Have a local fileDescription struct (containing FileDescription) which // embeds FileDescriptionDefaultImpl and overrides the default methods // which are common to all fd implementations for that filesystem like // StatusFlags, SetStatusFlags, Stat, SetStat, StatFS, etc.
// - This should be embedded in all file description implementations as the // first field by value. // - Directory FDs would also embed DirectoryFileDescriptionDefaultImpl. // FileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl to obtain implementations of many FileDescriptionImpl // methods with default behavior analogous to Linux's. // // +stateify savable type FileDescriptionDefaultImpl struct{} // OnClose implements FileDescriptionImpl.OnClose analogously to // file_operations::flush == NULL in Linux. func (FileDescriptionDefaultImpl) OnClose(ctx context.Context) error { return nil } // StatFS implements FileDescriptionImpl.StatFS analogously to // super_operations::statfs == NULL in Linux. func (FileDescriptionDefaultImpl) StatFS(ctx context.Context) (linux.Statfs, error) { return linux.Statfs{}, syserror.ENOSYS } // Allocate implements FileDescriptionImpl.Allocate analogously to // fallocate called on an invalid type of file in Linux. // // Note that directories can rely on this implementation even though they // should technically return EISDIR. Allocate should never be called for a // directory, because it requires a writable fd. func (FileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { return linuxerr.ENODEV } // Readiness implements waiter.Waitable.Readiness analogously to // file_operations::poll == NULL in Linux. func (FileDescriptionDefaultImpl) Readiness(mask waiter.EventMask) waiter.EventMask { // include/linux/poll.h:vfs_poll() => DEFAULT_POLLMASK return waiter.ReadableEvents | waiter.WritableEvents } // EventRegister implements waiter.Waitable.EventRegister analogously to // file_operations::poll == NULL in Linux. func (FileDescriptionDefaultImpl) EventRegister(e *waiter.Entry, mask waiter.EventMask) { } // EventUnregister implements waiter.Waitable.EventUnregister analogously to // file_operations::poll == NULL in Linux. func (FileDescriptionDefaultImpl) EventUnregister(e *waiter.Entry) { } // PRead implements FileDescriptionImpl.PRead analogously to // file_operations::read == file_operations::read_iter == NULL in Linux. func (FileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, linuxerr.EINVAL } // Read implements FileDescriptionImpl.Read analogously to // file_operations::read == file_operations::read_iter == NULL in Linux. func (FileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { return 0, linuxerr.EINVAL } // PWrite implements FileDescriptionImpl.PWrite analogously to // file_operations::write == file_operations::write_iter == NULL in Linux. func (FileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { return 0, linuxerr.EINVAL } // Write implements FileDescriptionImpl.Write analogously to // file_operations::write == file_operations::write_iter == NULL in Linux. func (FileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { return 0, linuxerr.EINVAL } // IterDirents implements FileDescriptionImpl.IterDirents analogously to // file_operations::iterate == file_operations::iterate_shared == NULL in // Linux. 
func (FileDescriptionDefaultImpl) IterDirents(ctx context.Context, cb IterDirentsCallback) error { return linuxerr.ENOTDIR } // Seek implements FileDescriptionImpl.Seek analogously to // file_operations::llseek == NULL in Linux. func (FileDescriptionDefaultImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { return 0, linuxerr.ESPIPE } // Sync implements FileDescriptionImpl.Sync analogously to // file_operations::fsync == NULL in Linux. func (FileDescriptionDefaultImpl) Sync(ctx context.Context) error { return linuxerr.EINVAL } // ConfigureMMap implements FileDescriptionImpl.ConfigureMMap analogously to // file_operations::mmap == NULL in Linux. func (FileDescriptionDefaultImpl) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return linuxerr.ENODEV } // Ioctl implements FileDescriptionImpl.Ioctl analogously to // file_operations::unlocked_ioctl == NULL in Linux. func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { return 0, linuxerr.ENOTTY } // ListXattr implements FileDescriptionImpl.ListXattr analogously to // inode_operations::listxattr == NULL in Linux. func (FileDescriptionDefaultImpl) ListXattr(ctx context.Context, size uint64) ([]string, error) { // This isn't exactly accurate; see FileDescription.ListXattr. return nil, linuxerr.ENOTSUP } // GetXattr implements FileDescriptionImpl.GetXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. func (FileDescriptionDefaultImpl) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) { return "", linuxerr.ENOTSUP } // SetXattr implements FileDescriptionImpl.SetXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. func (FileDescriptionDefaultImpl) SetXattr(ctx context.Context, opts SetXattrOptions) error { return linuxerr.ENOTSUP } // RemoveXattr implements FileDescriptionImpl.RemoveXattr analogously to // inode::i_opflags & IOP_XATTR == 0 in Linux. func (FileDescriptionDefaultImpl) RemoveXattr(ctx context.Context, name string) error { return linuxerr.ENOTSUP } // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of // FileDescriptionImpl that always represent directories to obtain // implementations of non-directory I/O methods that return EISDIR. // // +stateify savable type DirectoryFileDescriptionDefaultImpl struct{} // Allocate implements FileDescriptionImpl.Allocate. func (DirectoryFileDescriptionDefaultImpl) Allocate(ctx context.Context, mode, offset, length uint64) error { return syserror.EISDIR } // PRead implements FileDescriptionImpl.PRead. func (DirectoryFileDescriptionDefaultImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, syserror.EISDIR } // Read implements FileDescriptionImpl.Read. func (DirectoryFileDescriptionDefaultImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { return 0, syserror.EISDIR } // PWrite implements FileDescriptionImpl.PWrite. func (DirectoryFileDescriptionDefaultImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { return 0, syserror.EISDIR } // Write implements FileDescriptionImpl.Write.
func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { return 0, syserror.EISDIR } // DentryMetadataFileDescriptionImpl may be embedded by implementations of // FileDescriptionImpl for which FileDescriptionOptions.UseDentryMetadata is // true to obtain implementations of Stat and SetStat that panic. // // +stateify savable type DentryMetadataFileDescriptionImpl struct{} // Stat implements FileDescriptionImpl.Stat. func (DentryMetadataFileDescriptionImpl) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { panic("illegal call to DentryMetadataFileDescriptionImpl.Stat") } // SetStat implements FileDescriptionImpl.SetStat. func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetStatOptions) error { panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat") } // DynamicBytesSource represents a data source for a // DynamicBytesFileDescriptionImpl. // // +stateify savable type DynamicBytesSource interface { // Generate writes the file's contents to buf. Generate(ctx context.Context, buf *bytes.Buffer) error } // StaticData implements DynamicBytesSource over a static string. // // +stateify savable type StaticData struct { Data string } // Generate implements DynamicBytesSource. func (s *StaticData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString(s.Data) return nil } // WritableDynamicBytesSource extends DynamicBytesSource to allow writes to the // underlying source. // // TODO(b/179825241): Make utility for integer-based writable files. type WritableDynamicBytesSource interface { DynamicBytesSource // Write sends writes to the source. Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) } // DynamicBytesFileDescriptionImpl may be embedded by implementations of // FileDescriptionImpl that represent read-only regular files whose contents // are backed by a bytes.Buffer that is regenerated when necessary, consistent // with Linux's fs/seq_file.c:single_open(). // // If data additionally implements WritableDynamicBytesSource, writes are // dispatched to the implementer. The source data is not automatically modified. // // DynamicBytesFileDescriptionImpl.SetDataSource() must be called before first // use. // // +stateify savable type DynamicBytesFileDescriptionImpl struct { data DynamicBytesSource // immutable mu sync.Mutex `state:"nosave"` // protects the following fields buf bytes.Buffer `state:".([]byte)"` off int64 lastRead int64 // offset at which the last Read, PRead, or Seek ended } func (fd *DynamicBytesFileDescriptionImpl) saveBuf() []byte { return fd.buf.Bytes() } func (fd *DynamicBytesFileDescriptionImpl) loadBuf(p []byte) { fd.buf.Write(p) } // SetDataSource must be called exactly once on fd before first use. func (fd *DynamicBytesFileDescriptionImpl) SetDataSource(data DynamicBytesSource) { fd.data = data } // Preconditions: fd.mu must be locked. func (fd *DynamicBytesFileDescriptionImpl) preadLocked(ctx context.Context, dst usermem.IOSequence, offset int64, opts *ReadOptions) (int64, error) { // Regenerate the buffer if it's empty, or before pread() at a new offset. // Compare fs/seq_file.c:seq_read() => traverse(). switch { case offset != fd.lastRead: fd.buf.Reset() fallthrough case fd.buf.Len() == 0: if err := fd.data.Generate(ctx, &fd.buf); err != nil { fd.buf.Reset() // fd.off is not updated in this case. 
fd.lastRead = 0 return 0, err } } bs := fd.buf.Bytes() if offset >= int64(len(bs)) { return 0, io.EOF } n, err := dst.CopyOut(ctx, bs[offset:]) fd.lastRead = offset + int64(n) return int64(n), err } // PRead implements FileDescriptionImpl.PRead. func (fd *DynamicBytesFileDescriptionImpl) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { fd.mu.Lock() n, err := fd.preadLocked(ctx, dst, offset, &opts) fd.mu.Unlock() return n, err } // Read implements FileDescriptionImpl.Read. func (fd *DynamicBytesFileDescriptionImpl) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { fd.mu.Lock() n, err := fd.preadLocked(ctx, dst, fd.off, &opts) fd.off += n fd.mu.Unlock() return n, err } // Seek implements FileDescriptionImpl.Seek. func (fd *DynamicBytesFileDescriptionImpl) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() switch whence { case linux.SEEK_SET: // Use offset as given. case linux.SEEK_CUR: offset += fd.off default: // fs/seq_file:seq_lseek() rejects SEEK_END etc. return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } if offset != fd.lastRead { // Regenerate the file's contents immediately. Compare // fs/seq_file.c:seq_lseek() => traverse(). fd.buf.Reset() if err := fd.data.Generate(ctx, &fd.buf); err != nil { fd.buf.Reset() fd.off = 0 fd.lastRead = 0 return 0, err } fd.lastRead = offset } fd.off = offset return offset, nil } // Preconditions: fd.mu must be locked. func (fd *DynamicBytesFileDescriptionImpl) pwriteLocked(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 { return 0, linuxerr.EOPNOTSUPP } limit, err := CheckLimit(ctx, offset, src.NumBytes()) if err != nil { return 0, err } src = src.TakeFirst64(limit) writable, ok := fd.data.(WritableDynamicBytesSource) if !ok { return 0, syserror.EIO } n, err := writable.Write(ctx, src, offset) if err != nil { return 0, err } // Invalidate cached data that might exist prior to this call. fd.buf.Reset() return n, nil } // PWrite implements FileDescriptionImpl.PWrite. func (fd *DynamicBytesFileDescriptionImpl) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { fd.mu.Lock() n, err := fd.pwriteLocked(ctx, src, offset, opts) fd.mu.Unlock() return n, err } // Write implements FileDescriptionImpl.Write. func (fd *DynamicBytesFileDescriptionImpl) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { fd.mu.Lock() n, err := fd.pwriteLocked(ctx, src, fd.off, opts) fd.off += n fd.mu.Unlock() return n, err } // GenericConfigureMMap may be used by most implementations of // FileDescriptionImpl.ConfigureMMap. func GenericConfigureMMap(fd *FileDescription, m memmap.Mappable, opts *memmap.MMapOpts) error { opts.Mappable = m opts.MappingIdentity = fd fd.IncRef() return nil } // LockFD may be used by most implementations of FileDescriptionImpl.Lock* // functions. Caller must call Init(). // // +stateify savable type LockFD struct { locks *FileLocks } // SupportsLocks implements FileDescriptionImpl.SupportsLocks. func (LockFD) SupportsLocks() bool { return true } // Init initializes fd with FileLocks to use. func (fd *LockFD) Init(locks *FileLocks) { fd.locks = locks } // Locks returns the locks associated with this file. 
func (fd *LockFD) Locks() *FileLocks { return fd.locks } // LockBSD implements FileDescriptionImpl.LockBSD. func (fd *LockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { return fd.locks.LockBSD(ctx, uid, ownerPID, t, block) } // UnlockBSD implements FileDescriptionImpl.UnlockBSD. func (fd *LockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { fd.locks.UnlockBSD(uid) return nil } // LockPOSIX implements FileDescriptionImpl.LockPOSIX. func (fd *LockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { return fd.locks.LockPOSIX(ctx, uid, ownerPID, t, r, block) } // UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. func (fd *LockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { return fd.locks.UnlockPOSIX(ctx, uid, r) } // TestPOSIX implements FileDescriptionImpl.TestPOSIX. func (fd *LockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { return fd.locks.TestPOSIX(ctx, uid, t, r) } // NoLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface // returning ENOLCK. // // +stateify savable type NoLockFD struct{} // SupportsLocks implements FileDescriptionImpl.SupportsLocks. func (NoLockFD) SupportsLocks() bool { return false } // LockBSD implements FileDescriptionImpl.LockBSD. func (NoLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { return linuxerr.ENOLCK } // UnlockBSD implements FileDescriptionImpl.UnlockBSD. func (NoLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { return linuxerr.ENOLCK } // LockPOSIX implements FileDescriptionImpl.LockPOSIX. func (NoLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { return linuxerr.ENOLCK } // UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. func (NoLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { return linuxerr.ENOLCK } // TestPOSIX implements FileDescriptionImpl.TestPOSIX. func (NoLockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { return linux.Flock{}, linuxerr.ENOLCK } // BadLockFD implements Lock*/Unlock* portion of FileDescriptionImpl interface // returning EBADF. // // +stateify savable type BadLockFD struct{} // SupportsLocks implements FileDescriptionImpl.SupportsLocks. func (BadLockFD) SupportsLocks() bool { return false } // LockBSD implements FileDescriptionImpl.LockBSD. func (BadLockFD) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { return linuxerr.EBADF } // UnlockBSD implements FileDescriptionImpl.UnlockBSD. func (BadLockFD) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error { return linuxerr.EBADF } // LockPOSIX implements FileDescriptionImpl.LockPOSIX. func (BadLockFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { return linuxerr.EBADF } // UnlockPOSIX implements FileDescriptionImpl.UnlockPOSIX. func (BadLockFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { return linuxerr.EBADF } // TestPOSIX implements FileDescriptionImpl.TestPOSIX. 
func (BadLockFD) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { return linux.Flock{}, linuxerr.EBADF }
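// A minimal sketch of the design pattern recommended at the top of this
// file: a filesystem-local FD type that embeds the default impls by value
// and overrides only what it needs. staticFD and newStaticFD are
// hypothetical; the explicit Read/PRead/Seek forwards are needed because
// the two embedded defaults would otherwise promote ambiguous methods.
package fdsketch

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/usermem"
)

// staticFD is a read-only file whose contents come from a static string;
// everything it does not override falls back to the defaults above.
type staticFD struct {
	vfs.FileDescriptionDefaultImpl
	vfs.DynamicBytesFileDescriptionImpl
	vfs.NoLockFD
}

// newStaticFD wires a static string up as the file's contents.
func newStaticFD(data string) *staticFD {
	fd := &staticFD{}
	fd.SetDataSource(&vfs.StaticData{Data: data})
	return fd
}

// Read forwards to the DynamicBytes implementation.
func (fd *staticFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts)
}

// PRead forwards to the DynamicBytes implementation.
func (fd *staticFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts)
}

// Seek forwards to the DynamicBytes implementation.
func (fd *staticFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence)
}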
// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package arp import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) var _ stack.NetworkEndpointStats = (*Stats)(nil) // Stats holds statistics related to ARP. type Stats struct { // ARP holds ARP statistics. ARP tcpip.ARPStats } // IsNetworkEndpointStats implements stack.NetworkEndpointStats. func (*Stats) IsNetworkEndpointStats() {} type sharedStats struct { localStats Stats arp multiCounterARPStats } // LINT.IfChange(multiCounterARPStats) type multiCounterARPStats struct { packetsReceived tcpip.MultiCounterStat disabledPacketsReceived tcpip.MultiCounterStat malformedPacketsReceived tcpip.MultiCounterStat requestsReceived tcpip.MultiCounterStat requestsReceivedUnknownTargetAddress tcpip.MultiCounterStat outgoingRequestInterfaceHasNoLocalAddressErrors tcpip.MultiCounterStat outgoingRequestBadLocalAddressErrors tcpip.MultiCounterStat outgoingRequestsDropped tcpip.MultiCounterStat outgoingRequestsSent tcpip.MultiCounterStat repliesReceived tcpip.MultiCounterStat outgoingRepliesDropped tcpip.MultiCounterStat outgoingRepliesSent tcpip.MultiCounterStat } func (m *multiCounterARPStats) init(a, b *tcpip.ARPStats) { m.packetsReceived.Init(a.PacketsReceived, b.PacketsReceived) m.disabledPacketsReceived.Init(a.DisabledPacketsReceived, b.DisabledPacketsReceived) m.malformedPacketsReceived.Init(a.MalformedPacketsReceived, b.MalformedPacketsReceived) m.requestsReceived.Init(a.RequestsReceived, b.RequestsReceived) m.requestsReceivedUnknownTargetAddress.Init(a.RequestsReceivedUnknownTargetAddress, b.RequestsReceivedUnknownTargetAddress) m.outgoingRequestInterfaceHasNoLocalAddressErrors.Init(a.OutgoingRequestInterfaceHasNoLocalAddressErrors, b.OutgoingRequestInterfaceHasNoLocalAddressErrors) m.outgoingRequestBadLocalAddressErrors.Init(a.OutgoingRequestBadLocalAddressErrors, b.OutgoingRequestBadLocalAddressErrors) m.outgoingRequestsDropped.Init(a.OutgoingRequestsDropped, b.OutgoingRequestsDropped) m.outgoingRequestsSent.Init(a.OutgoingRequestsSent, b.OutgoingRequestsSent) m.repliesReceived.Init(a.RepliesReceived, b.RepliesReceived) m.outgoingRepliesDropped.Init(a.OutgoingRepliesDropped, b.OutgoingRepliesDropped) m.outgoingRepliesSent.Init(a.OutgoingRepliesSent, b.OutgoingRepliesSent) } // LINT.ThenChange(../../tcpip.go:ARPStats)
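// A self-contained sketch (hypothetical counter types) of the aggregation
// pattern multiCounterARPStats implements above: Init ties an
// endpoint-local counter to its stack-wide twin so that a single Increment
// updates both, which is what tcpip.MultiCounterStat provides.
package statsketch

import "sync/atomic"

type counter struct{ v uint64 }

func (c *counter) Increment() { atomic.AddUint64(&c.v, 1) }

// multiCounter fans a single increment out to several counters.
type multiCounter struct{ cs []*counter }

// Init records the counters to update on each Increment.
func (m *multiCounter) Init(cs ...*counter) { m.cs = cs }

// Increment bumps every attached counter.
func (m *multiCounter) Increment() {
	for _, c := range m.cs {
		c.Increment()
	}
}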
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package icmp import ( "io" "time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) // +stateify savable type icmpPacket struct { icmpPacketEntry senderAddress tcpip.FullAddress data buffer.VectorisedView `state:".(buffer.VectorisedView)"` receivedAt time.Time `state:".(int64)"` } type endpointState int const ( stateInitial endpointState = iota stateBound stateConnected stateClosed ) // endpoint represents an ICMP endpoint. This struct serves as the interface // between users of the endpoint and the protocol implementation; it is legal to // have concurrent goroutines make calls into the endpoint; they are properly // synchronized. // // +stateify savable type endpoint struct { stack.TransportEndpointInfo tcpip.DefaultSocketOptionsHandler // The following fields are initialized at creation time and are // immutable. stack *stack.Stack `state:"manual"` waiterQueue *waiter.Queue uniqueID uint64 // The following fields are used to manage the receive queue, and are // protected by rcvMu. rcvMu sync.Mutex `state:"nosave"` rcvReady bool rcvList icmpPacketList rcvBufSize int rcvClosed bool // The following fields are protected by the mu mutex. mu sync.RWMutex `state:"nosave"` // shutdownFlags represent the current shutdown state of the endpoint. shutdownFlags tcpip.ShutdownFlags state endpointState route *stack.Route `state:"manual"` ttl uint8 stats tcpip.TransportEndpointStats `state:"nosave"` // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner // ops is used to get socket level options. ops tcpip.SocketOptions // frozen indicates if the packets should be delivered to the endpoint // during restore. frozen bool } func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { ep := &endpoint{ stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{ NetProto: netProto, TransProto: transProto, }, waiterQueue: waiterQueue, state: stateInitial, uniqueID: s.UniqueID(), } ep.ops.InitHandler(ep, ep.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) ep.ops.SetSendBufferSize(32*1024, false /* notify */) ep.ops.SetReceiveBufferSize(32*1024, false /* notify */) // Override with stack defaults. var ss tcpip.SendBufferSizeOption if err := s.Option(&ss); err == nil { ep.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) } var rs tcpip.ReceiveBufferSizeOption if err := s.Option(&rs); err == nil { ep.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) } return ep, nil } // UniqueID implements stack.TransportEndpoint.UniqueID.
func (e *endpoint) UniqueID() uint64 { return e.uniqueID } // Abort implements stack.TransportEndpoint.Abort. func (e *endpoint) Abort() { e.Close() } // Close puts the endpoint in a closed state and frees all resources // associated with it. func (e *endpoint) Close() { e.mu.Lock() e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite switch e.state { case stateBound, stateConnected: bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) e.stack.UnregisterTransportEndpoint([]tcpip.NetworkProtocolNumber{e.NetProto}, e.TransProto, e.ID, e, ports.Flags{}, bindToDevice) } // Close the receive list and drain it. e.rcvMu.Lock() e.rcvClosed = true e.rcvBufSize = 0 for !e.rcvList.Empty() { p := e.rcvList.Front() e.rcvList.Remove(p) } e.rcvMu.Unlock() if e.route != nil { e.route.Release() e.route = nil } // Update the state. e.state = stateClosed e.mu.Unlock() e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. func (*endpoint) ModerateRecvBuf(int) {} // SetOwner implements tcpip.Endpoint.SetOwner. func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } // Read implements tcpip.Endpoint.Read. func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { e.rcvMu.Lock() if e.rcvList.Empty() { var err tcpip.Error = &tcpip.ErrWouldBlock{} if e.rcvClosed { e.stats.ReadErrors.ReadClosed.Increment() err = &tcpip.ErrClosedForReceive{} } e.rcvMu.Unlock() return tcpip.ReadResult{}, err } p := e.rcvList.Front() if !opts.Peek { e.rcvList.Remove(p) e.rcvBufSize -= p.data.Size() } e.rcvMu.Unlock() res := tcpip.ReadResult{ Total: p.data.Size(), ControlMessages: tcpip.ControlMessages{ HasTimestamp: true, Timestamp: p.receivedAt.UnixNano(), }, } if opts.NeedRemoteAddr { res.RemoteAddr = p.senderAddress } n, err := p.data.ReadTo(dst, opts.Peek) if n == 0 && err != nil { return res, &tcpip.ErrBadBuffer{} } res.Count = n return res, nil } // prepareForWrite prepares the endpoint for sending data. In particular, it // binds it if it's still in the initial state. To do so, it must first // reacquire the mutex in exclusive mode. // // Returns true for retry if preparation should be retried. // +checklocks:e.mu func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip.Error) { switch e.state { case stateInitial: case stateConnected: return false, nil case stateBound: if to == nil { return false, &tcpip.ErrDestinationRequired{} } return false, nil default: return false, &tcpip.ErrInvalidEndpointState{} } e.mu.RUnlock() e.mu.Lock() defer e.mu.DowngradeLock() // The state changed when we released the shared lock and re-acquired // it in exclusive mode. Try again. if e.state != stateInitial { return true, nil } // The state is still 'initial', so try to bind the endpoint. if err := e.bindLocked(tcpip.FullAddress{}); err != nil { return false, err } return true, nil } // Write writes data to the endpoint's peer. This method does not block // if the data cannot be written.
func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { n, err := e.write(p, opts) switch err.(type) { case nil: e.stats.PacketsSent.Increment() case *tcpip.ErrMessageTooLong, *tcpip.ErrInvalidOptionValue: e.stats.WriteErrors.InvalidArgs.Increment() case *tcpip.ErrClosedForSend: e.stats.WriteErrors.WriteClosed.Increment() case *tcpip.ErrInvalidEndpointState: e.stats.WriteErrors.InvalidEndpointState.Increment() case *tcpip.ErrNoRoute, *tcpip.ErrBroadcastDisabled, *tcpip.ErrNetworkUnreachable: // Errors indicating any problem with IP routing of the packet. e.stats.SendErrors.NoRoute.Increment() default: // For all other errors when writing to the network layer. e.stats.SendErrors.SendToNetworkFailed.Increment() } return n, err } func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { // MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.) if opts.More { return 0, &tcpip.ErrInvalidOptionValue{} } to := opts.To e.mu.RLock() defer e.mu.RUnlock() // If we've shutdown with SHUT_WR we are in an invalid state for sending. if e.shutdownFlags&tcpip.ShutdownWrite != 0 { return 0, &tcpip.ErrClosedForSend{} } // Prepare for write. for { retry, err := e.prepareForWrite(to) if err != nil { return 0, err } if !retry { break } } route := e.route if to != nil { // Reject destination address if it goes through a different // NIC than the endpoint was bound to. nicID := to.NIC if nicID == 0 { nicID = tcpip.NICID(e.ops.GetBindToDevice()) } if e.BindNICID != 0 { if nicID != 0 && nicID != e.BindNICID { return 0, &tcpip.ErrNoRoute{} } nicID = e.BindNICID } dst, netProto, err := e.checkV4MappedLocked(*to) if err != nil { return 0, err } // Find the endpoint. r, err := e.stack.FindRoute(nicID, e.BindAddr, dst.Addr, netProto, false /* multicastLoop */) if err != nil { return 0, err } defer r.Release() route = r } v := make([]byte, p.Len()) if _, err := io.ReadFull(p, v); err != nil { return 0, &tcpip.ErrBadBuffer{} } var err tcpip.Error switch e.NetProto { case header.IPv4ProtocolNumber: err = send4(route, e.ID.LocalPort, v, e.ttl, e.owner) case header.IPv6ProtocolNumber: err = send6(route, e.ID.LocalPort, v, e.ttl) } if err != nil { return 0, err } return int64(len(v)), nil } var _ tcpip.SocketOptionsHandler = (*endpoint)(nil) // HasNIC implements tcpip.SocketOptionsHandler. func (e *endpoint) HasNIC(id int32) bool { return e.stack.HasNIC(tcpip.NICID(id)) } // SetSockOpt sets a socket option. func (*endpoint) SetSockOpt(tcpip.SettableSocketOption) tcpip.Error { return nil } // SetSockOptInt sets a socket option. Currently not supported. func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { switch opt { case tcpip.TTLOption: e.mu.Lock() e.ttl = uint8(v) e.mu.Unlock() } return nil } // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.ReceiveQueueSizeOption: v := 0 e.rcvMu.Lock() if !e.rcvList.Empty() { p := e.rcvList.Front() v = p.data.Size() } e.rcvMu.Unlock() return v, nil case tcpip.TTLOption: e.rcvMu.Lock() v := int(e.ttl) e.rcvMu.Unlock() return v, nil default: return -1, &tcpip.ErrUnknownProtocolOption{} } } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 
func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpip.PacketOwner) tcpip.Error { if len(data) < header.ICMPv4MinimumSize { return &tcpip.ErrInvalidEndpointState{} } pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: header.ICMPv4MinimumSize + int(r.MaxHeaderLength()), }) pkt.Owner = owner icmpv4 := header.ICMPv4(pkt.TransportHeader().Push(header.ICMPv4MinimumSize)) pkt.TransportProtocolNumber = header.ICMPv4ProtocolNumber copy(icmpv4, data) // Set the ident to the user-specified port. Sequence number should // already be set by the user. icmpv4.SetIdent(ident) data = data[header.ICMPv4MinimumSize:] // Linux performs these basic checks. if icmpv4.Type() != header.ICMPv4Echo || icmpv4.Code() != 0 { return &tcpip.ErrInvalidEndpointState{} } // Because this icmp endpoint is implemented in the transport layer, we can // only increment the 'stack-wide' stats but we can't increment the // 'per-NetworkEndpoint' stats. sentStat := r.Stats().ICMP.V4.PacketsSent.EchoRequest icmpv4.SetChecksum(0) icmpv4.SetChecksum(^header.Checksum(icmpv4, header.Checksum(data, 0))) pkt.Data().AppendView(data) if ttl == 0 { ttl = r.DefaultTTL() } if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, pkt); err != nil { r.Stats().ICMP.V4.PacketsSent.Dropped.Increment() return err } sentStat.Increment() return nil } func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) tcpip.Error { if len(data) < header.ICMPv6EchoMinimumSize { return &tcpip.ErrInvalidEndpointState{} } pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: header.ICMPv6MinimumSize + int(r.MaxHeaderLength()), }) icmpv6 := header.ICMPv6(pkt.TransportHeader().Push(header.ICMPv6MinimumSize)) pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber copy(icmpv6, data) // Set the ident. Sequence number is provided by the user. icmpv6.SetIdent(ident) data = data[header.ICMPv6MinimumSize:] if icmpv6.Type() != header.ICMPv6EchoRequest || icmpv6.Code() != 0 { return &tcpip.ErrInvalidEndpointState{} } // Because this icmp endpoint is implemented in the transport layer, we can // only increment the 'stack-wide' stats but we can't increment the // 'per-NetworkEndpoint' stats. sentStat := r.Stats().ICMP.V6.PacketsSent.EchoRequest pkt.Data().AppendView(data) dataRange := pkt.Data().AsRange() icmpv6.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: icmpv6, Src: r.LocalAddress(), Dst: r.RemoteAddress(), PayloadCsum: dataRange.Checksum(), PayloadLen: dataRange.Size(), })) if ttl == 0 { ttl = r.DefaultTTL() } if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, pkt); err != nil { r.Stats().ICMP.V6.PacketsSent.Dropped.Increment() } sentStat.Increment() return nil } // checkV4MappedLocked determines the effective network protocol and converts // addr to its canonical form. func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, false /* v6only */) if err != nil { return tcpip.FullAddress{}, 0, err } return unwrapped, netProto, nil } // Disconnect implements tcpip.Endpoint.Disconnect. 
func (*endpoint) Disconnect() tcpip.Error { return &tcpip.ErrNotSupported{} } // Connect connects the endpoint to its peer. Specifying a NIC is optional. func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() nicID := addr.NIC localPort := uint16(0) switch e.state { case stateInitial: case stateBound, stateConnected: localPort = e.ID.LocalPort if e.BindNICID == 0 { break } if nicID != 0 && nicID != e.BindNICID { return &tcpip.ErrInvalidEndpointState{} } nicID = e.BindNICID default: return &tcpip.ErrInvalidEndpointState{} } addr, netProto, err := e.checkV4MappedLocked(addr) if err != nil { return err } // Find a route to the desired destination. r, err := e.stack.FindRoute(nicID, e.BindAddr, addr.Addr, netProto, false /* multicastLoop */) if err != nil { return err } id := stack.TransportEndpointID{ LocalAddress: r.LocalAddress(), LocalPort: localPort, RemoteAddress: r.RemoteAddress(), } // Even if we're connected, this endpoint can still be used to send // packets on a different network protocol, so we register both even if // v6only is set to false and this is an ipv6 endpoint. netProtos := []tcpip.NetworkProtocolNumber{netProto} id, err = e.registerWithStack(nicID, netProtos, id) if err != nil { r.Release() return err } e.ID = id e.route = r e.RegisterNICID = nicID e.state = stateConnected e.rcvMu.Lock() e.rcvReady = true e.rcvMu.Unlock() return nil } // ConnectEndpoint is not supported. func (*endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error { return &tcpip.ErrInvalidEndpointState{} } // Shutdown closes the read and/or write end of the endpoint connection // to its peer. func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() e.shutdownFlags |= flags if e.state != stateConnected { return &tcpip.ErrNotConnected{} } if flags&tcpip.ShutdownRead != 0 { e.rcvMu.Lock() wasClosed := e.rcvClosed e.rcvClosed = true e.rcvMu.Unlock() if !wasClosed { e.waiterQueue.Notify(waiter.ReadableEvents) } } return nil } // Listen is not supported by UDP, it just fails. func (*endpoint) Listen(int) tcpip.Error { return &tcpip.ErrNotSupported{} } // Accept is not supported by UDP, it just fails. func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { return nil, nil, &tcpip.ErrNotSupported{} } func (e *endpoint) registerWithStack(_ tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.Error) { bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) if id.LocalPort != 0 { // The endpoint already has a local port, just attempt to // register it. err := e.stack.RegisterTransportEndpoint(netProtos, e.TransProto, id, e, ports.Flags{}, bindToDevice) return id, err } // We need to find a port for the endpoint. _, err := e.stack.PickEphemeralPort(e.stack.Rand(), func(p uint16) (bool, tcpip.Error) { id.LocalPort = p err := e.stack.RegisterTransportEndpoint(netProtos, e.TransProto, id, e, ports.Flags{}, bindToDevice) switch err.(type) { case nil: return true, nil case *tcpip.ErrPortInUse: return false, nil default: return false, err } }) return id, err } func (e *endpoint) bindLocked(addr tcpip.FullAddress) tcpip.Error { // Don't allow binding once endpoint is not in the initial state // anymore. 
if e.state != stateInitial { return &tcpip.ErrInvalidEndpointState{} } addr, netProto, err := e.checkV4MappedLocked(addr) if err != nil { return err } // Expand netProtos to include v4 and v6 if the caller is binding to a // wildcard (empty) address, and this is an IPv6 endpoint with v6only // set to false. netProtos := []tcpip.NetworkProtocolNumber{netProto} if len(addr.Addr) != 0 { // A local address was specified, verify that it's valid. if e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) == 0 { return &tcpip.ErrBadLocalAddress{} } } id := stack.TransportEndpointID{ LocalPort: addr.Port, LocalAddress: addr.Addr, } id, err = e.registerWithStack(addr.NIC, netProtos, id) if err != nil { return err } e.ID = id e.RegisterNICID = addr.NIC // Mark endpoint as bound. e.state = stateBound e.rcvMu.Lock() e.rcvReady = true e.rcvMu.Unlock() return nil } // Bind binds the endpoint to a specific local address and port. // Specifying a NIC is optional. func (e *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() err := e.bindLocked(addr) if err != nil { return err } e.BindNICID = addr.NIC e.BindAddr = addr.Addr return nil } // GetLocalAddress returns the address to which the endpoint is bound. func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() return tcpip.FullAddress{ NIC: e.RegisterNICID, Addr: e.ID.LocalAddress, Port: e.ID.LocalPort, }, nil } // GetRemoteAddress returns the address to which the endpoint is connected. func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() if e.state != stateConnected { return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} } return tcpip.FullAddress{ NIC: e.RegisterNICID, Addr: e.ID.RemoteAddress, Port: e.ID.RemotePort, }, nil } // Readiness returns the current readiness of the endpoint. For example, if // waiter.EventIn is set, the endpoint is immediately readable. func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // The endpoint is always writable. result := waiter.WritableEvents & mask // Determine if the endpoint is readable if requested. if (mask & waiter.ReadableEvents) != 0 { e.rcvMu.Lock() if !e.rcvList.Empty() || e.rcvClosed { result |= waiter.ReadableEvents } e.rcvMu.Unlock() } return result } // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) { // Only accept echo replies. switch e.NetProto { case header.IPv4ProtocolNumber: h := header.ICMPv4(pkt.TransportHeader().View()) if len(h) < header.ICMPv4MinimumSize || h.Type() != header.ICMPv4EchoReply { e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.MalformedPacketsReceived.Increment() return } case header.IPv6ProtocolNumber: h := header.ICMPv6(pkt.TransportHeader().View()) if len(h) < header.ICMPv6MinimumSize || h.Type() != header.ICMPv6EchoReply { e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.MalformedPacketsReceived.Increment() return } } e.rcvMu.Lock() // Drop the packet if our buffer is currently full. 
	if !e.rcvReady || e.rcvClosed {
		e.rcvMu.Unlock()
		e.stack.Stats().DroppedPackets.Increment()
		e.stats.ReceiveErrors.ClosedReceiver.Increment()
		return
	}

	rcvBufSize := e.ops.GetReceiveBufferSize()
	if e.frozen || e.rcvBufSize >= int(rcvBufSize) {
		e.rcvMu.Unlock()
		e.stack.Stats().DroppedPackets.Increment()
		e.stats.ReceiveErrors.ReceiveBufferOverflow.Increment()
		return
	}

	wasEmpty := e.rcvBufSize == 0

	// Push new packet into receive list and increment the buffer size.
	packet := &icmpPacket{
		senderAddress: tcpip.FullAddress{
			NIC:  pkt.NICID,
			Addr: id.RemoteAddress,
		},
	}

	// ICMP socket's data includes ICMP header.
	packet.data = pkt.TransportHeader().View().ToVectorisedView()
	packet.data.Append(pkt.Data().ExtractVV())

	e.rcvList.PushBack(packet)
	e.rcvBufSize += packet.data.Size()

	packet.receivedAt = e.stack.Clock().Now()

	e.rcvMu.Unlock()
	e.stats.PacketsReceived.Increment()

	// Notify any waiters that there's data to be read now.
	if wasEmpty {
		e.waiterQueue.Notify(waiter.ReadableEvents)
	}
}

// HandleError implements stack.TransportEndpoint.
func (*endpoint) HandleError(stack.TransportError, *stack.PacketBuffer) {}

// State implements tcpip.Endpoint.State. The ICMP endpoint currently doesn't
// expose internal socket state.
func (e *endpoint) State() uint32 {
	return 0
}

// Info returns a copy of the endpoint info.
func (e *endpoint) Info() tcpip.EndpointInfo {
	e.mu.RLock()
	// Make a copy of the endpoint info.
	ret := e.TransportEndpointInfo
	e.mu.RUnlock()
	return &ret
}

// Stats returns a pointer to the endpoint stats.
func (e *endpoint) Stats() tcpip.EndpointStats {
	return &e.stats
}

// Wait implements stack.TransportEndpoint.Wait.
func (*endpoint) Wait() {}

// LastError implements tcpip.Endpoint.LastError.
func (*endpoint) LastError() tcpip.Error {
	return nil
}

// SocketOptions implements tcpip.Endpoint.SocketOptions.
func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
	return &e.ops
}

// freeze prevents any more packets from being delivered to the endpoint.
func (e *endpoint) freeze() {
	e.mu.Lock()
	e.frozen = true
	e.mu.Unlock()
}

// thaw unfreezes an endpoint previously frozen with endpoint.freeze, allowing
// new packets to be delivered again.
func (e *endpoint) thaw() {
	e.mu.Lock()
	e.frozen = false
	e.mu.Unlock()
}
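// The function below is an editorial illustration, not part of the original
// gVisor file: a minimal, self-contained sketch of the RFC 1071
// one's-complement checksum that send4 obtains via header.Checksum. The name
// internetChecksum is ours; gVisor's real implementation lives in
// pkg/tcpip/header. A receiver can validate a packet by checksumming it with
// the checksum field included: a valid packet yields 0.
func internetChecksum(b []byte) uint16 {
	var sum uint32
	// Sum the buffer as big-endian 16-bit words.
	for ; len(b) >= 2; b = b[2:] {
		sum += uint32(b[0])<<8 | uint32(b[1])
	}
	// An odd trailing byte is treated as if padded with a zero byte.
	if len(b) == 1 {
		sum += uint32(b[0]) << 8
	}
	// Fold carries back into the low 16 bits until none remain.
	for sum>>16 != 0 {
		sum = (sum & 0xffff) + (sum >> 16)
	}
	// The checksum is the one's complement of the folded sum.
	return ^uint16(sum)
}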
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package proc

import (
	"bytes"
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// taskInode represents the inode for /proc/PID/ directory.
// // +stateify savable type taskInode struct { implStatFS kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink kernfs.InodeTemporary kernfs.OrderedChildren taskInodeRefs locks vfs.FileLocks task *kernel.Task } var _ kernfs.Inode = (*taskInode)(nil) func (fs *filesystem) newTaskInode(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, fakeCgroupControllers map[string]string) (kernfs.Inode, error) { if task.ExitState() == kernel.TaskExitDead { return nil, linuxerr.ESRCH } contents := map[string]kernfs.Inode{ "auxv": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &auxvData{task: task}), "cmdline": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}), "comm": fs.newComm(ctx, task, fs.NextIno(), 0444), "cwd": fs.newCwdSymlink(ctx, task, fs.NextIno()), "environ": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}), "exe": fs.newExeSymlink(ctx, task, fs.NextIno()), "fd": fs.newFDDirInode(ctx, task), "fdinfo": fs.newFDInfoDirInode(ctx, task), "gid_map": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}), "io": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0400, newIO(task, isThreadGroup)), "maps": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mapsData{task: task}), "mem": fs.newMemInode(ctx, task, fs.NextIno(), 0400), "mountinfo": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountInfoData{fs: fs, task: task}), "mounts": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &mountsData{fs: fs, task: task}), "net": fs.newTaskNetDir(ctx, task), "ns": fs.newTaskOwnedDir(ctx, task, fs.NextIno(), 0511, map[string]kernfs.Inode{ "net": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), "net"), "pid": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), "pid"), "user": fs.newNamespaceSymlink(ctx, task, fs.NextIno(), "user"), }), "oom_score": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, newStaticFile("0\n")), "oom_score_adj": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &oomScoreAdj{task: task}), "smaps": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &smapsData{task: task}), "stat": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}), "statm": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &statmData{task: task}), "status": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}), "uid_map": fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}), } if isThreadGroup { contents["task"] = fs.newSubtasks(ctx, task, pidns, fakeCgroupControllers) } if len(fakeCgroupControllers) > 0 { contents["cgroup"] = fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, newFakeCgroupData(fakeCgroupControllers)) } else { contents["cgroup"] = fs.newTaskOwnedInode(ctx, task, fs.NextIno(), 0444, &taskCgroupData{task: task}) } taskInode := &taskInode{task: task} // Note: credentials are overridden by taskOwnedInode. taskInode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) taskInode.InitRefs() inode := &taskOwnedInode{Inode: taskInode, owner: task} taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) links := taskInode.OrderedChildren.Populate(contents) taskInode.IncLinks(links) return inode, nil } // Valid implements kernfs.Inode.Valid. This inode remains valid as long // as the task is still running. 
// When it's dead, another task with the same PID could replace it.
func (i *taskInode) Valid(ctx context.Context) bool {
	return i.task.ExitState() != kernel.TaskExitDead
}

// Open implements kernfs.Inode.Open.
func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
		SeekEnd: kernfs.SeekEndZero,
	})
	if err != nil {
		return nil, err
	}
	return fd.VFSFileDescription(), nil
}

// SetStat implements kernfs.Inode.SetStat, disallowing changes to the inode's
// attributes.
func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
	return linuxerr.EPERM
}

// DecRef implements kernfs.Inode.DecRef.
func (i *taskInode) DecRef(ctx context.Context) {
	i.taskInodeRefs.DecRef(func() { i.Destroy(ctx) })
}

// taskOwnedInode implements kernfs.Inode and overrides inode owner with task
// effective user and group.
//
// +stateify savable
type taskOwnedInode struct {
	kernfs.Inode

	// owner is the task that owns this inode.
	owner *kernel.Task
}

var _ kernfs.Inode = (*taskOwnedInode)(nil)

func (fs *filesystem) newTaskOwnedInode(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) kernfs.Inode {
	// Note: credentials are overridden by taskOwnedInode.
	inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
	return &taskOwnedInode{Inode: inode, owner: task}
}

func (fs *filesystem) newTaskOwnedDir(ctx context.Context, task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]kernfs.Inode) kernfs.Inode {
	// Note: credentials are overridden by taskOwnedInode.
	fdOpts := kernfs.GenericDirectoryFDOptions{SeekEnd: kernfs.SeekEndZero}
	dir := kernfs.NewStaticDir(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, children, fdOpts)
	return &taskOwnedInode{Inode: dir, owner: task}
}

func (i *taskOwnedInode) Valid(ctx context.Context) bool {
	return i.owner.ExitState() != kernel.TaskExitDead && i.Inode.Valid(ctx)
}

// Stat implements kernfs.Inode.Stat.
func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
	stat, err := i.Inode.Stat(ctx, fs, opts)
	if err != nil {
		return linux.Statx{}, err
	}
	if opts.Mask&(linux.STATX_UID|linux.STATX_GID) != 0 {
		uid, gid := i.getOwner(linux.FileMode(stat.Mode))
		if opts.Mask&linux.STATX_UID != 0 {
			stat.UID = uint32(uid)
		}
		if opts.Mask&linux.STATX_GID != 0 {
			stat.GID = uint32(gid)
		}
	}
	return stat, nil
}

// CheckPermissions implements kernfs.Inode.CheckPermissions.
func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
	mode := i.Mode()
	uid, gid := i.getOwner(mode)
	return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid)
}

func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) {
	// By default, set the task owner as the file owner.
	creds := i.owner.Credentials()
	uid := creds.EffectiveKUID
	gid := creds.EffectiveKGID

	// Linux doesn't apply dumpability adjustments to world readable/executable
	// directories so that applications can stat /proc/PID to determine the
	// effective UID of a process. See fs/proc/base.c:task_dump_owner.
	if mode.FileType() == linux.ModeDirectory && mode.Permissions() == 0555 {
		return uid, gid
	}

	// If the task is not dumpable, then root (in the namespace preferred)
	// owns the file.
m := getMM(i.owner) if m == nil { return auth.RootKUID, auth.RootKGID } if m.Dumpability() != mm.UserDumpable { uid = auth.RootKUID if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() { uid = kuid } gid = auth.RootKGID if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() { gid = kgid } } return uid, gid } func newIO(t *kernel.Task, isThreadGroup bool) *ioData { if isThreadGroup { return &ioData{ioUsage: t.ThreadGroup()} } return &ioData{ioUsage: t} } // newFakeCgroupData creates an inode that shows fake cgroup // information passed in as mount options. From man 7 cgroups: "For // each cgroup hierarchy of which the process is a member, there is // one entry containing three colon-separated fields: // hierarchy-ID:controller-list:cgroup-path" // // TODO(b/182488796): Remove once all users adopt cgroupfs. func newFakeCgroupData(controllers map[string]string) dynamicInode { var buf bytes.Buffer // The hierarchy ids must be positive integers (for cgroup v1), but the // exact number does not matter, so long as they are unique. We can // just use a counter, but since linux sorts this file in descending // order, we must count down to preserve this behavior. i := len(controllers) for name, dir := range controllers { fmt.Fprintf(&buf, "%d:%s:%s\n", i, name, dir) i-- } return newStaticFile(buf.String()) }
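// Editorial illustration, not part of the original file: with fake
// controllers {"cpu": "/docker/abc", "memory": "/docker/abc"},
// newFakeCgroupData renders a file such as
//
//	2:cpu:/docker/abc
//	1:memory:/docker/abc
//
// Go map iteration order is unspecified, so which controller receives which
// hierarchy ID varies across runs; only the descending order of the IDs in
// the output lines is guaranteed by the counter above.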
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
	"gvisor.dev/gvisor/pkg/syserror"
)

// FileLocks supports POSIX and BSD style locks, which correspond to fcntl(2)
// and flock(2) respectively in Linux. It can be embedded into various file
// implementations for VFS2 that support locking.
//
// Note that in Linux these two types of locks are _not_ cooperative, because
// race and deadlock conditions make merging them prohibitive. We do the same
// and keep them oblivious to each other.
//
// +stateify savable
type FileLocks struct {
	// bsd is a set of BSD-style advisory file wide locks, see flock(2).
	bsd fslock.Locks

	// posix is a set of POSIX-style regional advisory locks, see fcntl(2).
	posix fslock.Locks
}

// LockBSD tries to acquire a BSD-style lock on the entire file.
func (fl *FileLocks) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerID int32, t fslock.LockType, block fslock.Blocker) error {
	if fl.bsd.LockRegion(uid, ownerID, t, fslock.LockRange{0, fslock.LockEOF}, block) {
		return nil
	}

	// Return an appropriate error for the unsuccessful lock attempt, depending
	// on whether this is a blocking or non-blocking operation.
	if block == nil {
		return syserror.ErrWouldBlock
	}
	return syserror.ERESTARTSYS
}

// UnlockBSD releases a BSD-style lock on the entire file.
//
// This operation is always successful, even if there did not exist a lock on
// the requested region held by uid in the first place.
func (fl *FileLocks) UnlockBSD(uid fslock.UniqueID) {
	fl.bsd.UnlockRegion(uid, fslock.LockRange{0, fslock.LockEOF})
}

// LockPOSIX tries to acquire a POSIX-style lock on a file region.
func (fl *FileLocks) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error {
	if fl.posix.LockRegion(uid, ownerPID, t, r, block) {
		return nil
	}

	// Return an appropriate error for the unsuccessful lock attempt, depending
	// on whether this is a blocking or non-blocking operation.
	if block == nil {
		return syserror.ErrWouldBlock
	}
	return syserror.ERESTARTSYS
}

// UnlockPOSIX releases a POSIX-style lock on a file region.
//
// This operation is always successful, even if there did not exist a lock on
// the requested region held by uid in the first place.
func (fl *FileLocks) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error {
	fl.posix.UnlockRegion(uid, r)
	return nil
}

// TestPOSIX returns information about whether the specified lock can be held,
// in the style of the F_GETLK fcntl.
func (fl *FileLocks) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) { return fl.posix.TestRegion(ctx, uid, t, r), nil }
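// locksConflict is an editorial illustration, not part of the original file:
// a self-contained sketch of the conflict rule that the regional advisory
// locks above rely on inside fslock.Locks. Two locks conflict only when
// their byte ranges overlap and at least one of them is a write (exclusive)
// lock. Ranges here are half-open [start, end), analogous to fslock.LockRange
// with LockEOF marking an unbounded end.
func locksConflict(aWrite bool, aStart, aEnd uint64, bWrite bool, bStart, bEnd uint64) bool {
	// Half-open intervals overlap iff each starts before the other ends.
	overlaps := aStart < bEnd && bStart < aEnd
	// Two read (shared) locks never conflict with each other.
	return overlaps && (aWrite || bWrite)
}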
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package devpts

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
)

// Terminal is a pseudoterminal.
//
// +stateify savable
type Terminal struct {
	// n is the terminal index. It is immutable.
	n uint32

	// ld is the line discipline of the terminal. It is immutable.
	ld *lineDiscipline

	// masterKTTY contains the controlling process of the master end of
	// this terminal. This field is immutable.
	masterKTTY *kernel.TTY

	// replicaKTTY contains the controlling process of the replica end of this
	// terminal. This field is immutable.
	replicaKTTY *kernel.TTY
}

func newTerminal(n uint32) *Terminal {
	termios := linux.DefaultReplicaTermios
	t := Terminal{
		n:           n,
		ld:          newLineDiscipline(termios),
		masterKTTY:  &kernel.TTY{Index: n},
		replicaKTTY: &kernel.TTY{Index: n},
	}
	return &t
}

// setControllingTTY makes tm the controlling terminal of the calling thread
// group.
func (tm *Terminal) setControllingTTY(ctx context.Context, steal bool, isMaster, isReadable bool) error {
	task := kernel.TaskFromContext(ctx)
	if task == nil {
		panic("setControllingTTY must be called from a task context")
	}

	return task.ThreadGroup().SetControllingTTY(tm.tty(isMaster), steal, isReadable)
}

// releaseControllingTTY removes tm as the controlling terminal of the calling
// thread group.
func (tm *Terminal) releaseControllingTTY(ctx context.Context, isMaster bool) error {
	task := kernel.TaskFromContext(ctx)
	if task == nil {
		panic("releaseControllingTTY must be called from a task context")
	}

	return task.ThreadGroup().ReleaseControllingTTY(tm.tty(isMaster))
}

// foregroundProcessGroup gets the process group ID of tm's foreground process.
func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
	task := kernel.TaskFromContext(ctx)
	if task == nil {
		panic("foregroundProcessGroup must be called from a task context")
	}

	ret, err := task.ThreadGroup().ForegroundProcessGroup(tm.tty(isMaster))
	if err != nil {
		return 0, err
	}

	// Write it out to *arg.
	retP := primitive.Int32(ret)
	_, err = retP.CopyOut(task, args[2].Pointer())
	return 0, err
}

// setForegroundProcessGroup sets tm's foreground process group.
func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
	task := kernel.TaskFromContext(ctx)
	if task == nil {
		panic("setForegroundProcessGroup must be called from a task context")
	}

	// Read in the process group ID.
var pgid primitive.Int32 if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil { return 0, err } ret, err := task.ThreadGroup().SetForegroundProcessGroup(tm.tty(isMaster), kernel.ProcessGroupID(pgid)) return uintptr(ret), err } func (tm *Terminal) tty(isMaster bool) *kernel.TTY { if isMaster { return tm.masterKTTY } return tm.replicaKTTY }
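// Editorial illustration, not part of the original file: the two accessors
// above back the TIOCGPGRP and TIOCSPGRP ioctls. A dispatch site in a file
// description's Ioctl method might look like the sketch below; the
// surrounding switch and variable names are assumptions, only the request
// constants come from the Linux ABI.
//
//	switch cmd {
//	case linux.TIOCGPGRP:
//		// Copy the foreground process group ID out to args[2].
//		return tm.foregroundProcessGroup(ctx, args, isMaster)
//	case linux.TIOCSPGRP:
//		// Read the new foreground process group ID from args[2].
//		return tm.setForegroundProcessGroup(ctx, args, isMaster)
//	}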
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

// This file defines the behavior of task signal handling.

import (
	"fmt"
	"sync/atomic"
	"time"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/eventchannel"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	ucspb "gvisor.dev/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/waiter"
)

// SignalAction is an internal signal action.
type SignalAction int

// Available signal actions.
// Note that although we refer to the complete set internally,
// the application is only capable of using the Default and
// Ignore actions from the system call interface.
const (
	SignalActionTerm SignalAction = iota
	SignalActionCore
	SignalActionStop
	SignalActionIgnore
	SignalActionHandler
)

// Default signal handler actions. Note that for most signals
// (except SIGKILL and SIGSTOP) these can be overridden by the app.
var defaultActions = map[linux.Signal]SignalAction{
	// POSIX.1-1990 standard.
	linux.SIGHUP:  SignalActionTerm,
	linux.SIGINT:  SignalActionTerm,
	linux.SIGQUIT: SignalActionCore,
	linux.SIGILL:  SignalActionCore,
	linux.SIGABRT: SignalActionCore,
	linux.SIGFPE:  SignalActionCore,
	linux.SIGKILL: SignalActionTerm, // but see ThreadGroup.applySignalSideEffects
	linux.SIGSEGV: SignalActionCore,
	linux.SIGPIPE: SignalActionTerm,
	linux.SIGALRM: SignalActionTerm,
	linux.SIGTERM: SignalActionTerm,
	linux.SIGUSR1: SignalActionTerm,
	linux.SIGUSR2: SignalActionTerm,
	linux.SIGCHLD: SignalActionIgnore,
	linux.SIGCONT: SignalActionIgnore, // but see ThreadGroup.applySignalSideEffects
	linux.SIGSTOP: SignalActionStop,
	linux.SIGTSTP: SignalActionStop,
	linux.SIGTTIN: SignalActionStop,
	linux.SIGTTOU: SignalActionStop,

	// POSIX.1-2001 standard.
	linux.SIGBUS:    SignalActionCore,
	linux.SIGPROF:   SignalActionTerm,
	linux.SIGSYS:    SignalActionCore,
	linux.SIGTRAP:   SignalActionCore,
	linux.SIGURG:    SignalActionIgnore,
	linux.SIGVTALRM: SignalActionTerm,
	linux.SIGXCPU:   SignalActionCore,
	linux.SIGXFSZ:   SignalActionCore,

	// The rest on linux.
	linux.SIGSTKFLT: SignalActionTerm,
	linux.SIGIO:     SignalActionTerm,
	linux.SIGPWR:    SignalActionTerm,
	linux.SIGWINCH:  SignalActionIgnore,
}

// computeAction figures out what to do given a signal number
// and a linux.SigAction. SIGSTOP always results in a SignalActionStop,
// and SIGKILL always results in a SignalActionTerm.
// Signal 0 is always ignored as many programs use it for various internal
// functions and don't expect it to do anything.
//
// In the event the signal is not one of these, act.Handler determines what
// happens next.
// If act.Handler is:
// 0, the default action is taken;
// 1, the signal is ignored;
// anything else, the function returns SignalActionHandler.
func computeAction(sig linux.Signal, act linux.SigAction) SignalAction {
	switch sig {
	case linux.SIGSTOP:
		return SignalActionStop
	case linux.SIGKILL:
		return SignalActionTerm
	case linux.Signal(0):
		return SignalActionIgnore
	}

	switch act.Handler {
	case linux.SIG_DFL:
		return defaultActions[sig]
	case linux.SIG_IGN:
		return SignalActionIgnore
	default:
		return SignalActionHandler
	}
}

// UnblockableSignals contains the set of signals which cannot be blocked.
var UnblockableSignals = linux.MakeSignalSet(linux.SIGKILL, linux.SIGSTOP)

// StopSignals is the set of signals whose default action is SignalActionStop.
var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTIN, linux.SIGTTOU)

// dequeueSignalLocked returns a pending signal that is *not* included in mask.
// If there are no pending unmasked signals, dequeueSignalLocked returns nil.
//
// Preconditions: t.tg.signalHandlers.mu must be locked.
func (t *Task) dequeueSignalLocked(mask linux.SignalSet) *linux.SignalInfo {
	if info := t.pendingSignals.dequeue(mask); info != nil {
		return info
	}
	return t.tg.pendingSignals.dequeue(mask)
}

// discardSpecificLocked removes all instances of the given signal from all
// signal queues in tg.
//
// Preconditions: The signal mutex must be locked.
func (tg *ThreadGroup) discardSpecificLocked(sig linux.Signal) {
	tg.pendingSignals.discardSpecific(sig)
	for t := tg.tasks.Front(); t != nil; t = t.Next() {
		t.pendingSignals.discardSpecific(sig)
	}
}

// PendingSignals returns the set of pending signals.
func (t *Task) PendingSignals() linux.SignalSet {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	return t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet
}

// deliverSignal delivers the given signal and returns the following run state.
func (t *Task) deliverSignal(info *linux.SignalInfo, act linux.SigAction) taskRunState {
	sig := linux.Signal(info.Signo)
	sigact := computeAction(sig, act)

	if t.haveSyscallReturn {
		if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
			// Signals that are ignored, cause a thread group stop, or
			// terminate the thread group do not interact with interrupted
			// syscalls; in Linux terms, they are never returned to the signal
			// handling path from get_signal => get_signal_to_deliver. The
			// behavior of an interrupted syscall is determined by the first
			// signal that is actually handled (by userspace).
			if sigact == SignalActionHandler {
				switch {
				case sre == syserror.ERESTARTNOHAND:
					fallthrough
				case sre == syserror.ERESTART_RESTARTBLOCK:
					fallthrough
				case (sre == syserror.ERESTARTSYS && act.Flags&linux.SA_RESTART == 0):
					t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
					t.Arch().SetReturn(uintptr(-ExtractErrno(syserror.EINTR, -1)))
				default:
					t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
					t.Arch().RestartSyscall()
				}
			}
		}
	}

	switch sigact {
	case SignalActionTerm, SignalActionCore:
		// "Default action is to terminate the process." - signal(7)
		t.Debugf("Signal %d: terminating thread group", info.Signo)

		// Emit an event channel message related to this uncaught signal.
		ucs := &ucspb.UncaughtSignal{
			Tid:          int32(t.Kernel().TaskSet().Root.IDOfTask(t)),
			Pid:          int32(t.Kernel().TaskSet().Root.IDOfThreadGroup(t.ThreadGroup())),
			Registers:    t.Arch().StateData().Proto(),
			SignalNumber: info.Signo,
		}

		// Attach a fault address if appropriate.
		switch sig {
		case linux.SIGSEGV, linux.SIGFPE, linux.SIGILL, linux.SIGTRAP, linux.SIGBUS:
			ucs.FaultAddr = info.Addr()
		}

		eventchannel.Emit(ucs)

		t.PrepareGroupExit(linux.WaitStatusTerminationSignal(sig))
		return (*runExit)(nil)

	case SignalActionStop:
		// "Default action is to stop the process."
		t.initiateGroupStop(info)

	case SignalActionIgnore:
		// "Default action is to ignore the signal."
		t.Debugf("Signal %d: ignored", info.Signo)

	case SignalActionHandler:
		// Try to deliver the signal to the user-configured handler.
		t.Debugf("Signal %d: delivering to handler", info.Signo)
		if err := t.deliverSignalToHandler(info, act); err != nil {
			// This is not a warning, it can occur during normal operation.
			t.Debugf("Failed to deliver signal %+v to user handler: %v", info, err)

			// Send a forced SIGSEGV. If the signal that couldn't be delivered
			// was a SIGSEGV, force the handler to SIG_DFL.
			t.forceSignal(linux.SIGSEGV, sig == linux.SIGSEGV /* unconditional */)
			t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
		}

	default:
		panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(sig, act)))
	}
	return (*runInterrupt)(nil)
}

// deliverSignalToHandler changes the task's userspace state to enter the given
// user-configured handler for the given signal.
func (t *Task) deliverSignalToHandler(info *linux.SignalInfo, act linux.SigAction) error {
	// Signal delivery to an application handler interrupts restartable
	// sequences.
	t.rseqInterrupt()

	// Are we executing on the main stack, or the provided alternate stack?
	sp := hostarch.Addr(t.Arch().Stack())

	// N.B. This is a *copy* of the alternate stack that the user's signal
	// handler expects to see in its ucontext (even if it's not in use).
	alt := t.signalStack
	if act.Flags&linux.SA_ONSTACK != 0 && alt.IsEnabled() {
		alt.Flags |= linux.SS_ONSTACK
		if !alt.Contains(sp) {
			sp = hostarch.Addr(alt.Top())
		}
	}

	mm := t.MemoryManager()

	// Set up the signal handler. If we have a saved signal mask, the signal
	// handler should run with the current mask, but sigreturn should restore
	// the saved one.
	st := &arch.Stack{
		Arch:   t.Arch(),
		IO:     mm,
		Bottom: sp,
	}
	mask := t.signalMask
	if t.haveSavedSignalMask {
		mask = t.savedSignalMask
	}

	// Set up the restorer.
	// x86-64 should always use SA_RESTORER, but this flag is optional on other
	// platforms. Please see the linux code as reference:
	// linux/arch/x86/kernel/signal.c:__setup_rt_frame()
	// If SA_RESTORER is not configured, we can use the sigreturn trampolines
	// the vdso provides instead.
	// Please see the linux code as reference:
	// linux/arch/arm64/kernel/signal.c:setup_return()
	if act.Flags&linux.SA_RESTORER == 0 {
		act.Restorer = mm.VDSOSigReturn()
	}

	if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil {
		return err
	}
	t.p.FullStateChanged()
	t.haveSavedSignalMask = false

	// Add our signal mask.
	newMask := t.signalMask | act.Mask
	if act.Flags&linux.SA_NODEFER == 0 {
		newMask |= linux.SignalSetOf(linux.Signal(info.Signo))
	}
	t.SetSignalMask(newMask)

	return nil
}

var ctrlResume = &SyscallControl{ignoreReturn: true}

// SignalReturn implements sigreturn(2) (if rt is false) or rt_sigreturn(2) (if
// rt is true).
func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) {
	st := t.Stack()
	sigset, alt, err := t.Arch().SignalRestore(st, rt)
	if err != nil {
		return nil, err
	}

	// Attempt to record the given signal stack. Note that we silently
	// ignore failures here, as does Linux. Only an EFAULT may be
	// generated, but SignalRestore has already deserialized the entire
	// frame successfully.
	t.SetSignalStack(alt)

	// Restore our signal mask. SIGKILL and SIGSTOP should not be blocked.
	t.SetSignalMask(sigset &^ UnblockableSignals)
	t.p.FullStateChanged()

	return ctrlResume, nil
}

// Sigtimedwait implements the semantics of sigtimedwait(2).
//
// Preconditions:
// * The caller must be running on the task goroutine.
// * t.exitState < TaskExitZombie.
func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*linux.SignalInfo, error) {
	// set is the set of signals we're interested in; invert it to get the set
	// of signals to block.
	mask := ^(set &^ UnblockableSignals)

	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	if info := t.dequeueSignalLocked(mask); info != nil {
		return info, nil
	}

	if timeout == 0 {
		return nil, linuxerr.EAGAIN
	}

	// Unblock signals we're waiting for. Remember the original signal mask so
	// that Task.sendSignalTimerLocked doesn't discard ignored signals that
	// we're temporarily unblocking.
	t.realSignalMask = t.signalMask
	t.setSignalMaskLocked(t.signalMask & mask)

	// Wait for a timeout or new signal.
	t.tg.signalHandlers.mu.Unlock()
	_, err := t.BlockWithTimeout(nil, true, timeout)
	t.tg.signalHandlers.mu.Lock()

	// Restore the original signal mask.
	t.setSignalMaskLocked(t.realSignalMask)
	t.realSignalMask = 0

	if info := t.dequeueSignalLocked(mask); info != nil {
		return info, nil
	}
	if err == linuxerr.ETIMEDOUT {
		return nil, linuxerr.EAGAIN
	}
	return nil, err
}

// SendSignal sends the given signal to t.
//
// The following errors may be returned:
//
//	linuxerr.ESRCH - The task has exited.
//	linuxerr.EINVAL - The signal is not valid.
//	linuxerr.EAGAIN - The signal is realtime, and cannot be queued.
func (t *Task) SendSignal(info *linux.SignalInfo) error {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	return t.sendSignalLocked(info, false /* group */)
}

// SendGroupSignal sends the given signal to t's thread group.
func (t *Task) SendGroupSignal(info *linux.SignalInfo) error {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	return t.sendSignalLocked(info, true /* group */)
}

// SendSignal sends the given signal to tg, using tg's leader to determine if
// the signal is blocked.
func (tg *ThreadGroup) SendSignal(info *linux.SignalInfo) error {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	return tg.leader.sendSignalLocked(info, true /* group */)
}

func (t *Task) sendSignalLocked(info *linux.SignalInfo, group bool) error {
	return t.sendSignalTimerLocked(info, group, nil)
}

func (t *Task) sendSignalTimerLocked(info *linux.SignalInfo, group bool, timer *IntervalTimer) error {
	if t.exitState == TaskExitDead {
		return linuxerr.ESRCH
	}
	sig := linux.Signal(info.Signo)
	if sig == 0 {
		return nil
	}
	if !sig.IsValid() {
		return linuxerr.EINVAL
	}

	// Signal side effects apply even if the signal is ultimately discarded.
	t.tg.applySignalSideEffectsLocked(sig)

	// TODO: "Only signals for which the "init" process has established a
	// signal handler can be sent to the "init" process by other members of the
	// PID namespace. This restriction applies even to privileged processes,
	// and prevents other members of the PID namespace from accidentally
	// killing the "init" process." - pid_namespaces(7). We don't currently do
	// this for child namespaces, though we should; we also don't do this for
	// the root namespace (the same restriction applies to global init on
	// Linux), where whether or not we should is much murkier. In practice,
	// most sandboxed applications are not prepared to function as an init
	// process.

	// Unmasked, ignored signals are discarded without being queued, unless
	// they will be visible to a tracer. Even for group signals, it's the
	// originally-targeted task's signal mask and tracer that matter; compare
	// Linux's kernel/signal.c:__send_signal() => prepare_signal() =>
	// sig_ignored().
	ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore
	if sigset := linux.SignalSetOf(sig); sigset&t.signalMask == 0 && sigset&t.realSignalMask == 0 && ignored && !t.hasTracer() {
		t.Debugf("Discarding ignored signal %d", sig)
		if timer != nil {
			timer.signalRejectedLocked()
		}
		return nil
	}

	q := &t.pendingSignals
	if group {
		q = &t.tg.pendingSignals
	}
	if !q.enqueue(info, timer) {
		if sig.IsRealtime() {
			return linuxerr.EAGAIN
		}
		t.Debugf("Discarding duplicate signal %d", sig)
		if timer != nil {
			timer.signalRejectedLocked()
		}
		return nil
	}

	// Find a receiver to notify. Note that the task we choose to notify, if
	// any, may not be the task that actually dequeues and handles the signal;
	// e.g. a racing signal mask change may cause the notified task to become
	// ineligible, or a racing sibling task may dequeue the signal first.
if t.canReceiveSignalLocked(sig) { t.Debugf("Notified of signal %d", sig) t.interrupt() return nil } if group { if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { nt.Debugf("Notified of group signal %d", sig) nt.interrupt() return nil } } t.Debugf("No task notified of signal %d", sig) return nil } func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) { switch { case linux.SignalSetOf(sig)&StopSignals != 0: // Stop signals cause all prior SIGCONT to be discarded. (This is // despite the fact this has little effect since SIGCONT's most // important effect is applied when the signal is sent in the branch // below, not when the signal is delivered.) tg.discardSpecificLocked(linux.SIGCONT) case sig == linux.SIGCONT: // "The SIGCONT signal has a side effect of waking up (all threads of) // a group-stopped process. This side effect happens before // signal-delivery-stop. The tracer can't suppress this side effect (it // can only suppress signal injection, which only causes the SIGCONT // handler to not be executed in the tracee, if such a handler is // installed." - ptrace(2) tg.endGroupStopLocked(true) case sig == linux.SIGKILL: // "SIGKILL does not generate signal-delivery-stop and therefore the // tracer can't suppress it. SIGKILL kills even within system calls // (syscall-exit-stop is not generated prior to death by SIGKILL)." - // ptrace(2) // // Note that this differs from ThreadGroup.requestExit in that it // ignores tg.execing. if !tg.exiting { tg.exiting = true tg.exitStatus = linux.WaitStatusTerminationSignal(linux.SIGKILL) } for t := tg.tasks.Front(); t != nil; t = t.Next() { t.killLocked() } } } // canReceiveSignalLocked returns true if t should be interrupted to receive // the given signal. canReceiveSignalLocked is analogous to Linux's // kernel/signal.c:wants_signal(), but see below for divergences. // // Preconditions: The signal mutex must be locked. func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool { // Notify that the signal is queued. t.signalQueue.Notify(waiter.EventMask(linux.MakeSignalSet(sig))) // - Do not choose tasks that are blocking the signal. if linux.SignalSetOf(sig)&t.signalMask != 0 { return false } // - No need to check Task.exitState, as the exit path sets every bit in the // signal mask when it transitions from TaskExitNone to TaskExitInitiated. // - No special case for SIGKILL: SIGKILL already interrupted all tasks in the // task group via applySignalSideEffects => killLocked. // - Do not choose stopped tasks, which cannot handle signals. if t.stop != nil { return false } // - Do not choose tasks that have already been interrupted, as they may be // busy handling another signal. if len(t.interruptChan) != 0 { return false } return true } // findSignalReceiverLocked returns a task in tg that should be interrupted to // receive the given signal. If no such task exists, findSignalReceiverLocked // returns nil. // // Linux actually records curr_target to balance the group signal targets. // // Preconditions: The signal mutex must be locked. func (tg *ThreadGroup) findSignalReceiverLocked(sig linux.Signal) *Task { for t := tg.tasks.Front(); t != nil; t = t.Next() { if t.canReceiveSignalLocked(sig) { return t } } return nil } // forceSignal ensures that the task is not ignoring or blocking the given // signal. If unconditional is true, forceSignal takes action even if the // signal isn't being ignored or blocked. 
func (t *Task) forceSignal(sig linux.Signal, unconditional bool) { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.forceSignalLocked(sig, unconditional) } func (t *Task) forceSignalLocked(sig linux.Signal, unconditional bool) { blocked := linux.SignalSetOf(sig)&t.signalMask != 0 act := t.tg.signalHandlers.actions[sig] ignored := act.Handler == linux.SIG_IGN if blocked || ignored || unconditional { act.Handler = linux.SIG_DFL t.tg.signalHandlers.actions[sig] = act if blocked { t.setSignalMaskLocked(t.signalMask &^ linux.SignalSetOf(sig)) } } } // SignalMask returns a copy of t's signal mask. func (t *Task) SignalMask() linux.SignalSet { return linux.SignalSet(atomic.LoadUint64((*uint64)(&t.signalMask))) } // SetSignalMask sets t's signal mask. // // Preconditions: // * The caller must be running on the task goroutine. // * t.exitState < TaskExitZombie. func (t *Task) SetSignalMask(mask linux.SignalSet) { // By precondition, t prevents t.tg from completing an execve and mutating // t.tg.signalHandlers, so we can skip the TaskSet mutex. t.tg.signalHandlers.mu.Lock() t.setSignalMaskLocked(mask) t.tg.signalHandlers.mu.Unlock() } // Preconditions: The signal mutex must be locked. func (t *Task) setSignalMaskLocked(mask linux.SignalSet) { oldMask := t.signalMask atomic.StoreUint64((*uint64)(&t.signalMask), uint64(mask)) // If the new mask blocks any signals that were not blocked by the old // mask, and at least one such signal is pending in tg.pendingSignals, and // t has been woken, it could be the case that t was woken to handle that // signal, but will no longer do so as a result of its new signal mask, so // we have to pick a replacement. blocked := mask &^ oldMask blockedGroupPending := blocked & t.tg.pendingSignals.pendingSet if blockedGroupPending != 0 && t.interrupted() { linux.ForEachSignal(blockedGroupPending, func(sig linux.Signal) { if nt := t.tg.findSignalReceiverLocked(sig); nt != nil { nt.interrupt() return } }) } // Conversely, if the new mask unblocks any signals that were blocked by // the old mask, and at least one such signal is pending, we may now need // to handle that signal. unblocked := oldMask &^ mask unblockedPending := unblocked & (t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet) if unblockedPending != 0 { t.interruptSelf() } } // SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's // comment). // // Preconditions: The caller must be running on the task goroutine. func (t *Task) SetSavedSignalMask(mask linux.SignalSet) { t.savedSignalMask = mask t.haveSavedSignalMask = true } // SignalStack returns the task-private signal stack. func (t *Task) SignalStack() linux.SignalStack { t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) alt := t.signalStack if t.onSignalStack(alt) { alt.Flags |= linux.SS_ONSTACK } return alt } // onSignalStack returns true if the task is executing on the given signal stack. func (t *Task) onSignalStack(alt linux.SignalStack) bool { sp := hostarch.Addr(t.Arch().Stack()) return alt.Contains(sp) } // SetSignalStack sets the task-private signal stack. // // This value may not be changed if the task is currently executing on the // signal stack, i.e. if t.onSignalStack returns true. In this case, this // function will return false. Otherwise, true is returned. func (t *Task) SetSignalStack(alt linux.SignalStack) bool { // Check that we're not executing on the stack. 
if t.onSignalStack(t.signalStack) { return false } if alt.Flags&linux.SS_DISABLE != 0 { // Don't record anything beyond the flags. t.signalStack = linux.SignalStack{ Flags: linux.SS_DISABLE, } } else { // Mask out irrelevant parts: only disable matters. alt.Flags &= linux.SS_DISABLE t.signalStack = alt } return true } // SetSigAction atomically sets the thread group's signal action for signal sig // to *actptr (if actptr is not nil) and returns the old signal action. func (tg *ThreadGroup) SetSigAction(sig linux.Signal, actptr *linux.SigAction) (linux.SigAction, error) { if !sig.IsValid() { return linux.SigAction{}, linuxerr.EINVAL } tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() sh := tg.signalHandlers sh.mu.Lock() defer sh.mu.Unlock() oldact := sh.actions[sig] if actptr != nil { if sig == linux.SIGKILL || sig == linux.SIGSTOP { return oldact, linuxerr.EINVAL } act := *actptr act.Mask &^= UnblockableSignals sh.actions[sig] = act // From POSIX, by way of Linux: // // "Setting a signal action to SIG_IGN for a signal that is pending // shall cause the pending signal to be discarded, whether or not it is // blocked." // // "Setting a signal action to SIG_DFL for a signal that is pending and // whose default action is to ignore the signal (for example, SIGCHLD), // shall cause the pending signal to be discarded, whether or not it is // blocked." if computeAction(sig, act) == SignalActionIgnore { tg.discardSpecificLocked(sig) } } return oldact, nil } // groupStop is a TaskStop placed on tasks that have received a stop signal // (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from // the ptrace man page.) // // +stateify savable type groupStop struct{} // Killable implements TaskStop.Killable. func (*groupStop) Killable() bool { return true } // initiateGroupStop attempts to initiate a group stop based on a // previously-dequeued stop signal. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) initiateGroupStop(info *linux.SignalInfo) { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() if t.groupStopPending { t.Debugf("Signal %d: not stopping thread group: lost to racing stop signal", info.Signo) return } if !t.tg.groupStopDequeued { t.Debugf("Signal %d: not stopping thread group: lost to racing SIGCONT", info.Signo) return } if t.tg.exiting { t.Debugf("Signal %d: not stopping thread group: lost to racing group exit", info.Signo) return } if t.tg.execing != nil { t.Debugf("Signal %d: not stopping thread group: lost to racing execve", info.Signo) return } if !t.tg.groupStopComplete { t.tg.groupStopSignal = linux.Signal(info.Signo) } t.tg.groupStopPendingCount = 0 for t2 := t.tg.tasks.Front(); t2 != nil; t2 = t2.Next() { if t2.killedLocked() || t2.exitState >= TaskExitInitiated { t2.groupStopPending = false continue } t2.groupStopPending = true t2.groupStopAcknowledged = false if t2.ptraceSeized { t2.trapNotifyPending = true if s, ok := t2.stop.(*ptraceStop); ok && s.listen { t2.endInternalStopLocked() } } t2.interrupt() t.tg.groupStopPendingCount++ } t.Debugf("Signal %d: stopping %d threads in thread group", info.Signo, t.tg.groupStopPendingCount) } // endGroupStopLocked ensures that all prior stop signals received by tg are // not stopping tg and will not stop tg in the future. If broadcast is true, // parent and tracer notification will be scheduled if appropriate. // // Preconditions: The signal mutex must be locked. 
func (tg *ThreadGroup) endGroupStopLocked(broadcast bool) { // Discard all previously-queued stop signals. linux.ForEachSignal(StopSignals, tg.discardSpecificLocked) if tg.groupStopPendingCount == 0 && !tg.groupStopComplete { return } completeStr := "incomplete" if tg.groupStopComplete { completeStr = "complete" } tg.leader.Debugf("Ending %s group stop with %d threads pending", completeStr, tg.groupStopPendingCount) for t := tg.tasks.Front(); t != nil; t = t.Next() { t.groupStopPending = false if t.ptraceSeized { t.trapNotifyPending = true if s, ok := t.stop.(*ptraceStop); ok && s.listen { t.endInternalStopLocked() } } else { if _, ok := t.stop.(*groupStop); ok { t.endInternalStopLocked() } } } if broadcast { // Instead of notifying the parent here, set groupContNotify so that // one of the continuing tasks does so. (Linux does something similar.) // The reason we do this is to keep locking sane. In order to send a // signal to the parent, we need to lock its signal mutex, but we're // already holding tg's signal mutex, and the TaskSet mutex must be // locked for writing for us to hold two signal mutexes. Since we don't // want to require this for endGroupStopLocked (which is called from // signal-sending paths), nor do we want to lose atomicity by releasing // the mutexes we're already holding, just let the continuing thread // group deal with it. tg.groupContNotify = true tg.groupContInterrupted = !tg.groupStopComplete tg.groupContWaitable = true } // Unsetting groupStopDequeued will cause racing calls to initiateGroupStop // to recognize that the group stop has been cancelled. tg.groupStopDequeued = false tg.groupStopSignal = 0 tg.groupStopPendingCount = 0 tg.groupStopComplete = false tg.groupStopWaitable = false } // participateGroupStopLocked is called to handle thread group side effects // after t unsets t.groupStopPending. The caller must handle task side effects // (e.g. placing the task goroutine into the group stop). It returns true if // the caller must notify t.tg.leader's parent of a completed group stop (which // participateGroupStopLocked cannot do due to holding the wrong locks). // // Preconditions: The signal mutex must be locked. func (t *Task) participateGroupStopLocked() bool { if t.groupStopAcknowledged { return false } t.groupStopAcknowledged = true t.tg.groupStopPendingCount-- if t.tg.groupStopPendingCount != 0 { return false } if t.tg.groupStopComplete { return false } t.Debugf("Completing group stop") t.tg.groupStopComplete = true t.tg.groupStopWaitable = true t.tg.groupContNotify = false t.tg.groupContWaitable = false return true } // signalStop sends a signal to t's thread group of a new group stop, group // continue, or ptrace stop, if appropriate. code and status are set in the // signal sent to tg, if any. // // Preconditions: The TaskSet mutex must be locked (for reading or writing). func (t *Task) signalStop(target *Task, code int32, status int32) { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() act, ok := t.tg.signalHandlers.actions[linux.SIGCHLD] if !ok || (act.Handler != linux.SIG_IGN && act.Flags&linux.SA_NOCLDSTOP == 0) { sigchld := &linux.SignalInfo{ Signo: int32(linux.SIGCHLD), Code: code, } sigchld.SetPID(int32(t.tg.pidns.tids[target])) sigchld.SetUID(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) sigchld.SetStatus(status) // TODO(b/72102453): Set utime, stime. t.sendSignalLocked(sigchld, true /* group */) } } // The runInterrupt state handles conditions indicated by interrupts. 
// // +stateify savable type runInterrupt struct{} func (*runInterrupt) execute(t *Task) taskRunState { // Interrupts are de-duplicated (t.unsetInterrupted() will undo the effect // of all previous calls to t.interrupted() regardless of how many such // calls there have been), so early exits from this function must re-enter // the runInterrupt state to check for more interrupt-signaled conditions. t.tg.signalHandlers.mu.Lock() // Did we just leave a group stop? if t.tg.groupContNotify { t.tg.groupContNotify = false sig := t.tg.groupStopSignal intr := t.tg.groupContInterrupted t.tg.signalHandlers.mu.Unlock() t.tg.pidns.owner.mu.RLock() // For consistency with Linux, if the parent and (thread group // leader's) tracer are in the same thread group, deduplicate // notifications. notifyParent := t.tg.leader.parent != nil if tracer := t.tg.leader.Tracer(); tracer != nil { if notifyParent && tracer.tg == t.tg.leader.parent.tg { notifyParent = false } // Sending CLD_STOPPED to the tracer doesn't really make any sense; // the thread group leader may have already entered the stop and // notified its tracer accordingly. But it's consistent with // Linux... if intr { tracer.signalStop(t.tg.leader, linux.CLD_STOPPED, int32(sig)) if !notifyParent { tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop | EventChildGroupStop) } else { tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop) } } else { tracer.signalStop(t.tg.leader, linux.CLD_CONTINUED, int32(sig)) tracer.tg.eventQueue.Notify(EventGroupContinue) } } if notifyParent { // If groupContInterrupted, do as Linux does and pretend the group // stop completed just before it ended. The theoretical behavior in // this case would be to send a SIGCHLD indicating the completed // stop, followed by a SIGCHLD indicating the continue. However, // SIGCHLD is a standard signal, so the latter would always be // dropped. Hence sending only the former is equivalent. if intr { t.tg.leader.parent.signalStop(t.tg.leader, linux.CLD_STOPPED, int32(sig)) t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue | EventChildGroupStop) } else { t.tg.leader.parent.signalStop(t.tg.leader, linux.CLD_CONTINUED, int32(sig)) t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue) } } t.tg.pidns.owner.mu.RUnlock() return (*runInterrupt)(nil) } // Do we need to enter a group stop or related ptrace stop? This path is // analogous to Linux's kernel/signal.c:get_signal() => do_signal_stop() // (with ptrace enabled) and do_jobctl_trap(). if t.groupStopPending || t.trapStopPending || t.trapNotifyPending { sig := t.tg.groupStopSignal notifyParent := false if t.groupStopPending { t.groupStopPending = false // We care about t.tg.groupStopSignal (for tracer notification) // even if this doesn't complete a group stop, so keep the // value of sig we've already read. notifyParent = t.participateGroupStopLocked() } t.trapStopPending = false t.trapNotifyPending = false // Drop the signal mutex so we can take the TaskSet mutex. t.tg.signalHandlers.mu.Unlock() t.tg.pidns.owner.mu.RLock() if t.tg.leader.parent == nil { notifyParent = false } if tracer := t.Tracer(); tracer != nil { if t.ptraceSeized { if sig == 0 { sig = linux.SIGTRAP } // "If tracee was attached using PTRACE_SEIZE, group-stop is // indicated by PTRACE_EVENT_STOP: status>>16 == // PTRACE_EVENT_STOP. This allows detection of group-stops // without requiring an extra PTRACE_GETSIGINFO call." 
- // "Group-stop", ptrace(2) t.ptraceCode = int32(sig) | linux.PTRACE_EVENT_STOP<<8 t.ptraceSiginfo = &linux.SignalInfo{ Signo: int32(sig), Code: t.ptraceCode, } t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t])) t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) } else { t.ptraceCode = int32(sig) t.ptraceSiginfo = nil } if t.beginPtraceStopLocked() { tracer.signalStop(t, linux.CLD_STOPPED, int32(sig)) // For consistency with Linux, if the parent and tracer are in the // same thread group, deduplicate notification signals. if notifyParent && tracer.tg == t.tg.leader.parent.tg { notifyParent = false tracer.tg.eventQueue.Notify(EventChildGroupStop | EventTraceeStop) } else { tracer.tg.eventQueue.Notify(EventTraceeStop) } } } else { t.tg.signalHandlers.mu.Lock() if !t.killedLocked() { t.beginInternalStopLocked((*groupStop)(nil)) } t.tg.signalHandlers.mu.Unlock() } if notifyParent { t.tg.leader.parent.signalStop(t.tg.leader, linux.CLD_STOPPED, int32(sig)) t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop) } t.tg.pidns.owner.mu.RUnlock() return (*runInterrupt)(nil) } // Are there signals pending? if info := t.dequeueSignalLocked(t.signalMask); info != nil { t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 { // Indicate that we've dequeued a stop signal before unlocking the // signal mutex; initiateGroupStop will check for races with // endGroupStopLocked after relocking it. t.tg.groupStopDequeued = true } if t.ptraceSignalLocked(info) { // Dequeueing the signal action must wait until after the // signal-delivery-stop ends since the tracer can change or // suppress the signal. t.tg.signalHandlers.mu.Unlock() return (*runInterruptAfterSignalDeliveryStop)(nil) } act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) t.tg.signalHandlers.mu.Unlock() return t.deliverSignal(info, act) } t.unsetInterrupted() t.tg.signalHandlers.mu.Unlock() return (*runApp)(nil) } // +stateify savable type runInterruptAfterSignalDeliveryStop struct{} func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState { t.tg.pidns.owner.mu.Lock() // Can't defer unlock: deliverSignal must be called without holding TaskSet // mutex. sig := linux.Signal(t.ptraceCode) defer func() { t.ptraceSiginfo = nil }() if !sig.IsValid() { t.tg.pidns.owner.mu.Unlock() return (*runInterrupt)(nil) } info := t.ptraceSiginfo if sig != linux.Signal(info.Signo) { info.Signo = int32(sig) info.Errno = 0 info.Code = linux.SI_USER // pid isn't a valid field for all signal numbers, but Linux // doesn't care (kernel/signal.c:ptrace_signal()). // // Linux uses t->parent for the tid and uid here, which is the tracer // if it hasn't detached or the real parent otherwise. parent := t.parent if tracer := t.Tracer(); tracer != nil { parent = tracer } if parent == nil { // Tracer has detached and t was created by Kernel.CreateProcess(). // Pretend the parent is in an ancestor PID + user namespace. info.SetPID(0) info.SetUID(int32(auth.OverflowUID)) } else { info.SetPID(int32(t.tg.pidns.tids[parent])) info.SetUID(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) } } t.tg.signalHandlers.mu.Lock() t.tg.pidns.owner.mu.Unlock() // If the signal is masked, re-queue it. 
if linux.SignalSetOf(sig)&t.signalMask != 0 { t.sendSignalLocked(info, false /* group */) t.tg.signalHandlers.mu.Unlock() return (*runInterrupt)(nil) } act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo)) t.tg.signalHandlers.mu.Unlock() return t.deliverSignal(info, act) } // SignalRegister registers a waiter for pending signals. func (t *Task) SignalRegister(e *waiter.Entry, mask waiter.EventMask) { t.tg.signalHandlers.mu.Lock() t.signalQueue.EventRegister(e, mask) t.tg.signalHandlers.mu.Unlock() } // SignalUnregister unregisters a waiter for pending signals. func (t *Task) SignalUnregister(e *waiter.Entry) { t.tg.signalHandlers.mu.Lock() t.signalQueue.EventUnregister(e) t.tg.signalHandlers.mu.Unlock() }
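To make the mask-update arithmetic in setSignalMaskLocked above concrete, here is a minimal, self-contained sketch that uses plain uint64 bitsets in place of linux.SignalSet. Everything in it (signalSetOf, the example signal numbers) is illustrative rather than part of the kernel package's API; it only demonstrates the blocked/unblocked set computations.

package main

import "fmt"

// signalSetOf returns the bitset with only the given 1-based signal number
// set, mirroring the layout used by Linux signal sets.
func signalSetOf(sig uint) uint64 { return uint64(1) << (sig - 1) }

func main() {
	oldMask := signalSetOf(10)                  // signal 10 was blocked.
	newMask := signalSetOf(12)                  // now signal 12 is blocked instead.
	pending := signalSetOf(12) | signalSetOf(2) // signals 12 and 2 are pending.

	// Signals blocked by the new mask but not the old one: if any of these
	// are pending group signals, a replacement receiver must be found.
	blocked := newMask &^ oldMask
	fmt.Printf("newly blocked and pending: %b\n", blocked&pending)

	// Signals unblocked by the new mask: if any of these are pending, the
	// task must interrupt itself to handle them.
	unblocked := oldMask &^ newMask
	fmt.Printf("newly unblocked and pending: %b\n", unblocked&pending)
}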
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package time provides a calibrated clock synchronized to a system reference // clock. package time import ( "time" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sync" ) // CalibratedClock implements a clock that tracks a reference clock. // // Users should call Update at regular intervals of around approxUpdateInterval // to ensure that the clock does not drift significantly from the reference // clock. type CalibratedClock struct { // mu protects the fields below. // TODO(mpratt): consider a sequence counter for read locking. mu sync.RWMutex // ref samples the reference clock that this clock is calibrated // against. ref *sampler // ready indicates that the fields below are ready for use in calculating // time. ready bool // params are the current timekeeping parameters. params Parameters // errorNS is the estimated clock error in nanoseconds. errorNS ReferenceNS } // NewCalibratedClock creates a CalibratedClock that tracks the given ClockID. func NewCalibratedClock(c ClockID) *CalibratedClock { return &CalibratedClock{ ref: newSampler(c), } } // Debugf logs at debug level. func (c *CalibratedClock) Debugf(format string, v ...interface{}) { if log.IsLogging(log.Debug) { args := []interface{}{c.ref.clockID} args = append(args, v...) log.Debugf("CalibratedClock(%v): "+format, args...) } } // Infof logs at info level. func (c *CalibratedClock) Infof(format string, v ...interface{}) { if log.IsLogging(log.Info) { args := []interface{}{c.ref.clockID} args = append(args, v...) log.Infof("CalibratedClock(%v): "+format, args...) } } // Warningf logs at warning level. func (c *CalibratedClock) Warningf(format string, v ...interface{}) { if log.IsLogging(log.Warning) { args := []interface{}{c.ref.clockID} args = append(args, v...) log.Warningf("CalibratedClock(%v): "+format, args...)
} } // reset forces the clock to restart the calibration process, logging the // passed message. func (c *CalibratedClock) reset(str string, v ...interface{}) { c.mu.Lock() defer c.mu.Unlock() c.resetLocked(str, v...) } // resetLocked is equivalent to reset with c.mu already held for writing. func (c *CalibratedClock) resetLocked(str string, v ...interface{}) { c.Warningf(str+" Resetting clock; time may jump.", v...) c.ready = false c.ref.Reset() metric.WeirdnessMetric.Increment("time_fallback") } // updateParams updates the timekeeping parameters based on the passed // parameters. // // actual is the estimated timekeeping parameters. The stored parameters // may need to be adjusted slightly from these values to compensate for error. // // Preconditions: c.mu must be held for writing. func (c *CalibratedClock) updateParams(actual Parameters) { if !c.ready { // At initial calibration there is nothing to correct. c.params = actual c.ready = true c.Infof("ready") return } // Otherwise, adjust the params to correct for errors. newParams, errorNS, err := errorAdjust(c.params, actual, actual.BaseCycles) if err != nil { // Something is very wrong. Reset and try again from the // beginning. c.resetLocked("Unable to update params: %v.", err) return } logErrorAdjustment(c.ref.clockID, errorNS, c.params, newParams) if errorNS.Magnitude() >= MaxClockError { // We should never get such extreme error, something is very // wrong. Reset everything and start again. // // N.B. logErrorAdjustment will have already logged the error // at warning level. // // TODO(mpratt): We could allow Realtime clock jumps here. c.resetLocked("Extreme clock error.") return } c.params = newParams c.errorNS = errorNS } // Update runs the update step of the clock, updating its synchronization with // the reference clock. // // Update returns the new timekeeping parameters and true if the clock is // calibrated. Update should be called regularly to prevent the clock from // getting significantly out of sync with the reference clock. // // The returned timekeeping parameters are invalidated on the next call to // Update. func (c *CalibratedClock) Update() (Parameters, bool) { c.mu.Lock() defer c.mu.Unlock() if err := c.ref.Sample(); err != nil { c.resetLocked("Unable to update calibrated clock: %v.", err) return Parameters{}, false } oldest, newest, ok := c.ref.Range() if !ok { // Not ready yet. return Parameters{}, false } minCount := uint64(newest.before - oldest.after) maxCount := uint64(newest.after - oldest.before) refInterval := uint64(newest.ref - oldest.ref) // freq hz = count / (interval ns) * (nsPerS ns) / (1 s) nsPerS := uint64(time.Second.Nanoseconds()) minHz, ok := muldiv64(minCount, nsPerS, refInterval) if !ok { c.resetLocked("Unable to update calibrated clock: (%v - %v) * %v / %v overflows.", newest.before, oldest.after, nsPerS, refInterval) return Parameters{}, false } maxHz, ok := muldiv64(maxCount, nsPerS, refInterval) if !ok { c.resetLocked("Unable to update calibrated clock: (%v - %v) * %v / %v overflows.", newest.after, oldest.before, nsPerS, refInterval) return Parameters{}, false } c.updateParams(Parameters{ Frequency: (minHz + maxHz) / 2, BaseRef: newest.ref, BaseCycles: newest.after, }) return c.params, true } // GetTime returns the current time based on the clock calibration. func (c *CalibratedClock) GetTime() (int64, error) { c.mu.RLock() if !c.ready { // Fallback to a syscall.
now, err := c.ref.Syscall() c.mu.RUnlock() return int64(now), err } now := c.ref.Cycles() v, ok := c.params.ComputeTime(now) if !ok { // Something is seriously wrong with the clock. Try // again with syscalls. c.resetLocked("Time computation overflowed. params = %+v, now = %v.", c.params, now) now, err := c.ref.Syscall() c.mu.RUnlock() return int64(now), err } c.mu.RUnlock() return v, nil } // CalibratedClocks contains calibrated monotonic and realtime clocks. // // TODO(mpratt): We know that Linux runs the monotonic and realtime clocks at // the same rate, so rather than tracking both individually, we could do one // calibration for both clocks. type CalibratedClocks struct { // monotonic is the clock tracking the system monotonic clock. monotonic *CalibratedClock // realtime is the realtime equivalent of monotonic. realtime *CalibratedClock } // NewCalibratedClocks creates a CalibratedClocks. func NewCalibratedClocks() *CalibratedClocks { return &CalibratedClocks{ monotonic: NewCalibratedClock(Monotonic), realtime: NewCalibratedClock(Realtime), } } // Update implements Clocks.Update. func (c *CalibratedClocks) Update() (Parameters, bool, Parameters, bool) { monotonicParams, monotonicOk := c.monotonic.Update() realtimeParams, realtimeOk := c.realtime.Update() return monotonicParams, monotonicOk, realtimeParams, realtimeOk } // GetTime implements Clocks.GetTime. func (c *CalibratedClocks) GetTime(id ClockID) (int64, error) { switch id { case Monotonic: return c.monotonic.GetTime() case Realtime: return c.realtime.GetTime() default: return 0, linuxerr.EINVAL } }
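The frequency estimate in Update hinges on muldiv64, an internal helper that computes count * nsPerS / refInterval without losing precision. As a rough illustration of that arithmetic, here is a self-contained sketch using math/bits for a 128-bit intermediate; the overflow-reporting semantics are an assumption meant to mirror the real helper, not its actual implementation.

package main

import (
	"fmt"
	"math/bits"
)

// muldiv64 computes (a*b)/c using a 128-bit intermediate product, returning
// false if the quotient would not fit in 64 bits (assumed semantics).
func muldiv64(a, b, c uint64) (uint64, bool) {
	hi, lo := bits.Mul64(a, b)
	if hi >= c {
		return 0, false // quotient overflows 64 bits; Div64 would panic.
	}
	q, _ := bits.Div64(hi, lo, c)
	return q, true
}

func main() {
	// Suppose 2,500,000 cycles elapsed over a 1ms reference interval.
	const nsPerS = uint64(1_000_000_000)
	cycles := uint64(2_500_000)
	intervalNS := uint64(1_000_000)
	if hz, ok := muldiv64(cycles, nsPerS, intervalNS); ok {
		fmt.Println("estimated frequency:", hz, "Hz") // 2.5 GHz
	}
}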
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/eventfd" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) // IoSetup implements linux syscall io_setup(2). func IoSetup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nrEvents := args[0].Int() idAddr := args[1].Pointer() // Linux uses the native long as the aio ID. // // The context pointer _must_ be zero initially. var idIn uint64 if _, err := primitive.CopyUint64In(t, idAddr, &idIn); err != nil { return 0, nil, err } if idIn != 0 { return 0, nil, linuxerr.EINVAL } id, err := t.MemoryManager().NewAIOContext(t, uint32(nrEvents)) if err != nil { return 0, nil, err } // Copy out the new ID. if _, err := primitive.CopyUint64Out(t, idAddr, id); err != nil { t.MemoryManager().DestroyAIOContext(t, id) return 0, nil, err } return 0, nil, nil } // IoDestroy implements linux syscall io_destroy(2).
func IoDestroy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := args[0].Uint64() ctx := t.MemoryManager().DestroyAIOContext(t, id) if ctx == nil { // Does not exist. return 0, nil, linuxerr.EINVAL } // Drain completed requests and wait for pending requests until there are no // more. for { ctx.Drain() ch := ctx.WaitChannel() if ch == nil { // No more requests, we're done. return 0, nil, nil } // The task cannot be interrupted during the wait. Equivalent to // TASK_UNINTERRUPTIBLE in Linux. t.UninterruptibleSleepStart(true /* deactivate */) <-ch t.UninterruptibleSleepFinish(true /* activate */) } } // IoGetevents implements linux syscall io_getevents(2). func IoGetevents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := args[0].Uint64() minEvents := args[1].Int() events := args[2].Int() eventsAddr := args[3].Pointer() timespecAddr := args[4].Pointer() // Sanity check arguments. if minEvents < 0 || minEvents > events { return 0, nil, linuxerr.EINVAL } ctx, ok := t.MemoryManager().LookupAIOContext(t, id) if !ok { return 0, nil, linuxerr.EINVAL } // Set up the timeout. var haveDeadline bool var deadline ktime.Time if timespecAddr != 0 { d, err := copyTimespecIn(t, timespecAddr) if err != nil { return 0, nil, err } if !d.Valid() { return 0, nil, linuxerr.EINVAL } deadline = t.Kernel().MonotonicClock().Now().Add(d.ToDuration()) haveDeadline = true } // Loop over all requests. for count := int32(0); count < events; count++ { // Get a request, per semantics. var v interface{} if count >= minEvents { var ok bool v, ok = ctx.PopRequest() if !ok { return uintptr(count), nil, nil } } else { var err error v, err = waitForRequest(ctx, t, haveDeadline, deadline) if err != nil { if count > 0 || linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return uintptr(count), nil, nil } return 0, nil, syserror.ConvertIntr(err, syserror.EINTR) } } ev := v.(*linux.IOEvent) // Copy out the result. if _, err := ev.CopyOut(t, eventsAddr); err != nil { if count > 0 { return uintptr(count), nil, nil } // Nothing done. return 0, nil, err } // Keep rolling. eventsAddr += hostarch.Addr(linux.IOEventSize) } // Everything finished. return uintptr(events), nil, nil } func waitForRequest(ctx *mm.AIOContext, t *kernel.Task, haveDeadline bool, deadline ktime.Time) (interface{}, error) { for { if v, ok := ctx.PopRequest(); ok { // Request was readily available. Just return it. return v, nil } // Need to wait for request completion. done := ctx.WaitChannel() if done == nil { // Context has been destroyed. return nil, linuxerr.EINVAL } if err := t.BlockWithDeadline(done, haveDeadline, deadline); err != nil { return nil, err } } } // memoryFor returns appropriate memory for the given callback. func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) { bytes := int(cb.Bytes) if bytes < 0 { // Linux also requires that this field fit in ssize_t. return usermem.IOSequence{}, linuxerr.EINVAL } // Since this I/O will be asynchronous with respect to t's task goroutine, // we have no guarantee that t's AddressSpace will be active during the // I/O.
switch cb.OpCode { case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE: return t.SingleIOSequence(hostarch.Addr(cb.Buf), bytes, usermem.IOOpts{ AddressSpaceActive: false, }) case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV: return t.IovecsIOSequence(hostarch.Addr(cb.Buf), bytes, usermem.IOOpts{ AddressSpaceActive: false, }) case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP: return usermem.IOSequence{}, nil default: // Not a supported command. return usermem.IOSequence{}, linuxerr.EINVAL } } // IoCancel implements linux syscall io_cancel(2). // // It is not presently supported (ENOSYS indicates no support on this // architecture). func IoCancel(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, syserror.ENOSYS } // LINT.IfChange func getAIOCallback(t *kernel.Task, file *fs.File, cbAddr hostarch.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, actx *mm.AIOContext, eventFile *fs.File) kernel.AIOCallback { return func(ctx context.Context) { if actx.Dead() { actx.CancelPendingRequest() return } ev := &linux.IOEvent{ Data: cb.Data, Obj: uint64(cbAddr), } var err error switch cb.OpCode { case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV: ev.Result, err = file.Preadv(ctx, ioseq, cb.Offset) case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: ev.Result, err = file.Pwritev(ctx, ioseq, cb.Offset) case linux.IOCB_CMD_FSYNC: err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) case linux.IOCB_CMD_FDSYNC: err = file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncData) } // Update the result. if err != nil { err = handleIOError(t, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", file) ev.Result = -int64(kernel.ExtractErrno(err, 0)) } file.DecRef(ctx) // Queue the result for delivery. actx.FinishRequest(ev) // Notify the event file if one was specified. This needs to happen // *after* queueing the result to avoid racing with the thread we may // wake up. if eventFile != nil { eventFile.FileOperations.(*eventfd.EventOperations).Signal(1) eventFile.DecRef(ctx) } } } // submitCallback processes a single callback. func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr hostarch.Addr) error { file := t.GetFile(cb.FD) if file == nil { // File not found. return linuxerr.EBADF } defer file.DecRef(t) // Was there an eventFD? Extract it. var eventFile *fs.File if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { eventFile = t.GetFile(cb.ResFD) if eventFile == nil { // Bad FD. return linuxerr.EBADF } defer eventFile.DecRef(t) // Check that it is an eventfd. if _, ok := eventFile.FileOperations.(*eventfd.EventOperations); !ok { // Not an event FD. return linuxerr.EINVAL } } ioseq, err := memoryFor(t, cb) if err != nil { return err } // Check offset for reads/writes. switch cb.OpCode { case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: if cb.Offset < 0 { return linuxerr.EINVAL } } // Prepare the request. ctx, ok := t.MemoryManager().LookupAIOContext(t, id) if !ok { return linuxerr.EINVAL } if err := ctx.Prepare(); err != nil { return err } if eventFile != nil { // The request is set. Make sure there's a ref on the file. // // This is necessary when the callback executes on completion, // which is also what will release this reference. eventFile.IncRef() } // Perform the request asynchronously. file.IncRef() t.QueueAIO(getAIOCallback(t, file, cbAddr, cb, ioseq, ctx, eventFile)) // All set. return nil } // IoSubmit implements linux syscall io_submit(2). 
func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := args[0].Uint64() nrEvents := args[1].Int() addr := args[2].Pointer() if nrEvents < 0 { return 0, nil, linuxerr.EINVAL } for i := int32(0); i < nrEvents; i++ { // Copy in the callback address. var cbAddr hostarch.Addr switch t.Arch().Width() { case 8: var cbAddrP primitive.Uint64 if _, err := cbAddrP.CopyIn(t, addr); err != nil { if i > 0 { // Some successful. return uintptr(i), nil, nil } // Nothing done. return 0, nil, err } cbAddr = hostarch.Addr(cbAddrP) default: return 0, nil, syserror.ENOSYS } // Copy in this callback. var cb linux.IOCallback if _, err := cb.CopyIn(t, cbAddr); err != nil { if i > 0 { // Some have been successful. return uintptr(i), nil, nil } // Nothing done. return 0, nil, err } // Process this callback. if err := submitCallback(t, id, &cb, cbAddr); err != nil { if i > 0 { // Partial success. return uintptr(i), nil, nil } // Nothing done. return 0, nil, err } // Advance to the next one. addr += hostarch.Addr(t.Arch().Width()) } return uintptr(nrEvents), nil, nil } // LINT.ThenChange(vfs2/aio.go)
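IoSubmit and IoGetevents share a convention worth calling out: once at least one unit of work has succeeded, a subsequent error is swallowed and the count of successes is returned instead, so the error resurfaces on the caller's next invocation. Here is a minimal sketch of that convention in isolation; all names (submitAll, the callbacks) are illustrative, not part of the syscall layer.

package main

import (
	"errors"
	"fmt"
)

// submitAll applies submit to each item, returning the number of successes.
// An error after the first success is deferred to a future call, mirroring
// the partial-success behavior of IoSubmit above.
func submitAll(items []int, submit func(int) error) (uintptr, error) {
	for i, it := range items {
		if err := submit(it); err != nil {
			if i > 0 {
				return uintptr(i), nil // Partial success.
			}
			return 0, err // Nothing done.
		}
	}
	return uintptr(len(items)), nil
}

func main() {
	errBad := errors.New("bad callback")
	n, err := submitAll([]int{1, 2, 3}, func(it int) error {
		if it == 3 {
			return errBad
		}
		return nil
	})
	fmt.Println(n, err) // 2 <nil>: the error would surface on a retry.
}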
// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcpip import ( "fmt" ) // Error represents an error in the netstack error space. // // The error interface is intentionally omitted to avoid loss of type // information that would occur if these errors were passed as error. type Error interface { isError() // IgnoreStats indicates whether this error should be included in failure // counts in tcpip.Stats structs. IgnoreStats() bool fmt.Stringer } // LINT.IfChange // ErrAborted indicates the operation was aborted. // // +stateify savable type ErrAborted struct{} func (*ErrAborted) isError() {} // IgnoreStats implements Error. func (*ErrAborted) IgnoreStats() bool { return false } func (*ErrAborted) String() string { return "operation aborted" } // ErrAddressFamilyNotSupported indicates the operation does not support the // given address family.
// // +stateify savable type ErrAddressFamilyNotSupported struct{} func (*ErrAddressFamilyNotSupported) isError() {} // IgnoreStats implements Error. func (*ErrAddressFamilyNotSupported) IgnoreStats() bool { return false } func (*ErrAddressFamilyNotSupported) String() string { return "address family not supported by protocol" } // ErrAlreadyBound indicates the endpoint is already bound. // // +stateify savable type ErrAlreadyBound struct{} func (*ErrAlreadyBound) isError() {} // IgnoreStats implements Error. func (*ErrAlreadyBound) IgnoreStats() bool { return true } func (*ErrAlreadyBound) String() string { return "endpoint already bound" } // ErrAlreadyConnected indicates the endpoint is already connected. // // +stateify savable type ErrAlreadyConnected struct{} func (*ErrAlreadyConnected) isError() {} // IgnoreStats implements Error. func (*ErrAlreadyConnected) IgnoreStats() bool { return true } func (*ErrAlreadyConnected) String() string { return "endpoint is already connected" } // ErrAlreadyConnecting indicates the endpoint is already connecting. // // +stateify savable type ErrAlreadyConnecting struct{} func (*ErrAlreadyConnecting) isError() {} // IgnoreStats implements Error. func (*ErrAlreadyConnecting) IgnoreStats() bool { return true } func (*ErrAlreadyConnecting) String() string { return "endpoint is already connecting" } // ErrBadAddress indicates a bad address was provided. // // +stateify savable type ErrBadAddress struct{} func (*ErrBadAddress) isError() {} // IgnoreStats implements Error. func (*ErrBadAddress) IgnoreStats() bool { return false } func (*ErrBadAddress) String() string { return "bad address" } // ErrBadBuffer indicates a bad buffer was provided. // // +stateify savable type ErrBadBuffer struct{} func (*ErrBadBuffer) isError() {} // IgnoreStats implements Error. func (*ErrBadBuffer) IgnoreStats() bool { return false } func (*ErrBadBuffer) String() string { return "bad buffer" } // ErrBadLocalAddress indicates a bad local address was provided. // // +stateify savable type ErrBadLocalAddress struct{} func (*ErrBadLocalAddress) isError() {} // IgnoreStats implements Error. func (*ErrBadLocalAddress) IgnoreStats() bool { return false } func (*ErrBadLocalAddress) String() string { return "bad local address" } // ErrBroadcastDisabled indicates broadcast is not enabled on the endpoint. // // +stateify savable type ErrBroadcastDisabled struct{} func (*ErrBroadcastDisabled) isError() {} // IgnoreStats implements Error. func (*ErrBroadcastDisabled) IgnoreStats() bool { return false } func (*ErrBroadcastDisabled) String() string { return "broadcast socket option disabled" } // ErrClosedForReceive indicates the endpoint is closed for incoming data. // // +stateify savable type ErrClosedForReceive struct{} func (*ErrClosedForReceive) isError() {} // IgnoreStats implements Error. func (*ErrClosedForReceive) IgnoreStats() bool { return false } func (*ErrClosedForReceive) String() string { return "endpoint is closed for receive" } // ErrClosedForSend indicates the endpoint is closed for outgoing data. // // +stateify savable type ErrClosedForSend struct{} func (*ErrClosedForSend) isError() {} // IgnoreStats implements Error. func (*ErrClosedForSend) IgnoreStats() bool { return false } func (*ErrClosedForSend) String() string { return "endpoint is closed for send" } // ErrConnectStarted indicates the endpoint is connecting asynchronously. // // +stateify savable type ErrConnectStarted struct{} func (*ErrConnectStarted) isError() {} // IgnoreStats implements Error. 
func (*ErrConnectStarted) IgnoreStats() bool { return true } func (*ErrConnectStarted) String() string { return "connection attempt started" } // ErrConnectionAborted indicates the connection was aborted. // // +stateify savable type ErrConnectionAborted struct{} func (*ErrConnectionAborted) isError() {} // IgnoreStats implements Error. func (*ErrConnectionAborted) IgnoreStats() bool { return false } func (*ErrConnectionAborted) String() string { return "connection aborted" } // ErrConnectionRefused indicates the connection was refused. // // +stateify savable type ErrConnectionRefused struct{} func (*ErrConnectionRefused) isError() {} // IgnoreStats implements Error. func (*ErrConnectionRefused) IgnoreStats() bool { return false } func (*ErrConnectionRefused) String() string { return "connection was refused" } // ErrConnectionReset indicates the connection was reset. // // +stateify savable type ErrConnectionReset struct{} func (*ErrConnectionReset) isError() {} // IgnoreStats implements Error. func (*ErrConnectionReset) IgnoreStats() bool { return false } func (*ErrConnectionReset) String() string { return "connection reset by peer" } // ErrDestinationRequired indicates the operation requires a destination // address, and one was not provided. // // +stateify savable type ErrDestinationRequired struct{} func (*ErrDestinationRequired) isError() {} // IgnoreStats implements Error. func (*ErrDestinationRequired) IgnoreStats() bool { return false } func (*ErrDestinationRequired) String() string { return "destination address is required" } // ErrDuplicateAddress indicates the operation encountered a duplicate address. // // +stateify savable type ErrDuplicateAddress struct{} func (*ErrDuplicateAddress) isError() {} // IgnoreStats implements Error. func (*ErrDuplicateAddress) IgnoreStats() bool { return false } func (*ErrDuplicateAddress) String() string { return "duplicate address" } // ErrDuplicateNICID indicates the operation encountered a duplicate NIC ID. // // +stateify savable type ErrDuplicateNICID struct{} func (*ErrDuplicateNICID) isError() {} // IgnoreStats implements Error. func (*ErrDuplicateNICID) IgnoreStats() bool { return false } func (*ErrDuplicateNICID) String() string { return "duplicate nic id" } // ErrInvalidEndpointState indicates the endpoint is in an invalid state. // // +stateify savable type ErrInvalidEndpointState struct{} func (*ErrInvalidEndpointState) isError() {} // IgnoreStats implements Error. func (*ErrInvalidEndpointState) IgnoreStats() bool { return false } func (*ErrInvalidEndpointState) String() string { return "endpoint is in invalid state" } // ErrInvalidOptionValue indicates an invalid option value was provided. // // +stateify savable type ErrInvalidOptionValue struct{} func (*ErrInvalidOptionValue) isError() {} // IgnoreStats implements Error. func (*ErrInvalidOptionValue) IgnoreStats() bool { return false } func (*ErrInvalidOptionValue) String() string { return "invalid option value specified" } // ErrInvalidPortRange indicates an attempt to set an invalid port range. // // +stateify savable type ErrInvalidPortRange struct{} func (*ErrInvalidPortRange) isError() {} // IgnoreStats implements Error. func (*ErrInvalidPortRange) IgnoreStats() bool { return true } func (*ErrInvalidPortRange) String() string { return "invalid port range" } // ErrMalformedHeader indicates the operation encountered a malformed header. // // +stateify savable type ErrMalformedHeader struct{} func (*ErrMalformedHeader) isError() {} // IgnoreStats implements Error. 
func (*ErrMalformedHeader) IgnoreStats() bool { return false } func (*ErrMalformedHeader) String() string { return "header is malformed" } // ErrMessageTooLong indicates the operation encountered a message whose length // exceeds the maximum permitted. // // +stateify savable type ErrMessageTooLong struct{} func (*ErrMessageTooLong) isError() {} // IgnoreStats implements Error. func (*ErrMessageTooLong) IgnoreStats() bool { return false } func (*ErrMessageTooLong) String() string { return "message too long" } // ErrNetworkUnreachable indicates the operation is not able to reach the // destination network. // // +stateify savable type ErrNetworkUnreachable struct{} func (*ErrNetworkUnreachable) isError() {} // IgnoreStats implements Error. func (*ErrNetworkUnreachable) IgnoreStats() bool { return false } func (*ErrNetworkUnreachable) String() string { return "network is unreachable" } // ErrNoBufferSpace indicates no buffer space is available. // // +stateify savable type ErrNoBufferSpace struct{} func (*ErrNoBufferSpace) isError() {} // IgnoreStats implements Error. func (*ErrNoBufferSpace) IgnoreStats() bool { return false } func (*ErrNoBufferSpace) String() string { return "no buffer space available" } // ErrNoPortAvailable indicates no port could be allocated for the operation. // // +stateify savable type ErrNoPortAvailable struct{} func (*ErrNoPortAvailable) isError() {} // IgnoreStats implements Error. func (*ErrNoPortAvailable) IgnoreStats() bool { return false } func (*ErrNoPortAvailable) String() string { return "no ports are available" } // ErrNoRoute indicates the operation is not able to find a route to the // destination. // // +stateify savable type ErrNoRoute struct{} func (*ErrNoRoute) isError() {} // IgnoreStats implements Error. func (*ErrNoRoute) IgnoreStats() bool { return false } func (*ErrNoRoute) String() string { return "no route" } // ErrNoSuchFile is used to indicate that ENOENT should be returned to the // calling application. // // +stateify savable type ErrNoSuchFile struct{} func (*ErrNoSuchFile) isError() {} // IgnoreStats implements Error. func (*ErrNoSuchFile) IgnoreStats() bool { return false } func (*ErrNoSuchFile) String() string { return "no such file" } // ErrNotConnected indicates the endpoint is not connected. // // +stateify savable type ErrNotConnected struct{} func (*ErrNotConnected) isError() {} // IgnoreStats implements Error. func (*ErrNotConnected) IgnoreStats() bool { return false } func (*ErrNotConnected) String() string { return "endpoint not connected" } // ErrNotPermitted indicates the operation is not permitted. // // +stateify savable type ErrNotPermitted struct{} func (*ErrNotPermitted) isError() {} // IgnoreStats implements Error. func (*ErrNotPermitted) IgnoreStats() bool { return false } func (*ErrNotPermitted) String() string { return "operation not permitted" } // ErrNotSupported indicates the operation is not supported. // // +stateify savable type ErrNotSupported struct{} func (*ErrNotSupported) isError() {} // IgnoreStats implements Error. func (*ErrNotSupported) IgnoreStats() bool { return false } func (*ErrNotSupported) String() string { return "operation not supported" } // ErrPortInUse indicates the provided port is in use. // // +stateify savable type ErrPortInUse struct{} func (*ErrPortInUse) isError() {} // IgnoreStats implements Error.
func (*ErrPortInUse) IgnoreStats() bool { return false } func (*ErrPortInUse) String() string { return "port is in use" } // ErrQueueSizeNotSupported indicates the endpoint does not allow queue size // operation. // // +stateify savable type ErrQueueSizeNotSupported struct{} func (*ErrQueueSizeNotSupported) isError() {} // IgnoreStats implements Error. func (*ErrQueueSizeNotSupported) IgnoreStats() bool { return false } func (*ErrQueueSizeNotSupported) String() string { return "queue size querying not supported" } // ErrTimeout indicates the operation timed out. // // +stateify savable type ErrTimeout struct{} func (*ErrTimeout) isError() {} // IgnoreStats implements Error. func (*ErrTimeout) IgnoreStats() bool { return false } func (*ErrTimeout) String() string { return "operation timed out" } // ErrUnknownDevice indicates an unknown device identifier was provided. // // +stateify savable type ErrUnknownDevice struct{} func (*ErrUnknownDevice) isError() {} // IgnoreStats implements Error. func (*ErrUnknownDevice) IgnoreStats() bool { return false } func (*ErrUnknownDevice) String() string { return "unknown device" } // ErrUnknownNICID indicates an unknown NIC ID was provided. // // +stateify savable type ErrUnknownNICID struct{} func (*ErrUnknownNICID) isError() {} // IgnoreStats implements Error. func (*ErrUnknownNICID) IgnoreStats() bool { return false } func (*ErrUnknownNICID) String() string { return "unknown nic id" } // ErrUnknownProtocol indicates an unknown protocol was requested. // // +stateify savable type ErrUnknownProtocol struct{} func (*ErrUnknownProtocol) isError() {} // IgnoreStats implements Error. func (*ErrUnknownProtocol) IgnoreStats() bool { return false } func (*ErrUnknownProtocol) String() string { return "unknown protocol" } // ErrUnknownProtocolOption indicates an unknown protocol option was provided. // // +stateify savable type ErrUnknownProtocolOption struct{} func (*ErrUnknownProtocolOption) isError() {} // IgnoreStats implements Error. func (*ErrUnknownProtocolOption) IgnoreStats() bool { return false } func (*ErrUnknownProtocolOption) String() string { return "unknown option for protocol" } // ErrWouldBlock indicates the operation would block. // // +stateify savable type ErrWouldBlock struct{} func (*ErrWouldBlock) isError() {} // IgnoreStats implements Error. func (*ErrWouldBlock) IgnoreStats() bool { return true } func (*ErrWouldBlock) String() string { return "operation would block" } // LINT.ThenChange(../syserr/netstack.go)
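Because Error deliberately does not satisfy the standard error interface, callers typically branch on the concrete pointer types with a type switch. A short sketch of that pattern follows; handleWriteError is a hypothetical caller, not part of the tcpip package.

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
)

// handleWriteError demonstrates dispatching on concrete tcpip.Error types.
func handleWriteError(err tcpip.Error) {
	switch err.(type) {
	case nil:
		fmt.Println("write succeeded")
	case *tcpip.ErrWouldBlock:
		fmt.Println("would block: register for writable events and retry")
	case *tcpip.ErrConnectionReset:
		fmt.Println("peer reset the connection")
	default:
		fmt.Println("unrecoverable write error:", err)
	}
}

func main() {
	handleWriteError(&tcpip.ErrWouldBlock{})
}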
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sched import "math/bits" const ( bitsPerByte = 8 bytesPerLong = 8 // only for 64-bit architectures ) // CPUSet contains a bitmap to record CPU information. // // Note that this definition is only correct for little-endian architectures, // since Linux's cpumask_t uses unsigned long. type CPUSet []byte // CPUSetSize returns the size in bytes of a CPUSet that can contain num cpus. func CPUSetSize(num uint) uint { // NOTE(b/68859821): Applications may expect that the size of a CPUSet in // bytes is always a multiple of sizeof(unsigned long), since this is true // in Linux. Thus we always round up. bytes := (num + bitsPerByte - 1) / bitsPerByte longs := (bytes + bytesPerLong - 1) / bytesPerLong return longs * bytesPerLong } // NewCPUSet returns a CPUSet for the given number of CPUs which initially // contains no CPUs. func NewCPUSet(num uint) CPUSet { return CPUSet(make([]byte, CPUSetSize(num))) } // NewFullCPUSet returns a CPUSet for the given number of CPUs, all of which // are present in the set. func NewFullCPUSet(num uint) CPUSet { c := NewCPUSet(num) var i uint for ; i < num/bitsPerByte; i++ { c[i] = 0xff } if rem := num % bitsPerByte; rem != 0 { c[i] = (1 << rem) - 1 } return c } // Size returns the size of 'c' in bytes. func (c CPUSet) Size() uint { return uint(len(c)) } // NumCPUs returns how many cpus are set in the CPUSet. func (c CPUSet) NumCPUs() uint { var n int for _, b := range c { n += bits.OnesCount8(b) } return uint(n) } // Copy returns a copy of the CPUSet. func (c CPUSet) Copy() CPUSet { return append(CPUSet(nil), c...) } // Set sets the bit corresponding to cpu. func (c *CPUSet) Set(cpu uint) { (*c)[cpu/bitsPerByte] |= 1 << (cpu % bitsPerByte) } // ClearAbove clears bits corresponding to cpu and all higher cpus. func (c *CPUSet) ClearAbove(cpu uint) { i := cpu / bitsPerByte if i >= c.Size() { return } (*c)[i] &^= 0xff << (cpu % bitsPerByte) for i++; i < c.Size(); i++ { (*c)[i] = 0 } } // ForEachCPU iterates over the CPUSet and calls fn with the cpu index if // it's set. func (c CPUSet) ForEachCPU(fn func(uint)) { for i := uint(0); i < c.Size()*bitsPerByte; i++ { bit := uint(1) << (i & (bitsPerByte - 1)) if uint(c[i/bitsPerByte])&bit == bit { fn(i) } } }
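A short usage sketch of this API, assuming the package lives at gvisor.dev/gvisor/pkg/sentry/kernel/sched; it exercises only the exported functions defined above.

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
)

func main() {
	// A full set for 6 CPUs occupies one unsigned long (8 bytes), per
	// CPUSetSize's rounding.
	c := sched.NewFullCPUSet(6)
	fmt.Println(c.Size(), c.NumCPUs()) // 8 6

	// Drop CPUs 4 and above, then enumerate what remains.
	c.ClearAbove(4)
	c.ForEachCPU(func(cpu uint) {
		fmt.Println("cpu", cpu) // cpu 0 through cpu 3
	})
}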
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tun

import (
	"encoding/binary"

	"gvisor.dev/gvisor/pkg/tcpip"
)

const (
	// PacketInfoHeaderSize is the size of the packet information header.
	PacketInfoHeaderSize = 4

	offsetFlags    = 0
	offsetProtocol = 2
)

// PacketInfoFields contains fields sent through the wire if IFF_NO_PI flag is
// not set.
type PacketInfoFields struct {
	Flags    uint16
	Protocol tcpip.NetworkProtocolNumber
}

// PacketInfoHeader is the wire representation of the packet information sent if
// IFF_NO_PI flag is not set.
type PacketInfoHeader []byte

// Encode encodes f into h.
func (h PacketInfoHeader) Encode(f *PacketInfoFields) {
	binary.BigEndian.PutUint16(h[offsetFlags:][:2], f.Flags)
	binary.BigEndian.PutUint16(h[offsetProtocol:][:2], uint16(f.Protocol))
}

// Flags returns the flag field in h.
func (h PacketInfoHeader) Flags() uint16 {
	return binary.BigEndian.Uint16(h[offsetFlags:])
}

// Protocol returns the protocol field in h.
func (h PacketInfoHeader) Protocol() tcpip.NetworkProtocolNumber {
	return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(h[offsetProtocol:]))
}
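// Example (not part of the original sources): a hedged round-trip through the
// packet information header above, assuming the package lives at
// pkg/tcpip/link/tun. IPv4ProtocolNumber comes from the header package.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
)

func main() {
	buf := make([]byte, tun.PacketInfoHeaderSize)
	h := tun.PacketInfoHeader(buf)
	h.Encode(&tun.PacketInfoFields{
		Flags:    0,
		Protocol: header.IPv4ProtocolNumber,
	})
	// Both fields are big-endian on the wire.
	fmt.Printf("flags=%#04x protocol=%#04x\n", h.Flags(), h.Protocol())
}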
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package refs defines an interface for reference counted objects. It
// also provides a drop-in implementation called AtomicRefCount.
package refs

import (
	"bytes"
	"fmt"
	"reflect"
	"runtime"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sync"
)

// RefCounter is the interface to be implemented by objects that are reference
// counted.
//
// TODO(gvisor.dev/issue/1624): Get rid of most of this package and replace it
// with refsvfs2.
type RefCounter interface {
	// IncRef increments the reference counter on the object.
	IncRef()

	// DecRef decrements the reference counter on the object.
	//
	// Note that AtomicRefCounter.DecRef() does not support destructors.
	// If a type has a destructor, it must implement its own DecRef()
	// method and call AtomicRefCounter.DecRefWithDestructor(destructor).
	DecRef(ctx context.Context)

	// TryIncRef attempts to increase the reference counter on the object,
	// but may fail if all references have already been dropped. This
	// should be used only in special circumstances, such as WeakRefs.
	TryIncRef() bool

	// addWeakRef adds the given weak reference. Note that you should have a
	// reference to the object when calling this method.
	addWeakRef(*WeakRef)

	// dropWeakRef drops the given weak reference. Note that you should have
	// a reference to the object when calling this method.
	dropWeakRef(*WeakRef)
}

// A WeakRefUser is notified when the last non-weak reference is dropped.
type WeakRefUser interface {
	// WeakRefGone is called when the last non-weak reference is dropped.
	WeakRefGone(ctx context.Context)
}

// WeakRef is a weak reference.
//
// +stateify savable
type WeakRef struct {
	weakRefEntry `state:"nosave"`

	// obj is an atomic value that points to the refCounter.
	obj atomic.Value `state:".(savedReference)"`

	// user is notified when the weak ref is zapped by the object getting
	// destroyed.
	user WeakRefUser
}

// weakRefPool is a pool of weak references to avoid allocations on the hot path.
var weakRefPool = sync.Pool{
	New: func() interface{} {
		return &WeakRef{}
	},
}

// NewWeakRef acquires a weak reference for the given object.
//
// An optional user will be notified when the last non-weak reference is
// dropped.
//
// Note that you must hold a reference to the object prior to getting a weak
// reference. (But you may drop the non-weak reference after that.)
func NewWeakRef(rc RefCounter, u WeakRefUser) *WeakRef {
	w := weakRefPool.Get().(*WeakRef)
	w.init(rc, u)
	return w
}

// get attempts to get a normal reference to the underlying object, and returns
// the object. If this weak reference has already been zapped (the object has
// been destroyed) then false is returned. If the object still exists, then
// true is returned.
func (w *WeakRef) get() (RefCounter, bool) {
	rc := w.obj.Load().(RefCounter)
	if v := reflect.ValueOf(rc); v == reflect.Zero(v.Type()) {
		// This pointer has already been zapped by zap() below. We do
		// this to ensure that the GC can collect the underlying
		// RefCounter objects and they don't hog resources.
		return nil, false
	}
	if !rc.TryIncRef() {
		return nil, true
	}
	return rc, true
}

// Get attempts to get a normal reference to the underlying object, and returns
// the object. If this fails (the object no longer exists), then nil will be
// returned instead.
func (w *WeakRef) Get() RefCounter {
	rc, _ := w.get()
	return rc
}

// Drop drops this weak reference. You should always call drop when you are
// finished with the weak reference. You may not use this object after calling
// drop.
func (w *WeakRef) Drop(ctx context.Context) {
	rc, ok := w.get()
	if !ok {
		// We've been zapped already. When the refcounter has called
		// zap, we're guaranteed it's not holding references.
		weakRefPool.Put(w)
		return
	}
	if rc == nil {
		// The object is in the process of being destroyed. We can't
		// remove this from the object's list, nor can we return this
		// object to the pool. It'll just be garbage collected. This is
		// a rare edge case, so it's not a big deal.
		return
	}

	// At this point, we have a reference on the object. So destruction
	// of the object (and zapping this weak reference) can't race here.
	rc.dropWeakRef(w)

	// And now we aren't on the object's list of weak references. So it
	// won't zap us if this causes the reference count to drop to zero.
	rc.DecRef(ctx)

	// Return to the pool.
	weakRefPool.Put(w)
}

// init initializes this weak reference.
func (w *WeakRef) init(rc RefCounter, u WeakRefUser) {
	// Reset the contents of the weak reference.
	// This is important because we are resetting the atomic value type.
	// Otherwise, we could panic here if obj is different from what it was
	// the last time this was used.
	*w = WeakRef{}
	w.user = u
	w.obj.Store(rc)

	// In the load path, we may already have a nil value. So we need to
	// check whether or not that is the case before calling addWeakRef.
	if v := reflect.ValueOf(rc); v != reflect.Zero(v.Type()) {
		rc.addWeakRef(w)
	}
}

// zap zaps this weak reference.
func (w *WeakRef) zap() {
	// We need to be careful about types here.
	// So reflect is involved. But it's not that bad.
	rc := w.obj.Load()
	typ := reflect.TypeOf(rc)
	w.obj.Store(reflect.Zero(typ).Interface())
}

// AtomicRefCount keeps a reference count using atomic operations and calls the
// destructor when the count reaches zero.
//
// Do not use AtomicRefCount for new ref-counted objects! It is deprecated in
// favor of the refsvfs2 package.
//
// N.B. To allow the zero-object to be initialized, the count is offset by
// 1, that is, when refCount is n, there are really n+1 references.
//
// +stateify savable
type AtomicRefCount struct {
	// refCount is composed of two fields:
	//
	//	[32-bit speculative references]:[32-bit real references]
	//
	// Speculative references are used for TryIncRef, to avoid a
	// CompareAndSwap loop. See IncRef, DecRef and TryIncRef for details of
	// how these fields are used.
	refCount int64

	// name is the name of the type which owns this ref count.
	//
	// name is immutable after EnableLeakCheck is called.
	name string

	// stack optionally records the caller of EnableLeakCheck.
	//
	// stack is immutable after EnableLeakCheck is called.
	stack []uintptr

	// mu protects the list below.
	mu sync.Mutex `state:"nosave"`

	// weakRefs is our collection of weak references.
	weakRefs weakRefList `state:"nosave"`
}

// LeakMode configures the leak checker.
type LeakMode uint32

// TODO(gvisor.dev/issue/1624): Simplify down to two modes (on/off) once vfs1
// ref counting is gone.
const (
	// UninitializedLeakChecking indicates that the leak checker has not yet been initialized.
	UninitializedLeakChecking LeakMode = iota

	// NoLeakChecking indicates that no effort should be made to check for
	// leaks.
	NoLeakChecking

	// LeaksLogWarning indicates that a warning should be logged when leaks
	// are found.
	LeaksLogWarning

	// LeaksLogTraces indicates that a trace collected during allocation
	// should be logged when leaks are found.
	LeaksLogTraces
)

// Set implements flag.Value.
func (l *LeakMode) Set(v string) error {
	switch v {
	case "disabled":
		*l = NoLeakChecking
	case "log-names":
		*l = LeaksLogWarning
	case "log-traces":
		*l = LeaksLogTraces
	default:
		return fmt.Errorf("invalid ref leak mode %q", v)
	}
	return nil
}

// Get implements flag.Value.
func (l *LeakMode) Get() interface{} {
	return *l
}

// String implements flag.Value.
func (l LeakMode) String() string {
	switch l {
	case UninitializedLeakChecking:
		return "uninitialized"
	case NoLeakChecking:
		return "disabled"
	case LeaksLogWarning:
		return "log-names"
	case LeaksLogTraces:
		return "log-traces"
	}
	panic(fmt.Sprintf("invalid ref leak mode %d", l))
}

// leakMode stores the current mode for the reference leak checker.
//
// Values must be one of the LeakMode values.
//
// leakMode must be accessed atomically.
var leakMode uint32

// SetLeakMode configures the reference leak checker.
func SetLeakMode(mode LeakMode) { atomic.StoreUint32(&leakMode, uint32(mode)) } // GetLeakMode returns the current leak mode. func GetLeakMode() LeakMode { return LeakMode(atomic.LoadUint32(&leakMode)) } const maxStackFrames = 40 type fileLine struct { file string line int } // A stackKey is a representation of a stack frame for use as a map key. // // The fileLine type is used as PC values seem to vary across collections, even // for the same call stack. type stackKey [maxStackFrames]fileLine var stackCache = struct { sync.Mutex entries map[stackKey][]uintptr }{entries: map[stackKey][]uintptr{}} func makeStackKey(pcs []uintptr) stackKey { frames := runtime.CallersFrames(pcs) var key stackKey keySlice := key[:0] for { frame, more := frames.Next() keySlice = append(keySlice, fileLine{frame.File, frame.Line}) if !more || len(keySlice) == len(key) { break } } return key } // RecordStack constructs and returns the PCs on the current stack. func RecordStack() []uintptr { pcs := make([]uintptr, maxStackFrames) n := runtime.Callers(1, pcs) if n == 0 { // No pcs available. Stop now. // // This can happen if the first argument to runtime.Callers // is large. return nil } pcs = pcs[:n] key := makeStackKey(pcs) stackCache.Lock() v, ok := stackCache.entries[key] if !ok { // Reallocate to prevent pcs from escaping. v = append([]uintptr(nil), pcs...) stackCache.entries[key] = v } stackCache.Unlock() return v } // FormatStack converts the given stack into a readable format. func FormatStack(pcs []uintptr) string { frames := runtime.CallersFrames(pcs) var trace bytes.Buffer for { frame, more := frames.Next() fmt.Fprintf(&trace, "%s:%d: %s\n", frame.File, frame.Line, frame.Function) if !more { break } } return trace.String() } func (r *AtomicRefCount) finalize() { var note string switch LeakMode(atomic.LoadUint32(&leakMode)) { case NoLeakChecking: return case UninitializedLeakChecking: note = "(Leak checker uninitialized): " } if n := r.ReadRefs(); n != 0 { msg := fmt.Sprintf("%sAtomicRefCount %p owned by %q garbage collected with ref count of %d (want 0)", note, r, r.name, n) if len(r.stack) != 0 { msg += ":\nCaller:\n" + FormatStack(r.stack) } else { msg += " (enable trace logging to debug)" } log.Warningf(msg) } } // EnableLeakCheck checks for reference leaks when the AtomicRefCount gets // garbage collected. // // This function adds a finalizer to the AtomicRefCount, so the AtomicRefCount // must be at the beginning of its parent. // // name is a friendly name that will be listed as the owner of the // AtomicRefCount in logs. It should be the name of the parent type, including // package. func (r *AtomicRefCount) EnableLeakCheck(name string) { if name == "" { panic("invalid name") } switch LeakMode(atomic.LoadUint32(&leakMode)) { case NoLeakChecking: return case LeaksLogTraces: r.stack = RecordStack() } r.name = name runtime.SetFinalizer(r, (*AtomicRefCount).finalize) } // ReadRefs returns the current number of references. The returned count is // inherently racy and is unsafe to use without external synchronization. func (r *AtomicRefCount) ReadRefs() int64 { // Account for the internal -1 offset on refcounts. return atomic.LoadInt64(&r.refCount) + 1 } // IncRef increments this object's reference count. While the count is kept // greater than zero, the destructor doesn't get called. // // The sanity check here is limited to real references, since if they have // dropped beneath zero then the object should have been destroyed. 
//
//go:nosplit
func (r *AtomicRefCount) IncRef() {
	if v := atomic.AddInt64(&r.refCount, 1); v <= 0 {
		panic("Incrementing non-positive ref count")
	}
}

// TryIncRef attempts to increment the reference count, *unless the count has
// already reached zero*. If false is returned, then the object has already
// been destroyed, and the weak reference is no longer valid. If true is
// returned, then a valid reference is now held on the object.
//
// To do this safely without a loop, a speculative reference is first acquired
// on the object. This allows multiple concurrent TryIncRef calls to
// distinguish other TryIncRef calls from genuine references held.
//
//go:nosplit
func (r *AtomicRefCount) TryIncRef() bool {
	const speculativeRef = 1 << 32
	v := atomic.AddInt64(&r.refCount, speculativeRef)
	if int32(v) < 0 {
		// This object has already been freed.
		atomic.AddInt64(&r.refCount, -speculativeRef)
		return false
	}

	// Turn into a real reference.
	atomic.AddInt64(&r.refCount, -speculativeRef+1)
	return true
}

// addWeakRef adds the given weak reference.
func (r *AtomicRefCount) addWeakRef(w *WeakRef) {
	r.mu.Lock()
	r.weakRefs.PushBack(w)
	r.mu.Unlock()
}

// dropWeakRef drops the given weak reference.
func (r *AtomicRefCount) dropWeakRef(w *WeakRef) {
	r.mu.Lock()
	r.weakRefs.Remove(w)
	r.mu.Unlock()
}

// DecRefWithDestructor decrements the object's reference count. If the
// resulting count is negative and the destructor is not nil, then the
// destructor will be called.
//
// Note that speculative references are counted here. Since they were added
// prior to real references reaching zero, they will successfully convert to
// real references. In other words, we see speculative references only in the
// following case:
//
//	A: TryIncRef [speculative increase => sees non-negative references]
//	B: DecRef [real decrease]
//	A: TryIncRef [transform speculative to real]
//
//go:nosplit
func (r *AtomicRefCount) DecRefWithDestructor(ctx context.Context, destroy func(context.Context)) {
	switch v := atomic.AddInt64(&r.refCount, -1); {
	case v < -1:
		panic("Decrementing non-positive ref count")

	case v == -1:
		// Zap weak references. Note that at this point, all weak
		// references are already invalid. That is, TryIncRef() will
		// return false due to the reference count check.
		r.mu.Lock()
		for !r.weakRefs.Empty() {
			w := r.weakRefs.Front()
			// Capture the callback because w cannot be touched
			// after it's zapped -- the owner is free to reuse it
			// after that.
			user := w.user
			r.weakRefs.Remove(w)
			w.zap()

			if user != nil {
				r.mu.Unlock()
				user.WeakRefGone(ctx)
				r.mu.Lock()
			}
		}
		r.mu.Unlock()

		// Call the destructor.
		if destroy != nil {
			destroy(ctx)
		}
	}
}

// DecRef decrements this object's reference count.
//
//go:nosplit
func (r *AtomicRefCount) DecRef(ctx context.Context) {
	r.DecRefWithDestructor(ctx, nil)
}

// OnExit is called on sandbox exit. It runs GC to enqueue refcount finalizers,
// which check for reference leaks. There is no way to guarantee that every
// finalizer will run before exiting, but this at least ensures that they will
// be discovered/enqueued by GC.
func OnExit() {
	if LeakMode(atomic.LoadUint32(&leakMode)) != NoLeakChecking {
		runtime.GC()
	}
}
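// Example (not part of the original sources): a hedged sketch of the pattern
// required by the RefCounter comment above -- a type with a destructor embeds
// AtomicRefCount and implements its own DecRef that forwards to
// DecRefWithDestructor. The type and names here are illustrative.
package example

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/refs"
)

type thing struct {
	// Must be the first field so the finalizer installed by
	// EnableLeakCheck covers the start of the parent object.
	refs.AtomicRefCount
}

func newThing() *thing {
	// The zero AtomicRefCount already holds one reference, since the
	// count is offset by 1.
	t := &thing{}
	t.EnableLeakCheck("example.thing")
	return t
}

// DecRef shadows the embedded DecRef so the destructor runs when the last
// reference is dropped.
func (t *thing) DecRef(ctx context.Context) {
	t.DecRefWithDestructor(ctx, t.destroy)
}

func (t *thing) destroy(context.Context) {
	// Release owned resources here.
}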
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import "gvisor.dev/gvisor/pkg/tcpip/seqnum"

// sackRecovery stores the variables related to TCP SACK loss recovery
// algorithm.
//
// +stateify savable
type sackRecovery struct {
	s *sender
}

func newSACKRecovery(s *sender) *sackRecovery {
	return &sackRecovery{s: s}
}

// handleSACKRecovery implements the loss recovery phase as described in RFC6675
// section 5, step C.
func (sr *sackRecovery) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) {
	snd := sr.s
	snd.SetPipe()

	if smss := int(snd.ep.scoreboard.SMSS()); limit > smss {
		// Cap segment size limit to s.smss as SACK recovery requires
		// that all retransmissions or new segments sent during recovery
		// be of size <= SMSS.
		limit = smss
	}

	nextSegHint := snd.writeList.Front()
	for snd.Outstanding < snd.SndCwnd {
		var nextSeg *segment
		var rescueRtx bool
		nextSeg, nextSegHint, rescueRtx = snd.NextSeg(nextSegHint)
		if nextSeg == nil {
			return dataSent
		}
		if !snd.isAssignedSequenceNumber(nextSeg) || snd.SndNxt.LessThanEq(nextSeg.sequenceNumber) {
			// New data being sent.
			//
			// Step C.3 described below is handled by
			// maybeSendSegment which increments sndNxt when
			// a segment is transmitted.
			//
			// Step C.3 "If any of the data octets sent in
			// (C.1) are above HighData, HighData must be
			// updated to reflect the transmission of
			// previously unsent data."
			//
			// We pass s.smss as the limit as Step 2 requires that
			// new data sent should be of size s.smss or less.
			if sent := snd.maybeSendSegment(nextSeg, limit, end); !sent {
				return dataSent
			}
			dataSent = true
			snd.Outstanding++
			snd.writeNext = nextSeg.Next()
			continue
		}

		// Now handle the retransmission case where we matched either step
		// 1, 3 or 4 of the NextSeg algorithm.
		// RFC 6675, Step C.4.
		//
		// "The estimate of the amount of data outstanding in the network
		// must be updated by incrementing pipe by the number of octets
		// transmitted in (C.1)."
		snd.Outstanding++
		dataSent = true
		snd.sendSegment(nextSeg)

		segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen())
		if rescueRtx {
			// We do the last part of rule (4) of NextSeg here to update
			// RescueRxt as until this point we don't know if we are going
			// to use the rescue transmission.
			snd.FastRecovery.RescueRxt = snd.FastRecovery.Last
		} else {
			// RFC 6675, Step C.2
			//
			// "If any of the data octets sent in (C.1) are below
			// HighData, HighRxt MUST be set to the highest sequence
			// number of the retransmitted segment unless NextSeg ()
			// rule (4) was invoked for this retransmission."
snd.FastRecovery.HighRxt = segEnd - 1 } } return dataSent } func (sr *sackRecovery) DoRecovery(rcvdSeg *segment, fastRetransmit bool) { snd := sr.s if fastRetransmit { snd.resendSegment() } // We are in fast recovery mode. Ignore the ack if it's out of range. if ack := rcvdSeg.ackNumber; !ack.InRange(snd.SndUna, snd.SndNxt+1) { return } // RFC 6675 recovery algorithm step C 1-5. end := snd.SndUna.Add(snd.SndWnd) dataSent := sr.handleSACKRecovery(snd.MaxPayloadSize, end) snd.postXmit(dataSent, true /* shouldScheduleProbe */) }
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package timerfd implements timer fds.
package timerfd

import (
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// TimerFileDescription implements vfs.FileDescriptionImpl for timer fds. It also
// implements ktime.TimerListener.
//
// +stateify savable
type TimerFileDescription struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.NoLockFD

	events waiter.Queue
	timer  *ktime.Timer

	// val is the number of timer expirations since the last successful
	// call to PRead, or SetTime. val must be accessed using atomic memory
	// operations.
	val uint64
}

var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil)
var _ ktime.TimerListener = (*TimerFileDescription)(nil)

// New returns a new timer fd.
func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) {
	vd := vfsObj.NewAnonVirtualDentry("[timerfd]")
	defer vd.DecRef(ctx)
	tfd := &TimerFileDescription{}
	tfd.timer = ktime.NewTimer(clock, tfd)
	if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{
		UseDentryMetadata: true,
		DenyPRead:         true,
		DenyPWrite:        true,
	}); err != nil {
		return nil, err
	}
	return &tfd.vfsfd, nil
}

// Read implements vfs.FileDescriptionImpl.Read.
func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	const sizeofUint64 = 8
	if dst.NumBytes() < sizeofUint64 {
		return 0, linuxerr.EINVAL
	}
	if val := atomic.SwapUint64(&tfd.val, 0); val != 0 {
		var buf [sizeofUint64]byte
		hostarch.ByteOrder.PutUint64(buf[:], val)
		if _, err := dst.CopyOut(ctx, buf[:]); err != nil {
			// Linux does not undo consuming the number of
			// expirations even if writing to userspace fails.
			return 0, err
		}
		return sizeofUint64, nil
	}
	return 0, syserror.ErrWouldBlock
}

// Clock returns the timer fd's Clock.
func (tfd *TimerFileDescription) Clock() ktime.Clock {
	return tfd.timer.Clock()
}

// GetTime returns the associated Timer's setting and the time at which it was
// observed.
func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) { return tfd.timer.Get() } // SetTime atomically changes the associated Timer's setting, resets the number // of expirations to 0, and returns the previous setting and the time at which // it was observed. func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) { return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) }) } // Readiness implements waiter.Waitable.Readiness. func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { var ready waiter.EventMask if atomic.LoadUint64(&tfd.val) != 0 { ready |= waiter.ReadableEvents } return ready } // EventRegister implements waiter.Waitable.EventRegister. func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { tfd.events.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) { tfd.events.EventUnregister(e) } // PauseTimer pauses the associated Timer. func (tfd *TimerFileDescription) PauseTimer() { tfd.timer.Pause() } // ResumeTimer resumes the associated Timer. func (tfd *TimerFileDescription) ResumeTimer() { tfd.timer.Resume() } // Release implements vfs.FileDescriptionImpl.Release. func (tfd *TimerFileDescription) Release(context.Context) { tfd.timer.Destroy() } // Notify implements ktime.TimerListener.Notify. func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { atomic.AddUint64(&tfd.val, exp) tfd.events.Notify(waiter.ReadableEvents) return ktime.Setting{}, false } // Destroy implements ktime.TimerListener.Destroy. func (tfd *TimerFileDescription) Destroy() {}
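// Example (not part of the original sources): Read above copies the
// expiration count to userspace as a single native-endian uint64, matching
// Linux's timerfd read(2) contract. A hedged sketch of decoding such a
// buffer, assuming a little-endian host (what hostarch.ByteOrder is on
// x86-64 and ARM64):
package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// Simulate an 8-byte read that reported 3 expirations.
	var buf [8]byte
	binary.LittleEndian.PutUint64(buf[:], 3)

	expirations := binary.LittleEndian.Uint64(buf[:])
	fmt.Println(expirations) // 3
}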
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package header

import (
	"encoding/binary"
	"fmt"
	"time"
)

var _ fmt.Stringer = NDPRoutePreference(0)

// NDPRoutePreference is a preference value for default routers or
// more-specific routes.
//
// As per RFC 4191 section 2.1,
//
//	Default router preferences and preferences for more-specific routes
//	are encoded the same way.
//
//	Preference values are encoded as a two-bit signed integer, as
//	follows:
//
//		01      High
//		00      Medium (default)
//		11      Low
//		10      Reserved - MUST NOT be sent
//
//	Note that implementations can treat the value as a two-bit signed
//	integer.
//
//	Having just three values reinforces that they are not metrics and
//	more values do not appear to be necessary for reasonable scenarios.
type NDPRoutePreference uint8

const (
	// HighRoutePreference indicates a high preference, as per
	// RFC 4191 section 2.1.
	HighRoutePreference NDPRoutePreference = 0b01

	// MediumRoutePreference indicates a medium preference, as per
	// RFC 4191 section 2.1.
	//
	// This is the default preference value.
	MediumRoutePreference = 0b00

	// LowRoutePreference indicates a low preference, as per
	// RFC 4191 section 2.1.
	LowRoutePreference = 0b11

	// ReservedRoutePreference is a reserved preference value, as per
	// RFC 4191 section 2.1.
	//
	// It MUST NOT be sent.
	ReservedRoutePreference = 0b10
)

// String implements fmt.Stringer.
func (p NDPRoutePreference) String() string {
	switch p {
	case HighRoutePreference:
		return "HighRoutePreference"
	case MediumRoutePreference:
		return "MediumRoutePreference"
	case LowRoutePreference:
		return "LowRoutePreference"
	case ReservedRoutePreference:
		return "ReservedRoutePreference"
	default:
		return fmt.Sprintf("NDPRoutePreference(%d)", p)
	}
}

// NDPRouterAdvert is an NDP Router Advertisement message. It will only contain
// the body of an ICMPv6 packet.
//
// See RFC 4861 section 4.2 and RFC 4191 section 2.2 for more details.
type NDPRouterAdvert []byte // As per RFC 4191 section 2.2, // // 0 1 2 3 // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Type | Code | Checksum | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Cur Hop Limit |M|O|H|Prf|Resvd| Router Lifetime | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Reachable Time | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Retrans Timer | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Options ... // +-+-+-+-+-+-+-+-+-+-+-+- const ( // NDPRAMinimumSize is the minimum size of a valid NDP Router // Advertisement message (body of an ICMPv6 packet). NDPRAMinimumSize = 12 // ndpRACurrHopLimitOffset is the byte of the Curr Hop Limit field // within an NDPRouterAdvert. ndpRACurrHopLimitOffset = 0 // ndpRAFlagsOffset is the byte with the NDP RA bit-fields/flags // within an NDPRouterAdvert. ndpRAFlagsOffset = 1 // ndpRAManagedAddrConfFlagMask is the mask of the Managed Address // Configuration flag within the bit-field/flags byte of an // NDPRouterAdvert. ndpRAManagedAddrConfFlagMask = (1 << 7) // ndpRAOtherConfFlagMask is the mask of the Other Configuration flag // within the bit-field/flags byte of an NDPRouterAdvert. ndpRAOtherConfFlagMask = (1 << 6) // ndpDefaultRouterPreferenceShift is the shift of the Prf (Default Router // Preference) field within the flags byte of an NDPRouterAdvert. ndpDefaultRouterPreferenceShift = 3 // ndpDefaultRouterPreferenceMask is the mask of the Prf (Default Router // Preference) field within the flags byte of an NDPRouterAdvert. ndpDefaultRouterPreferenceMask = (0b11 << ndpDefaultRouterPreferenceShift) // ndpRARouterLifetimeOffset is the start of the 2-byte Router Lifetime // field within an NDPRouterAdvert. ndpRARouterLifetimeOffset = 2 // ndpRAReachableTimeOffset is the start of the 4-byte Reachable Time // field within an NDPRouterAdvert. ndpRAReachableTimeOffset = 4 // ndpRARetransTimerOffset is the start of the 4-byte Retrans Timer // field within an NDPRouterAdvert. ndpRARetransTimerOffset = 8 // ndpRAOptionsOffset is the start of the NDP options in an // NDPRouterAdvert. ndpRAOptionsOffset = 12 ) // CurrHopLimit returns the value of the Curr Hop Limit field. func (b NDPRouterAdvert) CurrHopLimit() uint8 { return b[ndpRACurrHopLimitOffset] } // ManagedAddrConfFlag returns the value of the Managed Address Configuration // flag. func (b NDPRouterAdvert) ManagedAddrConfFlag() bool { return b[ndpRAFlagsOffset]&ndpRAManagedAddrConfFlagMask != 0 } // OtherConfFlag returns the value of the Other Configuration flag. func (b NDPRouterAdvert) OtherConfFlag() bool { return b[ndpRAFlagsOffset]&ndpRAOtherConfFlagMask != 0 } // DefaultRouterPreference returns the Default Router Preference field. func (b NDPRouterAdvert) DefaultRouterPreference() NDPRoutePreference { return NDPRoutePreference((b[ndpRAFlagsOffset] & ndpDefaultRouterPreferenceMask) >> ndpDefaultRouterPreferenceShift) } // RouterLifetime returns the lifetime associated with the default router. A // value of 0 means the source of the Router Advertisement is not a default // router and SHOULD NOT appear on the default router list. Note, a value of 0 // only means that the router should not be used as a default router, it does // not apply to other information contained in the Router Advertisement. 
func (b NDPRouterAdvert) RouterLifetime() time.Duration {
	// The field is the time in seconds, as per RFC 4861 section 4.2.
	return time.Second * time.Duration(binary.BigEndian.Uint16(b[ndpRARouterLifetimeOffset:]))
}

// ReachableTime returns the time that a node assumes a neighbor is reachable
// after having received a reachability confirmation. A value of 0 means
// that it is unspecified by the source of the Router Advertisement message.
func (b NDPRouterAdvert) ReachableTime() time.Duration {
	// The field is the time in milliseconds, as per RFC 4861 section 4.2.
	return time.Millisecond * time.Duration(binary.BigEndian.Uint32(b[ndpRAReachableTimeOffset:]))
}

// RetransTimer returns the time between retransmitted Neighbor Solicitation
// messages. A value of 0 means that it is unspecified by the source of the
// Router Advertisement message.
func (b NDPRouterAdvert) RetransTimer() time.Duration {
	// The field is the time in milliseconds, as per RFC 4861 section 4.2.
	return time.Millisecond * time.Duration(binary.BigEndian.Uint32(b[ndpRARetransTimerOffset:]))
}

// Options returns an NDPOptions of the options body.
func (b NDPRouterAdvert) Options() NDPOptions {
	return NDPOptions(b[ndpRAOptionsOffset:])
}
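// Example (not part of the original sources): a hedged sketch parsing a
// minimal 12-byte Router Advertisement body with the accessors above. The
// field values are fabricated for illustration.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip/header"
)

func main() {
	ra := header.NDPRouterAdvert([]byte{
		64,          // Cur Hop Limit
		0b1100_1000, // M=1, O=1, H=0, Prf=01 (High), Resvd=0
		0x07, 0x08,  // Router Lifetime = 1800s
		0, 0, 0, 0,  // Reachable Time (unspecified)
		0, 0, 0, 0,  // Retrans Timer (unspecified)
	})
	fmt.Println(ra.CurrHopLimit())            // 64
	fmt.Println(ra.ManagedAddrConfFlag())     // true
	fmt.Println(ra.OtherConfFlag())           // true
	fmt.Println(ra.DefaultRouterPreference()) // HighRoutePreference
	fmt.Println(ra.RouterLifetime())          // 30m0s
}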
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"bytes"
	"fmt"

	"gvisor.dev/gvisor/pkg/context"
)

const (
	// If checkInvariants is true, perform runtime checks for invariants
	// expected by the mm package. This is normally disabled since MM is a
	// significant hot path in general, and some such checks (notably
	// memmap.CheckTranslateResult) are very expensive.
	checkInvariants = false

	// If logIOErrors is true, log I/O errors that originate from MM before
	// converting them to EFAULT.
	logIOErrors = false
)

// String implements fmt.Stringer.String.
func (mm *MemoryManager) String() string {
	return mm.DebugString(context.Background())
}

// DebugString returns a string containing information about mm for debugging.
func (mm *MemoryManager) DebugString(ctx context.Context) string {
	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	mm.activeMu.RLock()
	defer mm.activeMu.RUnlock()
	return mm.debugStringLocked(ctx)
}

// Preconditions: mm.mappingMu and mm.activeMu must be locked.
func (mm *MemoryManager) debugStringLocked(ctx context.Context) string {
	var b bytes.Buffer
	b.WriteString("VMAs:\n")
	for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() {
		b.Write(mm.vmaMapsEntryLocked(ctx, vseg))
	}
	b.WriteString("PMAs:\n")
	for pseg := mm.pmas.FirstSegment(); pseg.Ok(); pseg = pseg.NextSegment() {
		b.Write(pseg.debugStringEntryLocked())
	}
	return string(b.Bytes())
}

// Preconditions: mm.activeMu must be locked.
func (pseg pmaIterator) debugStringEntryLocked() []byte {
	var b bytes.Buffer
	fmt.Fprintf(&b, "%08x-%08x ", pseg.Start(), pseg.End())
	pma := pseg.ValuePtr()
	if pma.effectivePerms.Read {
		b.WriteByte('r')
	} else {
		b.WriteByte('-')
	}
	if pma.effectivePerms.Write {
		if pma.needCOW {
			b.WriteByte('c')
		} else {
			b.WriteByte('w')
		}
	} else {
		b.WriteByte('-')
	}
	if pma.effectivePerms.Execute {
		b.WriteByte('x')
	} else {
		b.WriteByte('-')
	}
	if pma.private {
		b.WriteByte('p')
	} else {
		b.WriteByte('s')
	}
	fmt.Fprintf(&b, " %08x %T\n", pma.off, pma.file)
	return b.Bytes()
}
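// Example (not part of the original sources): the entry format above
// deliberately resembles Linux's /proc/[pid]/maps. A hypothetical PMA for a
// readable, writable copy-on-write, private mapping backed by a
// pgalloc.MemoryFile would render as:
//
//	00400000-00401000 rc-p 00000000 *pgalloc.MemoryFile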
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"math"
	"time"

	"gvisor.dev/gvisor/pkg/sleep"
	"gvisor.dev/gvisor/pkg/tcpip"
)

type timerState int

const (
	// The timer is disabled.
	timerStateDisabled timerState = iota
	// The timer is enabled, but the clock timer may be set to an earlier
	// expiration time due to a previous orphaned state.
	timerStateEnabled
	// The timer is disabled, but the clock timer is enabled, which means that
	// it will cause a spurious wakeup unless the timer is enabled before the
	// clock timer fires.
	timerStateOrphaned
)

// timer is a timer implementation that reduces the interactions with the
// clock timer infrastructure by letting timers run (and potentially
// eventually expire) even if they are stopped. It makes it cheaper to
// disable/reenable timers at the expense of spurious wakes. This is useful for
// cases when the same timer is disabled/reenabled repeatedly with relatively
// long timeouts farther into the future.
//
// TCP retransmit timers benefit from this because the timeouts are long
// (currently at least 200ms), get disabled when acks are received, and are
// reenabled when new pending segments are sent.
//
// It is advantageous to avoid interacting with the clock because it acquires
// a global mutex and performs O(log n) operations, where n is the global number
// of timers, whenever a timer is enabled or disabled, and may make a syscall.
//
// This struct is thread-compatible.
type timer struct {
	state timerState

	clock tcpip.Clock

	// target is the expiration time of the current timer. It is only
	// meaningful in the enabled state.
	target tcpip.MonotonicTime

	// clockTarget is the expiration time of the clock timer. It is
	// meaningful in the enabled and orphaned states.
	clockTarget tcpip.MonotonicTime

	// timer is the clock timer used to wait on.
	timer tcpip.Timer
}

// init initializes the timer. Once it expires, the given waker will be
// asserted.
func (t *timer) init(clock tcpip.Clock, w *sleep.Waker) {
	t.state = timerStateDisabled
	t.clock = clock

	// Initialize a clock timer that will assert the waker, then
	// immediately stop it.
	t.timer = t.clock.AfterFunc(math.MaxInt64, func() {
		w.Assert()
	})
	t.timer.Stop()
}

// cleanup frees all resources associated with the timer.
func (t *timer) cleanup() {
	if t.timer == nil {
		// No cleanup needed.
		return
	}
	t.timer.Stop()
	*t = timer{}
}

// checkExpiration checks if the given timer has actually expired. It should be
// called whenever a sleeper wakes up due to the waker being asserted, and is
// used to check if it's a spurious wake (due to a previously orphaned timer)
// or a legitimate one.
func (t *timer) checkExpiration() bool {
	// Transition to fully disabled state if we're just consuming an
	// orphaned timer.
	if t.state == timerStateOrphaned {
		t.state = timerStateDisabled
		return false
	}

	// The timer is enabled, but it may have expired early. Check if that's
	// the case, and if so, reset the runtime timer to the correct time.
	now := t.clock.NowMonotonic()
	if now.Before(t.target) {
		t.clockTarget = t.target
		t.timer.Reset(t.target.Sub(now))
		return false
	}

	// The timer has actually expired, disable it for now and inform the
	// caller.
	t.state = timerStateDisabled
	return true
}

// disable disables the timer, leaving it in an orphaned state if it wasn't
// already disabled.
func (t *timer) disable() {
	if t.state != timerStateDisabled {
		t.state = timerStateOrphaned
	}
}

// enabled returns true if the timer is currently enabled, false otherwise.
func (t *timer) enabled() bool {
	return t.state == timerStateEnabled
}

// enable enables the timer, programming the runtime timer if necessary.
func (t *timer) enable(d time.Duration) {
	t.target = t.clock.NowMonotonic().Add(d)

	// Check if we need to set the runtime timer.
	if t.state == timerStateDisabled || t.target.Before(t.clockTarget) {
		t.clockTarget = t.target
		t.timer.Reset(d)
	}

	t.state = timerStateEnabled
}
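// Example (not part of the original sources): a hedged sketch of the
// lifecycle described above, written as if inside package tcp since timer is
// unexported. Only timer's own methods, sleep.Waker, and tcpip.Clock are
// taken from the surrounding code; the flow itself is illustrative.
func exampleTimerLifecycle(clock tcpip.Clock) {
	var w sleep.Waker
	var t timer
	t.init(clock, &w)
	defer t.cleanup()

	// Arm the retransmit timer.
	t.enable(200 * time.Millisecond)

	// An ack arrives: disabling is cheap because it merely orphans the
	// clock timer instead of stopping it.
	t.disable()

	// New data is sent: re-enabling may reuse the already-armed clock
	// timer if it fires at or before the new target.
	t.enable(200 * time.Millisecond)

	// Later, when the waker is asserted, distinguish a real expiration
	// from a spurious wake caused by a previously orphaned timer.
	if t.checkExpiration() {
		// Handle the retransmit timeout and re-arm as needed.
	}
}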
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"fmt"
)

// Options for waitpid(2), wait4(2), and/or waitid(2), from
// include/uapi/linux/wait.h.
const (
	WNOHANG    = 0x00000001
	WUNTRACED  = 0x00000002
	WSTOPPED   = WUNTRACED
	WEXITED    = 0x00000004
	WCONTINUED = 0x00000008
	WNOWAIT    = 0x01000000
	WNOTHREAD  = 0x20000000
	WALL       = 0x40000000
	WCLONE     = 0x80000000
)

// ID types for waitid(2), from include/uapi/linux/wait.h.
const (
	P_ALL  = 0x0
	P_PID  = 0x1
	P_PGID = 0x2
)

// WaitStatus represents a thread status, as returned by the wait* family of
// syscalls.
type WaitStatus uint32

// WaitStatusExit returns a WaitStatus representing the given exit status.
func WaitStatusExit(status int32) WaitStatus {
	return WaitStatus(uint32(status) << 8)
}

// WaitStatusTerminationSignal returns a WaitStatus representing termination by
// the given signal.
func WaitStatusTerminationSignal(sig Signal) WaitStatus {
	return WaitStatus(uint32(sig))
}

// WaitStatusStopped returns a WaitStatus representing stoppage by the given
// signal or ptrace trap code.
func WaitStatusStopped(code uint32) WaitStatus {
	return WaitStatus(code<<8 | 0x7f)
}

// WaitStatusContinued returns a WaitStatus representing continuation by
// SIGCONT.
func WaitStatusContinued() WaitStatus {
	return WaitStatus(0xffff)
}

// WithCoreDump returns a copy of ws that indicates that a core dump was
// generated.
//
// Preconditions: ws.Signaled().
func (ws WaitStatus) WithCoreDump() WaitStatus {
	return ws | 0x80
}

// Exited returns true if ws represents an exit status, consistent with
// WIFEXITED.
func (ws WaitStatus) Exited() bool {
	return ws&0x7f == 0
}

// Signaled returns true if ws represents a termination by signal, consistent
// with WIFSIGNALED.
func (ws WaitStatus) Signaled() bool {
	// ws&0x7f != 0 (exited) and ws&0x7f != 0x7f (stopped or continued)
	return ((ws&0x7f)+1)>>1 != 0
}

// CoreDumped returns true if ws indicates that a core dump was produced,
// consistent with WCOREDUMP.
//
// Preconditions: ws.Signaled().
func (ws WaitStatus) CoreDumped() bool {
	return ws&0x80 != 0
}

// Stopped returns true if ws represents a stoppage, consistent with
// WIFSTOPPED.
func (ws WaitStatus) Stopped() bool {
	return ws&0xff == 0x7f
}

// Continued returns true if ws represents a continuation by SIGCONT,
// consistent with WIFCONTINUED.
func (ws WaitStatus) Continued() bool { return ws == 0xffff } // ExitStatus returns the lower 8 bits of the exit status represented by ws, // consistent with WEXITSTATUS. // // Preconditions: ws.Exited(). func (ws WaitStatus) ExitStatus() uint32 { return uint32((ws & 0xff00) >> 8) } // TerminationSignal returns the termination signal represented by ws, // consistent with WTERMSIG. // // Preconditions: ws.Signaled(). func (ws WaitStatus) TerminationSignal() Signal { return Signal(ws & 0x7f) } // StopSignal returns the stop signal represented by ws, consistent with // WSTOPSIG. // // Preconditions: ws.Stopped(). func (ws WaitStatus) StopSignal() Signal { return Signal((ws & 0xff00) >> 8) } // PtraceEvent returns the PTRACE_EVENT_* field in ws. // // Preconditions: ws.Stopped(). func (ws WaitStatus) PtraceEvent() uint32 { return uint32(ws >> 16) } // String implements fmt.Stringer.String. func (ws WaitStatus) String() string { switch { case ws.Exited(): return fmt.Sprintf("exit status %d", ws.ExitStatus()) case ws.Signaled(): if ws.CoreDumped() { return fmt.Sprintf("killed by signal %d (core dumped)", ws.TerminationSignal()) } return fmt.Sprintf("killed by signal %d", ws.TerminationSignal()) case ws.Stopped(): if ev := ws.PtraceEvent(); ev != 0 { return fmt.Sprintf("stopped by signal %d (PTRACE_EVENT %d)", ws.StopSignal(), ev) } return fmt.Sprintf("stopped by signal %d", ws.StopSignal()) case ws.Continued(): return "continued" default: return fmt.Sprintf("unknown status %#x", uint32(ws)) } }
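// Example (not part of the original sources): a short sketch exercising the
// WaitStatus encodings above; the expected results follow each call.
func exampleWaitStatus() {
	ws := WaitStatusExit(1)
	_ = ws.Exited()     // true
	_ = ws.ExitStatus() // 1
	_ = ws.String()     // "exit status 1"

	ws = WaitStatusTerminationSignal(Signal(9)).WithCoreDump()
	_ = ws.Signaled()   // true
	_ = ws.CoreDumped() // true
	_ = ws.String()     // "killed by signal 9 (core dumped)"
}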
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ipv6

import (
	"fmt"
	"time"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/network/internal/ip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

const (
	// defaultMaxRtrSolicitations is the default number of Router
	// Solicitation messages to send when an IPv6 endpoint becomes enabled.
	//
	// Default = 3 (from RFC 4861 section 10).
	defaultMaxRtrSolicitations = 3

	// defaultRtrSolicitationInterval is the default amount of time between
	// sending Router Solicitation messages.
	//
	// Default = 4s (from RFC 4861 section 10).
	defaultRtrSolicitationInterval = 4 * time.Second

	// defaultMaxRtrSolicitationDelay is the default maximum amount of time
	// to wait before sending the first Router Solicitation message.
	//
	// Default = 1s (from RFC 4861 section 10).
	defaultMaxRtrSolicitationDelay = time.Second

	// defaultHandleRAs is the default configuration for whether or not to
	// handle incoming Router Advertisements as a host.
	defaultHandleRAs = HandlingRAsEnabledWhenForwardingDisabled

	// defaultDiscoverDefaultRouters is the default configuration for
	// whether or not to discover default routers from incoming Router
	// Advertisements, as a host.
defaultDiscoverDefaultRouters = true // defaultDiscoverMoreSpecificRoutes is the default configuration for // whether or not to discover more-specific routes from incoming Router // Advertisements, as a host. defaultDiscoverMoreSpecificRoutes = true // defaultDiscoverOnLinkPrefixes is the default configuration for // whether or not to discover on-link prefixes from incoming Router // Advertisements' Prefix Information option, as a host. defaultDiscoverOnLinkPrefixes = true // defaultAutoGenGlobalAddresses is the default configuration for // whether or not to generate global IPv6 addresses in response to // receiving a new Prefix Information option with its Autonomous // Address AutoConfiguration flag set, as a host. // // Default = true. defaultAutoGenGlobalAddresses = true // minimumRtrSolicitationInterval is the minimum amount of time to wait // between sending Router Solicitation messages. This limit is imposed // to make sure that Router Solicitation messages are not sent all at // once, defeating the purpose of sending the initial few messages. minimumRtrSolicitationInterval = 500 * time.Millisecond // minimumMaxRtrSolicitationDelay is the minimum amount of time to wait // before sending the first Router Solicitation message. It is 0 because // we cannot have a negative delay. minimumMaxRtrSolicitationDelay = 0 // MaxDiscoveredOffLinkRoutes is the maximum number of discovered off-link // routes. The stack should stop discovering new off-link routes after // this limit is reached. // // This value MUST be at minimum 2 as per RFC 4861 section 6.3.4, and // SHOULD be more. MaxDiscoveredOffLinkRoutes = 10 // MaxDiscoveredOnLinkPrefixes is the maximum number of discovered // on-link prefixes. The stack should stop discovering new on-link // prefixes after discovering MaxDiscoveredOnLinkPrefixes on-link // prefixes. MaxDiscoveredOnLinkPrefixes = 10 // validPrefixLenForAutoGen is the expected prefix length that an // address can be generated for. Must be 64 bits as the interface // identifier (IID) is 64 bits and an IPv6 address is 128 bits, so // 128 - 64 = 64. validPrefixLenForAutoGen = 64 // defaultAutoGenTempGlobalAddresses is the default configuration for whether // or not to generate temporary SLAAC addresses. defaultAutoGenTempGlobalAddresses = true // defaultMaxTempAddrValidLifetime is the default maximum valid lifetime // for temporary SLAAC addresses generated as part of RFC 4941. // // Default = 7 days (from RFC 4941 section 5). defaultMaxTempAddrValidLifetime = 7 * 24 * time.Hour // defaultMaxTempAddrPreferredLifetime is the default preferred lifetime // for temporary SLAAC addresses generated as part of RFC 4941. // // Default = 1 day (from RFC 4941 section 5). defaultMaxTempAddrPreferredLifetime = 24 * time.Hour // defaultRegenAdvanceDuration is the default duration before the deprecation // of a temporary address when a new address will be generated. // // Default = 5s (from RFC 4941 section 5). defaultRegenAdvanceDuration = 5 * time.Second // minRegenAdvanceDuration is the minimum duration before the deprecation // of a temporary address when a new address will be generated. minRegenAdvanceDuration = time.Duration(0) // maxSLAACAddrLocalRegenAttempts is the maximum number of times to attempt // SLAAC address regenerations in response to an IPv6 endpoint-local conflict. maxSLAACAddrLocalRegenAttempts = 10 // MinPrefixInformationValidLifetimeForUpdate is the minimum Valid // Lifetime to update the valid lifetime of a generated address by // SLAAC. // // Min = 2hrs. 
MinPrefixInformationValidLifetimeForUpdate = 2 * time.Hour // MaxDesyncFactor is the upper bound for the preferred lifetime's desync // factor for temporary SLAAC addresses. // // Must be greater than 0. // // Max = 10m (from RFC 4941 section 5). MaxDesyncFactor = 10 * time.Minute // MinMaxTempAddrPreferredLifetime is the minimum value allowed for the // maximum preferred lifetime for temporary SLAAC addresses. // // This value guarantees that a temporary address is preferred for at // least 1hr if the SLAAC prefix is valid for at least that time. MinMaxTempAddrPreferredLifetime = defaultRegenAdvanceDuration + MaxDesyncFactor + time.Hour // MinMaxTempAddrValidLifetime is the minimum value allowed for the // maximum valid lifetime for temporary SLAAC addresses. // // This value guarantees that a temporary address is valid for at least // 2hrs if the SLAAC prefix is valid for at least that time. MinMaxTempAddrValidLifetime = 2 * time.Hour ) // NDPEndpoint is an endpoint that supports NDP. type NDPEndpoint interface { // SetNDPConfigurations sets the NDP configurations. SetNDPConfigurations(NDPConfigurations) } // DHCPv6ConfigurationFromNDPRA is a configuration available via DHCPv6 that an // NDP Router Advertisement informed the Stack about. type DHCPv6ConfigurationFromNDPRA int const ( _ DHCPv6ConfigurationFromNDPRA = iota // DHCPv6NoConfiguration indicates that no configurations are available via // DHCPv6. DHCPv6NoConfiguration // DHCPv6ManagedAddress indicates that addresses are available via DHCPv6. // // DHCPv6ManagedAddress also implies DHCPv6OtherConfigurations because DHCPv6 // returns all available configuration information when serving addresses. DHCPv6ManagedAddress // DHCPv6OtherConfigurations indicates that other configuration information is // available via DHCPv6. // // Other configurations are configurations other than addresses. Examples of // other configurations are recursive DNS server list, DNS search lists and // default gateway. DHCPv6OtherConfigurations ) // NDPDispatcher is the interface integrators of netstack must implement to // receive and handle NDP related events. type NDPDispatcher interface { // OnDuplicateAddressDetectionResult is called when the DAD process for an // address on a NIC completes. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. OnDuplicateAddressDetectionResult(tcpip.NICID, tcpip.Address, stack.DADResult) // OnOffLinkRouteUpdated is called when an off-link route is updated. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. OnOffLinkRouteUpdated(tcpip.NICID, tcpip.Subnet, tcpip.Address, header.NDPRoutePreference) // OnOffLinkRouteInvalidated is called when an off-link route is invalidated. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. OnOffLinkRouteInvalidated(tcpip.NICID, tcpip.Subnet, tcpip.Address) // OnOnLinkPrefixDiscovered is called when a new on-link prefix is discovered. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. OnOnLinkPrefixDiscovered(tcpip.NICID, tcpip.Subnet) // OnOnLinkPrefixInvalidated is called when a discovered on-link prefix that // was remembered is invalidated. // // This function is not permitted to block indefinitely. This function // is also not permitted to call into the stack. 
	OnOnLinkPrefixInvalidated(tcpip.NICID, tcpip.Subnet)

	// OnAutoGenAddress is called when a new prefix with its autonomous address-
	// configuration flag set is received and SLAAC was performed.
	//
	// This function is not permitted to block indefinitely. It must not
	// call functions on the stack itself.
	OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix)

	// OnAutoGenAddressDeprecated is called when an auto-generated address (SLAAC)
	// is deprecated, but is still considered valid. Note, if an address is
	// invalidated at the same time it is deprecated, the deprecation event may not
	// be received.
	//
	// This function is not permitted to block indefinitely. It must not
	// call functions on the stack itself.
	OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix)

	// OnAutoGenAddressInvalidated is called when an auto-generated address
	// (SLAAC) is invalidated.
	//
	// This function is not permitted to block indefinitely. It must not
	// call functions on the stack itself.
	OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix)

	// OnRecursiveDNSServerOption is called when the stack learns of DNS servers
	// through NDP. Note, the addresses may contain link-local addresses.
	//
	// It is up to the caller to use the DNS Servers only for their valid
	// lifetime. OnRecursiveDNSServerOption may be called for new or
	// already known DNS servers. If called with known DNS servers, their
	// valid lifetimes must be refreshed to the lifetime (it may be increased,
	// decreased, or completely invalidated when the lifetime = 0).
	//
	// This function is not permitted to block indefinitely. It must not
	// call functions on the stack itself.
	OnRecursiveDNSServerOption(tcpip.NICID, []tcpip.Address, time.Duration)

	// OnDNSSearchListOption is called when the stack learns of DNS search lists
	// through NDP.
	//
	// It is up to the caller to use the domain names in the search list
	// for only their valid lifetime. OnDNSSearchListOption may be called
	// with new or already known domain names. If called with known domain
	// names, their valid lifetimes must be refreshed to the lifetime (it may
	// be increased, decreased, or completely invalidated when the lifetime = 0).
	OnDNSSearchListOption(tcpip.NICID, []string, time.Duration)

	// OnDHCPv6Configuration is called with an updated configuration that is
	// available via DHCPv6 for the passed NIC.
	//
	// This function is not permitted to block indefinitely. It must not
	// call functions on the stack itself.
	OnDHCPv6Configuration(tcpip.NICID, DHCPv6ConfigurationFromNDPRA)
}

var _ fmt.Stringer = HandleRAsConfiguration(0)

// HandleRAsConfiguration enumerates when RAs may be handled.
type HandleRAsConfiguration int

const (
	// HandlingRAsDisabled indicates that Router Advertisements will not be
	// handled.
	HandlingRAsDisabled HandleRAsConfiguration = iota

	// HandlingRAsEnabledWhenForwardingDisabled indicates that Router
	// Advertisements will only be handled when forwarding is disabled.
	HandlingRAsEnabledWhenForwardingDisabled

	// HandlingRAsAlwaysEnabled indicates that Router Advertisements will always
	// be handled, even when forwarding is enabled.
	HandlingRAsAlwaysEnabled
)

// String implements fmt.Stringer.
func (c HandleRAsConfiguration) String() string { switch c { case HandlingRAsDisabled: return "HandlingRAsDisabled" case HandlingRAsEnabledWhenForwardingDisabled: return "HandlingRAsEnabledWhenForwardingDisabled" case HandlingRAsAlwaysEnabled: return "HandlingRAsAlwaysEnabled" default: return fmt.Sprintf("HandleRAsConfiguration(%d)", c) } } // enabled returns true iff Router Advertisements may be handled given the // specified forwarding status. func (c HandleRAsConfiguration) enabled(forwarding bool) bool { switch c { case HandlingRAsDisabled: return false case HandlingRAsEnabledWhenForwardingDisabled: return !forwarding case HandlingRAsAlwaysEnabled: return true default: panic(fmt.Sprintf("unhandled HandleRAsConfiguration = %d", c)) } } // NDPConfigurations is the NDP configurations for the netstack. type NDPConfigurations struct { // The number of Router Solicitation messages to send when the IPv6 endpoint // becomes enabled. // // Ignored unless configured to handle Router Advertisements. MaxRtrSolicitations uint8 // The amount of time between transmitting Router Solicitation messages. // // Must be greater than or equal to 0.5s. RtrSolicitationInterval time.Duration // The maximum amount of time before transmitting the first Router // Solicitation message. // // Must be greater than or equal to 0s. MaxRtrSolicitationDelay time.Duration // HandleRAs is the configuration for when Router Advertisements should be // handled. HandleRAs HandleRAsConfiguration // DiscoverDefaultRouters determines whether or not default routers are // discovered from Router Advertisements, as per RFC 4861 section 6. This // configuration is ignored if RAs will not be processed (see HandleRAs). DiscoverDefaultRouters bool // DiscoverMoreSpecificRoutes determines whether or not more specific routes // are discovered from Router Advertisements, as per RFC 4191. This // configuration is ignored if RAs will not be processed (see HandleRAs). DiscoverMoreSpecificRoutes bool // DiscoverOnLinkPrefixes determines whether or not on-link prefixes are // discovered from Router Advertisements' Prefix Information option, as per // RFC 4861 section 6. This configuration is ignored if RAs will not be // processed (see HandleRAs). DiscoverOnLinkPrefixes bool // AutoGenGlobalAddresses determines whether or not an IPv6 endpoint performs // SLAAC to auto-generate global SLAAC addresses in response to Prefix // Information options, as per RFC 4862. // // Note, if an address was already generated for some unique prefix, as // part of SLAAC, this option does not affect whether or not the // lifetime(s) of the generated address changes; this option only // affects the generation of new addresses as part of SLAAC. AutoGenGlobalAddresses bool // AutoGenAddressConflictRetries determines how many times to attempt to retry // generation of a permanent auto-generated address in response to DAD // conflicts. // // If the method used to generate the address does not support creating // alternative addresses (e.g. IIDs based on the modified EUI64 of a NIC's // MAC address), then no attempt is made to resolve the conflict. AutoGenAddressConflictRetries uint8 // AutoGenTempGlobalAddresses determines whether or not temporary SLAAC // addresses are generated for an IPv6 endpoint as part of SLAAC privacy // extensions, as per RFC 4941. // // Ignored if AutoGenGlobalAddresses is false. AutoGenTempGlobalAddresses bool // MaxTempAddrValidLifetime is the maximum valid lifetime for temporary // SLAAC addresses. 
MaxTempAddrValidLifetime time.Duration // MaxTempAddrPreferredLifetime is the maximum preferred lifetime for // temporary SLAAC addresses. MaxTempAddrPreferredLifetime time.Duration // RegenAdvanceDuration is the duration before the deprecation of a temporary // address when a new address will be generated. RegenAdvanceDuration time.Duration } // DefaultNDPConfigurations returns an NDPConfigurations populated with // default values. func DefaultNDPConfigurations() NDPConfigurations { return NDPConfigurations{ MaxRtrSolicitations: defaultMaxRtrSolicitations, RtrSolicitationInterval: defaultRtrSolicitationInterval, MaxRtrSolicitationDelay: defaultMaxRtrSolicitationDelay, HandleRAs: defaultHandleRAs, DiscoverDefaultRouters: defaultDiscoverDefaultRouters, DiscoverMoreSpecificRoutes: defaultDiscoverMoreSpecificRoutes, DiscoverOnLinkPrefixes: defaultDiscoverOnLinkPrefixes, AutoGenGlobalAddresses: defaultAutoGenGlobalAddresses, AutoGenTempGlobalAddresses: defaultAutoGenTempGlobalAddresses, MaxTempAddrValidLifetime: defaultMaxTempAddrValidLifetime, MaxTempAddrPreferredLifetime: defaultMaxTempAddrPreferredLifetime, RegenAdvanceDuration: defaultRegenAdvanceDuration, } } // validate modifies an NDPConfigurations with valid values. If invalid values // are present in c, the corresponding default values are used instead. func (c *NDPConfigurations) validate() { if c.RtrSolicitationInterval < minimumRtrSolicitationInterval { c.RtrSolicitationInterval = defaultRtrSolicitationInterval } if c.MaxRtrSolicitationDelay < minimumMaxRtrSolicitationDelay { c.MaxRtrSolicitationDelay = defaultMaxRtrSolicitationDelay } if c.MaxTempAddrValidLifetime < MinMaxTempAddrValidLifetime { c.MaxTempAddrValidLifetime = MinMaxTempAddrValidLifetime } if c.MaxTempAddrPreferredLifetime < MinMaxTempAddrPreferredLifetime || c.MaxTempAddrPreferredLifetime > c.MaxTempAddrValidLifetime { c.MaxTempAddrPreferredLifetime = MinMaxTempAddrPreferredLifetime } if c.RegenAdvanceDuration < minRegenAdvanceDuration { c.RegenAdvanceDuration = minRegenAdvanceDuration } } type timer struct { // done indicates to the timer that the timer was stopped. done *bool timer tcpip.Timer } type offLinkRoute struct { dest tcpip.Subnet router tcpip.Address } // ndpState is the per-Interface NDP state. type ndpState struct { // Do not allow overwriting this state. _ sync.NoCopy // The IPv6 endpoint this ndpState is for. ep *endpoint // configs is the per-interface NDP configurations. configs NDPConfigurations // The DAD timers to send the next NS message, or resolve the address. dad ip.DAD // The off-link routes discovered through Router Advertisements. offLinkRoutes map[offLinkRoute]offLinkRouteState // rtrSolicitTimer is the timer used to send the next router solicitation // message. // // rtrSolicitTimer is the zero value when NDP is not soliciting routers. rtrSolicitTimer timer // The on-link prefixes discovered through Router Advertisements' Prefix // Information option. onLinkPrefixes map[tcpip.Subnet]onLinkPrefixState // The SLAAC prefixes discovered through Router Advertisements' Prefix // Information option. slaacPrefixes map[tcpip.Subnet]slaacPrefixState // The last learned DHCPv6 configuration from an NDP RA. dhcpv6Configuration DHCPv6ConfigurationFromNDPRA // temporaryIIDHistory is the history value used to generate a new temporary // IID. temporaryIIDHistory [header.IIDSize]byte // temporaryAddressDesyncFactor is the preferred lifetime's desync factor for // temporary SLAAC addresses. 
	temporaryAddressDesyncFactor time.Duration
}

// offLinkRouteState holds data associated with an off-link route discovered by
// a Router Advertisement (RA).
type offLinkRouteState struct {
	prf header.NDPRoutePreference

	// Job to invalidate the route.
	//
	// Must not be nil.
	invalidationJob *tcpip.Job
}

// onLinkPrefixState holds data associated with an on-link prefix discovered by
// a Router Advertisement's Prefix Information option (PI) when the NDP
// configurations were configured to do so.
type onLinkPrefixState struct {
	// Job to invalidate the on-link prefix.
	//
	// Must not be nil.
	invalidationJob *tcpip.Job
}

// tempSLAACAddrState holds state associated with a temporary SLAAC address.
type tempSLAACAddrState struct {
	// Job to deprecate the temporary SLAAC address.
	//
	// Must not be nil.
	deprecationJob *tcpip.Job

	// Job to invalidate the temporary SLAAC address.
	//
	// Must not be nil.
	invalidationJob *tcpip.Job

	// Job to regenerate the temporary SLAAC address.
	//
	// Must not be nil.
	regenJob *tcpip.Job

	createdAt tcpip.MonotonicTime

	// The address's endpoint.
	//
	// Must not be nil.
	addressEndpoint stack.AddressEndpoint

	// Has a new temporary SLAAC address already been regenerated?
	regenerated bool
}

// slaacPrefixState holds state associated with a SLAAC prefix.
type slaacPrefixState struct {
	// Job to deprecate the prefix.
	//
	// Must not be nil.
	deprecationJob *tcpip.Job

	// Job to invalidate the prefix.
	//
	// Must not be nil.
	invalidationJob *tcpip.Job

	// nil iff the address is valid forever.
	validUntil *tcpip.MonotonicTime

	// nil iff the address is preferred forever.
	preferredUntil *tcpip.MonotonicTime

	// State associated with the stable address generated for the prefix.
	stableAddr struct {
		// The address's endpoint.
		//
		// May only be nil when the address is being (re-)generated. Otherwise,
		// must not be nil as all SLAAC prefixes must have a stable address.
		addressEndpoint stack.AddressEndpoint

		// The number of times an address has been generated locally where the IPv6
		// endpoint already had the generated address.
		localGenerationFailures uint8
	}

	// The temporary (short-lived) addresses generated for the SLAAC prefix.
	tempAddrs map[tcpip.Address]tempSLAACAddrState

	// The next two fields are used by both stable and temporary addresses
	// generated for a SLAAC prefix. This is safe as only 1 address is in the
	// generation and DAD process at any time. That is, no two addresses are
	// generated at the same time for a given SLAAC prefix.

	// The number of times an address has been generated and added to the IPv6
	// endpoint.
	//
	// Addresses may be regenerated in response to DAD conflicts.
	generationAttempts uint8

	// The maximum number of times to attempt regeneration of a SLAAC address
	// in response to DAD conflicts.
	maxGenerationAttempts uint8
}

// startDuplicateAddressDetection performs Duplicate Address Detection.
//
// This function must only be called by IPv6 addresses that are currently
// tentative.
//
// The IPv6 endpoint that ndp belongs to MUST be locked.
func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, addressEndpoint stack.AddressEndpoint) tcpip.Error {
	// addr must be a valid unicast IPv6 address.
	if !header.IsV6UnicastAddress(addr) {
		return &tcpip.ErrAddressFamilyNotSupported{}
	}

	if addressEndpoint.GetKind() != stack.PermanentTentative {
		// The endpoint should be marked as tentative since we are starting DAD.
panic(fmt.Sprintf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.ep.nic.ID())) } ret := ndp.dad.CheckDuplicateAddressLocked(addr, func(r stack.DADResult) { if addressEndpoint.GetKind() != stack.PermanentTentative { // The endpoint should still be marked as tentative since we are still // performing DAD on it. panic(fmt.Sprintf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.ep.nic.ID())) } var dadSucceeded bool switch r.(type) { case *stack.DADAborted, *stack.DADError, *stack.DADDupAddrDetected: dadSucceeded = false case *stack.DADSucceeded: dadSucceeded = true default: panic(fmt.Sprintf("unrecognized DAD result = %T", r)) } if dadSucceeded { addressEndpoint.SetKind(stack.Permanent) } if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnDuplicateAddressDetectionResult(ndp.ep.nic.ID(), addr, r) } if dadSucceeded { if addressEndpoint.ConfigType() == stack.AddressConfigSlaac { // Reset the generation attempts counter as we are starting the // generation of a new address for the SLAAC prefix. ndp.regenerateTempSLAACAddr(addressEndpoint.AddressWithPrefix().Subnet(), true /* resetGenAttempts */) } ndp.ep.onAddressAssignedLocked(addr) } }) switch ret { case stack.DADStarting: case stack.DADAlreadyRunning: panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.ep.nic.ID())) case stack.DADDisabled: addressEndpoint.SetKind(stack.Permanent) // Consider DAD to have resolved even if no DAD messages were actually // transmitted. if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnDuplicateAddressDetectionResult(ndp.ep.nic.ID(), addr, &stack.DADSucceeded{}) } ndp.ep.onAddressAssignedLocked(addr) } return nil } // stopDuplicateAddressDetection ends a running Duplicate Address Detection // process. Note, this may leave the DAD process for a tentative address in // such a state forever, unless some other external event resolves the DAD // process (receiving an NA from the true owner of addr, or an NS for addr // (implying another node is attempting to use addr)). It is up to the caller // of this function to handle such a scenario. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address, reason stack.DADResult) { ndp.dad.StopLocked(addr, reason) } // handleRA handles a Router Advertisement message that arrived on the NIC // this ndp is for. Does nothing if the NIC is configured to not handle RAs. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { // Is the IPv6 endpoint configured to handle RAs at all? // // Currently, the stack does not determine router interface status on a // per-interface basis; it is a protocol-wide configuration, so we check the // protocol's forwarding flag to determine if the IPv6 endpoint is forwarding // packets. if !ndp.configs.HandleRAs.enabled(ndp.ep.Forwarding()) { ndp.ep.stats.localStats.UnhandledRouterAdvertisements.Increment() return } // Only worry about the DHCPv6 configuration if we have an NDPDispatcher as we // only inform the dispatcher on configuration changes. We do nothing else // with the information. 
	if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil {
		var configuration DHCPv6ConfigurationFromNDPRA
		switch {
		case ra.ManagedAddrConfFlag():
			configuration = DHCPv6ManagedAddress
		case ra.OtherConfFlag():
			configuration = DHCPv6OtherConfigurations
		default:
			configuration = DHCPv6NoConfiguration
		}

		if ndp.dhcpv6Configuration != configuration {
			ndp.dhcpv6Configuration = configuration
			ndpDisp.OnDHCPv6Configuration(ndp.ep.nic.ID(), configuration)
		}
	}

	// Is the IPv6 endpoint configured to discover default routers?
	if ndp.configs.DiscoverDefaultRouters {
		prf := ra.DefaultRouterPreference()
		if prf == header.ReservedRoutePreference {
			// As per RFC 4191 section 2.2,
			//
			//   Prf (Default Router Preference)
			//
			//   If the Reserved (10) value is received, the receiver MUST treat the
			//   value as if it were (00).
			//
			// Note that the value 00 is the medium (default) router preference value.
			prf = header.MediumRoutePreference
		}

		// We represent default routers with a default (off-link) route through the
		// router.
		ndp.handleOffLinkRouteDiscovery(offLinkRoute{dest: header.IPv6EmptySubnet, router: ip}, ra.RouterLifetime(), prf)
	}

	// TODO(b/141556115): Do (RetransTimer, ReachableTime) Parameter Discovery.

	// We know the options are valid as far as wire format is concerned since
	// we got the Router Advertisement, as documented by this fn. Given this
	// we do not check the iterator for errors on calls to Next.
	it, _ := ra.Options().Iter(false)
	for opt, done, _ := it.Next(); !done; opt, done, _ = it.Next() {
		switch opt := opt.(type) {
		case header.NDPRecursiveDNSServer:
			if ndp.ep.protocol.options.NDPDisp == nil {
				continue
			}

			addrs, _ := opt.Addresses()
			ndp.ep.protocol.options.NDPDisp.OnRecursiveDNSServerOption(ndp.ep.nic.ID(), addrs, opt.Lifetime())

		case header.NDPDNSSearchList:
			if ndp.ep.protocol.options.NDPDisp == nil {
				continue
			}

			domainNames, _ := opt.DomainNames()
			ndp.ep.protocol.options.NDPDisp.OnDNSSearchListOption(ndp.ep.nic.ID(), domainNames, opt.Lifetime())

		case header.NDPPrefixInformation:
			prefix := opt.Subnet()

			// Is the prefix a link-local prefix?
			if header.IsV6LinkLocalUnicastAddress(prefix.ID()) {
				// ...Yes, skip as per RFC 4861 section 6.3.4,
				// and RFC 4862 section 5.5.3.b (for SLAAC).
				continue
			}

			// Is the Prefix Length 0?
			if prefix.Prefix() == 0 {
				// ...Yes, skip as this is an invalid prefix: a zero-length
				// prefix would cover every IPv6 address, and all IPv6
				// addresses cannot be on-link.
				continue
			}

			if opt.OnLinkFlag() {
				ndp.handleOnLinkPrefixInformation(opt)
			}

			if opt.AutonomousAddressConfigurationFlag() {
				ndp.handleAutonomousPrefixInformation(opt)
			}

		case header.NDPRouteInformation:
			if !ndp.configs.DiscoverMoreSpecificRoutes {
				continue
			}

			dest, err := opt.Prefix()
			if err != nil {
				panic(fmt.Sprintf("%T.Prefix(): %s", opt, err))
			}

			prf := opt.RoutePreference()
			if prf == header.ReservedRoutePreference {
				// As per RFC 4191 section 2.3,
				//
				//   Prf (Route Preference)
				//      2-bit signed integer.  The Route Preference indicates
				//      whether to prefer the router associated with this prefix
				//      over others, when multiple identical prefixes (for
				//      different routers) have been received.  If the Reserved
				//      (10) value is received, the Route Information Option MUST
				//      be ignored.
				continue
			}

			ndp.handleOffLinkRouteDiscovery(offLinkRoute{dest: dest, router: ip}, opt.RouteLifetime(), prf)
		}

		// TODO(b/141556115): Do (MTU) Parameter Discovery.
	}
}

// invalidateOffLinkRoute invalidates a discovered off-link route.
//
// The IPv6 endpoint that ndp belongs to MUST be locked.
func (ndp *ndpState) invalidateOffLinkRoute(route offLinkRoute) {
	state, ok := ndp.offLinkRoutes[route]
	if !ok {
		return
	}

	state.invalidationJob.Cancel()
	delete(ndp.offLinkRoutes, route)

	// Let the integrator know a discovered off-link route is invalidated.
	if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil {
		ndpDisp.OnOffLinkRouteInvalidated(ndp.ep.nic.ID(), route.dest, route.router)
	}
}

// handleOffLinkRouteDiscovery handles the discovery of an off-link route.
//
// Precondition: ndp.ep.mu must be locked.
func (ndp *ndpState) handleOffLinkRouteDiscovery(route offLinkRoute, lifetime time.Duration, prf header.NDPRoutePreference) {
	ndpDisp := ndp.ep.protocol.options.NDPDisp
	if ndpDisp == nil {
		return
	}

	state, ok := ndp.offLinkRoutes[route]
	switch {
	case !ok && lifetime != 0:
		// This is a new route we are discovering.
		//
		// Only remember it if we currently know about less than
		// MaxDiscoveredOffLinkRoutes routers.
		if len(ndp.offLinkRoutes) < MaxDiscoveredOffLinkRoutes {
			// Inform the integrator when we discovered an off-link route.
			ndpDisp.OnOffLinkRouteUpdated(ndp.ep.nic.ID(), route.dest, route.router, prf)

			state := offLinkRouteState{
				prf: prf,
				invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() {
					ndp.invalidateOffLinkRoute(route)
				}),
			}

			state.invalidationJob.Schedule(lifetime)
			ndp.offLinkRoutes[route] = state
		}

	case ok && lifetime != 0:
		// This is an already discovered off-link route. Update the lifetime.
		state.invalidationJob.Cancel()
		state.invalidationJob.Schedule(lifetime)

		if prf != state.prf {
			state.prf = prf

			// Inform the integrator about route preference updates.
			ndpDisp.OnOffLinkRouteUpdated(ndp.ep.nic.ID(), route.dest, route.router, prf)
		}

		ndp.offLinkRoutes[route] = state

	case ok && lifetime == 0:
		// The already discovered off-link route is no longer considered valid so we
		// invalidate it immediately.
		ndp.invalidateOffLinkRoute(route)
	}
}

// rememberOnLinkPrefix remembers the newly discovered on-link prefix prefix
// with lifetime l.
//
// The prefix identified by prefix MUST NOT already be known.
//
// The IPv6 endpoint that ndp belongs to MUST be locked.
func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) {
	ndpDisp := ndp.ep.protocol.options.NDPDisp
	if ndpDisp == nil {
		return
	}

	// Inform the integrator when we discovered an on-link prefix.
	ndpDisp.OnOnLinkPrefixDiscovered(ndp.ep.nic.ID(), prefix)

	state := onLinkPrefixState{
		invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() {
			ndp.invalidateOnLinkPrefix(prefix)
		}),
	}

	if l < header.NDPInfiniteLifetime {
		state.invalidationJob.Schedule(l)
	}

	ndp.onLinkPrefixes[prefix] = state
}

// invalidateOnLinkPrefix invalidates a discovered on-link prefix.
//
// The IPv6 endpoint that ndp belongs to MUST be locked.
func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
	s, ok := ndp.onLinkPrefixes[prefix]

	// Is the on-link prefix still discovered?
	if !ok {
		// ...Nope, do nothing further.
		return
	}

	s.invalidationJob.Cancel()
	delete(ndp.onLinkPrefixes, prefix)

	// Let the integrator know a discovered on-link prefix is invalidated.
	if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil {
		ndpDisp.OnOnLinkPrefixInvalidated(ndp.ep.nic.ID(), prefix)
	}
}

// handleOnLinkPrefixInformation handles a Prefix Information option with
// its on-link flag set, as per RFC 4861 section 6.3.4.
//
// handleOnLinkPrefixInformation assumes that the prefix this pi is for is
// not the link-local prefix and the on-link flag is set.
//
// The IPv6 endpoint that ndp belongs to MUST be locked.
func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformation) {
	prefix := pi.Subnet()
	prefixState, ok := ndp.onLinkPrefixes[prefix]
	vl := pi.ValidLifetime()

	if !ok && vl == 0 {
		// Don't know about this prefix but it has a zero valid
		// lifetime, so just ignore.
		return
	}

	if !ok && vl != 0 {
		// This is a new on-link prefix we are discovering.
		//
		// Only remember it if we currently know about less than
		// MaxDiscoveredOnLinkPrefixes on-link prefixes.
		if ndp.configs.DiscoverOnLinkPrefixes && len(ndp.onLinkPrefixes) < MaxDiscoveredOnLinkPrefixes {
			ndp.rememberOnLinkPrefix(prefix, vl)
		}
		return
	}

	if ok && vl == 0 {
		// We know about the on-link prefix, but it is
		// no longer to be considered on-link, so
		// invalidate it.
		ndp.invalidateOnLinkPrefix(prefix)
		return
	}

	// This is an already discovered on-link prefix with a
	// new non-zero valid lifetime.
	//
	// Update the invalidation job.

	prefixState.invalidationJob.Cancel()

	if vl < header.NDPInfiniteLifetime {
		// Prefix is valid for a finite lifetime, schedule the job to execute after
		// the new valid lifetime.
		prefixState.invalidationJob.Schedule(vl)
	}

	ndp.onLinkPrefixes[prefix] = prefixState
}

// handleAutonomousPrefixInformation handles a Prefix Information option with
// its autonomous flag set, as per RFC 4862 section 5.5.3.
//
// handleAutonomousPrefixInformation assumes that the prefix this pi is for is
// not the link-local prefix and the autonomous flag is set.
//
// The IPv6 endpoint that ndp belongs to MUST be locked.
func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInformation) {
	vl := pi.ValidLifetime()
	pl := pi.PreferredLifetime()

	// If the preferred lifetime is greater than the valid lifetime,
	// silently ignore the Prefix Information option, as per RFC 4862
	// section 5.5.3.c.
	if pl > vl {
		return
	}

	prefix := pi.Subnet()

	// Check if we already maintain SLAAC state for prefix.
	if state, ok := ndp.slaacPrefixes[prefix]; ok {
		// As per RFC 4862 section 5.5.3.e, refresh prefix's SLAAC lifetimes.
		ndp.refreshSLAACPrefixLifetimes(prefix, &state, pl, vl)
		ndp.slaacPrefixes[prefix] = state
		return
	}

	// prefix is a new SLAAC prefix. Do the work as outlined by RFC 4862 section
	// 5.5.3.d if ndp is configured to auto-generate new addresses via SLAAC.
	if !ndp.configs.AutoGenGlobalAddresses {
		return
	}

	ndp.doSLAAC(prefix, pl, vl)
}

// doSLAAC generates a new SLAAC address with the provided lifetimes
// for prefix.
//
// pl is the new preferred lifetime. vl is the new valid lifetime.
//
// The IPv6 endpoint that ndp belongs to MUST be locked.
func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
	// If we do not already have an address for this prefix and the valid
	// lifetime is 0, no need to do anything further, as per RFC 4862
	// section 5.5.3.d.
	if vl == 0 {
		return
	}

	// Make sure the prefix is valid (as far as its length is concerned) to
	// generate a valid IPv6 address from an interface identifier (IID), as
	// per RFC 4862 section 5.5.3.d.
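	// For example (illustrative values only): a Prefix Information option for
	// 2001:db8:a:b::/64 passes the check below and an address may be generated
	// by appending a 64-bit IID, while one for 2001:db8::/48 is skipped since
	// 48 + 64 != 128.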
	if prefix.Prefix() != validPrefixLenForAutoGen {
		return
	}

	state := slaacPrefixState{
		deprecationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() {
			state, ok := ndp.slaacPrefixes[prefix]
			if !ok {
				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the deprecated SLAAC prefix %s", prefix))
			}

			ndp.deprecateSLAACAddress(state.stableAddr.addressEndpoint)
		}),
		invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() {
			state, ok := ndp.slaacPrefixes[prefix]
			if !ok {
				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the invalidated SLAAC prefix %s", prefix))
			}

			ndp.invalidateSLAACPrefix(prefix, state)
		}),
		tempAddrs:             make(map[tcpip.Address]tempSLAACAddrState),
		maxGenerationAttempts: ndp.configs.AutoGenAddressConflictRetries + 1,
	}

	now := ndp.ep.protocol.stack.Clock().NowMonotonic()

	// The time an address is preferred until is needed to properly generate the
	// address.
	if pl < header.NDPInfiniteLifetime {
		t := now.Add(pl)
		state.preferredUntil = &t
	}

	if !ndp.generateSLAACAddr(prefix, &state) {
		// We were unable to generate an address for the prefix, so we do nothing
		// further as there is no reason to maintain state or jobs for a prefix we
		// do not have an address for.
		return
	}

	// Set up the initial jobs to deprecate and invalidate prefix.

	if pl < header.NDPInfiniteLifetime && pl != 0 {
		state.deprecationJob.Schedule(pl)
	}

	if vl < header.NDPInfiniteLifetime {
		state.invalidationJob.Schedule(vl)
		t := now.Add(vl)
		state.validUntil = &t
	}

	// If the address is assigned (DAD resolved), generate a temporary address.
	if state.stableAddr.addressEndpoint.GetKind() == stack.Permanent {
		// Reset the generation attempts counter as we are starting the generation
		// of a new address for the SLAAC prefix.
		ndp.generateTempSLAACAddr(prefix, &state, true /* resetGenAttempts */)
	}

	ndp.slaacPrefixes[prefix] = state
}

// addAndAcquireSLAACAddr adds a SLAAC address to the IPv6 endpoint.
//
// The IPv6 endpoint that ndp belongs to MUST be locked.
func (ndp *ndpState) addAndAcquireSLAACAddr(addr tcpip.AddressWithPrefix, configType stack.AddressConfigType, deprecated bool) stack.AddressEndpoint {
	// Inform the integrator that we have a new SLAAC address.
	ndpDisp := ndp.ep.protocol.options.NDPDisp
	if ndpDisp == nil {
		return nil
	}

	addressEndpoint, err := ndp.ep.addAndAcquirePermanentAddressLocked(addr, stack.FirstPrimaryEndpoint, configType, deprecated)
	if err != nil {
		panic(fmt.Sprintf("ndp: error when adding SLAAC address %+v: %s", addr, err))
	}

	ndpDisp.OnAutoGenAddress(ndp.ep.nic.ID(), addr)

	return addressEndpoint
}

// generateSLAACAddr generates a SLAAC address for prefix.
//
// Returns true if an address was successfully generated.
//
// Panics if the prefix is not a SLAAC prefix or it already has an address.
//
// The IPv6 endpoint that ndp belongs to MUST be locked.
func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixState) bool {
	if addressEndpoint := state.stableAddr.addressEndpoint; addressEndpoint != nil {
		panic(fmt.Sprintf("ndp: SLAAC prefix %s already has a permanent address %s", prefix, addressEndpoint.AddressWithPrefix()))
	}

	// If we have already reached the maximum address generation attempts for the
	// prefix, do not generate another address.
	if state.generationAttempts == state.maxGenerationAttempts {
		return false
	}

	var generatedAddr tcpip.AddressWithPrefix
	addrBytes := []byte(prefix.ID())

	for i := 0; ; i++ {
		// If we were unable to generate an address after the maximum SLAAC address
		// local regeneration attempts, do nothing further.
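		// Put differently (an editorial note on the loop below): at most
		// maxSLAACAddrLocalRegenAttempts candidate addresses are tried. Opaque
		// IIDs yield a different candidate on each iteration via the DAD
		// counter, while a modified EUI-64 IID yields a single fixed candidate.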
if i == maxSLAACAddrLocalRegenAttempts { return false } dadCounter := state.generationAttempts + state.stableAddr.localGenerationFailures if oIID := ndp.ep.protocol.options.OpaqueIIDOpts; oIID.NICNameFromID != nil { addrBytes = header.AppendOpaqueInterfaceIdentifier( addrBytes[:header.IIDOffsetInIPv6Address], prefix, oIID.NICNameFromID(ndp.ep.nic.ID(), ndp.ep.nic.Name()), dadCounter, oIID.SecretKey, ) } else if dadCounter == 0 { // Modified-EUI64 based IIDs have no way to resolve DAD conflicts, so if // the DAD counter is non-zero, we cannot use this method. // // Only attempt to generate an interface-specific IID if we have a valid // link address. // // TODO(b/141011931): Validate a LinkEndpoint's link address (provided by // LinkEndpoint.LinkAddress) before reaching this point. linkAddr := ndp.ep.nic.LinkAddress() if !header.IsValidUnicastEthernetAddress(linkAddr) { return false } // Generate an address within prefix from the modified EUI-64 of ndp's // NIC's Ethernet MAC address. header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:]) } else { // We have no way to regenerate an address in response to an address // conflict when addresses are not generated with opaque IIDs. return false } generatedAddr = tcpip.AddressWithPrefix{ Address: tcpip.Address(addrBytes), PrefixLen: validPrefixLenForAutoGen, } if !ndp.ep.hasPermanentAddressRLocked(generatedAddr.Address) { break } state.stableAddr.localGenerationFailures++ } deprecated := state.preferredUntil != nil && !state.preferredUntil.After(ndp.ep.protocol.stack.Clock().NowMonotonic()) if addressEndpoint := ndp.addAndAcquireSLAACAddr(generatedAddr, stack.AddressConfigSlaac, deprecated); addressEndpoint != nil { state.stableAddr.addressEndpoint = addressEndpoint state.generationAttempts++ return true } return false } // regenerateSLAACAddr regenerates an address for a SLAAC prefix. // // If generating a new address for the prefix fails, the prefix is invalidated. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) regenerateSLAACAddr(prefix tcpip.Subnet) { state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: SLAAC prefix state not found to regenerate address for %s", prefix)) } if ndp.generateSLAACAddr(prefix, &state) { ndp.slaacPrefixes[prefix] = state return } // We were unable to generate a permanent address for the SLAAC prefix so // invalidate the prefix as there is no reason to maintain state for a // SLAAC prefix we do not have an address for. ndp.invalidateSLAACPrefix(prefix, state) } // generateTempSLAACAddr generates a new temporary SLAAC address. // // If resetGenAttempts is true, the prefix's generation counter is reset. // // Returns true if a new address was generated. func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *slaacPrefixState, resetGenAttempts bool) bool { // Are we configured to auto-generate new temporary global addresses for the // prefix? if !ndp.configs.AutoGenTempGlobalAddresses || prefix == header.IPv6LinkLocalPrefix.Subnet() { return false } if resetGenAttempts { prefixState.generationAttempts = 0 prefixState.maxGenerationAttempts = ndp.configs.AutoGenAddressConflictRetries + 1 } // If we have already reached the maximum address generation attempts for the // prefix, do not generate another address. 
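	// For instance (illustrative): with AutoGenAddressConflictRetries = 2,
	// maxGenerationAttempts is 3, so after an initial generation and two
	// DAD-conflict retries no further temporary address is generated for this
	// prefix until the counter is reset.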
	if prefixState.generationAttempts == prefixState.maxGenerationAttempts {
		return false
	}

	stableAddr := prefixState.stableAddr.addressEndpoint.AddressWithPrefix().Address
	now := ndp.ep.protocol.stack.Clock().NowMonotonic()

	// As per RFC 4941 section 3.3 step 4, the valid lifetime of a temporary
	// address is the lower of the valid lifetime of the stable address or the
	// maximum temporary address valid lifetime.
	vl := ndp.configs.MaxTempAddrValidLifetime
	if prefixState.validUntil != nil {
		if prefixVL := prefixState.validUntil.Sub(now); vl > prefixVL {
			vl = prefixVL
		}
	}

	if vl <= 0 {
		// Cannot create an address without a valid lifetime.
		return false
	}

	// As per RFC 4941 section 3.3 step 4, the preferred lifetime of a temporary
	// address is the lower of the preferred lifetime of the stable address or the
	// maximum temporary address preferred lifetime - the temporary address desync
	// factor.
	pl := ndp.configs.MaxTempAddrPreferredLifetime - ndp.temporaryAddressDesyncFactor
	if prefixState.preferredUntil != nil {
		if prefixPL := prefixState.preferredUntil.Sub(now); pl > prefixPL {
			// Respect the preferred lifetime of the prefix, as per RFC 4941 section
			// 3.3 step 4.
			pl = prefixPL
		}
	}

	// As per RFC 4941 section 3.3 step 5, a temporary address is created only if
	// the calculated preferred lifetime is greater than the advance regeneration
	// duration. In particular, we MUST NOT create a temporary address with a zero
	// Preferred Lifetime.
	if pl <= ndp.configs.RegenAdvanceDuration {
		return false
	}

	// Attempt to generate a new address that is not already assigned to the IPv6
	// endpoint.
	var generatedAddr tcpip.AddressWithPrefix
	for i := 0; ; i++ {
		// If we were unable to generate an address after the maximum SLAAC address
		// local regeneration attempts, do nothing further.
		if i == maxSLAACAddrLocalRegenAttempts {
			return false
		}

		generatedAddr = header.GenerateTempIPv6SLAACAddr(ndp.temporaryIIDHistory[:], stableAddr)
		if !ndp.ep.hasPermanentAddressRLocked(generatedAddr.Address) {
			break
		}
	}

	// As per RFC 4941 section 3.3 step 5, we MUST NOT create a temporary
	// address with a zero preferred lifetime. The checks above ensure this
	// so we know the address is not deprecated.
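	// Worked example (editorial, assuming the defaults defined in this file):
	// with MaxTempAddrValidLifetime = 7 days, MaxTempAddrPreferredLifetime =
	// 1 day, a desync factor of 10 minutes and RegenAdvanceDuration = 5s, a
	// temporary address for a long-lived prefix is valid for 7 days and
	// preferred for 23h50m; since 23h50m > 5s, the address is created below.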
addressEndpoint := ndp.addAndAcquireSLAACAddr(generatedAddr, stack.AddressConfigSlaacTemp, false /* deprecated */) if addressEndpoint == nil { return false } state := tempSLAACAddrState{ deprecationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to deprecate temporary address %s", prefix, generatedAddr)) } tempAddrState, ok := prefixState.tempAddrs[generatedAddr.Address] if !ok { panic(fmt.Sprintf("ndp: must have a tempAddr entry to deprecate temporary address %s", generatedAddr)) } ndp.deprecateSLAACAddress(tempAddrState.addressEndpoint) }), invalidationJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to invalidate temporary address %s", prefix, generatedAddr)) } tempAddrState, ok := prefixState.tempAddrs[generatedAddr.Address] if !ok { panic(fmt.Sprintf("ndp: must have a tempAddr entry to invalidate temporary address %s", generatedAddr)) } ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, generatedAddr.Address, tempAddrState) }), regenJob: ndp.ep.protocol.stack.NewJob(&ndp.ep.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to regenerate temporary address after %s", prefix, generatedAddr)) } tempAddrState, ok := prefixState.tempAddrs[generatedAddr.Address] if !ok { panic(fmt.Sprintf("ndp: must have a tempAddr entry to regenerate temporary address after %s", generatedAddr)) } // If an address has already been regenerated for this address, don't // regenerate another address. if tempAddrState.regenerated { return } // Reset the generation attempts counter as we are starting the generation // of a new address for the SLAAC prefix. tempAddrState.regenerated = ndp.generateTempSLAACAddr(prefix, &prefixState, true /* resetGenAttempts */) prefixState.tempAddrs[generatedAddr.Address] = tempAddrState ndp.slaacPrefixes[prefix] = prefixState }), createdAt: now, addressEndpoint: addressEndpoint, } state.deprecationJob.Schedule(pl) state.invalidationJob.Schedule(vl) state.regenJob.Schedule(pl - ndp.configs.RegenAdvanceDuration) prefixState.generationAttempts++ prefixState.tempAddrs[generatedAddr.Address] = state return true } // regenerateTempSLAACAddr regenerates a temporary address for a SLAAC prefix. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) regenerateTempSLAACAddr(prefix tcpip.Subnet, resetGenAttempts bool) { state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: SLAAC prefix state not found to regenerate temporary address for %s", prefix)) } ndp.generateTempSLAACAddr(prefix, &state, resetGenAttempts) ndp.slaacPrefixes[prefix] = state } // refreshSLAACPrefixLifetimes refreshes the lifetimes of a SLAAC prefix. // // pl is the new preferred lifetime. vl is the new valid lifetime. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixState *slaacPrefixState, pl, vl time.Duration) { // If the preferred lifetime is zero, then the prefix should be deprecated. deprecated := pl == 0 if deprecated { ndp.deprecateSLAACAddress(prefixState.stableAddr.addressEndpoint) } else { prefixState.stableAddr.addressEndpoint.SetDeprecated(false) } // If prefix was preferred for some finite lifetime before, cancel the // deprecation job so it can be reset. 
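	// As a concrete case (editorial): an RA that renews this prefix with a
	// preferred lifetime of 0 deprecates the stable address immediately above,
	// while a later RA with a non-zero preferred lifetime clears the
	// deprecated flag and lets the deprecation job be rescheduled below.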
prefixState.deprecationJob.Cancel() now := ndp.ep.protocol.stack.Clock().NowMonotonic() // Schedule the deprecation job if prefix has a finite preferred lifetime. if pl < header.NDPInfiniteLifetime { if !deprecated { prefixState.deprecationJob.Schedule(pl) } t := now.Add(pl) prefixState.preferredUntil = &t } else { prefixState.preferredUntil = nil } // As per RFC 4862 section 5.5.3.e, update the valid lifetime for prefix: // // 1) If the received Valid Lifetime is greater than 2 hours or greater than // RemainingLifetime, set the valid lifetime of the prefix to the // advertised Valid Lifetime. // // 2) If RemainingLifetime is less than or equal to 2 hours, ignore the // advertised Valid Lifetime. // // 3) Otherwise, reset the valid lifetime of the prefix to 2 hours. if vl >= header.NDPInfiniteLifetime { // Handle the infinite valid lifetime separately as we do not schedule a // job in this case. prefixState.invalidationJob.Cancel() prefixState.validUntil = nil } else { var effectiveVl time.Duration var rl time.Duration // If the prefix was originally set to be valid forever, assume the // remaining time to be the maximum possible value. if prefixState.validUntil == nil { rl = header.NDPInfiniteLifetime } else { rl = prefixState.validUntil.Sub(now) } if vl > MinPrefixInformationValidLifetimeForUpdate || vl > rl { effectiveVl = vl } else if rl > MinPrefixInformationValidLifetimeForUpdate { effectiveVl = MinPrefixInformationValidLifetimeForUpdate } if effectiveVl != 0 { prefixState.invalidationJob.Cancel() prefixState.invalidationJob.Schedule(effectiveVl) t := now.Add(effectiveVl) prefixState.validUntil = &t } } // If DAD is not yet complete on the stable address, there is no need to do // work with temporary addresses. if prefixState.stableAddr.addressEndpoint.GetKind() != stack.Permanent { return } // Note, we do not need to update the entries in the temporary address map // after updating the jobs because the jobs are held as pointers. var regenForAddr tcpip.Address allAddressesRegenerated := true for tempAddr, tempAddrState := range prefixState.tempAddrs { // As per RFC 4941 section 3.3 step 4, the valid lifetime of a temporary // address is the lower of the valid lifetime of the stable address or the // maximum temporary address valid lifetime. Note, the valid lifetime of a // temporary address is relative to the address's creation time. validUntil := tempAddrState.createdAt.Add(ndp.configs.MaxTempAddrValidLifetime) if prefixState.validUntil != nil && prefixState.validUntil.Before(validUntil) { validUntil = *prefixState.validUntil } // If the address is no longer valid, invalidate it immediately. Otherwise, // reset the invalidation job. newValidLifetime := validUntil.Sub(now) if newValidLifetime <= 0 { ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, tempAddr, tempAddrState) continue } tempAddrState.invalidationJob.Cancel() tempAddrState.invalidationJob.Schedule(newValidLifetime) // As per RFC 4941 section 3.3 step 4, the preferred lifetime of a temporary // address is the lower of the preferred lifetime of the stable address or // the maximum temporary address preferred lifetime - the temporary address // desync factor. Note, the preferred lifetime of a temporary address is // relative to the address's creation time. 
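		// For example (editorial, assuming the default configuration and a
		// desync factor of 10 minutes): a temporary address created 20 hours
		// ago with MaxTempAddrPreferredLifetime = 1 day is preferred until
		// 3h50m from now, unless the prefix's own preferred lifetime ends
		// sooner.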
		preferredUntil := tempAddrState.createdAt.Add(ndp.configs.MaxTempAddrPreferredLifetime - ndp.temporaryAddressDesyncFactor)
		if prefixState.preferredUntil != nil && prefixState.preferredUntil.Before(preferredUntil) {
			preferredUntil = *prefixState.preferredUntil
		}

		// If the address is no longer preferred, deprecate it immediately.
		// Otherwise, schedule the deprecation job again.
		newPreferredLifetime := preferredUntil.Sub(now)
		tempAddrState.deprecationJob.Cancel()
		if newPreferredLifetime <= 0 {
			ndp.deprecateSLAACAddress(tempAddrState.addressEndpoint)
		} else {
			tempAddrState.addressEndpoint.SetDeprecated(false)
			tempAddrState.deprecationJob.Schedule(newPreferredLifetime)
		}

		tempAddrState.regenJob.Cancel()
		if !tempAddrState.regenerated {
			allAddressesRegenerated = false

			if newPreferredLifetime <= ndp.configs.RegenAdvanceDuration {
				// The new preferred lifetime is less than the advance regeneration
				// duration so regenerate an address for this temporary address
				// immediately after we finish iterating over the temporary addresses.
				regenForAddr = tempAddr
			} else {
				tempAddrState.regenJob.Schedule(newPreferredLifetime - ndp.configs.RegenAdvanceDuration)
			}
		}
	}

	// Generate a new temporary address if all of the existing temporary addresses
	// have been regenerated, or we need to immediately regenerate an address
	// due to an update in preferred lifetime.
	//
	// If each temporary address has already been regenerated, no new temporary
	// address is generated. To ensure continuation of temporary SLAAC addresses,
	// we manually try to regenerate an address here.
	if len(regenForAddr) != 0 || allAddressesRegenerated {
		// Reset the generation attempts counter as we are starting the generation
		// of a new address for the SLAAC prefix.
		if state, ok := prefixState.tempAddrs[regenForAddr]; ndp.generateTempSLAACAddr(prefix, prefixState, true /* resetGenAttempts */) && ok {
			state.regenerated = true
			prefixState.tempAddrs[regenForAddr] = state
		}
	}
}

// deprecateSLAACAddress marks the address as deprecated and notifies the NDP
// dispatcher that address has been deprecated.
//
// deprecateSLAACAddress does nothing if the address is already deprecated.
//
// The IPv6 endpoint that ndp belongs to MUST be locked.
func (ndp *ndpState) deprecateSLAACAddress(addressEndpoint stack.AddressEndpoint) {
	if addressEndpoint.Deprecated() {
		return
	}

	addressEndpoint.SetDeprecated(true)
	if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil {
		ndpDisp.OnAutoGenAddressDeprecated(ndp.ep.nic.ID(), addressEndpoint.AddressWithPrefix())
	}
}

// invalidateSLAACPrefix invalidates a SLAAC prefix.
//
// The IPv6 endpoint that ndp belongs to MUST be locked.
func (ndp *ndpState) invalidateSLAACPrefix(prefix tcpip.Subnet, state slaacPrefixState) {
	ndp.cleanupSLAACPrefixResources(prefix, state)

	if addressEndpoint := state.stableAddr.addressEndpoint; addressEndpoint != nil {
		if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil {
			ndpDisp.OnAutoGenAddressInvalidated(ndp.ep.nic.ID(), addressEndpoint.AddressWithPrefix())
		}

		if err := ndp.ep.removePermanentEndpointInnerLocked(addressEndpoint, &stack.DADAborted{}); err != nil {
			panic(fmt.Sprintf("ndp: error removing stable SLAAC address %s: %s", addressEndpoint.AddressWithPrefix(), err))
		}
	}
}

// cleanupSLAACAddrResourcesAndNotify cleans up an invalidated SLAAC address's
// resources.
//
// The IPv6 endpoint that ndp belongs to MUST be locked.
func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidatePrefix bool) { if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnAutoGenAddressInvalidated(ndp.ep.nic.ID(), addr) } prefix := addr.Subnet() state, ok := ndp.slaacPrefixes[prefix] if !ok || state.stableAddr.addressEndpoint == nil || addr.Address != state.stableAddr.addressEndpoint.AddressWithPrefix().Address { return } if !invalidatePrefix { // If the prefix is not being invalidated, disassociate the address from the // prefix and do nothing further. state.stableAddr.addressEndpoint.DecRef() state.stableAddr.addressEndpoint = nil ndp.slaacPrefixes[prefix] = state return } ndp.cleanupSLAACPrefixResources(prefix, state) } // cleanupSLAACPrefixResources cleans up a SLAAC prefix's jobs and entry. // // Panics if the SLAAC prefix is not known. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupSLAACPrefixResources(prefix tcpip.Subnet, state slaacPrefixState) { // Invalidate all temporary addresses. for tempAddr, tempAddrState := range state.tempAddrs { ndp.invalidateTempSLAACAddr(state.tempAddrs, tempAddr, tempAddrState) } if state.stableAddr.addressEndpoint != nil { state.stableAddr.addressEndpoint.DecRef() state.stableAddr.addressEndpoint = nil } state.deprecationJob.Cancel() state.invalidationJob.Cancel() delete(ndp.slaacPrefixes, prefix) } // invalidateTempSLAACAddr invalidates a temporary SLAAC address. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) invalidateTempSLAACAddr(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) { ndp.cleanupTempSLAACAddrResourcesAndNotifyInner(tempAddrs, tempAddr, tempAddrState) if err := ndp.ep.removePermanentEndpointInnerLocked(tempAddrState.addressEndpoint, &stack.DADAborted{}); err != nil { panic(fmt.Sprintf("error removing temporary SLAAC address %s: %s", tempAddrState.addressEndpoint.AddressWithPrefix(), err)) } } // cleanupTempSLAACAddrResourcesAndNotify cleans up an invalidated temporary // SLAAC address's resources from ndp and notifies the NDP dispatcher that the // address was invalidated. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix) { prefix := addr.Subnet() state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry to clean up temp addr %s resources", addr)) } tempAddrState, ok := state.tempAddrs[addr.Address] if !ok { panic(fmt.Sprintf("ndp: must have a tempAddr entry to clean up temp addr %s resources", addr)) } ndp.cleanupTempSLAACAddrResourcesAndNotifyInner(state.tempAddrs, addr.Address, tempAddrState) } // cleanupTempSLAACAddrResourcesAndNotifyInner is like // cleanupTempSLAACAddrResourcesAndNotify except it does not lookup the // temporary address's state in ndp - it assumes the passed state is valid. // // The IPv6 endpoint that ndp belongs to MUST be locked. 
func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotifyInner(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) { if ndpDisp := ndp.ep.protocol.options.NDPDisp; ndpDisp != nil { ndpDisp.OnAutoGenAddressInvalidated(ndp.ep.nic.ID(), tempAddrState.addressEndpoint.AddressWithPrefix()) } tempAddrState.addressEndpoint.DecRef() tempAddrState.addressEndpoint = nil tempAddrState.deprecationJob.Cancel() tempAddrState.invalidationJob.Cancel() tempAddrState.regenJob.Cancel() delete(tempAddrs, tempAddr) } // cleanupState cleans up ndp's state. // // This function invalidates all discovered on-link prefixes, discovered // routers, and auto-generated addresses. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupState() { for prefix, state := range ndp.slaacPrefixes { ndp.invalidateSLAACPrefix(prefix, state) } for prefix := range ndp.onLinkPrefixes { ndp.invalidateOnLinkPrefix(prefix) } if got := len(ndp.onLinkPrefixes); got != 0 { panic(fmt.Sprintf("ndp: still have discovered on-link prefixes after cleaning up; found = %d", got)) } for route := range ndp.offLinkRoutes { ndp.invalidateOffLinkRoute(route) } if got := len(ndp.offLinkRoutes); got != 0 { panic(fmt.Sprintf("ndp: still have discovered off-link routes after cleaning up; found = %d", got)) } ndp.dhcpv6Configuration = 0 } // startSolicitingRouters starts soliciting routers, as per RFC 4861 section // 6.3.7. If routers are already being solicited, this function does nothing. // // If ndp is not configured to handle Router Advertisements, routers will not // be solicited as there is no point soliciting routers if we don't handle their // advertisements. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) startSolicitingRouters() { if ndp.rtrSolicitTimer.timer != nil { // We are already soliciting routers. return } remaining := ndp.configs.MaxRtrSolicitations if remaining == 0 { return } if !ndp.configs.HandleRAs.enabled(ndp.ep.Forwarding()) { return } // Calculate the random delay before sending our first RS, as per RFC // 4861 section 6.3.7. var delay time.Duration if ndp.configs.MaxRtrSolicitationDelay > 0 { delay = time.Duration(ndp.ep.protocol.stack.Rand().Int63n(int64(ndp.configs.MaxRtrSolicitationDelay))) } // Protected by ndp.ep.mu. done := false ndp.rtrSolicitTimer = timer{ done: &done, timer: ndp.ep.protocol.stack.Clock().AfterFunc(delay, func() { // As per RFC 4861 section 4.1: // // IP Fields: // Source Address // An IP address assigned to the sending interface, or // the unspecified address if no address is assigned // to the sending interface. localAddr := header.IPv6Any if addressEndpoint := ndp.ep.AcquireOutgoingPrimaryAddress(header.IPv6AllRoutersLinkLocalMulticastAddress, false); addressEndpoint != nil { localAddr = addressEndpoint.AddressWithPrefix().Address addressEndpoint.DecRef() } // As per RFC 4861 section 4.1, an NDP RS SHOULD include the source // link-layer address option if the source address of the NDP RS is // specified. This option MUST NOT be included if the source address is // unspecified. // // TODO(b/141011931): Validate a LinkEndpoint's link address (provided by // LinkEndpoint.LinkAddress) before reaching this point. 
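			// Concretely (an editorial sketch of the rule above): an RS sent
			// from the unspecified address (::) carries no source link-layer
			// address option, while an RS sent from a link-local address such
			// as fe80::1 includes the NIC's link-layer address, provided it is
			// a valid unicast Ethernet address.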
var optsSerializer header.NDPOptionsSerializer linkAddress := ndp.ep.nic.LinkAddress() if localAddr != header.IPv6Any && header.IsValidUnicastEthernetAddress(linkAddress) { optsSerializer = header.NDPOptionsSerializer{ header.NDPSourceLinkLayerAddressOption(linkAddress), } } payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize + optsSerializer.Length() icmpData := header.ICMPv6(buffer.NewView(payloadSize)) icmpData.SetType(header.ICMPv6RouterSolicit) rs := header.NDPRouterSolicit(icmpData.MessageBody()) rs.Options().Serialize(optsSerializer) icmpData.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: icmpData, Src: localAddr, Dst: header.IPv6AllRoutersLinkLocalMulticastAddress, })) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(ndp.ep.MaxHeaderLength()), Data: buffer.View(icmpData).ToVectorisedView(), }) sent := ndp.ep.stats.icmp.packetsSent if err := addIPHeader(localAddr, header.IPv6AllRoutersLinkLocalMulticastAddress, pkt, stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, }, nil /* extensionHeaders */); err != nil { panic(fmt.Sprintf("failed to add IP header: %s", err)) } if err := ndp.ep.nic.WritePacketToRemote(header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllRoutersLinkLocalMulticastAddress), ProtocolNumber, pkt); err != nil { sent.dropped.Increment() // Don't send any more messages if we had an error. remaining = 0 } else { sent.routerSolicit.Increment() remaining-- } ndp.ep.mu.Lock() defer ndp.ep.mu.Unlock() if done { // Router solicitation was stopped. return } if remaining == 0 { // We are done soliciting routers. ndp.stopSolicitingRouters() return } ndp.rtrSolicitTimer.timer.Reset(ndp.configs.RtrSolicitationInterval) }), } } // forwardingChanged handles a change in forwarding configuration. // // If transitioning to a host, router solicitation will be started. Otherwise, // router solicitation will be stopped if NDP is not configured to handle RAs // as a router. // // Precondition: ndp.ep.mu must be locked. func (ndp *ndpState) forwardingChanged(forwarding bool) { if forwarding { if ndp.configs.HandleRAs.enabled(forwarding) { return } ndp.stopSolicitingRouters() return } // Solicit routers when transitioning to a host. // // If the endpoint is not currently enabled, routers will be solicited when // the endpoint becomes enabled (if it is still a host). if ndp.ep.Enabled() { ndp.startSolicitingRouters() } } // stopSolicitingRouters stops soliciting routers. If routers are not currently // being solicited, this function does nothing. // // The IPv6 endpoint that ndp belongs to MUST be locked. func (ndp *ndpState) stopSolicitingRouters() { if ndp.rtrSolicitTimer.timer == nil { // Nothing to do. 
return } ndp.rtrSolicitTimer.timer.Stop() *ndp.rtrSolicitTimer.done = true ndp.rtrSolicitTimer = timer{} } func (ndp *ndpState) init(ep *endpoint, dadOptions ip.DADOptions) { if ndp.offLinkRoutes != nil { panic("attempted to initialize NDP state twice") } ndp.ep = ep ndp.configs = ep.protocol.options.NDPConfigs ndp.dad.Init(&ndp.ep.mu, ep.protocol.options.DADConfigs, dadOptions) ndp.offLinkRoutes = make(map[offLinkRoute]offLinkRouteState) ndp.onLinkPrefixes = make(map[tcpip.Subnet]onLinkPrefixState) ndp.slaacPrefixes = make(map[tcpip.Subnet]slaacPrefixState) header.InitialTempIID(ndp.temporaryIIDHistory[:], ndp.ep.protocol.options.TempIIDSeed, ndp.ep.nic.ID()) ndp.temporaryAddressDesyncFactor = time.Duration(ep.protocol.stack.Rand().Int63n(int64(MaxDesyncFactor))) } func (ndp *ndpState) SendDADMessage(addr tcpip.Address, nonce []byte) tcpip.Error { snmc := header.SolicitedNodeAddr(addr) return ndp.ep.sendNDPNS(header.IPv6Any, snmc, addr, header.EthernetAddressFromMulticastIPv6Address(snmc), header.NDPOptionsSerializer{ header.NDPNonceOption(nonce), }) } func (e *endpoint) sendNDPNS(srcAddr, dstAddr, targetAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, opts header.NDPOptionsSerializer) tcpip.Error { icmp := header.ICMPv6(buffer.NewView(header.ICMPv6NeighborSolicitMinimumSize + opts.Length())) icmp.SetType(header.ICMPv6NeighborSolicit) ns := header.NDPNeighborSolicit(icmp.MessageBody()) ns.SetTargetAddress(targetAddr) ns.Options().Serialize(opts) icmp.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: icmp, Src: srcAddr, Dst: dstAddr, })) pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(e.MaxHeaderLength()), Data: buffer.View(icmp).ToVectorisedView(), }) if err := addIPHeader(srcAddr, dstAddr, pkt, stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, }, nil /* extensionHeaders */); err != nil { panic(fmt.Sprintf("failed to add IP header: %s", err)) } sent := e.stats.icmp.packetsSent err := e.nic.WritePacketToRemote(remoteLinkAddr, ProtocolNumber, pkt) if err != nil { sent.dropped.Increment() } else { sent.neighborSolicit.Increment() } return err }
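// The rtrSolicitTimer above pairs a heap-allocated done flag with the
// endpoint lock so that a callback which has already been dequeued by the
// runtime can detect a concurrent stopSolicitingRouters and bail out. A
// minimal, self-contained sketch of that pattern (plain Go; cancellableTimer
// and the stop sequence are illustrative names, not gVisor API):
package main

import (
	"fmt"
	"sync"
	"time"
)

type cancellableTimer struct {
	// done is heap-allocated so a stopped callback can still observe it
	// after the owning struct has been overwritten.
	done  *bool
	timer *time.Timer
}

func main() {
	var mu sync.Mutex // Stands in for ndp.ep.mu.

	done := false
	t := cancellableTimer{
		done: &done,
		timer: time.AfterFunc(10*time.Millisecond, func() {
			mu.Lock()
			defer mu.Unlock()
			if done {
				// stop won the race; skip the work.
				return
			}
			fmt.Println("solicitation fired")
		}),
	}

	// This mirrors stopSolicitingRouters: under the lock, stop the timer
	// and set the flag for any callback already in flight.
	mu.Lock()
	t.timer.Stop()
	*t.done = true
	mu.Unlock()

	time.Sleep(50 * time.Millisecond) // Nothing prints: the timer was cancelled.
}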
182 26 1 25 1 24 24 2 2 2 2 2 2 2 1 3 1 2 2 1 1 3 1 2 1 1 5 5 5 5 5 5 5 1 145 145 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 // Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package proc import ( "sort" "strconv" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) // subtasksInode represents the inode for /proc/[pid]/task/ directory. // // +stateify savable type subtasksInode struct { implStatFS kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink kernfs.InodeTemporary kernfs.OrderedChildren subtasksInodeRefs locks vfs.FileLocks fs *filesystem task *kernel.Task pidns *kernel.PIDNamespace cgroupControllers map[string]string } var _ kernfs.Inode = (*subtasksInode)(nil) func (fs *filesystem) newSubtasks(ctx context.Context, task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) kernfs.Inode { subInode := &subtasksInode{ fs: fs, task: task, pidns: pidns, cgroupControllers: cgroupControllers, } // Note: credentials are overridden by taskOwnedInode. subInode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) subInode.InitRefs() inode := &taskOwnedInode{Inode: subInode, owner: task} return inode } // Lookup implements kernfs.inodeDirectory.Lookup. func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { tid, err := strconv.ParseUint(name, 10, 32) if err != nil { return nil, syserror.ENOENT } subTask := i.pidns.TaskWithID(kernel.ThreadID(tid)) if subTask == nil { return nil, syserror.ENOENT } if subTask.ThreadGroup() != i.task.ThreadGroup() { return nil, syserror.ENOENT } return i.fs.newTaskInode(ctx, subTask, i.pidns, false, i.cgroupControllers) } // IterDirents implements kernfs.inodeDirectory.IterDirents. 
func (i *subtasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { tasks := i.task.ThreadGroup().MemberIDs(i.pidns) if len(tasks) == 0 { return offset, syserror.ENOENT } if relOffset >= int64(len(tasks)) { return offset, nil } tids := make([]int, 0, len(tasks)) for _, tid := range tasks { tids = append(tids, int(tid)) } sort.Ints(tids) for _, tid := range tids[relOffset:] { dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(tid), 10), Type: linux.DT_DIR, Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { return offset, err } offset++ } return offset, nil } // +stateify savable type subtasksFD struct { kernfs.GenericDirectoryFD task *kernel.Task } func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { if fd.task.ExitState() >= kernel.TaskExitZombie { return syserror.ENOENT } return fd.GenericDirectoryFD.IterDirents(ctx, cb) } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { if fd.task.ExitState() >= kernel.TaskExitZombie { return 0, syserror.ENOENT } return fd.GenericDirectoryFD.Seek(ctx, offset, whence) } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { if fd.task.ExitState() >= kernel.TaskExitZombie { return linux.Statx{}, syserror.ENOENT } return fd.GenericDirectoryFD.Stat(ctx, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { if fd.task.ExitState() >= kernel.TaskExitZombie { return syserror.ENOENT } return fd.GenericDirectoryFD.SetStat(ctx, opts) } // Open implements kernfs.Inode.Open. func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &subtasksFD{task: i.task} if err := fd.Init(&i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }); err != nil { return nil, err } if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } return fd.VFSFileDescription(), nil } // Stat implements kernfs.Inode.Stat. func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts) if err != nil { return linux.Statx{}, err } if opts.Mask&linux.STATX_NLINK != 0 { stat.Nlink += uint32(i.task.ThreadGroup().Count()) } return stat, nil } // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // DecRef implements kernfs.Inode.DecRef. func (i *subtasksInode) DecRef(ctx context.Context) { i.subtasksInodeRefs.DecRef(func() { i.Destroy(ctx) }) }
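// The offset bookkeeping in subtasksInode.IterDirents above can be seen in
// isolation: TIDs are sorted for a stable listing, and relOffset skips
// entries a previous call already emitted. A hypothetical, self-contained
// sketch (listFrom is an illustrative name, not tree code):
package main

import (
	"fmt"
	"sort"
)

func listFrom(tids []int, offset, relOffset int64) int64 {
	sort.Ints(tids) // Stable ordering across calls.
	if relOffset >= int64(len(tids)) {
		return offset // Everything was already emitted.
	}
	for _, tid := range tids[relOffset:] {
		fmt.Printf("dirent %d at offset %d\n", tid, offset)
		offset++
	}
	return offset
}

func main() {
	// A first call emits all three entries; a resumed call with
	// relOffset 2 emits only the last one.
	next := listFrom([]int{30, 10, 20}, 0, 0)
	listFrom([]int{30, 10, 20}, next, 2)
}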
1629 1634 1 1628 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" ) // copyTimespecIn copies a Timespec from the untrusted app range to the kernel. func copyTimespecIn(t *kernel.Task, addr hostarch.Addr) (linux.Timespec, error) { switch t.Arch().Width() { case 8: ts := linux.Timespec{} in := t.CopyScratchBuffer(16) _, err := t.CopyInBytes(addr, in) if err != nil { return ts, err } ts.Sec = int64(hostarch.ByteOrder.Uint64(in[0:])) ts.Nsec = int64(hostarch.ByteOrder.Uint64(in[8:])) return ts, nil default: return linux.Timespec{}, syserror.ENOSYS } } // copyTimespecOut copies a Timespec to the untrusted app range. func copyTimespecOut(t *kernel.Task, addr hostarch.Addr, ts *linux.Timespec) error { switch t.Arch().Width() { case 8: out := t.CopyScratchBuffer(16) hostarch.ByteOrder.PutUint64(out[0:], uint64(ts.Sec)) hostarch.ByteOrder.PutUint64(out[8:], uint64(ts.Nsec)) _, err := t.CopyOutBytes(addr, out) return err default: return syserror.ENOSYS } } // copyTimevalIn copies a Timeval from the untrusted app range to the kernel. func copyTimevalIn(t *kernel.Task, addr hostarch.Addr) (linux.Timeval, error) { switch t.Arch().Width() { case 8: tv := linux.Timeval{} in := t.CopyScratchBuffer(16) _, err := t.CopyInBytes(addr, in) if err != nil { return tv, err } tv.Sec = int64(hostarch.ByteOrder.Uint64(in[0:])) tv.Usec = int64(hostarch.ByteOrder.Uint64(in[8:])) return tv, nil default: return linux.Timeval{}, syserror.ENOSYS } } // copyTimevalOut copies a Timeval to the untrusted app range. func copyTimevalOut(t *kernel.Task, addr hostarch.Addr, tv *linux.Timeval) error { switch t.Arch().Width() { case 8: out := t.CopyScratchBuffer(16) hostarch.ByteOrder.PutUint64(out[0:], uint64(tv.Sec)) hostarch.ByteOrder.PutUint64(out[8:], uint64(tv.Usec)) _, err := t.CopyOutBytes(addr, out) return err default: return syserror.ENOSYS } } // copyTimespecInToDuration copies a Timespec from the untrusted app range, // validates it and converts it to a Duration. // // If the Timespec is larger than what can be represented in a Duration, the // returned value is the maximum that Duration will allow. // // If timespecAddr is NULL, the returned value is negative. func copyTimespecInToDuration(t *kernel.Task, timespecAddr hostarch.Addr) (time.Duration, error) { // Use a negative Duration to indicate "no timeout". 
	timeout := time.Duration(-1)
	if timespecAddr != 0 {
		timespec, err := copyTimespecIn(t, timespecAddr)
		if err != nil {
			return 0, err
		}
		if !timespec.Valid() {
			return 0, linuxerr.EINVAL
		}
		timeout = time.Duration(timespec.ToNsecCapped())
	}
	return timeout, nil
}
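// A minimal sketch (outside gVisor, with semantics assumed to match
// Timespec.ToNsecCapped) of the saturating conversion that keeps an
// oversized timespec from overflowing a Duration:
package main

import (
	"fmt"
	"math"
	"time"
)

func toNsecCapped(sec, nsec int64) time.Duration {
	const maxSec = math.MaxInt64 / int64(time.Second)
	if sec > maxSec {
		return time.Duration(math.MaxInt64)
	}
	d := time.Duration(sec)*time.Second + time.Duration(nsec)
	if d < 0 { // The addition overflowed.
		return time.Duration(math.MaxInt64)
	}
	return d
}

func main() {
	fmt.Println(toNsecCapped(1, 500000000))       // 1.5s
	fmt.Println(toNsecCapped(math.MaxInt64/2, 0)) // Capped at the maximum.
}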
3 3 3 3 3 3 1957 1961 1956 346 1106 1006 416 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package arch provides abstractions around architecture-dependent details, // such as syscall calling conventions, native types, etc. package arch import ( "fmt" "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" "gvisor.dev/gvisor/pkg/sentry/limits" ) // Arch describes an architecture. type Arch int const ( // AMD64 is the x86-64 architecture. AMD64 Arch = iota // ARM64 is the aarch64 architecture. ARM64 ) // String implements fmt.Stringer. func (a Arch) String() string { switch a { case AMD64: return "amd64" case ARM64: return "arm64" default: return fmt.Sprintf("Arch(%d)", a) } } // Context provides architecture-dependent information for a specific thread. // // NOTE(b/34169503): Currently we use uintptr here to refer to a generic native // register value. While this will work for the foreseeable future, it isn't // strictly correct. We may want to create some abstraction that makes this // more clear or enables us to store values of arbitrary widths. This is // particularly true for RegisterMap(). type Context interface { // Arch returns the architecture for this Context. Arch() Arch // Native converts a generic type to a native value. // // Because the architecture is not specified here, we may be dealing // with return values of varying sizes (for example ARCH_GETFS). This // is a simple utility function to convert to the native size in these // cases, and then we can CopyOut. 
	Native(val uintptr) marshal.Marshallable

	// Value converts a native type back to a generic value.
	// Once a value has been converted to native via the above call, it
	// can be converted back here.
	Value(val marshal.Marshallable) uintptr

	// Width returns the number of bytes for a native value.
	Width() uint

	// Fork creates a clone of the context.
	Fork() Context

	// SyscallNo returns the syscall number.
	SyscallNo() uintptr

	// SyscallSaveOrig saves the original syscall register value.
	SyscallSaveOrig()

	// SyscallArgs returns the syscall arguments in an array.
	SyscallArgs() SyscallArguments

	// Return returns the return value for a system call.
	Return() uintptr

	// SetReturn sets the return value for a system call.
	SetReturn(value uintptr)

	// RestartSyscall reverses over the current syscall instruction, such that
	// when the application resumes execution the syscall will be re-attempted.
	RestartSyscall()

	// RestartSyscallWithRestartBlock reverses over the current syscall
	// instruction and overwrites the current syscall number with that of
	// restart_syscall(2). This causes the application to restart the current
	// syscall with a custom function when execution resumes.
	RestartSyscallWithRestartBlock()

	// IP returns the current instruction pointer.
	IP() uintptr

	// SetIP sets the current instruction pointer.
	SetIP(value uintptr)

	// Stack returns the current stack pointer.
	Stack() uintptr

	// SetStack sets the current stack pointer.
	SetStack(value uintptr)

	// TLS returns the current TLS pointer.
	TLS() uintptr

	// SetTLS sets the current TLS pointer. Returns false if value is invalid.
	SetTLS(value uintptr) bool

	// SetOldRSeqInterruptedIP sets the register that contains the old IP
	// when an "old rseq" restartable sequence is interrupted.
	SetOldRSeqInterruptedIP(value uintptr)

	// StateData returns a pointer to underlying architecture state.
	StateData() *State

	// RegisterMap returns a map of all registers.
	RegisterMap() (map[string]uintptr, error)

	// SignalSetup modifies the context in preparation for handling the
	// given signal.
	//
	// st is the stack where the signal handler frame should be
	// constructed.
	//
	// act is the SigAction that specifies how this signal is being
	// handled.
	//
	// info is the SignalInfo of the signal being delivered.
	//
	// alt is the alternate signal stack (even if the alternate signal
	// stack is not going to be used).
	//
	// sigset is the signal mask before entering the signal handler.
	SignalSetup(st *Stack, act *linux.SigAction, info *linux.SignalInfo, alt *linux.SignalStack, sigset linux.SignalSet) error

	// SignalRestore restores context after returning from a signal
	// handler.
	//
	// st is the current thread stack.
	//
	// rt is true if SignalRestore is being entered from rt_sigreturn and
	// false if SignalRestore is being entered from sigreturn.
	//
	// SignalRestore returns the thread's new signal mask.
	SignalRestore(st *Stack, rt bool) (linux.SignalSet, linux.SignalStack, error)

	// CPUIDEmulate emulates a CPUID instruction according to current register state.
	CPUIDEmulate(l log.Logger)

	// SingleStep returns true if single stepping is enabled.
	SingleStep() bool

	// SetSingleStep enables single stepping.
	SetSingleStep()

	// ClearSingleStep disables single stepping.
	ClearSingleStep()

	// FloatingPointData will be passed to underlying save routines.
	FloatingPointData() *fpu.State

	// NewMmapLayout returns a layout for a new MM, where MinAddr for the
	// returned layout must be no lower than min, and MaxAddr for the returned
	// layout must be no higher than max.
Repeated calls to NewMmapLayout may // return different layouts. NewMmapLayout(min, max hostarch.Addr, limits *limits.LimitSet) (MmapLayout, error) // PIELoadAddress returns a preferred load address for a // position-independent executable within l. PIELoadAddress(l MmapLayout) hostarch.Addr // FeatureSet returns the FeatureSet in use in this context. FeatureSet() *cpuid.FeatureSet // Hack around our package dependences being too broken to support the // equivalent of arch_ptrace(): // PtracePeekUser implements ptrace(PTRACE_PEEKUSR). PtracePeekUser(addr uintptr) (marshal.Marshallable, error) // PtracePokeUser implements ptrace(PTRACE_POKEUSR). PtracePokeUser(addr, data uintptr) error // PtraceGetRegs implements ptrace(PTRACE_GETREGS) by writing the // general-purpose registers represented by this Context to dst and // returning the number of bytes written. PtraceGetRegs(dst io.Writer) (int, error) // PtraceSetRegs implements ptrace(PTRACE_SETREGS) by reading // general-purpose registers from src into this Context and returning the // number of bytes read. PtraceSetRegs(src io.Reader) (int, error) // PtraceGetRegSet implements ptrace(PTRACE_GETREGSET) by writing the // register set given by architecture-defined value regset from this // Context to dst and returning the number of bytes written, which must be // less than or equal to maxlen. PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) // PtraceSetRegSet implements ptrace(PTRACE_SETREGSET) by reading the // register set given by architecture-defined value regset from src and // returning the number of bytes read, which must be less than or equal to // maxlen. PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) // FullRestore returns 'true' if all CPU registers must be restored // when switching to the untrusted application. Typically a task enters // and leaves the kernel via a system call. Platform.Switch() may // optimize for this by not saving/restoring all registers if allowed // by the ABI. For e.g. the amd64 ABI specifies that syscall clobbers // %rcx and %r11. If FullRestore returns true then these optimizations // must be disabled and all registers restored. FullRestore() bool } // MmapDirection is a search direction for mmaps. type MmapDirection int const ( // MmapBottomUp instructs mmap to prefer lower addresses. MmapBottomUp MmapDirection = iota // MmapTopDown instructs mmap to prefer higher addresses. MmapTopDown ) // MmapLayout defines the layout of the user address space for a particular // MemoryManager. // // Note that "highest address" below is always exclusive. // // +stateify savable type MmapLayout struct { // MinAddr is the lowest mappable address. MinAddr hostarch.Addr // MaxAddr is the highest mappable address. MaxAddr hostarch.Addr // BottomUpBase is the lowest address that may be returned for a // MmapBottomUp mmap. BottomUpBase hostarch.Addr // TopDownBase is the highest address that may be returned for a // MmapTopDown mmap. TopDownBase hostarch.Addr // DefaultDirection is the direction for most non-fixed mmaps in this // layout. DefaultDirection MmapDirection // MaxStackRand is the maximum randomization to apply to stack // allocations to maintain a proper gap between the stack and // TopDownBase. MaxStackRand uint64 } // Valid returns true if this layout is valid. 
func (m *MmapLayout) Valid() bool { if m.MinAddr > m.MaxAddr { return false } if m.BottomUpBase < m.MinAddr { return false } if m.BottomUpBase > m.MaxAddr { return false } if m.TopDownBase < m.MinAddr { return false } if m.TopDownBase > m.MaxAddr { return false } return true } // SyscallArgument is an argument supplied to a syscall implementation. The // methods used to access the arguments are named after the ***C type name*** and // they convert to the closest Go type available. For example, Int() refers to a // 32-bit signed integer argument represented in Go as an int32. // // Using the accessor methods guarantees that the conversion between types is // correct, taking into account size and signedness (i.e., zero-extension vs // signed-extension). type SyscallArgument struct { // Prefer to use accessor methods instead of 'Value' directly. Value uintptr } // SyscallArguments represents the set of arguments passed to a syscall. type SyscallArguments [6]SyscallArgument // Pointer returns the hostarch.Addr representation of a pointer argument. func (a SyscallArgument) Pointer() hostarch.Addr { return hostarch.Addr(a.Value) } // Int returns the int32 representation of a 32-bit signed integer argument. func (a SyscallArgument) Int() int32 { return int32(a.Value) } // Uint returns the uint32 representation of a 32-bit unsigned integer argument. func (a SyscallArgument) Uint() uint32 { return uint32(a.Value) } // Int64 returns the int64 representation of a 64-bit signed integer argument. func (a SyscallArgument) Int64() int64 { return int64(a.Value) } // Uint64 returns the uint64 representation of a 64-bit unsigned integer argument. func (a SyscallArgument) Uint64() uint64 { return uint64(a.Value) } // SizeT returns the uint representation of a size_t argument. func (a SyscallArgument) SizeT() uint { return uint(a.Value) } // ModeT returns the int representation of a mode_t argument. func (a SyscallArgument) ModeT() uint { return uint(uint16(a.Value)) }
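// Why the typed accessors above matter (a hypothetical demonstration outside
// gVisor): a negative 32-bit argument arrives zero-extended in the 64-bit
// register value, so the int32 conversion used by Int() recovers -1, while a
// naive 64-bit reading yields 4294967295.
package main

import "fmt"

func main() {
	var value uintptr = 0xFFFFFFFF // A 32-bit -1 as seen in a register.
	fmt.Println(int32(value))      // -1, matching Int().
	fmt.Println(int64(value))      // 4294967295, the naive reading.
}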
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package flipcall

import (
	"reflect"
	"unsafe"

	"gvisor.dev/gvisor/pkg/sync"
)

// Packets consist of a 16-byte header followed by an arbitrarily-sized
// datagram. The header consists of:
//
// - A 4-byte native-endian connection state.
//
// - A 4-byte native-endian datagram length in bytes.
//
// - 8 reserved bytes.
const (
	// PacketHeaderBytes is the size of a flipcall packet header in bytes. The
	// maximum datagram size supported by a flipcall connection is equal to the
	// length of the packet window minus PacketHeaderBytes.
	//
	// PacketHeaderBytes is exported to support its use in constant
	// expressions. Non-constant expressions may prefer to use
	// PacketWindowLengthForDataCap().
	PacketHeaderBytes = 16
)

func (ep *Endpoint) connState() *uint32 {
	return (*uint32)(unsafe.Pointer(ep.packet))
}

func (ep *Endpoint) dataLen() *uint32 {
	return (*uint32)(unsafe.Pointer(ep.packet + 4))
}

// Data returns the datagram part of ep's packet window as a byte slice.
//
// Note that the packet window is shared with the potentially-untrusted peer
// Endpoint, which may concurrently mutate the contents of the packet window.
// Thus:
//
// - Readers must not assume that two reads of the same byte in Data() will
// return the same result. In other words, readers should read any given byte
// in Data() at most once.
//
// - Writers must not assume that they will read back the same data that they
// have written. In other words, writers should avoid reading from Data() at
// all.
func (ep *Endpoint) Data() (bs []byte) {
	bshdr := (*reflect.SliceHeader)(unsafe.Pointer(&bs))
	bshdr.Data = ep.packet + PacketHeaderBytes
	bshdr.Len = int(ep.dataCap)
	bshdr.Cap = int(ep.dataCap)
	return
}

// ioSync is a dummy variable used to indicate synchronization to the Go race
// detector. Compare syscall.ioSync.
var ioSync int64

func raceBecomeActive() {
	if sync.RaceEnabled {
		sync.RaceAcquire(unsafe.Pointer(&ioSync))
	}
}

func raceBecomeInactive() {
	if sync.RaceEnabled {
		sync.RaceReleaseMerge(unsafe.Pointer(&ioSync))
	}
}
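// The header layout documented above, illustrated without the shared packet
// window. A hedged sketch over an ordinary byte slice: the real accessors use
// native-endian loads through unsafe pointers, so little-endian is an
// assumption made here purely for the demonstration.
package main

import (
	"encoding/binary"
	"fmt"
)

const packetHeaderBytes = 16 // Mirrors PacketHeaderBytes.

func main() {
	window := make([]byte, packetHeaderBytes+8)

	// Connection state at offset 0 and datagram length at offset 4,
	// matching connState() and dataLen().
	binary.LittleEndian.PutUint32(window[0:4], 1)
	binary.LittleEndian.PutUint32(window[4:8], 8)

	state := binary.LittleEndian.Uint32(window[0:4])
	dataLen := binary.LittleEndian.Uint32(window[4:8])
	fmt.Printf("state=%d datagram=%v\n", state,
		window[packetHeaderBytes:packetHeaderBytes+int(dataLen)])
}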
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

const (
	// IFNAMSIZ is the size of the name field for IFReq.
	IFNAMSIZ = 16
)

// IFReq is an interface request.
//
// +marshal
type IFReq struct {
	// IFName is an encoded name, normally null-terminated. This should be
	// accessed via the Name and SetName functions.
	IFName [IFNAMSIZ]byte

	// Data is the union of the following structures:
	//
	//	struct sockaddr ifr_addr;
	//	struct sockaddr ifr_dstaddr;
	//	struct sockaddr ifr_broadaddr;
	//	struct sockaddr ifr_netmask;
	//	struct sockaddr ifr_hwaddr;
	//	short           ifr_flags;
	//	int             ifr_ifindex;
	//	int             ifr_metric;
	//	int             ifr_mtu;
	//	struct ifmap    ifr_map;
	//	char            ifr_slave[IFNAMSIZ];
	//	char            ifr_newname[IFNAMSIZ];
	//	char           *ifr_data;
	Data [24]byte
}

// Name returns the name.
func (ifr *IFReq) Name() string {
	for c := 0; c < len(ifr.IFName); c++ {
		if ifr.IFName[c] == 0 {
			return string(ifr.IFName[:c])
		}
	}
	return string(ifr.IFName[:])
}

// SetName sets the name.
func (ifr *IFReq) SetName(name string) {
	n := copy(ifr.IFName[:], []byte(name))
	for i := n; i < len(ifr.IFName); i++ {
		ifr.IFName[i] = 0
	}
}

// SizeOfIFReq is the binary size of an IFReq struct (40 bytes).
var SizeOfIFReq = (*IFReq)(nil).SizeBytes()

// IFMap contains interface hardware parameters.
type IFMap struct {
	MemStart uint64
	MemEnd   uint64
	BaseAddr int16
	IRQ      byte
	DMA      byte
	Port     byte
	_        [3]byte // Pad to sizeof(struct ifmap).
}

// IFConf is used to return a list of interfaces and their addresses. See
// netdevice(7) and struct ifconf for more detail on its use.
//
// +marshal
type IFConf struct {
	Len int32
	_   [4]byte // Pad to sizeof(struct ifconf).
	Ptr uint64
}
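// A hypothetical round-trip of the IFName encoding handled by Name and
// SetName above. ifReq is a local mirror of the struct for illustration, not
// the marshalled gVisor type: SetName copies the name and zero-fills the rest
// of the fixed field, so Name stops at the first NUL byte.
package main

import "fmt"

const ifNameSize = 16 // Mirrors IFNAMSIZ.

type ifReq struct {
	IFName [ifNameSize]byte
	Data   [24]byte
}

func (ifr *ifReq) SetName(name string) {
	n := copy(ifr.IFName[:], name)
	for i := n; i < len(ifr.IFName); i++ {
		ifr.IFName[i] = 0
	}
}

func (ifr *ifReq) Name() string {
	for c := 0; c < len(ifr.IFName); c++ {
		if ifr.IFName[c] == 0 {
			return string(ifr.IFName[:c])
		}
	}
	return string(ifr.IFName[:])
}

func main() {
	var ifr ifReq
	ifr.SetName("tun0")
	fmt.Println(ifr.Name()) // Prints "tun0".
}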
5 2 4 4 4 3 4 3 4 1 3 4 4 3 2 1 1 1 2 1 2 2 97 96 97 2 1 1 96 95 1 93 93 1 93 94 93 1 94 3 1 2 1 1 1 1 1 1 1 1 3 2 1 3 12 2 1 20 20 1 1 1 1 1 32 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 // Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tun import ( "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/channel" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) const ( // drivers/net/tun.c:tun_net_init() defaultDevMtu = 1500 // Queue length for outbound packet, arriving at fd side for read. Overflow // causes packet drops. gVisor implementation-specific. defaultDevOutQueueLen = 1024 ) var zeroMAC [6]byte // Device is an opened /dev/net/tun device. // // +stateify savable type Device struct { waiter.Queue mu sync.RWMutex `state:"nosave"` endpoint *tunEndpoint notifyHandle *channel.NotificationHandle flags Flags } // Flags set properties of a Device type Flags struct { TUN bool TAP bool NoPacketInfo bool } // beforeSave is invoked by stateify. func (d *Device) beforeSave() { d.mu.Lock() defer d.mu.Unlock() // TODO(b/110961832): Restore the device to stack. At this moment, the stack // is not savable. if d.endpoint != nil { panic("/dev/net/tun does not support save/restore when a device is associated with it.") } } // Release implements fs.FileOperations.Release. 
func (d *Device) Release(ctx context.Context) { d.mu.Lock() defer d.mu.Unlock() // Decrease refcount if there is an endpoint associated with this file. if d.endpoint != nil { d.endpoint.RemoveNotify(d.notifyHandle) d.endpoint.DecRef(ctx) d.endpoint = nil } } // SetIff services TUNSETIFF ioctl(2) request. func (d *Device) SetIff(s *stack.Stack, name string, flags Flags) error { d.mu.Lock() defer d.mu.Unlock() if d.endpoint != nil { return linuxerr.EINVAL } // Input validation. if flags.TAP && flags.TUN || !flags.TAP && !flags.TUN { return linuxerr.EINVAL } prefix := "tun" if flags.TAP { prefix = "tap" } linkCaps := stack.CapabilityNone if flags.TAP { linkCaps |= stack.CapabilityResolutionRequired } endpoint, err := attachOrCreateNIC(s, name, prefix, linkCaps) if err != nil { return linuxerr.EINVAL } d.endpoint = endpoint d.notifyHandle = d.endpoint.AddNotify(d) d.flags = flags return nil } func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkEndpointCapabilities) (*tunEndpoint, error) { for { // 1. Try to attach to an existing NIC. if name != "" { if linkEP := s.GetLinkEndpointByName(name); linkEP != nil { endpoint, ok := linkEP.(*tunEndpoint) if !ok { // Not a NIC created by tun device. return nil, linuxerr.EOPNOTSUPP } if !endpoint.TryIncRef() { // Race detected: NIC got deleted in between. continue } return endpoint, nil } } // 2. Creating a new NIC. id := tcpip.NICID(s.UniqueID()) endpoint := &tunEndpoint{ Endpoint: channel.New(defaultDevOutQueueLen, defaultDevMtu, ""), stack: s, nicID: id, name: name, isTap: prefix == "tap", } endpoint.InitRefs() endpoint.Endpoint.LinkEPCapabilities = linkCaps if endpoint.name == "" { endpoint.name = fmt.Sprintf("%s%d", prefix, id) } err := s.CreateNICWithOptions(endpoint.nicID, endpoint, stack.NICOptions{ Name: endpoint.name, }) switch err.(type) { case nil: return endpoint, nil case *tcpip.ErrDuplicateNICID: // Race detected: A NIC has been created in between. continue default: return nil, linuxerr.EINVAL } } } // Write inject one inbound packet to the network interface. func (d *Device) Write(data []byte) (int64, error) { d.mu.RLock() endpoint := d.endpoint d.mu.RUnlock() if endpoint == nil { return 0, linuxerr.EBADFD } if !endpoint.IsAttached() { return 0, syserror.EIO } dataLen := int64(len(data)) // Packet information. var pktInfoHdr PacketInfoHeader if !d.flags.NoPacketInfo { if len(data) < PacketInfoHeaderSize { // Ignore bad packet. return dataLen, nil } pktInfoHdr = PacketInfoHeader(data[:PacketInfoHeaderSize]) data = data[PacketInfoHeaderSize:] } // Ethernet header (TAP only). var ethHdr header.Ethernet if d.flags.TAP { if len(data) < header.EthernetMinimumSize { // Ignore bad packet. return dataLen, nil } ethHdr = header.Ethernet(data[:header.EthernetMinimumSize]) data = data[header.EthernetMinimumSize:] } // Try to determine network protocol number, default zero. var protocol tcpip.NetworkProtocolNumber switch { case pktInfoHdr != nil: protocol = pktInfoHdr.Protocol() case ethHdr != nil: protocol = ethHdr.Type() case d.flags.TUN: // TUN interface with IFF_NO_PI enabled, thus // we need to determine protocol from version field version := data[0] >> 4 if version == 4 { protocol = header.IPv4ProtocolNumber } else if version == 6 { protocol = header.IPv6ProtocolNumber } } // Try to determine remote link address, default zero. 
var remote tcpip.LinkAddress switch { case ethHdr != nil: remote = ethHdr.SourceAddress() default: remote = tcpip.LinkAddress(zeroMAC[:]) } pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: len(ethHdr), Data: buffer.View(data).ToVectorisedView(), }) copy(pkt.LinkHeader().Push(len(ethHdr)), ethHdr) endpoint.InjectLinkAddr(protocol, remote, pkt) return dataLen, nil } // Read reads one outgoing packet from the network interface. func (d *Device) Read() ([]byte, error) { d.mu.RLock() endpoint := d.endpoint d.mu.RUnlock() if endpoint == nil { return nil, linuxerr.EBADFD } for { info, ok := endpoint.Read() if !ok { return nil, syserror.ErrWouldBlock } v, ok := d.encodePkt(&info) if !ok { // Ignore unsupported packet. continue } return v, nil } } // encodePkt encodes packet for fd side. func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) { var vv buffer.VectorisedView // Packet information. if !d.flags.NoPacketInfo { hdr := make(PacketInfoHeader, PacketInfoHeaderSize) hdr.Encode(&PacketInfoFields{ Protocol: info.Proto, }) vv.AppendView(buffer.View(hdr)) } // Ethernet header (TAP only). if d.flags.TAP { // Add ethernet header if not provided. if info.Pkt.LinkHeader().View().IsEmpty() { d.endpoint.AddHeader(info.Route.LocalLinkAddress, info.Route.RemoteLinkAddress, info.Proto, info.Pkt) } vv.AppendView(info.Pkt.LinkHeader().View()) } // Append upper headers. vv.AppendView(info.Pkt.NetworkHeader().View()) vv.AppendView(info.Pkt.TransportHeader().View()) // Append data payload. vv.Append(info.Pkt.Data().ExtractVV()) return vv.ToView(), true } // Name returns the name of the attached network interface. Empty string if // unattached. func (d *Device) Name() string { d.mu.RLock() defer d.mu.RUnlock() if d.endpoint != nil { return d.endpoint.name } return "" } // Flags returns the flags set for d. Zero value if unset. func (d *Device) Flags() Flags { d.mu.RLock() defer d.mu.RUnlock() return d.flags } // Readiness implements watier.Waitable.Readiness. func (d *Device) Readiness(mask waiter.EventMask) waiter.EventMask { if mask&waiter.ReadableEvents != 0 { d.mu.RLock() endpoint := d.endpoint d.mu.RUnlock() if endpoint != nil && endpoint.NumQueued() == 0 { mask &= ^waiter.ReadableEvents } } return mask & (waiter.ReadableEvents | waiter.WritableEvents) } // WriteNotify implements channel.Notification.WriteNotify. func (d *Device) WriteNotify() { d.Notify(waiter.ReadableEvents) } // tunEndpoint is the link endpoint for the NIC created by the tun device. // // It is ref-counted as multiple opening files can attach to the same NIC. // The last owner is responsible for deleting the NIC. type tunEndpoint struct { tunEndpointRefs *channel.Endpoint stack *stack.Stack nicID tcpip.NICID name string isTap bool } // DecRef decrements refcount of e, removing NIC if it reaches 0. func (e *tunEndpoint) DecRef(ctx context.Context) { e.tunEndpointRefs.DecRef(func() { e.stack.RemoveNIC(e.nicID) }) } // ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. func (e *tunEndpoint) ARPHardwareType() header.ARPHardwareType { if e.isTap { return header.ARPHardwareEther } return header.ARPHardwareNone } // AddHeader implements stack.LinkEndpoint.AddHeader. 
func (e *tunEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
	if !e.isTap {
		return
	}
	eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize))
	hdr := &header.EthernetFields{
		SrcAddr: local,
		DstAddr: remote,
		Type:    protocol,
	}
	if hdr.SrcAddr == "" {
		hdr.SrcAddr = e.LinkAddress()
	}
	eth.Encode(hdr)
}

// MaxHeaderLength returns the maximum size of the link layer header.
func (e *tunEndpoint) MaxHeaderLength() uint16 {
	if e.isTap {
		return header.EthernetMinimumSize
	}
	return 0
}
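// Device.Write above, for a TUN interface running without packet information,
// infers the network protocol from the version nibble of the first payload
// byte. A self-contained sketch of that dispatch (sniffProtocol is an
// illustrative name, not tree code):
package main

import "fmt"

func sniffProtocol(data []byte) string {
	if len(data) == 0 {
		return "empty"
	}
	switch data[0] >> 4 { // The upper four bits are the IP version field.
	case 4:
		return "IPv4"
	case 6:
		return "IPv6"
	default:
		return "unknown"
	}
}

func main() {
	fmt.Println(sniffProtocol([]byte{0x45, 0x00})) // IPv4 (version 4, IHL 5).
	fmt.Println(sniffProtocol([]byte{0x60, 0x00})) // IPv6.
}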
// Copyright 2019 The gVisor Authors.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build race

package sync

import (
	"runtime"
	"unsafe"
)

// RaceEnabled is true if the Go data race detector is enabled.
const RaceEnabled = true

// RaceDisable has the same semantics as runtime.RaceDisable.
func RaceDisable() {
	runtime.RaceDisable()
}

// RaceEnable has the same semantics as runtime.RaceEnable.
func RaceEnable() {
	runtime.RaceEnable()
}

// RaceAcquire has the same semantics as runtime.RaceAcquire.
func RaceAcquire(addr unsafe.Pointer) {
	runtime.RaceAcquire(addr)
}

// RaceRelease has the same semantics as runtime.RaceRelease.
func RaceRelease(addr unsafe.Pointer) {
	runtime.RaceRelease(addr)
}

// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge.
func RaceReleaseMerge(addr unsafe.Pointer) {
	runtime.RaceReleaseMerge(addr)
}

// RaceUncheckedAtomicCompareAndSwapUintptr is equivalent to
// sync/atomic.CompareAndSwapUintptr, but is not checked by the race detector.
// This is necessary when implementing gopark callbacks, since no race context
// is available during their execution.
func RaceUncheckedAtomicCompareAndSwapUintptr(ptr *uintptr, old, new uintptr) bool
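// For contrast, a sketch of the non-race counterpart of these hooks (hedged:
// the actual stub file in the tree may differ in detail). With the inverted
// build tag, RaceEnabled is a constant false and every hook compiles to a
// no-op, so callers guarded by `if sync.RaceEnabled` pay nothing.

// +build !race

package sync

import "unsafe"

// RaceEnabled is true if the Go data race detector is enabled.
const RaceEnabled = false

// RaceDisable has the same semantics as runtime.RaceDisable.
func RaceDisable() {}

// RaceEnable has the same semantics as runtime.RaceEnable.
func RaceEnable() {}

// RaceAcquire has the same semantics as runtime.RaceAcquire.
func RaceAcquire(addr unsafe.Pointer) {}

// RaceRelease has the same semantics as runtime.RaceRelease.
func RaceRelease(addr unsafe.Pointer) {}

// RaceReleaseMerge has the same semantics as runtime.RaceReleaseMerge.
func RaceReleaseMerge(addr unsafe.Pointer) {}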
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tmpfs

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// +stateify savable
type deviceFile struct {
	inode inode
	kind  vfs.DeviceKind
	major uint32
	minor uint32
}

func (fs *filesystem) newDeviceFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32, parentDir *directory) *inode {
	file := &deviceFile{
		kind:  kind,
		major: major,
		minor: minor,
	}
	switch kind {
	case vfs.BlockDevice:
		mode |= linux.S_IFBLK
	case vfs.CharDevice:
		mode |= linux.S_IFCHR
	default:
		panic(fmt.Sprintf("invalid DeviceKind: %v", kind))
	}
	file.inode.init(file, fs, kuid, kgid, mode, parentDir)
	file.inode.nlink = 1 // from parent directory
	return &file.inode
}
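// A hypothetical call site for newDeviceFile above, using the well-known
// Linux device numbers for /dev/null (character device 1:3); fs, creds, and
// parentDir stand in for values a real tmpfs mknod path would already hold:
//
//	inode := fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID,
//		0666, vfs.CharDevice, 1 /* major */, 3 /* minor */, parentDir)
//
// The switch above ORs in S_IFCHR, so the resulting mode is S_IFCHR|0666, and
// nlink starts at 1 for the link from the parent directory.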
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package transport

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/waiter"
)

// connectionlessEndpoint is a unix endpoint for unix sockets that support
// operating in a connectionless fashion.
//
// Specifically, this means datagram unix sockets not created with
// socketpair(2).
//
// +stateify savable
type connectionlessEndpoint struct {
	baseEndpoint
}

var (
	_ = BoundEndpoint((*connectionlessEndpoint)(nil))
	_ = Endpoint((*connectionlessEndpoint)(nil))
)

// NewConnectionless creates a new unbound dgram endpoint.
func NewConnectionless(ctx context.Context) Endpoint {
	ep := &connectionlessEndpoint{baseEndpoint{Queue: &waiter.Queue{}}}
	q := queue{ReaderQueue: ep.Queue, WriterQueue: &waiter.Queue{}, limit: defaultBufferSize}
	q.InitRefs()
	ep.receiver = &queueReceiver{readQueue: &q}
	ep.ops.SetSendBufferSize(defaultBufferSize, false /* notify */)
	ep.ops.SetReceiveBufferSize(defaultBufferSize, false /* notify */)
	ep.ops.InitHandler(ep, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits)
	return ep
}

// isBound returns true iff the endpoint is bound.
func (e *connectionlessEndpoint) isBound() bool {
	return e.path != ""
}

// Close puts the endpoint in a closed state and frees all resources associated
// with it.
func (e *connectionlessEndpoint) Close(ctx context.Context) {
	e.Lock()
	if e.connected != nil {
		e.connected.Release(ctx)
		e.connected = nil
	}

	if e.isBound() {
		e.path = ""
	}

	e.receiver.CloseRecv()
	r := e.receiver
	e.receiver = nil
	e.Unlock()

	r.CloseNotify()
	r.Release(ctx)
}

// BidirectionalConnect implements BoundEndpoint.BidirectionalConnect.
func (e *connectionlessEndpoint) BidirectionalConnect(ctx context.Context, ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error {
	return syserr.ErrConnectionRefused
}

// UnidirectionalConnect implements BoundEndpoint.UnidirectionalConnect.
func (e *connectionlessEndpoint) UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error) { e.Lock() r := e.receiver e.Unlock() if r == nil { return nil, syserr.ErrConnectionRefused } q := r.(*queueReceiver).readQueue if !q.TryIncRef() { return nil, syserr.ErrConnectionRefused } return &connectedEndpoint{ endpoint: e, writeQueue: q, }, nil } // SendMsg writes data and a control message to the specified endpoint. // This method does not block if the data cannot be written. func (e *connectionlessEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, *syserr.Error) { if to == nil { return e.baseEndpoint.SendMsg(ctx, data, c, nil) } connected, err := to.UnidirectionalConnect(ctx) if err != nil { return 0, syserr.ErrInvalidEndpointState } defer connected.Release(ctx) e.Lock() n, notify, err := connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) e.Unlock() if notify { connected.SendNotify() } return n, err } // Type implements Endpoint.Type. func (e *connectionlessEndpoint) Type() linux.SockType { return linux.SOCK_DGRAM } // Connect attempts to connect directly to server. func (e *connectionlessEndpoint) Connect(ctx context.Context, server BoundEndpoint) *syserr.Error { connected, err := server.UnidirectionalConnect(ctx) if err != nil { return err } e.Lock() if e.connected != nil { e.connected.Release(ctx) } e.connected = connected e.Unlock() return nil } // Listen starts listening on the connection. func (*connectionlessEndpoint) Listen(int) *syserr.Error { return syserr.ErrNotSupported } // Accept accepts a new connection. func (*connectionlessEndpoint) Accept(*tcpip.FullAddress) (Endpoint, *syserr.Error) { return nil, syserr.ErrNotSupported } // Bind binds the connection. // // For Unix endpoints, this _only sets the address associated with the socket_. // Work associated with sockets in the filesystem or finding those sockets must // be done by a higher level. // // Bind will fail only if the socket is connected, bound or the passed address // is invalid (the empty string). func (e *connectionlessEndpoint) Bind(addr tcpip.FullAddress, commit func() *syserr.Error) *syserr.Error { e.Lock() defer e.Unlock() if e.isBound() { return syserr.ErrAlreadyBound } if addr.Addr == "" { // The empty string is not permitted. return syserr.ErrBadLocalAddress } if commit != nil { if err := commit(); err != nil { return err } } // Save the bound address. e.path = string(addr.Addr) return nil } // Readiness returns the current readiness of the endpoint. For example, if // waiter.EventIn is set, the endpoint is immediately readable. func (e *connectionlessEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask { e.Lock() defer e.Unlock() ready := waiter.EventMask(0) if mask&waiter.ReadableEvents != 0 && e.receiver.Readable() { ready |= waiter.ReadableEvents } if e.Connected() { if mask&waiter.WritableEvents != 0 && e.connected.Writable() { ready |= waiter.WritableEvents } } return ready } // State implements socket.Socket.State. func (e *connectionlessEndpoint) State() uint32 { e.Lock() defer e.Unlock() switch { case e.isBound(): return linux.SS_UNCONNECTED case e.Connected(): return linux.SS_CONNECTING default: return linux.SS_DISCONNECTING } } // OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize. func (e *connectionlessEndpoint) OnSetSendBufferSize(v int64) (newSz int64) { if e.Connected() { return e.baseEndpoint.connected.SetSendBufferSize(v) } return v }
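// A self-contained analogy (plain Go, not the gVisor types) for the SendMsg
// path above: sending to a bound peer performs a transient, one-way "connect"
// to that peer's receive queue, writes the datagram, and releases the
// connection. boundEndpoint and unidirectionalConnect are illustrative names
// only.
package main

import "fmt"

type boundEndpoint struct {
	path  string
	queue chan []byte
}

// unidirectionalConnect hands back a write-only view of the receive queue,
// mirroring UnidirectionalConnect returning a ConnectedEndpoint.
func (e *boundEndpoint) unidirectionalConnect() chan<- []byte { return e.queue }

func main() {
	server := &boundEndpoint{path: "/tmp/sock", queue: make(chan []byte, 1)}

	// SendMsg with a non-nil `to`: connect, send one datagram, release.
	server.unidirectionalConnect() <- []byte("ping")

	fmt.Printf("received: %s\n", <-server.queue)
}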
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs2

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
)

// Flock implements linux syscall flock(2).
func Flock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	operation := args[1].Int()

	file := t.GetFileVFS2(fd)
	if file == nil {
		// flock(2): EBADF fd is not an open file descriptor.
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	nonblocking := operation&linux.LOCK_NB != 0
	operation &^= linux.LOCK_NB

	var blocker lock.Blocker
	if !nonblocking {
		blocker = t
	}

	switch operation {
	case linux.LOCK_EX:
		if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.WriteLock, blocker); err != nil {
			return 0, nil, err
		}
	case linux.LOCK_SH:
		if err := file.LockBSD(t, int32(t.TGIDInRoot()), lock.ReadLock, blocker); err != nil {
			return 0, nil, err
		}
	case linux.LOCK_UN:
		if err := file.UnlockBSD(t); err != nil {
			return 0, nil, err
		}
	default:
		// flock(2): EINVAL operation is invalid.
		return 0, nil, linuxerr.EINVAL
	}

	return 0, nil, nil
}
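// What the syscall above implements, seen from user space: a hedged example
// using golang.org/x/sys/unix (assumed available in the build) to take and
// drop a non-blocking exclusive lock, the LOCK_EX|LOCK_NB path handled by
// Flock.
package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func main() {
	f, err := os.Create("/tmp/flock-demo")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// LOCK_NB leaves blocker nil inside the syscall above, so a contended
	// lock fails immediately with EWOULDBLOCK instead of waiting.
	if err := unix.Flock(int(f.Fd()), unix.LOCK_EX|unix.LOCK_NB); err != nil {
		fmt.Println("lock contended:", err)
		return
	}
	defer unix.Flock(int(f.Fd()), unix.LOCK_UN)
	fmt.Println("lock acquired")
}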
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package mm

import (
    "gvisor.dev/gvisor/pkg/context"
    "gvisor.dev/gvisor/pkg/errors/linuxerr"
    "gvisor.dev/gvisor/pkg/hostarch"
    "gvisor.dev/gvisor/pkg/safemem"
    "gvisor.dev/gvisor/pkg/sentry/platform"
    "gvisor.dev/gvisor/pkg/usermem"
)

// There are two supported ways to copy data to/from application virtual
// memory:
//
// 1. Internally-mapped copying: Determine the platform.File that backs the
// copied-to/from virtual address, obtain a mapping of its pages, and read or
// write to the mapping.
//
// 2. AddressSpace copying: If platform.Platform.SupportsAddressSpaceIO() is
// true, AddressSpace permissions are applicable, and an AddressSpace is
// available, copy directly through the AddressSpace, handling faults as
// needed.
//
// (Given that internally-mapped copying requires that backing memory is always
// implemented using a host file descriptor, we could also preadv/pwritev to it
// instead. But this would incur a host syscall for each use of the mapped
// page, whereas mmap is a one-time cost.)
//
// The fixed overhead of internally-mapped copying is expected to be higher
// than that of AddressSpace copying since the former always needs to translate
// addresses, whereas the latter only needs to do so when faults occur.
// However, the throughput of internally-mapped copying is expected to be
// somewhat higher than that of AddressSpace copying due to the high cost of
// page faults and because implementations of the latter usually rely on
// safecopy, which doesn't use AVX registers. So we prefer to use AddressSpace
// copying (when available) for smaller copies, and switch to internally-mapped
// copying once a size threshold is exceeded.
const (
    // copyMapMinBytes is the size threshold for switching to internally-mapped
    // copying in CopyOut, CopyIn, and ZeroOut.
    copyMapMinBytes = 32 << 10 // 32 KB

    // rwMapMinBytes is the size threshold for switching to internally-mapped
    // copying in CopyOutFrom and CopyInTo. It's lower than copyMapMinBytes
    // since AddressSpace copying in this case requires additional buffering;
    // see CopyOutFrom for details.
    rwMapMinBytes = 512
)

// CheckIORange is similar to hostarch.Addr.ToRange, but applies bounds checks
// consistent with Linux's arch/x86/include/asm/uaccess.h:access_ok().
//
// Preconditions: length >= 0.
func (mm *MemoryManager) CheckIORange(addr hostarch.Addr, length int64) (hostarch.AddrRange, bool) {
    // Note that access_ok() constrains end even if length == 0.
    ar, ok := addr.ToRange(uint64(length))
    return ar, (ok && ar.End <= mm.layout.MaxAddr)
}

// checkIOVec applies bounds checks consistent with Linux's
// arch/x86/include/asm/uaccess.h:access_ok() to ars.
func (mm *MemoryManager) checkIOVec(ars hostarch.AddrRangeSeq) bool {
    for !ars.IsEmpty() {
        ar := ars.Head()
        if _, ok := mm.CheckIORange(ar.Start, int64(ar.Length())); !ok {
            return false
        }
        ars = ars.Tail()
    }
    return true
}

func (mm *MemoryManager) asioEnabled(opts usermem.IOOpts) bool {
    return mm.haveASIO && !opts.IgnorePermissions && opts.AddressSpaceActive
}

// translateIOError converts errors to EFAULT, as is usually reported for all
// I/O errors originating from MM in Linux.
func translateIOError(ctx context.Context, err error) error {
    if err == nil {
        return nil
    }
    if logIOErrors {
        ctx.Debugf("MM I/O error: %v", err)
    }
    return linuxerr.EFAULT
}
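// A standalone sketch (not gVisor code) of the dispatch policy described
// above: AddressSpace copying for small transfers, internal mappings once
// the 32 KB threshold is reached. ioOpts and copyStrategy are hypothetical
// stand-ins for MemoryManager state and usermem.IOOpts.
package main

import "fmt"

const copyMapMinBytes = 32 << 10 // mirrors the constant above

type ioOpts struct {
    IgnorePermissions  bool
    AddressSpaceActive bool
}

func copyStrategy(haveASIO bool, opts ioOpts, n int) string {
    asioEnabled := haveASIO && !opts.IgnorePermissions && opts.AddressSpaceActive
    if asioEnabled && n < copyMapMinBytes {
        return "AddressSpace copy"
    }
    return "internally-mapped copy"
}

func main() {
    opts := ioOpts{AddressSpaceActive: true}
    fmt.Println(copyStrategy(true, opts, 4096))   // AddressSpace copy
    fmt.Println(copyStrategy(true, opts, 64<<10)) // internally-mapped copy
    // IgnorePermissions forces the internally-mapped path regardless of size.
    fmt.Println(copyStrategy(true, ioOpts{IgnorePermissions: true, AddressSpaceActive: true}, 4096))
}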
// CopyOut implements usermem.IO.CopyOut.
func (mm *MemoryManager) CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts usermem.IOOpts) (int, error) {
    ar, ok := mm.CheckIORange(addr, int64(len(src)))
    if !ok {
        return 0, linuxerr.EFAULT
    }

    if len(src) == 0 {
        return 0, nil
    }

    // Do AddressSpace IO if applicable.
    if mm.asioEnabled(opts) && len(src) < copyMapMinBytes {
        return mm.asCopyOut(ctx, addr, src)
    }

    // Go through internal mappings.
    n64, err := mm.withInternalMappings(ctx, ar, hostarch.Write, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
        n, err := safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(src)))
        return n, translateIOError(ctx, err)
    })
    return int(n64), err
}

func (mm *MemoryManager) asCopyOut(ctx context.Context, addr hostarch.Addr, src []byte) (int, error) {
    var done int
    for {
        n, err := mm.as.CopyOut(addr+hostarch.Addr(done), src[done:])
        done += n
        if err == nil {
            return done, nil
        }
        if f, ok := err.(platform.SegmentationFault); ok {
            ar, _ := addr.ToRange(uint64(len(src)))
            if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Write); err != nil {
                return done, err
            }
            continue
        }
        return done, translateIOError(ctx, err)
    }
}

// CopyIn implements usermem.IO.CopyIn.
func (mm *MemoryManager) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts usermem.IOOpts) (int, error) {
    ar, ok := mm.CheckIORange(addr, int64(len(dst)))
    if !ok {
        return 0, linuxerr.EFAULT
    }

    if len(dst) == 0 {
        return 0, nil
    }

    // Do AddressSpace IO if applicable.
    if mm.asioEnabled(opts) && len(dst) < copyMapMinBytes {
        return mm.asCopyIn(ctx, addr, dst)
    }

    // Go through internal mappings.
    n64, err := mm.withInternalMappings(ctx, ar, hostarch.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
        n, err := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), ims)
        return n, translateIOError(ctx, err)
    })
    return int(n64), err
}

func (mm *MemoryManager) asCopyIn(ctx context.Context, addr hostarch.Addr, dst []byte) (int, error) {
    var done int
    for {
        n, err := mm.as.CopyIn(addr+hostarch.Addr(done), dst[done:])
        done += n
        if err == nil {
            return done, nil
        }
        if f, ok := err.(platform.SegmentationFault); ok {
            ar, _ := addr.ToRange(uint64(len(dst)))
            if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Read); err != nil {
                return done, err
            }
            continue
        }
        return done, translateIOError(ctx, err)
    }
}

// ZeroOut implements usermem.IO.ZeroOut.
func (mm *MemoryManager) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts usermem.IOOpts) (int64, error) {
    ar, ok := mm.CheckIORange(addr, toZero)
    if !ok {
        return 0, linuxerr.EFAULT
    }

    if toZero == 0 {
        return 0, nil
    }

    // Do AddressSpace IO if applicable.
    if mm.asioEnabled(opts) && toZero < copyMapMinBytes {
        return mm.asZeroOut(ctx, addr, toZero)
    }

    // Go through internal mappings.
    return mm.withInternalMappings(ctx, ar, hostarch.Write, opts.IgnorePermissions, func(dsts safemem.BlockSeq) (uint64, error) {
        n, err := safemem.ZeroSeq(dsts)
        return n, translateIOError(ctx, err)
    })
}

func (mm *MemoryManager) asZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64) (int64, error) {
    var done int64
    for {
        n, err := mm.as.ZeroOut(addr+hostarch.Addr(done), uintptr(toZero-done))
        done += int64(n)
        if err == nil {
            return done, nil
        }
        if f, ok := err.(platform.SegmentationFault); ok {
            ar, _ := addr.ToRange(uint64(toZero))
            if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Write); err != nil {
                return done, err
            }
            continue
        }
        return done, translateIOError(ctx, err)
    }
}
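// The as* helpers above share one pattern: attempt the AddressSpace
// operation, and on a SegmentationFault, fault the page in and retry from
// where the copy stopped. A distilled, standalone sketch of that loop with
// hypothetical types (segFault and copyChunk are invented for illustration;
// they are not gVisor APIs):
package main

import "fmt"

// segFault is a stand-in for platform.SegmentationFault.
type segFault struct{ addr int }

func (f segFault) Error() string { return fmt.Sprintf("fault at %#x", f.addr) }

// copyChunk pretends to copy into dst starting at off, faulting once at
// offset 8 to simulate an unmapped page.
func copyChunk(dst []byte, off int, faulted *bool) (int, error) {
    if off < 8 && !*faulted {
        *faulted = true
        return 8 - off, segFault{addr: 8}
    }
    return len(dst) - off, nil
}

func main() {
    buf := make([]byte, 16)
    done := 0
    faulted := false
    for {
        n, err := copyChunk(buf, done, &faulted)
        done += n
        if err == nil {
            break
        }
        if _, ok := err.(segFault); ok {
            // Here the real code calls mm.handleASIOFault(ctx, f.Addr, ...)
            // to map the faulting page, then retries from done.
            continue
        }
        panic(err)
    }
    fmt.Println("copied", done, "bytes") // copied 16 bytes
}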
// CopyOutFrom implements usermem.IO.CopyOutFrom.
func (mm *MemoryManager) CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts usermem.IOOpts) (int64, error) {
    if !mm.checkIOVec(ars) {
        return 0, linuxerr.EFAULT
    }

    if ars.NumBytes() == 0 {
        return 0, nil
    }

    // Do AddressSpace IO if applicable.
    if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes {
        // We have to introduce a buffered copy, instead of just passing a
        // safemem.BlockSeq representing addresses in the AddressSpace to src.
        // This is because usermem.IO.CopyOutFrom() guarantees that it calls
        // src.ReadToBlocks() at most once, which is incompatible with handling
        // faults between calls. In the future, this is probably best resolved
        // by introducing a CopyOutFrom variant or option that allows it to
        // call src.ReadToBlocks() any number of times.
        //
        // This issue applies to CopyInTo as well.
        buf := make([]byte, int(ars.NumBytes()))
        bufN, bufErr := src.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)))
        var done int64
        for done < int64(bufN) {
            ar := ars.Head()
            cplen := int64(ar.Length())
            if cplen > int64(bufN)-done {
                cplen = int64(bufN) - done
            }
            n, err := mm.asCopyOut(ctx, ar.Start, buf[int(done):int(done+cplen)])
            done += int64(n)
            if err != nil {
                return done, err
            }
            ars = ars.Tail()
        }
        // Do not convert errors returned by src to EFAULT.
        return done, bufErr
    }

    // Go through internal mappings.
    return mm.withVecInternalMappings(ctx, ars, hostarch.Write, opts.IgnorePermissions, src.ReadToBlocks)
}

// CopyInTo implements usermem.IO.CopyInTo.
func (mm *MemoryManager) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts usermem.IOOpts) (int64, error) {
    if !mm.checkIOVec(ars) {
        return 0, linuxerr.EFAULT
    }

    if ars.NumBytes() == 0 {
        return 0, nil
    }

    // Do AddressSpace IO if applicable.
    if mm.asioEnabled(opts) && ars.NumBytes() < rwMapMinBytes {
        buf := make([]byte, int(ars.NumBytes()))
        var done int
        var bufErr error
        for !ars.IsEmpty() {
            ar := ars.Head()
            var n int
            n, bufErr = mm.asCopyIn(ctx, ar.Start, buf[done:done+int(ar.Length())])
            done += n
            if bufErr != nil {
                break
            }
            ars = ars.Tail()
        }
        n, err := dst.WriteFromBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:done])))
        if err != nil {
            return int64(n), err
        }
        // Do not convert errors returned by dst to EFAULT.
        return int64(n), bufErr
    }

    // Go through internal mappings.
    return mm.withVecInternalMappings(ctx, ars, hostarch.Read, opts.IgnorePermissions, dst.WriteFromBlocks)
}
// SwapUint32 implements usermem.IO.SwapUint32.
func (mm *MemoryManager) SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts usermem.IOOpts) (uint32, error) {
    ar, ok := mm.CheckIORange(addr, 4)
    if !ok {
        return 0, linuxerr.EFAULT
    }

    // Do AddressSpace IO if applicable.
    if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
        for {
            old, err := mm.as.SwapUint32(addr, new)
            if err == nil {
                return old, nil
            }
            if f, ok := err.(platform.SegmentationFault); ok {
                if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.ReadWrite); err != nil {
                    return 0, err
                }
                continue
            }
            return 0, translateIOError(ctx, err)
        }
    }

    // Go through internal mappings.
    var old uint32
    _, err := mm.withInternalMappings(ctx, ar, hostarch.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
        if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
            // Atomicity is unachievable across mappings.
            return 0, linuxerr.EFAULT
        }
        im := ims.Head()
        var err error
        old, err = safemem.SwapUint32(im, new)
        if err != nil {
            return 0, translateIOError(ctx, err)
        }
        // Return the number of bytes read.
        return 4, nil
    })
    return old, err
}

// CompareAndSwapUint32 implements usermem.IO.CompareAndSwapUint32.
func (mm *MemoryManager) CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts usermem.IOOpts) (uint32, error) {
    ar, ok := mm.CheckIORange(addr, 4)
    if !ok {
        return 0, linuxerr.EFAULT
    }

    // Do AddressSpace IO if applicable.
    if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
        for {
            prev, err := mm.as.CompareAndSwapUint32(addr, old, new)
            if err == nil {
                return prev, nil
            }
            if f, ok := err.(platform.SegmentationFault); ok {
                if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.ReadWrite); err != nil {
                    return 0, err
                }
                continue
            }
            return 0, translateIOError(ctx, err)
        }
    }

    // Go through internal mappings.
    var prev uint32
    _, err := mm.withInternalMappings(ctx, ar, hostarch.ReadWrite, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
        if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
            // Atomicity is unachievable across mappings.
            return 0, linuxerr.EFAULT
        }
        im := ims.Head()
        var err error
        prev, err = safemem.CompareAndSwapUint32(im, old, new)
        if err != nil {
            return 0, translateIOError(ctx, err)
        }
        // Return the number of bytes read.
        return 4, nil
    })
    return prev, err
}

// LoadUint32 implements usermem.IO.LoadUint32.
func (mm *MemoryManager) LoadUint32(ctx context.Context, addr hostarch.Addr, opts usermem.IOOpts) (uint32, error) {
    ar, ok := mm.CheckIORange(addr, 4)
    if !ok {
        return 0, linuxerr.EFAULT
    }

    // Do AddressSpace IO if applicable.
    if mm.haveASIO && opts.AddressSpaceActive && !opts.IgnorePermissions {
        for {
            val, err := mm.as.LoadUint32(addr)
            if err == nil {
                return val, nil
            }
            if f, ok := err.(platform.SegmentationFault); ok {
                if err := mm.handleASIOFault(ctx, f.Addr, ar, hostarch.Read); err != nil {
                    return 0, err
                }
                continue
            }
            return 0, translateIOError(ctx, err)
        }
    }

    // Go through internal mappings.
    var val uint32
    _, err := mm.withInternalMappings(ctx, ar, hostarch.Read, opts.IgnorePermissions, func(ims safemem.BlockSeq) (uint64, error) {
        if ims.NumBlocks() != 1 || ims.NumBytes() != 4 {
            // Atomicity is unachievable across mappings.
            return 0, linuxerr.EFAULT
        }
        im := ims.Head()
        var err error
        val, err = safemem.LoadUint32(im)
        if err != nil {
            return 0, translateIOError(ctx, err)
        }
        // Return the number of bytes read.
        return 4, nil
    })
    return val, err
}
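// The atomic methods above insist on a single 4-byte block because a host
// atomic instruction cannot span two discontiguous mappings. An illustrative,
// standalone sketch (not the safemem implementation) of a CAS performed on a
// byte-backed word, as safemem ultimately does on the mapped page:
package main

import (
    "fmt"
    "sync/atomic"
    "unsafe"
)

func main() {
    // A page-like byte buffer. The word must be 4-byte aligned and must lie
    // entirely within one contiguous mapping for the atomic to be valid;
    // Go heap allocations of this size are sufficiently aligned in practice.
    buf := make([]byte, 8)
    word := (*uint32)(unsafe.Pointer(&buf[0]))

    atomic.StoreUint32(word, 1)
    swapped := atomic.CompareAndSwapUint32(word, 1, 2)
    fmt.Println(swapped, *word) // true 2

    // If the word straddled two separate mappings, no single host atomic
    // could cover it; the methods above return EFAULT in that case.
}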
// handleASIOFault handles a page fault at address addr for an AddressSpaceIO
// operation spanning ioar.
//
// Preconditions:
// * mm.as != nil.
// * ioar.Length() != 0.
// * ioar.Contains(addr).
func (mm *MemoryManager) handleASIOFault(ctx context.Context, addr hostarch.Addr, ioar hostarch.AddrRange, at hostarch.AccessType) error {
    // Try to map all remaining pages in the I/O operation. This RoundUp can't
    // overflow because otherwise it would have been caught by CheckIORange.
    end, _ := ioar.End.RoundUp()
    ar := hostarch.AddrRange{addr.RoundDown(), end}

    // Don't bother trying existingPMAsLocked; in most cases, if we did have
    // existing pmas, we wouldn't have faulted.

    // Ensure that we have usable vmas. Here and below, only return early if we
    // can't map the first (faulting) page; failures to map later pages are
    // silently ignored. This maximizes partial success.
    mm.mappingMu.RLock()
    vseg, vend, err := mm.getVMAsLocked(ctx, ar, at, false)
    if vendaddr := vend.Start(); vendaddr < ar.End {
        if vendaddr <= ar.Start {
            mm.mappingMu.RUnlock()
            return translateIOError(ctx, err)
        }
        ar.End = vendaddr
    }

    // Ensure that we have usable pmas.
    mm.activeMu.Lock()
    pseg, pend, err := mm.getPMAsLocked(ctx, vseg, ar, at)
    mm.mappingMu.RUnlock()
    if pendaddr := pend.Start(); pendaddr < ar.End {
        if pendaddr <= ar.Start {
            mm.activeMu.Unlock()
            return translateIOError(ctx, err)
        }
        ar.End = pendaddr
    }

    // Downgrade to a read-lock on activeMu since we don't need to mutate pmas
    // anymore.
    mm.activeMu.DowngradeLock()

    err = mm.mapASLocked(pseg, ar, false)
    mm.activeMu.RUnlock()
    return translateIOError(ctx, err)
}

// withInternalMappings ensures that pmas exist for all addresses in ar,
// support access of type (at, ignorePermissions), and have internal mappings
// cached. It then calls f with mm.activeMu locked for reading, passing
// internal mappings for the subrange of ar for which this property holds.
//
// withInternalMappings takes a function returning uint64 since many safemem
// functions have this property, but returns an int64 since this is usually
// more useful for usermem.IO methods.
//
// Preconditions: 0 < ar.Length() <= math.MaxInt64.
func (mm *MemoryManager) withInternalMappings(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
    // If pmas are already available, we can do IO without touching mm.vmas or
    // mm.mappingMu.
    mm.activeMu.RLock()
    if pseg := mm.existingPMAsLocked(ar, at, ignorePermissions, true /* needInternalMappings */); pseg.Ok() {
        n, err := f(mm.internalMappingsLocked(pseg, ar))
        mm.activeMu.RUnlock()
        // Do not convert errors returned by f to EFAULT.
        return int64(n), err
    }
    mm.activeMu.RUnlock()

    // Ensure that we have usable vmas.
    mm.mappingMu.RLock()
    vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions)
    if vendaddr := vend.Start(); vendaddr < ar.End {
        if vendaddr <= ar.Start {
            mm.mappingMu.RUnlock()
            return 0, translateIOError(ctx, verr)
        }
        ar.End = vendaddr
    }

    // Ensure that we have usable pmas.
    mm.activeMu.Lock()
    pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at)
    mm.mappingMu.RUnlock()
    if pendaddr := pend.Start(); pendaddr < ar.End {
        if pendaddr <= ar.Start {
            mm.activeMu.Unlock()
            return 0, translateIOError(ctx, perr)
        }
        ar.End = pendaddr
    }
    imend, imerr := mm.getPMAInternalMappingsLocked(pseg, ar)
    mm.activeMu.DowngradeLock()
    if imendaddr := imend.Start(); imendaddr < ar.End {
        if imendaddr <= ar.Start {
            mm.activeMu.RUnlock()
            return 0, translateIOError(ctx, imerr)
        }
        ar.End = imendaddr
    }

    // Do I/O.
    un, err := f(mm.internalMappingsLocked(pseg, ar))
    mm.activeMu.RUnlock()
    n := int64(un)

    // Return the first error in order of progress through ar.
    if err != nil {
        // Do not convert errors returned by f to EFAULT.
        return n, err
    }
    if imerr != nil {
        return n, translateIOError(ctx, imerr)
    }
    if perr != nil {
        return n, translateIOError(ctx, perr)
    }
    return n, translateIOError(ctx, verr)
}
// withVecInternalMappings ensures that pmas exist for all addresses in ars,
// support access of type (at, ignorePermissions), and have internal mappings
// cached. It then calls f with mm.activeMu locked for reading, passing
// internal mappings for the subset of ars for which this property holds.
//
// Preconditions: !ars.IsEmpty().
func (mm *MemoryManager) withVecInternalMappings(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
    // withInternalMappings is faster than withVecInternalMappings because of
    // iterator plumbing (this isn't generally practical in the vector case due
    // to iterator invalidation between AddrRanges). Use it if possible.
    if ars.NumRanges() == 1 {
        return mm.withInternalMappings(ctx, ars.Head(), at, ignorePermissions, f)
    }

    // If pmas are already available, we can do IO without touching mm.vmas or
    // mm.mappingMu.
    mm.activeMu.RLock()
    if mm.existingVecPMAsLocked(ars, at, ignorePermissions, true /* needInternalMappings */) {
        n, err := f(mm.vecInternalMappingsLocked(ars))
        mm.activeMu.RUnlock()
        // Do not convert errors returned by f to EFAULT.
        return int64(n), err
    }
    mm.activeMu.RUnlock()

    // Ensure that we have usable vmas.
    mm.mappingMu.RLock()
    vars, verr := mm.getVecVMAsLocked(ctx, ars, at, ignorePermissions)
    if vars.NumBytes() == 0 {
        mm.mappingMu.RUnlock()
        return 0, translateIOError(ctx, verr)
    }

    // Ensure that we have usable pmas.
    mm.activeMu.Lock()
    pars, perr := mm.getVecPMAsLocked(ctx, vars, at)
    mm.mappingMu.RUnlock()
    if pars.NumBytes() == 0 {
        mm.activeMu.Unlock()
        return 0, translateIOError(ctx, perr)
    }
    imars, imerr := mm.getVecPMAInternalMappingsLocked(pars)
    mm.activeMu.DowngradeLock()
    if imars.NumBytes() == 0 {
        mm.activeMu.RUnlock()
        return 0, translateIOError(ctx, imerr)
    }

    // Do I/O.
    un, err := f(mm.vecInternalMappingsLocked(imars))
    mm.activeMu.RUnlock()
    n := int64(un)

    // Return the first error in order of progress through ars.
    if err != nil {
        // Do not convert errors from f to EFAULT.
        return n, err
    }
    if imerr != nil {
        return n, translateIOError(ctx, imerr)
    }
    if perr != nil {
        return n, translateIOError(ctx, perr)
    }
    return n, translateIOError(ctx, verr)
}

// truncatedAddrRangeSeq returns a copy of ars, but with the end truncated to
// at most address end on AddrRange arsit.Head(). It is used in vector I/O
// paths to truncate hostarch.AddrRangeSeq when errors occur.
//
// Preconditions:
// * !arsit.IsEmpty().
// * end <= arsit.Head().End.
func truncatedAddrRangeSeq(ars, arsit hostarch.AddrRangeSeq, end hostarch.Addr) hostarch.AddrRangeSeq {
    ar := arsit.Head()
    if end <= ar.Start {
        return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes())
    }
    return ars.TakeFirst64(ars.NumBytes() - arsit.NumBytes() + int64(end-ar.Start))
}
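// A tiny standalone check (with hypothetical values) of the truncation
// arithmetic in truncatedAddrRangeSeq above: a sequence of two ranges of
// 0x1000 and 0x800 bytes, with an error at offset 0x200 into the second
// range. The result counts the first range in full plus the progress made
// into the head range.
package main

import "fmt"

func main() {
    arsNumBytes := int64(0x1800)  // total bytes in the original sequence
    arsitNumBytes := int64(0x800) // bytes remaining from arsit.Head() onward
    headStart, end := int64(0x5000), int64(0x5200)

    // Bytes preceding arsit.Head(), plus progress into the head range.
    truncated := arsNumBytes - arsitNumBytes + (end - headStart)
    fmt.Printf("%#x\n", truncated) // 0x1200
}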
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import "container/heap"

type segmentHeap []*segment

var _ heap.Interface = (*segmentHeap)(nil)

// Len returns the length of h.
func (h *segmentHeap) Len() int {
    return len(*h)
}

// Less determines whether the i-th element of h is less than the j-th element.
func (h *segmentHeap) Less(i, j int) bool {
    return (*h)[i].sequenceNumber.LessThan((*h)[j].sequenceNumber)
}

// Swap swaps the i-th and j-th elements of h.
func (h *segmentHeap) Swap(i, j int) {
    (*h)[i], (*h)[j] = (*h)[j], (*h)[i]
}

// Push adds x as the last element of h.
func (h *segmentHeap) Push(x interface{}) {
    *h = append(*h, x.(*segment))
}

// Pop removes the last element of h and returns it.
func (h *segmentHeap) Pop() interface{} {
    old := *h
    n := len(old)
    x := old[n-1]
    old[n-1] = nil
    *h = old[:n-1]
    return x
}
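// segmentHeap plugs into the standard library's container/heap, which calls
// the five methods above to maintain the heap invariant. A standalone sketch
// of the same pattern with a simplified element type (sequence numbers as
// plain ints, not gVisor's segment type), showing how pushes in arbitrary
// order pop in sequence-number order:
package main

import (
    "container/heap"
    "fmt"
)

type intHeap []int

func (h *intHeap) Len() int           { return len(*h) }
func (h *intHeap) Less(i, j int) bool { return (*h)[i] < (*h)[j] }
func (h *intHeap) Swap(i, j int)      { (*h)[i], (*h)[j] = (*h)[j], (*h)[i] }
func (h *intHeap) Push(x interface{}) { *h = append(*h, x.(int)) }
func (h *intHeap) Pop() interface{} {
    old := *h
    n := len(old)
    x := old[n-1]
    *h = old[:n-1]
    return x
}

func main() {
    h := &intHeap{}
    heap.Init(h)
    for _, seq := range []int{300, 100, 200} {
        heap.Push(h, seq) // out-of-order segments arrive
    }
    for h.Len() > 0 {
        fmt.Println(heap.Pop(h)) // 100, 200, 300: in-order delivery
    }
}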
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package pgalloc contains the page allocator subsystem, which manages memory
// that may be mapped into application address spaces.
//
// Lock order:
//
// pgalloc.MemoryFile.mu
//   pgalloc.MemoryFile.mappingsMu
package pgalloc

import (
    "fmt"
    "math"
    "os"
    "sync/atomic"
    "time"

    "golang.org/x/sys/unix"
    "gvisor.dev/gvisor/pkg/abi/linux"
    "gvisor.dev/gvisor/pkg/context"
    "gvisor.dev/gvisor/pkg/errors/linuxerr"
    "gvisor.dev/gvisor/pkg/hostarch"
    "gvisor.dev/gvisor/pkg/log"
    "gvisor.dev/gvisor/pkg/safemem"
    "gvisor.dev/gvisor/pkg/sentry/hostmm"
    "gvisor.dev/gvisor/pkg/sentry/memmap"
    "gvisor.dev/gvisor/pkg/sentry/usage"
    "gvisor.dev/gvisor/pkg/sync"
    "gvisor.dev/gvisor/pkg/syserror"
)

// MemoryFile is a memmap.File whose pages may be allocated to arbitrary
// users.
type MemoryFile struct {
    // opts holds options passed to NewMemoryFile. opts is immutable.
    opts MemoryFileOpts

    // MemoryFile owns a single backing file, which is modeled as follows:
    //
    // Each page in the file can be committed or uncommitted. A page is
    // committed if the host kernel is spending resources to store its contents
    // and uncommitted otherwise. This definition includes pages that the host
    // kernel has swapped; this is intentional, to ensure that accounting does
    // not change even if host kernel swapping behavior changes, and that
    // memory used by pseudo-swap mechanisms like zswap is still accounted.
    //
    // The initial contents of uncommitted pages are implicitly zero bytes. A
    // read or write to the contents of an uncommitted page causes it to be
    // committed. This is the only event that can cause an uncommitted page to
    // be committed.
    //
    // fallocate(FALLOC_FL_PUNCH_HOLE) (MemoryFile.Decommit) causes committed
    // pages to be uncommitted. This is the only event that can cause a
    // committed page to be uncommitted.
    //
    // Memory accounting is based on identifying the set of committed pages.
    // Since we do not have direct access to the MMU, tracking reads and writes
    // to uncommitted pages to detect commitment would introduce additional
    // page faults, which would be prohibitively expensive. Instead, we query
    // the host kernel to determine which pages are committed.

    // file is the backing file. The file pointer is immutable.
    file *os.File

    mu sync.Mutex

    // usage maps each page in the file to metadata for that page. Pages for
    // which no segment exists in usage are both unallocated (not in use) and
    // uncommitted.
    //
    // Since usage stores usageInfo objects by value, clients should usually
    // use usageIterator.ValuePtr() instead of usageIterator.Value() to get a
    // pointer to the usageInfo rather than a copy.
    //
    // usage must be kept maximally merged (that is, there should never be two
    // adjacent segments with the same values). At least markReclaimed depends
    // on this property.
    //
    // usage is protected by mu.
    usage usageSet

    // The UpdateUsage function scans all segments with knownCommitted set
    // to false, sees which pages are committed and creates corresponding
    // segments with knownCommitted set to true.
    //
    // In order to avoid unnecessary scans, usageExpected tracks the total
    // file blocks expected. This is used to elide the scan when this
    // matches the underlying file blocks.
    //
    // To track swapped pages, usageSwapped tracks the discrepancy between
    // what is observed in core and what is reported by the file. When
    // usageSwapped is non-zero, a sweep will be performed at least every
    // second. The start of the last sweep is recorded in usageLast.
    //
    // All usage attributes are protected by mu.
    usageExpected uint64
    usageSwapped  uint64
    usageLast     time.Time
    // fileSize is the size of the backing memory file in bytes. fileSize is
    // always a power-of-two multiple of chunkSize.
    //
    // fileSize is protected by mu.
    fileSize int64

    // Pages from the backing file are mapped into the local address space on
    // the granularity of large pieces called chunks. mappings is a []uintptr
    // that stores, for each chunk, the start address of a mapping of that
    // chunk in the current process' address space, or 0 if no such mapping
    // exists. Once a chunk is mapped, it is never remapped or unmapped until
    // the MemoryFile is destroyed.
    //
    // Mutating the mappings slice or its contents requires both holding
    // mappingsMu and using atomic memory operations. (The slice is mutated
    // whenever the file is expanded. Per the above, the only permitted
    // mutation of the slice's contents is the assignment of a mapping to a
    // chunk that was previously unmapped.) Reading the slice or its contents
    // only requires *either* holding mappingsMu or using atomic memory
    // operations. This allows MemoryFile.MapInternal to avoid locking in the
    // common case where chunk mappings already exist.
    mappingsMu sync.Mutex
    mappings   atomic.Value

    // destroyed is set by Destroy to instruct the reclaimer goroutine to
    // release resources and exit. destroyed is protected by mu.
    destroyed bool

    // reclaimable is true if usage may contain reclaimable pages. reclaimable
    // is protected by mu.
    reclaimable bool

    // reclaim is the collection of regions for reclaim. reclaim is protected
    // by mu.
    reclaim reclaimSet

    // reclaimCond is signaled (with mu locked) when reclaimable or destroyed
    // transitions from false to true.
    reclaimCond sync.Cond

    // evictable maps EvictableMemoryUsers to eviction state.
    //
    // evictable is protected by mu.
    evictable map[EvictableMemoryUser]*evictableMemoryUserInfo

    // evictionWG counts the number of goroutines currently performing evictions.
    evictionWG sync.WaitGroup

    // stopNotifyPressure stops memory cgroup pressure level
    // notifications used to drive eviction. stopNotifyPressure is
    // immutable.
    stopNotifyPressure func()
}

// MemoryFileOpts provides options to NewMemoryFile.
type MemoryFileOpts struct {
    // DelayedEviction controls the extent to which the MemoryFile may delay
    // eviction of evictable allocations.
    DelayedEviction DelayedEvictionType

    // If UseHostMemcgPressure is true, use host memory cgroup pressure level
    // notifications to determine when eviction is necessary. This option has
    // no effect unless DelayedEviction is DelayedEvictionEnabled.
    UseHostMemcgPressure bool

    // If ManualZeroing is true, MemoryFile must not assume that new pages
    // obtained from the host are zero-filled, such that MemoryFile must manually
    // zero newly-allocated pages.
    ManualZeroing bool
}

// DelayedEvictionType is the type of MemoryFileOpts.DelayedEviction.
type DelayedEvictionType int

const (
    // DelayedEvictionDefault has unspecified behavior.
    DelayedEvictionDefault DelayedEvictionType = iota

    // DelayedEvictionDisabled requires that evictable allocations are evicted
    // as soon as possible.
    DelayedEvictionDisabled
    // DelayedEvictionEnabled requests that the MemoryFile delay eviction of
    // evictable allocations until doing so is considered necessary to avoid
    // performance degradation due to host memory pressure, or OOM kills.
    //
    // As of this writing, the behavior of DelayedEvictionEnabled depends on
    // whether or not MemoryFileOpts.UseHostMemcgPressure is enabled:
    //
    // - If UseHostMemcgPressure is true, evictions are delayed until memory
    // pressure is indicated.
    //
    // - Otherwise, evictions are only delayed until the reclaimer goroutine
    // is out of work (pages to reclaim).
    DelayedEvictionEnabled

    // DelayedEvictionManual requires that evictable allocations are only
    // evicted when MemoryFile.StartEvictions() is called. This is extremely
    // dangerous outside of tests.
    DelayedEvictionManual
)

// usageInfo tracks usage information.
//
// +stateify savable
type usageInfo struct {
    // kind is the usage kind.
    kind usage.MemoryKind

    // knownCommitted is true if the tracked region is definitely committed.
    // (If it is false, the tracked region may or may not be committed.)
    knownCommitted bool

    refs uint64
}

// canCommit returns true if the tracked region can be committed.
func (u *usageInfo) canCommit() bool {
    // refs must be greater than 0 because we assume that reclaimable pages
    // (that aren't already known to be committed) are not committed. This
    // isn't necessarily true, even after the reclaimer does Decommit(),
    // because the kernel may subsequently back the hugepage-sized region
    // containing the decommitted page with a hugepage. However, it's
    // consistent with our treatment of unallocated pages, which have the same
    // property.
    return !u.knownCommitted && u.refs != 0
}

// An EvictableMemoryUser represents a user of MemoryFile-allocated memory that
// may be asked to deallocate that memory in the presence of memory pressure.
type EvictableMemoryUser interface {
    // Evict requests that the EvictableMemoryUser deallocate memory used by
    // er, which was registered as evictable by a previous call to
    // MemoryFile.MarkEvictable.
    //
    // Evict is not required to deallocate memory. In particular, since pgalloc
    // must call Evict without holding locks to avoid circular lock ordering,
    // it is possible that the passed range has already been marked as
    // unevictable by a racing call to MemoryFile.MarkUnevictable.
    // Implementations of EvictableMemoryUser must detect such races and handle
    // them by making Evict have no effect on unevictable ranges.
    //
    // After a call to Evict, the MemoryFile will consider the evicted range
    // unevictable (i.e. it will not call Evict on the same range again) until
    // informed otherwise by a subsequent call to MarkEvictable.
    Evict(ctx context.Context, er EvictableRange)
}

// An EvictableRange represents a range of uint64 offsets in an
// EvictableMemoryUser.
//
// In practice, most EvictableMemoryUsers will probably be implementations of
// memmap.Mappable, and EvictableRange therefore corresponds to
// memmap.MappableRange. However, this package cannot depend on the memmap
// package, since doing so would create a circular dependency.
//
// type EvictableRange <generated using go_generics>

// evictableMemoryUserInfo is the value type of MemoryFile.evictable.
type evictableMemoryUserInfo struct {
    // ranges tracks all evictable ranges for the given user.
    ranges evictableRangeSet

    // If evicting is true, there is a goroutine currently evicting all
    // evictable ranges for this user.
    evicting bool
}

const (
    chunkShift = 30
    chunkSize  = 1 << chunkShift // 1 GB
    chunkMask  = chunkSize - 1

    // maxPage is the highest 64-bit page.
    maxPage = math.MaxUint64 &^ (hostarch.PageSize - 1)
)
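// A small standalone sketch (mirroring the constants above) of how a file
// offset decomposes into a chunk index and an offset within that chunk's
// 1 GB mapping; this is the arithmetic used by forEachMappingSlice below.
package main

import "fmt"

const (
    chunkShift = 30
    chunkSize  = 1 << chunkShift // 1 GB
    chunkMask  = chunkSize - 1
)

func main() {
    off := uint64(5<<chunkShift + 0x1234) // an offset inside chunk 5
    chunk := off >> chunkShift            // index into the mappings slice
    within := off & chunkMask             // offset inside that chunk's mapping
    fmt.Println(chunk, within)            // 5 4660
}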
// NewMemoryFile creates a MemoryFile backed by the given file. If
// NewMemoryFile succeeds, ownership of file is transferred to the returned
// MemoryFile.
func NewMemoryFile(file *os.File, opts MemoryFileOpts) (*MemoryFile, error) {
    switch opts.DelayedEviction {
    case DelayedEvictionDefault:
        opts.DelayedEviction = DelayedEvictionEnabled
    case DelayedEvictionDisabled, DelayedEvictionManual:
        opts.UseHostMemcgPressure = false
    case DelayedEvictionEnabled:
        // ok
    default:
        return nil, fmt.Errorf("invalid MemoryFileOpts.DelayedEviction: %v", opts.DelayedEviction)
    }

    // Truncate the file to 0 bytes first to ensure that it's empty.
    if err := file.Truncate(0); err != nil {
        return nil, err
    }
    f := &MemoryFile{
        opts:      opts,
        file:      file,
        evictable: make(map[EvictableMemoryUser]*evictableMemoryUserInfo),
    }
    f.mappings.Store(make([]uintptr, 0))
    f.reclaimCond.L = &f.mu

    if f.opts.DelayedEviction == DelayedEvictionEnabled && f.opts.UseHostMemcgPressure {
        stop, err := hostmm.NotifyCurrentMemcgPressureCallback(func() {
            f.mu.Lock()
            startedAny := f.startEvictionsLocked()
            f.mu.Unlock()
            if startedAny {
                log.Debugf("pgalloc.MemoryFile performing evictions due to memcg pressure")
            }
        }, "low")
        if err != nil {
            return nil, fmt.Errorf("failed to configure memcg pressure level notifications: %v", err)
        }
        f.stopNotifyPressure = stop
    }

    go f.runReclaim() // S/R-SAFE: f.mu

    // The Linux kernel contains an optional feature called "Integrity
    // Measurement Architecture" (IMA). If IMA is enabled, it will checksum
    // binaries the first time they are mapped PROT_EXEC. This is bad news for
    // executable pages mapped from our backing file, which can grow to
    // terabytes in (sparse) size. If IMA attempts to checksum a file that
    // large, it will allocate all of the sparse pages and quickly exhaust all
    // memory.
    //
    // Work around IMA by immediately creating a temporary PROT_EXEC mapping,
    // while the backing file is still small. IMA will ignore any future
    // mappings.
    m, _, errno := unix.Syscall6(
        unix.SYS_MMAP,
        0,
        hostarch.PageSize,
        unix.PROT_EXEC,
        unix.MAP_SHARED,
        file.Fd(),
        0)
    if errno != 0 {
        // This isn't fatal (IMA may not even be in use). Log the error, but
        // don't return it.
        log.Warningf("Failed to pre-map MemoryFile PROT_EXEC: %v", errno)
    } else {
        if _, _, errno := unix.Syscall(
            unix.SYS_MUNMAP,
            m,
            hostarch.PageSize,
            0); errno != 0 {
            panic(fmt.Sprintf("failed to unmap PROT_EXEC MemoryFile mapping: %v", errno))
        }
    }

    return f, nil
}

// Destroy releases all resources used by f.
//
// Preconditions: All pages allocated by f have been freed.
//
// Postconditions: None of f's methods may be called after Destroy.
func (f *MemoryFile) Destroy() {
    f.mu.Lock()
    defer f.mu.Unlock()
    f.destroyed = true
    f.reclaimCond.Signal()
}
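// A hedged sketch of constructing a MemoryFile. Backing it with a memfd is
// one plausible choice for illustration; the real caller (the Sentry) makes
// its own arrangements. Per the doc comment above, ownership of file passes
// to the MemoryFile only on success, and Destroy releases it.
package main

import (
    "os"

    "golang.org/x/sys/unix"
    "gvisor.dev/gvisor/pkg/sentry/pgalloc"
)

func main() {
    fd, err := unix.MemfdCreate("memory-file-demo", 0)
    if err != nil {
        panic(err)
    }
    file := os.NewFile(uintptr(fd), "memory-file-demo")

    // The zero MemoryFileOpts selects DelayedEvictionDefault, which
    // NewMemoryFile upgrades to DelayedEvictionEnabled.
    mf, err := pgalloc.NewMemoryFile(file, pgalloc.MemoryFileOpts{})
    if err != nil {
        // Ownership was not transferred; we still own file.
        file.Close()
        panic(err)
    }
    // Destroy requires that all allocated pages have been freed; nothing was
    // allocated here, so tearing down immediately is safe.
    defer mf.Destroy()

    _ = mf.TotalSize() // initially zero: the file starts empty
}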
// Allocate returns a range of initially-zeroed pages of the given length with
// the given accounting kind and a single reference held by the caller. When
// the last reference on an allocated page is released, ownership of the page
// is returned to the MemoryFile, allowing it to be returned by a future call
// to Allocate.
//
// Preconditions: length must be page-aligned and non-zero.
func (f *MemoryFile) Allocate(length uint64, kind usage.MemoryKind) (memmap.FileRange, error) {
    if length == 0 || length%hostarch.PageSize != 0 {
        panic(fmt.Sprintf("invalid allocation length: %#x", length))
    }

    f.mu.Lock()
    defer f.mu.Unlock()

    // Align hugepage-and-larger allocations on hugepage boundaries to try
    // to take advantage of hugetmpfs.
    alignment := uint64(hostarch.PageSize)
    if length >= hostarch.HugePageSize {
        alignment = hostarch.HugePageSize
    }

    // Find a range in the underlying file.
    fr, ok := findAvailableRange(&f.usage, f.fileSize, length, alignment)
    if !ok {
        return memmap.FileRange{}, syserror.ENOMEM
    }

    // Expand the file if needed.
    if int64(fr.End) > f.fileSize {
        // Round the new file size up to be chunk-aligned.
        newFileSize := (int64(fr.End) + chunkMask) &^ chunkMask
        if err := f.file.Truncate(newFileSize); err != nil {
            return memmap.FileRange{}, err
        }
        f.fileSize = newFileSize
        f.mappingsMu.Lock()
        oldMappings := f.mappings.Load().([]uintptr)
        newMappings := make([]uintptr, newFileSize>>chunkShift)
        copy(newMappings, oldMappings)
        f.mappings.Store(newMappings)
        f.mappingsMu.Unlock()
    }

    if f.opts.ManualZeroing {
        if err := f.manuallyZero(fr); err != nil {
            return memmap.FileRange{}, err
        }
    }

    // Mark selected pages as in use.
    if !f.usage.Add(fr, usageInfo{
        kind: kind,
        refs: 1,
    }) {
        panic(fmt.Sprintf("allocating %v: failed to insert into usage set:\n%v", fr, &f.usage))
    }

    return fr, nil
}

// findAvailableRange returns an available range in the usageSet.
//
// Note that the scan for available slots proceeds from the end of the file
// backward, and only then forward (by growing the file). This heuristic has
// important consequences for how sequential mappings can be merged in the
// host VMAs, given that addresses for both application and sentry mappings
// are allocated top-down (from higher to lower addresses). The file is also
// grown exponentially in order to create space for mappings to be allocated
// downwards.
//
// Precondition: alignment must be a power of 2.
func findAvailableRange(usage *usageSet, fileSize int64, length, alignment uint64) (memmap.FileRange, bool) {
    alignmentMask := alignment - 1

    // Search for space in existing gaps, starting at the current end of the
    // file and working backward.
    lastGap := usage.LastGap()
    gap := lastGap
    for {
        end := gap.End()
        if end > uint64(fileSize) {
            end = uint64(fileSize)
        }

        // Try to allocate from the end of this gap, with the start of the
        // allocated range aligned down to alignment.
        unalignedStart := end - length
        if unalignedStart > end {
            // Negative overflow: this and all preceding gaps are too small to
            // accommodate length.
            break
        }
        if start := unalignedStart &^ alignmentMask; start >= gap.Start() {
            return memmap.FileRange{start, start + length}, true
        }

        gap = gap.PrevLargeEnoughGap(length)
        if !gap.Ok() {
            break
        }
    }

    // Check that it's possible to fit this allocation at the end of a file of any size.
    min := lastGap.Start()
    min = (min + alignmentMask) &^ alignmentMask
    if min+length < min {
        // Overflow: allocation would exceed the range of uint64.
        return memmap.FileRange{}, false
    }

    // Determine the minimum file size required to fit this allocation at its end.
    for {
        newFileSize := 2 * fileSize
        if newFileSize <= fileSize {
            if fileSize != 0 {
                // Overflow: allocation would exceed the range of int64.
                return memmap.FileRange{}, false
            }
            newFileSize = chunkSize
        }
        fileSize = newFileSize

        unalignedStart := uint64(fileSize) - length
        if unalignedStart > uint64(fileSize) {
            // Negative overflow: fileSize is still inadequate.
            continue
        }
        if start := unalignedStart &^ alignmentMask; start >= min {
            return memmap.FileRange{start, start + length}, true
        }
    }
}
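// A standalone check (with hypothetical numbers) of the "allocate from the
// top of a gap, aligned down" arithmetic used by findAvailableRange above:
// a gap ending at 0x10000, a 0x3000-byte allocation, and 0x4000-byte (16 KB)
// alignment.
package main

import "fmt"

func main() {
    gapEnd := uint64(0x10000)
    length := uint64(0x3000)
    alignment := uint64(0x4000)
    alignmentMask := alignment - 1

    unalignedStart := gapEnd - length        // 0xd000
    start := unalignedStart &^ alignmentMask // aligned down to 0xc000

    fmt.Printf("[%#x, %#x)\n", start, start+length) // [0xc000, 0xf000)
}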
// AllocateAndFill allocates memory of the given kind and fills it by calling
// r.ReadToBlocks() repeatedly until either length bytes are read or a non-nil
// error is returned. It returns the memory filled by r, truncated down to the
// nearest page. If this is shorter than length bytes due to an error returned
// by r.ReadToBlocks(), it returns that error.
//
// Preconditions:
// * length > 0.
// * length must be page-aligned.
func (f *MemoryFile) AllocateAndFill(length uint64, kind usage.MemoryKind, r safemem.Reader) (memmap.FileRange, error) {
    fr, err := f.Allocate(length, kind)
    if err != nil {
        return memmap.FileRange{}, err
    }
    dsts, err := f.MapInternal(fr, hostarch.Write)
    if err != nil {
        f.DecRef(fr)
        return memmap.FileRange{}, err
    }
    n, err := safemem.ReadFullToBlocks(r, dsts)
    un := uint64(hostarch.Addr(n).RoundDown())
    if un < length {
        // Free unused memory and update fr to contain only the memory that is
        // still allocated.
        f.DecRef(memmap.FileRange{fr.Start + un, fr.End})
        fr.End = fr.Start + un
    }
    return fr, err
}

// fallocate(2) modes, defined in Linux's include/uapi/linux/falloc.h.
const (
    _FALLOC_FL_KEEP_SIZE  = 1
    _FALLOC_FL_PUNCH_HOLE = 2
)

// Decommit releases resources associated with maintaining the contents of the
// given pages. If Decommit succeeds, future accesses of the decommitted pages
// will read zeroes.
//
// Preconditions: fr.Length() > 0.
func (f *MemoryFile) Decommit(fr memmap.FileRange) error {
    if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
        panic(fmt.Sprintf("invalid range: %v", fr))
    }

    if f.opts.ManualZeroing {
        // FALLOC_FL_PUNCH_HOLE may not zero pages if ManualZeroing is in
        // effect.
        if err := f.manuallyZero(fr); err != nil {
            return err
        }
    } else {
        if err := f.decommitFile(fr); err != nil {
            return err
        }
    }

    f.markDecommitted(fr)
    return nil
}

func (f *MemoryFile) manuallyZero(fr memmap.FileRange) error {
    return f.forEachMappingSlice(fr, func(bs []byte) {
        for i := range bs {
            bs[i] = 0
        }
    })
}

func (f *MemoryFile) decommitFile(fr memmap.FileRange) error {
    // "After a successful call, subsequent reads from this range will
    // return zeroes. The FALLOC_FL_PUNCH_HOLE flag must be ORed with
    // FALLOC_FL_KEEP_SIZE in mode ..." - fallocate(2)
    return unix.Fallocate(
        int(f.file.Fd()),
        _FALLOC_FL_PUNCH_HOLE|_FALLOC_FL_KEEP_SIZE,
        int64(fr.Start),
        int64(fr.Length()))
}

func (f *MemoryFile) markDecommitted(fr memmap.FileRange) {
    f.mu.Lock()
    defer f.mu.Unlock()
    // Since we're changing the knownCommitted attribute, we need to merge
    // across the entire range to ensure that the usage tree is minimal.
    gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) {
        val := seg.ValuePtr()
        if val.knownCommitted {
            // Drop the usageExpected appropriately.
            amount := seg.Range().Length()
            usage.MemoryAccounting.Dec(amount, val.kind)
            f.usageExpected -= amount
            val.knownCommitted = false
        }
    })
    if gap.Ok() {
        panic(fmt.Sprintf("Decommit(%v): attempted to decommit unallocated pages %v:\n%v", fr, gap.Range(), &f.usage))
    }
    f.usage.MergeRange(fr)
}

// IncRef implements memmap.File.IncRef.
func (f *MemoryFile) IncRef(fr memmap.FileRange) {
    if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
        panic(fmt.Sprintf("invalid range: %v", fr))
    }

    f.mu.Lock()
    defer f.mu.Unlock()

    gap := f.usage.ApplyContiguous(fr, func(seg usageIterator) {
        seg.ValuePtr().refs++
    })
    if gap.Ok() {
        panic(fmt.Sprintf("IncRef(%v): attempted to IncRef on unallocated pages %v:\n%v", fr, gap.Range(), &f.usage))
    }

    f.usage.MergeAdjacent(fr)
}
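// An illustrative host-level example of the decommit primitive used by
// decommitFile above: punching a hole with fallocate(2) frees the backing
// pages, and subsequent reads of the hole return zeroes. This is standalone
// demo code against a temporary file, not gVisor code.
package main

import (
    "fmt"
    "os"

    "golang.org/x/sys/unix"
)

func main() {
    f, err := os.CreateTemp("", "punch-demo")
    if err != nil {
        panic(err)
    }
    defer os.Remove(f.Name())
    defer f.Close()

    // Commit one page by writing non-zero bytes to it.
    page := make([]byte, 4096)
    for i := range page {
        page[i] = 0xff
    }
    if _, err := f.WriteAt(page, 0); err != nil {
        panic(err)
    }

    // FALLOC_FL_PUNCH_HOLE must be ORed with FALLOC_FL_KEEP_SIZE.
    if err := unix.Fallocate(int(f.Fd()),
        unix.FALLOC_FL_PUNCH_HOLE|unix.FALLOC_FL_KEEP_SIZE, 0, 4096); err != nil {
        panic(err)
    }

    // The hole now reads back as zeroes.
    buf := make([]byte, 4)
    if _, err := f.ReadAt(buf, 0); err != nil {
        panic(err)
    }
    fmt.Println(buf) // [0 0 0 0]
}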
// DecRef implements memmap.File.DecRef.
func (f *MemoryFile) DecRef(fr memmap.FileRange) {
    if !fr.WellFormed() || fr.Length() == 0 || fr.Start%hostarch.PageSize != 0 || fr.End%hostarch.PageSize != 0 {
        panic(fmt.Sprintf("invalid range: %v", fr))
    }

    var freed bool

    f.mu.Lock()
    defer f.mu.Unlock()

    for seg := f.usage.FindSegment(fr.Start); seg.Ok() && seg.Start() < fr.End; seg = seg.NextSegment() {
        seg = f.usage.Isolate(seg, fr)
        val := seg.ValuePtr()
        if val.refs == 0 {
            panic(fmt.Sprintf("DecRef(%v): 0 existing references on %v:\n%v", fr, seg.Range(), &f.usage))
        }
        val.refs--
        if val.refs == 0 {
            f.reclaim.Add(seg.Range(), reclaimSetValue{})
            freed = true
            // Reclassify memory as System, until it's freed by the reclaim
            // goroutine.
            if val.knownCommitted {
                usage.MemoryAccounting.Move(seg.Range().Length(), usage.System, val.kind)
            }
            val.kind = usage.System
        }
    }
    f.usage.MergeAdjacent(fr)

    if freed {
        f.reclaimable = true
        f.reclaimCond.Signal()
    }
}

// MapInternal implements memmap.File.MapInternal.
func (f *MemoryFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) {
    if !fr.WellFormed() || fr.Length() == 0 {
        panic(fmt.Sprintf("invalid range: %v", fr))
    }
    if at.Execute {
        return safemem.BlockSeq{}, linuxerr.EACCES
    }

    chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift)
    if chunks == 1 {
        // Avoid an unnecessary slice allocation.
        var seq safemem.BlockSeq
        err := f.forEachMappingSlice(fr, func(bs []byte) {
            seq = safemem.BlockSeqOf(safemem.BlockFromSafeSlice(bs))
        })
        return seq, err
    }
    blocks := make([]safemem.Block, 0, chunks)
    err := f.forEachMappingSlice(fr, func(bs []byte) {
        blocks = append(blocks, safemem.BlockFromSafeSlice(bs))
    })
    return safemem.BlockSeqFromSlice(blocks), err
}

// forEachMappingSlice invokes fn on a sequence of byte slices that
// collectively map all bytes in fr.
func (f *MemoryFile) forEachMappingSlice(fr memmap.FileRange, fn func([]byte)) error {
    mappings := f.mappings.Load().([]uintptr)
    for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize {
        chunk := int(chunkStart >> chunkShift)
        m := atomic.LoadUintptr(&mappings[chunk])
        if m == 0 {
            var err error
            mappings, m, err = f.getChunkMapping(chunk)
            if err != nil {
                return err
            }
        }
        startOff := uint64(0)
        if chunkStart < fr.Start {
            startOff = fr.Start - chunkStart
        }
        endOff := uint64(chunkSize)
        if chunkStart+chunkSize > fr.End {
            endOff = fr.End - chunkStart
        }
        fn(unsafeSlice(m, chunkSize)[startOff:endOff])
    }
    return nil
}

func (f *MemoryFile) getChunkMapping(chunk int) ([]uintptr, uintptr, error) {
    f.mappingsMu.Lock()
    defer f.mappingsMu.Unlock()
    // Another thread may have replaced f.mappings altogether due to file
    // expansion.
    mappings := f.mappings.Load().([]uintptr)
    // Another thread may have already mapped the chunk.
    if m := mappings[chunk]; m != 0 {
        return mappings, m, nil
    }
    m, _, errno := unix.Syscall6(
        unix.SYS_MMAP,
        0,
        chunkSize,
        unix.PROT_READ|unix.PROT_WRITE,
        unix.MAP_SHARED,
        f.file.Fd(),
        uintptr(chunk<<chunkShift))
    if errno != 0 {
        return nil, 0, errno
    }
    atomic.StoreUintptr(&mappings[chunk], m)
    return mappings, m, nil
}
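// forEachMappingSlice and getChunkMapping above form a double-checked
// pattern: a lock-free atomic read on the fast path, and a locked re-check
// before mapping on the slow path. A distilled, standalone sketch of that
// idiom with hypothetical types (chunkCache is invented for illustration):
package main

import (
    "fmt"
    "sync"
    "sync/atomic"
)

type chunkCache struct {
    mu       sync.Mutex
    mappings atomic.Value // holds a []uintptr
}

func (c *chunkCache) get(chunk int, mapChunk func() uintptr) uintptr {
    // Fast path: no lock, just an atomic load of the published slice entry.
    if m := atomic.LoadUintptr(&c.mappings.Load().([]uintptr)[chunk]); m != 0 {
        return m
    }
    // Slow path: take the lock and re-check, since another goroutine may
    // have mapped the chunk (or replaced the slice) in the meantime.
    c.mu.Lock()
    defer c.mu.Unlock()
    mappings := c.mappings.Load().([]uintptr)
    if m := mappings[chunk]; m != 0 {
        return m
    }
    m := mapChunk()
    atomic.StoreUintptr(&mappings[chunk], m)
    return m
}

func main() {
    c := &chunkCache{}
    c.mappings.Store(make([]uintptr, 4))
    fmt.Printf("%#x\n", c.get(1, func() uintptr { return 0xdead000 }))
    // The second lookup hits the fast path; mapChunk is not called again.
    fmt.Printf("%#x\n", c.get(1, func() uintptr { panic("already mapped") }))
}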
// MarkEvictable allows f to request memory deallocation by calling
// user.Evict(er) in the future.
//
// Redundantly marking an already-evictable range as evictable has no effect.
func (f *MemoryFile) MarkEvictable(user EvictableMemoryUser, er EvictableRange) {
    f.mu.Lock()
    defer f.mu.Unlock()
    info, ok := f.evictable[user]
    if !ok {
        info = &evictableMemoryUserInfo{}
        f.evictable[user] = info
    }
    gap := info.ranges.LowerBoundGap(er.Start)
    for gap.Ok() && gap.Start() < er.End {
        gapER := gap.Range().Intersect(er)
        if gapER.Length() == 0 {
            gap = gap.NextGap()
            continue
        }
        gap = info.ranges.Insert(gap, gapER, evictableRangeSetValue{}).NextGap()
    }
    if !info.evicting {
        switch f.opts.DelayedEviction {
        case DelayedEvictionDisabled:
            // Kick off eviction immediately.
            f.startEvictionGoroutineLocked(user, info)
        case DelayedEvictionEnabled:
            if !f.opts.UseHostMemcgPressure {
                // Ensure that the reclaimer goroutine is running, so that it
                // can start eviction when necessary.
                f.reclaimCond.Signal()
            }
        }
    }
}

// MarkUnevictable informs f that user no longer considers er to be evictable,
// so the MemoryFile should no longer call user.Evict(er). Note that, per
// EvictableMemoryUser.Evict's documentation, user.Evict(er) may still be
// called even after MarkUnevictable returns due to race conditions, and
// implementations of EvictableMemoryUser must handle this possibility.
//
// Redundantly marking an already-unevictable range as unevictable has no
// effect.
func (f *MemoryFile) MarkUnevictable(user EvictableMemoryUser, er EvictableRange) {
    f.mu.Lock()
    defer f.mu.Unlock()
    info, ok := f.evictable[user]
    if !ok {
        return
    }
    seg := info.ranges.LowerBoundSegment(er.Start)
    for seg.Ok() && seg.Start() < er.End {
        seg = info.ranges.Isolate(seg, er)
        seg = info.ranges.Remove(seg).NextSegment()
    }
    // We can only remove info if there's no eviction goroutine running on its
    // behalf.
    if !info.evicting && info.ranges.IsEmpty() {
        delete(f.evictable, user)
    }
}

// MarkAllUnevictable informs f that user no longer considers any offsets to be
// evictable. It otherwise has the same semantics as MarkUnevictable.
func (f *MemoryFile) MarkAllUnevictable(user EvictableMemoryUser) {
    f.mu.Lock()
    defer f.mu.Unlock()
    info, ok := f.evictable[user]
    if !ok {
        return
    }
    info.ranges.RemoveAll()
    // We can only remove info if there's no eviction goroutine running on its
    // behalf.
    if !info.evicting {
        delete(f.evictable, user)
    }
}

// ShouldCacheEvictable returns true if f is meaningfully delaying evictions of
// evictable memory, such that it may be advantageous to cache data in
// evictable memory. The value returned by ShouldCacheEvictable may change
// between calls.
func (f *MemoryFile) ShouldCacheEvictable() bool {
    return f.opts.DelayedEviction == DelayedEvictionManual || f.opts.UseHostMemcgPressure
}
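// UpdateUsage (below) asks the host which pages are resident in order to
// detect commitment. A hedged, standalone sketch of that query using
// mincore(2), one result byte per page with bit 0 set if the page is in
// core; this demo uses an anonymous mapping rather than the backing file.
package main

import (
    "fmt"

    "golang.org/x/sys/unix"
)

func main() {
    const pageSize = 4096
    // Map two pages; touch only the first.
    b, err := unix.Mmap(-1, 0, 2*pageSize,
        unix.PROT_READ|unix.PROT_WRITE, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS)
    if err != nil {
        panic(err)
    }
    defer unix.Munmap(b)
    b[0] = 1 // commit the first page

    vec := make([]byte, 2) // one byte per page
    if err := unix.Mincore(b, vec); err != nil {
        panic(err)
    }
    fmt.Println(vec[0]&1, vec[1]&1) // typically: 1 0
}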
// UpdateUsage ensures that the memory usage statistics in
// usage.MemoryAccounting are up to date.
func (f *MemoryFile) UpdateUsage() error {
    f.mu.Lock()
    defer f.mu.Unlock()

    // If the underlying usage matches what the usage tree already
    // represents, then we can just avoid the entire scan (we know it's
    // accurate).
    currentUsage, err := f.TotalUsage()
    if err != nil {
        return err
    }
    if currentUsage == f.usageExpected && f.usageSwapped == 0 {
        log.Debugf("UpdateUsage: skipped with usageSwapped=0.")
        return nil
    }
    // If the current usage matches the expected but there's swap
    // accounting, then ensure a scan takes place at least every second
    // (when requested).
    if currentUsage == f.usageExpected+f.usageSwapped && time.Now().Before(f.usageLast.Add(time.Second)) {
        log.Debugf("UpdateUsage: skipped with usageSwapped!=0.")
        return nil
    }
    // Linux updates usage values at CONFIG_HZ.
    if scanningAfter := time.Now().Sub(f.usageLast).Milliseconds(); scanningAfter < time.Second.Milliseconds()/linux.CLOCKS_PER_SEC {
        log.Debugf("UpdateUsage: skipped because previous scan happened %d ms back", scanningAfter)
        return nil
    }

    f.usageLast = time.Now()
    err = f.updateUsageLocked(currentUsage, mincore)
    log.Debugf("UpdateUsage: currentUsage=%d, usageExpected=%d, usageSwapped=%d.",
        currentUsage, f.usageExpected, f.usageSwapped)
    log.Debugf("UpdateUsage: took %v.", time.Since(f.usageLast))
    return err
}

// updateUsageLocked attempts to detect commitment of previously-uncommitted
// pages by invoking checkCommitted, which is a function that, for each page i
// in bs, sets committed[i] to 1 if the page is committed and 0 otherwise.
//
// Precondition: f.mu must be held; it may be unlocked and reacquired.
// +checklocks:f.mu
func (f *MemoryFile) updateUsageLocked(currentUsage uint64, checkCommitted func(bs []byte, committed []byte) error) error {
    // Track if anything changed to elide the merge. In the common case, we
    // expect all segments to be committed and no merge to occur.
    changedAny := false
    defer func() {
        if changedAny {
            f.usage.MergeAll()
        }

        // Adjust the swap usage to reflect reality.
        if f.usageExpected < currentUsage {
            // Since no pages may be marked decommitted while we hold mu, we
            // know that usage may have only increased since we got the last
            // current usage. Therefore, if usageExpected is still short of
            // currentUsage, we must assume that the difference is in pages
            // that have been swapped.
            newUsageSwapped := currentUsage - f.usageExpected
            if f.usageSwapped < newUsageSwapped {
                usage.MemoryAccounting.Inc(newUsageSwapped-f.usageSwapped, usage.System)
            } else {
                usage.MemoryAccounting.Dec(f.usageSwapped-newUsageSwapped, usage.System)
            }
            f.usageSwapped = newUsageSwapped
        } else if f.usageSwapped != 0 {
            // We have more usage accounted for than the file itself.
            // That's fine, we probably caught a race where pages were
            // being committed while the below loop was running. Just
            // report the higher number that we found and ignore swap.
            usage.MemoryAccounting.Dec(f.usageSwapped, usage.System)
            f.usageSwapped = 0
        }
    }()

    // Reused mincore buffer, will generally be <= 4096 bytes.
    var buf []byte

    // Iterate over all usage data. There will only be usage segments
    // present when there is an associated reference.
    for seg := f.usage.FirstSegment(); seg.Ok(); {
        if !seg.ValuePtr().canCommit() {
            seg = seg.NextSegment()
            continue
        }

        // Get the range for this segment. As we touch slices, the
        // Start value will be walked along.
        r := seg.Range()

        var checkErr error
        err := f.forEachMappingSlice(r, func(s []byte) {
            if checkErr != nil {
                return
            }

            // Ensure that we have sufficient buffer for the call
            // (one byte per page). The length of each slice must
            // be page-aligned.
            bufLen := len(s) / hostarch.PageSize
            if len(buf) < bufLen {
                buf = make([]byte, bufLen)
            }

            // Query for new pages in core.
            // NOTE(b/165896008): mincore (which is passed as checkCommitted)
            // by f.UpdateUsage() might take a really long time. So unlock f.mu
            // while checkCommitted runs.
            f.mu.Unlock() // +checklocksforce
            err := checkCommitted(s, buf)
            f.mu.Lock()
            if err != nil {
                checkErr = err
                return
            }

            // Scan each page and switch out segments.
            seg := f.usage.LowerBoundSegment(r.Start)
            for i := 0; i < bufLen; {
                if buf[i]&0x1 == 0 {
                    i++
                    continue
                }
                // Scan to the end of this committed range.
j := i + 1 for ; j < bufLen; j++ { if buf[j]&0x1 == 0 { break } } committedFR := memmap.FileRange{ Start: r.Start + uint64(i*hostarch.PageSize), End: r.Start + uint64(j*hostarch.PageSize), } // Advance seg to committedFR.Start. for seg.Ok() && seg.End() < committedFR.Start { seg = seg.NextSegment() } // Mark pages overlapping committedFR as committed. for seg.Ok() && seg.Start() < committedFR.End { if seg.ValuePtr().canCommit() { seg = f.usage.Isolate(seg, committedFR) seg.ValuePtr().knownCommitted = true amount := seg.Range().Length() usage.MemoryAccounting.Inc(amount, seg.ValuePtr().kind) f.usageExpected += amount changedAny = true } seg = seg.NextSegment() } // Continue scanning for committed pages. i = j + 1 } // Advance r.Start. r.Start += uint64(len(s)) }) if checkErr != nil { return checkErr } if err != nil { return err } // Continue with the first segment after r.End. seg = f.usage.LowerBoundSegment(r.End) } return nil } // TotalUsage returns an aggregate usage for all memory statistics except // Mapped (which is external to MemoryFile). This is generally much cheaper // than UpdateUsage, but will not provide a fine-grained breakdown. func (f *MemoryFile) TotalUsage() (uint64, error) { // Stat the underlying file to discover the underlying usage. stat(2) // always reports the allocated block count in units of 512 bytes. This // includes pages in the page cache and swapped pages. var stat unix.Stat_t if err := unix.Fstat(int(f.file.Fd()), &stat); err != nil { return 0, err } return uint64(stat.Blocks * 512), nil } // TotalSize returns the current size of the backing file in bytes, which is an // upper bound on the amount of memory that can currently be allocated from the // MemoryFile. The value returned by TotalSize is permitted to change. func (f *MemoryFile) TotalSize() uint64 { f.mu.Lock() defer f.mu.Unlock() return uint64(f.fileSize) } // File returns the backing file. func (f *MemoryFile) File() *os.File { return f.file } // FD implements memmap.File.FD. func (f *MemoryFile) FD() int { return int(f.file.Fd()) } // String implements fmt.Stringer.String. // // Note that because f.String locks f.mu, calling f.String internally // (including indirectly through the fmt package) risks recursive locking. // Within the pgalloc package, use f.usage directly instead. func (f *MemoryFile) String() string { f.mu.Lock() defer f.mu.Unlock() return f.usage.String() } // runReclaim implements the reclaimer goroutine, which continuously decommits // reclaimable pages in order to reduce memory usage and make them available // for allocation. func (f *MemoryFile) runReclaim() { for { // N.B. We must call f.markReclaimed on the returned FrameRange. fr, ok := f.findReclaimable() if !ok { break } if f.opts.ManualZeroing { // If ManualZeroing is in effect, only hugepage-aligned regions may // be safely passed to decommitFile. Pages will be zeroed on // reallocation, so we don't need to perform any manual zeroing // here, whether or not decommitFile succeeds. if startAddr, ok := hostarch.Addr(fr.Start).HugeRoundUp(); ok { if endAddr := hostarch.Addr(fr.End).HugeRoundDown(); startAddr < endAddr { decommitFR := memmap.FileRange{uint64(startAddr), uint64(endAddr)} if err := f.decommitFile(decommitFR); err != nil { log.Warningf("Reclaim failed to decommit %v: %v", decommitFR, err) } } } } else { if err := f.decommitFile(fr); err != nil { log.Warningf("Reclaim failed to decommit %v: %v", fr, err) // Zero the pages manually. 
This won't reduce memory usage, but at // least ensures that the pages will be zero when reallocated. if err := f.manuallyZero(fr); err != nil { panic(fmt.Sprintf("Reclaim failed to decommit or zero %v: %v", fr, err)) } } } f.markDecommitted(fr) f.markReclaimed(fr) } // We only get here if findReclaimable finds f.destroyed set and returns // false. f.mu.Lock() if !f.destroyed { f.mu.Unlock() panic("findReclaimable broke out of reclaim loop, but destroyed is no longer set") } f.file.Close() // Ensure that any attempts to use f.file.Fd() fail instead of getting a fd // that has possibly been reassigned. f.file = nil f.mappingsMu.Lock() defer f.mappingsMu.Unlock() mappings := f.mappings.Load().([]uintptr) for i, m := range mappings { if m != 0 { _, _, errno := unix.Syscall(unix.SYS_MUNMAP, m, chunkSize, 0) if errno != 0 { log.Warningf("Failed to unmap mapping %#x for MemoryFile chunk %d: %v", m, i, errno) } } } // Similarly, invalidate f.mappings. (atomic.Value.Store(nil) panics.) f.mappings.Store([]uintptr{}) f.mu.Unlock() // This must be called without holding f.mu to avoid circular lock // ordering. if f.stopNotifyPressure != nil { f.stopNotifyPressure() } } // findReclaimable finds memory that has been marked for reclaim. // // Note that there returned range will be removed from tracking. It // must be reclaimed (removed from f.usage) at this point. func (f *MemoryFile) findReclaimable() (memmap.FileRange, bool) { f.mu.Lock() defer f.mu.Unlock() for { for { if f.destroyed { return memmap.FileRange{}, false } if f.reclaimable { break } if f.opts.DelayedEviction == DelayedEvictionEnabled && !f.opts.UseHostMemcgPressure { // No work to do. Evict any pending evictable allocations to // get more reclaimable pages before going to sleep. f.startEvictionsLocked() } f.reclaimCond.Wait() } // Allocate works from the back of the file inwards, so reclaim // preserves this order to minimize the cost of the search. if seg := f.reclaim.LastSegment(); seg.Ok() { fr := seg.Range() f.reclaim.Remove(seg) return fr, true } // Nothing is reclaimable. f.reclaimable = false } } func (f *MemoryFile) markReclaimed(fr memmap.FileRange) { f.mu.Lock() defer f.mu.Unlock() seg := f.usage.FindSegment(fr.Start) // All of fr should be mapped to a single uncommitted reclaimable // segment accounted to System. if !seg.Ok() { panic(fmt.Sprintf("reclaimed pages %v include unreferenced pages:\n%v", fr, &f.usage)) } if !seg.Range().IsSupersetOf(fr) { panic(fmt.Sprintf("reclaimed pages %v are not entirely contained in segment %v with state %v:\n%v", fr, seg.Range(), seg.Value(), &f.usage)) } if got, want := seg.Value(), (usageInfo{ kind: usage.System, knownCommitted: false, refs: 0, }); got != want { panic(fmt.Sprintf("reclaimed pages %v in segment %v has incorrect state %v, wanted %v:\n%v", fr, seg.Range(), got, want, &f.usage)) } // Deallocate reclaimed pages. Even though all of seg is reclaimable, // the caller of markReclaimed may not have decommitted it, so we can // only mark fr as reclaimed. f.usage.Remove(f.usage.Isolate(seg, fr)) } // StartEvictions requests that f evict all evictable allocations. It does not // wait for eviction to complete; for this, see MemoryFile.WaitForEvictions. func (f *MemoryFile) StartEvictions() { f.mu.Lock() defer f.mu.Unlock() f.startEvictionsLocked() } // Preconditions: f.mu must be locked. func (f *MemoryFile) startEvictionsLocked() bool { startedAny := false for user, info := range f.evictable { // Don't start multiple goroutines to evict the same user's // allocations. 
if !info.evicting { f.startEvictionGoroutineLocked(user, info) startedAny = true } } return startedAny } // Preconditions: // * info == f.evictable[user]. // * !info.evicting. // * f.mu must be locked. func (f *MemoryFile) startEvictionGoroutineLocked(user EvictableMemoryUser, info *evictableMemoryUserInfo) { info.evicting = true f.evictionWG.Add(1) go func() { // S/R-SAFE: f.evictionWG defer f.evictionWG.Done() for { f.mu.Lock() info, ok := f.evictable[user] if !ok { // This shouldn't happen: only this goroutine is permitted // to delete this entry. f.mu.Unlock() panic(fmt.Sprintf("evictableMemoryUserInfo for EvictableMemoryUser %v deleted while eviction goroutine running", user)) } if info.ranges.IsEmpty() { delete(f.evictable, user) f.mu.Unlock() return } // Evict from the end of info.ranges, under the assumption that // if ranges in user start being used again (and are // consequently marked unevictable), such uses are more likely // to start from the beginning of user. seg := info.ranges.LastSegment() er := seg.Range() info.ranges.Remove(seg) // user.Evict() must be called without holding f.mu to avoid // circular lock ordering. f.mu.Unlock() user.Evict(context.Background(), er) } }() } // WaitForEvictions blocks until f is no longer evicting any evictable // allocations. func (f *MemoryFile) WaitForEvictions() { f.evictionWG.Wait() } type usageSetFunctions struct{} func (usageSetFunctions) MinKey() uint64 { return 0 } func (usageSetFunctions) MaxKey() uint64 { return math.MaxUint64 } func (usageSetFunctions) ClearValue(val *usageInfo) { } func (usageSetFunctions) Merge(_ memmap.FileRange, val1 usageInfo, _ memmap.FileRange, val2 usageInfo) (usageInfo, bool) { return val1, val1 == val2 } func (usageSetFunctions) Split(_ memmap.FileRange, val usageInfo, _ uint64) (usageInfo, usageInfo) { return val, val } // evictableRangeSetValue is the value type of evictableRangeSet. type evictableRangeSetValue struct{} type evictableRangeSetFunctions struct{} func (evictableRangeSetFunctions) MinKey() uint64 { return 0 } func (evictableRangeSetFunctions) MaxKey() uint64 { return math.MaxUint64 } func (evictableRangeSetFunctions) ClearValue(val *evictableRangeSetValue) { } func (evictableRangeSetFunctions) Merge(_ EvictableRange, _ evictableRangeSetValue, _ EvictableRange, _ evictableRangeSetValue) (evictableRangeSetValue, bool) { return evictableRangeSetValue{}, true } func (evictableRangeSetFunctions) Split(_ EvictableRange, _ evictableRangeSetValue, _ uint64) (evictableRangeSetValue, evictableRangeSetValue) { return evictableRangeSetValue{}, evictableRangeSetValue{} } // reclaimSetValue is the value type of reclaimSet. type reclaimSetValue struct{} type reclaimSetFunctions struct{} func (reclaimSetFunctions) MinKey() uint64 { return 0 } func (reclaimSetFunctions) MaxKey() uint64 { return math.MaxUint64 } func (reclaimSetFunctions) ClearValue(val *reclaimSetValue) { } func (reclaimSetFunctions) Merge(_ memmap.FileRange, _ reclaimSetValue, _ memmap.FileRange, _ reclaimSetValue) (reclaimSetValue, bool) { return reclaimSetValue{}, true } func (reclaimSetFunctions) Split(_ memmap.FileRange, _ reclaimSetValue, _ uint64) (reclaimSetValue, reclaimSetValue) { return reclaimSetValue{}, reclaimSetValue{} }
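// Illustrative sketch (editorial; the type below is hypothetical and not part
// of gVisor): a minimal EvictableMemoryUser interacting with the
// MarkEvictable/MarkUnevictable API above. The only assumed contract is what
// this file documents: Evict may be called concurrently and may race with
// MarkUnevictable, so it must tolerate ranges that are no longer cached.
//
//	type exampleCache struct {
//		mu sync.Mutex
//		mf *MemoryFile
//	}
//
//	// Evict implements EvictableMemoryUser.Evict.
//	func (c *exampleCache) Evict(ctx context.Context, er EvictableRange) {
//		c.mu.Lock()
//		defer c.mu.Unlock()
//		// Drop cached data in [er.Start, er.End). A racing
//		// MarkUnevictable means the range may already be gone; that must
//		// be a harmless no-op.
//	}
//
//	func (c *exampleCache) insert(er EvictableRange) {
//		// ...populate the cache for er...
//		c.mf.MarkEvictable(c, er) // er is now eligible for eviction
//	}
//
//	func (c *exampleCache) pin(er EvictableRange) {
//		c.mf.MarkUnevictable(c, er) // Evict(er) may still race; see above
//	}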
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fsutil

import (
	"fmt"
	"io"
	"math"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/usage"
)

// FileRangeSet maps offsets into a memmap.Mappable to offsets into a
// memmap.File. It is used to implement Mappables that store data in
// sparsely-allocated memory.
//
// type FileRangeSet <generated by go_generics>

// FileRangeSetFunctions implements segment.Functions for FileRangeSet.
type FileRangeSetFunctions struct{}

// MinKey implements segment.Functions.MinKey.
func (FileRangeSetFunctions) MinKey() uint64 {
	return 0
}

// MaxKey implements segment.Functions.MaxKey.
func (FileRangeSetFunctions) MaxKey() uint64 {
	return math.MaxUint64
}

// ClearValue implements segment.Functions.ClearValue.
func (FileRangeSetFunctions) ClearValue(_ *uint64) {
}

// Merge implements segment.Functions.Merge.
func (FileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) {
	if frstart1+mr1.Length() != frstart2 {
		return 0, false
	}
	return frstart1, true
}

// Split implements segment.Functions.Split.
func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) {
	return frstart, frstart + (split - mr.Start)
}

// FileRange returns the FileRange mapped by seg.
func (seg FileRangeIterator) FileRange() memmap.FileRange {
	return seg.FileRangeOf(seg.Range())
}

// FileRangeOf returns the FileRange mapped by mr.
//
// Preconditions:
// * seg.Range().IsSupersetOf(mr).
// * mr.Length() != 0.
func (seg FileRangeIterator) FileRangeOf(mr memmap.MappableRange) memmap.FileRange {
	frstart := seg.Value() + (mr.Start - seg.Start())
	return memmap.FileRange{frstart, frstart + mr.Length()}
}

// Fill attempts to ensure that all memmap.Mappable offsets in required are
// mapped to a memmap.File offset, by allocating from mf with the given
// memory usage kind and invoking readAt to store data into memory.
// (If readAt returns a successful partial read, Fill will call it repeatedly
// until all bytes have been read.) EOF is handled consistently with the
// requirements of mmap(2): bytes after EOF on the same page are zeroed; pages
// after EOF are invalid. fileSize is an upper bound on the file's size; bytes
// after fileSize will be zeroed without calling readAt.
//
// Fill may read offsets outside of required, but will never read offsets
// outside of optional. It returns a non-nil error if any error occurs, even
// if the error only affects offsets in optional, but not in required.
//
// Preconditions:
// * required.Length() > 0.
// * optional.IsSupersetOf(required).
// * required and optional must be page-aligned.
func (frs *FileRangeSet) Fill(ctx context.Context, required, optional memmap.MappableRange, fileSize uint64, mf *pgalloc.MemoryFile, kind usage.MemoryKind, readAt func(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)) error {
	gap := frs.LowerBoundGap(required.Start)
	for gap.Ok() && gap.Start() < required.End {
		if gap.Range().Length() == 0 {
			gap = gap.NextGap()
			continue
		}
		gr := gap.Range().Intersect(optional)

		// Read data into the gap.
		fr, err := mf.AllocateAndFill(gr.Length(), kind, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) {
			var done uint64
			for !dsts.IsEmpty() {
				n, err := func() (uint64, error) {
					off := gr.Start + done
					if off >= fileSize {
						return 0, io.EOF
					}
					if off+dsts.NumBytes() > fileSize {
						rd := fileSize - off
						n, err := readAt(ctx, dsts.TakeFirst64(rd), off)
						if n == rd && err == nil {
							return n, io.EOF
						}
						return n, err
					}
					return readAt(ctx, dsts, off)
				}()
				done += n
				dsts = dsts.DropFirst64(n)
				if err != nil {
					if err == io.EOF {
						// MemoryFile.AllocateAndFill truncates down to a page
						// boundary, but FileRangeSet.Fill is supposed to
						// zero-fill to the end of the page in this case.
						donepgaddr, ok := hostarch.Addr(done).RoundUp()
						if donepg := uint64(donepgaddr); ok && donepg != done {
							dsts.DropFirst64(donepg - done)
							done = donepg
							if dsts.IsEmpty() {
								return done, nil
							}
						}
					}
					return done, err
				}
			}
			return done, nil
		}))

		// Store anything we managed to read into the cache.
		if done := fr.Length(); done != 0 {
			gr.End = gr.Start + done
			gap = frs.Insert(gap, gr, fr.Start).NextGap()
		}

		if err != nil {
			return err
		}
	}
	return nil
}

// Drop removes segments for memmap.Mappable offsets in mr, freeing the
// corresponding memmap.FileRanges.
//
// Preconditions: mr must be page-aligned.
func (frs *FileRangeSet) Drop(mr memmap.MappableRange, mf *pgalloc.MemoryFile) {
	seg := frs.LowerBoundSegment(mr.Start)
	for seg.Ok() && seg.Start() < mr.End {
		seg = frs.Isolate(seg, mr)
		mf.DecRef(seg.FileRange())
		seg = frs.Remove(seg).NextSegment()
	}
}

// DropAll removes all segments, freeing the corresponding
// memmap.FileRanges.
func (frs *FileRangeSet) DropAll(mf *pgalloc.MemoryFile) {
	for seg := frs.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
		mf.DecRef(seg.FileRange())
	}
	frs.RemoveAll()
}

// Truncate updates frs to reflect Mappable truncation to the given length:
// bytes after the new EOF on the same page are zeroed, and pages after the
// new EOF are freed.
func (frs *FileRangeSet) Truncate(end uint64, mf *pgalloc.MemoryFile) {
	pgendaddr, ok := hostarch.Addr(end).RoundUp()
	if ok {
		pgend := uint64(pgendaddr)

		// Free truncated pages.
		frs.SplitAt(pgend)
		seg := frs.LowerBoundSegment(pgend)
		for seg.Ok() {
			mf.DecRef(seg.FileRange())
			seg = frs.Remove(seg).NextSegment()
		}

		if end == pgend {
			return
		}
	}

	// Here we know end < end.RoundUp(). If the new EOF lands in the middle
	// of a page that we have, zero out its contents beyond the new length.
	seg := frs.FindSegment(end)
	if seg.Ok() {
		fr := seg.FileRange()
		fr.Start += end - seg.Start()
		ims, err := mf.MapInternal(fr, hostarch.Write)
		if err != nil {
			// There's no good recourse from here. This means
			// that we can't keep cached memory consistent with
			// the new end of file. The caller may have already
			// updated the file size on their backing file system.
			//
			// We don't want to risk blindly continuing onward,
			// so in the extremely rare cases this does happen,
			// we abandon ship.
			panic(fmt.Sprintf("Failed to map %v: %v", fr, err))
		}
		if _, err := safemem.ZeroSeq(ims); err != nil {
			panic(fmt.Sprintf("Zeroing %v failed: %v", fr, err))
		}
	}
}
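// Illustrative sketch (editorial; values are made up): Merge above only
// coalesces segments whose file offsets are contiguous, so two adjacent
// mappable ranges become one segment iff the second file offset continues
// the first.
//
//	fns := FileRangeSetFunctions{}
//	mr1 := memmap.MappableRange{Start: 0, End: 0x1000}
//	mr2 := memmap.MappableRange{Start: 0x1000, End: 0x2000}
//	// 0x5000 + mr1.Length() == 0x6000, so the ranges merge, keeping the
//	// first range's file offset:
//	frstart, ok := fns.Merge(mr1, 0x5000, mr2, 0x6000) // frstart == 0x5000, ok == true
//	// A gap in file offsets prevents merging:
//	_, ok = fns.Merge(mr1, 0x5000, mr2, 0x7000) // ok == false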
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"fmt"
	"math/bits"
	"sync/atomic"
	"unsafe"

	"gvisor.dev/gvisor/pkg/gohacks"
	"gvisor.dev/gvisor/pkg/sync"
)

// mountKey represents the location at which a Mount is mounted. It is
// structurally identical to VirtualDentry, but stores its fields as
// unsafe.Pointer since mutators synchronize with VFS path traversal using
// seqcounts.
//
// This is explicitly not savable.
type mountKey struct {
	parent unsafe.Pointer // *Mount
	point  unsafe.Pointer // *Dentry
}

var (
	mountKeyHasher = sync.MapKeyHasher(map[mountKey]struct{}(nil))
	mountKeySeed   = sync.RandUintptr()
)

func (k *mountKey) hash() uintptr {
	return mountKeyHasher(gohacks.Noescape(unsafe.Pointer(k)), mountKeySeed)
}

func (mnt *Mount) parent() *Mount {
	return (*Mount)(atomic.LoadPointer(&mnt.key.parent))
}

func (mnt *Mount) point() *Dentry {
	return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
}

func (mnt *Mount) getKey() VirtualDentry {
	return VirtualDentry{
		mount:  mnt.parent(),
		dentry: mnt.point(),
	}
}

// Invariant: mnt.key.parent == nil. vd.Ok().
func (mnt *Mount) setKey(vd VirtualDentry) {
	atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount))
	atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry))
}

// mountTable maps (mount parent, mount point) pairs to mounts. It supports
// efficient concurrent lookup, even in the presence of concurrent mutators
// (provided mutation is sufficiently uncommon).
// // mountTable.Init() must be called on new mountTables before use. type mountTable struct { // mountTable is implemented as a seqcount-protected hash table that // resolves collisions with linear probing, featuring Robin Hood insertion // and backward shift deletion. These minimize probe length variance, // significantly improving the performance of linear probing at high load // factors. (mountTable doesn't use bucketing, which is the other major // technique commonly used in high-performance hash tables; the efficiency // of bucketing is largely due to SIMD lookup, and Go lacks both SIMD // intrinsics and inline assembly, limiting the performance of this // approach.) seq sync.SeqCount `state:"nosave"` // size holds both length (number of elements) and capacity (number of // slots): capacity is stored as its base-2 log (referred to as order) in // the least significant bits of size, and length is stored in the // remaining bits. Go defines bit shifts >= width of shifted unsigned // operand as shifting to 0, which differs from x86's SHL, so the Go // compiler inserts a bounds check for each bit shift unless we mask order // anyway (cf. runtime.bucketShift()), and length isn't used by lookup; // thus this bit packing gets us more bits for the length (vs. storing // length and cap in separate uint32s) for ~free. size uint64 slots unsafe.Pointer `state:"nosave"` // []mountSlot; never nil after Init } type mountSlot struct { // We don't store keys in slots; instead, we just check Mount.parent and // Mount.point directly. Any practical use of lookup will need to touch // Mounts anyway, and comparing hashes means that false positives are // extremely rare, so this isn't an extra cache line touch overall. value unsafe.Pointer // *Mount hash uintptr } const ( mtSizeOrderBits = 6 // log2 of pointer size in bits mtSizeOrderMask = (1 << mtSizeOrderBits) - 1 mtSizeOrderOne = 1 mtSizeLenLSB = mtSizeOrderBits mtSizeLenOne = 1 << mtSizeLenLSB mtSizeLenNegOne = ^uint64(mtSizeOrderMask) // uint64(-1) << mtSizeLenLSB mountSlotBytes = unsafe.Sizeof(mountSlot{}) mountKeyBytes = unsafe.Sizeof(mountKey{}) // Tuning parameters. // // Essentially every mountTable will contain at least /proc, /sys, and // /dev/shm, so there is ~no reason for mtInitCap to be < 4. mtInitOrder = 2 mtInitCap = 1 << mtInitOrder mtMaxLoadNum = 13 mtMaxLoadDen = 16 ) func init() { // We can't just define mtSizeOrderBits as follows because Go doesn't have // constexpr. if ptrBits := uint(unsafe.Sizeof(uintptr(0)) * 8); mtSizeOrderBits != bits.TrailingZeros(ptrBits) { panic(fmt.Sprintf("mtSizeOrderBits (%d) must be %d = log2 of pointer size in bits (%d)", mtSizeOrderBits, bits.TrailingZeros(ptrBits), ptrBits)) } if bits.OnesCount(uint(mountSlotBytes)) != 1 { panic(fmt.Sprintf("sizeof(mountSlotBytes) (%d) must be a power of 2 to use bit masking for wraparound", mountSlotBytes)) } if mtInitCap <= 1 { panic(fmt.Sprintf("mtInitCap (%d) must be at least 2 since mountTable methods assume that there will always be at least one empty slot", mtInitCap)) } if mtMaxLoadNum >= mtMaxLoadDen { panic(fmt.Sprintf("invalid mountTable maximum load factor (%d/%d)", mtMaxLoadNum, mtMaxLoadDen)) } } // Init must be called exactly once on each mountTable before use. 
func (mt *mountTable) Init() { mt.size = mtInitOrder mt.slots = newMountTableSlots(mtInitCap) } func newMountTableSlots(cap uintptr) unsafe.Pointer { slice := make([]mountSlot, cap, cap) hdr := (*gohacks.SliceHeader)(unsafe.Pointer(&slice)) return hdr.Data } // Lookup returns the Mount with the given parent, mounted at the given point. // If no such Mount exists, Lookup returns nil. // // Lookup may be called even if there are concurrent mutators of mt. func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount { key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)} hash := key.hash() loop: for { epoch := mt.seq.BeginRead() size := atomic.LoadUint64(&mt.size) slots := atomic.LoadPointer(&mt.slots) if !mt.seq.ReadOk(epoch) { continue } tcap := uintptr(1) << (size & mtSizeOrderMask) mask := tcap - 1 off := (hash & mask) * mountSlotBytes offmask := mask * mountSlotBytes for { // This avoids bounds checking. slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) slotValue := atomic.LoadPointer(&slot.value) slotHash := atomic.LoadUintptr(&slot.hash) if !mt.seq.ReadOk(epoch) { // The element we're looking for might have been moved into a // slot we've previously checked, so restart entirely. continue loop } if slotValue == nil { return nil } if slotHash == hash { mount := (*Mount)(slotValue) var mountKey mountKey mountKey.parent = atomic.LoadPointer(&mount.key.parent) mountKey.point = atomic.LoadPointer(&mount.key.point) if !mt.seq.ReadOk(epoch) { continue loop } if key == mountKey { return mount } } off = (off + mountSlotBytes) & offmask } } } // Range calls f on each Mount in mt. If f returns false, Range stops iteration // and returns immediately. func (mt *mountTable) Range(f func(*Mount) bool) { tcap := uintptr(1) << (mt.size & mtSizeOrderMask) slotPtr := mt.slots last := unsafe.Pointer(uintptr(mt.slots) + ((tcap - 1) * mountSlotBytes)) for { slot := (*mountSlot)(slotPtr) if slot.value != nil { if !f((*Mount)(slot.value)) { return } } if slotPtr == last { return } slotPtr = unsafe.Pointer(uintptr(slotPtr) + mountSlotBytes) } } // Insert inserts the given mount into mt. // // Preconditions: mt must not already contain a Mount with the same mount point // and parent. func (mt *mountTable) Insert(mount *Mount) { mt.seq.BeginWrite() mt.insertSeqed(mount) mt.seq.EndWrite() } // insertSeqed inserts the given mount into mt. // // Preconditions: // * mt.seq must be in a writer critical section. // * mt must not already contain a Mount with the same mount point and parent. func (mt *mountTable) insertSeqed(mount *Mount) { hash := mount.key.hash() // We're under the maximum load factor if: // // (len+1) / cap <= mtMaxLoadNum / mtMaxLoadDen // (len+1) * mtMaxLoadDen <= mtMaxLoadNum * cap tlen := mt.size >> mtSizeLenLSB order := mt.size & mtSizeOrderMask tcap := uintptr(1) << order if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) { // Atomically insert the new element into the table. atomic.AddUint64(&mt.size, mtSizeLenOne) mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash) return } // Otherwise, we have to expand. Double the number of slots in the new // table. newOrder := order + 1 if newOrder > mtSizeOrderMask { panic("mount table size overflow") } newCap := uintptr(1) << newOrder newSlots := newMountTableSlots(newCap) // Copy existing elements to the new table. oldCur := mt.slots // Go does not permit pointers to the end of allocated objects, so we // must use a pointer to the last element of the old table. 
The // following expression is equivalent to // `slots+(cap-1)*mountSlotBytes` but has a critical path length of 2 // arithmetic instructions instead of 3. oldLast := unsafe.Pointer((uintptr(mt.slots) - mountSlotBytes) + (tcap * mountSlotBytes)) for { oldSlot := (*mountSlot)(oldCur) if oldSlot.value != nil { mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash) } if oldCur == oldLast { break } oldCur = unsafe.Pointer(uintptr(oldCur) + mountSlotBytes) } // Insert the new element into the new table. mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash) // Switch to the new table. atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne) atomic.StorePointer(&mt.slots, newSlots) } // Preconditions: // * There are no concurrent mutators of the table (slots, cap). // * If the table is visible to readers, then mt.seq must be in a writer // critical section. // * cap must be a power of 2. func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, hash uintptr) { mask := cap - 1 off := (hash & mask) * mountSlotBytes offmask := mask * mountSlotBytes disp := uintptr(0) for { slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) slotValue := slot.value if slotValue == nil { atomic.StorePointer(&slot.value, value) atomic.StoreUintptr(&slot.hash, hash) return } // If we've been displaced farther from our first-probed slot than the // element stored in this one, swap elements and switch to inserting // the replaced one. (This is Robin Hood insertion.) slotHash := slot.hash slotDisp := ((off / mountSlotBytes) - slotHash) & mask if disp > slotDisp { atomic.StorePointer(&slot.value, value) atomic.StoreUintptr(&slot.hash, hash) value = slotValue hash = slotHash disp = slotDisp } off = (off + mountSlotBytes) & offmask disp++ } } // Remove removes the given mount from mt. // // Preconditions: mt must contain mount. func (mt *mountTable) Remove(mount *Mount) { mt.seq.BeginWrite() mt.removeSeqed(mount) mt.seq.EndWrite() } // removeSeqed removes the given mount from mt. // // Preconditions: // * mt.seq must be in a writer critical section. // * mt must contain mount. func (mt *mountTable) removeSeqed(mount *Mount) { hash := mount.key.hash() tcap := uintptr(1) << (mt.size & mtSizeOrderMask) mask := tcap - 1 slots := mt.slots off := (hash & mask) * mountSlotBytes offmask := mask * mountSlotBytes for { slot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + off)) slotValue := slot.value if slotValue == unsafe.Pointer(mount) { // Found the element to remove. Move all subsequent elements // backward until we either find an empty slot, or an element that // is already in its first-probed slot. (This is backward shift // deletion.) for { nextOff := (off + mountSlotBytes) & offmask nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff)) nextSlotValue := nextSlot.value if nextSlotValue == nil { break } nextSlotHash := nextSlot.hash if (nextOff / mountSlotBytes) == (nextSlotHash & mask) { break } atomic.StorePointer(&slot.value, nextSlotValue) atomic.StoreUintptr(&slot.hash, nextSlotHash) off = nextOff slot = nextSlot } atomic.StorePointer(&slot.value, nil) atomic.AddUint64(&mt.size, mtSizeLenNegOne) return } if checkInvariants && slotValue == nil { panic(fmt.Sprintf("mountTable.Remove() called on missing Mount %v", mount)) } off = (off + mountSlotBytes) & offmask } }
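// Illustrative sketch (editorial; unpackSize is hypothetical, not part of the
// real API): how mountTable.size packs both fields using the constants
// defined above, with the capacity order in the low mtSizeOrderBits bits and
// the length in the remaining bits.
//
//	func unpackSize(size uint64) (length uint64, tcap uintptr) {
//		order := size & mtSizeOrderMask
//		return size >> mtSizeLenLSB, uintptr(1) << order
//	}
//
// For example, a table holding 3 mounts in 8 slots (order 3) has
// size == 3<<mtSizeLenLSB | 3. Insertion bumps the length with a single
// atomic.AddUint64(&mt.size, mtSizeLenOne); doubling the table adds
// mtSizeLenOne|mtSizeOrderOne, exactly as insertSeqed does above.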
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package netstack

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
)

// TUNFlagsToLinux converts a tun.Flags to Linux TUN flags.
func TUNFlagsToLinux(flags tun.Flags) uint16 {
	ret := uint16(linux.IFF_NOFILTER)
	if flags.TAP {
		ret |= linux.IFF_TAP
	}
	if flags.TUN {
		ret |= linux.IFF_TUN
	}
	if flags.NoPacketInfo {
		ret |= linux.IFF_NO_PI
	}
	return ret
}

// LinuxToTUNFlags converts Linux TUN flags to a tun.Flags.
func LinuxToTUNFlags(flags uint16) (tun.Flags, error) {
	// Linux adds IFF_NOFILTER (the same value as IFF_NO_PI unfortunately)
	// when there is no sk_filter. See __tun_chr_ioctl() in
	// drivers/net/tun.c.
	if flags&^uint16(linux.IFF_TUN|linux.IFF_TAP|linux.IFF_NO_PI|linux.IFF_ONE_QUEUE) != 0 {
		return tun.Flags{}, linuxerr.EINVAL
	}
	return tun.Flags{
		TUN:          flags&linux.IFF_TUN != 0,
		TAP:          flags&linux.IFF_TAP != 0,
		NoPacketInfo: flags&linux.IFF_NO_PI != 0,
	}, nil
}
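// Illustrative sketch (editorial; flag values are made up): round-tripping
// through the two converters above. Note that IFF_NOFILTER shares its value
// with IFF_NO_PI, as LinuxToTUNFlags's comment explains.
//
//	raw := TUNFlagsToLinux(tun.Flags{TAP: true})
//	// raw == IFF_NOFILTER|IFF_TAP.
//	flags, err := LinuxToTUNFlags(uint16(linux.IFF_TUN | linux.IFF_NO_PI))
//	// flags == tun.Flags{TUN: true, NoPacketInfo: true}, err == nil.
//	_, err = LinuxToTUNFlags(1 << 15) // a bit outside the accepted mask: EINVAL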
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hostarch

import (
	"fmt"
)

// Addr represents a generic virtual address.
//
// +stateify savable
type Addr uintptr

// AddLength adds the given length to start and returns the result. ok is true
// iff adding the length did not overflow the range of Addr.
//
// Note: This function is usually used to get the end of an address range
// defined by its start address and length. Since the resulting end is
// exclusive, end == 0 is technically valid, and corresponds to a range that
// extends to the end of the address space, but ok will be false. This isn't
// expected to ever come up in practice.
func (v Addr) AddLength(length uint64) (end Addr, ok bool) {
	end = v + Addr(length)
	// The second half of the following check is needed in case uintptr is
	// smaller than 64 bits.
	ok = end >= v && length <= uint64(^Addr(0))
	return
}

// RoundDown returns the address rounded down to the nearest page boundary.
func (v Addr) RoundDown() Addr {
	return v & ^Addr(PageSize-1)
}

// RoundUp returns the address rounded up to the nearest page boundary. ok is
// true iff rounding up did not wrap around.
func (v Addr) RoundUp() (addr Addr, ok bool) {
	addr = Addr(v + PageSize - 1).RoundDown()
	ok = addr >= v
	return
}

// MustRoundUp is equivalent to RoundUp, but panics if rounding up wraps
// around.
func (v Addr) MustRoundUp() Addr {
	addr, ok := v.RoundUp()
	if !ok {
		panic(fmt.Sprintf("hostarch.Addr(%d).RoundUp() wraps", v))
	}
	return addr
}

// HugeRoundDown returns the address rounded down to the nearest huge page
// boundary.
func (v Addr) HugeRoundDown() Addr {
	return v & ^Addr(HugePageSize-1)
}

// HugeRoundUp returns the address rounded up to the nearest huge page boundary.
// ok is true iff rounding up did not wrap around.
func (v Addr) HugeRoundUp() (addr Addr, ok bool) {
	addr = Addr(v + HugePageSize - 1).HugeRoundDown()
	ok = addr >= v
	return
}

// PageOffset returns the offset of v into the current page.
func (v Addr) PageOffset() uint64 {
	return uint64(v & Addr(PageSize-1))
}

// IsPageAligned returns true if v.PageOffset() == 0.
func (v Addr) IsPageAligned() bool {
	return v.PageOffset() == 0
}

// AddrRange is a range of Addrs.
//
// type AddrRange <generated by go_generics>

// ToRange returns [v, v+length).
func (v Addr) ToRange(length uint64) (AddrRange, bool) {
	end, ok := v.AddLength(length)
	return AddrRange{v, end}, ok
}

// IsPageAligned returns true if ar.Start.IsPageAligned() and
// ar.End.IsPageAligned().
func (ar AddrRange) IsPageAligned() bool { return ar.Start.IsPageAligned() && ar.End.IsPageAligned() } // String implements fmt.Stringer.String. func (ar AddrRange) String() string { return fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End) } // PageRoundDown/Up are equivalent to Addr.RoundDown/Up, but without the // potentially truncating conversion from uint64 to Addr. This is necessary // because there is no way to define generic "PageRoundDown/Up" functions in Go. // PageRoundDown returns x rounded down to the nearest page boundary. func PageRoundDown(x uint64) uint64 { return x &^ (PageSize - 1) } // PageRoundUp returns x rounded up to the nearest page boundary. // ok is true iff rounding up did not wrap around. func PageRoundUp(x uint64) (addr uint64, ok bool) { addr = PageRoundDown(x + PageSize - 1) ok = addr >= x return }
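// Illustrative sketch (editorial; addresses are made up and PageSize is
// assumed to be 4096): the ok results above exist because address arithmetic
// can wrap around the top of the address space.
//
//	end, ok := Addr(0x1000).AddLength(0x2000)  // end == 0x3000, ok == true
//	_, ok = Addr(0x1000).AddLength(^uint64(0)) // wraps: ok == false
//	_, ok = (^Addr(0)).RoundUp()               // wraps to 0: ok == false
//	up, ok2 := PageRoundUp(uint64(0x1001))     // up == 0x2000, ok2 == true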
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package time defines the Timer type, which provides a periodic timer that // works by sampling a user-provided clock. package time import ( "fmt" "math" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) // Events that may be generated by a Clock. const ( // ClockEventSet occurs when a Clock undergoes a discontinuous change. ClockEventSet waiter.EventMask = 1 << iota // ClockEventRateIncrease occurs when the rate at which a Clock advances // increases significantly, such that values returned by previous calls to // Clock.WallTimeUntil may be too large. ClockEventRateIncrease ) // Time represents an instant in time with nanosecond precision. // // Time may represent time with respect to any clock and may not have any // meaning in the real world. // // +stateify savable type Time struct { ns int64 } var ( // MinTime is the zero time instant, the lowest possible time that can // be represented by Time. MinTime = Time{ns: math.MinInt64} // MaxTime is the highest possible time that can be represented by // Time. MaxTime = Time{ns: math.MaxInt64} // ZeroTime represents the zero time in an unspecified Clock's domain. ZeroTime = Time{ns: 0} ) const ( // MinDuration is the minimum duration representable by time.Duration. MinDuration = time.Duration(math.MinInt64) // MaxDuration is the maximum duration representable by time.Duration. MaxDuration = time.Duration(math.MaxInt64) ) // FromNanoseconds returns a Time representing the point ns nanoseconds after // an unspecified Clock's zero time. func FromNanoseconds(ns int64) Time { return Time{ns} } // FromSeconds returns a Time representing the point s seconds after an // unspecified Clock's zero time. func FromSeconds(s int64) Time { if s > math.MaxInt64/time.Second.Nanoseconds() { return MaxTime } return Time{s * 1e9} } // FromUnix converts from Unix seconds and nanoseconds to Time, assuming a real // time Unix clock domain. func FromUnix(s int64, ns int64) Time { if s > math.MaxInt64/time.Second.Nanoseconds() { return MaxTime } t := s * 1e9 if t > math.MaxInt64-ns { return MaxTime } return Time{t + ns} } // FromTimespec converts from Linux Timespec to Time. func FromTimespec(ts linux.Timespec) Time { return Time{ts.ToNsecCapped()} } // FromTimeval converts a Linux Timeval to Time. func FromTimeval(tv linux.Timeval) Time { return Time{tv.ToNsecCapped()} } // Nanoseconds returns nanoseconds elapsed since the zero time in t's Clock // domain. If t represents walltime, this is nanoseconds since the Unix epoch. func (t Time) Nanoseconds() int64 { return t.ns } // Seconds returns seconds elapsed since the zero time in t's Clock domain. If // t represents walltime, this is seconds since Unix epoch. func (t Time) Seconds() int64 { return t.Nanoseconds() / time.Second.Nanoseconds() } // Timespec converts Time to a Linux timespec. func (t Time) Timespec() linux.Timespec { return linux.NsecToTimespec(t.Nanoseconds()) } // Unix returns the (seconds, nanoseconds) representation of t such that // seconds*1e9 + nanoseconds = t. func (t Time) Unix() (s int64, ns int64) { s = t.ns / 1e9 ns = t.ns % 1e9 return } // TimeT converts Time to a Linux time_t. func (t Time) TimeT() linux.TimeT { return linux.NsecToTimeT(t.Nanoseconds()) } // Timeval converts Time to a Linux timeval. func (t Time) Timeval() linux.Timeval { return linux.NsecToTimeval(t.Nanoseconds()) } // StatxTimestamp converts Time to a Linux statx_timestamp. 
func (t Time) StatxTimestamp() linux.StatxTimestamp { return linux.NsecToStatxTimestamp(t.Nanoseconds()) } // Add adds the duration of d to t. func (t Time) Add(d time.Duration) Time { if t.ns > 0 && d.Nanoseconds() > math.MaxInt64-int64(t.ns) { return MaxTime } if t.ns < 0 && d.Nanoseconds() < math.MinInt64-int64(t.ns) { return MinTime } return Time{int64(t.ns) + d.Nanoseconds()} } // AddTime adds the duration of u to t. func (t Time) AddTime(u Time) Time { return t.Add(time.Duration(u.ns)) } // Equal reports whether the two times represent the same instant in time. func (t Time) Equal(u Time) bool { return t.ns == u.ns } // Before reports whether the instant t is before the instant u. func (t Time) Before(u Time) bool { return t.ns < u.ns } // After reports whether the instant t is after the instant u. func (t Time) After(u Time) bool { return t.ns > u.ns } // Sub returns the duration of t - u. // // N.B. This measure may not make sense for every Time returned by ktime.Clock. // Callers who need wall time duration can use ktime.Clock.WallTimeUntil to // estimate that wall time. func (t Time) Sub(u Time) time.Duration { dur := time.Duration(int64(t.ns)-int64(u.ns)) * time.Nanosecond switch { case u.Add(dur).Equal(t): return dur case t.Before(u): return MinDuration default: return MaxDuration } } // IsMin returns whether t represents the lowest possible time instant. func (t Time) IsMin() bool { return t == MinTime } // IsZero returns whether t represents the zero time instant in t's Clock domain. func (t Time) IsZero() bool { return t == ZeroTime } // String returns the time represented in nanoseconds as a string. func (t Time) String() string { return fmt.Sprintf("%dns", t.Nanoseconds()) } // A Clock is an abstract time source. type Clock interface { // Now returns the current time in nanoseconds according to the Clock. Now() Time // WallTimeUntil returns the estimated wall time until Now will return a // value greater than or equal to t, given that a recent call to Now // returned now. If t has already passed, WallTimeUntil may return 0 or a // negative value. // // WallTimeUntil must be abstract to support Clocks that do not represent // wall time (e.g. thread group execution timers). Clocks that represent // wall times may embed the WallRateClock type to obtain an appropriate // trivial implementation of WallTimeUntil. // // WallTimeUntil is used to determine when associated Timers should next // check for expirations. Returning too small a value may result in // spurious Timer goroutine wakeups, while returning too large a value may // result in late expirations. Implementations should usually err on the // side of underestimating. WallTimeUntil(t, now Time) time.Duration // Waitable methods may be used to subscribe to Clock events. Waiters will // not be preserved by Save and must be re-established during restore. // // Since Clock events are transient, implementations of // waiter.Waitable.Readiness should return 0. waiter.Waitable } // WallRateClock implements Clock.WallTimeUntil for Clocks that elapse at the // same rate as wall time. type WallRateClock struct{} // WallTimeUntil implements Clock.WallTimeUntil. func (*WallRateClock) WallTimeUntil(t, now Time) time.Duration { return t.Sub(now) } // NoClockEvents implements waiter.Waitable for Clocks that do not generate // events. type NoClockEvents struct{} // Readiness implements waiter.Waitable.Readiness. 
func (*NoClockEvents) Readiness(mask waiter.EventMask) waiter.EventMask { return 0 } // EventRegister implements waiter.Waitable.EventRegister. func (*NoClockEvents) EventRegister(e *waiter.Entry, mask waiter.EventMask) { } // EventUnregister implements waiter.Waitable.EventUnregister. func (*NoClockEvents) EventUnregister(e *waiter.Entry) { } // ClockEventsQueue implements waiter.Waitable by wrapping waiter.Queue and // defining waiter.Waitable.Readiness as required by Clock. type ClockEventsQueue struct { waiter.Queue } // Readiness implements waiter.Waitable.Readiness. func (*ClockEventsQueue) Readiness(mask waiter.EventMask) waiter.EventMask { return 0 } // A TimerListener receives expirations from a Timer. type TimerListener interface { // Notify is called when its associated Timer expires. exp is the number of // expirations. setting is the next timer Setting. // // Notify is called with the associated Timer's mutex locked, so Notify // must not take any locks that precede Timer.mu in lock order. // // If Notify returns true, the timer will use the returned setting // rather than the passed one. // // Preconditions: exp > 0. Notify(exp uint64, setting Setting) (newSetting Setting, update bool) // Destroy is called when the timer is destroyed. Destroy() } // Setting contains user-controlled mutable Timer properties. // // +stateify savable type Setting struct { // Enabled is true if the timer is running. Enabled bool // Next is the time in nanoseconds of the next expiration. Next Time // Period is the time in nanoseconds between expirations. If Period is // zero, the timer will not automatically restart after expiring. // // Invariant: Period >= 0. Period time.Duration } // SettingFromSpec converts a (value, interval) pair to a Setting based on a // reading from c. value is interpreted as a time relative to c.Now(). func SettingFromSpec(value time.Duration, interval time.Duration, c Clock) (Setting, error) { return SettingFromSpecAt(value, interval, c.Now()) } // SettingFromSpecAt converts a (value, interval) pair to a Setting. value is // interpreted as a time relative to now. func SettingFromSpecAt(value time.Duration, interval time.Duration, now Time) (Setting, error) { if value < 0 { return Setting{}, linuxerr.EINVAL } if value == 0 { return Setting{Period: interval}, nil } return Setting{ Enabled: true, Next: now.Add(value), Period: interval, }, nil } // SettingFromAbsSpec converts a (value, interval) pair to a Setting. value is // interpreted as an absolute time. func SettingFromAbsSpec(value Time, interval time.Duration) (Setting, error) { if value.Before(ZeroTime) { return Setting{}, linuxerr.EINVAL } if value.IsZero() { return Setting{Period: interval}, nil } return Setting{ Enabled: true, Next: value, Period: interval, }, nil } // SettingFromItimerspec converts a linux.Itimerspec to a Setting. If abs is // true, its.Value is interpreted as an absolute time. Otherwise, it is // interpreted as a time relative to c.Now(). func SettingFromItimerspec(its linux.Itimerspec, abs bool, c Clock) (Setting, error) { if abs { return SettingFromAbsSpec(FromTimespec(its.Value), its.Interval.ToDuration()) } return SettingFromSpec(its.Value.ToDuration(), its.Interval.ToDuration(), c) } // SpecFromSetting converts a timestamp and a Setting to a (relative value, // interval) pair, as used by most Linux syscalls that return a struct // itimerval or struct itimerspec. 
func SpecFromSetting(now Time, s Setting) (value, period time.Duration) { if !s.Enabled { return 0, s.Period } return s.Next.Sub(now), s.Period } // ItimerspecFromSetting converts a Setting to a linux.Itimerspec. func ItimerspecFromSetting(now Time, s Setting) linux.Itimerspec { val, iv := SpecFromSetting(now, s) return linux.Itimerspec{ Interval: linux.DurationToTimespec(iv), Value: linux.DurationToTimespec(val), } } // At returns an updated Setting and a number of expirations after the // associated Clock indicates a time of now. // // Settings may be created by successive calls to At with decreasing // values of now (i.e. time may appear to go backward). Supporting this is // required to support non-monotonic clocks, as well as allowing // Timer.clock.Now() to be called without holding Timer.mu. func (s Setting) At(now Time) (Setting, uint64) { if !s.Enabled { return s, 0 } if s.Next.After(now) { return s, 0 } if s.Period == 0 { s.Enabled = false return s, 1 } exp := 1 + uint64(now.Sub(s.Next).Nanoseconds())/uint64(s.Period) s.Next = s.Next.Add(time.Duration(uint64(s.Period) * exp)) return s, exp } // Timer is an optionally-periodic timer driven by sampling a user-specified // Clock. Timer's semantics support the requirements of Linux's interval timers // (setitimer(2), timer_create(2), timerfd_create(2)). // // Timers should be created using NewTimer and must be cleaned up by calling // Timer.Destroy when no longer used. // // +stateify savable type Timer struct { // clock is the time source. clock is immutable. clock Clock // listener is notified of expirations. listener is immutable. listener TimerListener // mu protects the following mutable fields. mu sync.Mutex `state:"nosave"` // setting is the timer setting. setting is protected by mu. setting Setting // paused is true if the Timer is paused. paused is protected by mu. paused bool // kicker is used to wake the Timer goroutine. The kicker pointer is // immutable, but its state is protected by mu. kicker *time.Timer `state:"nosave"` // entry is registered with clock.EventRegister. entry is immutable. // // Per comment in Clock, entry must be re-registered after restore; per // comment in Timer.Load, this is done in Timer.Resume. entry waiter.Entry `state:"nosave"` // events is the channel that will be notified whenever entry receives an // event. It is also closed by Timer.Destroy to instruct the Timer // goroutine to exit. events chan struct{} `state:"nosave"` } // timerTickEvents are Clock events that require the Timer goroutine to Tick // prematurely. const timerTickEvents = ClockEventSet | ClockEventRateIncrease // NewTimer returns a new Timer that will obtain time from clock and send // expirations to listener. The Timer is initially stopped and has no first // expiration or period configured. func NewTimer(clock Clock, listener TimerListener) *Timer { t := &Timer{ clock: clock, listener: listener, } t.init() return t } // init initializes Timer state that is not preserved across save/restore. If // init has already been called, calling it again is a no-op. // // Preconditions: t.mu must be locked, or the caller must have exclusive access // to t. func (t *Timer) init() { if t.kicker != nil { return } // If t.kicker is nil, the Timer goroutine can't be running, so we can't // race with it. t.kicker = time.NewTimer(0) t.entry, t.events = waiter.NewChannelEntry(nil) t.clock.EventRegister(&t.entry, timerTickEvents) go t.runGoroutine() // S/R-SAFE: synchronized by t.mu } // Destroy releases resources owned by the Timer. 
A Destroyed Timer must not be // used again; in particular, a Destroyed Timer should not be Saved. func (t *Timer) Destroy() { // Stop the Timer, ensuring that the Timer goroutine will not call // t.kicker.Reset, before calling t.kicker.Stop. t.mu.Lock() t.setting.Enabled = false t.mu.Unlock() t.kicker.Stop() // Unregister t.entry, ensuring that the Clock will not send to t.events, // before closing t.events to instruct the Timer goroutine to exit. t.clock.EventUnregister(&t.entry) close(t.events) t.listener.Destroy() } func (t *Timer) runGoroutine() { for { select { case <-t.kicker.C: case _, ok := <-t.events: if !ok { // Channel closed by Destroy. return } } t.Tick() } } // Tick requests that the Timer immediately check for expirations and // re-evaluate when it should next check for expirations. func (t *Timer) Tick() { now := t.clock.Now() t.mu.Lock() defer t.mu.Unlock() if t.paused { return } s, exp := t.setting.At(now) t.setting = s if exp > 0 { if newS, ok := t.listener.Notify(exp, t.setting); ok { t.setting = newS } } t.resetKickerLocked(now) } // Pause pauses the Timer, ensuring that it does not generate any further // expirations until Resume is called. If the Timer is already paused, Pause // has no effect. func (t *Timer) Pause() { t.mu.Lock() defer t.mu.Unlock() t.paused = true // t.kicker may be nil if we were restored but never resumed. if t.kicker != nil { t.kicker.Stop() } } // Resume ends the effect of Pause. If the Timer is not paused, Resume has no // effect. func (t *Timer) Resume() { t.mu.Lock() defer t.mu.Unlock() if !t.paused { return } t.paused = false // Lazily initialize the Timer. We can't call Timer.init until Timer.Resume // because save/restore will restore Timers before // kernel.Timekeeper.SetClocks() has been called, so if t.clock is backed // by a kernel.Timekeeper then the Timer goroutine will panic if it calls // t.clock.Now(). t.init() // Kick the Timer goroutine in case it was already initialized, but the // Timer goroutine was sleeping. t.kicker.Reset(0) } // Get returns a snapshot of the Timer's current Setting and the time // (according to the Timer's Clock) at which the snapshot was taken. // // Preconditions: The Timer must not be paused (since its Setting cannot // be advanced to the current time while it is paused.) func (t *Timer) Get() (Time, Setting) { now := t.clock.Now() t.mu.Lock() defer t.mu.Unlock() if t.paused { panic(fmt.Sprintf("Timer.Get called on paused Timer %p", t)) } s, exp := t.setting.At(now) t.setting = s if exp > 0 { if newS, ok := t.listener.Notify(exp, t.setting); ok { t.setting = newS } } t.resetKickerLocked(now) return now, s } // Swap atomically changes the Timer's Setting and returns the Timer's previous // Setting and the time (according to the Timer's Clock) at which the snapshot // was taken. Setting s.Enabled to true starts the Timer, while setting // s.Enabled to false stops it. // // Preconditions: The Timer must not be paused. func (t *Timer) Swap(s Setting) (Time, Setting) { return t.SwapAnd(s, nil) } // SwapAnd atomically changes the Timer's Setting, calls f if it is not nil, // and returns the Timer's previous Setting and the time (according to the // Timer's Clock) at which the Setting was changed. Setting s.Enabled to true // starts the timer, while setting s.Enabled to false stops it. // // Preconditions: // * The Timer must not be paused. // * f cannot call any Timer methods since it is called with the Timer mutex // locked. 
func (t *Timer) SwapAnd(s Setting, f func()) (Time, Setting) { now := t.clock.Now() t.mu.Lock() defer t.mu.Unlock() if t.paused { panic(fmt.Sprintf("Timer.SwapAnd called on paused Timer %p", t)) } oldS, oldExp := t.setting.At(now) if oldExp > 0 { t.listener.Notify(oldExp, oldS) // N.B. The returned Setting doesn't matter because we're about // to overwrite. } if f != nil { f() } newS, newExp := s.At(now) t.setting = newS if newExp > 0 { if newS, ok := t.listener.Notify(newExp, t.setting); ok { t.setting = newS } } t.resetKickerLocked(now) return now, oldS } // Atomically invokes f atomically with respect to expirations of t; that is, t // cannot generate expirations while f is being called. // // Preconditions: f cannot call any Timer methods since it is called with the // Timer mutex locked. func (t *Timer) Atomically(f func()) { t.mu.Lock() defer t.mu.Unlock() f() } // Preconditions: t.mu must be locked. func (t *Timer) resetKickerLocked(now Time) { if t.setting.Enabled { // Clock.WallTimeUntil may return a negative value. This is fine; // time.when treats negative Durations as 0. t.kicker.Reset(t.clock.WallTimeUntil(t.setting.Next, now)) } // We don't call t.kicker.Stop if !t.setting.Enabled because in most cases // resetKickerLocked will be called from the Timer goroutine itself, in // which case t.kicker has already fired and t.kicker.Stop will be an // expensive no-op (time.Timer.Stop => time.stopTimer => runtime.stopTimer // => runtime.deltimer). } // Clock returns the Clock used by t. func (t *Timer) Clock() Clock { return t.clock } // ChannelNotifier is a TimerListener that sends a message on an empty struct // channel. // // ChannelNotifier cannot be saved or loaded. type ChannelNotifier struct { // tchan must be a buffered channel. tchan chan struct{} } // NewChannelNotifier creates a new channel notifier. // // If the notifier is used with a timer, Timer.Destroy will close the channel // returned here. func NewChannelNotifier() (TimerListener, <-chan struct{}) { tchan := make(chan struct{}, 1) return &ChannelNotifier{tchan}, tchan } // Notify implements ktime.TimerListener.Notify. func (c *ChannelNotifier) Notify(uint64, Setting) (Setting, bool) { select { case c.tchan <- struct{}{}: default: } return Setting{}, false } // Destroy implements ktime.TimerListener.Destroy and will close the channel. func (c *ChannelNotifier) Destroy() { close(c.tchan) }
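// The expiration arithmetic in Setting.At above can be worked through on a
// concrete case. This is an illustrative sketch, not part of the package API;
// exampleSettingAt is a hypothetical helper that exists only to demonstrate
// the math.
func exampleSettingAt(t0 Time) {
	s := Setting{
		Enabled: true,
		Next:    t0,
		Period:  10 * time.Millisecond,
	}
	// 25ms after t0, expirations occurred at t0, t0+10ms, and t0+20ms:
	// exp = 1 + floor(25ms / 10ms) = 3, and Next advances by 3 periods.
	s, exp := s.At(t0.Add(25 * time.Millisecond))
	_ = exp // == 3
	_ = s   // s.Next == t0.Add(30 * time.Millisecond); s.Enabled remains true.
}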
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // waitBufMaxBytes is the maximum size of a wait buffer. It is based on // TTYB_DEFAULT_MEM_LIMIT. const waitBufMaxBytes = 131072 // queue represents one of the input or output queues between a pty master and // replica. Bytes written to a queue are added to the read buffer until it is // full, at which point they are written to the wait buffer. Bytes are // processed (i.e. undergo termios transformations) as they are added to the // read buffer. The read buffer is readable when its length is nonzero and // readable is true. // // +stateify savable type queue struct { // mu protects everything in queue. mu sync.Mutex `state:"nosave"` // readBuf is the buffer of data ready to be read when readable is true. // This data has been processed. readBuf []byte // waitBuf contains data that can't fit into readBuf. It is put here // until it can be loaded into the read buffer. waitBuf contains data // that hasn't been processed. waitBuf [][]byte waitBufLen uint64 // readable indicates whether the read buffer can be read from. In // canonical mode, there can be an unterminated line in the read buffer, // so readable must be checked. readable bool // transform is the queue's function for transforming bytes // entering the queue. For example, transform might convert all '\r's // entering the queue to '\n's. transformer } // readReadiness returns whether q is ready to be read from. func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask { q.mu.Lock() defer q.mu.Unlock() if len(q.readBuf) > 0 && q.readable { return waiter.ReadableEvents } return waiter.EventMask(0) } // writeReadiness returns whether q is ready to be written to.
func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask { q.mu.Lock() defer q.mu.Unlock() if q.waitBufLen < waitBufMaxBytes { return waiter.WritableEvents } return waiter.EventMask(0) } // readableSize writes the number of readable bytes to userspace. func (q *queue) readableSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { q.mu.Lock() defer q.mu.Unlock() size := primitive.Int32(0) if q.readable { size = primitive.Int32(len(q.readBuf)) } _, err := size.CopyOut(t, args[2].Pointer()) return err } // read reads from q to userspace. It returns: // - The number of bytes read // - Whether the read caused more readable data to become available (whether // data was pushed from the wait buffer to the read buffer). // - Whether any data was echoed back (need to notify readers). // // Preconditions: l.termiosMu must be held for reading. func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, bool, error) { q.mu.Lock() defer q.mu.Unlock() if !q.readable { return 0, false, false, syserror.ErrWouldBlock } if dst.NumBytes() > canonMaxBytes { dst = dst.TakeFirst(canonMaxBytes) } n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dst safemem.BlockSeq) (uint64, error) { src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(q.readBuf)) n, err := safemem.CopySeq(dst, src) if err != nil { return 0, err } q.readBuf = q.readBuf[n:] // If we read everything, this queue is no longer readable. if len(q.readBuf) == 0 { q.readable = false } return n, nil })) if err != nil { return 0, false, false, err } // Move data from the queue's wait buffer to its read buffer. nPushed, notifyEcho := q.pushWaitBufLocked(l) return int64(n), nPushed > 0, notifyEcho, nil } // write writes to q from userspace. // The returned boolean indicates whether any data was echoed back. // // Preconditions: l.termiosMu must be held for reading. func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, bool, error) { q.mu.Lock() defer q.mu.Unlock() // Copy data into the wait buffer. n, err := src.CopyInTo(ctx, safemem.WriterFunc(func(src safemem.BlockSeq) (uint64, error) { copyLen := src.NumBytes() room := waitBufMaxBytes - q.waitBufLen // If out of room, return EAGAIN. if room == 0 && copyLen > 0 { return 0, syserror.ErrWouldBlock } // Cap the size of the wait buffer. if copyLen > room { copyLen = room src = src.TakeFirst64(room) } buf := make([]byte, copyLen) // Copy the data into the wait buffer. dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)) n, err := safemem.CopySeq(dst, src) if err != nil { return 0, err } q.waitBufAppend(buf) return n, nil })) if err != nil { return 0, false, err } // Push data from the wait to the read buffer. _, notifyEcho := q.pushWaitBufLocked(l) return n, notifyEcho, nil } // writeBytes writes to q from b. // The returned boolean indicates whether any data was echoed back. // // Preconditions: l.termiosMu must be held for reading. func (q *queue) writeBytes(b []byte, l *lineDiscipline) bool { q.mu.Lock() defer q.mu.Unlock() // Write to the wait buffer. q.waitBufAppend(b) _, notifyEcho := q.pushWaitBufLocked(l) return notifyEcho } // pushWaitBufLocked fills the queue's read buffer with data from the wait // buffer. // The returned boolean indicates whether any data was echoed back. // // Preconditions: // * l.termiosMu must be held for reading. // * q.mu must be locked. 
func (q *queue) pushWaitBufLocked(l *lineDiscipline) (int, bool) { if q.waitBufLen == 0 { return 0, false } // Move data from the wait to the read buffer. var total int var i int var notifyEcho bool for i = 0; i < len(q.waitBuf); i++ { n, echo := q.transform(l, q, q.waitBuf[i]) total += n notifyEcho = notifyEcho || echo if n != len(q.waitBuf[i]) { // The read buffer filled up without consuming the // entire buffer. q.waitBuf[i] = q.waitBuf[i][n:] break } } // Update wait buffer based on consumed data. q.waitBuf = q.waitBuf[i:] q.waitBufLen -= uint64(total) return total, notifyEcho } // Precondition: q.mu must be locked. func (q *queue) waitBufAppend(b []byte) { q.waitBuf = append(q.waitBuf, b) q.waitBufLen += uint64(len(b)) }
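// The read/wait buffer split above can be modeled in isolation. The sketch
// below is hypothetical (twoStageBuf is not part of devpts); it mirrors the
// flow of queue.write followed by queue.pushWaitBufLocked: bytes always land
// in the wait buffer first, then move into the bounded read buffer as room
// allows, with any remainder left queued.
type twoStageBuf struct {
	readBuf []byte   // processed bytes, ready for readers
	waitBuf [][]byte // unprocessed bytes awaiting room
	max     int      // read buffer capacity, in the spirit of canonMaxBytes
}

func (b *twoStageBuf) push() {
	for len(b.waitBuf) > 0 {
		room := b.max - len(b.readBuf)
		if room <= 0 {
			return // read buffer full; the rest stays in the wait buffer
		}
		chunk := b.waitBuf[0]
		n := len(chunk)
		if n > room {
			n = room
		}
		b.readBuf = append(b.readBuf, chunk[:n]...)
		if n < len(chunk) {
			// Partially consumed, like q.waitBuf[i] = q.waitBuf[i][n:].
			b.waitBuf[0] = chunk[n:]
			return
		}
		b.waitBuf = b.waitBuf[1:]
	}
}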
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package header provides the implementation of the encoding and decoding of // network protocol headers.
package header import ( "encoding/binary" "fmt" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" ) func calculateChecksum(buf []byte, odd bool, initial uint32) (uint16, bool) { v := initial if odd { v += uint32(buf[0]) buf = buf[1:] } l := len(buf) odd = l&1 != 0 if odd { l-- v += uint32(buf[l]) << 8 } for i := 0; i < l; i += 2 { v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) } return ChecksumCombine(uint16(v), uint16(v>>16)), odd } func unrolledCalculateChecksum(buf []byte, odd bool, initial uint32) (uint16, bool) { v := initial if odd { v += uint32(buf[0]) buf = buf[1:] } l := len(buf) odd = l&1 != 0 if odd { l-- v += uint32(buf[l]) << 8 } for (l - 64) >= 0 { i := 0 v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9]) v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11]) v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13]) v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15]) i += 16 v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9]) v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11]) v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13]) v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15]) i += 16 v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9]) v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11]) v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13]) v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15]) i += 16 v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9]) v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11]) v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13]) v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15]) buf = buf[64:] l = l - 64 } if (l - 32) >= 0 { i := 0 v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9]) v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11]) v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13]) v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15]) i += 16 v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9]) v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11]) v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13]) v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15]) buf = buf[32:] l = l - 32 } if (l - 16) >= 0 { i := 0 v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9]) v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11]) v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13]) v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15]) buf 
= buf[16:] l = l - 16 } if (l - 8) >= 0 { i := 0 v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5]) v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7]) buf = buf[8:] l = l - 8 } if (l - 4) >= 0 { i := 0 v += (uint32(buf[i]) << 8) + uint32(buf[i+1]) v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3]) buf = buf[4:] l = l - 4 } // At this point, since l was even before we started unrolling, // there can be only two bytes left to add. if l != 0 { v += (uint32(buf[0]) << 8) + uint32(buf[1]) } return ChecksumCombine(uint16(v), uint16(v>>16)), odd } // ChecksumOld calculates the checksum (as defined in RFC 1071) of the bytes in // the given byte array. This function uses a non-optimized implementation. It's // only retained for reference and for use as a benchmark/test. Most code should // use the header.Checksum function. // // The initial checksum must have been computed on an even number of bytes. func ChecksumOld(buf []byte, initial uint16) uint16 { s, _ := calculateChecksum(buf, false, uint32(initial)) return s } // Checksum calculates the checksum (as defined in RFC 1071) of the bytes in the // given byte array. This function uses an optimized unrolled version of the // checksum algorithm. // // The initial checksum must have been computed on an even number of bytes. func Checksum(buf []byte, initial uint16) uint16 { s, _ := unrolledCalculateChecksum(buf, false, uint32(initial)) return s } // ChecksumVV calculates the checksum (as defined in RFC 1071) of the bytes in // the given VectorisedView. // // The initial checksum must have been computed on an even number of bytes. func ChecksumVV(vv buffer.VectorisedView, initial uint16) uint16 { var c Checksumer for _, v := range vv.Views() { c.Add([]byte(v)) } return ChecksumCombine(initial, c.Checksum()) } // Checksumer calculates the checksum defined in RFC 1071. type Checksumer struct { sum uint16 odd bool } // Add adds b to the checksum. func (c *Checksumer) Add(b []byte) { if len(b) > 0 { c.sum, c.odd = unrolledCalculateChecksum(b, c.odd, uint32(c.sum)) } } // Checksum returns the latest checksum value. func (c *Checksumer) Checksum() uint16 { return c.sum } // ChecksumCombine combines two uint16 values to form their checksum. This is done // by adding them and the carry. // // Note that checksum a must have been computed on an even number of bytes. func ChecksumCombine(a, b uint16) uint16 { v := uint32(a) + uint32(b) return uint16(v + v>>16) } // PseudoHeaderChecksum calculates the pseudo-header checksum for the given // destination protocol and network address. Pseudo-headers are needed by // transport layers when calculating their own checksum. func PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, srcAddr tcpip.Address, dstAddr tcpip.Address, totalLen uint16) uint16 { xsum := Checksum([]byte(srcAddr), 0) xsum = Checksum([]byte(dstAddr), xsum) // Add the length portion of the checksum to the pseudo-checksum. tmp := make([]byte, 2) binary.BigEndian.PutUint16(tmp, totalLen) xsum = Checksum(tmp, xsum) return Checksum([]byte{0, uint8(protocol)}, xsum) } // checksumUpdate2ByteAlignedUint16 updates a uint16 value in a calculated // checksum. // // The value MUST begin at a 2-byte boundary in the original buffer. func checksumUpdate2ByteAlignedUint16(xsum, old, new uint16) uint16 { // As per RFC 1071 page 4, // (4) Incremental Update // // ... // // To update the checksum, simply add the differences of the // sixteen bit integers that have been changed.
To see why this // works, observe that every 16-bit integer has an additive inverse // and that addition is associative. From this it follows that // given the original value m, the new value m', and the old // checksum C, the new checksum C' is: // // C' = C + (-m) + m' = C + (m' - m) return ChecksumCombine(xsum, ChecksumCombine(new, ^old)) } // checksumUpdate2ByteAlignedAddress updates an address in a calculated // checksum. // // The addresses must have the same length and must contain an even number // of bytes. The address MUST begin at a 2-byte boundary in the original buffer. func checksumUpdate2ByteAlignedAddress(xsum uint16, old, new tcpip.Address) uint16 { const uint16Bytes = 2 if len(old) != len(new) { panic(fmt.Sprintf("buffer lengths are different; old = %d, new = %d", len(old), len(new))) } if len(old)%uint16Bytes != 0 { panic(fmt.Sprintf("buffer has an odd number of bytes; got = %d", len(old))) } // As per RFC 1071 page 4, // (4) Incremental Update // // ... // // To update the checksum, simply add the differences of the // sixteen bit integers that have been changed. To see why this // works, observe that every 16-bit integer has an additive inverse // and that addition is associative. From this it follows that // given the original value m, the new value m', and the old // checksum C, the new checksum C' is: // // C' = C + (-m) + m' = C + (m' - m) for len(old) != 0 { // Convert the 2 byte sequences to uint16 values then apply the increment // update. xsum = checksumUpdate2ByteAlignedUint16(xsum, (uint16(old[0])<<8)+uint16(old[1]), (uint16(new[0])<<8)+uint16(new[1])) old = old[uint16Bytes:] new = new[uint16Bytes:] } return xsum }
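// The incremental-update identity above can be checked with the helpers in
// this file. exampleIncrementalUpdate is an illustrative sketch (not part of
// the package): for these values, updating the checksum in place agrees
// exactly with recomputing it from scratch.
func exampleIncrementalUpdate() bool {
	buf := []byte{0x12, 0x34, 0x56, 0x78}
	xsum := Checksum(buf, 0) // 0x1234 + 0x5678 = 0x68ac
	// Replace the second 16-bit word, 0x5678, with 0x9abc.
	updated := checksumUpdate2ByteAlignedUint16(xsum, 0x5678, 0x9abc)
	buf[2], buf[3] = 0x9a, 0xbc
	return updated == Checksum(buf, 0) // both are 0xacf0
}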
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "runtime" "runtime/trace" "time" "gvisor.dev/gvisor/pkg/errors/linuxerr" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) // BlockWithTimeout blocks t until an event is received from C, the application // monotonic clock indicates that timeout has elapsed (only if haveTimeout is true), // or t is interrupted. It returns: // // - The remaining timeout, which is guaranteed to be 0 if the timeout expired, // and is unspecified if haveTimeout is false. // // - An error which is nil if an event is received from C, ETIMEDOUT if the timeout // expired, and syserror.ErrInterrupted if t is interrupted. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) BlockWithTimeout(C chan struct{}, haveTimeout bool, timeout time.Duration) (time.Duration, error) { if !haveTimeout { return timeout, t.block(C, nil) } start := t.Kernel().MonotonicClock().Now() deadline := start.Add(timeout) err := t.BlockWithDeadline(C, true, deadline) // On timeout, explicitly return a remaining duration of 0. if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return 0, err } // Compute the remaining timeout. Note that even if block() above didn't // return due to a timeout, we may have used up some of the remaining time // since then. We cap the remaining timeout at 0 to make it easier to // directly use the returned duration. end := t.Kernel().MonotonicClock().Now() remainingTimeout := timeout - end.Sub(start) if remainingTimeout < 0 { remainingTimeout = 0 } return remainingTimeout, err } // BlockWithDeadline blocks t until an event is received from C, the // application monotonic clock indicates a time of deadline (only if // haveDeadline is true), or t is interrupted.
It returns nil if an event is // received from C, ETIMEDOUT if the deadline expired, and // syserror.ErrInterrupted if t is interrupted. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) BlockWithDeadline(C <-chan struct{}, haveDeadline bool, deadline ktime.Time) error { if !haveDeadline { return t.block(C, nil) } // Start the timeout timer. t.blockingTimer.Swap(ktime.Setting{ Enabled: true, Next: deadline, }) err := t.block(C, t.blockingTimerChan) // Stop the timeout timer and drain the channel. t.blockingTimer.Swap(ktime.Setting{}) select { case <-t.blockingTimerChan: default: } return err } // BlockWithTimer blocks t until an event is received from C or tchan, or t is // interrupted. It returns nil if an event is received from C, ETIMEDOUT if an // event is received from tchan, and syserror.ErrInterrupted if t is // interrupted. // // Most clients should use BlockWithDeadline or BlockWithTimeout instead. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) BlockWithTimer(C <-chan struct{}, tchan <-chan struct{}) error { return t.block(C, tchan) } // Block blocks t until an event is received from C or t is interrupted. It // returns nil if an event is received from C and syserror.ErrInterrupted if t // is interrupted. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) Block(C <-chan struct{}) error { return t.block(C, nil) } // block blocks a task on one of many events. // N.B. defer is too expensive to be used here. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) block(C <-chan struct{}, timerChan <-chan struct{}) error { // This function is very hot; skip this check outside of +race builds. if sync.RaceEnabled { t.assertTaskGoroutine() } // Fast path if the request is already done. select { case <-C: return nil default: } // Deactivate our address space; we don't need it. interrupt := t.SleepStart() // If the request is not completed, but the timer has already expired, // then ensure that we run through a scheduler cycle. This is because // we may see applications relying on timer slack to yield the thread. // For example, they may attempt to sleep for some number of nanoseconds, // and expect that this will actually yield the CPU and sleep for at // least a few microseconds, e.g.: // https://github.com/LMAX-Exchange/disruptor/commit/6ca210f2bcd23f703c479804d583718e16f43c07 if len(timerChan) > 0 { runtime.Gosched() } region := trace.StartRegion(t.traceContext, blockRegion) select { case <-C: region.End() t.SleepFinish(true) // Woken by event. return nil case <-interrupt: region.End() t.SleepFinish(false) // Return the indicated error on interrupt. return syserror.ErrInterrupted case <-timerChan: region.End() t.SleepFinish(true) // We've timed out. return linuxerr.ETIMEDOUT } } // SleepStart implements context.ChannelSleeper.SleepStart. func (t *Task) SleepStart() <-chan struct{} { t.assertTaskGoroutine() t.Deactivate() t.accountTaskGoroutineEnter(TaskGoroutineBlockedInterruptible) return t.interruptChan } // SleepFinish implements context.ChannelSleeper.SleepFinish. func (t *Task) SleepFinish(success bool) { if !success { // Our caller received from t.interruptChan; we need to re-send to it // to ensure that t.interrupted() is still true. t.interruptSelf() } t.accountTaskGoroutineLeave(TaskGoroutineBlockedInterruptible) t.Activate() } // Interrupted implements context.ChannelSleeper.Interrupted.
func (t *Task) Interrupted() bool { if t.interrupted() { return true } // Indicate that t's task goroutine is still responsive (i.e. reset the // watchdog timer). t.accountTaskGoroutineRunning() return false } // UninterruptibleSleepStart implements context.Context.UninterruptibleSleepStart. func (t *Task) UninterruptibleSleepStart(deactivate bool) { t.assertTaskGoroutine() if deactivate { t.Deactivate() } t.accountTaskGoroutineEnter(TaskGoroutineBlockedUninterruptible) } // UninterruptibleSleepFinish implements context.Context.UninterruptibleSleepFinish. func (t *Task) UninterruptibleSleepFinish(activate bool) { t.accountTaskGoroutineLeave(TaskGoroutineBlockedUninterruptible) if activate { t.Activate() } } // interrupted returns true if interrupt or interruptSelf has been called at // least once since the last call to unsetInterrupted. func (t *Task) interrupted() bool { return len(t.interruptChan) != 0 } // unsetInterrupted causes interrupted to return false until the next call to // interrupt or interruptSelf. func (t *Task) unsetInterrupted() { select { case <-t.interruptChan: default: } } // interrupt unblocks the task and interrupts it if it's currently running in // userspace. func (t *Task) interrupt() { t.interruptSelf() t.p.Interrupt() } // interruptSelf is like Interrupt, but can only be called by the task // goroutine. func (t *Task) interruptSelf() { select { case t.interruptChan <- struct{}{}: default: } // platform.Context.Interrupt() is unnecessary since a task goroutine // calling interruptSelf() cannot also be blocked in // platform.Context.Switch(). }
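// t.interruptChan is a 1-buffered channel used as a sticky flag: a
// non-blocking send sets it, a non-blocking receive clears it, and a length
// check tests it. The same idiom in isolation (a hypothetical sketch, not
// part of Task):
func exampleStickyFlag() {
	flag := make(chan struct{}, 1)
	// Set, as in interruptSelf: never blocks, stays set if already set.
	select {
	case flag <- struct{}{}:
	default:
	}
	set := len(flag) != 0 // test, as in interrupted: non-destructive
	// Clear, as in unsetInterrupted: never blocks if already clear.
	select {
	case <-flag:
	default:
	}
	_ = set
}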
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package proc import ( "bytes" "sort" "strconv" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) const ( selfName = "self" threadSelfName = "thread-self" ) // tasksInode represents the inode for the /proc/ directory. // // +stateify savable type tasksInode struct { implStatFS kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink kernfs.InodeTemporary // This holds no meaning as this inode can't be looked up and is always valid. kernfs.OrderedChildren tasksInodeRefs locks vfs.FileLocks fs *filesystem pidns *kernel.PIDNamespace // '/proc/self' and '/proc/thread-self' have custom directory offsets in // Linux, so handle them outside of OrderedChildren. // fakeCgroupControllers is a map of controller name to directory in the // cgroup hierarchy. These controllers are immutable and will be listed // in /proc/pid/cgroup if not nil.
fakeCgroupControllers map[string]string } var _ kernfs.Inode = (*tasksInode)(nil) func (fs *filesystem) newTasksInode(ctx context.Context, k *kernel.Kernel, pidns *kernel.PIDNamespace, fakeCgroupControllers map[string]string) *tasksInode { root := auth.NewRootCredentials(pidns.UserNamespace()) contents := map[string]kernfs.Inode{ "cmdline": fs.newInode(ctx, root, 0444, &cmdLineData{}), "cpuinfo": fs.newInode(ctx, root, 0444, newStaticFileSetStat(cpuInfoData(k))), "filesystems": fs.newInode(ctx, root, 0444, &filesystemsData{}), "loadavg": fs.newInode(ctx, root, 0444, &loadavgData{}), "sys": fs.newSysDir(ctx, root, k), "meminfo": fs.newInode(ctx, root, 0444, &meminfoData{}), "mounts": kernfs.NewStaticSymlink(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"), "net": kernfs.NewStaticSymlink(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"), "stat": fs.newInode(ctx, root, 0444, &statData{}), "uptime": fs.newInode(ctx, root, 0444, &uptimeData{}), "version": fs.newInode(ctx, root, 0444, &versionData{}), } // If fakeCgroupControllers are provided, don't create a cgroupfs-backed // /proc/cgroup as it will not match the fake controllers. if len(fakeCgroupControllers) == 0 { contents["cgroups"] = fs.newInode(ctx, root, 0444, &cgroupsData{}) } inode := &tasksInode{ pidns: pidns, fs: fs, fakeCgroupControllers: fakeCgroupControllers, } inode.InodeAttrs.Init(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) inode.InitRefs() inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) links := inode.OrderedChildren.Populate(contents) inode.IncLinks(links) return inode } // Lookup implements kernfs.inodeDirectory.Lookup. func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { // Check if a static entry was looked up. if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil { return d, nil } // Not a static entry. Try to look up a corresponding task. tid, err := strconv.ParseUint(name, 10, 64) if err != nil { root := auth.NewRootCredentials(i.pidns.UserNamespace()) // If it failed to parse, check if it's one of the specially handled files. switch name { case selfName: return i.newSelfSymlink(ctx, root), nil case threadSelfName: return i.newThreadSelfSymlink(ctx, root), nil } return nil, syserror.ENOENT } task := i.pidns.TaskWithID(kernel.ThreadID(tid)) if task == nil { return nil, syserror.ENOENT } return i.fs.newTaskInode(ctx, task, i.pidns, true, i.fakeCgroupControllers) } // IterDirents implements kernfs.inodeDirectory.IterDirents. func (i *tasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) { // fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256 const FIRST_PROCESS_ENTRY = 256 // Use maxTaskID to shortcut searches that will result in 0 entries. const maxTaskID = kernel.TasksLimit + 1 if offset >= maxTaskID { return offset, nil } // According to Linux (fs/proc/base.c:proc_pid_readdir()), process directories // start at offset FIRST_PROCESS_ENTRY with '/proc/self', followed by // '/proc/thread-self' and then '/proc/[pid]'.
if offset < FIRST_PROCESS_ENTRY { offset = FIRST_PROCESS_ENTRY } if offset == FIRST_PROCESS_ENTRY { dirent := vfs.Dirent{ Name: selfName, Type: linux.DT_LNK, Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { return offset, err } offset++ } if offset == FIRST_PROCESS_ENTRY+1 { dirent := vfs.Dirent{ Name: threadSelfName, Type: linux.DT_LNK, Ino: i.fs.NextIno(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { return offset, err } offset++ } // Collect all tasks whose TGIDs are greater than the specified offset. Per // Linux, we only include a thread group in directory listings if it's the // leader, even though you can still walk to any task's node. var tids []int startTid := offset - FIRST_PROCESS_ENTRY - 2 for _, tg := range i.pidns.ThreadGroups() { tid := i.pidns.IDOfThreadGroup(tg) if int64(tid) < startTid { continue } if leader := tg.Leader(); leader != nil { tids = append(tids, int(tid)) } } if len(tids) == 0 { return offset, nil } sort.Ints(tids) for _, tid := range tids { dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(tid), 10), Type: linux.DT_DIR, Ino: i.fs.NextIno(), NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1, } if err := cb.Handle(dirent); err != nil { return offset, err } offset++ } return maxTaskID, nil } // Open implements kernfs.Inode.Open. func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts) if err != nil { return linux.Statx{}, err } if opts.Mask&linux.STATX_NLINK != 0 { // Add dynamic children to link count. for _, tg := range i.pidns.ThreadGroups() { if leader := tg.Leader(); leader != nil { stat.Nlink++ } } } return stat, nil } // DecRef implements kernfs.Inode.DecRef. func (i *tasksInode) DecRef(ctx context.Context) { i.tasksInodeRefs.DecRef(func() { i.Destroy(ctx) }) } // staticFileSetStat implements a special static file that allows inode // attributes to be set. This supports /proc files that are read-only but // still allow their attributes to be set. // // +stateify savable type staticFileSetStat struct { dynamicBytesFileSetAttr vfs.StaticData } var _ dynamicInode = (*staticFileSetStat)(nil) func newStaticFileSetStat(data string) *staticFileSetStat { return &staticFileSetStat{StaticData: vfs.StaticData{Data: data}} } func cpuInfoData(k *kernel.Kernel) string { features := k.FeatureSet() if features == nil { // Kernel is always initialized with a FeatureSet. panic("cpuinfo read with nil FeatureSet") } var buf bytes.Buffer for i, max := uint(0), k.ApplicationCores(); i < max; i++ { features.WriteCPUInfoTo(i, &buf) } return buf.String() } func shmData(v uint64) dynamicInode { return newStaticFile(strconv.FormatUint(v, 10)) }
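// The directory offsets used by IterDirents above follow Linux's layout:
// offset 256 (FIRST_PROCESS_ENTRY) emits "self", 257 emits "thread-self",
// and PID entries resume from startTid = offset - 256 - 2. A hypothetical
// helper showing the NextOff that IterDirents reports for a task directory:
func exampleTaskNextOff(tid int64) int64 {
	const firstProcessEntry = 256 // fs/proc/internal.h: FIRST_PROCESS_ENTRY
	// A dirent for TID n carries NextOff = 256 + 2 + n + 1, so a resumed
	// listing skips every TGID at or below n.
	return firstProcessEntry + 2 + tid + 1
}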
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package time import ( "errors" "gvisor.dev/gvisor/pkg/log" ) const ( // maxSampleLoops is the maximum number of times to try to get a clock sample // under the expected overhead. maxSampleLoops = 5 // maxSamples is the maximum number of samples to collect. maxSamples = 10 ) // errOverheadTooHigh is returned from sampler.Sample if the syscall // overhead is too high. var errOverheadTooHigh = errors.New("time syscall overhead exceeds maximum") // TSCValue is a value from the TSC. type TSCValue int64 // Rdtsc reads the TSC. // // Intel SDM, Vol 3, Ch 17.15: // "The RDTSC instruction reads the time-stamp counter and is guaranteed to // return a monotonically increasing unique value whenever executed, except for // a 64-bit counter wraparound. Intel guarantees that the time-stamp counter // will not wraparound within 10 years after being reset." // // We use int64, so we have 5 years before wrap-around. func Rdtsc() TSCValue // ReferenceNS are nanoseconds in the reference clock domain. // int64 gives us ~290 years before this overflows. type ReferenceNS int64 // Magnitude returns the absolute value of r. func (r ReferenceNS) Magnitude() ReferenceNS { if r < 0 { return -r } return r } // cycleClock is a TSC-based cycle clock. type cycleClock interface { // Cycles returns a count value from the TSC. Cycles() TSCValue } // tscCycleClock is a cycleClock that uses the real TSC. type tscCycleClock struct{} // Cycles implements cycleClock.Cycles. func (tscCycleClock) Cycles() TSCValue { return Rdtsc() } // sample contains a sample from the reference clock, with TSC values from // before and after the reference clock value was captured. type sample struct { before TSCValue after TSCValue ref ReferenceNS } // Overhead returns the sample overhead in TSC cycles. func (s *sample) Overhead() TSCValue { return s.after - s.before } // referenceClocks collects individual samples from a reference clock ID and // TSC. type referenceClocks interface { cycleClock // Sample returns a single sample from the reference clock ID.
Sample(c ClockID) (sample, error) } // sampler collects samples from a reference system clock, minimizing // the overhead in each sample. type sampler struct { // clockID is the reference clock ID (e.g., CLOCK_MONOTONIC). clockID ClockID // clocks provides raw samples. clocks referenceClocks // overhead is the estimated sample overhead in TSC cycles. overhead TSCValue // samples is a ring buffer of the latest samples collected. samples []sample } // newSampler creates a sampler for clockID. func newSampler(c ClockID) *sampler { return &sampler{ clockID: c, clocks: syscallTSCReferenceClocks{}, overhead: defaultOverheadCycles, } } // Reset discards previously collected clock samples. func (s *sampler) Reset() { s.overhead = defaultOverheadCycles s.samples = []sample{} } // lowOverheadSample returns a reference clock sample with minimized syscall overhead. func (s *sampler) lowOverheadSample() (sample, error) { for { for i := 0; i < maxSampleLoops; i++ { samp, err := s.clocks.Sample(s.clockID) if err != nil { return sample{}, err } if samp.before > samp.after { log.Warningf("TSC went backwards: %v > %v", samp.before, samp.after) continue } if samp.Overhead() <= s.overhead { return samp, nil } } // Couldn't get a sample with the current overhead. Increase it. newOverhead := 2 * s.overhead if newOverhead > maxOverheadCycles { // We'll give it one more shot with the max overhead. if s.overhead == maxOverheadCycles { return sample{}, errOverheadTooHigh } newOverhead = maxOverheadCycles } s.overhead = newOverhead log.Debugf("Time: Adjusting syscall overhead up to %v", s.overhead) } } // Sample collects a reference clock sample. func (s *sampler) Sample() error { sample, err := s.lowOverheadSample() if err != nil { return err } s.samples = append(s.samples, sample) if len(s.samples) > maxSamples { s.samples = s.samples[1:] } // If the 4 most recent samples all have an overhead less than half the // expected overhead, adjust downwards. if len(s.samples) < 4 { return nil } for _, sample := range s.samples[len(s.samples)-4:] { if sample.Overhead() > s.overhead/2 { return nil } } s.overhead -= s.overhead / 8 log.Debugf("Time: Adjusting syscall overhead down to %v", s.overhead) return nil } // Syscall returns the current raw reference time without storing TSC // samples. func (s *sampler) Syscall() (ReferenceNS, error) { sample, err := s.clocks.Sample(s.clockID) if err != nil { return 0, err } return sample.ref, nil } // Cycles returns a raw TSC value. func (s *sampler) Cycles() TSCValue { return s.clocks.Cycles() } // Range returns the widest range of clock samples available. func (s *sampler) Range() (sample, sample, bool) { if len(s.samples) < 2 { return sample{}, sample{}, false } return s.samples[0], s.samples[len(s.samples)-1], true }
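// The overhead estimate above adapts multiplicatively: it doubles (capped at
// maxOverheadCycles) whenever maxSampleLoops consecutive samples exceed it,
// and decays by 1/8 once the four most recent samples are under half of it.
// The same policy in isolation (an illustrative sketch, not part of the
// sampler):
func exampleAdjustOverhead(overhead, maxOverhead TSCValue, tooSlow bool) TSCValue {
	if tooSlow {
		// Grow, as in lowOverheadSample: double, saturating at the cap.
		if o := 2 * overhead; o < maxOverhead {
			return o
		}
		return maxOverhead
	}
	// Shrink, as in Sample: subtract an eighth of the current estimate.
	return overhead - overhead/8
}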
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package memmap defines semantics for memory mappings. package memmap import ( "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" ) // Mappable represents a memory-mappable object, a mutable mapping from uint64 // offsets to (File, uint64 File offset) pairs. // // See mm/mm.go for Mappable's place in the lock order. // // All Mappable methods have the following preconditions: // * hostarch.AddrRanges and MappableRanges must be non-empty (Length() != 0). // * hostarch.Addrs and Mappable offsets must be page-aligned. type Mappable interface { // AddMapping notifies the Mappable of a mapping from addresses ar in ms to // offsets [offset, offset+ar.Length()) in this Mappable. // // The writable flag indicates whether the backing data for a Mappable can // be modified through the mapping. Effectively, this means a shared mapping // where Translate may be called with at.Write == true. This is a property // established at mapping creation and must remain constant throughout the // lifetime of the mapping. // // Preconditions: offset+ar.Length() does not overflow.
AddMapping(ctx context.Context, ms MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error // RemoveMapping notifies the Mappable of the removal of a mapping from // addresses ar in ms to offsets [offset, offset+ar.Length()) in this // Mappable. // // Preconditions: // * offset+ar.Length() does not overflow. // * The removed mapping must exist. writable must match the // corresponding call to AddMapping. RemoveMapping(ctx context.Context, ms MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) // CopyMapping notifies the Mappable of an attempt to copy a mapping in ms // from srcAR to dstAR. For most Mappables, this is equivalent to // AddMapping. Note that it is possible that srcAR.Length() != dstAR.Length(), // and also that srcAR.Length() == 0. // // CopyMapping is only called when a mapping is copied within a given // MappingSpace; it is analogous to Linux's vm_operations_struct::mremap. // // Preconditions: // * offset+srcAR.Length() and offset+dstAR.Length() do not overflow. // * The mapping at srcAR must exist. writable must match the // corresponding call to AddMapping. CopyMapping(ctx context.Context, ms MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error // Translate returns the Mappable's current mappings for at least the range // of offsets specified by required, and at most the range of offsets // specified by optional. at is the set of access types that may be // performed using the returned Translations. If not all required offsets // are translated, it returns a non-nil error explaining why. // // Translations are valid until invalidated by a callback to // MappingSpace.Invalidate or until the caller removes its mapping of the // translated range. Mappable implementations must ensure that at least one // reference is held on all pages in a File that may be the result // of a valid Translation. // // Preconditions: // * required.Length() > 0. // * optional.IsSupersetOf(required). // * required and optional must be page-aligned. // * The caller must have established a mapping for all of the queried // offsets via a previous call to AddMapping. // * The caller is responsible for ensuring that calls to Translate // synchronize with invalidation. // // Postconditions: See CheckTranslateResult. Translate(ctx context.Context, required, optional MappableRange, at hostarch.AccessType) ([]Translation, error) // InvalidateUnsavable requests that the Mappable invalidate Translations // that cannot be preserved across save/restore. // // Invariant: InvalidateUnsavable never races with concurrent calls to any // other Mappable methods. InvalidateUnsavable(ctx context.Context) error } // Translations are returned by Mappable.Translate. type Translation struct { // Source is the translated range in the Mappable. Source MappableRange // File is the mapped file. File File // Offset is the offset into File at which this Translation begins. Offset uint64 // Perms is the set of permissions for which platform.AddressSpace.MapFile // and platform.AddressSpace.MapInternal on this Translation is permitted. Perms hostarch.AccessType } // FileRange returns the FileRange represented by t. func (t Translation) FileRange() FileRange { return FileRange{t.Offset, t.Offset + t.Source.Length()} } // CheckTranslateResult returns an error if (ts, terr) does not satisfy all // postconditions for Mappable.Translate(required, optional, at). // // Preconditions: Same as Mappable.Translate. 
func CheckTranslateResult(required, optional MappableRange, at hostarch.AccessType, ts []Translation, terr error) error { // Verify that the inputs to Mappable.Translate were valid. if !required.WellFormed() || required.Length() == 0 { panic(fmt.Sprintf("invalid required range: %v", required)) } if !hostarch.Addr(required.Start).IsPageAligned() || !hostarch.Addr(required.End).IsPageAligned() { panic(fmt.Sprintf("unaligned required range: %v", required)) } if !optional.IsSupersetOf(required) { panic(fmt.Sprintf("optional range %v is not a superset of required range %v", optional, required)) } if !hostarch.Addr(optional.Start).IsPageAligned() || !hostarch.Addr(optional.End).IsPageAligned() { panic(fmt.Sprintf("unaligned optional range: %v", optional)) } // The first Translation must include required.Start. if len(ts) != 0 && !ts[0].Source.Contains(required.Start) { return fmt.Errorf("first Translation %+v does not cover start of required range %v", ts[0], required) } for i, t := range ts { if !t.Source.WellFormed() || t.Source.Length() == 0 { return fmt.Errorf("Translation %+v has invalid Source", t) } if !hostarch.Addr(t.Source.Start).IsPageAligned() || !hostarch.Addr(t.Source.End).IsPageAligned() { return fmt.Errorf("Translation %+v has unaligned Source", t) } if t.File == nil { return fmt.Errorf("Translation %+v has nil File", t) } if !hostarch.Addr(t.Offset).IsPageAligned() { return fmt.Errorf("Translation %+v has unaligned Offset", t) } // Translations must be contiguous and in increasing order of // Translation.Source. if i > 0 && ts[i-1].Source.End != t.Source.Start { return fmt.Errorf("Translation %+v and Translation %+v are not contiguous", ts[i-1], t) } // At least part of each Translation must be required. if t.Source.Intersect(required).Length() == 0 { return fmt.Errorf("Translation %+v lies entirely outside required range %v", t, required) } // Translations must be constrained to the optional range. if !optional.IsSupersetOf(t.Source) { return fmt.Errorf("Translation %+v lies outside optional range %v", t, optional) } // Each Translation must permit a superset of requested accesses. if !t.Perms.SupersetOf(at) { return fmt.Errorf("Translation %+v does not permit all requested access types %v", t, at) } } // If the set of Translations does not cover the entire required range, // Translate must return a non-nil error explaining why. if terr == nil { if len(ts) == 0 { return fmt.Errorf("no Translations and no error") } if t := ts[len(ts)-1]; !t.Source.Contains(required.End - 1) { return fmt.Errorf("last Translation %+v does not reach end of required range %v, but Translate returned no error", t, required) } } return nil } // BusError may be returned by implementations of Mappable.Translate for errors // that should result in SIGBUS delivery if they cause application page fault // handling to fail. type BusError struct { // Err is the original error. Err error } // Error implements error.Error. func (b *BusError) Error() string { return fmt.Sprintf("BusError: %v", b.Err.Error()) } // MappableRange represents a range of uint64 offsets into a Mappable. // // type MappableRange <generated using go_generics> // String implements fmt.Stringer.String. func (mr MappableRange) String() string { return fmt.Sprintf("[%#x, %#x)", mr.Start, mr.End) } // MappingSpace represents a mutable mapping from hostarch.Addrs to (Mappable, // uint64 offset) pairs. 
type MappingSpace interface { // Invalidate is called to notify the MappingSpace that values returned by // previous calls to Mappable.Translate for offsets mapped by addresses in // ar are no longer valid. // // Invalidate must not take any locks preceding mm.MemoryManager.activeMu // in the lock order. // // Preconditions: // * ar.Length() != 0. // * ar must be page-aligned. Invalidate(ar hostarch.AddrRange, opts InvalidateOpts) } // InvalidateOpts holds options to MappingSpace.Invalidate. type InvalidateOpts struct { // InvalidatePrivate is true if private pages in the invalidated region // should also be discarded, causing their data to be lost. InvalidatePrivate bool } // MappingIdentity controls the lifetime of a Mappable, and provides // information about the Mappable for /proc/[pid]/maps. It is distinct from // Mappable because all Mappables that are coherent must compare equal to // support the implementation of shared futexes, but different // MappingIdentities may represent the same Mappable, in the same way that // multiple fs.Files may represent the same fs.Inode. (This similarity is not // coincidental; fs.File implements MappingIdentity, and some // fs.InodeOperations implement Mappable.) type MappingIdentity interface { // IncRef increments the MappingIdentity's reference count. IncRef() // DecRef decrements the MappingIdentity's reference count. DecRef(ctx context.Context) // MappedName returns the application-visible name shown in // /proc/[pid]/maps. MappedName(ctx context.Context) string // DeviceID returns the device number shown in /proc/[pid]/maps. DeviceID() uint64 // InodeID returns the inode number shown in /proc/[pid]/maps. InodeID() uint64 // Msync has the same semantics as fs.FileOperations.Fsync(ctx, // int64(mr.Start), int64(mr.End-1), fs.SyncData). // (fs.FileOperations.Fsync() takes an inclusive end, but mr.End is // exclusive, hence mr.End-1.) It is defined rather than Fsync so that // implementors don't need to depend on the fs package for fs.SyncType. Msync(ctx context.Context, mr MappableRange) error } // MLockMode specifies the memory locking behavior of a memory mapping. type MLockMode int // Note that the ordering of MLockModes is significant; see // mm.MemoryManager.defMLockMode. const ( // MLockNone specifies that a mapping has no memory locking behavior. // // This must be the zero value for MLockMode. MLockNone MLockMode = iota // MLockEager specifies that a mapping is memory-locked, as by mlock() or // similar. Pages in the mapping should be made, and kept, resident in // physical memory as soon as possible. // // As of this writing, MLockEager does not cause memory-locking to be // requested from the host; it only affects the sentry's memory management // behavior. // // MLockEager is analogous to Linux's VM_LOCKED. MLockEager // MLockLazy specifies that a mapping is memory-locked, as by mlock() or // similar. Pages in the mapping should be kept resident in physical memory // once they have been made resident due to e.g. a page fault. // // As of this writing, MLockLazy does not cause memory-locking to be // requested from the host; in fact, it has virtually no effect, except for // interactions between mlocked pages and other syscalls. // // MLockLazy is analogous to Linux's VM_LOCKED | VM_LOCKONFAULT. MLockLazy ) // MMapOpts specifies a request to create a memory mapping. type MMapOpts struct { // Length is the length of the mapping. 
Length uint64

	// MappingIdentity controls the lifetime of Mappable, and provides
	// properties of the mapping shown in /proc/[pid]/maps. If MMapOpts is used
	// to successfully create a memory mapping, a reference is taken on
	// MappingIdentity.
	MappingIdentity MappingIdentity

	// Mappable is the Mappable to be mapped. If Mappable is nil, the mapping
	// is anonymous. If Mappable is not nil, it must remain valid as long as a
	// reference is held on MappingIdentity.
	Mappable Mappable

	// Offset is the offset into Mappable to map. If Mappable is nil, Offset is
	// ignored.
	Offset uint64

	// Addr is the suggested address for the mapping.
	Addr hostarch.Addr

	// Fixed specifies whether this is a fixed mapping (it must be located at
	// Addr).
	Fixed bool

	// Unmap specifies whether existing mappings in the range being mapped may
	// be replaced. If Unmap is true, Fixed must be true.
	Unmap bool

	// If Map32Bit is true, all addresses in the created mapping must fit in a
	// 32-bit integer. (Note that the "end address" of the mapping, i.e. the
	// address of the first byte *after* the mapping, need not fit in a 32-bit
	// integer.) Map32Bit is ignored if Fixed is true.
	Map32Bit bool

	// Perms is the set of permissions to be applied to this mapping.
	Perms hostarch.AccessType

	// MaxPerms limits the set of permissions that may ever apply to this
	// mapping. If Mappable is not nil, all memmap.Translations returned by
	// Mappable.Translate must support all accesses in MaxPerms.
	//
	// Preconditions: MaxPerms should be an effective AccessType, as access
	// cannot be limited beyond effective AccessTypes.
	MaxPerms hostarch.AccessType

	// Private is true if writes to the mapping should be propagated to a copy
	// that is exclusive to the MemoryManager.
	Private bool

	// GrowsDown is true if the mapping should be automatically expanded
	// downward on guard page faults.
	GrowsDown bool

	// Precommit is true if the platform should eagerly commit resources to the
	// mapping (see platform.AddressSpace.MapFile).
	Precommit bool

	// MLockMode specifies the memory locking behavior of the mapping.
	MLockMode MLockMode

	// Hint is the name used for the mapping in /proc/[pid]/maps. If Hint is
	// empty, MappingIdentity.MappedName() will be used instead.
	//
	// TODO(jamieliu): Replace entirely with MappingIdentity?
	Hint string

	// Force means to skip validation checks of Addr and Length. It can be
	// used to create special mappings below mm.layout.MinAddr and
	// mm.layout.MaxAddr. It has to be used with caution.
	//
	// If Force is true, Unmap and Fixed must be true.
	Force bool

	// SentryOwnedContent indicates that the sentry exclusively controls the
	// underlying memory backing the mapping, so the memory content is
	// guaranteed not to be modified outside the sentry's purview.
	SentryOwnedContent bool
}

// File represents a host file that may be mapped into a platform.AddressSpace.
type File interface {
	// All pages in a File are reference-counted.

	// IncRef increments the reference count on all pages in fr.
	//
	// Preconditions:
	// * fr.Start and fr.End must be page-aligned.
	// * fr.Length() > 0.
	// * At least one reference must be held on all pages in fr. (The File
	//   interface does not provide a way to acquire an initial reference;
	//   implementors may define mechanisms for doing so.)
	IncRef(fr FileRange)

	// DecRef decrements the reference count on all pages in fr.
	//
	// Preconditions:
	// * fr.Start and fr.End must be page-aligned.
	// * fr.Length() > 0.
	// * At least one reference must be held on all pages in fr.
DecRef(fr FileRange) // MapInternal returns a mapping of the given file offsets in the invoking // process' address space for reading and writing. // // Note that fr.Start and fr.End need not be page-aligned. // // Preconditions: // * fr.Length() > 0. // * At least one reference must be held on all pages in fr. // // Postconditions: The returned mapping is valid as long as at least one // reference is held on the mapped pages. MapInternal(fr FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) // FD returns the file descriptor represented by the File. // // The only permitted operation on the returned file descriptor is to map // pages from it consistent with the requirements of AddressSpace.MapFile. FD() int } // FileRange represents a range of uint64 offsets into a File. // // type FileRange <generated using go_generics> // String implements fmt.Stringer.String. func (fr FileRange) String() string { return fmt.Sprintf("[%#x, %#x)", fr.Start, fr.End) }
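// The following is an editor's sketch, not part of the original source: it
// illustrates how a caller might honor the File contract above. The helper
// name readFromFile is hypothetical. It assumes the caller already holds a
// reference on every page in fr (IncRef's precondition) and that fr is
// page-aligned where IncRef/DecRef require it.
func readFromFile(f File, fr FileRange, dst []byte) (int, error) {
	// Pin the pages for the lifetime of the internal mapping: MapInternal's
	// postcondition only guarantees validity while a reference is held.
	f.IncRef(fr)
	defer f.DecRef(fr)

	bs, err := f.MapInternal(fr, hostarch.Read)
	if err != nil {
		return 0, err
	}
	// Copy out of the mapped pages; the copy is bounded by the shorter of
	// the two block sequences.
	n, err := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), bs)
	return int(n), err
}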
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package gofer

import (
	"errors"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fd"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/p9"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/device"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/fs/fdpipe"
	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.dev/gvisor/pkg/sentry/fs/host"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
)

// inodeOperations implements fs.InodeOperations.
//
// +stateify savable
type inodeOperations struct {
	fsutil.InodeNotVirtual `state:"nosave"`

	// fileState implements fs.CachedFileObject. It exists
	// to break a circular load dependency between inodeOperations
	// and cachingInodeOps (below).
	fileState *inodeFileState `state:"wait"`

	// cachingInodeOps implements memmap.Mappable for inodeOperations.
	cachingInodeOps *fsutil.CachingInodeOperations

	// readdirMu protects readdirCache and concurrent Readdirs.
	readdirMu sync.Mutex `state:"nosave"`

	// readdirCache is a cache of readdir results in the form of
	// a fs.SortedDentryMap.
	//
	// Starts out as nil, and is initialized under readdirMu lazily;
	// invalidating the cache means setting it to nil.
	readdirCache *fs.SortedDentryMap `state:"nosave"`
}

// inodeFileState implements fs.CachedFileObject and otherwise fully
// encapsulates state that needs to be manually loaded on restore for
// this file object.
//
// This unfortunate structure exists because fs.CachingInodeOperations
// defines afterLoad and therefore cannot be lazily loaded (to break a
// circular load dependency between it and inodeOperations). Even with
// lazy loading, this approach defines the dependencies between objects
// and the expected load behavior more concretely.
//
// +stateify savable
type inodeFileState struct {
	// s is common file system state for Gofers.
	s *session `state:"wait"`

	// MultiDeviceKey consists of:
	//
	// * Device: file system device from a specific gofer.
	// * SecondaryDevice: unique identifier of the attach point.
	// * Inode: the inode of this resource, unique per Device.
	//
	// These fields combined enable consistent hashing of virtual inodes
	// on goferDevice.
	key device.MultiDeviceKey `state:"nosave"`

	// file is the p9 file that contains a single unopened fid.
	file contextFile `state:"nosave"`

	// sattr caches the stable attributes.
	sattr fs.StableAttr `state:"wait"`

	// handlesMu protects the below fields.
	handlesMu sync.RWMutex `state:"nosave"`

	// If readHandles is non-nil, it holds handles that are either read-only or
	// read/write. If writeHandles is non-nil, it holds write-only handles if
	// writeHandlesRW is false, and read/write handles if writeHandlesRW is
	// true.
	//
	// Once readHandles becomes non-nil, it can't be changed until
	// inodeFileState.Release()*, because of a defect in the
	// fsutil.CachedFileObject interface: there's no way for the caller of
	// fsutil.CachedFileObject.FD() to keep the returned FD open, so if we
	// racily replace readHandles after inodeFileState.FD() has returned
	// readHandles.Host.FD(), fsutil.CachingInodeOperations may use a closed
	// FD. writeHandles can be changed if writeHandlesRW is false, since
	// inodeFileState.FD() can't return a write-only FD, but can't be changed
	// if writeHandlesRW is true for the same reason.
	//
	// * There is one notable exception in recreateReadHandles(), where it dups
	// the FD and invalidates the page cache.
	readHandles *handles `state:"nosave"`

	writeHandles *handles `state:"nosave"`

	writeHandlesRW bool `state:"nosave"`

	// loading is acquired when the inodeFileState begins an asynchronous
	// load. It is released when the load completes. Callers that require all
	// state to be available should call waitForLoad() to ensure that.
	loading sync.CrossGoroutineMutex `state:".(struct{})"`

	// savedUAttr is only allocated during S/R. It points to the save-time
	// unstable attributes and is used to validate restore-time ones.
	//
	// Note that these unstable attributes are only used to detect cross-S/R
	// external file system metadata changes. They may differ from the
	// cached unstable attributes in cachingInodeOps, as that might differ
	// from the external file system attributes if there had been WriteOut
	// failures. S/R is transparent to Sentry and the latter will continue
	// using its cached values after restore.
	savedUAttr *fs.UnstableAttr

	// hostMappable is created when using 'cacheRemoteRevalidating' to map pages
	// directly from host.
	hostMappable *fsutil.HostMappable
}

// Release releases file handles.
func (i *inodeFileState) Release(ctx context.Context) {
	i.file.close(ctx)
	if i.readHandles != nil {
		i.readHandles.DecRef()
	}
	if i.writeHandles != nil {
		i.writeHandles.DecRef()
	}
}

func (i *inodeFileState) canShareHandles() bool {
	// Only share handles for regular files, since for other file types,
	// distinct handles may have special semantics even if they represent the
	// same file. Disable handle sharing for cache policy cacheNone, since this
	// is legacy behavior.
	return fs.IsFile(i.sattr) && i.s.cachePolicy != cacheNone
}

// Preconditions: i.handlesMu must be locked for writing.
func (i *inodeFileState) setSharedHandlesLocked(flags fs.FileFlags, h *handles) {
	if flags.Read && i.readHandles == nil {
		h.IncRef()
		i.readHandles = h
	}
	if flags.Write {
		if i.writeHandles == nil {
			h.IncRef()
			i.writeHandles = h
			i.writeHandlesRW = flags.Read
		} else if !i.writeHandlesRW && flags.Read {
			// Upgrade i.writeHandles.
			i.writeHandles.DecRef()
			h.IncRef()
			i.writeHandles = h
			i.writeHandlesRW = flags.Read
		}
	}
}

// getHandles returns a set of handles for a new file using i opened with the
// given flags.
func (i *inodeFileState) getHandles(ctx context.Context, flags fs.FileFlags, cache *fsutil.CachingInodeOperations) (*handles, error) {
	if !i.canShareHandles() {
		return newHandles(ctx, i.s.client, i.file, flags)
	}

	i.handlesMu.Lock()
	h, invalidate, err := i.getHandlesLocked(ctx, flags)
	i.handlesMu.Unlock()

	if invalidate {
		cache.NotifyChangeFD()
		if i.hostMappable != nil {
			i.hostMappable.NotifyChangeFD()
		}
	}

	return h, err
}

// getHandlesLocked returns a pointer to cached handles and a boolean
// indicating whether the previously-open read handle was recreated. Host
// mappings must be invalidated if so.
func (i *inodeFileState) getHandlesLocked(ctx context.Context, flags fs.FileFlags) (*handles, bool, error) {
	// Check if we are able to use cached handles.
	if flags.Truncate && p9.VersionSupportsOpenTruncateFlag(i.s.client.Version()) {
		// If we are truncating (and the gofer supports it), then we
		// always need a new handle. Don't return one from the cache.
	} else if flags.Write {
		if i.writeHandles != nil && (i.writeHandlesRW || !flags.Read) {
			// File is opened for writing, and we have cached write
			// handles that we can use.
			i.writeHandles.IncRef()
			return i.writeHandles, false, nil
		}
	} else if i.readHandles != nil {
		// File is opened for reading and we have cached handles.
		i.readHandles.IncRef()
		return i.readHandles, false, nil
	}

	// Get new handles and cache them for future sharing.
	h, err := newHandles(ctx, i.s.client, i.file, flags)
	if err != nil {
		return nil, false, err
	}

	// Read handles invalidation is needed if:
	// - Mount option 'overlayfs_stale_read' is set
	// - Read handle is open: nothing to invalidate otherwise
	// - Write handle is not open: file was not open for write and is being
	//   opened for write now (will trigger copy up in overlayfs).
	invalidate := false
	if i.s.overlayfsStaleRead && i.readHandles != nil && i.writeHandles == nil && flags.Write {
		if err := i.recreateReadHandles(ctx, h, flags); err != nil {
			return nil, false, err
		}
		invalidate = true
	}
	i.setSharedHandlesLocked(flags, h)
	return h, invalidate, nil
}

func (i *inodeFileState) recreateReadHandles(ctx context.Context, writer *handles, flags fs.FileFlags) error {
	h := writer
	if !flags.Read {
		// Writer can't be used for read, must create a new handle.
		var err error
		h, err = newHandles(ctx, i.s.client, i.file, fs.FileFlags{Read: true})
		if err != nil {
			return err
		}
		defer h.DecRef()
	}

	if i.readHandles.Host == nil {
		// If the current readHandles doesn't have a host FD, it can simply be
		// replaced.
		i.readHandles.DecRef()

		h.IncRef()
		i.readHandles = h
		return nil
	}

	if h.Host == nil {
		// The current read handle has a host FD and can't be replaced with one
		// that doesn't, because that would break the
		// fsutil.CachedFileObject.FD() contract.
		log.Warningf("Read handle can't be invalidated, reads may return stale data")
		return nil
	}

	// Due to a defect in the fsutil.CachedFileObject interface,
	// readHandles.Host.FD() may be used outside locks, making it impossible to
	// reliably close it. To work around this, we dup the new FD onto the old
	// one, so operations on the old FD will see the new data. Then, make the
	// new handle take ownership of the old FD and mark the old readHandles to
	// not close the FD when done.
	if err := unix.Dup3(h.Host.FD(), i.readHandles.Host.FD(), unix.O_CLOEXEC); err != nil {
		return err
	}

	h.Host.Close()
	h.Host = fd.New(i.readHandles.Host.FD())
	i.readHandles.isHostBorrowed = true
	i.readHandles.DecRef()

	h.IncRef()
	i.readHandles = h
	return nil
}

// ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt.
func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
	i.handlesMu.RLock()
	n, err := i.readHandles.readWriterAt(ctx, int64(offset)).ReadToBlocks(dsts)
	i.handlesMu.RUnlock()
	return n, err
}

// WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt.
func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
	i.handlesMu.RLock()
	n, err := i.writeHandles.readWriterAt(ctx, int64(offset)).WriteFromBlocks(srcs)
	i.handlesMu.RUnlock()
	return n, err
}

// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes.
func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr, forceSetTimestamps bool) error {
	if i.skipSetAttr(mask, forceSetTimestamps) {
		return nil
	}
	as, ans := attr.AccessTime.Unix()
	ms, mns := attr.ModificationTime.Unix()
	// An update of status change time is implied by mask.AccessTime
	// or mask.ModificationTime. Updating status change time to a
	// time earlier than the system time is not possible.
	return i.file.setAttr(
		ctx,
		p9.SetAttrMask{
			Permissions:        mask.Perms,
			Size:               mask.Size,
			UID:                mask.UID,
			GID:                mask.GID,
			ATime:              mask.AccessTime,
			ATimeNotSystemTime: true,
			MTime:              mask.ModificationTime,
			MTimeNotSystemTime: true,
		},
		p9.SetAttr{
			Permissions:      p9.FileMode(attr.Perms.LinuxMode()),
			UID:              p9.UID(attr.Owner.UID),
			GID:              p9.GID(attr.Owner.GID),
			Size:             uint64(attr.Size),
			ATimeSeconds:     uint64(as),
			ATimeNanoSeconds: uint64(ans),
			MTimeSeconds:     uint64(ms),
			MTimeNanoSeconds: uint64(mns),
		})
}

// skipSetAttr checks if an attribute change can be skipped. It can be skipped
// when:
// - Mask is empty
// - Mask contains only attributes that cannot be set in the gofer
// - forceSetTimestamps is false, mask contains only atime and/or mtime,
//   and a host FD exists
//
// Updates to atime and mtime can be skipped because the cached value will be
// "close enough" to the host value, given that the operation went directly to
// the host FD. Skipping atime updates is particularly important to reduce the
// number of operations sent to the Gofer for readonly files.
func (i *inodeFileState) skipSetAttr(mask fs.AttrMask, forceSetTimestamps bool) bool {
	// First remove attributes that cannot be updated.
	cpy := mask
	cpy.Type = false
	cpy.DeviceID = false
	cpy.InodeID = false
	cpy.BlockSize = false
	cpy.Usage = false
	cpy.Links = false
	if cpy.Empty() {
		return true
	}

	// Then check if more than just atime and mtime is being set.
	cpy.AccessTime = false
	cpy.ModificationTime = false
	if !cpy.Empty() {
		return false
	}

	// If forceSetTimestamps was passed, then we cannot skip.
	if forceSetTimestamps {
		return false
	}

	// Skip if we have a host FD.
	i.handlesMu.RLock()
	defer i.handlesMu.RUnlock()
	return (i.readHandles != nil && i.readHandles.Host != nil) ||
		(i.writeHandles != nil && i.writeHandles.Host != nil)
}

// Sync implements fsutil.CachedFileObject.Sync.
func (i *inodeFileState) Sync(ctx context.Context) error {
	i.handlesMu.RLock()
	defer i.handlesMu.RUnlock()
	if i.writeHandles == nil {
		return nil
	}
	return i.writeHandles.File.fsync(ctx)
}

// FD implements fsutil.CachedFileObject.FD.
func (i *inodeFileState) FD() int {
	i.handlesMu.RLock()
	defer i.handlesMu.RUnlock()
	if i.writeHandlesRW && i.writeHandles != nil && i.writeHandles.Host != nil {
		return int(i.writeHandles.Host.FD())
	}
	if i.readHandles != nil && i.readHandles.Host != nil {
		return int(i.readHandles.Host.FD())
	}
	return -1
}

// waitForLoad makes sure any restore-issued loading is done.
func (i *inodeFileState) waitForLoad() {
	// This is not a no-op. The loading mutex is held upon restore until
	// all loading actions are done.
	i.loading.Lock()
	i.loading.Unlock()
}

func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, error) {
	_, valid, pattr, err := getattr(ctx, i.file)
	if err != nil {
		return fs.UnstableAttr{}, err
	}
	return unstable(ctx, valid, pattr, i.s.mounter, i.s.client), nil
}

func (i *inodeFileState) Allocate(ctx context.Context, offset, length int64) error {
	i.handlesMu.RLock()
	defer i.handlesMu.RUnlock()

	// No options are supported for now.
	mode := p9.AllocateMode{}
	return i.writeHandles.File.allocate(ctx, mode, uint64(offset), uint64(length))
}

// session extracts the gofer's session from the MountSource.
func (i *inodeOperations) session() *session {
	return i.fileState.s
}

// Release implements fs.InodeOperations.Release.
func (i *inodeOperations) Release(ctx context.Context) {
	i.cachingInodeOps.Release()

	// Releasing the fileState may make RPCs to the gofer. There is
	// no need to wait for those to return, so we can do this
	// asynchronously.
// // We use AsyncWithContext to avoid needing to allocate an extra // anonymous function on the heap. We must use background context // because the async work cannot happen on the task context. fs.AsyncWithContext(context.Background(), i.fileState.Release) } // Mappable implements fs.InodeOperations.Mappable. func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable { if i.session().cachePolicy.useCachingInodeOps(inode) { return i.cachingInodeOps } // This check is necessary because it's returning an interface type. if i.fileState.hostMappable != nil { return i.fileState.hostMappable } return nil } // UnstableAttr implements fs.InodeOperations.UnstableAttr. func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) { if i.session().cachePolicy.cacheUAttrs(inode) { return i.cachingInodeOps.UnstableAttr(ctx, inode) } return i.fileState.unstableAttr(ctx) } // Check implements fs.InodeOperations.Check. func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool { return fs.ContextCanAccessFile(ctx, inode, p) } // GetFile implements fs.InodeOperations.GetFile. func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { switch d.Inode.StableAttr.Type { case fs.Socket: if i.session().overrides != nil { return nil, linuxerr.ENXIO } return i.getFileSocket(ctx, d, flags) case fs.Pipe: return i.getFilePipe(ctx, d, flags) default: return i.getFileDefault(ctx, d, flags) } } func (i *inodeOperations) getFileSocket(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { f, err := i.fileState.file.connect(ctx, p9.AnonymousSocket) if err != nil { return nil, unix.EIO } fsf, err := host.NewSocketWithDirent(ctx, d, f, flags) if err != nil { f.Close() return nil, err } return fsf, nil } func (i *inodeOperations) getFilePipe(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { // Try to open as a host pipe; if that doesn't work, handle it normally. pipeOps, err := fdpipe.Open(ctx, i, flags) if err == errNotHostFile { return i.getFileDefault(ctx, d, flags) } if err != nil { return nil, err } return fs.NewFile(ctx, d, flags, pipeOps), nil } // errNotHostFile indicates that the file is not a host file. var errNotHostFile = errors.New("not a host file") // NonBlockingOpen implements fdpipe.NonBlockingOpener for opening host named pipes. func (i *inodeOperations) NonBlockingOpen(ctx context.Context, p fs.PermMask) (*fd.FD, error) { i.fileState.waitForLoad() // Get a cloned fid which we will open. _, newFile, err := i.fileState.file.walk(ctx, nil) if err != nil { log.Warningf("Open Walk failed: %v", err) return nil, err } defer newFile.close(ctx) flags, err := openFlagsFromPerms(p) if err != nil { log.Warningf("Open flags %s parsing failed: %v", p, err) return nil, err } hostFile, _, _, err := newFile.open(ctx, flags) // If the host file returned is nil and the error is nil, // then this was never a host file to begin with, and should // be treated like a remote file. if hostFile == nil && err == nil { return nil, errNotHostFile } return hostFile, err } func (i *inodeOperations) getFileDefault(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) { h, err := i.fileState.getHandles(ctx, flags, i.cachingInodeOps) if err != nil { return nil, err } return NewFile(ctx, d, d.BaseName(), flags, i, h), nil } // SetPermissions implements fs.InodeOperations.SetPermissions. 
func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool { if i.session().cachePolicy.cacheUAttrs(inode) { return i.cachingInodeOps.SetPermissions(ctx, inode, p) } mask := p9.SetAttrMask{Permissions: true} pattr := p9.SetAttr{Permissions: p9.FileMode(p.LinuxMode())} // Execute the chmod. return i.fileState.file.setAttr(ctx, mask, pattr) == nil } // SetOwner implements fs.InodeOperations.SetOwner. func (i *inodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error { // Save the roundtrip. if !owner.UID.Ok() && !owner.GID.Ok() { return nil } if i.session().cachePolicy.cacheUAttrs(inode) { return i.cachingInodeOps.SetOwner(ctx, inode, owner) } var mask p9.SetAttrMask var attr p9.SetAttr if owner.UID.Ok() { mask.UID = true attr.UID = p9.UID(owner.UID) } if owner.GID.Ok() { mask.GID = true attr.GID = p9.GID(owner.GID) } return i.fileState.file.setAttr(ctx, mask, attr) } // SetTimestamps implements fs.InodeOperations.SetTimestamps. func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error { if i.session().cachePolicy.cacheUAttrs(inode) { return i.cachingInodeOps.SetTimestamps(ctx, inode, ts) } return utimes(ctx, i.fileState.file, ts) } // Truncate implements fs.InodeOperations.Truncate. func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length int64) error { // This can only be called for files anyway. if i.session().cachePolicy.useCachingInodeOps(inode) { return i.cachingInodeOps.Truncate(ctx, inode, length) } uattr, err := i.fileState.unstableAttr(ctx) if err != nil { return err } if i.session().cachePolicy == cacheRemoteRevalidating { return i.fileState.hostMappable.Truncate(ctx, length, uattr) } mask := p9.SetAttrMask{Size: true} attr := p9.SetAttr{Size: uint64(length)} if uattr.Perms.HasSetUIDOrGID() { mask.Permissions = true uattr.Perms.DropSetUIDAndMaybeGID() attr.Permissions = p9.FileMode(uattr.Perms.LinuxMode()) } return i.fileState.file.setAttr(ctx, mask, attr) } // GetXattr implements fs.InodeOperations.GetXattr. func (i *inodeOperations) GetXattr(ctx context.Context, _ *fs.Inode, name string, size uint64) (string, error) { return i.fileState.file.getXattr(ctx, name, size) } // SetXattr implements fs.InodeOperations.SetXattr. func (i *inodeOperations) SetXattr(ctx context.Context, _ *fs.Inode, name string, value string, flags uint32) error { return i.fileState.file.setXattr(ctx, name, value, flags) } // ListXattr implements fs.InodeOperations.ListXattr. func (i *inodeOperations) ListXattr(ctx context.Context, _ *fs.Inode, size uint64) (map[string]struct{}, error) { return i.fileState.file.listXattr(ctx, size) } // RemoveXattr implements fs.InodeOperations.RemoveXattr. func (i *inodeOperations) RemoveXattr(ctx context.Context, _ *fs.Inode, name string) error { return i.fileState.file.removeXattr(ctx, name) } // Allocate implements fs.InodeOperations.Allocate. func (i *inodeOperations) Allocate(ctx context.Context, inode *fs.Inode, offset, length int64) error { // This can only be called for files anyway. if i.session().cachePolicy.useCachingInodeOps(inode) { return i.cachingInodeOps.Allocate(ctx, offset, length) } if i.session().cachePolicy == cacheRemoteRevalidating { return i.fileState.hostMappable.Allocate(ctx, offset, length) } // No options are supported for now. mode := p9.AllocateMode{} return i.fileState.file.allocate(ctx, mode, uint64(offset), uint64(length)) } // WriteOut implements fs.InodeOperations.WriteOut. 
func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
	if inode.MountSource.Flags.ReadOnly || !i.session().cachePolicy.cacheUAttrs(inode) {
		return nil
	}

	return i.cachingInodeOps.WriteOut(ctx, inode)
}

// Readlink implements fs.InodeOperations.Readlink.
func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
	if !fs.IsSymlink(inode.StableAttr) {
		return "", unix.ENOLINK
	}
	return i.fileState.file.readlink(ctx)
}

// Getlink implements fs.InodeOperations.Getlink.
func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) {
	if !fs.IsSymlink(i.fileState.sattr) {
		return nil, linuxerr.ENOLINK
	}
	return nil, fs.ErrResolveViaReadlink
}

// StatFS makes a StatFS request.
func (i *inodeOperations) StatFS(ctx context.Context) (fs.Info, error) {
	fsstat, err := i.fileState.file.statFS(ctx)
	if err != nil {
		return fs.Info{}, err
	}

	info := fs.Info{
		// This is primarily for distinguishing a gofer file system in
		// tests. Testing is important, so instead of defining
		// something completely random, use a standard value.
		Type: linux.V9FS_MAGIC,

		TotalBlocks: fsstat.Blocks,
		FreeBlocks:  fsstat.BlocksFree,
		TotalFiles:  fsstat.Files,
		FreeFiles:   fsstat.FilesFree,
	}

	// If blocks available is non-zero, prefer that.
	if fsstat.BlocksAvailable != 0 {
		info.FreeBlocks = fsstat.BlocksAvailable
	}

	return info, nil
}

func (i *inodeOperations) configureMMap(file *fs.File, opts *memmap.MMapOpts) error {
	if i.session().cachePolicy.useCachingInodeOps(file.Dirent.Inode) {
		return fsutil.GenericConfigureMMap(file, i.cachingInodeOps, opts)
	}
	if i.fileState.hostMappable != nil {
		return fsutil.GenericConfigureMMap(file, i.fileState.hostMappable, opts)
	}
	return linuxerr.ENODEV
}

func init() {
	syserror.AddErrorUnwrapper(func(err error) (unix.Errno, bool) {
		if _, ok := err.(p9.ErrSocket); ok {
			// Treat as an I/O error.
			return unix.EIO, true
		}
		return 0, false
	})
}

// AddLink implements InodeOperations.AddLink, but is currently a noop.
func (*inodeOperations) AddLink() {}

// DropLink implements InodeOperations.DropLink, but is currently a noop.
func (*inodeOperations) DropLink() {}

// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange.
func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {}
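// Editor's sketch (not part of the original source): the handle-caching
// rules implemented by getHandlesLocked above can be restated as a pure
// decision function over the cached state. The type cachedHandleState and
// the function canUseCachedHandles are hypothetical, for illustration only;
// the real code also creates, caches, and possibly invalidates handles.
type cachedHandleState struct {
	haveRead   bool // i.readHandles != nil
	haveWrite  bool // i.writeHandles != nil
	writeIsRW  bool // i.writeHandlesRW
	mustReopen bool // flags.Truncate && gofer supports truncate-on-open
}

// canUseCachedHandles reports whether an open with the given read/write
// flags could be served from the cache, mirroring the branches in
// getHandlesLocked.
func canUseCachedHandles(s cachedHandleState, read, write bool) bool {
	if s.mustReopen {
		// Truncating opens always get a fresh handle.
		return false
	}
	if write {
		// A cached write handle works if it is read/write, or if the new
		// open doesn't also need to read.
		return s.haveWrite && (s.writeIsRW || !read)
	}
	return s.haveRead
}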
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2 import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) const ( eventMaskRead = waiter.EventRdNorm | waiter.EventIn | waiter.EventHUp | waiter.EventErr eventMaskWrite = waiter.EventWrNorm | waiter.EventOut | waiter.EventHUp | waiter.EventErr ) // Read implements Linux syscall read(2). func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the size is legitimate. si := int(size) if si < 0 { return 0, nil, linuxerr.EINVAL } // Get the destination of the read. dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := read(t, file, dst, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "read", file) } // Readv implements Linux syscall readv(2). func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Get the destination of the read. dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := read(t, file, dst, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "readv", file) } func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { n, err := file.Read(t, dst, opts) if err != syserror.ErrWouldBlock { return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { return n, err } // Register for notifications. w, ch := waiter.NewChannelEntry(nil) file.EventRegister(&w, eventMaskRead) total := n for { // Shorten dst to reflect bytes previously read. dst = dst.DropFirst(int(n)) // Issue the request and break out if it completes with anything other than // "would block". n, err = file.Read(t, dst, opts) total += n if err != syserror.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = syserror.ErrWouldBlock } break } } file.EventUnregister(&w) return total, err } // Pread64 implements Linux syscall pread64(2). func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() offset := args[3].Int64() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { return 0, nil, linuxerr.EINVAL } // Check that the size is legitimate. 
si := int(size) if si < 0 { return 0, nil, linuxerr.EINVAL } // Get the destination of the read. dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pread64", file) } // Preadv implements Linux syscall preadv(2). func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < 0 { return 0, nil, linuxerr.EINVAL } // Get the destination of the read. dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) t.IOUsage().AccountReadSyscall(n) return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv", file) } // Preadv2 implements Linux syscall preadv2(2). func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // While the glibc signature is // preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) // the actual syscall // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142) // splits the offset argument into a high/low value for compatibility with // 32-bit architectures. The flags argument is the 6th argument (index 5). fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) offset := args[3].Int64() flags := args[5].Int() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate. if offset < -1 { return 0, nil, linuxerr.EINVAL } // Get the destination of the read. dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } opts := vfs.ReadOptions{ Flags: uint32(flags), } var n int64 if offset == -1 { n, err = read(t, file, dst, opts) } else { n, err = pread(t, file, dst, offset, opts) } t.IOUsage().AccountReadSyscall(n) return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file) } func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { n, err := file.PRead(t, dst, offset, opts) if err != syserror.ErrWouldBlock { return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { return n, err } // Register for notifications. w, ch := waiter.NewChannelEntry(nil) file.EventRegister(&w, eventMaskRead) total := n for { // Shorten dst to reflect bytes previously read. dst = dst.DropFirst(int(n)) // Issue the request and break out if it completes with anything other than // "would block". n, err = file.PRead(t, dst, offset+total, opts) total += n if err != syserror.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = syserror.ErrWouldBlock } break } } file.EventUnregister(&w) return total, err } // Write implements Linux syscall write(2). 
func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the size is legitimate. si := int(size) if si < 0 { return 0, nil, linuxerr.EINVAL } // Get the source of the write. src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := write(t, file, src, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "write", file) } // Writev implements Linux syscall writev(2). func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() iovcnt := int(args[2].Int()) file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Get the source of the write. src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := write(t, file, src, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "writev", file) } func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { n, err := file.Write(t, src, opts) if err != syserror.ErrWouldBlock { return n, err } allowBlock, deadline, hasDeadline := blockPolicy(t, file) if !allowBlock { return n, err } // Register for notifications. w, ch := waiter.NewChannelEntry(nil) file.EventRegister(&w, eventMaskWrite) total := n for { // Shorten src to reflect bytes previously written. src = src.DropFirst(int(n)) // Issue the request and break out if it completes with anything other than // "would block". n, err = file.Write(t, src, opts) total += n if err != syserror.ErrWouldBlock { break } // Wait for a notification that we should retry. if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = syserror.ErrWouldBlock } break } } file.EventUnregister(&w) return total, err } // Pwrite64 implements Linux syscall pwrite64(2). func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() size := args[2].SizeT() offset := args[3].Int64() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { return 0, nil, linuxerr.EINVAL } // Check that the size is legitimate. si := int(size) if si < 0 { return 0, nil, linuxerr.EINVAL } // Get the source of the write. src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, nil, err } n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) t.IOUsage().AccountWriteSyscall(n) return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwrite64", file) } // Pwritev implements Linux syscall pwritev(2). 
func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	addr := args[1].Pointer()
	iovcnt := int(args[2].Int())
	offset := args[3].Int64()

	file := t.GetFileVFS2(fd)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	// Check that the offset is legitimate.
	if offset < 0 {
		return 0, nil, linuxerr.EINVAL
	}

	// Get the source of the write.
	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
		AddressSpaceActive: true,
	})
	if err != nil {
		return 0, nil, err
	}

	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
	t.IOUsage().AccountWriteSyscall(n)
	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev", file)
}

// Pwritev2 implements Linux syscall pwritev2(2).
func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// While the glibc signature is
	// pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
	// the actual syscall
	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162)
	// splits the offset argument into a high/low value for compatibility with
	// 32-bit architectures. The flags argument is the 6th argument (index 5).
	fd := args[0].Int()
	addr := args[1].Pointer()
	iovcnt := int(args[2].Int())
	offset := args[3].Int64()
	flags := args[5].Int()

	file := t.GetFileVFS2(fd)
	if file == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer file.DecRef(t)

	// Check that the offset is legitimate.
	if offset < -1 {
		return 0, nil, linuxerr.EINVAL
	}

	// Get the source of the write.
	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
		AddressSpaceActive: true,
	})
	if err != nil {
		return 0, nil, err
	}

	opts := vfs.WriteOptions{
		Flags: uint32(flags),
	}
	var n int64
	if offset == -1 {
		n, err = write(t, file, src, opts)
	} else {
		n, err = pwrite(t, file, src, offset, opts)
	}
	t.IOUsage().AccountWriteSyscall(n)
	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file)
}

func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	n, err := file.PWrite(t, src, offset, opts)
	if err != syserror.ErrWouldBlock {
		return n, err
	}

	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
	if !allowBlock {
		return n, err
	}

	// Register for notifications.
	w, ch := waiter.NewChannelEntry(nil)
	file.EventRegister(&w, eventMaskWrite)

	total := n
	for {
		// Shorten src to reflect bytes previously written.
		src = src.DropFirst(int(n))

		// Issue the request and break out if it completes with anything
		// other than "would block".
		n, err = file.PWrite(t, src, offset+total, opts)
		total += n
		if err != syserror.ErrWouldBlock {
			break
		}

		// Wait for a notification that we should retry.
		if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
				err = syserror.ErrWouldBlock
			}
			break
		}
	}
	file.EventUnregister(&w)

	return total, err
}

func blockPolicy(t *kernel.Task, file *vfs.FileDescription) (allowBlock bool, deadline ktime.Time, hasDeadline bool) {
	if file.StatusFlags()&linux.O_NONBLOCK != 0 {
		return false, ktime.Time{}, false
	}
	// Sockets support read/write timeouts.
	if s, ok := file.Impl().(socket.SocketVFS2); ok {
		dl := s.RecvTimeout()
		if dl < 0 {
			return false, ktime.Time{}, false
		}
		if dl > 0 {
			return true, t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond), true
		}
	}
	return true, ktime.Time{}, false
}

// Lseek implements Linux syscall lseek(2).
func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() offset := args[1].Int64() whence := args[2].Int() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) newoff, err := file.Seek(t, offset, whence) return uintptr(newoff), nil, err } // Readahead implements readahead(2). func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() offset := args[1].Int64() size := args[2].SizeT() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Check that the file is readable. if !file.IsReadable() { return 0, nil, linuxerr.EBADF } // Check that the size is valid. if int(size) < 0 { return 0, nil, linuxerr.EINVAL } // Check that the offset is legitimate and does not overflow. if offset < 0 || offset+int64(size) < 0 { return 0, nil, linuxerr.EINVAL } // Return EINVAL; if the underlying file type does not support readahead, // then Linux will return EINVAL to indicate as much. In the future, we // may extend this function to actually support readahead hints. return 0, nil, linuxerr.EINVAL }
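// Editor's sketch (not part of the original source): read, pread, write, and
// pwrite above all share the same block-and-retry shape. The hypothetical
// helper below restates that shape once; the do callback is expected to drop
// already-transferred bytes from its buffer on each call, as the real loops
// do with dst.DropFirst/src.DropFirst.
func retryIO(t *kernel.Task, file *vfs.FileDescription, mask waiter.EventMask, do func() (int64, error)) (int64, error) {
	total, err := do()
	if err != syserror.ErrWouldBlock {
		return total, err
	}
	allowBlock, deadline, hasDeadline := blockPolicy(t, file)
	if !allowBlock {
		return total, err
	}
	// Register for readiness notifications before retrying, so a wakeup
	// between a failed attempt and the block cannot be lost.
	w, ch := waiter.NewChannelEntry(nil)
	file.EventRegister(&w, mask)
	defer file.EventUnregister(&w)
	for {
		var n int64
		n, err = do()
		total += n
		if err != syserror.ErrWouldBlock {
			break
		}
		if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil {
			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
				// Socket timeouts surface as "would block", matching Linux.
				err = syserror.ErrWouldBlock
			}
			break
		}
	}
	return total, err
}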
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
)

// TaskConfig defines the configuration of a new Task (see below).
type TaskConfig struct {
	// Kernel is the owning Kernel.
	Kernel *Kernel

	// Parent is the new task's parent. Parent may be nil.
	Parent *Task

	// If InheritParent is not nil, use InheritParent's parent as the new
	// task's parent.
	InheritParent *Task

	// ThreadGroup is the ThreadGroup the new task belongs to.
	ThreadGroup *ThreadGroup

	// SignalMask is the new task's initial signal mask.
	SignalMask linux.SignalSet

	// TaskImage is the TaskImage of the new task. Ownership of the
	// TaskImage is transferred to TaskSet.NewTask, whether or not it
	// succeeds.
	TaskImage *TaskImage

	// FSContext is the FSContext of the new task. A reference must be held on
	// FSContext, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FSContext *FSContext

	// FDTable is the FDTable of the new task. A reference must be held on
	// FDTable, which is transferred to TaskSet.NewTask whether or not it
	// succeeds.
	FDTable *FDTable

	// Credentials is the Credentials of the new task.
	Credentials *auth.Credentials

	// Niceness is the niceness of the new task.
	Niceness int

	// NetworkNamespace is the network namespace to be used for the new task.
	NetworkNamespace *inet.Namespace

	// AllowedCPUMask contains the cpus that this task can run on.
	AllowedCPUMask sched.CPUSet

	// UTSNamespace is the UTSNamespace of the new task.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the IPCNamespace of the new task.
	IPCNamespace *IPCNamespace

	// AbstractSocketNamespace is the AbstractSocketNamespace of the new task.
	AbstractSocketNamespace *AbstractSocketNamespace

	// MountNamespaceVFS2 is the MountNamespace of the new task.
	MountNamespaceVFS2 *vfs.MountNamespace

	// RSeqAddr is a pointer to the userspace linux.RSeq structure.
	RSeqAddr hostarch.Addr

	// RSeqSignature is the signature that the rseq abort IP must be signed
	// with.
	RSeqSignature uint32

	// ContainerID is the container the new task belongs to.
	ContainerID string
}

// NewTask creates a new task defined by cfg.
//
// NewTask does not start the returned task; the caller must call Task.Start.
//
// If successful, NewTask transfers references held by cfg to the new task.
// Otherwise, NewTask releases them.
func (ts *TaskSet) NewTask(ctx context.Context, cfg *TaskConfig) (*Task, error) {
	t, err := ts.newTask(cfg)
	if err != nil {
		cfg.TaskImage.release()
		cfg.FSContext.DecRef(ctx)
		cfg.FDTable.DecRef(ctx)
		cfg.IPCNamespace.DecRef(ctx)
		if cfg.MountNamespaceVFS2 != nil {
			cfg.MountNamespaceVFS2.DecRef(ctx)
		}
		return nil, err
	}
	return t, nil
}

// newTask is a helper for TaskSet.NewTask that only takes ownership of parts
// of cfg if it succeeds.
func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
	tg := cfg.ThreadGroup
	image := cfg.TaskImage
	t := &Task{
		taskNode: taskNode{
			tg:       tg,
			parent:   cfg.Parent,
			children: make(map[*Task]struct{}),
		},
		runState:           (*runApp)(nil),
		interruptChan:      make(chan struct{}, 1),
		signalMask:         cfg.SignalMask,
		signalStack:        linux.SignalStack{Flags: linux.SS_DISABLE},
		image:              *image,
		fsContext:          cfg.FSContext,
		fdTable:            cfg.FDTable,
		p:                  cfg.Kernel.Platform.NewContext(),
		k:                  cfg.Kernel,
		ptraceTracees:      make(map[*Task]struct{}),
		allowedCPUMask:     cfg.AllowedCPUMask.Copy(),
		ioUsage:            &usage.IO{},
		niceness:           cfg.Niceness,
		netns:              cfg.NetworkNamespace,
		utsns:              cfg.UTSNamespace,
		ipcns:              cfg.IPCNamespace,
		abstractSockets:    cfg.AbstractSocketNamespace,
		mountNamespaceVFS2: cfg.MountNamespaceVFS2,
		rseqCPU:            -1,
		rseqAddr:           cfg.RSeqAddr,
		rseqSignature:      cfg.RSeqSignature,
		futexWaiter:        futex.NewWaiter(),
		containerID:        cfg.ContainerID,
		cgroups:            make(map[Cgroup]struct{}),
	}
	t.creds.Store(cfg.Credentials)
	t.endStopCond.L = &t.tg.signalHandlers.mu
	t.ptraceTracer.Store((*Task)(nil))
	// We don't construct t.blockingTimer until Task.run(); see that function
	// for justification.

	// Make the new task (and possibly thread group) visible to the rest of
	// the system atomically.
	ts.mu.Lock()
	defer ts.mu.Unlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	if tg.exiting || tg.execing != nil {
		// If the caller is in the same thread group, then what we return
		// doesn't matter too much since the caller will exit before it returns
		// to userspace. If the caller isn't in the same thread group, then
		// we're in uncharted territory and can return whatever we want.
		return nil, syserror.EINTR
	}
	if err := ts.assignTIDsLocked(t); err != nil {
		return nil, err
	}
	// Below this point, newTask is expected not to fail (there is no rollback
	// of assignTIDsLocked or any of the following).

	// Logging on t's behalf will panic if t.logPrefix hasn't been
	// initialized. This is the earliest point at which we can do so
	// (since t now has thread IDs).
t.updateInfoLocked() if cfg.InheritParent != nil { t.parent = cfg.InheritParent.parent } if t.parent != nil { t.parent.children[t] = struct{}{} } if VFS2Enabled { t.EnterInitialCgroups(t.parent) } if tg.leader == nil { // New thread group. tg.leader = t if parentPG := tg.parentPG(); parentPG == nil { tg.createSession() } else { // Inherit the process group and terminal. parentPG.incRefWithParent(parentPG) tg.processGroup = parentPG tg.tty = t.parent.tg.tty } } tg.tasks.PushBack(t) tg.tasksCount++ tg.liveTasks++ tg.activeTasks++ // Propagate external TaskSet stops to the new task. t.stopCount = ts.stopCount t.mu.Lock() defer t.mu.Unlock() t.cpu = assignCPU(t.allowedCPUMask, ts.Root.tids[t]) t.startTime = t.k.RealtimeClock().Now() return t, nil } // assignTIDsLocked ensures that new task t is visible in all PID namespaces in // which it should be visible. // // Preconditions: ts.mu must be locked for writing. func (ts *TaskSet) assignTIDsLocked(t *Task) error { type allocatedTID struct { ns *PIDNamespace tid ThreadID } var allocatedTIDs []allocatedTID for ns := t.tg.pidns; ns != nil; ns = ns.parent { tid, err := ns.allocateTID() if err != nil { // Failure. Remove the tids we already allocated in descendant // namespaces. for _, a := range allocatedTIDs { delete(a.ns.tasks, a.tid) delete(a.ns.tids, t) if t.tg.leader == nil { delete(a.ns.tgids, t.tg) } } return err } ns.tasks[tid] = t ns.tids[t] = tid if t.tg.leader == nil { // New thread group. ns.tgids[t.tg] = tid } allocatedTIDs = append(allocatedTIDs, allocatedTID{ns, tid}) } return nil } // allocateTID returns an unused ThreadID from ns. // // Preconditions: ns.owner.mu must be locked for writing. func (ns *PIDNamespace) allocateTID() (ThreadID, error) { if ns.exiting { // "In this case, a subsequent fork(2) into this PID namespace will // fail with the error ENOMEM; it is not possible to create a new // processes [sic] in a PID namespace whose init process has // terminated." - pid_namespaces(7) return 0, syserror.ENOMEM } tid := ns.last for { // Next. tid++ if tid > TasksLimit { tid = InitTID + 1 } // Is it available? tidInUse := func() bool { if _, ok := ns.tasks[tid]; ok { return true } if _, ok := ns.processGroups[ProcessGroupID(tid)]; ok { return true } if _, ok := ns.sessions[SessionID(tid)]; ok { return true } return false }() if !tidInUse { ns.last = tid return tid, nil } // Did we do a full cycle? if tid == ns.last { // No tid available. return 0, linuxerr.EAGAIN } } } // Start starts the task goroutine. Start must be called exactly once for each // task returned by NewTask. // // 'tid' must be the task's TID in the root PID namespace and it's used for // debugging purposes only (set as parameter to Task.run to make it visible // in stack dumps). func (t *Task) Start(tid ThreadID) { // If the task was restored, it may be "starting" after having already exited. if t.runState == nil { return } t.goroutineStopped.Add(1) t.tg.liveGoroutines.Add(1) t.tg.pidns.owner.liveGoroutines.Add(1) t.tg.pidns.owner.runningGoroutines.Add(1) // Task is now running in system mode. t.accountTaskGoroutineLeave(TaskGoroutineNonexistent) // Use the task's TID in the root PID namespace to make it visible in stack dumps. go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops }
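// Editor's sketch (not part of the original source): allocateTID's search is
// a ring scan with a full-cycle failure check. The standalone restatement
// below is hypothetical and simplified, using a single in-use predicate in
// place of the separate task/process group/session tables.
func nextFreeTID(last ThreadID, inUse func(ThreadID) bool) (ThreadID, bool) {
	tid := last
	for {
		tid++
		if tid > TasksLimit {
			// Wrap around, skipping InitTID, which is reserved.
			tid = InitTID + 1
		}
		if !inUse(tid) {
			return tid, true
		}
		if tid == last {
			// A full cycle found nothing free.
			return 0, false
		}
	}
}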
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package transport

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/waiter"
)

// UniqueIDProvider generates a sequence of unique identifiers useful for,
// among other things, lock ordering.
type UniqueIDProvider interface {
	// UniqueID returns a new unique identifier.
	UniqueID() uint64
}

// A ConnectingEndpoint is a connectioned unix endpoint that is attempting to
// establish a bidirectional connection with a BoundEndpoint.
type ConnectingEndpoint interface {
	// ID returns the endpoint's globally unique identifier. This identifier
	// must be used to determine locking order if more than one endpoint is
	// to be locked in the same codepath.
The endpoint with the smaller // identifier must be locked before endpoints with larger identifiers. ID() uint64 // Passcred implements socket.Credentialer.Passcred. Passcred() bool // Type returns the socket type, typically either SockStream or // SockSeqpacket. The connection attempt must be aborted if this // value doesn't match the ConnectableEndpoint's type. Type() linux.SockType // GetLocalAddress returns the bound path. GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // Locker protects the following methods. While locked, only the holder of // the lock can change the return value of the protected methods. sync.Locker // Connected returns true iff the ConnectingEndpoint is in the connected // state. ConnectingEndpoints can only be connected to a single endpoint, // so the connection attempt must be aborted if this returns true. Connected() bool // Listening returns true iff the ConnectingEndpoint is in the listening // state. ConnectingEndpoints cannot make connections while listening, so // the connection attempt must be aborted if this returns true. Listening() bool // WaiterQueue returns a pointer to the endpoint's waiter queue. WaiterQueue() *waiter.Queue } // connectionedEndpoint is a Unix-domain connected or connectable endpoint and implements // ConnectingEndpoint, ConnectableEndpoint and tcpip.Endpoint. // // connectionedEndpoints must be in connected state in order to transfer data. // // This implementation includes STREAM and SEQPACKET Unix sockets created with // socket(2), accept(2) or socketpair(2) and dgram unix sockets created with // socketpair(2). See unix_connectionless.go for the implementation of DGRAM // Unix sockets created with socket(2). // // The state is much simpler than a TCP endpoint, so it is not encoded // explicitly. Instead we enforce the following invariants: // // receiver != nil, connected != nil => connected. // path != "" && acceptedChan == nil => bound, not listening. // path != "" && acceptedChan != nil => bound and listening. // // Only one of these will be true at any moment. // // +stateify savable type connectionedEndpoint struct { baseEndpoint // id is the unique endpoint identifier. This is used exclusively for // lock ordering within connect. id uint64 // idGenerator is used to generate new unique endpoint identifiers. idGenerator UniqueIDProvider // stype is used by connecting sockets to ensure that they are the // same type. The value is typically either tcpip.SockSeqpacket or // tcpip.SockStream. stype linux.SockType // acceptedChan is per the TCP endpoint implementation. Note that the // sockets in this channel are _already in the connected state_, and // have another associated connectionedEndpoint. // // If nil, then no listen call has been made. acceptedChan chan *connectionedEndpoint `state:".([]*connectionedEndpoint)"` } var ( _ = BoundEndpoint((*connectionedEndpoint)(nil)) _ = Endpoint((*connectionedEndpoint)(nil)) ) // NewConnectioned creates a new unbound connectionedEndpoint. 
func NewConnectioned(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) Endpoint {
	return newConnectioned(ctx, stype, uid)
}

func newConnectioned(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) *connectionedEndpoint {
	ep := &connectionedEndpoint{
		baseEndpoint: baseEndpoint{Queue: &waiter.Queue{}},
		id:           uid.UniqueID(),
		idGenerator:  uid,
		stype:        stype,
	}
	ep.ops.SetSendBufferSize(defaultBufferSize, false /* notify */)
	ep.ops.SetReceiveBufferSize(defaultBufferSize, false /* notify */)
	ep.ops.InitHandler(ep, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits)
	return ep
}

// NewPair allocates a new pair of connected unix-domain connectionedEndpoints.
func NewPair(ctx context.Context, stype linux.SockType, uid UniqueIDProvider) (Endpoint, Endpoint) {
	a := newConnectioned(ctx, stype, uid)
	b := newConnectioned(ctx, stype, uid)

	q1 := &queue{ReaderQueue: a.Queue, WriterQueue: b.Queue, limit: defaultBufferSize}
	q1.InitRefs()
	q2 := &queue{ReaderQueue: b.Queue, WriterQueue: a.Queue, limit: defaultBufferSize}
	q2.InitRefs()

	if stype == linux.SOCK_STREAM {
		a.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q1}}
		b.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{q2}}
	} else {
		a.receiver = &queueReceiver{q1}
		b.receiver = &queueReceiver{q2}
	}

	q2.IncRef()
	a.connected = &connectedEndpoint{
		endpoint:   b,
		writeQueue: q2,
	}
	q1.IncRef()
	b.connected = &connectedEndpoint{
		endpoint:   a,
		writeQueue: q1,
	}

	return a, b
}

// NewExternal creates a new externally backed Endpoint. It behaves like a
// socketpair.
func NewExternal(ctx context.Context, stype linux.SockType, uid UniqueIDProvider, queue *waiter.Queue, receiver Receiver, connected ConnectedEndpoint) Endpoint {
	ep := &connectionedEndpoint{
		baseEndpoint: baseEndpoint{Queue: queue, receiver: receiver, connected: connected},
		id:           uid.UniqueID(),
		idGenerator:  uid,
		stype:        stype,
	}
	ep.ops.InitHandler(ep, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits)
	ep.ops.SetSendBufferSize(connected.SendMaxQueueSize(), false /* notify */)
	ep.ops.SetReceiveBufferSize(defaultBufferSize, false /* notify */)
	return ep
}

// ID implements ConnectingEndpoint.ID.
func (e *connectionedEndpoint) ID() uint64 {
	return e.id
}

// Type implements ConnectingEndpoint.Type and Endpoint.Type.
func (e *connectionedEndpoint) Type() linux.SockType {
	return e.stype
}

// WaiterQueue implements ConnectingEndpoint.WaiterQueue.
func (e *connectionedEndpoint) WaiterQueue() *waiter.Queue {
	return e.Queue
}

// isBound returns true iff the connectionedEndpoint is bound (but not
// listening).
func (e *connectionedEndpoint) isBound() bool {
	return e.path != "" && e.acceptedChan == nil
}

// Listening implements ConnectingEndpoint.Listening.
func (e *connectionedEndpoint) Listening() bool {
	return e.acceptedChan != nil
}

// Close puts the connectionedEndpoint in a closed state and frees all
// resources associated with it.
//
// The socket will be in a fresh state after a call to close and may be reused.
// That is, close may be used to "unbind" or "disconnect" the socket in error
// paths.
func (e *connectionedEndpoint) Close(ctx context.Context) {
	e.Lock()
	var c ConnectedEndpoint
	var r Receiver
	switch {
	case e.Connected():
		e.connected.CloseSend()
		e.receiver.CloseRecv()
		// Still have unread data? If yes, we set this into the write
		// end so that the peer can get ECONNRESET when it does read.
if e.receiver.RecvQueuedSize() > 0 { e.connected.CloseUnread() } c = e.connected r = e.receiver e.connected = nil e.receiver = nil case e.isBound(): e.path = "" case e.Listening(): close(e.acceptedChan) for n := range e.acceptedChan { n.Close(ctx) } e.acceptedChan = nil e.path = "" } e.Unlock() if c != nil { c.CloseNotify() c.Release(ctx) } if r != nil { r.CloseNotify() r.Release(ctx) } } // BidirectionalConnect implements BoundEndpoint.BidirectionalConnect. func (e *connectionedEndpoint) BidirectionalConnect(ctx context.Context, ce ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error { if ce.Type() != e.stype { return syserr.ErrWrongProtocolForSocket } // Check if ce is e to avoid a deadlock. if ce, ok := ce.(*connectionedEndpoint); ok && ce == e { return syserr.ErrInvalidEndpointState } // Do a dance to safely acquire locks on both endpoints. if e.id < ce.ID() { e.Lock() ce.Lock() } else { ce.Lock() e.Lock() } // Check connecting state. if ce.Connected() { e.Unlock() ce.Unlock() return syserr.ErrAlreadyConnected } if ce.Listening() { e.Unlock() ce.Unlock() return syserr.ErrInvalidEndpointState } // Check bound state. if !e.Listening() { e.Unlock() ce.Unlock() return syserr.ErrConnectionRefused } // Create a newly bound connectionedEndpoint. ne := &connectionedEndpoint{ baseEndpoint: baseEndpoint{ path: e.path, Queue: &waiter.Queue{}, }, id: e.idGenerator.UniqueID(), idGenerator: e.idGenerator, stype: e.stype, } ne.ops.InitHandler(ne, &stackHandler{}, getSendBufferLimits, getReceiveBufferLimits) ne.ops.SetSendBufferSize(defaultBufferSize, false /* notify */) ne.ops.SetReceiveBufferSize(defaultBufferSize, false /* notify */) readQueue := &queue{ReaderQueue: ce.WaiterQueue(), WriterQueue: ne.Queue, limit: defaultBufferSize} readQueue.InitRefs() ne.connected = &connectedEndpoint{ endpoint: ce, writeQueue: readQueue, } // Make sure the accepted endpoint inherits this listening socket's SO_SNDBUF. writeQueue := &queue{ReaderQueue: ne.Queue, WriterQueue: ce.WaiterQueue(), limit: e.ops.GetSendBufferSize()} writeQueue.InitRefs() if e.stype == linux.SOCK_STREAM { ne.receiver = &streamQueueReceiver{queueReceiver: queueReceiver{readQueue: writeQueue}} } else { ne.receiver = &queueReceiver{readQueue: writeQueue} } select { case e.acceptedChan <- ne: // Commit state. writeQueue.IncRef() connected := &connectedEndpoint{ endpoint: ne, writeQueue: writeQueue, } readQueue.IncRef() if e.stype == linux.SOCK_STREAM { returnConnect(&streamQueueReceiver{queueReceiver: queueReceiver{readQueue: readQueue}}, connected) } else { returnConnect(&queueReceiver{readQueue: readQueue}, connected) } // Notify can deadlock if we are holding these locks. e.Unlock() ce.Unlock() // Notify on both ends. e.Notify(waiter.ReadableEvents) ce.WaiterQueue().Notify(waiter.WritableEvents) return nil default: // Busy; return EAGAIN per spec. ne.Close(ctx) e.Unlock() ce.Unlock() return syserr.ErrTryAgain } } // UnidirectionalConnect implements BoundEndpoint.UnidirectionalConnect. func (e *connectionedEndpoint) UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error) { return nil, syserr.ErrConnectionRefused } // Connect attempts to directly connect to another Endpoint. // Implements Endpoint.Connect. 
func (e *connectionedEndpoint) Connect(ctx context.Context, server BoundEndpoint) *syserr.Error {
	returnConnect := func(r Receiver, ce ConnectedEndpoint) {
		e.receiver = r
		e.connected = ce
		// Make sure the newly created connected endpoint's write queue is updated
		// to reflect this endpoint's send buffer size.
		if bufSz := e.connected.SetSendBufferSize(e.ops.GetSendBufferSize()); bufSz != e.ops.GetSendBufferSize() {
			e.ops.SetSendBufferSize(bufSz, false /* notify */)
			e.ops.SetReceiveBufferSize(bufSz, false /* notify */)
		}
	}

	return server.BidirectionalConnect(ctx, e, returnConnect)
}

// Listen starts listening on the connection.
func (e *connectionedEndpoint) Listen(backlog int) *syserr.Error {
	e.Lock()
	defer e.Unlock()
	if e.Listening() {
		// Adjust the size of the channel iff we can fit existing
		// pending connections into the new one.
		if len(e.acceptedChan) > backlog {
			return syserr.ErrInvalidEndpointState
		}
		origChan := e.acceptedChan
		e.acceptedChan = make(chan *connectionedEndpoint, backlog)
		close(origChan)
		for ep := range origChan {
			e.acceptedChan <- ep
		}
		return nil
	}
	if !e.isBound() {
		return syserr.ErrInvalidEndpointState
	}

	// Normal case.
	e.acceptedChan = make(chan *connectionedEndpoint, backlog)
	return nil
}

// Accept accepts a new connection.
func (e *connectionedEndpoint) Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error) {
	e.Lock()
	defer e.Unlock()

	if !e.Listening() {
		return nil, syserr.ErrInvalidEndpointState
	}

	select {
	case ne := <-e.acceptedChan:
		if peerAddr != nil {
			ne.Lock()
			c := ne.connected
			ne.Unlock()
			if c != nil {
				addr, err := c.GetLocalAddress()
				if err != nil {
					return nil, syserr.TranslateNetstackError(err)
				}
				*peerAddr = addr
			}
		}
		return ne, nil
	default:
		// Nothing left.
		return nil, syserr.ErrWouldBlock
	}
}

// Bind binds the connection.
//
// For Unix connectionedEndpoints, this _only sets the address associated with
// the socket_. Work associated with sockets in the filesystem or finding those
// sockets must be done by a higher level.
//
// Bind will fail only if the socket is connected, bound or the passed address
// is invalid (the empty string).
func (e *connectionedEndpoint) Bind(addr tcpip.FullAddress, commit func() *syserr.Error) *syserr.Error {
	e.Lock()
	defer e.Unlock()
	if e.isBound() || e.Listening() {
		return syserr.ErrAlreadyBound
	}
	if addr.Addr == "" {
		// The empty string is not permitted.
		return syserr.ErrBadLocalAddress
	}
	if commit != nil {
		if err := commit(); err != nil {
			return err
		}
	}

	// Save the bound address.
	e.path = string(addr.Addr)
	return nil
}

// SendMsg writes data and a control message to the endpoint's peer.
// This method does not block if the data cannot be written.
func (e *connectionedEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, *syserr.Error) {
	// Stream sockets do not support specifying the endpoint. Seqpacket
	// sockets ignore the passed endpoint.
	if e.stype == linux.SOCK_STREAM && to != nil {
		return 0, syserr.ErrNotSupported
	}
	return e.baseEndpoint.SendMsg(ctx, data, c, to)
}

// Readiness returns the current readiness of the connectionedEndpoint. For
// example, if waiter.EventIn is set, the connectionedEndpoint is immediately
// readable.
func (e *connectionedEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask { e.Lock() defer e.Unlock() ready := waiter.EventMask(0) switch { case e.Connected(): if mask&waiter.ReadableEvents != 0 && e.receiver.Readable() { ready |= waiter.ReadableEvents } if mask&waiter.WritableEvents != 0 && e.connected.Writable() { ready |= waiter.WritableEvents } case e.Listening(): if mask&waiter.ReadableEvents != 0 && len(e.acceptedChan) > 0 { ready |= waiter.ReadableEvents } } return ready } // State implements socket.Socket.State. func (e *connectionedEndpoint) State() uint32 { e.Lock() defer e.Unlock() if e.Connected() { return linux.SS_CONNECTED } return linux.SS_UNCONNECTED } // OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize. func (e *connectionedEndpoint) OnSetSendBufferSize(v int64) (newSz int64) { if e.Connected() { return e.baseEndpoint.connected.SetSendBufferSize(v) } return v }
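// Illustrative sketch (not part of these sources; the types are hypothetical):
// the ID-ordered locking "dance" used by BidirectionalConnect above,
// generalized to any pair of lockable objects with unique IDs. Always
// acquiring the endpoint with the smaller ID first gives every goroutine the
// same global lock order, so two concurrent connects between the same pair of
// endpoints cannot deadlock.
package lockordersketch

import "sync"

type lockable struct {
	mu sync.Mutex
	id uint64 // Unique identifier, e.g. from a UniqueIDProvider.
}

// lockPair locks a and b in ascending ID order. Callers may unlock in any
// order, since deadlock only depends on acquisition order.
func lockPair(a, b *lockable) {
	if a.id < b.id {
		a.mu.Lock()
		b.mu.Lock()
	} else {
		b.mu.Lock()
		a.mu.Lock()
	}
}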
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package context defines an internal context type.
//
// The given Context conforms to the standard Go context, but mandates
// additional methods that are specific to the kernel internals. Note however,
// that the Context described by this package carries additional constraints
// regarding concurrent access and retaining beyond the scope of a call.
//
// See the Context type for complete details.
package context

import (
	"context"
	"time"

	"gvisor.dev/gvisor/pkg/log"
)

type contextID int

// Globally accessible values from a context. These keys are defined in the
// context package to resolve dependency cycles by not requiring the caller to
// import packages usually required to get this information.
const (
	// CtxThreadGroupID is the current thread group ID when a context represents
	// a task context. The value is represented as an int32.
	CtxThreadGroupID contextID = iota
)

// ThreadGroupIDFromContext returns the current thread group ID when ctx
// represents a task context.
func ThreadGroupIDFromContext(ctx Context) (tgid int32, ok bool) {
	if tgid := ctx.Value(CtxThreadGroupID); tgid != nil {
		return tgid.(int32), true
	}
	return 0, false
}

// A Context represents a thread of execution (hereafter "goroutine" to reflect
// Go idiosyncrasy). It carries state associated with the goroutine across API
// boundaries.
//
// While Context exists for essentially the same reasons as Go's standard
// context.Context, the standard type represents the state of an operation
// rather than that of a goroutine. This is a critical distinction:
//
// - Unlike context.Context, which "may be passed to functions running in
// different goroutines", it is *not safe* to use the same Context in multiple
// concurrent goroutines.
//
// - It is *not safe* to retain a Context passed to a function beyond the scope
// of that function call.
//
// In both cases, values extracted from the Context should be used instead.
type Context interface {
	log.Logger
	context.Context
	ChannelSleeper

	// UninterruptibleSleepStart indicates the beginning of an uninterruptible
	// sleep state (equivalent to Linux's TASK_UNINTERRUPTIBLE). If deactivate
	// is true and the Context represents a Task, the Task's AddressSpace is
	// deactivated.
UninterruptibleSleepStart(deactivate bool) // UninterruptibleSleepFinish indicates the end of an uninterruptible sleep // state that was begun by a previous call to UninterruptibleSleepStart. If // activate is true and the Context represents a Task, the Task's // AddressSpace is activated. Normally activate is the same value as the // deactivate parameter passed to UninterruptibleSleepStart. UninterruptibleSleepFinish(activate bool) } // A ChannelSleeper represents a goroutine that may sleep interruptibly, where // interruption is indicated by a channel becoming readable. type ChannelSleeper interface { // SleepStart is called before going to sleep interruptibly. If SleepStart // returns a non-nil channel and that channel becomes ready for receiving // while the goroutine is sleeping, the goroutine should be woken, and // SleepFinish(false) should be called. Otherwise, SleepFinish(true) should // be called after the goroutine stops sleeping. SleepStart() <-chan struct{} // SleepFinish is called after an interruptibly-sleeping goroutine stops // sleeping, as documented by SleepStart. SleepFinish(success bool) // Interrupted returns true if the channel returned by SleepStart is // ready for receiving. Interrupted() bool } // NoopSleeper is a noop implementation of ChannelSleeper and // Context.UninterruptibleSleep* methods for anonymous embedding in other types // that do not implement special behavior around sleeps. type NoopSleeper struct{} // SleepStart implements ChannelSleeper.SleepStart. func (NoopSleeper) SleepStart() <-chan struct{} { return nil } // SleepFinish implements ChannelSleeper.SleepFinish. func (NoopSleeper) SleepFinish(success bool) {} // Interrupted implements ChannelSleeper.Interrupted. func (NoopSleeper) Interrupted() bool { return false } // UninterruptibleSleepStart implements Context.UninterruptibleSleepStart. func (NoopSleeper) UninterruptibleSleepStart(deactivate bool) {} // UninterruptibleSleepFinish implements Context.UninterruptibleSleepFinish. func (NoopSleeper) UninterruptibleSleepFinish(activate bool) {} // Deadline implements context.Context.Deadline. func (NoopSleeper) Deadline() (time.Time, bool) { return time.Time{}, false } // Done implements context.Context.Done. func (NoopSleeper) Done() <-chan struct{} { return nil } // Err returns context.Context.Err. func (NoopSleeper) Err() error { return nil } // logContext implements basic logging. type logContext struct { log.Logger NoopSleeper } // Value implements Context.Value. func (logContext) Value(key interface{}) interface{} { return nil } // bgContext is the context returned by context.Background. var bgContext = &logContext{Logger: log.Log()} // Background returns an empty context using the default logger. // Generally, one should use the Task as their context when available, or avoid // having to use a context in places where a Task is unavailable. // // Using a Background context for tests is fine, as long as no values are // needed from the context in the tested code paths. func Background() Context { return bgContext } // WithValue returns a copy of parent in which the value associated with key is // val. func WithValue(parent Context, key, val interface{}) Context { return &withValue{ Context: parent, key: key, val: val, } } type withValue struct { Context key interface{} val interface{} } // Value implements Context.Value. func (ctx *withValue) Value(key interface{}) interface{} { if key == ctx.key { return ctx.val } return ctx.Context.Value(key) }
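// Illustrative usage sketch (not part of these sources; the key type and
// example function are hypothetical): how WithValue above layers lookups. A
// key matched by the wrapper returns immediately; any other key falls through
// the parent chain until the root logContext returns nil.
package ctxsketch

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/context"
)

type key int

const userKey key = 0

func example() {
	ctx := context.WithValue(context.Background(), userKey, "demo")
	fmt.Println(ctx.Value(userKey)) // "demo": matched by the withValue wrapper.
	fmt.Println(ctx.Value(key(1)))  // <nil>: falls through to the root context.
}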
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package control

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// SCMRightsVFS2 represents a SCM_RIGHTS socket control message.
//
// +stateify savable
type SCMRightsVFS2 interface {
	transport.RightsControlMessage

	// Files returns up to max RightsFiles.
	//
	// Returned files are consumed and ownership is transferred to the caller.
	// Subsequent calls to Files will return the next files.
	Files(ctx context.Context, max int) (rf RightsFilesVFS2, truncated bool)
}

// RightsFilesVFS2 represents a SCM_RIGHTS socket control message. A reference
// is maintained for each vfs.FileDescription and is released either when an FD
// is created or when the Release method is called.
//
// +stateify savable
type RightsFilesVFS2 []*vfs.FileDescription

// NewSCMRightsVFS2 creates a new SCM_RIGHTS socket control message
// representation using local sentry FDs.
func NewSCMRightsVFS2(t *kernel.Task, fds []int32) (SCMRightsVFS2, error) {
	files := make(RightsFilesVFS2, 0, len(fds))
	for _, fd := range fds {
		file := t.GetFileVFS2(fd)
		if file == nil {
			files.Release(t)
			return nil, linuxerr.EBADF
		}
		files = append(files, file)
	}
	return &files, nil
}

// Files implements SCMRightsVFS2.Files.
func (fs *RightsFilesVFS2) Files(ctx context.Context, max int) (RightsFilesVFS2, bool) {
	n := max
	var trunc bool
	if l := len(*fs); n > l {
		n = l
	} else if n < l {
		trunc = true
	}
	rf := (*fs)[:n]
	*fs = (*fs)[n:]
	return rf, trunc
}

// Clone implements transport.RightsControlMessage.Clone.
func (fs *RightsFilesVFS2) Clone() transport.RightsControlMessage {
	nfs := append(RightsFilesVFS2(nil), *fs...)
	for _, nf := range nfs {
		nf.IncRef()
	}
	return &nfs
}

// Release implements transport.RightsControlMessage.Release.
func (fs *RightsFilesVFS2) Release(ctx context.Context) {
	for _, f := range *fs {
		f.DecRef(ctx)
	}
	*fs = nil
}

// rightsFDsVFS2 gets up to the specified maximum number of FDs.
func rightsFDsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, max int) ([]int32, bool) {
	files, trunc := rights.Files(t, max)
	fds := make([]int32, 0, len(files))
	for i := 0; i < max && len(files) > 0; i++ {
		fd, err := t.NewFDFromVFS2(0, files[0], kernel.FDFlags{
			CloseOnExec: cloexec,
		})
		files[0].DecRef(t)
		files = files[1:]
		if err != nil {
			t.Warningf("Error inserting FD: %v", err)
			// This is what Linux does.
break } fds = append(fds, int32(fd)) } return fds, trunc } // PackRightsVFS2 packs as many FDs as will fit into the unused capacity of buf. func PackRightsVFS2(t *kernel.Task, rights SCMRightsVFS2, cloexec bool, buf []byte, flags int) ([]byte, int) { maxFDs := (cap(buf) - len(buf) - linux.SizeOfControlMessageHeader) / 4 // Linux does not return any FDs if none fit. if maxFDs <= 0 { flags |= linux.MSG_CTRUNC return buf, flags } fds, trunc := rightsFDsVFS2(t, rights, cloexec, maxFDs) if trunc { flags |= linux.MSG_CTRUNC } align := t.Arch().Width() return putCmsg(buf, flags, linux.SCM_RIGHTS, align, fds) } // NewVFS2 creates default control messages if needed. func NewVFS2(t *kernel.Task, socketOrEndpoint interface{}, rights SCMRightsVFS2) transport.ControlMessages { return transport.ControlMessages{ Credentials: makeCreds(t, socketOrEndpoint), Rights: rights, } }
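// Illustrative sketch (not part of these sources; the constant value mirrors
// linux.SizeOfControlMessageHeader on 64-bit Linux): the FD-budget arithmetic
// in PackRightsVFS2 above, i.e. the buffer's unused capacity, minus the
// control message header, divided by the 4 bytes each FD occupies.
package cmsgsketch

// cmsgHeaderSize stands in for linux.SizeOfControlMessageHeader: the length
// (8 bytes), level (4 bytes), and type (4 bytes) fields of struct cmsghdr.
const cmsgHeaderSize = 16

// maxFDs reports how many rights FDs fit into buf's unused capacity, e.g.
// a zero-length buffer with capacity 64 has room for (64-0-16)/4 = 12 FDs.
func maxFDs(buf []byte) int {
	n := (cap(buf) - len(buf) - cmsgHeaderSize) / 4
	if n < 0 {
		return 0
	}
	return n
}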
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package stack

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
)

var _ AddressableEndpoint = (*AddressableEndpointState)(nil)

// AddressableEndpointState is an implementation of an AddressableEndpoint.
type AddressableEndpointState struct {
	networkEndpoint NetworkEndpoint

	// Lock ordering (from outer to inner lock ordering):
	//
	// AddressableEndpointState.mu
	//   addressState.mu
	mu struct {
		sync.RWMutex

		endpoints map[tcpip.Address]*addressState
		primary   []*addressState
	}
}

// Init initializes the AddressableEndpointState with networkEndpoint.
//
// Must be called before calling any other function on a.
func (a *AddressableEndpointState) Init(networkEndpoint NetworkEndpoint) {
	a.networkEndpoint = networkEndpoint

	a.mu.Lock()
	defer a.mu.Unlock()
	a.mu.endpoints = make(map[tcpip.Address]*addressState)
}

// GetAddress returns the AddressEndpoint for the passed address.
//
// GetAddress does not increment the address's reference count or check if the
// address is considered bound to the endpoint.
//
// Returns nil if the passed address is not associated with the endpoint.
func (a *AddressableEndpointState) GetAddress(addr tcpip.Address) AddressEndpoint {
	a.mu.RLock()
	defer a.mu.RUnlock()

	ep, ok := a.mu.endpoints[addr]
	if !ok {
		return nil
	}
	return ep
}

// ForEachEndpoint calls f for each address.
//
// Once f returns false, f will no longer be called.
func (a *AddressableEndpointState) ForEachEndpoint(f func(AddressEndpoint) bool) {
	a.mu.RLock()
	defer a.mu.RUnlock()

	for _, ep := range a.mu.endpoints {
		if !f(ep) {
			return
		}
	}
}

// ForEachPrimaryEndpoint calls f for each primary address.
//
// Once f returns false, f will no longer be called.
func (a *AddressableEndpointState) ForEachPrimaryEndpoint(f func(AddressEndpoint) bool) {
	a.mu.RLock()
	defer a.mu.RUnlock()

	for _, ep := range a.mu.primary {
		if !f(ep) {
			return
		}
	}
}

func (a *AddressableEndpointState) releaseAddressState(addrState *addressState) {
	a.mu.Lock()
	defer a.mu.Unlock()
	a.releaseAddressStateLocked(addrState)
}

// releaseAddressStateLocked removes addrState from a's address state (primary
// and endpoints list).
//
// Preconditions: a.mu must be write locked.
func (a *AddressableEndpointState) releaseAddressStateLocked(addrState *addressState) {
	oldPrimary := a.mu.primary
	for i, s := range a.mu.primary {
		if s == addrState {
			a.mu.primary = append(a.mu.primary[:i], a.mu.primary[i+1:]...)
			oldPrimary[len(oldPrimary)-1] = nil
			break
		}
	}
	delete(a.mu.endpoints, addrState.addr.Address)
}

// AddAndAcquirePermanentAddress implements AddressableEndpoint.
func (a *AddressableEndpointState) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior, configType AddressConfigType, deprecated bool) (AddressEndpoint, tcpip.Error) {
	a.mu.Lock()
	defer a.mu.Unlock()
	ep, err := a.addAndAcquireAddressLocked(addr, peb, configType, deprecated, true /* permanent */)
	// From https://golang.org/doc/faq#nil_error:
	//
	// Under the covers, interfaces are implemented as two elements, a type T and
	// a value V.
	//
	// An interface value is nil only if the V and T are both unset, (T=nil, V is
	// not set), In particular, a nil interface will always hold a nil type. If we
	// store a nil pointer of type *int inside an interface value, the inner type
	// will be *int regardless of the value of the pointer: (T=*int, V=nil). Such
	// an interface value will therefore be non-nil even when the pointer value V
	// inside is nil.
	//
	// Since addAndAcquireAddressLocked returns a nil value with a non-nil type,
	// we need to explicitly return nil below if ep is (a typed) nil.
	if ep == nil {
		return nil, err
	}
	return ep, err
}

// AddAndAcquireTemporaryAddress adds a temporary address.
//
// Returns *tcpip.ErrDuplicateAddress if the address exists.
//
// The temporary address's endpoint is acquired and returned.
func (a *AddressableEndpointState) AddAndAcquireTemporaryAddress(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior) (AddressEndpoint, tcpip.Error) {
	a.mu.Lock()
	defer a.mu.Unlock()
	ep, err := a.addAndAcquireAddressLocked(addr, peb, AddressConfigStatic, false /* deprecated */, false /* permanent */)
	// From https://golang.org/doc/faq#nil_error:
	//
	// Under the covers, interfaces are implemented as two elements, a type T and
	// a value V.
	//
	// An interface value is nil only if the V and T are both unset, (T=nil, V is
	// not set), In particular, a nil interface will always hold a nil type. If we
	// store a nil pointer of type *int inside an interface value, the inner type
	// will be *int regardless of the value of the pointer: (T=*int, V=nil). Such
	// an interface value will therefore be non-nil even when the pointer value V
	// inside is nil.
	//
	// Since addAndAcquireAddressLocked returns a nil value with a non-nil type,
	// we need to explicitly return nil below if ep is (a typed) nil.
	if ep == nil {
		return nil, err
	}
	return ep, err
}

// addAndAcquireAddressLocked adds, acquires and returns a permanent or
// temporary address.
//
// If the addressable endpoint already has the address in a non-permanent state,
// and addAndAcquireAddressLocked is adding a permanent address, that address is
// promoted in place and its properties set to the properties provided. If the
// address already exists in any other state, then *tcpip.ErrDuplicateAddress is
// returned, regardless of the kind of address that is being added.
//
// Precondition: a.mu must be write locked.
func (a *AddressableEndpointState) addAndAcquireAddressLocked(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior, configType AddressConfigType, deprecated, permanent bool) (*addressState, tcpip.Error) {
	// attemptAddToPrimary is false when the address is already in the primary
	// address list.
	attemptAddToPrimary := true
	addrState, ok := a.mu.endpoints[addr.Address]
	if ok {
		if !permanent {
			// We are adding a non-permanent address but the address exists. No need
			// to go any further since we can only promote existing temporary/expired
			// addresses to permanent.
			return nil, &tcpip.ErrDuplicateAddress{}
		}

		addrState.mu.Lock()
		if addrState.mu.kind.IsPermanent() {
			addrState.mu.Unlock()
			// We are adding a permanent address but a permanent address already
			// exists.
			return nil, &tcpip.ErrDuplicateAddress{}
		}

		if addrState.mu.refs == 0 {
			panic(fmt.Sprintf("found an address that should have been released (ref count == 0); address = %s", addrState.addr))
		}

		// We now promote the address.
		for i, s := range a.mu.primary {
			if s == addrState {
				switch peb {
				case CanBePrimaryEndpoint:
					// The address is already in the primary address list.
					attemptAddToPrimary = false
				case FirstPrimaryEndpoint:
					if i == 0 {
						// The address is already first in the primary address list.
						attemptAddToPrimary = false
					} else {
						a.mu.primary = append(a.mu.primary[:i], a.mu.primary[i+1:]...)
					}
				case NeverPrimaryEndpoint:
					a.mu.primary = append(a.mu.primary[:i], a.mu.primary[i+1:]...)
				default:
					panic(fmt.Sprintf("unrecognized primary endpoint behaviour = %d", peb))
				}
				break
			}
		}
	}

	if addrState == nil {
		addrState = &addressState{
			addressableEndpointState: a,
			addr:                     addr,
			// Cache the subnet in addrState to avoid calls to addr.Subnet() as that
			// results in allocations on every call.
			subnet: addr.Subnet(),
		}
		a.mu.endpoints[addr.Address] = addrState
		addrState.mu.Lock()
		// We never promote an address to temporary - it can only be added as such.
		// If we are actually adding a permanent address, it is promoted below.
		addrState.mu.kind = Temporary
	}

	// At this point we have an address we are either promoting from an expired or
	// temporary address to permanent, promoting an expired address to temporary,
	// or we are adding a new temporary or permanent address.
	//
	// The address MUST be write locked at this point.
	defer addrState.mu.Unlock() // +checklocksforce

	if permanent {
		if addrState.mu.kind.IsPermanent() {
			panic(fmt.Sprintf("only non-permanent addresses should be promoted to permanent; address = %s", addrState.addr))
		}

		// Primary addresses are biased by 1.
		addrState.mu.refs++
		addrState.mu.kind = Permanent
	}
	// Acquire the address before returning it.
	addrState.mu.refs++
	addrState.mu.deprecated = deprecated
	addrState.mu.configType = configType

	if attemptAddToPrimary {
		switch peb {
		case NeverPrimaryEndpoint:
		case CanBePrimaryEndpoint:
			a.mu.primary = append(a.mu.primary, addrState)
		case FirstPrimaryEndpoint:
			if cap(a.mu.primary) == len(a.mu.primary) {
				a.mu.primary = append([]*addressState{addrState}, a.mu.primary...)
			} else {
				// Shift all the endpoints by 1 to make room for the new address at the
				// front. We could have just created a new slice but this saves
				// allocations when the slice has capacity for the new address.
				primaryCount := len(a.mu.primary)
				a.mu.primary = append(a.mu.primary, nil)
				if n := copy(a.mu.primary[1:], a.mu.primary); n != primaryCount {
					panic(fmt.Sprintf("copied %d elements; expected = %d elements", n, primaryCount))
				}
				a.mu.primary[0] = addrState
			}
		default:
			panic(fmt.Sprintf("unrecognized primary endpoint behaviour = %d", peb))
		}
	}

	return addrState, nil
}

// RemovePermanentAddress implements AddressableEndpoint.
func (a *AddressableEndpointState) RemovePermanentAddress(addr tcpip.Address) tcpip.Error {
	a.mu.Lock()
	defer a.mu.Unlock()
	return a.removePermanentAddressLocked(addr)
}

// removePermanentAddressLocked is like RemovePermanentAddress but with locking
// requirements.
//
// Precondition: a.mu must be write locked.
func (a *AddressableEndpointState) removePermanentAddressLocked(addr tcpip.Address) tcpip.Error {
	addrState, ok := a.mu.endpoints[addr]
	if !ok {
		return &tcpip.ErrBadLocalAddress{}
	}

	return a.removePermanentEndpointLocked(addrState)
}

// RemovePermanentEndpoint removes the passed endpoint if it is associated with
// a and permanent.
func (a *AddressableEndpointState) RemovePermanentEndpoint(ep AddressEndpoint) tcpip.Error {
	addrState, ok := ep.(*addressState)
	if !ok || addrState.addressableEndpointState != a {
		return &tcpip.ErrInvalidEndpointState{}
	}

	a.mu.Lock()
	defer a.mu.Unlock()
	return a.removePermanentEndpointLocked(addrState)
}

// removePermanentEndpointLocked is like RemovePermanentEndpoint but with
// locking requirements.
//
// Precondition: a.mu must be write locked.
func (a *AddressableEndpointState) removePermanentEndpointLocked(addrState *addressState) tcpip.Error {
	if !addrState.GetKind().IsPermanent() {
		return &tcpip.ErrBadLocalAddress{}
	}

	addrState.SetKind(PermanentExpired)
	a.decAddressRefLocked(addrState)
	return nil
}

// decAddressRef decrements the address's reference count and releases it once
// the reference count hits 0.
func (a *AddressableEndpointState) decAddressRef(addrState *addressState) {
	a.mu.Lock()
	defer a.mu.Unlock()
	a.decAddressRefLocked(addrState)
}

// decAddressRefLocked is like decAddressRef but with locking requirements.
//
// Precondition: a.mu must be write locked.
func (a *AddressableEndpointState) decAddressRefLocked(addrState *addressState) {
	addrState.mu.Lock()
	defer addrState.mu.Unlock()

	if addrState.mu.refs == 0 {
		panic(fmt.Sprintf("attempted to decrease ref count for AddressEndpoint w/ addr = %s when it is already released", addrState.addr))
	}

	addrState.mu.refs--

	if addrState.mu.refs != 0 {
		return
	}

	// A non-expired permanent address must not have its reference count dropped
	// to 0.
	if addrState.mu.kind.IsPermanent() {
		panic(fmt.Sprintf("permanent addresses should be removed through the AddressableEndpoint: addr = %s, kind = %d", addrState.addr, addrState.mu.kind))
	}

	a.releaseAddressStateLocked(addrState)
}

// MainAddress implements AddressableEndpoint.
func (a *AddressableEndpointState) MainAddress() tcpip.AddressWithPrefix {
	a.mu.RLock()
	defer a.mu.RUnlock()

	ep := a.acquirePrimaryAddressRLocked(func(ep *addressState) bool {
		return ep.GetKind() == Permanent
	})
	if ep == nil {
		return tcpip.AddressWithPrefix{}
	}

	addr := ep.AddressWithPrefix()
	a.decAddressRefLocked(ep)
	return addr
}

// acquirePrimaryAddressRLocked returns an acquired primary address that is
// valid according to isValid.
//
// Precondition: a.mu must be read locked.
func (a *AddressableEndpointState) acquirePrimaryAddressRLocked(isValid func(*addressState) bool) *addressState {
	var deprecatedEndpoint *addressState
	for _, ep := range a.mu.primary {
		if !isValid(ep) {
			continue
		}
		if !ep.Deprecated() {
			if ep.IncRef() {
				// ep is not deprecated, so return it immediately.
				//
				// If we kept track of a deprecated endpoint, decrement its reference
				// count since it was incremented when we decided to keep track of it.
				if deprecatedEndpoint != nil {
					a.decAddressRefLocked(deprecatedEndpoint)
					deprecatedEndpoint = nil
				}
				return ep
			}
		} else if deprecatedEndpoint == nil && ep.IncRef() {
			// We prefer an endpoint that is not deprecated, but we keep track of
			// ep in case a doesn't have any non-deprecated endpoints.
			//
			// If we end up finding a more preferred endpoint, ep's reference count
			// will be decremented.
			deprecatedEndpoint = ep
		}
	}
	return deprecatedEndpoint
}

// AcquireAssignedAddressOrMatching returns an address endpoint that is
// considered assigned to the addressable endpoint.
//
// If the address is an exact match with an existing address, that address is
// returned. Otherwise, if f is provided, f is called with each address and
// the address that f returns true for is returned.
//
// If there is no matching address, a temporary address will be returned if
// allowTemp is true.
//
// Regardless of how the address was obtained, it will be acquired before it is
// returned.
func (a *AddressableEndpointState) AcquireAssignedAddressOrMatching(localAddr tcpip.Address, f func(AddressEndpoint) bool, allowTemp bool, tempPEB PrimaryEndpointBehavior) AddressEndpoint { lookup := func() *addressState { if addrState, ok := a.mu.endpoints[localAddr]; ok { if !addrState.IsAssigned(allowTemp) { return nil } if !addrState.IncRef() { panic(fmt.Sprintf("failed to increase the reference count for address = %s", addrState.addr)) } return addrState } if f != nil { for _, addrState := range a.mu.endpoints { if addrState.IsAssigned(allowTemp) && f(addrState) && addrState.IncRef() { return addrState } } } return nil } // Avoid exclusive lock on mu unless we need to add a new address. a.mu.RLock() ep := lookup() a.mu.RUnlock() if ep != nil { return ep } if !allowTemp { return nil } // Acquire state lock in exclusive mode as we need to add a new temporary // endpoint. a.mu.Lock() defer a.mu.Unlock() // Do the lookup again in case another goroutine added the address in the time // we released and acquired the lock. ep = lookup() if ep != nil { return ep } // Proceed to add a new temporary endpoint. addr := localAddr.WithPrefix() ep, err := a.addAndAcquireAddressLocked(addr, tempPEB, AddressConfigStatic, false /* deprecated */, false /* permanent */) if err != nil { // addAndAcquireAddressLocked only returns an error if the address is // already assigned but we just checked above if the address exists so we // expect no error. panic(fmt.Sprintf("a.addAndAcquireAddressLocked(%s, %d, %d, false, false): %s", addr, tempPEB, AddressConfigStatic, err)) } // From https://golang.org/doc/faq#nil_error: // // Under the covers, interfaces are implemented as two elements, a type T and // a value V. // // An interface value is nil only if the V and T are both unset, (T=nil, V is // not set), In particular, a nil interface will always hold a nil type. If we // store a nil pointer of type *int inside an interface value, the inner type // will be *int regardless of the value of the pointer: (T=*int, V=nil). Such // an interface value will therefore be non-nil even when the pointer value V // inside is nil. // // Since addAndAcquireAddressLocked returns a nil value with a non-nil type, // we need to explicitly return nil below if ep is (a typed) nil. if ep == nil { return nil } return ep } // AcquireAssignedAddress implements AddressableEndpoint. func (a *AddressableEndpointState) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB PrimaryEndpointBehavior) AddressEndpoint { return a.AcquireAssignedAddressOrMatching(localAddr, nil, allowTemp, tempPEB) } // AcquireOutgoingPrimaryAddress implements AddressableEndpoint. func (a *AddressableEndpointState) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) AddressEndpoint { a.mu.RLock() defer a.mu.RUnlock() ep := a.acquirePrimaryAddressRLocked(func(ep *addressState) bool { return ep.IsAssigned(allowExpired) }) // From https://golang.org/doc/faq#nil_error: // // Under the covers, interfaces are implemented as two elements, a type T and // a value V. // // An interface value is nil only if the V and T are both unset, (T=nil, V is // not set), In particular, a nil interface will always hold a nil type. If we // store a nil pointer of type *int inside an interface value, the inner type // will be *int regardless of the value of the pointer: (T=*int, V=nil). Such // an interface value will therefore be non-nil even when the pointer value V // inside is nil. 
// // Since acquirePrimaryAddressRLocked returns a nil value with a non-nil type, // we need to explicitly return nil below if ep is (a typed) nil. if ep == nil { return nil } return ep } // PrimaryAddresses implements AddressableEndpoint. func (a *AddressableEndpointState) PrimaryAddresses() []tcpip.AddressWithPrefix { a.mu.RLock() defer a.mu.RUnlock() var addrs []tcpip.AddressWithPrefix for _, ep := range a.mu.primary { // Don't include tentative, expired or temporary endpoints // to avoid confusion and prevent the caller from using // those. switch ep.GetKind() { case PermanentTentative, PermanentExpired, Temporary: continue } addrs = append(addrs, ep.AddressWithPrefix()) } return addrs } // PermanentAddresses implements AddressableEndpoint. func (a *AddressableEndpointState) PermanentAddresses() []tcpip.AddressWithPrefix { a.mu.RLock() defer a.mu.RUnlock() var addrs []tcpip.AddressWithPrefix for _, ep := range a.mu.endpoints { if !ep.GetKind().IsPermanent() { continue } addrs = append(addrs, ep.AddressWithPrefix()) } return addrs } // Cleanup forcefully leaves all groups and removes all permanent addresses. func (a *AddressableEndpointState) Cleanup() { a.mu.Lock() defer a.mu.Unlock() for _, ep := range a.mu.endpoints { // removePermanentEndpointLocked returns *tcpip.ErrBadLocalAddress if ep is // not a permanent address. switch err := a.removePermanentEndpointLocked(ep); err.(type) { case nil, *tcpip.ErrBadLocalAddress: default: panic(fmt.Sprintf("unexpected error from removePermanentEndpointLocked(%s): %s", ep.addr, err)) } } } var _ AddressEndpoint = (*addressState)(nil) // addressState holds state for an address. type addressState struct { addressableEndpointState *AddressableEndpointState addr tcpip.AddressWithPrefix subnet tcpip.Subnet // Lock ordering (from outer to inner lock ordering): // // AddressableEndpointState.mu // addressState.mu mu struct { sync.RWMutex refs uint32 kind AddressKind configType AddressConfigType deprecated bool } } // AddressWithPrefix implements AddressEndpoint. func (a *addressState) AddressWithPrefix() tcpip.AddressWithPrefix { return a.addr } // Subnet implements AddressEndpoint. func (a *addressState) Subnet() tcpip.Subnet { return a.subnet } // GetKind implements AddressEndpoint. func (a *addressState) GetKind() AddressKind { a.mu.RLock() defer a.mu.RUnlock() return a.mu.kind } // SetKind implements AddressEndpoint. func (a *addressState) SetKind(kind AddressKind) { a.mu.Lock() defer a.mu.Unlock() a.mu.kind = kind } // IsAssigned implements AddressEndpoint. func (a *addressState) IsAssigned(allowExpired bool) bool { if !a.addressableEndpointState.networkEndpoint.Enabled() { return false } switch a.GetKind() { case PermanentTentative: return false case PermanentExpired: return allowExpired default: return true } } // IncRef implements AddressEndpoint. func (a *addressState) IncRef() bool { a.mu.Lock() defer a.mu.Unlock() if a.mu.refs == 0 { return false } a.mu.refs++ return true } // DecRef implements AddressEndpoint. func (a *addressState) DecRef() { a.addressableEndpointState.decAddressRef(a) } // ConfigType implements AddressEndpoint. func (a *addressState) ConfigType() AddressConfigType { a.mu.RLock() defer a.mu.RUnlock() return a.mu.configType } // SetDeprecated implements AddressEndpoint. func (a *addressState) SetDeprecated(d bool) { a.mu.Lock() defer a.mu.Unlock() a.mu.deprecated = d } // Deprecated implements AddressEndpoint. func (a *addressState) Deprecated() bool { a.mu.RLock() defer a.mu.RUnlock() return a.mu.deprecated }
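// Illustrative sketch (not part of these sources; the types are hypothetical):
// the typed-nil pitfall that the repeated "if ep == nil { return nil }" checks
// above guard against. An interface value holding a nil concrete pointer is
// itself non-nil, so callers' "== nil" tests would fail without the checks.
package nilsketch

import "fmt"

type endpoint interface{ addr() string }

type state struct{}

func (*state) addr() string { return "" }

// lookup returns a typed nil, mimicking addAndAcquireAddressLocked's error
// path, which returns a nil *addressState.
func lookup() (*state, error) { return nil, fmt.Errorf("duplicate address") }

// acquire converts the typed nil into a true nil interface before returning,
// exactly as the exported methods above do.
func acquire() (endpoint, error) {
	ep, err := lookup()
	if ep == nil {
		// Returning ep directly here would yield a non-nil interface value
		// (T=*state, V=nil) even though the pointer inside is nil.
		return nil, err
	}
	return ep, err
}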
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "encoding/binary" "math" "time" "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) // maxSegmentsPerWake is the maximum number of segments to process in the main // protocol goroutine per wake-up. Yielding [after this number of segments are // processed] allows other events to be processed as well (e.g., timeouts, // resets, etc.). const maxSegmentsPerWake = 100 type handshakeState int // The following are the possible states of the TCP connection during a 3-way // handshake. A depiction of the states and transitions can be found in RFC 793, // page 23. const ( handshakeSynSent handshakeState = iota handshakeSynRcvd handshakeCompleted ) // The following are used to set up sleepers. const ( wakerForNotification = iota wakerForNewSegment wakerForResend ) const ( // Maximum space available for options. maxOptionSize = 40 ) // handshake holds the state used during a TCP 3-way handshake. // // NOTE: handshake.ep.mu is held during handshake processing. It is released if // we are going to block and reacquired when we start processing an event. type handshake struct { ep *endpoint listenEP *endpoint state handshakeState active bool flags header.TCPFlags ackNum seqnum.Value // iss is the initial send sequence number, as defined in RFC 793. iss seqnum.Value // rcvWnd is the receive window, as defined in RFC 793. rcvWnd seqnum.Size // sndWnd is the send window, as defined in RFC 793. sndWnd seqnum.Size // mss is the maximum segment size received from the peer. mss uint16 // sndWndScale is the send window scale, as defined in RFC 1323. A // negative value means no scaling is supported by the peer. sndWndScale int // rcvWndScale is the receive window scale, as defined in RFC 1323. rcvWndScale int // startTime is the time at which the first SYN/SYN-ACK was sent.
startTime tcpip.MonotonicTime // deferAccept, if non-zero, will drop the final ACK for a passive // handshake until an ACK segment with data is received or the timeout is // hit. deferAccept time.Duration // acked is true if the final ACK for a 3-way handshake has // been received. This is required to stop retransmitting the // original SYN-ACK when deferAccept is enabled. acked bool // sendSYNOpts caches the SYN options to be sent. sendSYNOpts header.TCPSynOptions } func (e *endpoint) newHandshake() *handshake { h := &handshake{ ep: e, active: true, rcvWnd: seqnum.Size(e.initialReceiveWindow()), rcvWndScale: e.rcvWndScaleForHandshake(), } h.resetState() // Store reference to handshake state in endpoint. e.h = h return h } func (e *endpoint) newPassiveHandshake(isn, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) *handshake { h := e.newHandshake() h.resetToSynRcvd(isn, irs, opts, deferAccept) return h } // FindWndScale determines the window scale to use for the given maximum window // size. func FindWndScale(wnd seqnum.Size) int { if wnd < 0x10000 { return 0 } max := seqnum.Size(math.MaxUint16) s := 0 for wnd > max && s < header.MaxWndScale { s++ max <<= 1 } return s } // resetState resets the state of the handshake object such that it becomes // ready for a new 3-way handshake. func (h *handshake) resetState() { h.state = handshakeSynSent h.flags = header.TCPFlagSyn h.ackNum = 0 h.mss = 0 h.iss = generateSecureISN(h.ep.TransportEndpointInfo.ID, h.ep.stack.Clock(), h.ep.stack.Seed()) } // generateSecureISN generates a secure Initial Sequence Number based on the // recommendation here https://tools.ietf.org/html/rfc6528#page-3. func generateSecureISN(id stack.TransportEndpointID, clock tcpip.Clock, seed uint32) seqnum.Value { isnHasher := jenkins.Sum32(seed) isnHasher.Write([]byte(id.LocalAddress)) isnHasher.Write([]byte(id.RemoteAddress)) portBuf := make([]byte, 2) binary.LittleEndian.PutUint16(portBuf, id.LocalPort) isnHasher.Write(portBuf) binary.LittleEndian.PutUint16(portBuf, id.RemotePort) isnHasher.Write(portBuf) // The time period here is 64ns. This is similar to what Linux uses to // generate a sequence number that repeats less than once per MSL (2 // minutes). // // A 64ns clock ticks 10^9/64 = 15,625,000 times in a second. Wrapping // the whole 32-bit space would therefore take 2^32/15,625,000 ~ 274 // seconds. // // This effectively guarantees that we won't reuse the ISN for a new // connection on the same tuple for at least 274s. (A standalone check of // this arithmetic appears after this file.) isn := isnHasher.Sum32() + uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Nanoseconds()>>6) return seqnum.Value(isn) } // effectiveRcvWndScale returns the effective receive window scale to be used. // If the peer doesn't support window scaling, the effective rcv wnd scale is // zero; otherwise it's the value calculated based on the initial rcv wnd. func (h *handshake) effectiveRcvWndScale() uint8 { if h.sndWndScale < 0 { return 0 } return uint8(h.rcvWndScale) } // resetToSynRcvd resets the state of the handshake object to the SYN-RCVD // state. func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) { h.active = false h.state = handshakeSynRcvd h.flags = header.TCPFlagSyn | header.TCPFlagAck h.iss = iss h.ackNum = irs + 1 h.mss = opts.MSS h.sndWndScale = opts.WS h.deferAccept = deferAccept h.ep.setEndpointState(StateSynRecv) } // checkAck checks if the ACK number, if present, of a segment received during // a TCP 3-way handshake is valid.
If it's not, a RST segment is sent back in // response. func (h *handshake) checkAck(s *segment) bool { if s.flags.Contains(header.TCPFlagAck) && s.ackNumber != h.iss+1 { // RFC 793, page 36, states that a reset must be generated when // the connection is in any non-synchronized state and an // incoming segment acknowledges something not yet sent. The // connection remains in the same state. ack := s.sequenceNumber.Add(s.logicalLen()) h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, s.ackNumber, ack, 0) return false } return true } // synSentState handles a segment received when the TCP 3-way handshake is in // the SYN-SENT state. func (h *handshake) synSentState(s *segment) tcpip.Error { // RFC 793, page 37, states that in the SYN-SENT state, a reset is // acceptable if the ack field acknowledges the SYN. if s.flags.Contains(header.TCPFlagRst) { if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == h.iss+1 { // RFC 793, page 67, states that "If the RST bit is set [and] If the ACK // was acceptable then signal the user "error: connection reset", drop // the segment, enter CLOSED state, delete TCB, and return." h.ep.workerCleanup = true // Although the RFC above calls out ECONNRESET, Linux actually returns // ECONNREFUSED here so we do as well. return &tcpip.ErrConnectionRefused{} } return nil } if !h.checkAck(s) { return nil } // We are in the SYN-SENT state. We only care about segments that have // the SYN flag. if !s.flags.Contains(header.TCPFlagSyn) { return nil } // Parse the SYN options. rcvSynOpts := parseSynSegmentOptions(s) // Remember if the Timestamp option was negotiated. h.ep.maybeEnableTimestamp(&rcvSynOpts) // Remember if the SACKPermitted option was negotiated. h.ep.maybeEnableSACKPermitted(&rcvSynOpts) // Remember the sequence we'll ack from now on. h.ackNum = s.sequenceNumber + 1 h.flags |= header.TCPFlagAck h.mss = rcvSynOpts.MSS h.sndWndScale = rcvSynOpts.WS // If this is a SYN ACK response, we only need to acknowledge the SYN // and the handshake is completed. if s.flags.Contains(header.TCPFlagAck) { h.state = handshakeCompleted h.ep.transitionToStateEstablishedLocked(h) h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale()) return nil } // A SYN segment was received, but no ACK in it. We acknowledge the SYN // but resend our own SYN and wait for it to be acknowledged in the // SYN-RCVD state. h.state = handshakeSynRcvd ttl := h.ep.ttl amss := h.ep.amss h.ep.setEndpointState(StateSynRecv) synOpts := header.TCPSynOptions{ WS: int(h.effectiveRcvWndScale()), TS: rcvSynOpts.TS, TSVal: h.ep.timestamp(), TSEcr: h.ep.recentTimestamp(), // We only send SACKPermitted if the other side indicated it // permits SACK. This is not explicitly defined in the RFC but // this is the behaviour implemented by Linux. SACKPermitted: rcvSynOpts.SACKPermitted, MSS: amss, } if ttl == 0 { ttl = h.ep.route.DefaultTTL() } h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.TransportEndpointInfo.ID, ttl: ttl, tos: h.ep.sendTOS, flags: h.flags, seq: h.iss, ack: h.ackNum, rcvWnd: h.rcvWnd, }, synOpts) return nil } // synRcvdState handles a segment received when the TCP 3-way handshake is in // the SYN-RCVD state. func (h *handshake) synRcvdState(s *segment) tcpip.Error { if s.flags.Contains(header.TCPFlagRst) { // RFC 793, page 37, states that in the SYN-RCVD state, a reset // is acceptable if the sequence number is in the window. 
if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) { return &tcpip.ErrConnectionRefused{} } return nil } if !h.checkAck(s) { return nil } // RFC 793, Section 3.9, page 69, states that in the SYN-RCVD state, a // sequence number outside of the window causes an ACK with the proper seq // number and "After sending the acknowledgment, drop the unacceptable // segment and return." if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) { if h.ep.allowOutOfWindowAck() { h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd) } return nil } if s.flags.Contains(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 { // We received two SYN segments with different sequence // numbers, so we reset the handshake and restart the whole // process, except that we don't reset the timer. ack := s.sequenceNumber.Add(s.logicalLen()) seq := seqnum.Value(0) if s.flags.Contains(header.TCPFlagAck) { seq = s.ackNumber } h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0) if !h.active { return &tcpip.ErrInvalidEndpointState{} } h.resetState() synOpts := header.TCPSynOptions{ WS: h.rcvWndScale, TS: h.ep.SendTSOk, TSVal: h.ep.timestamp(), TSEcr: h.ep.recentTimestamp(), SACKPermitted: h.ep.SACKPermitted, MSS: h.ep.amss, } h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.TransportEndpointInfo.ID, ttl: h.ep.ttl, tos: h.ep.sendTOS, flags: h.flags, seq: h.iss, ack: h.ackNum, rcvWnd: h.rcvWnd, }, synOpts) return nil } // We have previously received (and acknowledged) the peer's SYN. If the // peer acknowledges our SYN, the handshake is completed. if s.flags.Contains(header.TCPFlagAck) { // If deferAccept is not zero and this is a bare ACK and the // timeout is not hit then drop the ACK. if h.deferAccept != 0 && s.data.Size() == 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) < h.deferAccept { h.acked = true h.ep.stack.Stats().DroppedPackets.Increment() return nil } // If the timestamp option is negotiated and the segment does // not carry a timestamp option then the segment must be dropped // as per https://tools.ietf.org/html/rfc7323#section-3.2. if h.ep.SendTSOk && !s.parsedOptions.TS { h.ep.stack.Stats().DroppedPackets.Increment() return nil } // Drop the ACK if the accept queue is full. // https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_ipv4.c#L1523 // We could abort the connection as well with a tunable as in // https://github.com/torvalds/linux/blob/7acac4b3196/net/ipv4/tcp_minisocks.c#L788 if listenEP := h.listenEP; listenEP != nil && listenEP.acceptQueueIsFull() { listenEP.stack.Stats().DroppedPackets.Increment() return nil } // Update timestamp if required. See RFC7323, section-4.3. if h.ep.SendTSOk && s.parsedOptions.TS { h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber) } h.state = handshakeCompleted h.ep.transitionToStateEstablishedLocked(h) // Requeue the segment if the ACK completing the handshake has more info // to be processed by the newly established endpoint.
if (s.flags.Contains(header.TCPFlagFin) || s.data.Size() > 0) && h.ep.enqueueSegment(s) { s.incRef() h.ep.newSegmentWaker.Assert() } return nil } return nil } func (h *handshake) handleSegment(s *segment) tcpip.Error { h.sndWnd = s.window if !s.flags.Contains(header.TCPFlagSyn) && h.sndWndScale > 0 { h.sndWnd <<= uint8(h.sndWndScale) } switch h.state { case handshakeSynRcvd: return h.synRcvdState(s) case handshakeSynSent: return h.synSentState(s) } return nil } // processSegments goes through the segment queue and processes up to // maxSegmentsPerWake (if they're available). func (h *handshake) processSegments() tcpip.Error { for i := 0; i < maxSegmentsPerWake; i++ { s := h.ep.segmentQueue.dequeue() if s == nil { return nil } err := h.handleSegment(s) s.decRef() if err != nil { return err } // We stop processing packets once the handshake is completed, // otherwise we may process packets meant to be processed by // the main protocol goroutine. if h.state == handshakeCompleted { break } } // If the queue is not empty, make sure we'll wake up in the next // iteration. if !h.ep.segmentQueue.empty() { h.ep.newSegmentWaker.Assert() } return nil } // start sends the first SYN/SYN-ACK. It does not block, even if link address // resolution is required. func (h *handshake) start() { h.startTime = h.ep.stack.Clock().NowMonotonic() h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route) var sackEnabled tcpip.TCPSACKEnabled if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil { // If stack returned an error when checking for SACKEnabled // status then just default to switching off SACK negotiation. sackEnabled = false } synOpts := header.TCPSynOptions{ WS: h.rcvWndScale, TS: true, TSVal: h.ep.timestamp(), TSEcr: h.ep.recentTimestamp(), SACKPermitted: bool(sackEnabled), MSS: h.ep.amss, } // start() is also called in a listen context so we want to make sure we only // send the TS/SACK option when we received the TS/SACK in the initial SYN. if h.state == handshakeSynRcvd { synOpts.TS = h.ep.SendTSOk synOpts.SACKPermitted = h.ep.SACKPermitted && bool(sackEnabled) if h.sndWndScale < 0 { // Disable window scaling if the peer did not send us // the window scaling option. synOpts.WS = -1 } } h.sendSYNOpts = synOpts h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.TransportEndpointInfo.ID, ttl: h.ep.ttl, tos: h.ep.sendTOS, flags: h.flags, seq: h.iss, ack: h.ackNum, rcvWnd: h.rcvWnd, }, synOpts) } // complete completes the TCP 3-way handshake initiated by h.start(). // +checklocks:h.ep.mu func (h *handshake) complete() tcpip.Error { // Set up the wakers. var s sleep.Sleeper resendWaker := sleep.Waker{} s.AddWaker(&resendWaker, wakerForResend) s.AddWaker(&h.ep.notificationWaker, wakerForNotification) s.AddWaker(&h.ep.newSegmentWaker, wakerForNewSegment) defer s.Done() // Initialize the resend timer. timer, err := newBackoffTimer(h.ep.stack.Clock(), time.Second, MaxRTO, resendWaker.Assert) if err != nil { return err } defer timer.stop() for h.state != handshakeCompleted { // Unlock before blocking, and reacquire again afterwards (h.ep.mu is held // throughout handshake processing). h.ep.mu.Unlock() index, _ := s.Fetch(true /* block */) h.ep.mu.Lock() switch index { case wakerForResend: if err := timer.reset(); err != nil { return err } // Resend the SYN/SYN-ACK only if the following conditions hold. // - It's an active handshake (deferAccept does not apply) // - It's a passive handshake and we have not yet got the final-ACK. 
// - It's a passive handshake and we got an ACK but deferAccept is // enabled and we are now past the deferAccept duration. // The last is required to provide a way for the peer to complete // the connection with another ACK or data (as ACKs are never // retransmitted on their own). if h.active || !h.acked || h.deferAccept != 0 && h.ep.stack.Clock().NowMonotonic().Sub(h.startTime) > h.deferAccept { h.ep.sendSynTCP(h.ep.route, tcpFields{ id: h.ep.TransportEndpointInfo.ID, ttl: h.ep.ttl, tos: h.ep.sendTOS, flags: h.flags, seq: h.iss, ack: h.ackNum, rcvWnd: h.rcvWnd, }, h.sendSYNOpts) } case wakerForNotification: n := h.ep.fetchNotifications() if (n&notifyClose)|(n&notifyAbort) != 0 { return &tcpip.ErrAborted{} } if n&notifyDrain != 0 { for !h.ep.segmentQueue.empty() { s := h.ep.segmentQueue.dequeue() err := h.handleSegment(s) s.decRef() if err != nil { return err } if h.state == handshakeCompleted { return nil } } close(h.ep.drainDone) h.ep.mu.Unlock() <-h.ep.undrain h.ep.mu.Lock() } // Check for any ICMP errors notified to us. if n&notifyError != 0 { if err := h.ep.lastErrorLocked(); err != nil { return err } // Flag the handshake failure as aborted if the lastError is // cleared because of a socket layer call. return &tcpip.ErrConnectionAborted{} } case wakerForNewSegment: if err := h.processSegments(); err != nil { return err } } } return nil } type backoffTimer struct { timeout time.Duration maxTimeout time.Duration t tcpip.Timer } func newBackoffTimer(clock tcpip.Clock, timeout, maxTimeout time.Duration, f func()) (*backoffTimer, tcpip.Error) { if timeout > maxTimeout { return nil, &tcpip.ErrTimeout{} } bt := &backoffTimer{timeout: timeout, maxTimeout: maxTimeout} bt.t = clock.AfterFunc(timeout, f) return bt, nil } func (bt *backoffTimer) reset() tcpip.Error { bt.timeout *= 2 if bt.timeout > bt.maxTimeout { return &tcpip.ErrTimeout{} } bt.t.Reset(bt.timeout) return nil } func (bt *backoffTimer) stop() { bt.t.Stop() } func parseSynSegmentOptions(s *segment) header.TCPSynOptions { synOpts := header.ParseSynOptions(s.options, s.flags.Contains(header.TCPFlagAck)) if synOpts.TS { s.parsedOptions.TSVal = synOpts.TSVal s.parsedOptions.TSEcr = synOpts.TSEcr } return synOpts } var optionPool = sync.Pool{ New: func() interface{} { return &[maxOptionSize]byte{} }, } func getOptions() []byte { return (*optionPool.Get().(*[maxOptionSize]byte))[:] } func putOptions(options []byte) { // Reslice to full capacity. optionPool.Put(optionsToArray(options)) } func makeSynOptions(opts header.TCPSynOptions) []byte { // Emulate linux option order. This is as follows: // // if md5: NOP NOP MD5SIG 18 md5sig(16) // if mss: MSS 4 mss(2) // if ts and sack_advertise: // SACK 2 TIMESTAMP 2 timestamp(8) // elif ts: NOP NOP TIMESTAMP 10 timestamp(8) // elif sack: NOP NOP SACK 2 // if wscale: NOP WINDOW 3 ws(1) // if sack_blocks: NOP NOP SACK (2 + (#blocks * 8)) // [for each block] start_seq(4) end_seq(4) // if fastopen_cookie: // if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2) // else: FASTOPEN (2 + len(cookie)) // cookie(variable) [padding to four bytes] // options := getOptions() // Always encode the mss. offset := header.EncodeMSSOption(uint32(opts.MSS), options) // Special ordering is required here. If both TS and SACK are enabled, // then the SACK option precedes TS, with no padding. If they are // enabled individually, then we see padding before the option.
if opts.TS && opts.SACKPermitted { offset += header.EncodeSACKPermittedOption(options[offset:]) offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:]) } else if opts.TS { offset += header.EncodeNOP(options[offset:]) offset += header.EncodeNOP(options[offset:]) offset += header.EncodeTSOption(opts.TSVal, opts.TSEcr, options[offset:]) } else if opts.SACKPermitted { offset += header.EncodeNOP(options[offset:]) offset += header.EncodeNOP(options[offset:]) offset += header.EncodeSACKPermittedOption(options[offset:]) } // Initialize the WS option. if opts.WS >= 0 { offset += header.EncodeNOP(options[offset:]) offset += header.EncodeWSOption(opts.WS, options[offset:]) } // Pad to the end; note that this never applies unless we add a // fastopen option, so we always expect the offset to remain the same. if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { panic("unexpected option encoding") } return options[:offset] } // tcpFields is a struct to carry different parameters required by the // send*TCP variant functions below. type tcpFields struct { id stack.TransportEndpointID ttl uint8 tos uint8 flags header.TCPFlags seq seqnum.Value ack seqnum.Value rcvWnd seqnum.Size opts []byte txHash uint32 } func (e *endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOptions) tcpip.Error { tf.opts = makeSynOptions(opts) // We ignore SYN send errors and let the callers re-attempt send. if err := e.sendTCP(r, tf, buffer.VectorisedView{}, stack.GSO{}); err != nil { e.stats.SendErrors.SynSendToNetworkFailed.Increment() } putOptions(tf.opts) return nil } func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO) tcpip.Error { tf.txHash = e.txHash if err := sendTCP(r, tf, data, gso, e.owner); err != nil { e.stats.SendErrors.SegmentSendToNetworkFailed.Increment() return err } e.stats.SegmentsSent.Increment() return nil } func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso stack.GSO) { optLen := len(tf.opts) tcp := header.TCP(pkt.TransportHeader().Push(header.TCPMinimumSize + optLen)) pkt.TransportProtocolNumber = header.TCPProtocolNumber tcp.Encode(&header.TCPFields{ SrcPort: tf.id.LocalPort, DstPort: tf.id.RemotePort, SeqNum: uint32(tf.seq), AckNum: uint32(tf.ack), DataOffset: uint8(header.TCPMinimumSize + optLen), Flags: tf.flags, WindowSize: uint16(tf.rcvWnd), }) copy(tcp[header.TCPMinimumSize:], tf.opts) xsum := r.PseudoHeaderChecksum(ProtocolNumber, uint16(pkt.Size())) // Only calculate the checksum if offloading isn't supported. if gso.Type != stack.GSONone && gso.NeedsCsum { // This is called CHECKSUM_PARTIAL in the Linux kernel. We // calculate a checksum of the pseudo-header and save it in the // TCP header, then the kernel calculates a checksum of the // header and data and gets the right sum for the TCP packet. tcp.SetChecksum(xsum) } else if r.RequiresTXTransportChecksum() { xsum = header.ChecksumCombine(xsum, pkt.Data().AsRange().Checksum()) tcp.SetChecksum(^tcp.CalculateChecksum(xsum)) } } func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error { // We need to shallow clone the VectorisedView here as ReadFromVV will // split the VectorisedView and trim underlying views as it splits. Not // doing the clone here will cause the underlying views of data itself // to be altered.
data = data.Clone(nil) optLen := len(tf.opts) if tf.rcvWnd > math.MaxUint16 { tf.rcvWnd = math.MaxUint16 } mss := int(gso.MSS) n := (data.Size() + mss - 1) / mss size := data.Size() hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen var pkts stack.PacketBufferList for i := 0; i < n; i++ { packetSize := mss if packetSize > size { packetSize = size } size -= packetSize pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: hdrSize, }) pkt.Hash = tf.txHash pkt.Owner = owner pkt.Data().ReadFromVV(&data, packetSize) buildTCPHdr(r, tf, pkt, gso) tf.seq = tf.seq.Add(seqnum.Size(packetSize)) pkt.GSOOptions = gso pkts.PushBack(pkt) } if tf.ttl == 0 { tf.ttl = r.DefaultTTL() } sent, err := r.WritePackets(pkts, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}) if err != nil { r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(n - sent)) } r.Stats().TCP.SegmentsSent.IncrementBy(uint64(sent)) return err } // sendTCP sends a TCP segment with the provided options via the provided // network endpoint and under the provided identity. func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso stack.GSO, owner tcpip.PacketOwner) tcpip.Error { optLen := len(tf.opts) if tf.rcvWnd > math.MaxUint16 { tf.rcvWnd = math.MaxUint16 } if r.Loop()&stack.PacketLoop == 0 && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() { return sendTCPBatch(r, tf, data, gso, owner) } pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen, Data: data, }) pkt.GSOOptions = gso pkt.Hash = tf.txHash pkt.Owner = owner buildTCPHdr(r, tf, pkt, gso) if tf.ttl == 0 { tf.ttl = r.DefaultTTL() } if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}, pkt); err != nil { r.Stats().TCP.SegmentSendErrors.Increment() return err } r.Stats().TCP.SegmentsSent.Increment() if (tf.flags & header.TCPFlagRst) != 0 { r.Stats().TCP.ResetsSent.Increment() } return nil } // makeOptions makes an options slice. func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte { options := getOptions() offset := 0 // N.B. the ordering here matches the ordering used by Linux internally // and described in the raw makeOptions function. We don't include // unnecessary cases here (post connection.) if e.SendTSOk { // Embed the timestamp if timestamp has been enabled. // // We only use the lower 32 bits of the unix time in // milliseconds. This is similar to what Linux does where it // uses the lower 32 bits of the jiffies value in the tsVal // field of the timestamp option. // // Further, RFC7323 section-5.4 recommends millisecond // resolution as the lowest recommended resolution for the // timestamp clock. // // Ref: https://tools.ietf.org/html/rfc7323#section-5.4. offset += header.EncodeNOP(options[offset:]) offset += header.EncodeNOP(options[offset:]) offset += header.EncodeTSOption(e.timestamp(), e.recentTimestamp(), options[offset:]) } if e.SACKPermitted && len(sackBlocks) > 0 { offset += header.EncodeNOP(options[offset:]) offset += header.EncodeNOP(options[offset:]) offset += header.EncodeSACKBlocks(sackBlocks, options[offset:]) } // We expect the above to produce an aligned offset. if delta := header.AddTCPOptionPadding(options, offset); delta != 0 { panic("unexpected option encoding") } return options[:offset] } // sendRaw sends a TCP segment to the endpoint's peer. 
func (e *endpoint) sendRaw(data buffer.VectorisedView, flags header.TCPFlags, seq, ack seqnum.Value, rcvWnd seqnum.Size) tcpip.Error { var sackBlocks []header.SACKBlock if e.EndpointState() == StateEstablished && e.rcv.pendingRcvdSegments.Len() > 0 && (flags&header.TCPFlagAck != 0) { sackBlocks = e.sack.Blocks[:e.sack.NumBlocks] } options := e.makeOptions(sackBlocks) err := e.sendTCP(e.route, tcpFields{ id: e.TransportEndpointInfo.ID, ttl: e.ttl, tos: e.sendTOS, flags: flags, seq: seq, ack: ack, rcvWnd: rcvWnd, opts: options, }, data, e.gso) putOptions(options) return err } // Precondition: e.mu must be locked. func (e *endpoint) sendData(next *segment) { // Initialize the next segment to write if it's currently nil. if e.snd.writeNext == nil { if next == nil { return } e.snd.writeNext = next } // Push out any new packets. e.snd.sendData() } // resetConnectionLocked puts the endpoint in an error state with the given // error code and sends a RST unless the error is ErrConnectionReset // (indicating that the connection is being reset due to receiving a RST) or // ErrTimeout. This method must only be called from the protocol goroutine. func (e *endpoint) resetConnectionLocked(err tcpip.Error) { // Only send a reset if the connection is being aborted for a reason // other than receiving a reset. e.setEndpointState(StateError) e.hardError = err switch err.(type) { case *tcpip.ErrConnectionReset, *tcpip.ErrTimeout: default: // The exact sequence number to be used for the RST is the same as the // one used by Linux. We need to handle the case of window being shrunk // which can cause sndNxt to be outside the acceptable window on the // receiver. // // See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more // information. sndWndEnd := e.snd.SndUna.Add(e.snd.SndWnd) resetSeqNum := sndWndEnd if !sndWndEnd.LessThan(e.snd.SndNxt) || e.snd.SndNxt.Size(sndWndEnd) < (1<<e.snd.SndWndScale) { resetSeqNum = e.snd.SndNxt } e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, resetSeqNum, e.rcv.RcvNxt, 0) } } // completeWorkerLocked is called by the worker goroutine when it's about to // exit. func (e *endpoint) completeWorkerLocked() { // The worker is terminating (either due to moving to the CLOSED or ERROR // state); ensure we release all port registrations and reservations even // if the socket itself is not yet closed by the application. e.workerRunning = false if e.workerCleanup { e.cleanupLocked() } } // transitionToStateEstablishedLocked transitions a given endpoint // to an established state using the handshake parameters provided. // It also initializes the sender and receiver. func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) { // Transfer handshake state to TCP connection. We disable // receive window scaling if the peer doesn't support it // (indicated by a negative send window scale). e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale) e.rcvQueueInfo.rcvQueueMu.Lock() e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale()) // Bootstrap the auto tuning algorithm. Starting at zero will // result in a really large receive window after the first auto // tuning adjustment. e.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = int(h.rcvWnd) e.rcvQueueInfo.rcvQueueMu.Unlock() e.setEndpointState(StateEstablished) } // transitionToStateCloseLocked ensures that the endpoint is // cleaned up from the transport demuxer, "before" moving to // StateClose.
This will ensure that no packet will be // delivered to this endpoint from the demuxer when the endpoint // is transitioned to StateClose. func (e *endpoint) transitionToStateCloseLocked() { s := e.EndpointState() if s == StateClose { return } if s.connected() { e.stack.Stats().TCP.CurrentConnected.Decrement() e.stack.Stats().TCP.EstablishedClosed.Increment() } // Mark the endpoint as fully closed for reads/writes. e.cleanupLocked() e.setEndpointState(StateClose) } // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed // segment to an endpoint other than the current one. This is called // only when the endpoint is in StateClose and we want to deliver the segment // to any other listening endpoint. We reply with RST if we cannot find one. func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) { ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.TransportEndpointInfo.ID, s.nicID) if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.TransportEndpointInfo.ID.LocalAddress.To4() != "" { // Dual-stack socket, try IPv4. ep = e.stack.FindTransportEndpoint( header.IPv4ProtocolNumber, e.TransProto, e.TransportEndpointInfo.ID, s.nicID, ) } if ep == nil { replyWithReset(e.stack, s, stack.DefaultTOS, 0 /* ttl */) s.decRef() return } if e == ep { panic("current endpoint not removed from demuxer, enqueuing segments to itself") } if ep := ep.(*endpoint); ep.enqueueSegment(s) { ep.newSegmentWaker.Assert() } } // Drain segment queue from the endpoint and try to re-match the segment to a // different endpoint. This is used when the current endpoint is transitioned to // StateClose and has been unregistered from the transport demuxer. func (e *endpoint) drainClosingSegmentQueue() { for { s := e.segmentQueue.dequeue() if s == nil { break } e.tryDeliverSegmentFromClosedEndpoint(s) } } func (e *endpoint) handleReset(s *segment) (ok bool, err tcpip.Error) { if e.rcv.acceptable(s.sequenceNumber, 0) { // RFC 793, page 37 states that "in all states // except SYN-SENT, all reset (RST) segments are // validated by checking their SEQ-fields." So // we only process it if it's acceptable. switch e.EndpointState() { // In case of a RST in CLOSE-WAIT linux moves // the socket to closed state with an error set // to indicate EPIPE. // // Technically this seems to be at odds w/ RFC. // As per https://tools.ietf.org/html/rfc793#section-2.7 // page 69 the behavior for a segment arriving // w/ RST bit set in CLOSE-WAIT is inlined below. // // ESTABLISHED // FIN-WAIT-1 // FIN-WAIT-2 // CLOSE-WAIT // If the RST bit is set then, any outstanding RECEIVEs and // SEND should receive "reset" responses. All segment queues // should be flushed. Users should also receive an unsolicited // general "connection reset" signal. Enter the CLOSED state, // delete the TCB, and return. case StateCloseWait: e.transitionToStateCloseLocked() e.hardError = &tcpip.ErrAborted{} e.notifyProtocolGoroutine(notifyTickleWorker) return false, nil default: // RFC 793, page 37 states that "in all states // except SYN-SENT, all reset (RST) segments are // validated by checking their SEQ-fields." So // we only process it if it's acceptable. // Notify protocol goroutine. This is required when // handleSegment is invoked from the processor goroutine // rather than the worker goroutine. e.notifyProtocolGoroutine(notifyResetByPeer) return false, &tcpip.ErrConnectionReset{} } } return true, nil } // handleSegments processes all inbound segments. // // Precondition: e.mu must be held.
func (e *endpoint) handleSegmentsLocked(fastPath bool) tcpip.Error { checkRequeue := true for i := 0; i < maxSegmentsPerWake; i++ { if state := e.EndpointState(); state.closed() || state == StateTimeWait { return nil } s := e.segmentQueue.dequeue() if s == nil { checkRequeue = false break } cont, err := e.handleSegmentLocked(s) s.decRef() if err != nil { return err } if !cont { return nil } } // When fastPath is true we don't want to wake up the worker // goroutine. If the endpoint has more segments to process the // dispatcher will call handleSegments again anyway. if !fastPath && checkRequeue && !e.segmentQueue.empty() { e.newSegmentWaker.Assert() } // Send an ACK for all processed packets if needed. if e.rcv.RcvNxt != e.snd.MaxSentAck { e.snd.sendAck() } e.resetKeepaliveTimer(true /* receivedData */) return nil } // Precondition: e.mu must be held. func (e *endpoint) probeSegmentLocked() { if fn := e.probe; fn != nil { fn(e.completeStateLocked()) } } // handleSegment handles a given segment and notifies the worker goroutine // if the connection should be terminated. // // Precondition: e.mu must be held. func (e *endpoint) handleSegmentLocked(s *segment) (cont bool, err tcpip.Error) { // Invoke the tcp probe if installed. The tcp probe function will update // the TCPEndpointState after the segment is processed. defer e.probeSegmentLocked() if s.flags.Contains(header.TCPFlagRst) { if ok, err := e.handleReset(s); !ok { return false, err } } else if s.flags.Contains(header.TCPFlagSyn) { // See: https://tools.ietf.org/html/rfc5961#section-4.1 // 1) If the SYN bit is set, irrespective of the sequence number, TCP // MUST send an ACK (also referred to as challenge ACK) to the remote // peer: // // <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK> // // After sending the acknowledgment, TCP MUST drop the unacceptable // segment and stop processing further. // // By sending an ACK, the remote peer is challenged to confirm the loss // of the previous connection and the request to start a new connection. // A legitimate peer, after restart, would not have a TCB in the // synchronized state. Thus, when the ACK arrives, the peer should send // a RST segment back with the sequence number derived from the ACK // field that caused the RST. // This RST will confirm that the remote peer has indeed closed the // previous connection. Upon receipt of a valid RST, the local TCP // endpoint MUST terminate its connection. The local TCP endpoint // should then rely on SYN retransmission from the remote end to // re-establish the connection. e.snd.maybeSendOutOfWindowAck(s) } else if s.flags.Contains(header.TCPFlagAck) { // Patch the window size in the segment according to the // send window scale. s.window <<= e.snd.SndWndScale // RFC 793, page 41 states that "once in the ESTABLISHED // state all segments must carry current acknowledgment // information." drop, err := e.rcv.handleRcvdSegment(s) if err != nil { return false, err } if drop { return true, nil } // Now check if the received segment has caused us to transition // to a CLOSED state; if yes, then terminate processing and do // not invoke the sender. state := e.EndpointState() if state == StateClose { // When we get into StateClose while processing from the queue, // return immediately and let the protocolMainloop handle it. // // We can reach StateClose only while processing a previous segment // or a notification from the protocolMainLoop (caller goroutine). // This means that with this return, the segment dequeue below can // never occur on a closed endpoint.
return false, nil } e.snd.handleRcvdSegment(s) } return true, nil } // keepaliveTimerExpired is called when the keepaliveTimer fires. We send TCP // keepalive packets periodically when the connection is idle. If we don't hear // from the other side after a number of tries, we terminate the connection. func (e *endpoint) keepaliveTimerExpired() tcpip.Error { userTimeout := e.userTimeout e.keepalive.Lock() if !e.SocketOptions().GetKeepAlive() || !e.keepalive.timer.checkExpiration() { e.keepalive.Unlock() return nil } // If a userTimeout is set then abort the connection if it is // exceeded. if userTimeout != 0 && e.stack.Clock().NowMonotonic().Sub(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0 { e.keepalive.Unlock() e.stack.Stats().TCP.EstablishedTimedout.Increment() return &tcpip.ErrTimeout{} } if e.keepalive.unacked >= e.keepalive.count { e.keepalive.Unlock() e.stack.Stats().TCP.EstablishedTimedout.Increment() return &tcpip.ErrTimeout{} } // RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with // seg.seq = snd.nxt-1. e.keepalive.unacked++ e.keepalive.Unlock() e.snd.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, e.snd.SndNxt-1) e.resetKeepaliveTimer(false) return nil } // resetKeepaliveTimer restarts or stops the keepalive timer, depending on // whether it is enabled for this endpoint. func (e *endpoint) resetKeepaliveTimer(receivedData bool) { e.keepalive.Lock() if receivedData { e.keepalive.unacked = 0 } // Start the keepalive timer IFF it's enabled and there is no pending // data to send. if !e.SocketOptions().GetKeepAlive() || e.snd == nil || e.snd.SndUna != e.snd.SndNxt { e.keepalive.timer.disable() e.keepalive.Unlock() return } if e.keepalive.unacked > 0 { e.keepalive.timer.enable(e.keepalive.interval) } else { e.keepalive.timer.enable(e.keepalive.idle) } e.keepalive.Unlock() } // disableKeepaliveTimer stops the keepalive timer. func (e *endpoint) disableKeepaliveTimer() { e.keepalive.Lock() e.keepalive.timer.disable() e.keepalive.Unlock() } // protocolMainLoopDone is called at the end of protocolMainLoop. // +checklocksrelease:e.mu func (e *endpoint) protocolMainLoopDone(closeTimer tcpip.Timer, closeWaker *sleep.Waker) { if e.snd != nil { e.snd.resendTimer.cleanup() e.snd.probeTimer.cleanup() e.snd.reorderTimer.cleanup() } if closeTimer != nil { closeTimer.Stop() } e.completeWorkerLocked() if e.drainDone != nil { close(e.drainDone) } e.mu.Unlock() e.drainClosingSegmentQueue() // When the protocol loop exits we should wake up our waiters. e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } // protocolMainLoop is the main loop of the TCP protocol. It runs in its own // goroutine and is responsible for sending segments and handling received // segments. func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) tcpip.Error { var ( closeTimer tcpip.Timer closeWaker sleep.Waker ) e.mu.Lock() if handshake { if err := e.h.complete(); err != nil { // +checklocksforce e.lastErrorMu.Lock() e.lastError = err e.lastErrorMu.Unlock() e.setEndpointState(StateError) e.hardError = err e.workerCleanup = true e.protocolMainLoopDone(closeTimer, &closeWaker) return err } } // Reaching this point means that we successfully completed the 3-way // handshake with our peer. The current endpoint state could be any state // post ESTABLISHED, including CLOSED or ERROR if the endpoint processes a // RST from the peer via the dispatcher fast path, before the loop is // started. 
if s := e.EndpointState(); !s.connected() { switch s { case StateClose, StateError: // If the endpoint is in CLOSED/ERROR state, sender state has to be // initialized if the endpoint was previously established. if e.snd != nil { break } fallthrough default: panic("endpoint was not established, current state " + s.String()) } } // Completing the 3-way handshake is an indication that the route is valid // and the remote is reachable as the only way we can complete a handshake // is if our SYN reached the remote and their ACK reached us. e.route.ConfirmReachable() drained := e.drainDone != nil if drained { close(e.drainDone) e.mu.Unlock() <-e.undrain e.mu.Lock() } // Set up the functions that will be called when the main protocol loop // wakes up. funcs := []struct { w *sleep.Waker f func() tcpip.Error }{ { w: &e.sndQueueInfo.sndWaker, f: func() tcpip.Error { e.sendData(nil /* next */) return nil }, }, { w: &closeWaker, f: func() tcpip.Error { // This means the socket is being closed because // the TCP FIN-WAIT2 timeout was hit. Just // mark the socket as closed. e.transitionToStateCloseLocked() e.workerCleanup = true return nil }, }, { w: &e.snd.resendWaker, f: func() tcpip.Error { if !e.snd.retransmitTimerExpired() { e.stack.Stats().TCP.EstablishedTimedout.Increment() return &tcpip.ErrTimeout{} } return nil }, }, { w: &e.snd.probeWaker, f: e.snd.probeTimerExpired, }, { w: &e.newSegmentWaker, f: func() tcpip.Error { return e.handleSegmentsLocked(false /* fastPath */) }, }, { w: &e.keepalive.waker, f: e.keepaliveTimerExpired, }, { w: &e.notificationWaker, f: func() tcpip.Error { n := e.fetchNotifications() if n&notifyNonZeroReceiveWindow != 0 { e.rcv.nonZeroWindow() } if n&notifyMTUChanged != 0 { e.sndQueueInfo.sndQueueMu.Lock() count := e.sndQueueInfo.PacketTooBigCount e.sndQueueInfo.PacketTooBigCount = 0 mtu := e.sndQueueInfo.SndMTU e.sndQueueInfo.sndQueueMu.Unlock() e.snd.updateMaxPayloadSize(mtu, count) } if n&notifyReset != 0 || n&notifyAbort != 0 { return &tcpip.ErrConnectionAborted{} } if n&notifyResetByPeer != 0 { return &tcpip.ErrConnectionReset{} } if n&notifyClose != 0 && e.closed { switch e.EndpointState() { case StateEstablished: // Perform full shutdown if the endpoint is still // established. This can occur when notifyClose // was asserted just before becoming established. e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) case StateFinWait2: // The socket has been closed and we are in FIN_WAIT2, // so start the FIN_WAIT2 timer. if closeTimer == nil { closeTimer = e.stack.Clock().AfterFunc(e.tcpLingerTimeout, closeWaker.Assert) } } } if n&notifyKeepaliveChanged != 0 { // The timer could fire in the background // when the endpoint is drained. That's // OK. See above. e.resetKeepaliveTimer(true) } if n&notifyDrain != 0 { for !e.segmentQueue.empty() { if err := e.handleSegmentsLocked(false /* fastPath */); err != nil { return err } } if !e.EndpointState().closed() { // Only block the worker if the endpoint // is not in a closed or error state. close(e.drainDone) e.mu.Unlock() // +checklocksforce <-e.undrain e.mu.Lock() } } if n&notifyTickleWorker != 0 { // Just a tickle notification. No need to do // anything. return nil } return nil }, }, { w: &e.snd.reorderWaker, f: e.snd.rc.reorderTimerExpired, }, } // Initialize the sleeper based on the wakers in funcs. var s sleep.Sleeper for i := range funcs { s.AddWaker(funcs[i].w, i) } // Notify the caller that the waker initialization is complete and the // endpoint is ready.
if wakerInitDone != nil { close(wakerInitDone) } // Tell waiters that the endpoint is connected and writable. e.waiterQueue.Notify(waiter.WritableEvents) // The following assertions and notifications are needed for restored // endpoints. Freshly created endpoints have empty states and should // not invoke any. if !e.segmentQueue.empty() { e.newSegmentWaker.Assert() } e.rcvQueueInfo.rcvQueueMu.Lock() if !e.rcvQueueInfo.rcvQueue.Empty() { e.waiterQueue.Notify(waiter.ReadableEvents) } e.rcvQueueInfo.rcvQueueMu.Unlock() if e.workerCleanup { e.notifyProtocolGoroutine(notifyClose) } // Main loop. Handle segments until both send and receive ends of the // connection have completed. cleanupOnError := func(err tcpip.Error) { e.stack.Stats().TCP.CurrentConnected.Decrement() e.workerCleanup = true if err != nil { e.resetConnectionLocked(err) } } loop: for { switch e.EndpointState() { case StateTimeWait, StateClose, StateError: break loop } e.mu.Unlock() v, _ := s.Fetch(true /* block */) e.mu.Lock() // We need to double check here because the notification may be // stale by the time we got around to processing it. switch e.EndpointState() { case StateError: // If the endpoint has already transitioned to an ERROR // state, just pass nil here, as any reset that may need // to be sent should already have been done and we // just want to terminate the loop and clean up the // endpoint. cleanupOnError(nil) e.protocolMainLoopDone(closeTimer, &closeWaker) return nil case StateTimeWait: fallthrough case StateClose: break loop default: if err := funcs[v].f(); err != nil { cleanupOnError(err) e.protocolMainLoopDone(closeTimer, &closeWaker) return nil } } } var reuseTW func() if e.EndpointState() == StateTimeWait { // Disable the close timer as we are now entering real TIME_WAIT. if closeTimer != nil { closeTimer.Stop() } // Mark the current sleeper done so as to free all associated // wakers. s.Done() // Wake up any waiters before we enter TIME_WAIT. e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) e.workerCleanup = true reuseTW = e.doTimeWait() } // Handle any StateError transition from StateTimeWait. if e.EndpointState() == StateError { cleanupOnError(nil) e.protocolMainLoopDone(closeTimer, &closeWaker) return nil } e.transitionToStateCloseLocked() e.protocolMainLoopDone(closeTimer, &closeWaker) // A new SYN was received during TIME_WAIT and we need to abort // the timewait and redirect the segment to the listener queue. if reuseTW != nil { reuseTW() } return nil } // handleTimeWaitSegments processes segments received during TIME_WAIT // state. func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) { checkRequeue := true for i := 0; i < maxSegmentsPerWake; i++ { s := e.segmentQueue.dequeue() if s == nil { checkRequeue = false break } extTW, newSyn := e.rcv.handleTimeWaitSegment(s) if newSyn { info := e.TransportEndpointInfo newID := info.ID newID.RemoteAddress = "" newID.RemotePort = 0 netProtos := []tcpip.NetworkProtocolNumber{info.NetProto} // If the local address is an IPv4 address then also // look for IPv6 dual stack endpoints that might be // listening on the local address.
if newID.LocalAddress.To4() != "" { netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber} } for _, netProto := range netProtos { if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, s.nicID); listenEP != nil { tcpEP := listenEP.(*endpoint) if EndpointState(tcpEP.State()) == StateListen { reuseTW = func() { if !tcpEP.enqueueSegment(s) { s.decRef() return } tcpEP.newSegmentWaker.Assert() } // We explicitly do not decRef // the segment as it's still // valid and being reflected to // a listening endpoint. return false, reuseTW } } } } if extTW { extendTimeWait = true } s.decRef() } if checkRequeue && !e.segmentQueue.empty() { e.newSegmentWaker.Assert() } return extendTimeWait, nil } // doTimeWait is responsible for handling the TCP behaviour once a socket // enters the TIME_WAIT state. Optionally it can return a closure that // should be executed after releasing the endpoint registrations. This is // done in cases where a new SYN is received during TIME_WAIT that carries // a sequence number larger than one seen on the connection. // +checklocks:e.mu func (e *endpoint) doTimeWait() (twReuse func()) { // Trigger a 2 * MSL time wait state. During this period // we will drop all incoming segments. // NOTE: On Linux this is not configurable and is fixed at 60 seconds. timeWaitDuration := DefaultTCPTimeWaitTimeout // Get the stack wide configuration. var tcpTW tcpip.TCPTimeWaitTimeoutOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil { timeWaitDuration = time.Duration(tcpTW) } const newSegment = 1 const notification = 2 const timeWaitDone = 3 var s sleep.Sleeper defer s.Done() s.AddWaker(&e.newSegmentWaker, newSegment) s.AddWaker(&e.notificationWaker, notification) var timeWaitWaker sleep.Waker s.AddWaker(&timeWaitWaker, timeWaitDone) timeWaitTimer := e.stack.Clock().AfterFunc(timeWaitDuration, timeWaitWaker.Assert) defer timeWaitTimer.Stop() for { e.mu.Unlock() v, _ := s.Fetch(true /* block */) e.mu.Lock() switch v { case newSegment: extendTimeWait, reuseTW := e.handleTimeWaitSegments() if reuseTW != nil { return reuseTW } if extendTimeWait { timeWaitTimer.Reset(timeWaitDuration) } case notification: n := e.fetchNotifications() if n&notifyAbort != 0 { return nil } if n&notifyDrain != 0 { for !e.segmentQueue.empty() { // Ignore extending TIME_WAIT during a // save. For sockets in TIME_WAIT we just // terminate the TIME_WAIT early. e.handleTimeWaitSegments() } close(e.drainDone) e.mu.Unlock() <-e.undrain e.mu.Lock() return nil } case timeWaitDone: return nil } } }
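// The following is an illustrative, standalone sketch and not part of the
// source above: it models the backoffTimer policy used by handshake.complete,
// where the retransmission timeout doubles on every reset and the handshake
// is abandoned once the timeout would exceed the maximum (MaxRTO in the real
// code; the 60s cap below is an assumption for this sketch). All names here
// are hypothetical.
package main

import (
	"errors"
	"fmt"
	"time"
)

// backoff models backoffTimer.reset: each call doubles the timeout and fails
// once the doubled value exceeds the maximum.
type backoff struct {
	timeout, maxTimeout time.Duration
}

func (b *backoff) reset() error {
	b.timeout *= 2
	if b.timeout > b.maxTimeout {
		return errors.New("handshake timed out")
	}
	return nil
}

func main() {
	b := backoff{timeout: time.Second, maxTimeout: 60 * time.Second}
	for i := 1; ; i++ {
		if err := b.reset(); err != nil {
			fmt.Printf("gave up after %d retransmits: %v\n", i-1, err)
			return
		}
		// Timeouts double: 2s, 4s, 8s, 16s, 32s, then the cap is exceeded.
		fmt.Printf("retransmit %d scheduled after %v\n", i, b.timeout)
	}
}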
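// Standalone arithmetic check (not part of the source above): the comment in
// generateSecureISN claims that an ISN clock ticking once per 64ns wraps the
// 32-bit sequence space in roughly 274 seconds. This sketch just reproduces
// that arithmetic.
package main

import "fmt"

func main() {
	const tickNanos = 64 // ISN clock granularity: one tick every 64ns.
	ticksPerSecond := int64(1_000_000_000) / tickNanos // 10^9/64 = 15,625,000.
	wrapSeconds := (int64(1) << 32) / ticksPerSecond   // 2^32/15,625,000 ≈ 274.
	fmt.Println(ticksPerSecond, wrapSeconds)           // Prints: 15625000 274
}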
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package gofer provides a filesystem implementation that is backed by a 9p
// server, interchangeably referred to as "gofers" throughout this package.
// // Lock order: // regularFileFD/directoryFD.mu // filesystem.renameMu // dentry.cachingMu // filesystem.cacheMu // dentry.dirMu // filesystem.syncMu // dentry.metadataMu // *** "memmap.Mappable locks" below this point // dentry.mapsMu // *** "memmap.Mappable locks taken by Translate" below this point // dentry.handleMu // dentry.dataMu // filesystem.inoMu // specialFileFD.mu // specialFileFD.bufMu // // Locking dentry.dirMu and dentry.metadataMu in multiple dentries requires that // either ancestor dentries are locked before descendant dentries, or that // filesystem.renameMu is locked for writing. package gofer import ( "fmt" "strconv" "strings" "sync/atomic" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/p9" refs_vfs1 "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/unet" ) // Name is the default filesystem name. const Name = "9p" // Mount option names for goferfs. const ( moptTransport = "trans" moptReadFD = "rfdno" moptWriteFD = "wfdno" moptAname = "aname" moptDfltUID = "dfltuid" moptDfltGID = "dfltgid" moptMsize = "msize" moptVersion = "version" moptDentryCacheLimit = "dentry_cache_limit" moptCache = "cache" moptForcePageCache = "force_page_cache" moptLimitHostFDTranslation = "limit_host_fd_translation" moptOverlayfsStaleRead = "overlayfs_stale_read" ) // Valid values for the "cache" mount option. const ( cacheNone = "none" cacheFSCache = "fscache" cacheFSCacheWritethrough = "fscache_writethrough" cacheRemoteRevalidating = "remote_revalidating" ) // Valid values for "trans" mount option. const transportModeFD = "fd" // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. // // +stateify savable type filesystem struct { vfsfs vfs.Filesystem // mfp is used to allocate memory that caches regular file contents. mfp is // immutable. mfp pgalloc.MemoryFileProvider // Immutable options. opts filesystemOptions iopts InternalFilesystemOptions // client is the client used by this filesystem. client is immutable. client *p9.Client `state:"nosave"` // clock is a realtime clock used to set timestamps in file operations. clock ktime.Clock // devMinor is the filesystem's minor device number. devMinor is immutable. devMinor uint32 // root is the root dentry. root is immutable. root *dentry // renameMu serves two purposes: // // - It synchronizes path resolution with renaming initiated by this // client. // // - It is held by path resolution to ensure that reachable dentries remain // valid. A dentry is reachable by path resolution if it has a non-zero // reference count (such that it is usable as vfs.ResolvingPath.Start() or // is reachable from its children), or if it is a child dentry (such that // it is reachable from its parent). renameMu sync.RWMutex `state:"nosave"` // cachedDentries contains all dentries with 0 references. 
(Due to race // conditions, it may also contain dentries with non-zero references.) // cachedDentriesLen is the number of dentries in cachedDentries. These fields // are protected by cacheMu. cacheMu sync.Mutex `state:"nosave"` cachedDentries dentryList cachedDentriesLen uint64 // syncableDentries contains all non-synthetic dentries. specialFileFDs // contains all open specialFileFDs. These fields are protected by syncMu. syncMu sync.Mutex `state:"nosave"` syncableDentries map[*dentry]struct{} specialFileFDs map[*specialFileFD]struct{} // inoByQIDPath maps previously-observed QID.Paths to inode numbers // assigned to those paths. inoByQIDPath is not preserved across // checkpoint/restore because QIDs may be reused between different gofer // processes, so QIDs may be repeated for different files across // checkpoint/restore. inoByQIDPath is protected by inoMu. inoMu sync.Mutex `state:"nosave"` inoByQIDPath map[uint64]uint64 `state:"nosave"` // lastIno is the last inode number assigned to a file. lastIno is accessed // using atomic memory operations. lastIno uint64 // savedDentryRW records open read/write handles during save/restore. savedDentryRW map[*dentry]savedDentryRW // released is nonzero once filesystem.Release has been called. It is accessed // with atomic memory operations. released int32 } // +stateify savable type filesystemOptions struct { // "Standard" 9P options. fd int aname string interop InteropMode // derived from the "cache" mount option dfltuid auth.KUID dfltgid auth.KGID msize uint32 version string // maxCachedDentries is the maximum size of filesystem.cachedDentries. maxCachedDentries uint64 // If forcePageCache is true, host FDs may not be used for application // memory mappings even if available; instead, the client must perform its // own caching of regular file pages. This is primarily useful for testing. forcePageCache bool // If limitHostFDTranslation is true, apply maxFillRange() constraints to // host FD mappings returned by dentry.(memmap.Mappable).Translate(). This // makes memory accounting behavior more consistent between cases where // host FDs are / are not available, but may increase the frequency of // sentry-handled page faults on files for which a host FD is available. limitHostFDTranslation bool // If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote // filesystem may not be coherent with writable host FDs opened later, so // all uses of the former must be replaced by uses of the latter. This is // usually only the case when the remote filesystem is a Linux overlayfs // mount. (Prior to Linux 4.18, patch series centered on commit // d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were // incoherent between pre-copy-up and post-copy-up FDs; after that patch // series, only memory mappings are incoherent.) overlayfsStaleRead bool // If regularFilesUseSpecialFileFD is true, application FDs representing // regular files will use distinct file handles for each FD, in the same // way that application FDs representing "special files" such as sockets // do. Note that this disables client caching and mmap for regular files. regularFilesUseSpecialFileFD bool } // InteropMode controls the client's interaction with other remote filesystem // users. // // +stateify savable type InteropMode uint32 const ( // InteropModeExclusive is appropriate when the filesystem client is the // only user of the remote filesystem. // // - The client may cache arbitrary filesystem state (file data, metadata, // filesystem structure, etc.). 
// // - Client changes to filesystem state may be sent to the remote // filesystem asynchronously, except when server permission checks are // necessary. // // - File timestamps are based on client clocks. This ensures that users of // the client observe timestamps that are coherent with their own clocks // and consistent with Linux's semantics (in particular, it is not always // possible for clients to set arbitrary atimes and mtimes depending on the // remote filesystem implementation, and never possible for clients to set // arbitrary ctimes.) InteropModeExclusive InteropMode = iota // InteropModeWritethrough is appropriate when there are read-only users of // the remote filesystem that expect to observe changes made by the // filesystem client. // // - The client may cache arbitrary filesystem state. // // - Client changes to filesystem state must be sent to the remote // filesystem synchronously. // // - File timestamps are based on client clocks. As a corollary, access // timestamp changes from other remote filesystem users will not be visible // to the client. InteropModeWritethrough // InteropModeShared is appropriate when there are users of the remote // filesystem that may mutate its state other than the client. // // - The client must verify ("revalidate") cached filesystem state before // using it. // // - Client changes to filesystem state must be sent to the remote // filesystem synchronously. // // - File timestamps are based on server clocks. This is necessary to // ensure that timestamp changes are synchronized between remote filesystem // users. // // Note that the correctness of InteropModeShared depends on the server // correctly implementing 9P fids (i.e. each fid immutably represents a // single filesystem object), even in the presence of remote filesystem // mutations from other users. If this is violated, the behavior of the // client is undefined. InteropModeShared ) // InternalFilesystemOptions may be passed as // vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. // // +stateify savable type InternalFilesystemOptions struct { // If UniqueID is non-empty, it is an opaque string used to reassociate the // filesystem with a new server FD during restoration from checkpoint. UniqueID string // If LeakConnection is true, do not close the connection to the server // when the Filesystem is released. This is necessary for deployments in // which servers can handle only a single client and report failure if that // client disconnects. LeakConnection bool // If OpenSocketsByConnecting is true, silently translate attempts to open // files identifying as sockets to connect RPCs. OpenSocketsByConnecting bool } // _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default // UIDs and GIDs used for files that do not provide a specific owner or group // respectively. const ( // uint32(-2) doesn't work in Go. _V9FS_DEFUID = auth.KUID(4294967294) _V9FS_DEFGID = auth.KGID(4294967294) ) // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } // Release implements vfs.FilesystemType.Release. func (FilesystemType) Release(ctx context.Context) {} // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 
func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
	if mfp == nil {
		ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider")
		return nil, nil, linuxerr.EINVAL
	}

	mopts := vfs.GenericParseMountOptions(opts.Data)
	var fsopts filesystemOptions

	fd, err := getFDFromMountOptionsMap(ctx, mopts)
	if err != nil {
		return nil, nil, err
	}
	fsopts.fd = fd

	// Get the attach name.
	fsopts.aname = "/"
	if aname, ok := mopts[moptAname]; ok {
		delete(mopts, moptAname)
		fsopts.aname = aname
	}

	// Parse the cache policy. For historical reasons, this defaults to the
	// least generally-applicable option, InteropModeExclusive.
	fsopts.interop = InteropModeExclusive
	if cache, ok := mopts[moptCache]; ok {
		delete(mopts, moptCache)
		switch cache {
		case cacheFSCache:
			fsopts.interop = InteropModeExclusive
		case cacheFSCacheWritethrough:
			fsopts.interop = InteropModeWritethrough
		case cacheNone:
			fsopts.regularFilesUseSpecialFileFD = true
			fallthrough
		case cacheRemoteRevalidating:
			fsopts.interop = InteropModeShared
		default:
			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: %s=%s", moptCache, cache)
			return nil, nil, linuxerr.EINVAL
		}
	}

	// Parse the default UID and GID.
	fsopts.dfltuid = _V9FS_DEFUID
	if dfltuidstr, ok := mopts[moptDfltUID]; ok {
		delete(mopts, moptDfltUID)
		dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32)
		if err != nil {
			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltUID, dfltuidstr)
			return nil, nil, linuxerr.EINVAL
		}
		// In Linux, dfltuid is interpreted as a UID and is converted to a KUID
		// in the caller's user namespace, but goferfs isn't
		// application-mountable.
		fsopts.dfltuid = auth.KUID(dfltuid)
	}
	fsopts.dfltgid = _V9FS_DEFGID
	if dfltgidstr, ok := mopts[moptDfltGID]; ok {
		delete(mopts, moptDfltGID)
		dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32)
		if err != nil {
			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default GID: %s=%s", moptDfltGID, dfltgidstr)
			return nil, nil, linuxerr.EINVAL
		}
		fsopts.dfltgid = auth.KGID(dfltgid)
	}

	// Parse the 9P message size.
	fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M
	if msizestr, ok := mopts[moptMsize]; ok {
		delete(mopts, moptMsize)
		msize, err := strconv.ParseUint(msizestr, 10, 32)
		if err != nil {
			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: %s=%s", moptMsize, msizestr)
			return nil, nil, linuxerr.EINVAL
		}
		fsopts.msize = uint32(msize)
	}

	// Parse the 9P protocol version.
	fsopts.version = p9.HighestVersionString()
	if version, ok := mopts[moptVersion]; ok {
		delete(mopts, moptVersion)
		fsopts.version = version
	}

	// Parse the dentry cache limit.
	fsopts.maxCachedDentries = 1000
	if str, ok := mopts[moptDentryCacheLimit]; ok {
		delete(mopts, moptDentryCacheLimit)
		maxCachedDentries, err := strconv.ParseUint(str, 10, 64)
		if err != nil {
			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: %s=%s", moptDentryCacheLimit, str)
			return nil, nil, linuxerr.EINVAL
		}
		fsopts.maxCachedDentries = maxCachedDentries
	}

	// Handle simple flags.
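	// Illustrative example (assumed values, not from this file): a complete
	// goferfs mount data string exercising the options parsed above might be
	//
	//	trans=fd,rfdno=7,wfdno=7,aname=/,cache=remote_revalidating,msize=1048576
	//
	// The "simple flags" handled below are presence-only options: naming one
	// in the mount data (e.g. "force_page_cache") enables the behavior.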
if _, ok := mopts[moptForcePageCache]; ok { delete(mopts, moptForcePageCache) fsopts.forcePageCache = true } if _, ok := mopts[moptLimitHostFDTranslation]; ok { delete(mopts, moptLimitHostFDTranslation) fsopts.limitHostFDTranslation = true } if _, ok := mopts[moptOverlayfsStaleRead]; ok { delete(mopts, moptOverlayfsStaleRead) fsopts.overlayfsStaleRead = true } // fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying // "cache=none". // Check for unparsed options. if len(mopts) != 0 { ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts) return nil, nil, linuxerr.EINVAL } // Handle internal options. iopts, ok := opts.InternalData.(InternalFilesystemOptions) if opts.InternalData != nil && !ok { ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData) return nil, nil, linuxerr.EINVAL } // If !ok, iopts being the zero value is correct. // Construct the filesystem object. devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } fs := &filesystem{ mfp: mfp, opts: fsopts, iopts: iopts, clock: ktime.RealtimeClockFromContext(ctx), devMinor: devMinor, syncableDentries: make(map[*dentry]struct{}), specialFileFDs: make(map[*specialFileFD]struct{}), inoByQIDPath: make(map[uint64]uint64), } fs.vfsfs.Init(vfsObj, &fstype, fs) // Connect to the server. if err := fs.dial(ctx); err != nil { return nil, nil, err } // Perform attach to obtain the filesystem root. ctx.UninterruptibleSleepStart(false) attached, err := fs.client.Attach(fsopts.aname) ctx.UninterruptibleSleepFinish(false) if err != nil { fs.vfsfs.DecRef(ctx) return nil, nil, err } attachFile := p9file{attached} qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask()) if err != nil { attachFile.close(ctx) fs.vfsfs.DecRef(ctx) return nil, nil, err } // Construct the root dentry. root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr) if err != nil { attachFile.close(ctx) fs.vfsfs.DecRef(ctx) return nil, nil, err } // Set the root's reference count to 2. One reference is returned to the // caller, and the other is held by fs to prevent the root from being "cached" // and subsequently evicted. root.refs = 2 fs.root = root return &fs.vfsfs, &root.vfsd, nil } func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) { // Check that the transport is "fd". trans, ok := mopts[moptTransport] if !ok || trans != transportModeFD { ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as '%s=%s'", moptTransport, transportModeFD) return -1, linuxerr.EINVAL } delete(mopts, moptTransport) // Check that read and write FDs are provided and identical. 
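	// (goferfs communicates with the gofer over a single socket, so a
	// conforming mount supplies the same descriptor twice, e.g.
	// "rfdno=7,wfdno=7", where 7 is an illustrative value; mismatched values
	// are rejected below.)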
rfdstr, ok := mopts[moptReadFD] if !ok { ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as '%s=<file descriptor>'", moptReadFD) return -1, linuxerr.EINVAL } delete(mopts, moptReadFD) rfd, err := strconv.Atoi(rfdstr) if err != nil { ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: %s=%s", moptReadFD, rfdstr) return -1, linuxerr.EINVAL } wfdstr, ok := mopts[moptWriteFD] if !ok { ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as '%s=<file descriptor>'", moptWriteFD) return -1, linuxerr.EINVAL } delete(mopts, moptWriteFD) wfd, err := strconv.Atoi(wfdstr) if err != nil { ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: %s=%s", moptWriteFD, wfdstr) return -1, linuxerr.EINVAL } if rfd != wfd { ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd) return -1, linuxerr.EINVAL } return rfd, nil } // Preconditions: fs.client == nil. func (fs *filesystem) dial(ctx context.Context) error { // Establish a connection with the server. conn, err := unet.NewSocket(fs.opts.fd) if err != nil { return err } // Perform version negotiation with the server. ctx.UninterruptibleSleepStart(false) client, err := p9.NewClient(conn, fs.opts.msize, fs.opts.version) ctx.UninterruptibleSleepFinish(false) if err != nil { conn.Close() return err } // Ownership of conn has been transferred to client. fs.client = client return nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { atomic.StoreInt32(&fs.released, 1) mf := fs.mfp.MemoryFile() fs.syncMu.Lock() for d := range fs.syncableDentries { d.handleMu.Lock() d.dataMu.Lock() if h := d.writeHandleLocked(); h.isOpen() { // Write dirty cached data to the remote file. if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err) } // TODO(jamieliu): Do we need to flushf/fsync d? } // Discard cached pages. d.cache.DropAll(mf) d.dirty.RemoveAll() d.dataMu.Unlock() // Close host FDs if they exist. if d.readFD >= 0 { unix.Close(int(d.readFD)) } if d.writeFD >= 0 && d.readFD != d.writeFD { unix.Close(int(d.writeFD)) } d.readFD = -1 d.writeFD = -1 d.mmapFD = -1 d.handleMu.Unlock() } // There can't be any specialFileFDs still using fs, since each such // FileDescription would hold a reference on a Mount holding a reference on // fs. fs.syncMu.Unlock() // If leak checking is enabled, release all outstanding references in the // filesystem. We deliberately avoid doing this outside of leak checking; we // have released all external resources above rather than relying on dentry // destructors. if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking { fs.renameMu.Lock() fs.root.releaseSyntheticRecursiveLocked(ctx) fs.evictAllCachedDentriesLocked(ctx) fs.renameMu.Unlock() // An extra reference was held by the filesystem on the root to prevent it from // being cached/evicted. fs.root.DecRef(ctx) } if !fs.iopts.LeakConnection { // Close the connection to the server. This implicitly clunks all fids. fs.client.Close() } fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) } // releaseSyntheticRecursiveLocked traverses the tree with root d and decrements // the reference count on every synthetic dentry. Synthetic dentries have one // reference for existence that should be dropped during filesystem.Release. // // Precondition: d.fs.renameMu is locked for writing. 
func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) { if d.isSynthetic() { d.decRefNoCaching() d.checkCachingLocked(ctx, true /* renameMuWriteLocked */) } if d.isDir() { var children []*dentry d.dirMu.Lock() for _, child := range d.children { children = append(children, child) } d.dirMu.Unlock() for _, child := range children { if child != nil { child.releaseSyntheticRecursiveLocked(ctx) } } } } // dentry implements vfs.DentryImpl. // // +stateify savable type dentry struct { vfsd vfs.Dentry // refs is the reference count. Each dentry holds a reference on its // parent, even if disowned. An additional reference is held on all // synthetic dentries until they are unlinked or invalidated. When refs // reaches 0, the dentry may be added to the cache or destroyed. If refs == // -1, the dentry has already been destroyed. refs is accessed using atomic // memory operations. refs int64 // fs is the owning filesystem. fs is immutable. fs *filesystem // parent is this dentry's parent directory. Each dentry holds a reference // on its parent. If this dentry is a filesystem root, parent is nil. // parent is protected by filesystem.renameMu. parent *dentry // name is the name of this dentry in its parent. If this dentry is a // filesystem root, name is the empty string. name is protected by // filesystem.renameMu. name string // qidPath is the p9.QID.Path for this file. qidPath is immutable. qidPath uint64 // file is the unopened p9.File that backs this dentry. file is immutable. // // If file.isNil(), this dentry represents a synthetic file, i.e. a file // that does not exist on the remote filesystem. As of this writing, the // only files that can be synthetic are sockets, pipes, and directories. file p9file `state:"nosave"` // If deleted is non-zero, the file represented by this dentry has been // deleted. deleted is accessed using atomic memory operations. deleted uint32 // cachingMu is used to synchronize concurrent dentry caching attempts on // this dentry. cachingMu sync.Mutex `state:"nosave"` // If cached is true, dentryEntry links dentry into // filesystem.cachedDentries. cached and dentryEntry are protected by // cachingMu. cached bool dentryEntry dirMu sync.Mutex `state:"nosave"` // If this dentry represents a directory, children contains: // // - Mappings of child filenames to dentries representing those children. // // - Mappings of child filenames that are known not to exist to nil // dentries (only if InteropModeShared is not in effect and the directory // is not synthetic). // // children is protected by dirMu. children map[string]*dentry // If this dentry represents a directory, syntheticChildren is the number // of child dentries for which dentry.isSynthetic() == true. // syntheticChildren is protected by dirMu. syntheticChildren int // If this dentry represents a directory, // dentry.cachedMetadataAuthoritative() == true, and dirents is not nil, it // is a cache of all entries in the directory, in the order they were // returned by the server. dirents is protected by dirMu. dirents []vfs.Dirent // Cached metadata; protected by metadataMu. // To access: // - In situations where consistency is not required (like stat), these // can be accessed using atomic operations only (without locking). // - Lock metadataMu and can access without atomic operations. // To mutate: // - Lock metadataMu and use atomic operations to update because we might // have atomic readers that don't hold the lock. 
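	// For example, a lock-free stat-style reader and a locked writer of a
	// hypothetical uint32 metadata field f would look like:
	//
	//	v := atomic.LoadUint32(&d.f) // reader: atomic load, no lock
	//
	//	d.metadataMu.Lock()
	//	atomic.StoreUint32(&d.f, newV) // writer: lock + atomic store
	//	d.metadataMu.Unlock()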
metadataMu sync.Mutex `state:"nosave"` ino uint64 // immutable mode uint32 // type is immutable, perms are mutable uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic gid uint32 // auth.KGID, but ... blockSize uint32 // 0 if unknown // Timestamps, all nsecs from the Unix epoch. atime int64 mtime int64 ctime int64 btime int64 // File size, which differs from other metadata in two ways: // // - We make a best-effort attempt to keep it up to date even if // !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes. // // - size is protected by both metadataMu and dataMu (i.e. both must be // locked to mutate it; locking either is sufficient to access it). size uint64 // If this dentry does not represent a synthetic file, deleted is 0, and // atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the // remote file's timestamps, which should be updated when this dentry is // evicted. atimeDirty uint32 mtimeDirty uint32 // nlink counts the number of hard links to this dentry. It's updated and // accessed using atomic operations. It's not protected by metadataMu like the // other metadata fields. nlink uint32 mapsMu sync.Mutex `state:"nosave"` // If this dentry represents a regular file, mappings tracks mappings of // the file into memmap.MappingSpaces. mappings is protected by mapsMu. mappings memmap.MappingSet // - If this dentry represents a regular file or directory, readFile is the // p9.File used for reads by all regularFileFDs/directoryFDs representing // this dentry, and readFD (if not -1) is a host FD equivalent to readFile // used as a faster alternative. // // - If this dentry represents a regular file, writeFile is the p9.File // used for writes by all regularFileFDs representing this dentry, and // writeFD (if not -1) is a host FD equivalent to writeFile used as a // faster alternative. // // - If this dentry represents a regular file, mmapFD is the host FD used // for memory mappings. If mmapFD is -1, no such FD is available, and the // internal page cache implementation is used for memory mappings instead. // // These fields are protected by handleMu. readFD, writeFD, and mmapFD are // additionally written using atomic memory operations, allowing them to be // read (albeit racily) with atomic.LoadInt32() without locking handleMu. // // readFile and writeFile may or may not represent the same p9.File. Once // either p9.File transitions from closed (isNil() == true) to open // (isNil() == false), it may be mutated with handleMu locked, but cannot // be closed until the dentry is destroyed. // // readFD and writeFD may or may not be the same file descriptor. mmapFD is // always either -1 or equal to readFD; if !writeFile.isNil() (the file has // been opened for writing), it is additionally either -1 or equal to // writeFD. handleMu sync.RWMutex `state:"nosave"` readFile p9file `state:"nosave"` writeFile p9file `state:"nosave"` readFD int32 `state:"nosave"` writeFD int32 `state:"nosave"` mmapFD int32 `state:"nosave"` dataMu sync.RWMutex `state:"nosave"` // If this dentry represents a regular file that is client-cached, cache // maps offsets into the cached file to offsets into // filesystem.mfp.MemoryFile() that store the file's data. cache is // protected by dataMu. cache fsutil.FileRangeSet // If this dentry represents a regular file that is client-cached, dirty // tracks dirty segments in cache. dirty is protected by dataMu. dirty fsutil.DirtySet // pf implements platform.File for mappings of hostFD. 
	pf dentryPlatformFile

	// If this dentry represents a symbolic link, InteropModeShared is not in
	// effect, and haveTarget is true, target is the symlink target. haveTarget
	// and target are protected by dataMu.
	haveTarget bool
	target     string

	// If this dentry represents a synthetic socket file, endpoint is the
	// transport endpoint bound to this file.
	endpoint transport.BoundEndpoint

	// If this dentry represents a synthetic named pipe, pipe is the pipe
	// endpoint bound to this file.
	pipe *pipe.VFSPipe

	locks vfs.FileLocks

	// Inotify watches for this dentry.
	//
	// Note that inotify may behave unexpectedly in the presence of hard links,
	// because dentries corresponding to the same file have separate inotify
	// watches when they should share the same set. This is the case because it is
	// impossible for us to know for sure whether two dentries correspond to the
	// same underlying file (see the gofer filesystem section of vfs/inotify.md for
	// a more in-depth discussion on this matter).
	watches vfs.Watches
}

// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the
// gofer client.
func dentryAttrMask() p9.AttrMask {
	return p9.AttrMask{
		Mode:  true,
		UID:   true,
		GID:   true,
		ATime: true,
		MTime: true,
		CTime: true,
		Size:  true,
		BTime: true,
	}
}

// newDentry creates a new dentry representing the given file. The dentry
// initially has no references, but is not cached; it is the caller's
// responsibility to set the dentry's reference count and/or call
// dentry.checkCachingLocked() as appropriate.
//
// Preconditions: !file.isNil().
func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) {
	if !mask.Mode {
		ctx.Warningf("can't create gofer.dentry without file type")
		return nil, syserror.EIO
	}
	if attr.Mode.FileType() == p9.ModeRegular && !mask.Size {
		ctx.Warningf("can't create regular file gofer.dentry without file size")
		return nil, syserror.EIO
	}

	d := &dentry{
		fs:        fs,
		qidPath:   qid.Path,
		file:      file,
		ino:       fs.inoFromQIDPath(qid.Path),
		mode:      uint32(attr.Mode),
		uid:       uint32(fs.opts.dfltuid),
		gid:       uint32(fs.opts.dfltgid),
		blockSize: hostarch.PageSize,
		readFD:    -1,
		writeFD:   -1,
		mmapFD:    -1,
	}
	d.pf.dentry = d
	if mask.UID {
		d.uid = dentryUIDFromP9UID(attr.UID)
	}
	if mask.GID {
		d.gid = dentryGIDFromP9GID(attr.GID)
	}
	if mask.Size {
		d.size = attr.Size
	}
	if attr.BlockSize != 0 {
		d.blockSize = uint32(attr.BlockSize)
	}
	if mask.ATime {
		d.atime = dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)
	}
	if mask.MTime {
		d.mtime = dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)
	}
	if mask.CTime {
		d.ctime = dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds)
	}
	if mask.BTime {
		d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)
	}
	if mask.NLink {
		d.nlink = uint32(attr.NLink)
	}
	d.vfsd.Init(d)
	refsvfs2.Register(d)
	fs.syncMu.Lock()
	fs.syncableDentries[d] = struct{}{}
	fs.syncMu.Unlock()
	return d, nil
}

func (fs *filesystem) inoFromQIDPath(qidPath uint64) uint64 {
	fs.inoMu.Lock()
	defer fs.inoMu.Unlock()
	if ino, ok := fs.inoByQIDPath[qidPath]; ok {
		return ino
	}
	ino := fs.nextIno()
	fs.inoByQIDPath[qidPath] = ino
	return ino
}

func (fs *filesystem) nextIno() uint64 {
	return atomic.AddUint64(&fs.lastIno, 1)
}

func (d *dentry) isSynthetic() bool {
	return d.file.isNil()
}

func (d *dentry) cachedMetadataAuthoritative() bool {
	return d.fs.opts.interop != InteropModeShared || d.isSynthetic()
}

// updateFromP9Attrs is called to update d's metadata after an update from the
// remote filesystem.
// Precondition: d.metadataMu must be locked.
// +checklocks:d.metadataMu
func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
	if mask.Mode {
		if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want {
			panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got))
		}
		atomic.StoreUint32(&d.mode, uint32(attr.Mode))
	}
	if mask.UID {
		atomic.StoreUint32(&d.uid, dentryUIDFromP9UID(attr.UID))
	}
	if mask.GID {
		atomic.StoreUint32(&d.gid, dentryGIDFromP9GID(attr.GID))
	}
	// There is no P9_GETATTR_* bit for I/O block size.
	if attr.BlockSize != 0 {
		atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize))
	}
	// Don't override newer client-defined timestamps with old server-defined
	// ones.
	if mask.ATime && atomic.LoadUint32(&d.atimeDirty) == 0 {
		atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds))
	}
	if mask.MTime && atomic.LoadUint32(&d.mtimeDirty) == 0 {
		atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds))
	}
	if mask.CTime {
		atomic.StoreInt64(&d.ctime, dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds))
	}
	if mask.BTime {
		atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds))
	}
	if mask.NLink {
		atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
	}
	if mask.Size {
		d.updateSizeLocked(attr.Size)
	}
}

// Preconditions:
// * !d.isSynthetic().
// * d.metadataMu is locked.
// +checklocks:d.metadataMu
func (d *dentry) refreshSizeLocked(ctx context.Context) error {
	d.handleMu.RLock()
	if d.writeFD < 0 {
		d.handleMu.RUnlock()
		// Ask the gofer if we don't have a host FD.
		return d.updateFromGetattrLocked(ctx)
	}
	var stat unix.Statx_t
	err := unix.Statx(int(d.writeFD), "", unix.AT_EMPTY_PATH, unix.STATX_SIZE, &stat)
	d.handleMu.RUnlock() // must be released before updateSizeLocked()
	if err != nil {
		return err
	}
	d.updateSizeLocked(stat.Size)
	return nil
}

// Preconditions: !d.isSynthetic().
func (d *dentry) updateFromGetattr(ctx context.Context) error {
	// d.metadataMu must be locked *before* we getAttr so that we do not end up
	// updating stale attributes in d.updateFromP9AttrsLocked().
	d.metadataMu.Lock()
	defer d.metadataMu.Unlock()
	return d.updateFromGetattrLocked(ctx)
}

// Preconditions:
// * !d.isSynthetic().
// * d.metadataMu is locked.
// +checklocks:d.metadataMu
func (d *dentry) updateFromGetattrLocked(ctx context.Context) error {
	// Use d.readFile or d.writeFile, which represent 9P FIDs that have been
	// opened, in preference to d.file, which represents a 9P fid that has not.
	// This may be significantly more efficient in some implementations. Prefer
	// d.writeFile over d.readFile since some filesystem implementations may
	// update a writable handle's metadata after writes to that handle, without
	// making metadata updates immediately visible to read-only handles
	// representing the same file.
	d.handleMu.RLock()
	handleMuRLocked := true
	var file p9file
	switch {
	case !d.writeFile.isNil():
		file = d.writeFile
	case !d.readFile.isNil():
		file = d.readFile
	default:
		file = d.file
		d.handleMu.RUnlock()
		handleMuRLocked = false
	}

	_, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask())
	if handleMuRLocked {
		// handleMu must be released before updateFromP9AttrsLocked().
		d.handleMu.RUnlock() // +checklocksforce: complex case.
} if err != nil { return err } d.updateFromP9AttrsLocked(attrMask, &attr) return nil } func (d *dentry) fileType() uint32 { return atomic.LoadUint32(&d.mode) & linux.S_IFMT } func (d *dentry) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME stat.Blksize = atomic.LoadUint32(&d.blockSize) stat.Nlink = atomic.LoadUint32(&d.nlink) if stat.Nlink == 0 { // The remote filesystem doesn't support link count; just make // something up. This is consistent with Linux, where // fs/inode.c:inode_init_always() initializes link count to 1, and // fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if // it's not provided by the remote filesystem. stat.Nlink = 1 } stat.UID = atomic.LoadUint32(&d.uid) stat.GID = atomic.LoadUint32(&d.gid) stat.Mode = uint16(atomic.LoadUint32(&d.mode)) stat.Ino = uint64(d.ino) stat.Size = atomic.LoadUint64(&d.size) // This is consistent with regularFileFD.Seek(), which treats regular files // as having no holes. stat.Blocks = (stat.Size + 511) / 512 stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.atime)) stat.Btime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.btime)) stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.ctime)) stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.mtime)) stat.DevMajor = linux.UNNAMED_MAJOR stat.DevMinor = d.fs.devMinor } func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error { stat := &opts.Stat if stat.Mask == 0 { return nil } if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { return linuxerr.EPERM } mode := linux.FileMode(atomic.LoadUint32(&d.mode)) if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil { return err } if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() if stat.Mask&linux.STATX_SIZE != 0 { // Reject attempts to truncate files other than regular files, since // filesystem implementations may return the wrong errno. switch mode.FileType() { case linux.S_IFREG: // ok case linux.S_IFDIR: return syserror.EISDIR default: return linuxerr.EINVAL } } var now int64 if d.cachedMetadataAuthoritative() { // Truncate updates mtime. if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE { stat.Mask |= linux.STATX_MTIME stat.Mtime = linux.StatxTimestamp{ Nsec: linux.UTIME_NOW, } } // Use client clocks for timestamps. now = d.fs.clock.Now().Nanoseconds() if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW { stat.Atime = linux.NsecToStatxTimestamp(now) } if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW { stat.Mtime = linux.NsecToStatxTimestamp(now) } } d.metadataMu.Lock() defer d.metadataMu.Unlock() // As with Linux, if the UID, GID, or file size is changing, we have to // clear permission bits. Note that when set, clearSGID may cause // permissions to be updated. 
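	// For instance (illustrative values): chown() on an executable with mode
	// 06755 (setuid, setgid, rwxr-xr-x) leaves it with mode 0755 after the
	// SUID/SGID bits are cleared below.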
clearSGID := (stat.Mask&linux.STATX_UID != 0 && stat.UID != atomic.LoadUint32(&d.uid)) || (stat.Mask&linux.STATX_GID != 0 && stat.GID != atomic.LoadUint32(&d.gid)) || stat.Mask&linux.STATX_SIZE != 0 if clearSGID { if stat.Mask&linux.STATX_MODE != 0 { stat.Mode = uint16(vfs.ClearSUIDAndSGID(uint32(stat.Mode))) } else { oldMode := atomic.LoadUint32(&d.mode) if updatedMode := vfs.ClearSUIDAndSGID(oldMode); updatedMode != oldMode { stat.Mode = uint16(updatedMode) stat.Mask |= linux.STATX_MODE } } } if !d.isSynthetic() { if stat.Mask != 0 { if err := d.file.setAttr(ctx, p9.SetAttrMask{ Permissions: stat.Mask&linux.STATX_MODE != 0, UID: stat.Mask&linux.STATX_UID != 0, GID: stat.Mask&linux.STATX_GID != 0, Size: stat.Mask&linux.STATX_SIZE != 0, ATime: stat.Mask&linux.STATX_ATIME != 0, MTime: stat.Mask&linux.STATX_MTIME != 0, ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW, MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW, }, p9.SetAttr{ Permissions: p9.FileMode(stat.Mode), UID: p9.UID(stat.UID), GID: p9.GID(stat.GID), Size: stat.Size, ATimeSeconds: uint64(stat.Atime.Sec), ATimeNanoSeconds: uint64(stat.Atime.Nsec), MTimeSeconds: uint64(stat.Mtime.Sec), MTimeNanoSeconds: uint64(stat.Mtime.Nsec), }); err != nil { return err } if stat.Mask&linux.STATX_SIZE != 0 { // d.size should be kept up to date, and privatized // copy-on-write mappings of truncated pages need to be // invalidated, even if InteropModeShared is in effect. d.updateSizeLocked(stat.Size) } } if d.fs.opts.interop == InteropModeShared { // There's no point to updating d's metadata in this case since // it'll be overwritten by revalidation before the next time it's // used anyway. (InteropModeShared inhibits client caching of // regular file data, so there's no cache to truncate either.) return nil } } if stat.Mask&linux.STATX_MODE != 0 { atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode)) } if stat.Mask&linux.STATX_UID != 0 { atomic.StoreUint32(&d.uid, stat.UID) } if stat.Mask&linux.STATX_GID != 0 { atomic.StoreUint32(&d.gid, stat.GID) } // Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because // if d.cachedMetadataAuthoritative() then we converted stat.Atime and // stat.Mtime to client-local timestamps above, and if // !d.cachedMetadataAuthoritative() then we returned after calling // d.file.setAttr(). For the same reason, now must have been initialized. if stat.Mask&linux.STATX_ATIME != 0 { atomic.StoreInt64(&d.atime, stat.Atime.ToNsec()) atomic.StoreUint32(&d.atimeDirty, 0) } if stat.Mask&linux.STATX_MTIME != 0 { atomic.StoreInt64(&d.mtime, stat.Mtime.ToNsec()) atomic.StoreUint32(&d.mtimeDirty, 0) } atomic.StoreInt64(&d.ctime, now) return nil } // doAllocate performs an allocate operation on d. Note that d.metadataMu will // be held when allocate is called. func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error { d.metadataMu.Lock() defer d.metadataMu.Unlock() // Allocating a smaller size is a noop. size := offset + length if d.cachedMetadataAuthoritative() && size <= d.size { return nil } err := allocate() if err != nil { return err } d.updateSizeLocked(size) if d.cachedMetadataAuthoritative() { d.touchCMtimeLocked() } return nil } // Preconditions: d.metadataMu must be locked. func (d *dentry) updateSizeLocked(newSize uint64) { d.dataMu.Lock() oldSize := d.size atomic.StoreUint64(&d.size, newSize) // d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings // below. 
This allows concurrent calls to Read/Translate/etc. These // functions synchronize with truncation by refusing to use cache // contents beyond the new d.size. (We are still holding d.metadataMu, // so we can't race with Write or another truncate.) d.dataMu.Unlock() if d.size < oldSize { oldpgend, _ := hostarch.PageRoundUp(oldSize) newpgend, _ := hostarch.PageRoundUp(d.size) if oldpgend != newpgend { d.mapsMu.Lock() d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{ // Compare Linux's mm/truncate.c:truncate_setsize() => // truncate_pagecache() => // mm/memory.c:unmap_mapping_range(evencows=1). InvalidatePrivate: true, }) d.mapsMu.Unlock() } // We are now guaranteed that there are no translations of // truncated pages, and can remove them from the cache. Since // truncated pages have been removed from the remote file, they // should be dropped without being written back. d.dataMu.Lock() d.cache.Truncate(d.size, d.fs.mfp.MemoryFile()) d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend}) d.dataMu.Unlock() } } func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) } func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { // Deny access to the "security" and "system" namespaces since applications // may expect these to affect kernel behavior in unimplemented ways // (b/148380782). Allow all other extended attributes to be passed through // to the remote filesystem. This is inconsistent with Linux's 9p client, // but consistent with other filesystems (e.g. FUSE). if strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) || strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) { return linuxerr.EOPNOTSUPP } mode := linux.FileMode(atomic.LoadUint32(&d.mode)) kuid := auth.KUID(atomic.LoadUint32(&d.uid)) kgid := auth.KGID(atomic.LoadUint32(&d.gid)) if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { return err } return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) } func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error { return vfs.CheckDeleteSticky( creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KUID(atomic.LoadUint32(&child.uid)), auth.KGID(atomic.LoadUint32(&child.gid)), ) } func dentryUIDFromP9UID(uid p9.UID) uint32 { if !uid.Ok() { return uint32(auth.OverflowUID) } return uint32(uid) } func dentryGIDFromP9GID(gid p9.GID) uint32 { if !gid.Ok() { return uint32(auth.OverflowGID) } return uint32(gid) } // IncRef implements vfs.DentryImpl.IncRef. func (d *dentry) IncRef() { // d.refs may be 0 if d.fs.renameMu is locked, which serializes against // d.checkCachingLocked(). r := atomic.AddInt64(&d.refs, 1) if d.LogRefs() { refsvfs2.LogIncRef(d, r) } } // TryIncRef implements vfs.DentryImpl.TryIncRef. func (d *dentry) TryIncRef() bool { for { r := atomic.LoadInt64(&d.refs) if r <= 0 { return false } if atomic.CompareAndSwapInt64(&d.refs, r, r+1) { if d.LogRefs() { refsvfs2.LogTryIncRef(d, r+1) } return true } } } // DecRef implements vfs.DentryImpl.DecRef. 
func (d *dentry) DecRef(ctx context.Context) { if d.decRefNoCaching() == 0 { d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) } } // decRefNoCaching decrements d's reference count without calling // d.checkCachingLocked, even if d's reference count reaches 0; callers are // responsible for ensuring that d.checkCachingLocked will be called later. func (d *dentry) decRefNoCaching() int64 { r := atomic.AddInt64(&d.refs, -1) if d.LogRefs() { refsvfs2.LogDecRef(d, r) } if r < 0 { panic("gofer.dentry.decRefNoCaching() called without holding a reference") } return r } // RefType implements refsvfs2.CheckedObject.Type. func (d *dentry) RefType() string { return "gofer.dentry" } // LeakMessage implements refsvfs2.CheckedObject.LeakMessage. func (d *dentry) LeakMessage() string { return fmt.Sprintf("[gofer.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs)) } // LogRefs implements refsvfs2.CheckedObject.LogRefs. // // This should only be set to true for debugging purposes, as it can generate an // extremely large amount of output and drastically degrade performance. func (d *dentry) LogRefs() bool { return false } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { if d.isDir() { events |= linux.IN_ISDIR } d.fs.renameMu.RLock() // The ordering below is important, Linux always notifies the parent first. if d.parent != nil { d.parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted()) } d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted()) d.fs.renameMu.RUnlock() } // Watches implements vfs.DentryImpl.Watches. func (d *dentry) Watches() *vfs.Watches { return &d.watches } // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. // // If no watches are left on this dentry and it has no references, cache it. func (d *dentry) OnZeroWatches(ctx context.Context) { d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) } // checkCachingLocked should be called after d's reference count becomes 0 or // it becomes disowned. // // For performance, checkCachingLocked can also be called after d's reference // count becomes non-zero, so that d can be removed from the LRU cache. This // may help in reducing the size of the cache and hence reduce evictions. Note // that this is not necessary for correctness. // // It may be called on a destroyed dentry. For example, // renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times // for the same dentry when the dentry is visited more than once in the same // operation. One of the calls may destroy the dentry, so subsequent calls will // do nothing. // // Preconditions: d.fs.renameMu must be locked for writing if // renameMuWriteLocked is true; it may be temporarily unlocked. func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) { d.cachingMu.Lock() refs := atomic.LoadInt64(&d.refs) if refs == -1 { // Dentry has already been destroyed. d.cachingMu.Unlock() return } if refs > 0 { // fs.cachedDentries is permitted to contain dentries with non-zero refs, // which are skipped by fs.evictCachedDentryLocked() upon reaching the end // of the LRU. But it is still beneficial to remove d from the cache as we // are already holding d.cachingMu. Keeping a cleaner cache also reduces // the number of evictions (which is expensive as it acquires fs.renameMu). 
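	// (Illustrative scenario: a dentry's reference count briefly drops to
	// zero between two path walks, landing it in the LRU; the next walk takes
	// a reference again. Since we already hold d.cachingMu, removing it from
	// the LRU now avoids a wasted eviction attempt later.)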
d.removeFromCacheLocked() d.cachingMu.Unlock() return } // Deleted and invalidated dentries with zero references are no longer // reachable by path resolution and should be dropped immediately. if d.vfsd.IsDead() { d.removeFromCacheLocked() d.cachingMu.Unlock() if !renameMuWriteLocked { // Need to lock d.fs.renameMu for writing as needed by d.destroyLocked(). d.fs.renameMu.Lock() defer d.fs.renameMu.Unlock() // Now that renameMu is locked for writing, no more refs can be taken on // d because path resolution requires renameMu for reading at least. if atomic.LoadInt64(&d.refs) != 0 { // Destroy d only if its ref is still 0. If not, either someone took a // ref on it or it got destroyed before fs.renameMu could be acquired. return } } if d.isDeleted() { d.watches.HandleDeletion(ctx) } d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point. return } // If d still has inotify watches and it is not deleted or invalidated, it // can't be evicted. Otherwise, we will lose its watches, even if a new // dentry is created for the same file in the future. Note that the size of // d.watches cannot concurrently transition from zero to non-zero, because // adding a watch requires holding a reference on d. if d.watches.Size() > 0 { // As in the refs > 0 case, removing d is beneficial. d.removeFromCacheLocked() d.cachingMu.Unlock() return } if atomic.LoadInt32(&d.fs.released) != 0 { d.cachingMu.Unlock() if !renameMuWriteLocked { // Need to lock d.fs.renameMu to access d.parent. Lock it for writing as // needed by d.destroyLocked() later. d.fs.renameMu.Lock() defer d.fs.renameMu.Unlock() } if d.parent != nil { d.parent.dirMu.Lock() delete(d.parent.children, d.name) d.parent.dirMu.Unlock() } d.destroyLocked(ctx) // +checklocksforce: see above. return } d.fs.cacheMu.Lock() // If d is already cached, just move it to the front of the LRU. if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentries.PushFront(d) d.fs.cacheMu.Unlock() d.cachingMu.Unlock() return } // Cache the dentry, then evict the least recently used cached dentry if // the cache becomes over-full. d.fs.cachedDentries.PushFront(d) d.fs.cachedDentriesLen++ d.cached = true shouldEvict := d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries d.fs.cacheMu.Unlock() d.cachingMu.Unlock() if shouldEvict { if !renameMuWriteLocked { // Need to lock d.fs.renameMu for writing as needed by // d.evictCachedDentryLocked(). d.fs.renameMu.Lock() defer d.fs.renameMu.Unlock() } d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above. } } // Preconditions: d.cachingMu must be locked. func (d *dentry) removeFromCacheLocked() { if d.cached { d.fs.cacheMu.Lock() d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- d.fs.cacheMu.Unlock() d.cached = false } } // Precondition: fs.renameMu must be locked for writing; it may be temporarily // unlocked. // +checklocks:fs.renameMu func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) { for fs.cachedDentriesLen != 0 { fs.evictCachedDentryLocked(ctx) } } // Preconditions: // * fs.renameMu must be locked for writing; it may be temporarily unlocked. // +checklocks:fs.renameMu func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) { fs.cacheMu.Lock() victim := fs.cachedDentries.Back() fs.cacheMu.Unlock() if victim == nil { // fs.cachedDentries may have become empty between when it was checked and // when we locked fs.cacheMu. 
return } victim.cachingMu.Lock() victim.removeFromCacheLocked() // victim.refs or victim.watches.Size() may have become non-zero from an // earlier path resolution since it was inserted into fs.cachedDentries. if atomic.LoadInt64(&victim.refs) != 0 || victim.watches.Size() != 0 { victim.cachingMu.Unlock() return } if victim.parent != nil { victim.parent.dirMu.Lock() if !victim.vfsd.IsDead() { // Note that victim can't be a mount point (in any mount // namespace), since VFS holds references on mount points. fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd) delete(victim.parent.children, victim.name) // We're only deleting the dentry, not the file it // represents, so we don't need to update // victimParent.dirents etc. } victim.parent.dirMu.Unlock() } // Safe to unlock cachingMu now that victim.vfsd.IsDead(). Henceforth any // concurrent caching attempts on victim will attempt to destroy it and so // will try to acquire fs.renameMu (which we have already acquired). Hence, // fs.renameMu will synchronize the destroy attempts. victim.cachingMu.Unlock() victim.destroyLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs. } // destroyLocked destroys the dentry. // // Preconditions: // * d.fs.renameMu must be locked for writing; it may be temporarily unlocked. // * d.refs == 0. // * d.parent.children[d.name] != d, i.e. d is not reachable by path traversal // from its former parent dentry. // +checklocks:d.fs.renameMu func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: // Mark the dentry destroyed. atomic.StoreInt64(&d.refs, -1) case -1: panic("dentry.destroyLocked() called on already destroyed dentry") default: panic("dentry.destroyLocked() called with references on the dentry") } // Allow the following to proceed without renameMu locked to improve // scalability. d.fs.renameMu.Unlock() mf := d.fs.mfp.MemoryFile() d.handleMu.Lock() d.dataMu.Lock() if h := d.writeHandleLocked(); h.isOpen() { // Write dirty pages back to the remote filesystem. if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil { log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err) } } // Discard cached data. if !d.cache.IsEmpty() { mf.MarkAllUnevictable(d) d.cache.DropAll(mf) d.dirty.RemoveAll() } d.dataMu.Unlock() // Clunk open fids and close open host FDs. if !d.readFile.isNil() { d.readFile.close(ctx) } if !d.writeFile.isNil() && d.readFile != d.writeFile { d.writeFile.close(ctx) } d.readFile = p9file{} d.writeFile = p9file{} if d.readFD >= 0 { unix.Close(int(d.readFD)) } if d.writeFD >= 0 && d.readFD != d.writeFD { unix.Close(int(d.writeFD)) } d.readFD = -1 d.writeFD = -1 d.mmapFD = -1 d.handleMu.Unlock() if !d.file.isNil() { // Note that it's possible that d.atimeDirty or d.mtimeDirty are true, // i.e. client and server timestamps may differ (because e.g. a client // write was serviced by the page cache, and only written back to the // remote file later). Ideally, we'd write client timestamps back to // the remote filesystem so that timestamps for a new dentry // instantiated for the same file would remain coherent. Unfortunately, // this turns out to be too expensive in many cases, so for now we // don't do this. if err := d.file.close(ctx); err != nil { log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err) } d.file = p9file{} // Remove d from the set of syncable dentries. 
d.fs.syncMu.Lock() delete(d.fs.syncableDentries, d) d.fs.syncMu.Unlock() } d.fs.renameMu.Lock() // Drop the reference held by d on its parent without recursively locking // d.fs.renameMu. if d.parent != nil && d.parent.decRefNoCaching() == 0 { d.parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */) } refsvfs2.Unregister(d) } func (d *dentry) isDeleted() bool { return atomic.LoadUint32(&d.deleted) != 0 } func (d *dentry) setDeleted() { atomic.StoreUint32(&d.deleted, 1) } func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) { if d.file.isNil() { return nil, nil } xattrMap, err := d.file.listXattr(ctx, size) if err != nil { return nil, err } xattrs := make([]string, 0, len(xattrMap)) for x := range xattrMap { xattrs = append(xattrs, x) } return xattrs, nil } func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { if d.file.isNil() { return "", linuxerr.ENODATA } if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil { return "", err } return d.file.getXattr(ctx, opts.Name, opts.Size) } func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error { if d.file.isNil() { return linuxerr.EPERM } if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil { return err } return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags) } func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error { if d.file.isNil() { return linuxerr.EPERM } if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil { return err } return d.file.removeXattr(ctx, name) } // Preconditions: // * !d.isSynthetic(). // * d.isRegularFile() || d.isDir(). func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error { // O_TRUNC unconditionally requires us to obtain a new handle (opened with // O_TRUNC). if !trunc { d.handleMu.RLock() if (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) { // Current handles are sufficient. d.handleMu.RUnlock() return nil } d.handleMu.RUnlock() } var fdsToCloseArr [2]int32 fdsToClose := fdsToCloseArr[:0] invalidateTranslations := false d.handleMu.Lock() if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc { // Get a new handle. If this file has been opened for both reading and // writing, try to get a single handle that is usable for both: // // - Writable memory mappings of a host FD require that the host FD is // opened for both reading and writing. // // - NOTE(b/141991141): Some filesystems may not ensure coherence // between multiple handles for the same file. openReadable := !d.readFile.isNil() || read openWritable := !d.writeFile.isNil() || write h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc) if linuxerr.Equals(linuxerr.EACCES, err) && (openReadable != read || openWritable != write) { // It may not be possible to use a single handle for both // reading and writing, since permissions on the file may have // changed to e.g. disallow reading after previously being // opened for reading. In this case, we have no choice but to // use separate handles for reading and writing. ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d) openReadable = read openWritable = write h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc) } if err != nil { d.handleMu.Unlock() return err } // Update d.readFD and d.writeFD. 
if h.fd >= 0 { if openReadable && openWritable && (d.readFD < 0 || d.writeFD < 0 || d.readFD != d.writeFD) { // Replace existing FDs with this one. if d.readFD >= 0 { // We already have a readable FD that may be in use by // concurrent callers of d.pf.FD(). if d.fs.opts.overlayfsStaleRead { // If overlayfsStaleRead is in effect, then the new FD // may not be coherent with the existing one, so we // have no choice but to switch to mappings of the new // FD in both the application and sentry. if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil { d.handleMu.Unlock() ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err) h.close(ctx) return err } fdsToClose = append(fdsToClose, d.readFD) invalidateTranslations = true atomic.StoreInt32(&d.readFD, h.fd) } else { // Otherwise, we want to avoid invalidating existing // memmap.Translations (which is expensive); instead, use // dup3 to make the old file descriptor refer to the new // file description, then close the new file descriptor // (which is no longer needed). Racing callers of d.pf.FD() // may use the old or new file description, but this // doesn't matter since they refer to the same file, and // any racing mappings must be read-only. if err := unix.Dup3(int(h.fd), int(d.readFD), unix.O_CLOEXEC); err != nil { oldFD := d.readFD d.handleMu.Unlock() ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldFD, err) h.close(ctx) return err } fdsToClose = append(fdsToClose, h.fd) h.fd = d.readFD } } else { atomic.StoreInt32(&d.readFD, h.fd) } if d.writeFD != h.fd && d.writeFD >= 0 { fdsToClose = append(fdsToClose, d.writeFD) } atomic.StoreInt32(&d.writeFD, h.fd) atomic.StoreInt32(&d.mmapFD, h.fd) } else if openReadable && d.readFD < 0 { atomic.StoreInt32(&d.readFD, h.fd) // If the file has not been opened for writing, the new FD may // be used for read-only memory mappings. If the file was // previously opened for reading (without an FD), then existing // translations of the file may use the internal page cache; // invalidate those mappings. if d.writeFile.isNil() { invalidateTranslations = !d.readFile.isNil() atomic.StoreInt32(&d.mmapFD, h.fd) } } else if openWritable && d.writeFD < 0 { atomic.StoreInt32(&d.writeFD, h.fd) if d.readFD >= 0 { // We have an existing read-only FD, but the file has just // been opened for writing, so we need to start supporting // writable memory mappings. However, the new FD is not // readable, so we have no FD that can be used to create // writable memory mappings. Switch to using the internal // page cache. invalidateTranslations = true atomic.StoreInt32(&d.mmapFD, -1) } } else { // The new FD is not useful. fdsToClose = append(fdsToClose, h.fd) } } else if openWritable && d.writeFD < 0 && d.mmapFD >= 0 { // We have an existing read-only FD, but the file has just been // opened for writing, so we need to start supporting writable // memory mappings. However, we have no writable host FD. Switch to // using the internal page cache. invalidateTranslations = true atomic.StoreInt32(&d.mmapFD, -1) } // Switch to new fids. var oldReadFile p9file if openReadable { oldReadFile = d.readFile d.readFile = h.file } var oldWriteFile p9file if openWritable { oldWriteFile = d.writeFile d.writeFile = h.file } // NOTE(b/141991141): Clunk old fids before making new fids visible (by // unlocking d.handleMu). 
if !oldReadFile.isNil() { oldReadFile.close(ctx) } if !oldWriteFile.isNil() && oldReadFile != oldWriteFile { oldWriteFile.close(ctx) } } d.handleMu.Unlock() if invalidateTranslations { // Invalidate application mappings that may be using an old FD; they // will be replaced with mappings using the new FD after future calls // to d.Translate(). This requires holding d.mapsMu, which precedes // d.handleMu in the lock order. d.mapsMu.Lock() d.mappings.InvalidateAll(memmap.InvalidateOpts{}) d.mapsMu.Unlock() } for _, fd := range fdsToClose { unix.Close(int(fd)) } return nil } // Preconditions: d.handleMu must be locked. func (d *dentry) readHandleLocked() handle { return handle{ file: d.readFile, fd: d.readFD, } } // Preconditions: d.handleMu must be locked. func (d *dentry) writeHandleLocked() handle { return handle{ file: d.writeFile, fd: d.writeFD, } } func (d *dentry) syncRemoteFile(ctx context.Context) error { d.handleMu.RLock() defer d.handleMu.RUnlock() return d.syncRemoteFileLocked(ctx) } // Preconditions: d.handleMu must be locked. func (d *dentry) syncRemoteFileLocked(ctx context.Context) error { // If we have a host FD, fsyncing it is likely to be faster than an fsync // RPC. Prefer syncing write handles over read handles, since some remote // filesystem implementations may not sync changes made through write // handles otherwise. if d.writeFD >= 0 { ctx.UninterruptibleSleepStart(false) err := unix.Fsync(int(d.writeFD)) ctx.UninterruptibleSleepFinish(false) return err } if !d.writeFile.isNil() { return d.writeFile.fsync(ctx) } if d.readFD >= 0 { ctx.UninterruptibleSleepStart(false) err := unix.Fsync(int(d.readFD)) ctx.UninterruptibleSleepFinish(false) return err } if !d.readFile.isNil() { return d.readFile.fsync(ctx) } return nil } func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error { d.handleMu.RLock() defer d.handleMu.RUnlock() h := d.writeHandleLocked() if h.isOpen() { // Write back dirty pages to the remote file. d.dataMu.Lock() err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt) d.dataMu.Unlock() if err != nil { return err } } if err := d.syncRemoteFileLocked(ctx); err != nil { if !forFilesystemSync { return err } // Only return err if we can reasonably have expected sync to succeed // (d is a regular file and was opened for writing). if d.isRegularFile() && h.isOpen() { return err } ctx.Debugf("gofer.dentry.syncCachedFile: syncing non-writable or non-regular-file dentry failed: %v", err) } return nil } // incLinks increments link count. func (d *dentry) incLinks() { if atomic.LoadUint32(&d.nlink) == 0 { // The remote filesystem doesn't support link count. return } atomic.AddUint32(&d.nlink, 1) } // decLinks decrements link count. func (d *dentry) decLinks() { if atomic.LoadUint32(&d.nlink) == 0 { // The remote filesystem doesn't support link count. return } atomic.AddUint32(&d.nlink, ^uint32(0)) } // fileDescription is embedded by gofer implementations of // vfs.FileDescriptionImpl. // // +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD lockLogging sync.Once `state:"nosave"` } func (fd *fileDescription) filesystem() *filesystem { return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } func (fd *fileDescription) dentry() *dentry { return fd.vfsfd.Dentry().Impl().(*dentry) } // Stat implements vfs.FileDescriptionImpl.Stat. 
func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { d := fd.dentry() const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME) if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC { // TODO(jamieliu): Use specialFileFD.handle.file for the getattr if // available? if err := d.updateFromGetattr(ctx); err != nil { return linux.Statx{}, err } } var stat linux.Statx d.statTo(&stat) return stat, nil } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()); err != nil { return err } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { fd.dentry().InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } // ListXattr implements vfs.FileDescriptionImpl.ListXattr. func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size) } // GetXattr implements vfs.FileDescriptionImpl.GetXattr. func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts) } // SetXattr implements vfs.FileDescriptionImpl.SetXattr. func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { d := fd.dentry() if err := d.setXattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { d := fd.dentry() if err := d.removeXattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // LockBSD implements vfs.FileDescriptionImpl.LockBSD. func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error { fd.lockLogging.Do(func() { log.Infof("File lock using gofer file handled internally.") }) return fd.LockFD.LockBSD(ctx, uid, ownerPID, t, block) } // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX. func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error { fd.lockLogging.Do(func() { log.Infof("Range lock using gofer file handled internally.") }) return fd.Locks().LockPOSIX(ctx, uid, ownerPID, t, r, block) } // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX. func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error { return fd.Locks().UnlockPOSIX(ctx, uid, r) }
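The dup3 maneuver in ensureSharedHandle above is subtle enough to merit a standalone illustration: dup3(new, old, ...) atomically rebinds the old FD number to the new open file description, so concurrent holders of the number switch descriptions without any memmap translation invalidation. Below is a minimal sketch of the pattern as an ordinary Go program outside gVisor; the file path is made up for the example.

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// Two opens of the same file yield two independent file descriptions.
	oldFD, err := unix.Open("/etc/hostname", unix.O_RDONLY, 0)
	if err != nil {
		panic(err)
	}
	newFD, err := unix.Open("/etc/hostname", unix.O_RDONLY, 0)
	if err != nil {
		panic(err)
	}
	// Rebind oldFD to newFD's file description, mirroring
	// unix.Dup3(int(h.fd), int(d.readFD), unix.O_CLOEXEC) above. Callers
	// that cached the number oldFD now read through the new description.
	if err := unix.Dup3(newFD, oldFD, unix.O_CLOEXEC); err != nil {
		panic(err)
	}
	// The temporary number is no longer needed; oldFD keeps the new
	// description alive.
	unix.Close(newFD)

	buf := make([]byte, 64)
	n, err := unix.Read(oldFD, buf)
	if err != nil {
		panic(err)
	}
	fmt.Printf("read %d bytes via the rebound FD\n", n)
	unix.Close(oldFD)
}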
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package devpts

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/unimpl"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// masterInode is the inode for the master end of the Terminal.
//
// +stateify savable
type masterInode struct {
	implStatFS
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeNotDirectory
	kernfs.InodeNotSymlink

	locks vfs.FileLocks

	// root is the devpts root inode.
	root *rootInode
}

var _ kernfs.Inode = (*masterInode)(nil)

// Open implements kernfs.Inode.Open.
func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	t, err := mi.root.allocateTerminal(ctx, rp.Credentials())
	if err != nil {
		return nil, err
	}
	fd := &masterFileDescription{
		inode: mi,
		t:     t,
	}
	fd.LockFD.Init(&mi.locks)
	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
		return nil, err
	}
	return &fd.vfsfd, nil
}

// Stat implements kernfs.Inode.Stat.
func (mi *masterInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { statx, err := mi.InodeAttrs.Stat(ctx, vfsfs, opts) if err != nil { return linux.Statx{}, err } statx.Blksize = 1024 statx.RdevMajor = linux.TTYAUX_MAJOR statx.RdevMinor = linux.PTMX_MINOR return statx, nil } // SetStat implements kernfs.Inode.SetStat func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask&linux.STATX_SIZE != 0 { return linuxerr.EINVAL } return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) } // +stateify savable type masterFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD inode *masterInode t *Terminal } var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil) // Release implements vfs.FileDescriptionImpl.Release. func (mfd *masterFileDescription) Release(ctx context.Context) { mfd.inode.root.masterClose(ctx, mfd.t) } // EventRegister implements waiter.Waitable.EventRegister. func (mfd *masterFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { mfd.t.ld.masterWaiter.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. func (mfd *masterFileDescription) EventUnregister(e *waiter.Entry) { mfd.t.ld.masterWaiter.EventUnregister(e) } // Readiness implements waiter.Waitable.Readiness. func (mfd *masterFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return mfd.t.ld.masterReadiness() } // Read implements vfs.FileDescriptionImpl.Read. func (mfd *masterFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { return mfd.t.ld.outputQueueRead(ctx, dst) } // Write implements vfs.FileDescriptionImpl.Write. func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { return mfd.t.ld.inputQueueWrite(ctx, src) } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { t := kernel.TaskFromContext(ctx) if t == nil { // ioctl(2) may only be called from a task goroutine. return 0, linuxerr.ENOTTY } switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the output queue read buffer. return 0, mfd.t.ld.outputQueueReadSize(t, io, args) case linux.TCGETS: // N.B. TCGETS on the master actually returns the configuration // of the replica end. return mfd.t.ld.getTermios(t, args) case linux.TCSETS: // N.B. TCSETS on the master actually affects the configuration // of the replica end. return mfd.t.ld.setTermios(t, args) case linux.TCSETSW: // TODO(b/29356795): This should drain the output queue first. return mfd.t.ld.setTermios(t, args) case linux.TIOCGPTN: nP := primitive.Uint32(mfd.t.n) _, err := nP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCSPTLCK: // TODO(b/29356795): Implement pty locking. For now just pretend we do. return 0, nil case linux.TIOCGWINSZ: return 0, mfd.t.ld.windowSize(t, args) case linux.TIOCSWINSZ: return 0, mfd.t.ld.setWindowSize(t, args) case linux.TIOCSCTTY: // Make the given terminal the controlling terminal of the // calling process. steal := args[2].Int() == 1 return 0, mfd.t.setControllingTTY(ctx, steal, true /* isMaster */, mfd.vfsfd.IsReadable()) case linux.TIOCNOTTY: // Release this process's controlling terminal. 
return 0, mfd.t.releaseControllingTTY(ctx, true /* isMaster */) case linux.TIOCGPGRP: // Get the foreground process group. return mfd.t.foregroundProcessGroup(ctx, args, true /* isMaster */) case linux.TIOCSPGRP: // Set the foreground process group. return mfd.t.setForegroundProcessGroup(ctx, args, true /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) return 0, linuxerr.ENOTTY } } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (mfd *masterFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem() return mfd.inode.SetStat(ctx, fs, creds, opts) } // Stat implements vfs.FileDescriptionImpl.Stat. func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem() return mfd.inode.Stat(ctx, fs, opts) } // maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid. func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) { switch cmd { case linux.TCGETS, linux.TCSETS, linux.TCSETSW, linux.TCSETSF, linux.TIOCGWINSZ, linux.TIOCSWINSZ, linux.TIOCSETD, linux.TIOCSBRK, linux.TIOCCBRK, linux.TCSBRK, linux.TCSBRKP, linux.TIOCSTI, linux.TIOCCONS, linux.FIONBIO, linux.TIOCEXCL, linux.TIOCNXCL, linux.TIOCGEXCL, linux.TIOCGSID, linux.TIOCGETD, linux.TIOCVHANGUP, linux.TIOCGDEV, linux.TIOCMGET, linux.TIOCMSET, linux.TIOCMBIC, linux.TIOCMBIS, linux.TIOCGICOUNT, linux.TCFLSH, linux.TIOCSSERIAL, linux.TIOCGPTPEER: unimpl.EmitUnimplementedEvent(ctx) } }
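For orientation, the ioctls above are the kernel-side half of the usual pty handshake. Here is a rough userspace sketch of the client side, assuming a standard /dev/ptmx node and the golang.org/x/sys/unix wrappers (neither is part of this package):

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// Each open of the multiplexer allocates a fresh terminal
	// (masterInode.Open -> allocateTerminal above).
	master, err := unix.Open("/dev/ptmx", unix.O_RDWR|unix.O_NOCTTY, 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(master)

	// TIOCGPTN asks the master for its replica index; it is answered by
	// the primitive.Uint32 CopyOut in masterFileDescription.Ioctl.
	n, err := unix.IoctlGetInt(master, unix.TIOCGPTN)
	if err != nil {
		panic(err)
	}

	// TIOCSPTLCK unlocks the replica; the implementation above accepts it
	// as a no-op.
	if err := unix.IoctlSetPointerInt(master, unix.TIOCSPTLCK, 0); err != nil {
		panic(err)
	}

	fmt.Printf("replica terminal: /dev/pts/%d\n", n)
}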
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package header

import "gvisor.dev/gvisor/pkg/tcpip"

// NDPNeighborSolicit is an NDP Neighbor Solicitation message. It will only
// contain the body of an ICMPv6 packet.
//
// See RFC 4861 section 4.3 for more details.
type NDPNeighborSolicit []byte

const (
	// NDPNSMinimumSize is the minimum size of a valid NDP Neighbor
	// Solicitation message (body of an ICMPv6 packet).
	NDPNSMinimumSize = 20

	// ndpNSTargetAddessOffset is the start of the Target Address
	// field within an NDPNeighborSolicit.
	ndpNSTargetAddessOffset = 4

	// ndpNSOptionsOffset is the start of the NDP options in an
	// NDPNeighborSolicit.
	ndpNSOptionsOffset = ndpNSTargetAddessOffset + IPv6AddressSize
)

// TargetAddress returns the value within the Target Address field.
func (b NDPNeighborSolicit) TargetAddress() tcpip.Address {
	return tcpip.Address(b[ndpNSTargetAddessOffset:][:IPv6AddressSize])
}

// SetTargetAddress sets the value within the Target Address field.
func (b NDPNeighborSolicit) SetTargetAddress(addr tcpip.Address) {
	copy(b[ndpNSTargetAddessOffset:][:IPv6AddressSize], addr)
}

// Options returns an NDPOptions of the options body.
func (b NDPNeighborSolicit) Options() NDPOptions {
	return NDPOptions(b[ndpNSOptionsOffset:])
}
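As a quick illustration of the accessors above, the following hypothetical helper (not part of package header) serializes a Neighbor Solicitation body and round-trips the target address; the link-local address bytes (fe80::1) are invented for the sketch:

// exampleNeighborSolicit builds a minimal Neighbor Solicitation body for an
// invented target and checks that the accessors round-trip it.
func exampleNeighborSolicit() NDPNeighborSolicit {
	target := tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
	// NDPNSMinimumSize covers the 4 reserved bytes plus the 16-byte target
	// address; any NDP options would start at ndpNSOptionsOffset.
	ns := NDPNeighborSolicit(make([]byte, NDPNSMinimumSize))
	ns.SetTargetAddress(target)
	if ns.TargetAddress() != target {
		panic("target address did not round-trip")
	}
	return ns
}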
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fsutil

import (
	"math"

	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/usage"
)

// FrameRefSetFunctions implements segment.Functions for FrameRefSet.
type FrameRefSetFunctions struct{}

// MinKey implements segment.Functions.MinKey.
func (FrameRefSetFunctions) MinKey() uint64 {
	return 0
}

// MaxKey implements segment.Functions.MaxKey.
func (FrameRefSetFunctions) MaxKey() uint64 {
	return math.MaxUint64
}

// ClearValue implements segment.Functions.ClearValue.
func (FrameRefSetFunctions) ClearValue(val *uint64) {
}

// Merge implements segment.Functions.Merge.
func (FrameRefSetFunctions) Merge(_ memmap.FileRange, val1 uint64, _ memmap.FileRange, val2 uint64) (uint64, bool) {
	if val1 != val2 {
		return 0, false
	}
	return val1, true
}

// Split implements segment.Functions.Split.
func (FrameRefSetFunctions) Split(_ memmap.FileRange, val uint64, _ uint64) (uint64, uint64) {
	return val, val
}

// IncRefAndAccount adds a reference on the range fr. All newly inserted segments
// are accounted as host page cache memory mappings.
func (refs *FrameRefSet) IncRefAndAccount(fr memmap.FileRange) {
	seg, gap := refs.Find(fr.Start)
	for {
		switch {
		case seg.Ok() && seg.Start() < fr.End:
			seg = refs.Isolate(seg, fr)
			seg.SetValue(seg.Value() + 1)
			seg, gap = seg.NextNonEmpty()
		case gap.Ok() && gap.Start() < fr.End:
			newRange := gap.Range().Intersect(fr)
			usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
			seg, gap = refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
		default:
			refs.MergeAdjacent(fr)
			return
		}
	}
}

// DecRefAndAccount removes a reference on the range fr and untracks segments
// that are removed from memory accounting.
func (refs *FrameRefSet) DecRefAndAccount(fr memmap.FileRange) {
	seg := refs.FindSegment(fr.Start)
	for seg.Ok() && seg.Start() < fr.End {
		seg = refs.Isolate(seg, fr)
		if old := seg.Value(); old == 1 {
			usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
			seg = refs.Remove(seg).NextSegment()
		} else {
			seg.SetValue(old - 1)
			seg = seg.NextSegment()
		}
	}
	refs.MergeAdjacent(fr)
}
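FrameRefSet itself is produced by gVisor's segment-set code generator using the Functions type above, so the following is only a sketch of the intended call pattern under that assumption; the helper and the ranges are hypothetical:

// exampleFrameRefs demonstrates the accounting flow: the first reference to
// each frame moves it into usage.Mapped, and the last dropped reference moves
// it back out.
func exampleFrameRefs(refs *FrameRefSet) {
	fr := memmap.FileRange{Start: 0, End: 0x4000} // four 4KiB frames
	// First mapping: inserts segments with value 1 and accounts the whole
	// range as usage.Mapped.
	refs.IncRefAndAccount(fr)
	// Second mapping of a sub-range: Isolate splits the existing segment and
	// the count rises to 2, with no further accounting.
	refs.IncRefAndAccount(memmap.FileRange{Start: 0x1000, End: 0x2000})
	// Drop the original mapping: the sub-range keeps one reference; the rest
	// hits zero, is removed, and is un-accounted.
	refs.DecRefAndAccount(fr)
}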
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package gofer

import (
	"fmt"
	"math"
	"strings"
	"sync"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/p9"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
)

// Sync implements vfs.FilesystemImpl.Sync.
func (fs *filesystem) Sync(ctx context.Context) error {
	// Snapshot current syncable dentries and special file FDs.
	fs.renameMu.RLock()
	fs.syncMu.Lock()
	ds := make([]*dentry, 0, len(fs.syncableDentries))
	for d := range fs.syncableDentries {
		// It's safe to use IncRef here even though fs.syncableDentries doesn't
		// hold references since we hold fs.renameMu. Note that we can't use
		// TryIncRef since cached dentries at zero references should still be
		// synced.
		d.IncRef()
		ds = append(ds, d)
	}
	fs.renameMu.RUnlock()
	sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs))
	for sffd := range fs.specialFileFDs {
		// As above, fs.specialFileFDs doesn't hold references. However, unlike
		// dentries, an FD that has reached zero references can't be
		// resurrected, so we can use TryIncRef.
		if sffd.vfsfd.TryIncRef() {
			sffds = append(sffds, sffd)
		}
	}
	fs.syncMu.Unlock()

	// Return the first error we encounter, but sync everything we can
	// regardless.
	var retErr error

	// Sync syncable dentries.
	for _, d := range ds {
		err := d.syncCachedFile(ctx, true /* forFilesystemSync */)
		d.DecRef(ctx)
		if err != nil {
			ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err)
			if retErr == nil {
				retErr = err
			}
		}
	}

	// Sync special files, which may be writable but do not use dentry shared
	// handles (so they won't be synced by the above).
	for _, sffd := range sffds {
		err := sffd.sync(ctx, true /* forFilesystemSync */)
		sffd.vfsfd.DecRef(ctx)
		if err != nil {
			ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err)
			if retErr == nil {
				retErr = err
			}
		}
	}

	return retErr
}

// maxFilenameLen is the maximum length of a filename. This is dictated by 9P's
// encoding of strings, which uses 2 bytes for the length prefix.
const maxFilenameLen = (1 << 16) - 1

// dentrySlicePool is a pool of *[]*dentry used to store dentries for which
// dentry.checkCachingLocked() must be called.
The pool holds pointers to // slices because Go lacks generics, so sync.Pool operates on interface{}, so // every call to (what should be) sync.Pool<[]*dentry>.Put() allocates a copy // of the slice header on the heap. var dentrySlicePool = sync.Pool{ New: func() interface{} { ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity return &ds }, } func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { if ds == nil { ds = dentrySlicePool.Get().(*[]*dentry) } *ds = append(*ds, d) return ds } // Precondition: !parent.isSynthetic() && !child.isSynthetic(). func appendNewChildDentry(ds **[]*dentry, parent *dentry, child *dentry) { // The new child was added to parent and took a ref on the parent (hence // parent can be removed from cache). A new child has 0 refs for now. So // checkCachingLocked() should be called on both. Call it first on the parent // as it may create space in the cache for child to be inserted - hence // avoiding a cache eviction. *ds = appendDentry(*ds, parent) *ds = appendDentry(*ds, child) } // Preconditions: ds != nil. func putDentrySlice(ds *[]*dentry) { // Allow dentries to be GC'd. for i := range *ds { (*ds)[i] = nil } *ds = (*ds)[:0] dentrySlicePool.Put(ds) } // renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls // dentry.checkCachingLocked on all dentries in *dsp with fs.renameMu locked // for writing. // // dsp is a pointer-to-pointer since defer evaluates its arguments immediately, // but dentry slices are allocated lazily, and it's much easier to say "defer // fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { // fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. // +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, dsp **[]*dentry) { fs.renameMu.RUnlock() if *dsp == nil { return } ds := **dsp for _, d := range ds { d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) } putDentrySlice(*dsp) } // +checklocksrelease:fs.renameMu func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { if *ds == nil { fs.renameMu.Unlock() return } for _, d := range **ds { d.checkCachingLocked(ctx, true /* renameMuWriteLocked */) } fs.renameMu.Unlock() putDentrySlice(*ds) } // stepLocked resolves rp.Component() to an existing file, starting from the // given directory. // // Dentries which may become cached as a result of the traversal are appended // to *ds. // // Preconditions: // * fs.renameMu must be locked. // * d.dirMu must be locked. // * !rp.Done(). // * If !d.cachedMetadataAuthoritative(), then d and all children that are // part of rp must have been revalidated. // // Postconditions: The returned dentry's cached metadata is up to date. func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, bool, error) { if !d.isDir() { return nil, false, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, false, err } followedSymlink := false afterSymlink: name := rp.Component() if name == "." { rp.Advance() return d, followedSymlink, nil } if name == ".." 
{ if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, false, err } else if isRoot || d.parent == nil { rp.Advance() return d, followedSymlink, nil } if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, false, err } rp.Advance() return d.parent, followedSymlink, nil } child, err := fs.getChildLocked(ctx, d, name, ds) if err != nil { return nil, false, err } if err := rp.CheckMount(ctx, &child.vfsd); err != nil { return nil, false, err } if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { target, err := child.readlink(ctx, rp.Mount()) if err != nil { return nil, false, err } if err := rp.HandleSymlink(target); err != nil { return nil, false, err } followedSymlink = true goto afterSymlink // don't check the current directory again } rp.Advance() return child, followedSymlink, nil } // getChildLocked returns a dentry representing the child of parent with the // given name. Returns ENOENT if the child doesn't exist. // // Preconditions: // * fs.renameMu must be locked. // * parent.dirMu must be locked. // * parent.isDir(). // * name is not "." or "..". // * dentry at name has been revalidated func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { if len(name) > maxFilenameLen { return nil, linuxerr.ENAMETOOLONG } if child, ok := parent.children[name]; ok || parent.isSynthetic() { if child == nil { return nil, syserror.ENOENT } return child, nil } qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name) if err != nil { if linuxerr.Equals(linuxerr.ENOENT, err) { parent.cacheNegativeLookupLocked(name) } return nil, err } // Create a new dentry representing the file. child, err := fs.newDentry(ctx, file, qid, attrMask, &attr) if err != nil { file.close(ctx) delete(parent.children, name) return nil, err } parent.cacheNewChildLocked(child, name) appendNewChildDentry(ds, parent, child) return child, nil } // walkParentDirLocked resolves all but the last path component of rp to an // existing directory, starting from the given directory (which is usually // rp.Start().Impl().(*dentry)). It does not check that the returned directory // is searchable by the provider of rp. // // Preconditions: // * fs.renameMu must be locked. // * !rp.Done(). // * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up // to date. func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { if err := fs.revalidateParentDir(ctx, rp, d, ds); err != nil { return nil, err } for !rp.Final() { d.dirMu.Lock() next, followedSymlink, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) d.dirMu.Unlock() if err != nil { return nil, err } d = next if followedSymlink { if err := fs.revalidateParentDir(ctx, rp, d, ds); err != nil { return nil, err } } } if !d.isDir() { return nil, linuxerr.ENOTDIR } return d, nil } // resolveLocked resolves rp to an existing file. // // Preconditions: fs.renameMu must be locked. 
func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { d := rp.Start().Impl().(*dentry) if err := fs.revalidatePath(ctx, rp, d, ds); err != nil { return nil, err } for !rp.Done() { d.dirMu.Lock() next, followedSymlink, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) d.dirMu.Unlock() if err != nil { return nil, err } d = next if followedSymlink { if err := fs.revalidatePath(ctx, rp, d, ds); err != nil { return nil, err } } } if rp.MustBeDir() && !d.isDir() { return nil, linuxerr.ENOTDIR } return d, nil } // doCreateAt checks that creating a file at rp is permitted, then invokes // createInRemoteDir (if the parent directory is a real remote directory) or // createInSyntheticDir (if the parent directory is synthetic) to do so. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) error, createInSyntheticDir func(parent *dentry, name string) error) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { return err } // Order of checks is important. First check if parent directory can be // executed, then check for existence, and lastly check if mount is writable. if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." || name == ".." { return linuxerr.EEXIST } if parent.isDeleted() { return syserror.ENOENT } if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, name, &ds); err != nil { return err } parent.dirMu.Lock() defer parent.dirMu.Unlock() if len(name) > maxFilenameLen { return linuxerr.ENAMETOOLONG } // Check for existence only if caching information is available. Otherwise, // don't check for existence just yet. We will check for existence if the // checks for writability fail below. Existence check is done by the creation // RPCs themselves. if child, ok := parent.children[name]; ok && child != nil { return linuxerr.EEXIST } checkExistence := func() error { if child, err := fs.getChildLocked(ctx, parent, name, &ds); err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) { return err } else if child != nil { return linuxerr.EEXIST } return nil } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { // Existence check takes precedence. if existenceErr := checkExistence(); existenceErr != nil { return existenceErr } return err } defer mnt.EndWrite() if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { // Existence check takes precedence. if existenceErr := checkExistence(); existenceErr != nil { return existenceErr } return err } if !dir && rp.MustBeDir() { return syserror.ENOENT } if parent.isSynthetic() { if createInSyntheticDir == nil { return linuxerr.EPERM } if err := createInSyntheticDir(parent, name); err != nil { return err } parent.touchCMtime() parent.dirents = nil ev := linux.IN_CREATE if dir { ev |= linux.IN_ISDIR } parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } // No cached dentry exists; however, in InteropModeShared there might still be // an existing file at name. Just attempt the file creation RPC anyways. 
If a // file does exist, the RPC will fail with EEXIST like we would have. if err := createInRemoteDir(parent, name, &ds); err != nil { return err } if fs.opts.interop != InteropModeShared { if child, ok := parent.children[name]; ok && child == nil { // Delete the now-stale negative dentry. delete(parent.children, name) } parent.touchCMtime() parent.dirents = nil } ev := linux.IN_CREATE if dir { ev |= linux.IN_ISDIR } parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) return nil } // Preconditions: !rp.Done(). func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { return err } if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { return err } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() name := rp.Component() if dir { if name == "." { return linuxerr.EINVAL } if name == ".." { return linuxerr.ENOTEMPTY } } else { if name == "." || name == ".." { return syserror.EISDIR } } vfsObj := rp.VirtualFilesystem() if err := fs.revalidateOne(ctx, vfsObj, parent, rp.Component(), &ds); err != nil { return err } mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) parent.dirMu.Lock() defer parent.dirMu.Unlock() // Load child if sticky bit is set because we need to determine whether // deletion is allowed. var child *dentry if atomic.LoadUint32(&parent.mode)&linux.ModeSticky == 0 { var ok bool child, ok = parent.children[name] if ok && child == nil { // Hit a negative cached entry, child doesn't exist. return syserror.ENOENT } } else { child, _, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) if err != nil { return err } if err := parent.mayDelete(rp.Credentials(), child); err != nil { return err } } // If a child dentry exists, prepare to delete it. This should fail if it is // a mount point. We detect mount points by speculatively calling // PrepareDeleteDentry, which fails if child is a mount point. // // Also note that if child is nil, then it can't be a mount point. if child != nil { // Hold child.dirMu so we can check child.children and // child.syntheticChildren. We don't access these fields until a bit later, // but locking child.dirMu after calling vfs.PrepareDeleteDentry() would // create an inconsistent lock ordering between dentry.dirMu and // vfs.Dentry.mu (in the VFS lock order, it would make dentry.dirMu both "a // FilesystemImpl lock" and "a lock acquired by a FilesystemImpl between // PrepareDeleteDentry and CommitDeleteDentry). To avoid this, lock // child.dirMu before calling PrepareDeleteDentry. child.dirMu.Lock() defer child.dirMu.Unlock() if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { return err } } flags := uint32(0) // If a dentry exists, use it for best-effort checks on its deletability. if dir { if child != nil { // child must be an empty directory. if child.syntheticChildren != 0 { // This is definitely not an empty directory, irrespective of // fs.opts.interop. vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: PrepareDeleteDentry called if child != nil. 
return linuxerr.ENOTEMPTY } // If InteropModeShared is in effect and the first call to // PrepareDeleteDentry above succeeded, then child wasn't // revalidated (so we can't expect its file type to be correct) and // individually revalidating its children (to confirm that they // still exist) would be a waste of time. if child.cachedMetadataAuthoritative() { if !child.isDir() { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. return linuxerr.ENOTDIR } for _, grandchild := range child.children { if grandchild != nil { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. return linuxerr.ENOTEMPTY } } } } flags = linux.AT_REMOVEDIR } else { // child must be a non-directory file. if child != nil && child.isDir() { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. return syserror.EISDIR } if rp.MustBeDir() { if child != nil { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. } return linuxerr.ENOTDIR } } if parent.isSynthetic() { if child == nil { return syserror.ENOENT } } else if child == nil || !child.isSynthetic() { err = parent.file.unlinkAt(ctx, name, flags) if err != nil { if child != nil { vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above. } return err } } // Generate inotify events for rmdir or unlink. if dir { parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) } else { var cw *vfs.Watches if child != nil { cw = &child.watches } vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name) } if child != nil { vfsObj.CommitDeleteDentry(ctx, &child.vfsd) // +checklocksforce: see above. child.setDeleted() if child.isSynthetic() { parent.syntheticChildren-- child.decRefNoCaching() } ds = appendDentry(ds, child) } parent.cacheNegativeLookupLocked(name) if parent.cachedMetadataAuthoritative() { parent.dirents = nil parent.touchCMtime() if dir { parent.decLinks() } } return nil } // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return err } return d.checkPermissions(creds, ats) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err } if opts.CheckSearchable { if !d.isDir() { return nil, linuxerr.ENOTDIR } if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } d.IncRef() // Call d.checkCachingLocked() so it can be removed from the cache if needed. ds = appendDentry(ds, d) return &d.vfsd, nil } // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) start := rp.Start().Impl().(*dentry) d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { return nil, err } d.IncRef() // Call d.checkCachingLocked() so it can be removed from the cache if needed. 
	ds = appendDentry(ds, d)
	return &d.vfsd, nil
}

// LinkAt implements vfs.FilesystemImpl.LinkAt.
func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, _ **[]*dentry) error {
		if rp.Mount() != vd.Mount() {
			return linuxerr.EXDEV
		}
		d := vd.Dentry().Impl().(*dentry)
		if d.isDir() {
			return linuxerr.EPERM
		}
		gid := auth.KGID(atomic.LoadUint32(&d.gid))
		uid := auth.KUID(atomic.LoadUint32(&d.uid))
		mode := linux.FileMode(atomic.LoadUint32(&d.mode))
		if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil {
			return err
		}
		if d.nlink == 0 {
			return syserror.ENOENT
		}
		if d.nlink == math.MaxUint32 {
			return linuxerr.EMLINK
		}
		if err := parent.file.link(ctx, d.file, childName); err != nil {
			return err
		}

		// Success!
		atomic.AddUint32(&d.nlink, 1)
		return nil
	}, nil)
}

// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
	creds := rp.Credentials()
	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, ds **[]*dentry) error {
		// If the parent is a setgid directory, use the parent's GID
		// rather than the caller's and enable setgid.
		kgid := creds.EffectiveKGID
		mode := opts.Mode
		if atomic.LoadUint32(&parent.mode)&linux.S_ISGID != 0 {
			kgid = auth.KGID(atomic.LoadUint32(&parent.gid))
			mode |= linux.S_ISGID
		}
		if _, err := parent.file.mkdir(ctx, name, p9.FileMode(mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid)); err != nil {
			if !opts.ForSyntheticMountpoint || linuxerr.Equals(linuxerr.EEXIST, err) {
				return err
			}
			ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err)
			parent.createSyntheticChildLocked(&createSyntheticOpts{
				name: name,
				mode: linux.S_IFDIR | opts.Mode,
				kuid: creds.EffectiveKUID,
				kgid: creds.EffectiveKGID,
			})
			*ds = appendDentry(*ds, parent)
		}
		if fs.opts.interop != InteropModeShared {
			parent.incLinks()
		}
		return nil
	}, func(parent *dentry, name string) error {
		if !opts.ForSyntheticMountpoint {
			// Can't create non-synthetic files in synthetic directories.
			return linuxerr.EPERM
		}
		parent.createSyntheticChildLocked(&createSyntheticOpts{
			name: name,
			mode: linux.S_IFDIR | opts.Mode,
			kuid: creds.EffectiveKUID,
			kgid: creds.EffectiveKGID,
		})
		parent.incLinks()
		return nil
	})
}

// MknodAt implements vfs.FilesystemImpl.MknodAt.
func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) error {
		creds := rp.Credentials()
		_, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
		if !linuxerr.Equals(linuxerr.EPERM, err) {
			return err
		}

		// EPERM means that gofer does not allow creating a socket or pipe.
		// Fall back to creating a synthetic one, i.e. one that is kept
		// entirely in memory.

		// Check that we're not overwriting an existing file with a synthetic
		// one.
		_, _, err = fs.stepLocked(ctx, rp, parent, true /* mayFollowSymlinks */, ds)
		switch {
		case err == nil:
			// Step succeeded, another file exists.
			return linuxerr.EEXIST
		case !linuxerr.Equals(linuxerr.ENOENT, err):
			// Unexpected error.
return err } switch opts.Mode.FileType() { case linux.S_IFSOCK: parent.createSyntheticChildLocked(&createSyntheticOpts{ name: name, mode: opts.Mode, kuid: creds.EffectiveKUID, kgid: creds.EffectiveKGID, endpoint: opts.Endpoint, }) *ds = appendDentry(*ds, parent) return nil case linux.S_IFIFO: parent.createSyntheticChildLocked(&createSyntheticOpts{ name: name, mode: opts.Mode, kuid: creds.EffectiveKUID, kgid: creds.EffectiveKGID, pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize), }) *ds = appendDentry(*ds, parent) return nil } // Retain error from gofer if synthetic file cannot be created internally. return linuxerr.EPERM }, nil) } // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Reject O_TMPFILE, which is not supported; supporting it correctly in the // presence of other remote filesystem users requires remote filesystem // support, and it isn't clear that there's any way to implement this in // 9P. if opts.Flags&linux.O_TMPFILE != 0 { return nil, linuxerr.EOPNOTSUPP } mayCreate := opts.Flags&linux.O_CREAT != 0 mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL) var ds *[]*dentry fs.renameMu.RLock() unlocked := false unlock := func() { if !unlocked { fs.renameMuRUnlockAndCheckCaching(ctx, &ds) unlocked = true } } defer unlock() start := rp.Start().Impl().(*dentry) if rp.Done() { // Reject attempts to open mount root directory with O_CREAT. if mayCreate && rp.MustBeDir() { return nil, syserror.EISDIR } if mustCreate { return nil, linuxerr.EEXIST } if !start.cachedMetadataAuthoritative() { // Refresh dentry's attributes before opening. if err := start.updateFromGetattr(ctx); err != nil { return nil, err } } start.IncRef() defer start.DecRef(ctx) unlock() // start is intentionally not added to ds (which would remove it from the // cache) because doing so regresses performance in practice. return start.open(ctx, rp, &opts) } afterTrailingSymlink: parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) if err != nil { return nil, err } // Check for search permission in the parent directory. if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. if mayCreate && rp.MustBeDir() { return nil, syserror.EISDIR } if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, rp.Component(), &ds); err != nil { return nil, err } // Determine whether or not we need to create a file. parent.dirMu.Lock() child, _, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds) if linuxerr.Equals(linuxerr.ENOENT, err) && mayCreate { if parent.isSynthetic() { parent.dirMu.Unlock() return nil, linuxerr.EPERM } fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds) parent.dirMu.Unlock() return fd, err } parent.dirMu.Unlock() if err != nil { return nil, err } if mustCreate { return nil, linuxerr.EEXIST } // Open existing child or follow symlink. 
	if child.isSymlink() && rp.ShouldFollowSymlink() {
		target, err := child.readlink(ctx, rp.Mount())
		if err != nil {
			return nil, err
		}
		if err := rp.HandleSymlink(target); err != nil {
			return nil, err
		}
		start = parent
		goto afterTrailingSymlink
	}
	if rp.MustBeDir() && !child.isDir() {
		return nil, linuxerr.ENOTDIR
	}
	child.IncRef()
	defer child.DecRef(ctx)
	unlock()
	// child is intentionally not added to ds (which would remove it from the
	// cache) because doing so regresses performance in practice.
	return child.open(ctx, rp, &opts)
}

// Preconditions: The caller must hold no locks (since opening pipes may block
// indefinitely).
func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
	ats := vfs.AccessTypesForOpenFlags(opts)
	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
		return nil, err
	}

	trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG
	if trunc {
		// Lock metadataMu *while* we open a regular file with O_TRUNC because
		// open(2) will change the file size on server.
		d.metadataMu.Lock()
		defer d.metadataMu.Unlock()
	}

	var vfd *vfs.FileDescription
	var err error
	mnt := rp.Mount()
	switch d.fileType() {
	case linux.S_IFREG:
		if !d.fs.opts.regularFilesUseSpecialFileFD {
			if err := d.ensureSharedHandle(ctx, ats.MayRead(), ats.MayWrite(), trunc); err != nil {
				return nil, err
			}
			fd, err := newRegularFileFD(mnt, d, opts.Flags)
			if err != nil {
				return nil, err
			}
			vfd = &fd.vfsfd
		}
	case linux.S_IFDIR:
		// Can't open directories with O_CREAT.
		if opts.Flags&linux.O_CREAT != 0 {
			return nil, syserror.EISDIR
		}
		// Can't open directories writably.
		if ats&vfs.MayWrite != 0 {
			return nil, syserror.EISDIR
		}
		if opts.Flags&linux.O_DIRECT != 0 {
			return nil, linuxerr.EINVAL
		}
		if !d.isSynthetic() {
			if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
				return nil, err
			}
		}
		fd := &directoryFD{}
		fd.LockFD.Init(&d.locks)
		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
			return nil, err
		}
		if atomic.LoadInt32(&d.readFD) >= 0 {
			fsmetric.GoferOpensHost.Increment()
		} else {
			fsmetric.GoferOpens9P.Increment()
		}
		return &fd.vfsfd, nil
	case linux.S_IFLNK:
		// Can't open symlinks without O_PATH, which is handled at the VFS
		// layer.
		return nil, linuxerr.ELOOP
	case linux.S_IFSOCK:
		if d.isSynthetic() {
			return nil, linuxerr.ENXIO
		}
		if d.fs.iopts.OpenSocketsByConnecting {
			return d.openSocketByConnecting(ctx, opts)
		}
	case linux.S_IFIFO:
		if d.isSynthetic() {
			return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks)
		}
	}
	if vfd == nil {
		if vfd, err = d.openSpecialFile(ctx, mnt, opts); err != nil {
			return nil, err
		}
	}

	if trunc {
		// If no errors occurred so far then update file size in memory. This
		// step is required even if !d.cachedMetadataAuthoritative() because
		// d.mappings has to be updated.
		// d.metadataMu has already been acquired if trunc == true.
d.updateSizeLocked(0) if d.cachedMetadataAuthoritative() { d.touchCMtimeLocked() } } return vfd, err } func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL } fdObj, err := d.file.connect(ctx, p9.AnonymousSocket) if err != nil { return nil, err } fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fdObj.FD(), &host.NewFDOptions{ HaveFlags: true, Flags: opts.Flags, }) if err != nil { fdObj.Close() return nil, err } fdObj.Release() return fd, nil } func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { ats := vfs.AccessTypesForOpenFlags(opts) if opts.Flags&linux.O_DIRECT != 0 { return nil, linuxerr.EINVAL } // We assume that the server silently inserts O_NONBLOCK in the open flags // for all named pipes (because all existing gofers do this). // // NOTE(b/133875563): This makes named pipe opens racy, because the // mechanisms for translating nonblocking to blocking opens can only detect // the instantaneous presence of a peer holding the other end of the pipe // open, not whether the pipe was *previously* opened by a peer that has // since closed its end. isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0 retry: h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0) if err != nil { if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && linuxerr.Equals(linuxerr.ENXIO, err) { // An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails // with ENXIO if opening the same named pipe with O_WRONLY would // block because there are no readers of the pipe. if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil { return nil, err } goto retry } return nil, err } if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 { if err := blockUntilNonblockingPipeHasWriter(ctx, h.fd); err != nil { h.close(ctx) return nil, err } } fd, err := newSpecialFileFD(h, mnt, d, opts.Flags) if err != nil { h.close(ctx) return nil, err } return &fd.vfsfd, nil } // Preconditions: // * d.fs.renameMu must be locked. // * d.dirMu must be locked. // * !d.isSynthetic(). func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) { if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if d.isDeleted() { return nil, syserror.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return nil, err } defer mnt.EndWrite() // 9P2000.L's lcreate takes a fid representing the parent directory, and // converts it into an open fid representing the created file, so we need // to duplicate the directory fid first. _, dirfile, err := d.file.walk(ctx, nil) if err != nil { return nil, err } creds := rp.Credentials() name := rp.Component() // We only want the access mode for creating the file. createFlags := p9.OpenFlags(opts.Flags) & p9.OpenFlagsModeMask // If the parent is a setgid directory, use the parent's GID rather // than the caller's. 
kgid := creds.EffectiveKGID if atomic.LoadUint32(&d.mode)&linux.S_ISGID != 0 { kgid = auth.KGID(atomic.LoadUint32(&d.gid)) } fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, p9.FileMode(opts.Mode), (p9.UID)(creds.EffectiveKUID), p9.GID(kgid)) if err != nil { dirfile.close(ctx) return nil, err } // Then we need to walk to the file we just created to get a non-open fid // representing it, and to get its metadata. This must use d.file since, as // explained above, dirfile was invalidated by dirfile.Create(). _, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name) if err != nil { openFile.close(ctx) if fdobj != nil { fdobj.Close() } return nil, err } // Construct the new dentry. child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr) if err != nil { nonOpenFile.close(ctx) openFile.close(ctx) if fdobj != nil { fdobj.Close() } return nil, err } // Incorporate the fid that was opened by lcreate. useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD if useRegularFileFD { openFD := int32(-1) if fdobj != nil { openFD = int32(fdobj.Release()) } child.handleMu.Lock() if vfs.MayReadFileWithOpenFlags(opts.Flags) { child.readFile = openFile if fdobj != nil { child.readFD = openFD child.mmapFD = openFD } } if vfs.MayWriteFileWithOpenFlags(opts.Flags) { child.writeFile = openFile child.writeFD = openFD } child.handleMu.Unlock() } // Insert the dentry into the tree. d.cacheNewChildLocked(child, name) appendNewChildDentry(ds, d, child) if d.cachedMetadataAuthoritative() { d.touchCMtime() d.dirents = nil } // Finally, construct a file description representing the created file. var childVFSFD *vfs.FileDescription if useRegularFileFD { fd, err := newRegularFileFD(mnt, child, opts.Flags) if err != nil { return nil, err } childVFSFD = &fd.vfsfd } else { h := handle{ file: openFile, fd: -1, } if fdobj != nil { h.fd = int32(fdobj.Release()) } fd, err := newSpecialFileFD(h, mnt, child, opts.Flags) if err != nil { h.close(ctx) return nil, err } childVFSFD = &fd.vfsfd } d.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) return childVFSFD, nil } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err } if !d.isSymlink() { return "", linuxerr.EINVAL } return d.readlink(ctx, rp.Mount()) } // RenameAt implements vfs.FilesystemImpl.RenameAt. func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { // Resolve newParent first to verify that it's on this Mount. var ds *[]*dentry fs.renameMu.Lock() defer fs.renameMuUnlockAndCheckCaching(ctx, &ds) newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds) if err != nil { return err } if opts.Flags&^linux.RENAME_NOREPLACE != 0 { return linuxerr.EINVAL } if fs.opts.interop == InteropModeShared && opts.Flags&linux.RENAME_NOREPLACE != 0 { // Requires 9P support to synchronize with other remote filesystem // users. return linuxerr.EINVAL } newName := rp.Component() if newName == "." || newName == ".." 
{ if opts.Flags&linux.RENAME_NOREPLACE != 0 { return linuxerr.EEXIST } return linuxerr.EBUSY } mnt := rp.Mount() if mnt != oldParentVD.Mount() { return linuxerr.EXDEV } if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() oldParent := oldParentVD.Dentry().Impl().(*dentry) if !oldParent.cachedMetadataAuthoritative() { if err := oldParent.updateFromGetattr(ctx); err != nil { return err } } creds := rp.Credentials() if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { return err } vfsObj := rp.VirtualFilesystem() if err := fs.revalidateOne(ctx, vfsObj, newParent, newName, &ds); err != nil { return err } if err := fs.revalidateOne(ctx, vfsObj, oldParent, oldName, &ds); err != nil { return err } // We need a dentry representing the renamed file since, if it's a // directory, we need to check for write permission on it. oldParent.dirMu.Lock() defer oldParent.dirMu.Unlock() renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds) if err != nil { return err } if err := oldParent.mayDelete(creds, renamed); err != nil { return err } if renamed.isDir() { if renamed == newParent || genericIsAncestorDentry(renamed, newParent) { return linuxerr.EINVAL } if oldParent != newParent { if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil { return err } } } else { if opts.MustBeDir || rp.MustBeDir() { return linuxerr.ENOTDIR } } if oldParent != newParent { if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil { return err } newParent.dirMu.Lock() defer newParent.dirMu.Unlock() } if newParent.isDeleted() { return syserror.ENOENT } replaced, err := fs.getChildLocked(ctx, newParent, newName, &ds) if err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) { return err } var replacedVFSD *vfs.Dentry if replaced != nil { if opts.Flags&linux.RENAME_NOREPLACE != 0 { return linuxerr.EEXIST } replacedVFSD = &replaced.vfsd if replaced.isDir() { if !renamed.isDir() { return syserror.EISDIR } if genericIsAncestorDentry(replaced, renamed) { return linuxerr.ENOTEMPTY } } else { if rp.MustBeDir() || renamed.isDir() { return linuxerr.ENOTDIR } } } if oldParent == newParent && oldName == newName { return nil } mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { return err } // Update the remote filesystem. if !renamed.isSynthetic() { if err := renamed.file.rename(ctx, newParent.file, newName); err != nil { vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) return err } } else if replaced != nil && !replaced.isSynthetic() { // We are replacing an existing real file with a synthetic one, so we // need to unlink the former. flags := uint32(0) if replaced.isDir() { flags = linux.AT_REMOVEDIR } if err := newParent.file.unlinkAt(ctx, newName, flags); err != nil { vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD) return err } } // Update the dentry tree. vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) if replaced != nil { replaced.setDeleted() if replaced.isSynthetic() { newParent.syntheticChildren-- replaced.decRefNoCaching() } ds = appendDentry(ds, replaced) } oldParent.cacheNegativeLookupLocked(oldName) // We don't use newParent.cacheNewChildLocked() since we don't want to mess // with reference counts and queue oldParent for checkCachingLocked if the // parent isn't actually changing. 
if oldParent != newParent { oldParent.decRefNoCaching() newParent.IncRef() ds = appendDentry(ds, newParent) ds = appendDentry(ds, oldParent) if renamed.isSynthetic() { oldParent.syntheticChildren-- newParent.syntheticChildren++ } renamed.parent = newParent } renamed.name = newName if newParent.children == nil { newParent.children = make(map[string]*dentry) } newParent.children[newName] = renamed // Update metadata. if renamed.cachedMetadataAuthoritative() { renamed.touchCtime() } if oldParent.cachedMetadataAuthoritative() { oldParent.dirents = nil oldParent.touchCMtime() if renamed.isDir() { oldParent.decLinks() } } if newParent.cachedMetadataAuthoritative() { newParent.dirents = nil newParent.touchCMtime() if renamed.isDir() && (replaced == nil || !replaced.isDir()) { // Increase the link count if we did not replace another directory. newParent.incLinks() } } vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir()) return nil } // RmdirAt implements vfs.FilesystemImpl.RmdirAt. func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { return fs.unlinkAt(ctx, rp, true /* dir */) } // SetStatAt implements vfs.FilesystemImpl.SetStatAt. func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { var ds *[]*dentry fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } err = d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()) fs.renameMuRUnlockAndCheckCaching(ctx, &ds) if err != nil { return err } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } // StatAt implements vfs.FilesystemImpl.StatAt. func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statx{}, err } // Since walking updates metadata for all traversed dentries under // InteropModeShared, including the returned one, we can return cached // metadata here regardless of fs.opts.interop. var stat linux.Statx d.statTo(&stat) return stat, nil } // StatFSAt implements vfs.FilesystemImpl.StatFSAt. func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return linux.Statfs{}, err } // If d is synthetic, invoke statfs on the first ancestor of d that isn't. for d.isSynthetic() { d = d.parent } fsstat, err := d.file.statFS(ctx) if err != nil { return linux.Statfs{}, err } nameLen := uint64(fsstat.NameLength) if nameLen > maxFilenameLen { nameLen = maxFilenameLen } return linux.Statfs{ // This is primarily for distinguishing a gofer file system in // tests. Testing is important, so instead of defining // something completely random, use a standard value. Type: linux.V9FS_MAGIC, BlockSize: int64(fsstat.BlockSize), Blocks: fsstat.Blocks, BlocksFree: fsstat.BlocksFree, BlocksAvailable: fsstat.BlocksAvailable, Files: fsstat.Files, FilesFree: fsstat.FilesFree, NameLength: nameLen, }, nil } // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. 
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, _ **[]*dentry) error { creds := rp.Credentials() _, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)) return err }, nil) } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { return fs.unlinkAt(ctx, rp, false /* dir */) } // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err } if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if d.isSocket() { if !d.isSynthetic() { d.IncRef() ds = appendDentry(ds, d) return &endpoint{ dentry: d, path: opts.Addr, }, nil } if d.endpoint != nil { return d.endpoint, nil } } return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return nil, err } return d.listXattr(ctx, rp.Credentials(), size) } // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { var ds *[]*dentry fs.renameMu.RLock() defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { return "", err } return d.getXattr(ctx, rp.Credentials(), &opts) } // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { var ds *[]*dentry fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } err = d.setXattr(ctx, rp.Credentials(), &opts) fs.renameMuRUnlockAndCheckCaching(ctx, &ds) if err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { var ds *[]*dentry fs.renameMu.RLock() d, err := fs.resolveLocked(ctx, rp, &ds) if err != nil { fs.renameMuRUnlockAndCheckCaching(ctx, &ds) return err } err = d.removeXattr(ctx, rp.Credentials(), name) fs.renameMuRUnlockAndCheckCaching(ctx, &ds) if err != nil { return err } d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // PrependPath implements vfs.FilesystemImpl.PrependPath. 
func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { fs.renameMu.RLock() defer fs.renameMu.RUnlock() return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) } type mopt struct { key string value interface{} } func (m mopt) String() string { if m.value == nil { return fmt.Sprintf("%s", m.key) } return fmt.Sprintf("%s=%v", m.key, m.value) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { optsKV := []mopt{ {moptTransport, transportModeFD}, // Only valid value, currently. {moptReadFD, fs.opts.fd}, // Currently, read and write FD are the same. {moptWriteFD, fs.opts.fd}, // Currently, read and write FD are the same. {moptAname, fs.opts.aname}, {moptDfltUID, fs.opts.dfltuid}, {moptDfltGID, fs.opts.dfltgid}, {moptMsize, fs.opts.msize}, {moptVersion, fs.opts.version}, {moptDentryCacheLimit, fs.opts.maxCachedDentries}, } switch fs.opts.interop { case InteropModeExclusive: optsKV = append(optsKV, mopt{moptCache, cacheFSCache}) case InteropModeWritethrough: optsKV = append(optsKV, mopt{moptCache, cacheFSCacheWritethrough}) case InteropModeShared: if fs.opts.regularFilesUseSpecialFileFD { optsKV = append(optsKV, mopt{moptCache, cacheNone}) } else { optsKV = append(optsKV, mopt{moptCache, cacheRemoteRevalidating}) } } if fs.opts.forcePageCache { optsKV = append(optsKV, mopt{moptForcePageCache, nil}) } if fs.opts.limitHostFDTranslation { optsKV = append(optsKV, mopt{moptLimitHostFDTranslation, nil}) } if fs.opts.overlayfsStaleRead { optsKV = append(optsKV, mopt{moptOverlayfsStaleRead, nil}) } opts := make([]string, 0, len(optsKV)) for _, opt := range optsKV { opts = append(opts, opt.String()) } return strings.Join(opts, ",") }
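// Editor's note: the example below is an illustrative sketch, not part of the
// original file. It shows how a mopt renders: "key" when value is nil,
// "key=value" otherwise, so MountOptions yields one comma-separated string.
// The literal values are hypothetical.
//
//	mopt{"msize", 1048576}.String()        // "msize=1048576"
//	mopt{"force_page_cache", nil}.String() // "force_page_cache"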
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package loopback provides the implementation of loopback data-link layer
// endpoints. Such endpoints just turn outbound packets into inbound ones.
//
// Loopback endpoints can be used in the networking stack by calling New() to
// create a new endpoint, and then passing it as an argument to
// Stack.CreateNIC().
package loopback

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

type endpoint struct {
	dispatcher stack.NetworkDispatcher
}

// New creates a new loopback endpoint. This link-layer endpoint just turns
// outbound packets into inbound packets.
func New() stack.LinkEndpoint {
	return &endpoint{}
}

// Attach implements stack.LinkEndpoint.Attach. It just saves the stack
// network-layer dispatcher for later use when packets need to be dispatched.
func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
	e.dispatcher = dispatcher
}

// IsAttached implements stack.LinkEndpoint.IsAttached.
func (e *endpoint) IsAttached() bool {
	return e.dispatcher != nil
}

// MTU implements stack.LinkEndpoint.MTU. It returns a constant that matches
// the linux loopback interface.
func (*endpoint) MTU() uint32 {
	return 65536
}

// Capabilities implements stack.LinkEndpoint.Capabilities. Loopback advertises
// itself as supporting checksum offload, but in reality it's just omitted.
func (*endpoint) Capabilities() stack.LinkEndpointCapabilities {
	return stack.CapabilityRXChecksumOffload | stack.CapabilityTXChecksumOffload | stack.CapabilitySaveRestore | stack.CapabilityLoopback
}

// MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. Given that
// the loopback interface doesn't have a header, it just returns 0.
func (*endpoint) MaxHeaderLength() uint16 {
	return 0
}

// LinkAddress returns the link address of this endpoint.
func (*endpoint) LinkAddress() tcpip.LinkAddress {
	return ""
}

// Wait implements stack.LinkEndpoint.Wait.
func (*endpoint) Wait() {}

// WritePacket implements stack.LinkEndpoint.WritePacket. It delivers outbound
// packets to the network-layer dispatcher.
func (e *endpoint) WritePacket(_ stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error {
	// Construct data as the unparsed portion for the loopback packet.
	data := buffer.NewVectorisedView(pkt.Size(), pkt.Views())

	// Because we're immediately turning around and writing the packet back
	// to the rx path, we intentionally don't preserve the remote and local
	// link addresses from the stack.Route we're passed.
	newPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
		Data: data,
	})
	e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, newPkt)

	return nil
}

// WritePackets implements stack.LinkEndpoint.WritePackets.
func (e *endpoint) WritePackets(stack.RouteInfo, stack.PacketBufferList, tcpip.NetworkProtocolNumber) (int, tcpip.Error) {
	panic("not implemented")
}

// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
func (*endpoint) ARPHardwareType() header.ARPHardwareType {
	return header.ARPHardwareLoopback
}

// AddHeader implements stack.LinkEndpoint.AddHeader. The loopback interface
// has no link-layer header, so this is a no-op.
func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
}
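// Editor's note: illustrative sketch, not part of the original file. Typical
// wiring of a loopback endpoint into a stack; the NIC ID and empty options
// are hypothetical.
//
//	s := stack.New(stack.Options{})
//	if err := s.CreateNIC(1, loopback.New()); err != nil {
//		// handle error
//	}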
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package header

import (
	"encoding/binary"

	"gvisor.dev/gvisor/pkg/tcpip"
)

const (
	nextHdrFrag = 0
	fragOff     = 2
	more        = 3
	idV6        = 4
)

var _ IPv6SerializableExtHdr = (*IPv6SerializableFragmentExtHdr)(nil)

// IPv6SerializableFragmentExtHdr is used to serialize an IPv6 fragment
// extension header as defined in RFC 8200 section 4.5.
type IPv6SerializableFragmentExtHdr struct {
	// FragmentOffset is the "fragment offset" field of an IPv6 fragment.
	FragmentOffset uint16

	// M is the "more" field of an IPv6 fragment.
	M bool

	// Identification is the "identification" field of an IPv6 fragment.
	Identification uint32
}

// identifier implements IPv6SerializableExtHdr.
func (h *IPv6SerializableFragmentExtHdr) identifier() IPv6ExtensionHeaderIdentifier {
	return IPv6FragmentHeader
}

// length implements IPv6SerializableExtHdr.
func (h *IPv6SerializableFragmentExtHdr) length() int {
	return IPv6FragmentHeaderSize
}

// serializeInto implements IPv6SerializableExtHdr.
func (h *IPv6SerializableFragmentExtHdr) serializeInto(nextHeader uint8, b []byte) int {
	// Prevent too many bounds checks.
	_ = b[IPv6FragmentHeaderSize:]
	binary.BigEndian.PutUint32(b[idV6:], h.Identification)
	binary.BigEndian.PutUint16(b[fragOff:], h.FragmentOffset<<ipv6FragmentExtHdrFragmentOffsetShift)
	b[nextHdrFrag] = nextHeader
	if h.M {
		b[more] |= ipv6FragmentExtHdrMFlagMask
	}
	return IPv6FragmentHeaderSize
}

// IPv6Fragment represents an ipv6 fragment header stored in a byte array.
//
// Most of the methods of IPv6Fragment access the underlying slice without
// checking the boundaries and could panic because of 'index out of range'.
// Always call IsValid() to validate an instance of IPv6Fragment before using
// other methods.
type IPv6Fragment []byte

const (
	// IPv6FragmentHeader is the protocol number used to specify that the
	// next header is a fragment header, per RFC 2460.
	IPv6FragmentHeader = 44

	// IPv6FragmentHeaderSize is the size of the fragment header.
	IPv6FragmentHeaderSize = 8
)

// IsValid performs basic validation on the fragment header.
func (b IPv6Fragment) IsValid() bool {
	return len(b) >= IPv6FragmentHeaderSize
}

// NextHeader returns the value of the "next header" field of the ipv6
// fragment.
func (b IPv6Fragment) NextHeader() uint8 {
	return b[nextHdrFrag]
}

// FragmentOffset returns the "fragment offset" field of the ipv6 fragment.
func (b IPv6Fragment) FragmentOffset() uint16 {
	return binary.BigEndian.Uint16(b[fragOff:]) >> 3
}

// More returns the "more" field of the ipv6 fragment.
func (b IPv6Fragment) More() bool {
	return b[more]&1 > 0
}

// Payload implements Network.Payload.
func (b IPv6Fragment) Payload() []byte {
	return b[IPv6FragmentHeaderSize:]
}

// ID returns the value of the identifier field of the ipv6 fragment.
func (b IPv6Fragment) ID() uint32 {
	return binary.BigEndian.Uint32(b[idV6:])
}

// TransportProtocol implements Network.TransportProtocol.
func (b IPv6Fragment) TransportProtocol() tcpip.TransportProtocolNumber {
	return tcpip.TransportProtocolNumber(b.NextHeader())
}

// The functions below have been added only to satisfy the Network interface.

// Checksum is not supported by IPv6Fragment.
func (b IPv6Fragment) Checksum() uint16 {
	panic("not supported")
}

// SourceAddress is not supported by IPv6Fragment.
func (b IPv6Fragment) SourceAddress() tcpip.Address {
	panic("not supported")
}

// DestinationAddress is not supported by IPv6Fragment.
func (b IPv6Fragment) DestinationAddress() tcpip.Address {
	panic("not supported")
}

// SetSourceAddress is not supported by IPv6Fragment.
func (b IPv6Fragment) SetSourceAddress(tcpip.Address) {
	panic("not supported")
}

// SetDestinationAddress is not supported by IPv6Fragment.
func (b IPv6Fragment) SetDestinationAddress(tcpip.Address) {
	panic("not supported")
}

// SetChecksum is not supported by IPv6Fragment.
func (b IPv6Fragment) SetChecksum(uint16) {
	panic("not supported")
}

// TOS is not supported by IPv6Fragment.
func (b IPv6Fragment) TOS() (uint8, uint32) {
	panic("not supported")
}

// SetTOS is not supported by IPv6Fragment.
func (b IPv6Fragment) SetTOS(t uint8, l uint32) {
	panic("not supported")
}
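// Editor's note: illustrative sketch, not part of the original file. Decoding
// a hand-built fragment header: next header 6 (TCP), offset field 0x0009
// (offset 1 in 8-byte units, M flag set), identification 1. The bytes are
// hypothetical.
//
//	b := IPv6Fragment([]byte{6, 0, 0x00, 0x09, 0, 0, 0, 1})
//	if b.IsValid() {
//		_ = b.NextHeader()     // 6
//		_ = b.FragmentOffset() // 1
//		_ = b.More()           // true
//		_ = b.ID()             // 1
//	}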
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ports

// Flags represents the type of port reservation.
//
// +stateify savable
type Flags struct {
	// MostRecent represents UDP SO_REUSEADDR.
	MostRecent bool

	// LoadBalanced indicates SO_REUSEPORT.
	//
	// LoadBalanced takes precedence over MostRecent.
	LoadBalanced bool

	// TupleOnly represents TCP SO_REUSEADDR.
	TupleOnly bool
}

// Bits converts the Flags to their bitset form.
func (f Flags) Bits() BitFlags {
	var rf BitFlags
	if f.MostRecent {
		rf |= MostRecentFlag
	}
	if f.LoadBalanced {
		rf |= LoadBalancedFlag
	}
	if f.TupleOnly {
		rf |= TupleOnlyFlag
	}
	return rf
}

// Effective returns the effective behavior of a flag config.
func (f Flags) Effective() Flags {
	e := f
	if e.LoadBalanced && e.MostRecent {
		e.MostRecent = false
	}
	return e
}

// BitFlags is a bitset representation of Flags.
type BitFlags uint32

const (
	// MostRecentFlag represents Flags.MostRecent.
	MostRecentFlag BitFlags = 1 << iota

	// LoadBalancedFlag represents Flags.LoadBalanced.
	LoadBalancedFlag

	// TupleOnlyFlag represents Flags.TupleOnly.
	TupleOnlyFlag

	// nextFlag is the value that the next added flag will have.
	//
	// It is used to calculate FlagMask below. It is also the number of
	// valid flag states.
	nextFlag

	// FlagMask is a bit mask for BitFlags.
	FlagMask = nextFlag - 1

	// MultiBindFlagMask contains the flags that allow binding the same
	// tuple multiple times.
	MultiBindFlagMask = MostRecentFlag | LoadBalancedFlag
)

// ToFlags converts the bitset into a Flags struct.
func (f BitFlags) ToFlags() Flags {
	return Flags{
		MostRecent:   f&MostRecentFlag != 0,
		LoadBalanced: f&LoadBalancedFlag != 0,
		TupleOnly:    f&TupleOnlyFlag != 0,
	}
}

// FlagCounter counts how many references each flag combination has.
type FlagCounter struct {
	// refs stores the count for each possible flag combination (0 through
	// FlagMask).
	refs [nextFlag]int
}

// AddRef increases the reference count for a specific flag combination.
func (c *FlagCounter) AddRef(flags BitFlags) {
	c.refs[flags]++
}

// DropRef decreases the reference count for a specific flag combination.
func (c *FlagCounter) DropRef(flags BitFlags) {
	c.refs[flags]--
}

// TotalRefs calculates the total number of references for all flag
// combinations.
func (c FlagCounter) TotalRefs() int {
	var total int
	for _, r := range c.refs {
		total += r
	}
	return total
}

// FlagRefs returns the number of references with all specified flags.
func (c FlagCounter) FlagRefs(flags BitFlags) int {
	var total int
	for i, r := range c.refs {
		if BitFlags(i)&flags == flags {
			total += r
		}
	}
	return total
}

// AllRefsHave returns whether all references have all specified flags.
func (c FlagCounter) AllRefsHave(flags BitFlags) bool {
	for i, r := range c.refs {
		if BitFlags(i)&flags != flags && r > 0 {
			return false
		}
	}
	return true
}

// SharedFlags returns the set of flags shared by all references.
func (c FlagCounter) SharedFlags() BitFlags {
	intersection := FlagMask
	for i, r := range c.refs {
		if r > 0 {
			intersection &= BitFlags(i)
		}
	}
	return intersection
}
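// Editor's note: illustrative sketch, not part of the original file. Two
// references that both carry LoadBalancedFlag share it, while TupleOnlyFlag
// is held by only one of them.
//
//	var c FlagCounter
//	c.AddRef(LoadBalancedFlag)
//	c.AddRef(LoadBalancedFlag | TupleOnlyFlag)
//	_ = c.SharedFlags()                 // LoadBalancedFlag
//	_ = c.AllRefsHave(LoadBalancedFlag) // true
//	_ = c.AllRefsHave(TupleOnlyFlag)    // false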
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

const (
	// NumControlCharacters is the number of control characters in Termios.
	NumControlCharacters = 19

	// disabledChar is used to indicate that a control character is
	// disabled.
	disabledChar = 0
)

// Winsize is struct winsize, defined in uapi/asm-generic/termios.h.
//
// +marshal
type Winsize struct {
	Row    uint16
	Col    uint16
	Xpixel uint16
	Ypixel uint16
}

// Termios is struct termios, defined in uapi/asm-generic/termbits.h.
//
// +marshal
type Termios struct {
	InputFlags        uint32
	OutputFlags       uint32
	ControlFlags      uint32
	LocalFlags        uint32
	LineDiscipline    uint8
	ControlCharacters [NumControlCharacters]uint8
}

// KernelTermios is struct ktermios/struct termios2, defined in
// uapi/asm-generic/termbits.h.
//
// +stateify savable
type KernelTermios struct {
	InputFlags        uint32
	OutputFlags       uint32
	ControlFlags      uint32
	LocalFlags        uint32
	LineDiscipline    uint8
	ControlCharacters [NumControlCharacters]uint8
	InputSpeed        uint32
	OutputSpeed       uint32
}

// IEnabled returns whether flag is enabled in termios input flags.
func (t *KernelTermios) IEnabled(flag uint32) bool {
	return t.InputFlags&flag == flag
}

// OEnabled returns whether flag is enabled in termios output flags.
func (t *KernelTermios) OEnabled(flag uint32) bool {
	return t.OutputFlags&flag == flag
}

// CEnabled returns whether flag is enabled in termios control flags.
func (t *KernelTermios) CEnabled(flag uint32) bool {
	return t.ControlFlags&flag == flag
}

// LEnabled returns whether flag is enabled in termios local flags.
func (t *KernelTermios) LEnabled(flag uint32) bool { return t.LocalFlags&flag == flag } // ToTermios copies fields that are shared with Termios into a new Termios // struct. func (t *KernelTermios) ToTermios() Termios { return Termios{ InputFlags: t.InputFlags, OutputFlags: t.OutputFlags, ControlFlags: t.ControlFlags, LocalFlags: t.LocalFlags, LineDiscipline: t.LineDiscipline, ControlCharacters: t.ControlCharacters, } } // FromTermios copies fields that are shared with Termios into this // KernelTermios struct. func (t *KernelTermios) FromTermios(term Termios) { t.InputFlags = term.InputFlags t.OutputFlags = term.OutputFlags t.ControlFlags = term.ControlFlags t.LocalFlags = term.LocalFlags t.LineDiscipline = term.LineDiscipline t.ControlCharacters = term.ControlCharacters } // IsTerminating returns whether c is a line terminating character. func (t *KernelTermios) IsTerminating(cBytes []byte) bool { // All terminating characters are 1 byte. if len(cBytes) != 1 { return false } c := cBytes[0] // Is this the user-set EOF character? if t.IsEOF(c) { return true } switch c { case disabledChar: return false case '\n', t.ControlCharacters[VEOL]: return true case t.ControlCharacters[VEOL2]: return t.LEnabled(IEXTEN) } return false } // IsEOF returns whether c is the EOF character. func (t *KernelTermios) IsEOF(c byte) bool { return c == t.ControlCharacters[VEOF] && t.ControlCharacters[VEOF] != disabledChar } // Input flags. const ( IGNBRK = 0000001 BRKINT = 0000002 IGNPAR = 0000004 PARMRK = 0000010 INPCK = 0000020 ISTRIP = 0000040 INLCR = 0000100 IGNCR = 0000200 ICRNL = 0000400 IUCLC = 0001000 IXON = 0002000 IXANY = 0004000 IXOFF = 0010000 IMAXBEL = 0020000 IUTF8 = 0040000 ) // Output flags. const ( OPOST = 0000001 OLCUC = 0000002 ONLCR = 0000004 OCRNL = 0000010 ONOCR = 0000020 ONLRET = 0000040 OFILL = 0000100 OFDEL = 0000200 NLDLY = 0000400 NL0 = 0000000 NL1 = 0000400 CRDLY = 0003000 CR0 = 0000000 CR1 = 0001000 CR2 = 0002000 CR3 = 0003000 TABDLY = 0014000 TAB0 = 0000000 TAB1 = 0004000 TAB2 = 0010000 TAB3 = 0014000 XTABS = 0014000 BSDLY = 0020000 BS0 = 0000000 BS1 = 0020000 VTDLY = 0040000 VT0 = 0000000 VT1 = 0040000 FFDLY = 0100000 FF0 = 0000000 FF1 = 0100000 ) // Control flags. const ( CBAUD = 0010017 B0 = 0000000 B50 = 0000001 B75 = 0000002 B110 = 0000003 B134 = 0000004 B150 = 0000005 B200 = 0000006 B300 = 0000007 B600 = 0000010 B1200 = 0000011 B1800 = 0000012 B2400 = 0000013 B4800 = 0000014 B9600 = 0000015 B19200 = 0000016 B38400 = 0000017 EXTA = B19200 EXTB = B38400 CSIZE = 0000060 CS5 = 0000000 CS6 = 0000020 CS7 = 0000040 CS8 = 0000060 CSTOPB = 0000100 CREAD = 0000200 PARENB = 0000400 PARODD = 0001000 HUPCL = 0002000 CLOCAL = 0004000 CBAUDEX = 0010000 BOTHER = 0010000 B57600 = 0010001 B115200 = 0010002 B230400 = 0010003 B460800 = 0010004 B500000 = 0010005 B576000 = 0010006 B921600 = 0010007 B1000000 = 0010010 B1152000 = 0010011 B1500000 = 0010012 B2000000 = 0010013 B2500000 = 0010014 B3000000 = 0010015 B3500000 = 0010016 B4000000 = 0010017 CIBAUD = 002003600000 CMSPAR = 010000000000 CRTSCTS = 020000000000 // IBSHIFT is the shift from CBAUD to CIBAUD. IBSHIFT = 16 ) // Local flags. const ( ISIG = 0000001 ICANON = 0000002 XCASE = 0000004 ECHO = 0000010 ECHOE = 0000020 ECHOK = 0000040 ECHONL = 0000100 NOFLSH = 0000200 TOSTOP = 0000400 ECHOCTL = 0001000 ECHOPRT = 0002000 ECHOKE = 0004000 FLUSHO = 0010000 PENDIN = 0040000 IEXTEN = 0100000 EXTPROC = 0200000 ) // Control Character indices. 
const ( VINTR = 0 VQUIT = 1 VERASE = 2 VKILL = 3 VEOF = 4 VTIME = 5 VMIN = 6 VSWTC = 7 VSTART = 8 VSTOP = 9 VSUSP = 10 VEOL = 11 VREPRINT = 12 VDISCARD = 13 VWERASE = 14 VLNEXT = 15 VEOL2 = 16 ) // ControlCharacter returns the termios-style control character for the passed // character. // // e.g., for Ctrl-C, i.e., ^C, call ControlCharacter('C'). // // Standard control characters are ASCII bytes 0 through 31. func ControlCharacter(c byte) uint8 { // A is 1, B is 2, etc. return uint8(c - 'A' + 1) } // DefaultControlCharacters is the default set of Termios control characters. var DefaultControlCharacters = [NumControlCharacters]uint8{ ControlCharacter('C'), // VINTR = ^C ControlCharacter('\\'), // VQUIT = ^\ '\x7f', // VERASE = DEL ControlCharacter('U'), // VKILL = ^U ControlCharacter('D'), // VEOF = ^D 0, // VTIME 1, // VMIN 0, // VSWTC ControlCharacter('Q'), // VSTART = ^Q ControlCharacter('S'), // VSTOP = ^S ControlCharacter('Z'), // VSUSP = ^Z 0, // VEOL ControlCharacter('R'), // VREPRINT = ^R ControlCharacter('O'), // VDISCARD = ^O ControlCharacter('W'), // VWERASE = ^W ControlCharacter('V'), // VLNEXT = ^V 0, // VEOL2 } // MasterTermios is the terminal configuration of the master end of a Unix98 // pseudoterminal. var MasterTermios = KernelTermios{ ControlFlags: B38400 | CS8 | CREAD, ControlCharacters: DefaultControlCharacters, InputSpeed: 38400, OutputSpeed: 38400, } // DefaultReplicaTermios is the default terminal configuration of the replica // end of a Unix98 pseudoterminal. var DefaultReplicaTermios = KernelTermios{ InputFlags: ICRNL | IXON, OutputFlags: OPOST | ONLCR, ControlFlags: B38400 | CS8 | CREAD, LocalFlags: ISIG | ICANON | ECHO | ECHOE | ECHOK | ECHOCTL | ECHOKE | IEXTEN, ControlCharacters: DefaultControlCharacters, InputSpeed: 38400, OutputSpeed: 38400, } // WindowSize corresponds to struct winsize defined in // include/uapi/asm-generic/termios.h. // // +stateify savable // +marshal type WindowSize struct { Rows uint16 Cols uint16 _ [4]byte // Padding for 2 unused shorts. }
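// Editor's note: illustrative sketch, not part of the original file. Line
// termination under DefaultReplicaTermios: '\n' and the VEOF character (^D
// by default) terminate a line, while an ordinary byte does not.
//
//	t := DefaultReplicaTermios
//	_ = t.IsTerminating([]byte{'\n'})                  // true
//	_ = t.IsTerminating([]byte{ControlCharacter('D')}) // true (VEOF)
//	_ = t.IsTerminating([]byte{'a'})                   // false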
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fspath

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/gohacks"
)

// Builder is similar to strings.Builder, but is used to produce pathnames
// given path components in reverse order (from leaf to root). This is useful
// in the common case where a filesystem is represented by a tree of named
// nodes, and the path to a given node must be produced by walking upward from
// that node to a given root.
type Builder struct {
	buf     []byte
	start   int
	needSep bool
}

// Reset resets the Builder to be empty.
func (b *Builder) Reset() {
	b.start = len(b.buf)
	b.needSep = false
}

// Len returns the number of accumulated bytes.
func (b *Builder) Len() int {
	return len(b.buf) - b.start
}

func (b *Builder) needToGrow(n int) bool {
	return b.start < n
}

func (b *Builder) grow(n int) {
	newLen := b.Len() + n
	var newCap int
	if len(b.buf) == 0 {
		newCap = 64 // arbitrary
	} else {
		newCap = 2 * len(b.buf)
	}
	for newCap < newLen {
		newCap *= 2
		if newCap == 0 {
			panic(fmt.Sprintf("required length (%d) causes buffer size to overflow", newLen))
		}
	}
	newBuf := make([]byte, newCap)
	copy(newBuf[newCap-b.Len():], b.buf[b.start:])
	b.start += newCap - len(b.buf)
	b.buf = newBuf
}

// PrependComponent prepends the given path component to b's buffer. A path
// separator is automatically inserted if appropriate.
func (b *Builder) PrependComponent(pc string) {
	if b.needSep {
		b.PrependByte('/')
	}
	b.PrependString(pc)
	b.needSep = true
}

// PrependString prepends the given string to b's buffer.
func (b *Builder) PrependString(str string) {
	if b.needToGrow(len(str)) {
		b.grow(len(str))
	}
	b.start -= len(str)
	copy(b.buf[b.start:], str)
}

// PrependByte prepends the given byte to b's buffer.
func (b *Builder) PrependByte(c byte) {
	if b.needToGrow(1) {
		b.grow(1)
	}
	b.start--
	b.buf[b.start] = c
}

// AppendString appends the given string to b's buffer.
func (b *Builder) AppendString(str string) {
	if b.needToGrow(len(str)) {
		b.grow(len(str))
	}
	oldStart := b.start
	b.start -= len(str)
	copy(b.buf[b.start:], b.buf[oldStart:])
	copy(b.buf[len(b.buf)-len(str):], str)
}

// String returns the accumulated string. No other methods should be called
// after String.
func (b *Builder) String() string {
	return gohacks.StringFromImmutableBytes(b.buf[b.start:])
}
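// Editor's note: illustrative sketch, not part of the original file. Building
// "/usr/local/bin" by walking from the leaf toward the root, the access
// pattern Builder is designed for.
//
//	var b Builder
//	b.PrependComponent("bin")
//	b.PrependComponent("local")
//	b.PrependComponent("usr")
//	b.PrependByte('/')
//	_ = b.String() // "/usr/local/bin"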
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
	"gvisor.dev/gvisor/pkg/usermem"
)

// Restartable sequences.
//
// We support two different APIs for restartable sequences.
//
//  1. The upstream interface added in v4.18.
//  2. The interface described in https://lwn.net/Articles/650333/.
//
// Throughout this file and other parts of the kernel, the latter is referred
// to as "old rseq". This interface was never merged upstream, but is
// supported for a limited set of applications that use it regardless.

// OldRSeqCriticalRegion describes an old rseq critical region.
//
// +stateify savable
type OldRSeqCriticalRegion struct {
	// When a task in this thread group has its CPU preempted (as defined by
	// platform.ErrContextCPUPreempted) or has a signal delivered to an
	// application handler while its instruction pointer is in
	// CriticalSection, set the instruction pointer to Restart and application
	// register r10 (on amd64) to the former instruction pointer.
	CriticalSection hostarch.AddrRange
	Restart         hostarch.Addr
}

// RSeqAvailable returns true if t supports (old and new) restartable
// sequences.
func (t *Task) RSeqAvailable() bool {
	return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption()
}

// SetRSeq registers addr as this thread's rseq structure.
// // Preconditions: The caller must be running on the task goroutine. func (t *Task) SetRSeq(addr hostarch.Addr, length, signature uint32) error { if t.rseqAddr != 0 { if t.rseqAddr != addr { return linuxerr.EINVAL } if t.rseqSignature != signature { return linuxerr.EINVAL } return linuxerr.EBUSY } // rseq must be aligned and correctly sized. if addr&(linux.AlignOfRSeq-1) != 0 { return linuxerr.EINVAL } if length != linux.SizeOfRSeq { return linuxerr.EINVAL } if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok { return linuxerr.EFAULT } t.rseqAddr = addr t.rseqSignature = signature // Initialize the CPUID. // // Linux implicitly does this on return from userspace, where failure // would cause SIGSEGV. if err := t.rseqUpdateCPU(); err != nil { t.rseqAddr = 0 t.rseqSignature = 0 t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return linuxerr.EFAULT } return nil } // ClearRSeq unregisters addr as this thread's rseq structure. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) ClearRSeq(addr hostarch.Addr, length, signature uint32) error { if t.rseqAddr == 0 { return linuxerr.EINVAL } if t.rseqAddr != addr { return linuxerr.EINVAL } if length != linux.SizeOfRSeq { return linuxerr.EINVAL } if t.rseqSignature != signature { return linuxerr.EPERM } if err := t.rseqClearCPU(); err != nil { return err } t.rseqAddr = 0 t.rseqSignature = 0 if t.oldRSeqCPUAddr == 0 { // rseqCPU no longer needed. t.rseqCPU = -1 } return nil } // OldRSeqCriticalRegion returns a copy of t's thread group's current // old restartable sequence. func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion { return *t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) } // SetOldRSeqCriticalRegion replaces t's thread group's old restartable // sequence. // // Preconditions: t.RSeqAvailable() == true. func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error { // These checks are somewhat more lenient than in Linux, which (bizarrely) // requires r.CriticalSection to be non-empty and r.Restart to be // outside of r.CriticalSection, even if r.CriticalSection.Start == 0 // (which disables the critical region). if r.CriticalSection.Start == 0 { r.CriticalSection.End = 0 r.Restart = 0 t.tg.oldRSeqCritical.Store(&r) return nil } if r.CriticalSection.Start >= r.CriticalSection.End { return linuxerr.EINVAL } if r.CriticalSection.Contains(r.Restart) { return linuxerr.EINVAL } // TODO(jamieliu): check that r.CriticalSection and r.Restart are in // the application address range, for consistency with Linux. t.tg.oldRSeqCritical.Store(&r) return nil } // OldRSeqCPUAddr returns the address that old rseq will keep updated with t's // CPU number. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) OldRSeqCPUAddr() hostarch.Addr { return t.oldRSeqCPUAddr } // SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with // t's CPU number. // // Preconditions: // * t.RSeqAvailable() == true. // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. func (t *Task) SetOldRSeqCPUAddr(addr hostarch.Addr) error { t.oldRSeqCPUAddr = addr // Check that addr is writable. // // N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's // unfortunate, but unlikely in a correct program. 
if err := t.rseqUpdateCPU(); err != nil { t.oldRSeqCPUAddr = 0 return linuxerr.EINVAL // yes, EINVAL, not err or EFAULT } return nil } // Preconditions: // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. func (t *Task) rseqUpdateCPU() error { if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 { t.rseqCPU = -1 return nil } t.rseqCPU = int32(hostcpu.GetCPU()) // Update both CPUs, even if one fails. rerr := t.rseqCopyOutCPU() oerr := t.oldRSeqCopyOutCPU() if rerr != nil { return rerr } return oerr } // Preconditions: // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. func (t *Task) oldRSeqCopyOutCPU() error { if t.oldRSeqCPUAddr == 0 { return nil } buf := t.CopyScratchBuffer(4) hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) _, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf) return err } // Preconditions: // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. func (t *Task) rseqCopyOutCPU() error { if t.rseqAddr == 0 { return nil } buf := t.CopyScratchBuffer(8) // CPUIDStart and CPUID are the first two fields in linux.RSeq. hostarch.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart hostarch.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID // N.B. This write is not atomic, but since this occurs on the task // goroutine then as long as userspace uses a single-instruction read // it can't see an invalid value. _, err := t.CopyOutBytes(t.rseqAddr, buf) return err } // Preconditions: // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. func (t *Task) rseqClearCPU() error { buf := t.CopyScratchBuffer(8) // CPUIDStart and CPUID are the first two fields in linux.RSeq. hostarch.ByteOrder.PutUint32(buf, 0) // CPUIDStart hostarch.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID // N.B. This write is not atomic, but since this occurs on the task // goroutine then as long as userspace uses a single-instruction read // it can't see an invalid value. _, err := t.CopyOutBytes(t.rseqAddr, buf) return err } // rseqAddrInterrupt checks if IP is in a critical section, and aborts if so. // // This is a bit complex since both the RSeq and RSeqCriticalSection structs // are stored in userspace. So we must: // // 1. Copy in the address of RSeqCriticalSection from RSeq. // 2. Copy in RSeqCriticalSection itself. // 3. Validate critical section struct version, address range, abort address. // 4. Validate the abort signature (4 bytes preceding abort IP match expected // signature). // 5. Clear address of RSeqCriticalSection from RSeq. // 6. Finally, conditionally abort. // // See kernel/rseq.c:rseq_ip_fixup for reference. // // Preconditions: // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. func (t *Task) rseqAddrInterrupt() { if t.rseqAddr == 0 { return } critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection) if !ok { // SetRSeq should validate this. panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr)) } if t.Arch().Width() != 8 { // We only handle 64-bit for now. 
t.Debugf("Only 64-bit rseq supported.") t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } buf := t.CopyScratchBuffer(8) if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil { t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } critAddr := hostarch.Addr(hostarch.ByteOrder.Uint64(buf)) if critAddr == 0 { return } var cs linux.RSeqCriticalSection if _, err := cs.CopyIn(t, critAddr); err != nil { t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } if cs.Version != 0 { t.Debugf("Unknown version in %+v", cs) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } start := hostarch.Addr(cs.Start) critRange, ok := start.ToRange(cs.PostCommitOffset) if !ok { t.Debugf("Invalid start and offset in %+v", cs) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } abort := hostarch.Addr(cs.Abort) if critRange.Contains(abort) { t.Debugf("Abort in critical section in %+v", cs) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } // Verify signature. sigAddr := abort - linux.SizeOfRSeqSignature buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature) if _, err := t.CopyInBytes(sigAddr, buf); err != nil { t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } sig := hostarch.ByteOrder.Uint32(buf) if sig != t.rseqSignature { t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } // Clear the critical section address. // // NOTE(b/143949567): We don't support any rseq flags, so we always // restart if we are in the critical section, and thus *always* clear // critAddrAddr. if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{ AddressSpaceActive: true, }); err != nil { t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err) t.forceSignal(linux.SIGSEGV, false /* unconditional */) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) return } // Finally we can actually decide whether or not to restart. if !critRange.Contains(hostarch.Addr(t.Arch().IP())) { return } t.Arch().SetIP(uintptr(cs.Abort)) } // Preconditions: The caller must be running on the task goroutine. func (t *Task) oldRSeqInterrupt() { r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion) if ip := t.Arch().IP(); r.CriticalSection.Contains(hostarch.Addr(ip)) { t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart) t.Arch().SetIP(uintptr(r.Restart)) t.Arch().SetOldRSeqInterruptedIP(ip) } } // Preconditions: The caller must be running on the task goroutine. func (t *Task) rseqInterrupt() { t.rseqAddrInterrupt() t.oldRSeqInterrupt() }
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package unix

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/socket"
	"gvisor.dev/gvisor/pkg/sentry/socket/control"
	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// SocketVFS2 implements socket.SocketVFS2 (and by extension,
// vfs.FileDescriptionImpl) for Unix sockets.
//
// +stateify savable
type SocketVFS2 struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.LockFD

	socketVFS2Refs
	socketOpsCommon
}

var _ = socket.SocketVFS2(&SocketVFS2{})

// NewSockfsFile creates a new socket file in the global sockfs mount and
// returns a corresponding file description.
func NewSockfsFile(t *kernel.Task, ep transport.Endpoint, stype linux.SockType) (*vfs.FileDescription, *syserr.Error) {
	mnt := t.Kernel().SocketMount()
	d := sockfs.NewDentry(t, mnt)
	defer d.DecRef(t)

	fd, err := NewFileDescription(ep, stype, linux.O_RDWR, mnt, d, &vfs.FileLocks{})
	if err != nil {
		return nil, syserr.FromError(err)
	}
	return fd, nil
}

// NewFileDescription creates and returns a socket file description
// corresponding to the given mount and dentry.
func NewFileDescription(ep transport.Endpoint, stype linux.SockType, flags uint32, mnt *vfs.Mount, d *vfs.Dentry, locks *vfs.FileLocks) (*vfs.FileDescription, error) {
	// You can create AF_UNIX, SOCK_RAW sockets. They're the same as
	// SOCK_DGRAM and don't require CAP_NET_RAW.
	if stype == linux.SOCK_RAW {
		stype = linux.SOCK_DGRAM
	}

	sock := &SocketVFS2{
		socketOpsCommon: socketOpsCommon{
			ep:    ep,
			stype: stype,
		},
	}
	sock.InitRefs()
	sock.LockFD.Init(locks)
	vfsfd := &sock.vfsfd
	if err := vfsfd.Init(sock, flags, mnt, d, &vfs.FileDescriptionOptions{
		DenyPRead:         true,
		DenyPWrite:        true,
		UseDentryMetadata: true,
	}); err != nil {
		return nil, err
	}
	return vfsfd, nil
}

// DecRef implements RefCounter.DecRef.
func (s *SocketVFS2) DecRef(ctx context.Context) {
	s.socketVFS2Refs.DecRef(func() {
		kernel.KernelFromContext(ctx).DeleteSocketVFS2(&s.vfsfd)
		s.ep.Close(ctx)
		if s.abstractNamespace != nil {
			s.abstractNamespace.Remove(s.abstractName, s)
		}
	})
}

// Release implements vfs.FileDescriptionImpl.Release.
func (s *SocketVFS2) Release(ctx context.Context) {
	// Release only decrements a reference on s because s may be referenced in
	// the abstract socket namespace.
	s.DecRef(ctx)
}

// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// a transport.Endpoint.
func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outPtr, outLen)
}

// blockingAccept implements a blocking version of accept(2), that is, if no
// connections are ready to be accepted, it will block until one becomes ready.
func (s *SocketVFS2) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) {
	// Register for notifications.
	e, ch := waiter.NewChannelEntry(nil)
	s.socketOpsCommon.EventRegister(&e, waiter.ReadableEvents)
	defer s.socketOpsCommon.EventUnregister(&e)

	// Try to accept the connection; if it fails, then wait until we get a
	// notification.
	for {
		if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock {
			return ep, err
		}
		if err := t.Block(ch); err != nil {
			return nil, syserr.FromError(err)
		}
	}
}

// Accept implements the linux syscall accept(2) for sockets backed by
// a transport.Endpoint.
func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { var peerAddr *tcpip.FullAddress if peerRequested { peerAddr = &tcpip.FullAddress{} } ep, err := s.ep.Accept(peerAddr) if err != nil { if err != syserr.ErrWouldBlock || !blocking { return 0, nil, 0, err } var err *syserr.Error ep, err = s.blockingAccept(t, peerAddr) if err != nil { return 0, nil, 0, err } } ns, err := NewSockfsFile(t, ep, s.stype) if err != nil { return 0, nil, 0, err } defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { ns.SetStatusFlags(t, t.Credentials(), linux.SOCK_NONBLOCK) } var addr linux.SockAddr var addrLen uint32 if peerAddr != nil { addr, addrLen = socket.ConvertAddress(linux.AF_UNIX, *peerAddr) } fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, }) if e != nil { return 0, nil, 0, syserr.FromError(e) } t.Kernel().RecordSocketVFS2(ns) return fd, addr, addrLen, nil } // Bind implements the linux syscall bind(2) for unix sockets. func (s *SocketVFS2) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { p, e := extractPath(sockaddr) if e != nil { return e } bep, ok := s.ep.(transport.BoundEndpoint) if !ok { // This socket can't be bound. return syserr.ErrInvalidArgument } return s.ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(p)}, func() *syserr.Error { // Is it abstract? if p[0] == 0 { if t.IsNetworkNamespaced() { return syserr.ErrInvalidEndpointState } asn := t.AbstractSockets() name := p[1:] if err := asn.Bind(t, name, bep, s); err != nil { // syserr.ErrPortInUse corresponds to EADDRINUSE. return syserr.ErrPortInUse } s.abstractName = name s.abstractNamespace = asn } else { path := fspath.Parse(p) root := t.FSContext().RootDirectoryVFS2() defer root.DecRef(t) start := root relPath := !path.Absolute if relPath { start = t.FSContext().WorkingDirectoryVFS2() defer start.DecRef(t) } pop := vfs.PathOperation{ Root: root, Start: start, Path: path, } stat, err := s.vfsfd.Stat(t, vfs.StatOptions{Mask: linux.STATX_MODE}) if err != nil { return syserr.FromError(err) } err = t.Kernel().VFS().MknodAt(t, t.Credentials(), &pop, &vfs.MknodOptions{ // File permissions correspond to net/unix/af_unix.c:unix_bind. Mode: linux.FileMode(linux.S_IFSOCK | uint(stat.Mode)&^t.FSContext().Umask()), Endpoint: bep, }) if linuxerr.Equals(linuxerr.EEXIST, err) { return syserr.ErrAddressInUse } return syserr.FromError(err) } return nil }) } // Ioctl implements vfs.FileDescriptionImpl. func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { return netstack.Ioctl(ctx, s.ep, uio, args) } // PRead implements vfs.FileDescriptionImpl. func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, linuxerr.ESPIPE } // Read implements vfs.FileDescriptionImpl. func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { return 0, nil } r := &EndpointReader{ Ctx: ctx, Endpoint: s.ep, NumRights: 0, Peek: false, From: nil, } n, err := dst.CopyOutFrom(ctx, r) // Drop control messages. r.Control.Release(ctx) return n, err } // PWrite implements vfs.FileDescriptionImpl. 
func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return 0, linuxerr.ESPIPE } // Write implements vfs.FileDescriptionImpl. func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { return 0, linuxerr.EOPNOTSUPP } t := kernel.TaskFromContext(ctx) ctrl := control.New(t, s.ep, nil) if src.NumBytes() == 0 { nInt, err := s.ep.SendMsg(ctx, [][]byte{}, ctrl, nil) return int64(nInt), err.ToError() } return src.CopyInTo(ctx, &EndpointWriter{ Ctx: ctx, Endpoint: s.ep, Control: ctrl, To: nil, }) } // Readiness implements waiter.Waitable.Readiness. func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { return s.socketOpsCommon.Readiness(mask) } // EventRegister implements waiter.Waitable.EventRegister. func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { s.socketOpsCommon.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { s.socketOpsCommon.EventUnregister(e) } // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by // a transport.Endpoint. func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { return netstack.SetSockOpt(t, s, s.ep, level, name, optVal) } // providerVFS2 is a unix domain socket provider for VFS2. type providerVFS2 struct{} func (*providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { // Check arguments. if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { return nil, syserr.ErrProtocolNotSupported } // Create the endpoint and socket. var ep transport.Endpoint switch stype { case linux.SOCK_DGRAM, linux.SOCK_RAW: ep = transport.NewConnectionless(t) case linux.SOCK_SEQPACKET, linux.SOCK_STREAM: ep = transport.NewConnectioned(t, stype, t.Kernel()) default: return nil, syserr.ErrInvalidArgument } f, err := NewSockfsFile(t, ep, stype) if err != nil { ep.Close(t) return nil, err } return f, nil } // Pair creates a new pair of AF_UNIX connected sockets. func (*providerVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { // Check arguments. if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { return nil, nil, syserr.ErrProtocolNotSupported } switch stype { case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET, linux.SOCK_RAW: // Ok default: return nil, nil, syserr.ErrInvalidArgument } // Create the endpoints and sockets. ep1, ep2 := transport.NewPair(t, stype, t.Kernel()) s1, err := NewSockfsFile(t, ep1, stype) if err != nil { ep1.Close(t) ep2.Close(t) return nil, nil, err } s2, err := NewSockfsFile(t, ep2, stype) if err != nil { s1.DecRef(t) ep2.Close(t) return nil, nil, err } return s1, s2, nil }
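// The asymmetric cleanup in Pair above is deliberate: once NewSockfsFile
// succeeds for ep1, the file s1 owns ep1, so a later failure must release s1
// via DecRef (which closes ep1), while ep2 — not yet owned by a file — is
// closed directly. A minimal sketch of the same ownership-transfer pattern,
// with hypothetical names (newEndpointPair, newFile, file are stand-ins, not
// APIs from this package):
//
//	func pair() (*file, *file, error) {
//		ep1, ep2 := newEndpointPair()
//		f1, err := newFile(ep1)
//		if err != nil {
//			ep1.Close() // ep1 is still ours to close
//			ep2.Close()
//			return nil, nil, err
//		}
//		f2, err := newFile(ep2)
//		if err != nil {
//			f1.DecRef() // f1 now owns ep1 and closes it
//			ep2.Close() // ep2 is still ours to close
//			return nil, nil, err
//		}
//		return f1, f2, nil
//	}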
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package refsvfs2

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/log"
	refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
	"gvisor.dev/gvisor/pkg/sync"
)

var (
	// liveObjects is a global map of reference-counted objects. Objects are
	// inserted when leak check is enabled, and they are removed when they are
	// destroyed. It is protected by liveObjectsMu.
	liveObjects   map[CheckedObject]struct{}
	liveObjectsMu sync.Mutex
)

// CheckedObject represents a reference-counted object with an informative
// leak detection message.
type CheckedObject interface {
	// RefType is the type of the reference-counted object.
	RefType() string

	// LeakMessage supplies a warning to be printed upon leak detection.
	LeakMessage() string

	// LogRefs indicates whether reference-related events should be logged.
	LogRefs() bool
}

func init() {
	liveObjects = make(map[CheckedObject]struct{})
}

// leakCheckEnabled returns whether leak checking is enabled. The following
// functions should only be called if it returns true.
func leakCheckEnabled() bool {
	return refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking
}

// Register adds obj to the live object map.
func Register(obj CheckedObject) {
	if leakCheckEnabled() {
		liveObjectsMu.Lock()
		if _, ok := liveObjects[obj]; ok {
			panic(fmt.Sprintf("Unexpected entry in leak checking map: reference %p already added", obj))
		}
		liveObjects[obj] = struct{}{}
		liveObjectsMu.Unlock()
		if leakCheckEnabled() && obj.LogRefs() {
			logEvent(obj, "registered")
		}
	}
}

// Unregister removes obj from the live object map.
func Unregister(obj CheckedObject) {
	if leakCheckEnabled() {
		liveObjectsMu.Lock()
		defer liveObjectsMu.Unlock()
		if _, ok := liveObjects[obj]; !ok {
			panic(fmt.Sprintf("Expected to find entry in leak checking map for reference %p", obj))
		}
		delete(liveObjects, obj)
		if leakCheckEnabled() && obj.LogRefs() {
			logEvent(obj, "unregistered")
		}
	}
}

// LogIncRef logs a reference increment.
func LogIncRef(obj CheckedObject, refs int64) {
	if leakCheckEnabled() && obj.LogRefs() {
		logEvent(obj, fmt.Sprintf("IncRef to %d", refs))
	}
}

// LogTryIncRef logs a successful TryIncRef call.
func LogTryIncRef(obj CheckedObject, refs int64) {
	if leakCheckEnabled() && obj.LogRefs() {
		logEvent(obj, fmt.Sprintf("TryIncRef to %d", refs))
	}
}

// LogDecRef logs a reference decrement.
func LogDecRef(obj CheckedObject, refs int64) {
	if leakCheckEnabled() && obj.LogRefs() {
		logEvent(obj, fmt.Sprintf("DecRef to %d", refs))
	}
}

// logEvent logs a message for the given reference-counted object.
// // obj.LogRefs() should be checked before calling logEvent, in order to avoid // calling any text processing needed to evaluate msg. func logEvent(obj CheckedObject, msg string) { log.Infof("[%s %p] %s:\n%s", obj.RefType(), obj, msg, refs_vfs1.FormatStack(refs_vfs1.RecordStack())) } // checkOnce makes sure that leak checking is only done once. DoLeakCheck is // called from multiple places (which may overlap) to cover different sandbox // exit scenarios. var checkOnce sync.Once // DoLeakCheck iterates through the live object map and logs a message for each // object. It is called once no reference-counted objects should be reachable // anymore, at which point anything left in the map is considered a leak. func DoLeakCheck() { if leakCheckEnabled() { checkOnce.Do(func() { liveObjectsMu.Lock() defer liveObjectsMu.Unlock() leaked := len(liveObjects) if leaked > 0 { msg := fmt.Sprintf("Leak checking detected %d leaked objects:\n", leaked) for obj := range liveObjects { msg += obj.LeakMessage() + "\n" } log.Warningf(msg) } }) } }
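// A minimal sketch of a type participating in this leak checker (the type
// below is hypothetical, not part of this package):
//
//	type objRefs struct{ refCount int64 }
//
//	func (r *objRefs) RefType() string     { return "objRefs" }
//	func (r *objRefs) LeakMessage() string { return fmt.Sprintf("[objRefs %p] leaked", r) }
//	func (r *objRefs) LogRefs() bool       { return false }
//
//	// Lifecycle: Register(r) at construction, LogIncRef/LogDecRef around
//	// refcount changes, Unregister(r) on the final DecRef, and DoLeakCheck()
//	// once at sandbox exit.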
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"time"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sentry/unimpl"
	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
)

// Deadline implements context.Context.Deadline.
func (t *Task) Deadline() (time.Time, bool) {
	return time.Time{}, false
}

// Done implements context.Context.Done.
func (t *Task) Done() <-chan struct{} {
	return nil
}

// Err implements context.Context.Err.
func (t *Task) Err() error {
	return nil
}

// Value implements context.Context.Value.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) Value(key interface{}) interface{} {
	// This function is very hot; skip this check outside of +race builds.
if sync.RaceEnabled { t.assertTaskGoroutine() } return t.contextValue(key, true /* isTaskGoroutine */) } func (t *Task) contextValue(key interface{}, isTaskGoroutine bool) interface{} { switch key { case CtxCanTrace: return t.CanTrace case CtxKernel: return t.k case CtxPIDNamespace: return t.tg.pidns case CtxUTSNamespace: if !isTaskGoroutine { t.mu.Lock() defer t.mu.Unlock() } return t.utsns case CtxIPCNamespace: if !isTaskGoroutine { t.mu.Lock() defer t.mu.Unlock() } ipcns := t.ipcns ipcns.IncRef() return ipcns case CtxTask: return t case auth.CtxCredentials: return t.creds.Load() case context.CtxThreadGroupID: return int32(t.tg.ID()) case fs.CtxRoot: if !isTaskGoroutine { t.mu.Lock() defer t.mu.Unlock() } return t.fsContext.RootDirectory() case vfs.CtxRoot: if !isTaskGoroutine { t.mu.Lock() defer t.mu.Unlock() } return t.fsContext.RootDirectoryVFS2() case vfs.CtxMountNamespace: if !isTaskGoroutine { t.mu.Lock() defer t.mu.Unlock() } t.mountNamespaceVFS2.IncRef() return t.mountNamespaceVFS2 case fs.CtxDirentCacheLimiter: return t.k.DirentCacheLimiter case inet.CtxStack: return t.NetworkContext() case ktime.CtxRealtimeClock: return t.k.RealtimeClock() case limits.CtxLimits: return t.tg.limits case linux.CtxSignalNoInfoFunc: return func(sig linux.Signal) error { return t.SendSignal(SignalInfoNoInfo(sig, t, t)) } case pgalloc.CtxMemoryFile: return t.k.mf case pgalloc.CtxMemoryFileProvider: return t.k case platform.CtxPlatform: return t.k case uniqueid.CtxGlobalUniqueID: return t.k.UniqueID() case uniqueid.CtxGlobalUniqueIDProvider: return t.k case uniqueid.CtxInotifyCookie: return t.k.GenerateInotifyCookie() case unimpl.CtxEvents: return t.k default: return nil } } // taskAsyncContext implements context.Context for a goroutine that performs // work on behalf of a Task, but is not the task goroutine. type taskAsyncContext struct { context.NoopSleeper t *Task } // AsyncContext returns a context.Context representing t. The returned // context.Context is intended for use by goroutines other than t's task // goroutine; for example, signal delivery to t will not interrupt goroutines // that are blocking using the returned context.Context. func (t *Task) AsyncContext() context.Context { return taskAsyncContext{t: t} } // Debugf implements log.Logger.Debugf. func (ctx taskAsyncContext) Debugf(format string, v ...interface{}) { ctx.t.Debugf(format, v...) } // Infof implements log.Logger.Infof. func (ctx taskAsyncContext) Infof(format string, v ...interface{}) { ctx.t.Infof(format, v...) } // Warningf implements log.Logger.Warningf. func (ctx taskAsyncContext) Warningf(format string, v ...interface{}) { ctx.t.Warningf(format, v...) } // IsLogging implements log.Logger.IsLogging. func (ctx taskAsyncContext) IsLogging(level log.Level) bool { return ctx.t.IsLogging(level) } // Deadline implements context.Context.Deadline. func (ctx taskAsyncContext) Deadline() (time.Time, bool) { return time.Time{}, false } // Done implements context.Context.Done. func (ctx taskAsyncContext) Done() <-chan struct{} { return nil } // Err implements context.Context.Err. func (ctx taskAsyncContext) Err() error { return nil } // Value implements context.Context.Value. func (ctx taskAsyncContext) Value(key interface{}) interface{} { return ctx.t.contextValue(key, false /* isTaskGoroutine */) }
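// A minimal sketch of the intended use of AsyncContext: a helper goroutine
// doing work on a task's behalf derives context values through the async
// context rather than using the Task directly, so that contextValue takes
// t.mu for the fields that need it (the helper function below is
// hypothetical):
//
//	func doAsyncWork(t *Task) {
//		ctx := t.AsyncContext()
//		go func() {
//			// Safe off the task goroutine: lookups such as
//			// vfs.CtxMountNamespace lock t.mu because isTaskGoroutine
//			// is false here.
//			k := KernelFromContext(ctx)
//			_ = k
//		}()
//	}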
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"time"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

const (
	// wcDelayedACKTimeout is the recommended maximum delayed ACK timer
	// value as defined in the RFC. It stands for worst case delayed ACK
	// timer (WCDelAckT). When FlightSize is 1, PTO is inflated by
	// WCDelAckT time to compensate for a potential long delayed ACK timer
	// at the receiver.
	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.
	wcDelayedACKTimeout = 200 * time.Millisecond

	// tcpRACKRecoveryThreshold is the number of loss recoveries for which
	// the reorder window is inflated and after that the reorder window is
	// reset to its initial value of minRTT/4.
	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2.
	tcpRACKRecoveryThreshold = 16
)

// RACK is a loss detection algorithm used in TCP to detect packet loss and
// reordering using transmission timestamps of the packets instead of packet
// or sequence counts. To use RACK, SACK should be enabled on the connection.

// rackControl stores the rack related fields.
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-6.1
//
// +stateify savable
type rackControl struct {
	stack.TCPRACKState

	// exitedRecovery indicates if the connection is exiting loss recovery.
	// This flag is set if the sender is leaving the recovery after
	// receiving an ACK and is reset during updating of reorder window.
	exitedRecovery bool

	// minRTT is the estimated minimum RTT of the connection.
	minRTT time.Duration

	// tlpRxtOut indicates whether there is an unacknowledged
	// TLP retransmission.
	tlpRxtOut bool

	// tlpHighRxt is the value of sender.sndNxt at the time of sending
	// a TLP retransmission.
	tlpHighRxt seqnum.Value

	// snd is a reference to the sender.
	snd *sender
}

// init initializes RACK specific fields.
func (rc *rackControl) init(snd *sender, iss seqnum.Value) {
	rc.FACK = iss
	rc.ReoWndIncr = 1
	rc.snd = snd
}

// update will update the RACK related fields when an ACK has been received.
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-09#section-6.2
func (rc *rackControl) update(seg *segment, ackSeg *segment) {
	rtt := rc.snd.ep.stack.Clock().NowMonotonic().Sub(seg.xmitTime)
	tsOffset := rc.snd.ep.TSOffset

	// If the ACK is for a retransmitted packet, do not update if it is a
	// spurious inference, as determined by the checks below:
	// 1. When the Timestamping option is available, if the TSVal is less
	// than the transmit time of the most recent retransmitted packet.
	// 2. When the RTT calculated for the packet is less than the smoothed
	// RTT for the connection.
	// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
	// step 2
	if seg.xmitCount > 1 {
		if ackSeg.parsedOptions.TS && ackSeg.parsedOptions.TSEcr != 0 {
			if ackSeg.parsedOptions.TSEcr < tcpTimeStamp(seg.xmitTime, tsOffset) {
				return
			}
		}
		if rtt < rc.minRTT {
			return
		}
	}

	rc.RTT = rtt

	// The sender can either track a simple global minimum of all RTT
	// measurements from the connection, or a windowed min-filtered value
	// of recent RTT measurements. This implementation keeps track of the
	// simple global minimum of all RTTs for the connection.
	if rtt < rc.minRTT || rc.minRTT == 0 {
		rc.minRTT = rtt
	}

	// Update rc.xmitTime and rc.endSequence to the transmit time and
	// ending sequence number of the packet which has been acknowledged
	// most recently.
	endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size()))
	if rc.XmitTime.Before(seg.xmitTime) ||
		(seg.xmitTime == rc.XmitTime && rc.EndSequence.LessThan(endSeq)) {
		rc.XmitTime = seg.xmitTime
		rc.EndSequence = endSeq
	}
}

// detectReorder detects if packet reordering has been observed.
// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2
// * Step 3: Detect data segment reordering.
// To detect reordering, the sender looks for original data segments being
// delivered out of order. To detect such cases, the sender tracks the
// highest sequence selectively or cumulatively acknowledged in the RACK.fack
// variable. The name "fack" stands for the most "Forward ACK" (this term is
// adopted from [FACK]). If a never retransmitted segment that's below
// RACK.fack is (selectively or cumulatively) acknowledged, it has been
// delivered out of order. The sender sets RACK.reord to TRUE if such segment
// is identified.
func (rc *rackControl) detectReorder(seg *segment) { endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) if rc.FACK.LessThan(endSeq) { rc.FACK = endSeq return } if endSeq.LessThan(rc.FACK) && seg.xmitCount == 1 { rc.Reord = true } } func (rc *rackControl) setDSACKSeen(dsackSeen bool) { rc.DSACKSeen = dsackSeen } // shouldSchedulePTO dictates whether we should schedule a PTO or not. // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. func (s *sender) shouldSchedulePTO() bool { // Schedule PTO only if RACK loss detection is enabled. return s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 && // The connection supports SACK. s.ep.SACKPermitted && // The connection is not in loss recovery. (s.state != tcpip.RTORecovery && s.state != tcpip.SACKRecovery) && // The connection has no SACKed sequences in the SACK scoreboard. s.ep.scoreboard.Sacked() == 0 } // schedulePTO schedules the probe timeout as defined in // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. func (s *sender) schedulePTO() { pto := time.Second s.rtt.Lock() if s.rtt.TCPRTTState.SRTTInited && s.rtt.TCPRTTState.SRTT > 0 { pto = s.rtt.TCPRTTState.SRTT * 2 if s.Outstanding == 1 { pto += wcDelayedACKTimeout } } s.rtt.Unlock() now := s.ep.stack.Clock().NowMonotonic() if s.resendTimer.enabled() { if now.Add(pto).After(s.resendTimer.target) { pto = s.resendTimer.target.Sub(now) } s.resendTimer.disable() } s.probeTimer.enable(pto) } // probeTimerExpired is the same as TLP_send_probe() as defined in // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.2. func (s *sender) probeTimerExpired() tcpip.Error { if !s.probeTimer.checkExpiration() { return nil } var dataSent bool if s.writeNext != nil && s.writeNext.xmitCount == 0 && s.Outstanding < s.SndCwnd { dataSent = s.maybeSendSegment(s.writeNext, int(s.ep.scoreboard.SMSS()), s.SndUna.Add(s.SndWnd)) if dataSent { s.Outstanding += s.pCount(s.writeNext, s.MaxPayloadSize) s.writeNext = s.writeNext.Next() } } if !dataSent && !s.rc.tlpRxtOut { var highestSeqXmit *segment for highestSeqXmit = s.writeList.Front(); highestSeqXmit != nil; highestSeqXmit = highestSeqXmit.Next() { if highestSeqXmit.xmitCount == 0 { // Nothing in writeList is transmitted, no need to send a probe. highestSeqXmit = nil break } if highestSeqXmit.Next() == nil || highestSeqXmit.Next().xmitCount == 0 { // Either everything in writeList has been transmitted or the next // sequence has not been transmitted. Either way this is the highest // sequence segment that was transmitted. break } } if highestSeqXmit != nil { dataSent = s.maybeSendSegment(highestSeqXmit, int(s.ep.scoreboard.SMSS()), s.SndUna.Add(s.SndWnd)) if dataSent { s.rc.tlpRxtOut = true s.rc.tlpHighRxt = s.SndNxt } } } // Whether or not the probe was sent, the sender must arm the resend timer, // not the probe timer. This ensures that the sender does not send repeated, // back-to-back tail loss probes. s.postXmit(dataSent, false /* shouldScheduleProbe */) return nil } // detectTLPRecovery detects if recovery was accomplished by the loss probes // and updates TLP state accordingly. // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.3. func (s *sender) detectTLPRecovery(ack seqnum.Value, rcvdSeg *segment) { if !(s.ep.SACKPermitted && s.rc.tlpRxtOut) { return } // Step 1. 
if s.isDupAck(rcvdSeg) && ack == s.rc.tlpHighRxt { var sbAboveTLPHighRxt bool for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { if s.rc.tlpHighRxt.LessThan(sb.End) { sbAboveTLPHighRxt = true break } } if !sbAboveTLPHighRxt { // TLP episode is complete. s.rc.tlpRxtOut = false } } if s.rc.tlpRxtOut && s.rc.tlpHighRxt.LessThanEq(ack) { // TLP episode is complete. s.rc.tlpRxtOut = false if !checkDSACK(rcvdSeg) { // Step 2. Either the original packet or the retransmission (in the // form of a probe) was lost. Invoke a congestion control response // equivalent to fast recovery. s.cc.HandleLossDetected() s.enterRecovery() s.leaveRecovery() } } } // updateRACKReorderWindow updates the reorder window. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 // * Step 4: Update RACK reordering window // To handle the prevalent small degree of reordering, RACK.reo_wnd serves as // an allowance for settling time before marking a packet lost. RACK starts // initially with a conservative window of min_RTT/4. If no reordering has // been observed RACK uses reo_wnd of zero during loss recovery, in order to // retransmit quickly, or when the number of DUPACKs exceeds the classic // DUPACKthreshold. func (rc *rackControl) updateRACKReorderWindow() { dsackSeen := rc.DSACKSeen snd := rc.snd // React to DSACK once per round trip. // If SND.UNA < RACK.rtt_seq: // RACK.dsack = false if snd.SndUna.LessThan(rc.RTTSeq) { dsackSeen = false } // If RACK.dsack: // RACK.reo_wnd_incr += 1 // RACK.dsack = false // RACK.rtt_seq = SND.NXT // RACK.reo_wnd_persist = 16 if dsackSeen { rc.ReoWndIncr++ dsackSeen = false rc.RTTSeq = snd.SndNxt rc.ReoWndPersist = tcpRACKRecoveryThreshold } else if rc.exitedRecovery { // Else if exiting loss recovery: // RACK.reo_wnd_persist -= 1 // If RACK.reo_wnd_persist <= 0: // RACK.reo_wnd_incr = 1 rc.ReoWndPersist-- if rc.ReoWndPersist <= 0 { rc.ReoWndIncr = 1 } rc.exitedRecovery = false } // Reorder window is zero during loss recovery, or when the number of // DUPACKs exceeds the classic DUPACKthreshold. // If RACK.reord is FALSE: // If in loss recovery: (If in fast or timeout recovery) // RACK.reo_wnd = 0 // Return // Else if RACK.pkts_sacked >= RACK.dupthresh: // RACK.reo_wnd = 0 // return if !rc.Reord { if snd.state == tcpip.RTORecovery || snd.state == tcpip.SACKRecovery { rc.ReoWnd = 0 return } if snd.SackedOut >= nDupAckThreshold { rc.ReoWnd = 0 return } } // Calculate reorder window. // RACK.reo_wnd = RACK.min_RTT / 4 * RACK.reo_wnd_incr // RACK.reo_wnd = min(RACK.reo_wnd, SRTT) snd.rtt.Lock() srtt := snd.rtt.TCPRTTState.SRTT snd.rtt.Unlock() rc.ReoWnd = time.Duration((int64(rc.minRTT) / 4) * int64(rc.ReoWndIncr)) if srtt < rc.ReoWnd { rc.ReoWnd = srtt } } func (rc *rackControl) exitRecovery() { rc.exitedRecovery = true } // detectLoss marks the segment as lost if the reordering window has elapsed // and the ACK is not received. It will also arm the reorder timer. // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 Step 5. 
func (rc *rackControl) detectLoss(rcvTime tcpip.MonotonicTime) int { var timeout time.Duration numLost := 0 for seg := rc.snd.writeList.Front(); seg != nil && seg.xmitCount != 0; seg = seg.Next() { if rc.snd.ep.scoreboard.IsSACKED(seg.sackBlock()) { continue } if seg.lost && seg.xmitCount == 1 { numLost++ continue } endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) if seg.xmitTime.Before(rc.XmitTime) || (seg.xmitTime == rc.XmitTime && rc.EndSequence.LessThan(endSeq)) { timeRemaining := seg.xmitTime.Sub(rcvTime) + rc.RTT + rc.ReoWnd if timeRemaining <= 0 { seg.lost = true numLost++ } else if timeRemaining > timeout { timeout = timeRemaining } } } if timeout != 0 && !rc.snd.reorderTimer.enabled() { rc.snd.reorderTimer.enable(timeout) } return numLost } // reorderTimerExpired will retransmit the segments which have not been acked // before the reorder timer expired. func (rc *rackControl) reorderTimerExpired() tcpip.Error { // Check if the timer actually expired or if it's a spurious wake due // to a previously orphaned runtime timer. if !rc.snd.reorderTimer.checkExpiration() { return nil } numLost := rc.detectLoss(rc.snd.ep.stack.Clock().NowMonotonic()) if numLost == 0 { return nil } fastRetransmit := false if !rc.snd.FastRecovery.Active { rc.snd.cc.HandleLossDetected() rc.snd.enterRecovery() fastRetransmit = true } rc.DoRecovery(nil, fastRetransmit) return nil } // DoRecovery implements lossRecovery.DoRecovery. func (rc *rackControl) DoRecovery(_ *segment, fastRetransmit bool) { snd := rc.snd if fastRetransmit { snd.resendSegment() } var dataSent bool // Iterate the writeList and retransmit the segments which are marked // as lost by RACK. for seg := snd.writeList.Front(); seg != nil && seg.xmitCount > 0; seg = seg.Next() { if seg == snd.writeNext { break } if !seg.lost { continue } // Reset seg.lost as it is already SACKed. if snd.ep.scoreboard.IsSACKED(seg.sackBlock()) { seg.lost = false continue } // Check the congestion window after entering recovery. if snd.Outstanding >= snd.SndCwnd { break } if sent := snd.maybeSendSegment(seg, int(snd.ep.scoreboard.SMSS()), snd.SndUna.Add(snd.SndWnd)); !sent { break } dataSent = true snd.Outstanding += snd.pCount(seg, snd.MaxPayloadSize) } snd.postXmit(dataSent, true /* shouldScheduleProbe */) }
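// A short worked example of the reorder-window arithmetic in
// updateRACKReorderWindow above, with illustrative values: given
// minRTT = 40ms, ReoWndIncr = 2 (one DSACK round observed) and SRTT = 60ms,
// reo_wnd = min(40ms/4 * 2, SRTT) = 20ms, so detectLoss only marks a segment
// lost once it was sent more than RTT + 20ms before the latest ACK arrival:
//
//	const minRTT = 40 * time.Millisecond
//	const srtt = 60 * time.Millisecond
//	reoWndIncr := int64(2)
//	reoWnd := time.Duration((int64(minRTT) / 4) * reoWndIncr) // 20ms
//	if srtt < reoWnd {
//		reoWnd = srtt // capped at the smoothed RTT
//	}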
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package usage

import (
	"fmt"
	"os"
	"sync/atomic"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/bits"
	"gvisor.dev/gvisor/pkg/memutil"
	"gvisor.dev/gvisor/pkg/sync"
)

// MemoryKind represents a type of memory used by the application.
//
// For efficiency reasons, it is assumed that the Memory implementation is
// responsible for specific stats (documented below), and those may be reported
// in aggregate independently. See the platform.Memory interface as well as the
// control.Usage.Collect method for more information.
type MemoryKind int

const (
	// System represents miscellaneous system memory. This may include
	// memory that is in the process of being reclaimed, system caches,
	// page tables, swap, etc.
	//
	// This memory kind is backed by platform memory.
	System MemoryKind = iota

	// Anonymous represents anonymous application memory.
	//
	// This memory kind is backed by platform memory.
	Anonymous

	// PageCache represents memory allocated to back sandbox-visible files that
	// do not have a local fd. The contents of these files are buffered in
	// memory to support application mmaps.
	//
	// This memory kind is backed by platform memory.
	PageCache

	// Tmpfs represents memory used by the sandbox-visible tmpfs.
	//
	// This memory kind is backed by platform memory.
	Tmpfs

	// Ramdiskfs represents memory used by the ramdiskfs.
	//
	// This memory kind is backed by platform memory.
	Ramdiskfs

	// Mapped represents memory related to files which have a local fd on the
	// host, and thus can be directly mapped. Typically these are files backed
	// by gofers with donated-fd support. Note that this value may not track the
	// exact amount of memory used by mapping on the host, because we don't have
	// any visibility into the host kernel memory management. In particular,
	// once we map some part of a host file, the host kernel is free to
	// arbitrarily populate/decommit the pages, which it may do for various
	// reasons (e.g. host memory reclaim, NUMA balancing).
	//
	// This memory kind is backed by the host pagecache, via host mmaps.
	Mapped
)

// MemoryStats tracks application memory usage in bytes. All fields correspond
// to the memory category with the same name. This object is thread-safe if
// accessed through the provided methods. The public fields may be safely
// accessed directly on a copy of the object obtained from MemoryLocked.Copy().
type MemoryStats struct {
	System    uint64
	Anonymous uint64
	PageCache uint64
	Tmpfs     uint64
	// Lazily updated based on the value in RTMapped.
	Mapped    uint64
	Ramdiskfs uint64
}

// RTMemoryStats contains the memory usage values that need to be directly
// exposed through a shared memory file for real-time access. These are
// categories not backed by platform memory. For details about how this works,
// see the memory accounting docs.
//
// N.B. Please keep the struct in sync with the API. Notably, changes to this
// struct require a version bump and addition of compatibility logic in the
// control server. As a special case, adding fields without re-ordering
// existing ones does not require a version bump because the mapped page we
// use is initially zeroed. Any added field will be ignored by an older API
// and will be zero if read by a newer API.
type RTMemoryStats struct {
	RTMapped uint64
}

// MemoryLocked is Memory with access methods.
type MemoryLocked struct {
	mu sync.RWMutex
	// MemoryStats records the memory stats.
	MemoryStats
	// RTMemoryStats records the memory stats that need to be exposed through
	// shared page.
	*RTMemoryStats
	// File is the backing file storing the memory stats.
	File *os.File
}

// Init initializes global 'MemoryAccounting'.
func Init() error {
	const name = "memory-usage"
	fd, err := memutil.CreateMemFD(name, 0)
	if err != nil {
		return fmt.Errorf("error creating usage file: %v", err)
	}
	file := os.NewFile(uintptr(fd), name)
	if err := file.Truncate(int64(RTMemoryStatsSize)); err != nil {
		return fmt.Errorf("error truncating usage file: %v", err)
	}
	// Note: We rely on the returned page being initially zeroed. This will
	// always be the case for a newly mapped page from /dev/shm. If we obtain
	// the shared memory through some other means in the future, we may have to
	// explicitly zero the page.
	mmap, err := memutil.MapFile(0, RTMemoryStatsSize, unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED, file.Fd(), 0)
	if err != nil {
		return fmt.Errorf("error mapping usage file: %v", err)
	}

	MemoryAccounting = &MemoryLocked{
		File:          file,
		RTMemoryStats: RTMemoryStatsPointer(mmap),
	}
	return nil
}

// MemoryAccounting is the global memory stats.
//
// There is no need to save or restore the global memory accounting object,
// because individual frame kinds are saved and charged only when they become
// resident.
var MemoryAccounting *MemoryLocked

func (m *MemoryLocked) incLocked(val uint64, kind MemoryKind) {
	switch kind {
	case System:
		atomic.AddUint64(&m.System, val)
	case Anonymous:
		atomic.AddUint64(&m.Anonymous, val)
	case PageCache:
		atomic.AddUint64(&m.PageCache, val)
	case Mapped:
		atomic.AddUint64(&m.RTMapped, val)
	case Tmpfs:
		atomic.AddUint64(&m.Tmpfs, val)
	case Ramdiskfs:
		atomic.AddUint64(&m.Ramdiskfs, val)
	default:
		panic(fmt.Sprintf("invalid memory kind: %v", kind))
	}
}

// Inc adds an additional usage of 'val' bytes to memory category 'kind'.
//
// This method is thread-safe.
func (m *MemoryLocked) Inc(val uint64, kind MemoryKind) {
	m.mu.RLock()
	m.incLocked(val, kind)
	m.mu.RUnlock()
}

func (m *MemoryLocked) decLocked(val uint64, kind MemoryKind) {
	switch kind {
	case System:
		atomic.AddUint64(&m.System, ^(val - 1))
	case Anonymous:
		atomic.AddUint64(&m.Anonymous, ^(val - 1))
	case PageCache:
		atomic.AddUint64(&m.PageCache, ^(val - 1))
	case Mapped:
		atomic.AddUint64(&m.RTMapped, ^(val - 1))
	case Tmpfs:
		atomic.AddUint64(&m.Tmpfs, ^(val - 1))
	case Ramdiskfs:
		atomic.AddUint64(&m.Ramdiskfs, ^(val - 1))
	default:
		panic(fmt.Sprintf("invalid memory kind: %v", kind))
	}
}

// Dec removes a usage of 'val' bytes from memory category 'kind'.
//
// This method is thread-safe.
func (m *MemoryLocked) Dec(val uint64, kind MemoryKind) {
	m.mu.RLock()
	m.decLocked(val, kind)
	m.mu.RUnlock()
}

// Move moves a usage of 'val' bytes from 'from' to 'to'.
//
// This method is thread-safe.
func (m *MemoryLocked) Move(val uint64, to MemoryKind, from MemoryKind) {
	m.mu.RLock()
	// Just call decLocked and incLocked directly. We hold the RLock to
	// protect against concurrent callers to Total().
	m.decLocked(val, from)
	m.incLocked(val, to)
	m.mu.RUnlock()
}

// totalLocked returns a total usage.
//
// Precondition: must be called when locked.
func (m *MemoryLocked) totalLocked() (total uint64) {
	total += atomic.LoadUint64(&m.System)
	total += atomic.LoadUint64(&m.Anonymous)
	total += atomic.LoadUint64(&m.PageCache)
	total += atomic.LoadUint64(&m.RTMapped)
	total += atomic.LoadUint64(&m.Tmpfs)
	total += atomic.LoadUint64(&m.Ramdiskfs)
	return
}

// Total returns a total memory usage.
//
// This method is thread-safe.
func (m *MemoryLocked) Total() uint64 {
	m.mu.Lock()
	defer m.mu.Unlock()
	return m.totalLocked()
}

// Copy returns a copy of the structure with a total.
//
// This method is thread-safe.
func (m *MemoryLocked) Copy() (MemoryStats, uint64) {
	m.mu.Lock()
	defer m.mu.Unlock()
	ms := m.MemoryStats
	ms.Mapped = m.RTMapped
	return ms, m.totalLocked()
}

// These options control how much total memory is reported to the application.
// They may only be set before the application starts executing, and must not
// be modified.
var (
	// MinimumTotalMemoryBytes is the minimum reported total system memory.
	MinimumTotalMemoryBytes uint64 = 2 << 30 // 2 GB

	// MaximumTotalMemoryBytes is the maximum reported total system memory.
	// The 0 value indicates no maximum.
	MaximumTotalMemoryBytes uint64
)

// TotalMemory returns the "total usable memory" available.
//
// This number doesn't really have a true value so it's based on the following
// inputs and further bounded to be above MinimumTotalMemoryBytes and below
// MaximumTotalMemoryBytes.
//
// memSize should be the platform.Memory size reported by
// platform.Memory.TotalSize(); used is the total memory reported by
// MemoryLocked.Total().
func TotalMemory(memSize, used uint64) uint64 {
	if memSize < MinimumTotalMemoryBytes {
		memSize = MinimumTotalMemoryBytes
	}
	if memSize < used {
		memSize = used
		// Bump memSize to the next largest power of 2, if one exists, so
		// that MemFree isn't 0.
		if msb := bits.MostSignificantOne64(memSize); msb < 63 {
			memSize = uint64(1) << (uint(msb) + 1)
		}
	}
	if MaximumTotalMemoryBytes > 0 && memSize > MaximumTotalMemoryBytes {
		memSize = MaximumTotalMemoryBytes
	}
	return memSize
}
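// A minimal usage sketch from a caller's perspective (the values are
// illustrative; the 1 GiB memSize is a stand-in for the platform-reported
// total):
//
//	usage.MemoryAccounting.Inc(4096, usage.Anonymous)               // page charged
//	usage.MemoryAccounting.Move(4096, usage.Tmpfs, usage.Anonymous) // recategorized
//	usage.MemoryAccounting.Dec(4096, usage.Tmpfs)                   // page released
//
//	// Reported total: bounded below by MinimumTotalMemoryBytes (2 GB) and,
//	// if usage exceeds the platform size, bumped to the next power of two
//	// so MemFree never reads as zero.
//	total := usage.TotalMemory(1<<30, usage.MemoryAccounting.Total())
//	_ = total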
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package seqnum defines the types and methods for TCP sequence numbers such
// that they fit in 32-bit words and work properly when overflows occur.
package seqnum

// Value represents the value of a sequence number.
type Value uint32

// Size represents the size (length) of a sequence number window.
type Size uint32

// LessThan checks if v is before w, i.e., v < w.
func (v Value) LessThan(w Value) bool {
	return int32(v-w) < 0
}

// LessThanEq returns true if v == w or v is before w, i.e., v <= w.
func (v Value) LessThanEq(w Value) bool {
	if v == w {
		return true
	}
	return v.LessThan(w)
}

// InRange checks if v is in the range [a,b), i.e., a <= v < b.
func (v Value) InRange(a, b Value) bool {
	return v-a < b-a
}

// InWindow checks if v is in the window that starts at 'first' and spans
// 'size' sequence numbers.
func (v Value) InWindow(first Value, size Size) bool {
	return v.InRange(first, first.Add(size))
}

// Add calculates the sequence number following the [v, v+s) window.
func (v Value) Add(s Size) Value {
	return v + Value(s)
}

// Size calculates the size of the window defined by [v, w).
func (v Value) Size(w Value) Size {
	return Size(w - v)
}

// UpdateForward updates v such that it becomes v + s.
func (v *Value) UpdateForward(s Size) {
	*v += Value(s)
}
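// The arithmetic above relies on unsigned wraparound, so comparisons stay
// correct across the 2^32 boundary. A short worked example:
//
//	a := Value(0xFFFFFFF0)
//	b := a.Add(0x20)        // wraps to 0x00000010
//	_ = a.LessThan(b)       // true: int32(a-b) == -32 < 0
//	_ = b.InWindow(a, 0x40) // true: b-a == 0x20 < 0x40
//	_ = a.Size(b)           // Size(0x20), the width of [a, b)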
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "reflect" "unsafe" ) // optionsToArray converts a slice of capacity >= maxOptionSize to an array. // // optionsToArray panics if the capacity of options is smaller than // maxOptionSize. func optionsToArray(options []byte) *[maxOptionSize]byte { // Reslice to full capacity. options = options[0:maxOptionSize] return (*[maxOptionSize]byte)(unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&options)).Data)) }
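// Illustrative sketch, not part of the original source: since Go 1.17 the
// same conversion can be written without reflect.SliceHeader by converting a
// slice directly to an array pointer. The reslice still panics if
// cap(options) < maxOptionSize, preserving the documented precondition:
//
//	func optionsToArrayGo117(options []byte) *[maxOptionSize]byte {
//		// Reslice to maxOptionSize (panics if capacity is too small),
//		// then convert the slice to an array pointer.
//		return (*[maxOptionSize]byte)(options[0:maxOptionSize])
//	}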
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/usermem" ) // Futex returns t's futex manager. // // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) Futex() *futex.Manager { return t.image.fu } // SwapUint32 implements futex.Target.SwapUint32. func (t *Task) SwapUint32(addr hostarch.Addr, new uint32) (uint32, error) { return t.MemoryManager().SwapUint32(t, addr, new, usermem.IOOpts{ AddressSpaceActive: true, }) } // CompareAndSwapUint32 implements futex.Target.CompareAndSwapUint32. func (t *Task) CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error) { return t.MemoryManager().CompareAndSwapUint32(t, addr, old, new, usermem.IOOpts{ AddressSpaceActive: true, }) } // LoadUint32 implements futex.Target.LoadUint32. func (t *Task) LoadUint32(addr hostarch.Addr) (uint32, error) { return t.MemoryManager().LoadUint32(t, addr, usermem.IOOpts{ AddressSpaceActive: true, }) } // GetSharedKey implements futex.Target.GetSharedKey. func (t *Task) GetSharedKey(addr hostarch.Addr) (futex.Key, error) { return t.MemoryManager().GetSharedFutexKey(t, addr) } // GetRobustList gets the robust futex list for the task. func (t *Task) GetRobustList() hostarch.Addr { t.mu.Lock() addr := t.robustList t.mu.Unlock() return addr } // SetRobustList sets the robust futex list for the task. func (t *Task) SetRobustList(addr hostarch.Addr) { t.mu.Lock() t.robustList = addr t.mu.Unlock() } // exitRobustList walks the robust futex list, marking locks dead and notifying // wakers. It corresponds to Linux's exit_robust_list(). Following Linux, // errors are silently ignored. func (t *Task) exitRobustList() { t.mu.Lock() addr := t.robustList t.robustList = 0 t.mu.Unlock() if addr == 0 { return } var rl linux.RobustListHead if _, err := rl.CopyIn(t, hostarch.Addr(addr)); err != nil { return } next := primitive.Uint64(rl.List) done := 0 var pendingLockAddr hostarch.Addr if rl.ListOpPending != 0 { pendingLockAddr = hostarch.Addr(rl.ListOpPending + rl.FutexOffset) } // Wake up normal elements. 
for hostarch.Addr(next) != addr { // We traverse to the next element of the list before we // actually wake anything. This prevents the race where waking // this futex causes a modification of the list. thisLockAddr := hostarch.Addr(uint64(next) + rl.FutexOffset) // Try to decode the next element in the list before waking the // current futex. But don't check the error until after we've // woken the current futex. Linux does it in this order, too. _, nextErr := next.CopyIn(t, hostarch.Addr(next)) // Wake up the current futex if it's not pending. if thisLockAddr != pendingLockAddr { t.wakeRobustListOne(thisLockAddr) } // If there was an error copying the next futex, we must bail. if nextErr != nil { break } // This is a user structure, so it could be a massive list, or // even contain a loop if they are trying to mess with us. We // cap traversal to prevent that. done++ if done >= linux.ROBUST_LIST_LIMIT { break } } // Is there a pending entry to wake? if pendingLockAddr != 0 { t.wakeRobustListOne(pendingLockAddr) } } // wakeRobustListOne wakes a single futex from the robust list. func (t *Task) wakeRobustListOne(addr hostarch.Addr) { // Bit 0 in address signals PI futex. pi := addr&1 == 1 addr = addr &^ 1 // Load the futex. f, err := t.LoadUint32(addr) if err != nil { // Can't read this single value? Ignore the problem. // We can wake the other futexes in the list. return } tid := uint32(t.ThreadID()) for { // Is this held by someone else? if f&linux.FUTEX_TID_MASK != tid { return } // This thread is dying and it's holding this futex. We need to // set the owner died bit and wake up any waiters. newF := (f & linux.FUTEX_WAITERS) | linux.FUTEX_OWNER_DIED if curF, err := t.CompareAndSwapUint32(addr, f, newF); err != nil { return } else if curF != f { // Futex changed out from under us. Try again... f = curF continue } // Wake waiters if there are any. if f&linux.FUTEX_WAITERS != 0 { private := f&linux.FUTEX_PRIVATE_FLAG != 0 if pi { t.Futex().UnlockPI(t, addr, tid, private) return } t.Futex().Wake(t, addr, private, linux.FUTEX_BITSET_MATCH_ANY, 1) } // Done. return } }
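// Illustrative sketch, not part of the original source: the user-space layout
// that exitRobustList walks, in the terms of Linux's uapi futex.h. Each list
// node is embedded in a user-space lock, and the futex word sits at a fixed
// signed offset from the node:
//
//	struct robust_list      { struct robust_list *next; };
//	struct robust_list_head {
//		struct robust_list  list;            // circular list; traversal stops back at &head
//		long                futex_offset;    // rl.FutexOffset above
//		struct robust_list *list_op_pending; // entry being acquired/released at exit time
//	};
//
// So for a node at address next, the futex word to mark dead is at
// hostarch.Addr(next) + rl.FutexOffset, which is thisLockAddr in the loop above.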
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "encoding/binary" "gvisor.dev/gvisor/pkg/tcpip" ) const ( // ARPProtocolNumber is the ARP network protocol number. ARPProtocolNumber tcpip.NetworkProtocolNumber = 0x0806 // ARPSize is the size of an IPv4-over-Ethernet ARP packet. ARPSize = 28 ) // ARPHardwareType is the hardware type for LinkEndpoint in an ARP header. type ARPHardwareType uint16 // Typical ARP HardwareType values. Some of the constants have to be specific // values as they are egressed on the wire in the HTYPE field of an ARP header. const ( ARPHardwareNone ARPHardwareType = 0 // ARPHardwareEther specifically is the HTYPE for Ethernet as specified // in the IANA list here: // // https://www.iana.org/assignments/arp-parameters/arp-parameters.xhtml#arp-parameters-2 ARPHardwareEther ARPHardwareType = 1 ARPHardwareLoopback ARPHardwareType = 2 ) // ARPOp is an ARP opcode. type ARPOp uint16 // Typical ARP opcodes defined in RFC 826. const ( ARPRequest ARPOp = 1 ARPReply ARPOp = 2 ) // ARP is an ARP packet stored in a byte array as described in RFC 826. type ARP []byte const ( hTypeOffset = 0 protocolOffset = 2 haAddressSizeOffset = 4 protoAddressSizeOffset = 5 opCodeOffset = 6 senderHAAddressOffset = 8 senderProtocolAddressOffset = senderHAAddressOffset + EthernetAddressSize targetHAAddressOffset = senderProtocolAddressOffset + IPv4AddressSize targetProtocolAddressOffset = targetHAAddressOffset + EthernetAddressSize ) func (a ARP) hardwareAddressType() ARPHardwareType { return ARPHardwareType(binary.BigEndian.Uint16(a[hTypeOffset:])) } func (a ARP) protocolAddressSpace() uint16 { return binary.BigEndian.Uint16(a[protocolOffset:]) } func (a ARP) hardwareAddressSize() int { return int(a[haAddressSizeOffset]) } func (a ARP) protocolAddressSize() int { return int(a[protoAddressSizeOffset]) } // Op is the ARP opcode. func (a ARP) Op() ARPOp { return ARPOp(binary.BigEndian.Uint16(a[opCodeOffset:])) } // SetOp sets the ARP opcode. func (a ARP) SetOp(op ARPOp) { binary.BigEndian.PutUint16(a[opCodeOffset:], uint16(op)) } // SetIPv4OverEthernet configures the ARP packet for IPv4-over-Ethernet. func (a ARP) SetIPv4OverEthernet() { binary.BigEndian.PutUint16(a[hTypeOffset:], uint16(ARPHardwareEther)) binary.BigEndian.PutUint16(a[protocolOffset:], uint16(IPv4ProtocolNumber)) a[haAddressSizeOffset] = EthernetAddressSize a[protoAddressSizeOffset] = uint8(IPv4AddressSize) } // HardwareAddressSender is the link address of the sender. // It is a view onto the ARP packet so it can be used to set the value. 
func (a ARP) HardwareAddressSender() []byte { return a[senderHAAddressOffset : senderHAAddressOffset+EthernetAddressSize] } // ProtocolAddressSender is the protocol address of the sender. // It is a view onto the ARP packet so it can be used to set the value. func (a ARP) ProtocolAddressSender() []byte { return a[senderProtocolAddressOffset : senderProtocolAddressOffset+IPv4AddressSize] } // HardwareAddressTarget is the link address of the target. // It is a view onto the ARP packet so it can be used to set the value. func (a ARP) HardwareAddressTarget() []byte { return a[targetHAAddressOffset : targetHAAddressOffset+EthernetAddressSize] } // ProtocolAddressTarget is the protocol address of the target. // It is a view onto the ARP packet so it can be used to set the value. func (a ARP) ProtocolAddressTarget() []byte { return a[targetProtocolAddressOffset : targetProtocolAddressOffset+IPv4AddressSize] } // IsValid reports whether this is an ARP packet for IPv4 over Ethernet. func (a ARP) IsValid() bool { if len(a) < ARPSize { return false } return a.hardwareAddressType() == ARPHardwareEther && a.protocolAddressSpace() == uint16(IPv4ProtocolNumber) && a.hardwareAddressSize() == EthernetAddressSize && a.protocolAddressSize() == IPv4AddressSize }
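// Illustrative sketch, not part of the original source: composing an ARP
// request with the accessors above, as seen from outside this package.
// localMAC, localIPv4 and targetIPv4 are hypothetical 6- and 4-byte slices.
//
//	buf := make([]byte, header.ARPSize)
//	a := header.ARP(buf)
//	a.SetIPv4OverEthernet() // fills HTYPE, PTYPE and the two size fields
//	a.SetOp(header.ARPRequest)
//	copy(a.HardwareAddressSender(), localMAC)
//	copy(a.ProtocolAddressSender(), localIPv4)
//	copy(a.ProtocolAddressTarget(), targetIPv4)
//	// HardwareAddressTarget is left zeroed; it is what the reply supplies.
//	_ = a.IsValid() // true: SetIPv4OverEthernet set every field IsValid checks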
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package arch import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/usermem" ) // Stack is a simple wrapper around a hostarch.IO and an address. Stack // implements marshal.CopyContext, and marshallable values can be pushed or // popped from the stack through the marshal.Marshallable interface. // // Stack is not thread-safe. type Stack struct { // Our arch info. // We use this for automatic Native conversion of hostarch.Addrs during // Push() and Pop(). Arch Context // The interface used to actually copy user memory. IO usermem.IO // Our current stack bottom. Bottom hostarch.Addr // Scratch buffer used for marshalling to avoid having to repeatedly // allocate scratch memory. scratchBuf []byte } // scratchBufLen is the default length of Stack.scratchBuf. The // largest structs the stack regularly serializes are linux.SignalInfo // and arch.UContext64. We'll set the default size as the larger of // the two, arch.UContext64. var scratchBufLen = (*UContext64)(nil).SizeBytes() // CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer. func (s *Stack) CopyScratchBuffer(size int) []byte { if len(s.scratchBuf) < size { s.scratchBuf = make([]byte, size) } return s.scratchBuf[:size] } // StackBottomMagic is the special address callers must pass to all stack // marshalling operations to cause the src/dst address to be computed based on // the current end of the stack. const StackBottomMagic = ^hostarch.Addr(0) // hostarch.Addr(-1) // CopyOutBytes implements marshal.CopyContext.CopyOutBytes. CopyOutBytes // computes an appropriate address based on the current end of the // stack. Callers pass the sentinel address StackBottomMagic to marshal methods // to indicate this. 
func (s *Stack) CopyOutBytes(sentinel hostarch.Addr, b []byte) (int, error) { if sentinel != StackBottomMagic { panic("Attempted to copy out to stack with absolute address") } c := len(b) n, err := s.IO.CopyOut(context.Background(), s.Bottom-hostarch.Addr(c), b, usermem.IOOpts{}) if err == nil && n == c { s.Bottom -= hostarch.Addr(n) } return n, err } // CopyInBytes implements marshal.CopyContext.CopyInBytes. CopyInBytes computes // an appropriate address based on the current end of the stack. Callers must // use the sentinel address StackBottomMagic to marshal methods to indicate // this. func (s *Stack) CopyInBytes(sentinel hostarch.Addr, b []byte) (int, error) { if sentinel != StackBottomMagic { panic("Attempted to copy in from stack with absolute address") } n, err := s.IO.CopyIn(context.Background(), s.Bottom, b, usermem.IOOpts{}) if err == nil { s.Bottom += hostarch.Addr(n) } return n, err } // Align aligns the stack to the given offset. func (s *Stack) Align(offset int) { if s.Bottom%hostarch.Addr(offset) != 0 { s.Bottom -= (s.Bottom % hostarch.Addr(offset)) } } // PushNullTerminatedByteSlice writes bs to the stack, followed by an extra null // byte at the end. On error, the contents of the stack and the bottom cursor // are undefined. func (s *Stack) PushNullTerminatedByteSlice(bs []byte) (int, error) { // Note: Stack grows up, so write the terminal null byte first. nNull, err := primitive.CopyUint8Out(s, StackBottomMagic, 0) if err != nil { return 0, err } n, err := primitive.CopyByteSliceOut(s, StackBottomMagic, bs) if err != nil { return 0, err } return n + nNull, nil } // StackLayout describes the location of the arguments and environment on the // stack. type StackLayout struct { // ArgvStart is the beginning of the argument vector. ArgvStart hostarch.Addr // ArgvEnd is the end of the argument vector. ArgvEnd hostarch.Addr // EnvvStart is the beginning of the environment vector. EnvvStart hostarch.Addr // EnvvEnd is the end of the environment vector. EnvvEnd hostarch.Addr } // Load pushes the given args, env and aux vector to the stack using the // well-known format for a new executable. It returns the start and end // of the argument and environment vectors. func (s *Stack) Load(args []string, env []string, aux Auxv) (StackLayout, error) { l := StackLayout{} // Make sure we start with a 16-byte alignment. s.Align(16) // Push the environment vector so the end of the argument vector is adjacent to // the beginning of the environment vector. // While the System V abi for x86_64 does not specify an ordering to the // Information Block (the block holding the arg, env, and aux vectors), // support features like setproctitle(3) naturally expect these segments // to be in this order. See: https://www.uclibc.org/docs/psABI-x86_64.pdf // page 29. l.EnvvEnd = s.Bottom envAddrs := make([]hostarch.Addr, len(env)) for i := len(env) - 1; i >= 0; i-- { if _, err := s.PushNullTerminatedByteSlice([]byte(env[i])); err != nil { return StackLayout{}, err } envAddrs[i] = s.Bottom } l.EnvvStart = s.Bottom // Push our strings. l.ArgvEnd = s.Bottom argAddrs := make([]hostarch.Addr, len(args)) for i := len(args) - 1; i >= 0; i-- { if _, err := s.PushNullTerminatedByteSlice([]byte(args[i])); err != nil { return StackLayout{}, err } argAddrs[i] = s.Bottom } l.ArgvStart = s.Bottom // We need to align the arguments appropriately. // // We must finish on a 16-byte alignment, but we'll play it // conservatively and finish at 32-bytes. 
// It would be nice to be able // to call Align here, but unfortunately we need to align the stack // with all the variable sized arrays pushed. So we just need to do // some calculations. argvSize := s.Arch.Width() * uint(len(args)+1) envvSize := s.Arch.Width() * uint(len(env)+1) auxvSize := s.Arch.Width() * 2 * uint(len(aux)+1) total := hostarch.Addr(argvSize) + hostarch.Addr(envvSize) + hostarch.Addr(auxvSize) + hostarch.Addr(s.Arch.Width()) expectedBottom := s.Bottom - total if expectedBottom%32 != 0 { s.Bottom -= expectedBottom % 32 } // Push our auxvec. // NOTE: We need an extra zero here per spec. // The Push function will automatically terminate // strings and arrays with a single null value. auxv := make([]hostarch.Addr, 0, len(aux)) for _, a := range aux { auxv = append(auxv, hostarch.Addr(a.Key), a.Value) } auxv = append(auxv, hostarch.Addr(0)) _, err := s.pushAddrSliceAndTerminator(auxv) if err != nil { return StackLayout{}, err } // Push environment. _, err = s.pushAddrSliceAndTerminator(envAddrs) if err != nil { return StackLayout{}, err } // Push args. _, err = s.pushAddrSliceAndTerminator(argAddrs) if err != nil { return StackLayout{}, err } // Push arg count. lenP := s.Arch.Native(uintptr(len(args))) if _, err = lenP.CopyOut(s, StackBottomMagic); err != nil { return StackLayout{}, err } return l, nil }
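// Illustrative sketch, not part of the original source: the stack image Load
// builds, from the final Bottom (lowest address) upward. Pushes move Bottom
// down, so the last item pushed (argc) ends up at the lowest address:
//
//	Bottom -> argc
//	          argv[0] ... argv[argc-1], NULL
//	          envp[0] ... envp[n-1],    NULL
//	          auxv (key, value) pairs, ending in a (0, 0) AT_NULL pair
//	          padding so the final Bottom lands on the 32-byte boundary
//	              computed above
//	          argument strings, then environment strings, each NUL-terminated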
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tmpfs import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // +stateify savable type directory struct { // Since directories can't be hard-linked, each directory can only be // associated with a single dentry, which we can store in the directory // struct. dentry dentry inode inode // childMap maps the names of the directory's children to their dentries. // childMap is protected by filesystem.mu. childMap map[string]*dentry // numChildren is len(childMap), but accessed using atomic memory // operations to avoid locking in inode.statTo(). numChildren int64 // childList is a list containing (1) child dentries and (2) fake dentries // (with inode == nil) that represent the iteration position of // directoryFDs. childList is used to support directoryFD.IterDirents() // efficiently. childList is protected by iterMu. iterMu sync.Mutex `state:"nosave"` childList dentryList } func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) *directory { dir := &directory{} dir.inode.init(dir, fs, kuid, kgid, linux.S_IFDIR|mode, parentDir) dir.inode.nlink = 2 // from "." and parent directory or ".." for root dir.dentry.inode = &dir.inode dir.dentry.vfsd.Init(&dir.dentry) return dir } // Preconditions: // * filesystem.mu must be locked for writing. // * dir must not already contain a child with the given name. func (dir *directory) insertChildLocked(child *dentry, name string) { child.parent = &dir.dentry child.name = name if dir.childMap == nil { dir.childMap = make(map[string]*dentry) } dir.childMap[name] = child atomic.AddInt64(&dir.numChildren, 1) dir.iterMu.Lock() dir.childList.PushBack(child) dir.iterMu.Unlock() } // Preconditions: filesystem.mu must be locked for writing. 
func (dir *directory) removeChildLocked(child *dentry) { delete(dir.childMap, child.name) atomic.AddInt64(&dir.numChildren, -1) dir.iterMu.Lock() dir.childList.Remove(child) dir.iterMu.Unlock() } func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error { return vfs.CheckDeleteSticky( creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&dir.inode.uid)), auth.KUID(atomic.LoadUint32(&child.inode.uid)), auth.KGID(atomic.LoadUint32(&child.inode.gid)), ) } // +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl // Protected by directory.iterMu. iter *dentry off int64 } // Release implements vfs.FileDescriptionImpl.Release. func (fd *directoryFD) Release(ctx context.Context) { if fd.iter != nil { dir := fd.inode().impl.(*directory) dir.iterMu.Lock() dir.childList.Remove(fd.iter) dir.iterMu.Unlock() fd.iter = nil } } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { fs := fd.filesystem() dir := fd.inode().impl.(*directory) defer fd.dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent) // fs.mu is required to read d.parent and dentry.name. fs.mu.RLock() defer fs.mu.RUnlock() dir.iterMu.Lock() defer dir.iterMu.Unlock() fd.inode().touchAtime(fd.vfsfd.Mount()) if fd.off == 0 { if err := cb.Handle(vfs.Dirent{ Name: ".", Type: linux.DT_DIR, Ino: dir.inode.ino, NextOff: 1, }); err != nil { return err } fd.off++ } if fd.off == 1 { parentInode := genericParentOrSelf(&dir.dentry).inode if err := cb.Handle(vfs.Dirent{ Name: "..", Type: parentInode.direntType(), Ino: parentInode.ino, NextOff: 2, }); err != nil { return err } fd.off++ } var child *dentry if fd.iter == nil { // Start iteration at the beginning of dir. child = dir.childList.Front() fd.iter = &dentry{} } else { // Continue iteration from where we left off. child = fd.iter.Next() dir.childList.Remove(fd.iter) } for child != nil { // Skip other directoryFD iterators. if child.inode != nil { if err := cb.Handle(vfs.Dirent{ Name: child.name, Type: child.inode.direntType(), Ino: child.inode.ino, NextOff: fd.off + 1, }); err != nil { dir.childList.InsertBefore(child, fd.iter) return err } fd.off++ } child = child.Next() } dir.childList.PushBack(fd.iter) return nil } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { dir := fd.inode().impl.(*directory) dir.iterMu.Lock() defer dir.iterMu.Unlock() switch whence { case linux.SEEK_SET: // Use offset as given. case linux.SEEK_CUR: offset += fd.off default: return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } // If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't // seek even if doing so might reposition the iterator due to concurrent // mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek(). if fd.off == offset { return offset, nil } fd.off = offset // Compensate for "." and "..". remChildren := int64(0) if offset >= 2 { remChildren = offset - 2 } // Ensure that fd.iter exists and is not linked into dir.childList. if fd.iter == nil { fd.iter = &dentry{} } else { dir.childList.Remove(fd.iter) } // Insert fd.iter before the remChildren'th child, or at the end of the // list if remChildren >= number of children. child := dir.childList.Front() for child != nil { // Skip other directoryFD iterators. 
if child.inode != nil { if remChildren == 0 { dir.childList.InsertBefore(child, fd.iter) return offset, nil } remChildren-- } child = child.Next() } dir.childList.PushBack(fd.iter) return offset, nil }
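// Illustrative sketch, not part of the original source: the offset scheme
// IterDirents and Seek implement above. Offset 0 is ".", offset 1 is "..",
// and offset N >= 2 means the (N-2)'th live child; fd.iter is a fake dentry
// (inode == nil) threaded through childList so that concurrent insertions and
// removals can't invalidate the cursor. Hence, from user space:
//
//	lseek(fd, 0, SEEK_SET) // the next getdents() starts at "."
//	lseek(fd, 2, SEEK_SET) // skips "." and "..", starts at the first child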
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/waiter" ) // NetworkEndpointID is the identifier of a network layer protocol endpoint. // Currently the local address is sufficient because all supported protocols // (i.e., IPv4 and IPv6) have different sizes for their addresses. type NetworkEndpointID struct { LocalAddress tcpip.Address } // TransportEndpointID is the identifier of a transport layer protocol endpoint. // // +stateify savable type TransportEndpointID struct { // LocalPort is the local port associated with the endpoint. LocalPort uint16 // LocalAddress is the local [network layer] address associated with // the endpoint. LocalAddress tcpip.Address // RemotePort is the remote port associated with the endpoint. RemotePort uint16 // RemoteAddress is the remote [network layer] address associated with // the endpoint. RemoteAddress tcpip.Address } // NetworkPacketInfo holds information about a network layer packet. type NetworkPacketInfo struct { // LocalAddressBroadcast is true if the packet's local address is a broadcast // address. LocalAddressBroadcast bool // IsForwardedPacket is true if the packet is being forwarded. IsForwardedPacket bool } // TransportErrorKind enumerates error types that are handled by the transport // layer. type TransportErrorKind int const ( // PacketTooBigTransportError indicates that a packet did not reach its // destination because a link on the path to the destination had an MTU that // was too small to carry the packet. PacketTooBigTransportError TransportErrorKind = iota // DestinationHostUnreachableTransportError indicates that the destination // host was unreachable. DestinationHostUnreachableTransportError // DestinationPortUnreachableTransportError indicates that a packet reached // the destination host, but the transport protocol was not active on the // destination port. DestinationPortUnreachableTransportError // DestinationNetworkUnreachableTransportError indicates that the destination // network was unreachable. 
DestinationNetworkUnreachableTransportError ) // TransportError is a marker interface for errors that may be handled by the // transport layer. type TransportError interface { tcpip.SockErrorCause // Kind returns the type of the transport error. Kind() TransportErrorKind } // TransportEndpoint is the interface that needs to be implemented by transport // protocol (e.g., tcp, udp) endpoints that can handle packets. type TransportEndpoint interface { // UniqueID returns a unique ID for this transport endpoint. UniqueID() uint64 // HandlePacket is called by the stack when new packets arrive to this // transport endpoint. It sets the packet buffer's transport header. // // HandlePacket takes ownership of the packet. HandlePacket(TransportEndpointID, *PacketBuffer) // HandleError is called when the transport endpoint receives an error. // // HandleError takes ownership of the packet buffer. HandleError(TransportError, *PacketBuffer) // Abort initiates an expedited endpoint teardown. It puts the endpoint // in a closed state and frees all resources associated with it. This // cleanup may happen asynchronously. Wait can be used to block on this // asynchronous cleanup. Abort() // Wait waits for any worker goroutines owned by the endpoint to stop. // // An endpoint can be requested to stop its worker goroutines by calling // its Close method. // // Wait will not block if the endpoint hasn't started any goroutines // yet, even if it might later. Wait() } // RawTransportEndpoint is the interface that needs to be implemented by raw // transport protocol endpoints. RawTransportEndpoints receive the entire // packet - including the network and transport headers - as delivered to // netstack. type RawTransportEndpoint interface { // HandlePacket is called by the stack when new packets arrive to // this transport endpoint. The packet contains all data from the link // layer up. // // HandlePacket takes ownership of the packet. HandlePacket(*PacketBuffer) } // PacketEndpoint is the interface that needs to be implemented by packet // transport protocol endpoints. These endpoints receive link layer headers in // addition to whatever they contain (usually network and transport layer // headers and a payload). type PacketEndpoint interface { // HandlePacket is called by the stack when new packets arrive that // match the endpoint. // // Implementers should treat packet as immutable and should copy it // before modification. // // linkHeader may have a length of 0, in which case the PacketEndpoint // should construct its own ethernet header for applications. // // HandlePacket takes ownership of pkt. HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *PacketBuffer) } // UnknownDestinationPacketDisposition enumerates the possible return values from // HandleUnknownDestinationPacket(). type UnknownDestinationPacketDisposition int const ( // UnknownDestinationPacketMalformed denotes that the packet was malformed // and no further processing should be attempted other than updating // statistics. UnknownDestinationPacketMalformed UnknownDestinationPacketDisposition = iota // UnknownDestinationPacketUnhandled tells the caller that the packet was // well formed but that the issue was not handled and the stack should take // the default action. UnknownDestinationPacketUnhandled // UnknownDestinationPacketHandled tells the caller that it should do // no further processing. 
UnknownDestinationPacketHandled ) // TransportProtocol is the interface that needs to be implemented by transport // protocols (e.g., tcp, udp) that want to be part of the networking stack. type TransportProtocol interface { // Number returns the transport protocol number. Number() tcpip.TransportProtocolNumber // NewEndpoint creates a new endpoint of the transport protocol. NewEndpoint(netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) // NewRawEndpoint creates a new raw endpoint of the transport protocol. NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) // MinimumPacketSize returns the minimum valid packet size of this // transport protocol. The stack automatically drops any packets smaller // than this targeted at this protocol. MinimumPacketSize() int // ParsePorts returns the source and destination ports stored in a // packet of this protocol. ParsePorts(v buffer.View) (src, dst uint16, err tcpip.Error) // HandleUnknownDestinationPacket handles packets targeted at this // protocol that don't match any existing endpoint. For example, // it is targeted at a port that has no listeners. // // HandleUnknownDestinationPacket takes ownership of the packet if it handles // the issue. HandleUnknownDestinationPacket(TransportEndpointID, *PacketBuffer) UnknownDestinationPacketDisposition // SetOption allows enabling/disabling protocol specific features. // SetOption returns an error if the option is not supported or the // provided option value is invalid. SetOption(option tcpip.SettableTransportProtocolOption) tcpip.Error // Option allows retrieving protocol specific option values. // Option returns an error if the option is not supported or the // provided option value is invalid. Option(option tcpip.GettableTransportProtocolOption) tcpip.Error // Close requests that any worker goroutines owned by the protocol // stop. Close() // Wait waits for any worker goroutines owned by the protocol to stop. Wait() // Parse sets pkt.TransportHeader and trims pkt.Data appropriately. It does // neither and returns false if pkt.Data is too small, i.e. pkt.Data.Size() < // MinimumPacketSize() Parse(pkt *PacketBuffer) (ok bool) } // TransportPacketDisposition is the result from attempting to deliver a packet // to the transport layer. type TransportPacketDisposition int const ( // TransportPacketHandled indicates that a transport packet was handled by the // transport layer and callers need not take any further action. TransportPacketHandled TransportPacketDisposition = iota // TransportPacketProtocolUnreachable indicates that the transport // protocol requested in the packet is not supported. TransportPacketProtocolUnreachable // TransportPacketDestinationPortUnreachable indicates that there weren't any // listeners interested in the packet and the transport protocol has no means // to notify the sender. TransportPacketDestinationPortUnreachable ) // TransportDispatcher contains the methods used by the network stack to deliver // packets to the appropriate transport endpoint after it has been handled by // the network layer. type TransportDispatcher interface { // DeliverTransportPacket delivers packets to the appropriate // transport protocol endpoint. // // pkt.NetworkHeader must be set before calling DeliverTransportPacket. // // DeliverTransportPacket takes ownership of the packet. 
DeliverTransportPacket(tcpip.TransportProtocolNumber, *PacketBuffer) TransportPacketDisposition // DeliverTransportError delivers an error to the appropriate transport // endpoint. // // DeliverTransportError takes ownership of the packet buffer. DeliverTransportError(local, remote tcpip.Address, _ tcpip.NetworkProtocolNumber, _ tcpip.TransportProtocolNumber, _ TransportError, _ *PacketBuffer) // DeliverRawPacket delivers a packet to any subscribed raw sockets. // // DeliverRawPacket does NOT take ownership of the packet buffer. DeliverRawPacket(tcpip.TransportProtocolNumber, *PacketBuffer) } // PacketLooping specifies where an outbound packet should be sent. type PacketLooping byte const ( // PacketOut indicates that the packet should be passed to the link // endpoint. PacketOut PacketLooping = 1 << iota // PacketLoop indicates that the packet should be handled locally. PacketLoop ) // NetworkHeaderParams are the header parameters given as input by the // transport endpoint to the network. type NetworkHeaderParams struct { // Protocol refers to the transport protocol number. Protocol tcpip.TransportProtocolNumber // TTL refers to Time To Live field of the IP-header. TTL uint8 // TOS refers to TypeOfService or TrafficClass field of the IP-header. TOS uint8 } // GroupAddressableEndpoint is an endpoint that supports group addressing. // // An endpoint is considered to support group addressing when one or more // endpoints may associate themselves with the same identifier (group address). type GroupAddressableEndpoint interface { // JoinGroup joins the specified group. JoinGroup(group tcpip.Address) tcpip.Error // LeaveGroup attempts to leave the specified group. LeaveGroup(group tcpip.Address) tcpip.Error // IsInGroup returns true if the endpoint is a member of the specified group. IsInGroup(group tcpip.Address) bool } // PrimaryEndpointBehavior is an enumeration of an AddressEndpoint's primary // behavior. type PrimaryEndpointBehavior int const ( // CanBePrimaryEndpoint indicates the endpoint can be used as a primary // endpoint for new connections with no local address. This is the // default when calling NIC.AddAddress. CanBePrimaryEndpoint PrimaryEndpointBehavior = iota // FirstPrimaryEndpoint indicates the endpoint should be the first // primary endpoint considered. If there are multiple endpoints with // this behavior, they are ordered by recency. FirstPrimaryEndpoint // NeverPrimaryEndpoint indicates the endpoint should never be a // primary endpoint. NeverPrimaryEndpoint ) // AddressConfigType is the method used to add an address. type AddressConfigType int const ( // AddressConfigStatic is a statically configured address endpoint that was // added by some user-specified action (adding an explicit address, joining a // multicast group). AddressConfigStatic AddressConfigType = iota // AddressConfigSlaac is an address endpoint added by SLAAC, as per RFC 4862 // section 5.5.3. AddressConfigSlaac // AddressConfigSlaacTemp is a temporary address endpoint added by SLAAC as // per RFC 4941. Temporary SLAAC addresses are short-lived and are not // to be valid (or preferred) forever; hence the term temporary. AddressConfigSlaacTemp ) // AssignableAddressEndpoint is a reference counted address endpoint that may be // assigned to a NetworkEndpoint. type AssignableAddressEndpoint interface { // AddressWithPrefix returns the endpoint's address. AddressWithPrefix() tcpip.AddressWithPrefix // Subnet returns the subnet of the endpoint's address. 
Subnet() tcpip.Subnet // IsAssigned returns whether or not the endpoint is considered bound // to its NetworkEndpoint. IsAssigned(allowExpired bool) bool // IncRef increments this endpoint's reference count. // // Returns true if it was successfully incremented. If it returns false, then // the endpoint is considered expired and should no longer be used. IncRef() bool // DecRef decrements this endpoint's reference count. DecRef() } // AddressEndpoint is an endpoint representing an address assigned to an // AddressableEndpoint. type AddressEndpoint interface { AssignableAddressEndpoint // GetKind returns the address kind for this endpoint. GetKind() AddressKind // SetKind sets the address kind for this endpoint. SetKind(AddressKind) // ConfigType returns the method used to add the address. ConfigType() AddressConfigType // Deprecated returns whether or not this endpoint is deprecated. Deprecated() bool // SetDeprecated sets this endpoint's deprecated status. SetDeprecated(bool) } // AddressKind is the kind of an address. // // See the values of AddressKind for more details. type AddressKind int const ( // PermanentTentative is a permanent address endpoint that is not yet // considered to be fully bound to an interface in the traditional // sense. That is, the address is associated with a NIC, but packets // destined to the address MUST NOT be accepted and MUST be silently // dropped, and the address MUST NOT be used as a source address for // outgoing packets. For IPv6, addresses are of this kind until NDP's // Duplicate Address Detection (DAD) resolves. If DAD fails, the address // is removed. PermanentTentative AddressKind = iota // Permanent is a permanent endpoint (vs. a temporary one) assigned to the // NIC. Its reference count is biased by 1 to avoid removal when no route // holds a reference to it. It is removed by explicitly removing the address // from the NIC. Permanent // PermanentExpired is a permanent endpoint that had its address removed from // the NIC, and it is waiting to be removed once no references to it are held. // // If the address is re-added before the endpoint is removed, its type // changes back to Permanent. PermanentExpired // Temporary is an endpoint, created on a one-off basis to temporarily // consider the NIC bound to an address that it is not explicitly bound to // (such as a permanent address). Its reference count must not be biased by 1 // so that the address is removed immediately when references to it are no // longer held. // // A temporary endpoint may be promoted to permanent if the address is added // permanently. Temporary ) // IsPermanent returns true if the AddressKind represents a permanent address. func (k AddressKind) IsPermanent() bool { switch k { case Permanent, PermanentTentative: return true case Temporary, PermanentExpired: return false default: panic(fmt.Sprintf("unrecognized address kind = %d", k)) } } // AddressableEndpoint is an endpoint that supports addressing. // // An endpoint is considered to support addressing when the endpoint may // associate itself with an identifier (address). type AddressableEndpoint interface { // AddAndAcquirePermanentAddress adds the passed permanent address. // // Returns *tcpip.ErrDuplicateAddress if the address exists. // // Acquires and returns the AddressEndpoint for the added address. 
AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb PrimaryEndpointBehavior, configType AddressConfigType, deprecated bool) (AddressEndpoint, tcpip.Error) // RemovePermanentAddress removes the passed address if it is a permanent // address. // // Returns *tcpip.ErrBadLocalAddress if the endpoint does not have the passed // permanent address. RemovePermanentAddress(addr tcpip.Address) tcpip.Error // MainAddress returns the endpoint's primary permanent address. MainAddress() tcpip.AddressWithPrefix // AcquireAssignedAddress returns an address endpoint for the passed address // that is considered bound to the endpoint, optionally creating a temporary // endpoint if requested and no existing address exists. // // The returned endpoint's reference count is incremented. // // Returns nil if the specified address is not local to this endpoint. AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB PrimaryEndpointBehavior) AddressEndpoint // AcquireOutgoingPrimaryAddress returns a primary address that may be used as // a source address when sending packets to the passed remote address. // // If allowExpired is true, expired addresses may be returned. // // The returned endpoint's reference count is incremented. // // Returns nil if a primary address is not available. AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) AddressEndpoint // PrimaryAddresses returns the primary addresses. PrimaryAddresses() []tcpip.AddressWithPrefix // PermanentAddresses returns all the permanent addresses. PermanentAddresses() []tcpip.AddressWithPrefix } // NDPEndpoint is a network endpoint that supports NDP. type NDPEndpoint interface { NetworkEndpoint // InvalidateDefaultRouter invalidates a default router discovered through // NDP. InvalidateDefaultRouter(tcpip.Address) } // NetworkInterface is a network interface. type NetworkInterface interface { NetworkLinkEndpoint // ID returns the interface's ID. ID() tcpip.NICID // IsLoopback returns true if the interface is a loopback interface. IsLoopback() bool // Name returns the name of the interface. // // May return an empty string if the interface is not configured with a name. Name() string // Enabled returns true if the interface is enabled. Enabled() bool // Promiscuous returns true if the interface is in promiscuous mode. // // When in promiscuous mode, the interface should accept all packets. Promiscuous() bool // Spoofing returns true if the interface is in spoofing mode. // // When in spoofing mode, the interface should consider all addresses as // assigned to it. Spoofing() bool // PrimaryAddress returns the primary address associated with the interface. // // PrimaryAddress will return the first non-deprecated address if such an // address exists. If no non-deprecated addresses exist, the first deprecated // address will be returned. If no deprecated addresses exist, the zero value // will be returned. PrimaryAddress(tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, tcpip.Error) // CheckLocalAddress returns true if the address exists on the interface. CheckLocalAddress(tcpip.NetworkProtocolNumber, tcpip.Address) bool // WritePacketToRemote writes the packet to the given remote link address. WritePacketToRemote(tcpip.LinkAddress, tcpip.NetworkProtocolNumber, *PacketBuffer) tcpip.Error // WritePacket writes a packet with the given protocol through the given // route. // // WritePacket takes ownership of the packet buffer. The packet buffer's // network and transport header must be set. 
WritePacket(*Route, tcpip.NetworkProtocolNumber, *PacketBuffer) tcpip.Error // WritePackets writes packets with the given protocol through the given // route. Must not be called with an empty list of packet buffers. // // WritePackets takes ownership of the packet buffers. // // Right now, WritePackets is used only when the software segmentation // offload is enabled. If it will be used for something else, syscall filters // may need to be updated. WritePackets(*Route, PacketBufferList, tcpip.NetworkProtocolNumber) (int, tcpip.Error) // HandleNeighborProbe processes an incoming neighbor probe (e.g. ARP // request or NDP Neighbor Solicitation). // // HandleNeighborProbe assumes that the probe is valid for the network // interface the probe was received on. HandleNeighborProbe(tcpip.NetworkProtocolNumber, tcpip.Address, tcpip.LinkAddress) tcpip.Error // HandleNeighborConfirmation processes an incoming neighbor confirmation // (e.g. ARP reply or NDP Neighbor Advertisement). HandleNeighborConfirmation(tcpip.NetworkProtocolNumber, tcpip.Address, tcpip.LinkAddress, ReachabilityConfirmationFlags) tcpip.Error } // LinkResolvableNetworkEndpoint handles link resolution events. type LinkResolvableNetworkEndpoint interface { // HandleLinkResolutionFailure is called when link resolution prevents the // argument from having been sent. HandleLinkResolutionFailure(*PacketBuffer) } // NetworkEndpoint is the interface that needs to be implemented by endpoints // of network layer protocols (e.g., ipv4, ipv6). type NetworkEndpoint interface { // Enable enables the endpoint. // // Must only be called when the stack is in a state that allows the endpoint // to send and receive packets. // // Returns *tcpip.ErrNotPermitted if the endpoint cannot be enabled. Enable() tcpip.Error // Enabled returns true if the endpoint is enabled. Enabled() bool // Disable disables the endpoint. Disable() // DefaultTTL is the default time-to-live value (or hop limit, in ipv6) // for this endpoint. DefaultTTL() uint8 // MTU is the maximum transmission unit for this endpoint. This is // generally calculated as the MTU of the underlying data link endpoint // minus the network endpoint max header length. MTU() uint32 // MaxHeaderLength returns the maximum size the network (and lower // level layers combined) headers can have. Higher levels use this // information to reserve space in the front of the packets they're // building. MaxHeaderLength() uint16 // WritePacket writes a packet to the given destination address and // protocol. It takes ownership of pkt. pkt.TransportHeader must have // already been set. WritePacket(r *Route, params NetworkHeaderParams, pkt *PacketBuffer) tcpip.Error // WritePackets writes packets to the given destination address and // protocol. pkts must not be zero length. It takes ownership of pkts and // underlying packets. WritePackets(r *Route, pkts PacketBufferList, params NetworkHeaderParams) (int, tcpip.Error) // WriteHeaderIncludedPacket writes a packet that includes a network // header to the given destination address. It takes ownership of pkt. WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) tcpip.Error // HandlePacket is called by the link layer when new packets arrive to // this network endpoint. It sets pkt.NetworkHeader. // // HandlePacket takes ownership of pkt. HandlePacket(pkt *PacketBuffer) // Close is called when the endpoint is removed from a stack. Close() // NetworkProtocolNumber returns the tcpip.NetworkProtocolNumber for // this endpoint. 
NetworkProtocolNumber() tcpip.NetworkProtocolNumber // Stats returns a reference to the network endpoint stats. Stats() NetworkEndpointStats } // NetworkEndpointStats is the interface implemented by each network endpoint // stats struct. type NetworkEndpointStats interface { // IsNetworkEndpointStats is an empty method to implement the // NetworkEndpointStats marker interface. IsNetworkEndpointStats() } // IPNetworkEndpointStats is a NetworkEndpointStats that tracks IP-related // statistics. type IPNetworkEndpointStats interface { NetworkEndpointStats // IPStats returns the IP statistics of a network endpoint. IPStats() *tcpip.IPStats } // ForwardingNetworkEndpoint is a network endpoint that may forward packets. type ForwardingNetworkEndpoint interface { NetworkEndpoint // Forwarding returns the forwarding configuration. Forwarding() bool // SetForwarding sets the forwarding configuration. SetForwarding(bool) } // NetworkProtocol is the interface that needs to be implemented by network // protocols (e.g., ipv4, ipv6) that want to be part of the networking stack. type NetworkProtocol interface { // Number returns the network protocol number. Number() tcpip.NetworkProtocolNumber // MinimumPacketSize returns the minimum valid packet size of this // network protocol. The stack automatically drops any packets smaller // than this targeted at this protocol. MinimumPacketSize() int // DefaultPrefixLen returns the protocol's default prefix length. DefaultPrefixLen() int // ParseAddresses returns the source and destination addresses stored in a // packet of this protocol. ParseAddresses(v buffer.View) (src, dst tcpip.Address) // NewEndpoint creates a new endpoint of this protocol. NewEndpoint(nic NetworkInterface, dispatcher TransportDispatcher) NetworkEndpoint // SetOption allows enabling/disabling protocol specific features. // SetOption returns an error if the option is not supported or the // provided option value is invalid. SetOption(option tcpip.SettableNetworkProtocolOption) tcpip.Error // Option allows retrieving protocol specific option values. // Option returns an error if the option is not supported or the // provided option value is invalid. Option(option tcpip.GettableNetworkProtocolOption) tcpip.Error // Close requests that any worker goroutines owned by the protocol // stop. Close() // Wait waits for any worker goroutines owned by the protocol to stop. Wait() // Parse sets pkt.NetworkHeader and trims pkt.Data appropriately. It // returns: // - The encapsulated protocol, if present. // - Whether there is an encapsulated transport protocol payload (e.g. ARP // does not encapsulate anything). // - Whether pkt.Data was large enough to parse and set pkt.NetworkHeader. Parse(pkt *PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) } // NetworkDispatcher contains the methods used by the network stack to deliver // inbound/outbound packets to the appropriate network/packet(if any) endpoints. type NetworkDispatcher interface { // DeliverNetworkPacket finds the appropriate network protocol endpoint // and hands the packet over for further processing. // // pkt.LinkHeader may or may not be set before calling // DeliverNetworkPacket. Some packets do not have link headers (e.g. // packets sent via loopback), and won't have the field set. // // DeliverNetworkPacket takes ownership of pkt. 
DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) // DeliverOutboundPacket is called by link layer when a packet is being // sent out. // // pkt.LinkHeader may or may not be set before calling // DeliverOutboundPacket. Some packets do not have link headers (e.g. // packets sent via loopback), and won't have the field set. // // DeliverOutboundPacket takes ownership of pkt. DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) } // LinkEndpointCapabilities is the type associated with the capabilities // supported by a link-layer endpoint. It is a set of bitfields. type LinkEndpointCapabilities uint // The following are the supported link endpoint capabilities. const ( CapabilityNone LinkEndpointCapabilities = 0 // CapabilityTXChecksumOffload indicates that the link endpoint supports // checksum computation for outgoing packets and the stack can skip // computing checksums when sending packets. CapabilityTXChecksumOffload LinkEndpointCapabilities = 1 << iota // CapabilityRXChecksumOffload indicates that the link endpoint supports // checksum verification on received packets and that it's safe for the // stack to skip checksum verification. CapabilityRXChecksumOffload CapabilityResolutionRequired CapabilitySaveRestore CapabilityDisconnectOk CapabilityLoopback ) // NetworkLinkEndpoint is a data-link layer that supports sending network // layer packets. type NetworkLinkEndpoint interface { // MTU is the maximum transmission unit for this endpoint. This is // usually dictated by the backing physical network; when such a // physical network doesn't exist, the limit is generally 64k, which // includes the maximum size of an IP packet. MTU() uint32 // MaxHeaderLength returns the maximum size the data link (and // lower level layers combined) headers can have. Higher levels use this // information to reserve space in the front of the packets they're // building. MaxHeaderLength() uint16 // LinkAddress returns the link address (typically a MAC) of the // endpoint. LinkAddress() tcpip.LinkAddress } // LinkEndpoint is the interface implemented by data link layer protocols (e.g., // ethernet, loopback, raw) and used by network layer protocols to send packets // out through the implementer's data link endpoint. When a link header exists, // it sets each PacketBuffer's LinkHeader field before passing it up the // stack. type LinkEndpoint interface { NetworkLinkEndpoint // Capabilities returns the set of capabilities supported by the // endpoint. Capabilities() LinkEndpointCapabilities // Attach attaches the data link layer endpoint to the network-layer // dispatcher of the stack. // // Attach is called with a nil dispatcher when the endpoint's NIC is being // removed. Attach(dispatcher NetworkDispatcher) // IsAttached returns whether a NetworkDispatcher is attached to the // endpoint. IsAttached() bool // Wait waits for any worker goroutines owned by the endpoint to stop. // // For now, requesting that an endpoint's worker goroutine(s) stop is // implementation specific. // // Wait will not block if the endpoint hasn't started any goroutines // yet, even if it might later. Wait() // ARPHardwareType returns the ARPHRD_TYPE of the link endpoint. // // See: // https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/include/uapi/linux/if_arp.h#L30 ARPHardwareType() header.ARPHardwareType // AddHeader adds a link layer header to pkt if required. 
AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) // WritePacket writes a packet with the given protocol and route. // // WritePacket takes ownership of the packet buffer. The packet buffer's // network and transport header must be set. // // To participate in transparent bridging, a LinkEndpoint implementation // should call eth.Encode with header.EthernetFields.SrcAddr set to // r.LocalLinkAddress if it is provided. WritePacket(RouteInfo, tcpip.NetworkProtocolNumber, *PacketBuffer) tcpip.Error // WritePackets writes packets with the given protocol and route. Must not be // called with an empty list of packet buffers. // // WritePackets takes ownership of the packet buffers. // // Right now, WritePackets is used only when the software segmentation // offload is enabled. If it will be used for something else, syscall filters // may need to be updated. WritePackets(RouteInfo, PacketBufferList, tcpip.NetworkProtocolNumber) (int, tcpip.Error) } // InjectableLinkEndpoint is a LinkEndpoint where inbound packets are // delivered via the Inject method. type InjectableLinkEndpoint interface { LinkEndpoint // InjectInbound injects an inbound packet. InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) // InjectOutbound writes a fully formed outbound packet directly to the // link. // // dest is used by endpoints with multiple raw destinations. InjectOutbound(dest tcpip.Address, packet []byte) tcpip.Error } // DADResult is a marker interface for the result of a duplicate address // detection process. type DADResult interface { isDADResult() } var _ DADResult = (*DADSucceeded)(nil) // DADSucceeded indicates DAD completed without finding any duplicate addresses. type DADSucceeded struct{} func (*DADSucceeded) isDADResult() {} var _ DADResult = (*DADError)(nil) // DADError indicates DAD hit an error. type DADError struct { Err tcpip.Error } func (*DADError) isDADResult() {} var _ DADResult = (*DADAborted)(nil) // DADAborted indicates DAD was aborted. type DADAborted struct{} func (*DADAborted) isDADResult() {} var _ DADResult = (*DADDupAddrDetected)(nil) // DADDupAddrDetected indicates DAD detected a duplicate address. type DADDupAddrDetected struct { // HolderLinkAddress is the link address of the node that holds the duplicate // address. HolderLinkAddress tcpip.LinkAddress } func (*DADDupAddrDetected) isDADResult() {} // DADCompletionHandler is a handler for DAD completion. type DADCompletionHandler func(DADResult) // DADCheckAddressDisposition enumerates the possible return values from // DAD.CheckDuplicateAddress. type DADCheckAddressDisposition int const ( _ DADCheckAddressDisposition = iota // DADDisabled indicates that DAD is disabled. DADDisabled // DADStarting indicates that DAD is starting for an address. DADStarting // DADAlreadyRunning indicates that DAD was already started for an address. DADAlreadyRunning ) const ( // defaultDupAddrDetectTransmits is the default number of NDP Neighbor // Solicitation messages to send when doing Duplicate Address Detection // for a tentative address. // // Default = 1 (from RFC 4862 section 5.1) defaultDupAddrDetectTransmits = 1 ) // DADConfigurations holds configurations for duplicate address detection. type DADConfigurations struct { // The number of Neighbor Solicitation messages to send when doing // Duplicate Address Detection for a tentative address. // // Note, a value of zero effectively disables DAD. 
DupAddrDetectTransmits uint8 // The amount of time to wait between sending Neighbor Solicitation // messages. // // Must be greater than or equal to 1ms. RetransmitTimer time.Duration } // DefaultDADConfigurations returns the default DAD configurations. func DefaultDADConfigurations() DADConfigurations { return DADConfigurations{ DupAddrDetectTransmits: defaultDupAddrDetectTransmits, RetransmitTimer: defaultRetransmitTimer, } } // Validate modifies the configuration with valid values. If invalid values are // present in the configurations, the corresponding default values are used // instead. func (c *DADConfigurations) Validate() { if c.RetransmitTimer < minimumRetransmitTimer { c.RetransmitTimer = defaultRetransmitTimer } } // DuplicateAddressDetector handles checking if an address is already assigned // to some neighboring node on the link. type DuplicateAddressDetector interface { // CheckDuplicateAddress checks if an address is assigned to a neighbor. // // If DAD is already being performed for the address, the handler will be // called with the result of the original DAD request. CheckDuplicateAddress(tcpip.Address, DADCompletionHandler) DADCheckAddressDisposition // SetDADConfigurations sets the configurations for DAD. SetDADConfigurations(c DADConfigurations) // DuplicateAddressProtocol returns the network protocol the receiver can // perform duplicate address detection for. DuplicateAddressProtocol() tcpip.NetworkProtocolNumber } // LinkAddressResolver handles link address resolution for a network protocol. type LinkAddressResolver interface { // LinkAddressRequest sends a request for the link address of the target // address. The request is broadcast on the local network if a remote link // address is not provided. LinkAddressRequest(targetAddr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) tcpip.Error // ResolveStaticAddress attempts to resolve address without sending // requests. It either resolves the name immediately or returns the // empty LinkAddress. // // It can be used to resolve broadcast addresses for example. ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) // LinkAddressProtocol returns the network protocol of the // addresses this resolver can resolve. LinkAddressProtocol() tcpip.NetworkProtocolNumber } // RawFactory produces endpoints for writing various types of raw packets. type RawFactory interface { // NewUnassociatedEndpoint produces endpoints for writing packets not // associated with a particular transport protocol. Such endpoints can // be used to write arbitrary packets that include the network header. NewUnassociatedEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) // NewPacketEndpoint produces endpoints for reading and writing packets // that include network and (when cooked is false) link layer headers. NewPacketEndpoint(stack *Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) } // GSOType is the type of GSO segments. // // +stateify savable type GSOType int // Types of gso segments. const ( GSONone GSOType = iota // Hardware GSO types: GSOTCPv4 GSOTCPv6 // GSOSW is used for software GSO segments which have to be sent by // endpoint.WritePackets. GSOSW ) // GSO contains generic segmentation offload properties. // // +stateify savable type GSO struct { // Type is one of GSONone, GSOTCPv4, etc. 
	Type GSOType

	// NeedsCsum is set if the checksum offload is enabled.
	NeedsCsum bool

	// CsumOffset is the offset after which to place the checksum.
	CsumOffset uint16

	// MSS is the maximum segment size.
	MSS uint16

	// L3HdrLen is the L3 (IP) header length.
	L3HdrLen uint16

	// MaxSize is the maximum GSO packet size.
	MaxSize uint32
}

// SupportedGSO is the type of segmentation offloading supported.
type SupportedGSO int

const (
	// GSONotSupported indicates that segmentation offloading is not supported.
	GSONotSupported SupportedGSO = iota

	// HWGSOSupported indicates that segmentation offloading may be performed by
	// the hardware.
	HWGSOSupported

	// SWGSOSupported indicates that segmentation offloading may be performed in
	// software.
	SWGSOSupported
)

// GSOEndpoint provides access to GSO properties.
type GSOEndpoint interface {
	// GSOMaxSize returns the maximum GSO packet size.
	GSOMaxSize() uint32

	// SupportedGSO returns the supported segmentation offloading.
	SupportedGSO() SupportedGSO
}

// SoftwareGSOMaxSize is the maximum allowed size of a software GSO segment.
// This isn't a hard limit, because it is never set into packet headers.
const SoftwareGSOMaxSize = 1 << 16
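// Example (illustrative only): a minimal LinkEndpoint that silently drops
// every outbound packet, showing the smallest method set the interfaces above
// require. nullEndpoint is hypothetical and not part of this package; the
// signatures match LinkEndpoint as declared above and may differ in other
// versions of this package.
type nullEndpoint struct {
	dispatcher NetworkDispatcher
}

var _ LinkEndpoint = (*nullEndpoint)(nil)

func (*nullEndpoint) MTU() uint32                    { return 65536 }
func (*nullEndpoint) MaxHeaderLength() uint16        { return 0 }
func (*nullEndpoint) LinkAddress() tcpip.LinkAddress { return "" }

// Claim checksum offload so the stack skips checksum work; this is safe here
// only because every packet is dropped anyway.
func (*nullEndpoint) Capabilities() LinkEndpointCapabilities {
	return CapabilityTXChecksumOffload | CapabilityRXChecksumOffload
}

func (e *nullEndpoint) Attach(dispatcher NetworkDispatcher) { e.dispatcher = dispatcher }
func (e *nullEndpoint) IsAttached() bool                    { return e.dispatcher != nil }
func (*nullEndpoint) Wait()                                 {}

func (*nullEndpoint) ARPHardwareType() header.ARPHardwareType { return header.ARPHardwareNone }

func (*nullEndpoint) AddHeader(_, _ tcpip.LinkAddress, _ tcpip.NetworkProtocolNumber, _ *PacketBuffer) {
}

func (*nullEndpoint) WritePacket(RouteInfo, tcpip.NetworkProtocolNumber, *PacketBuffer) tcpip.Error {
	// Dropping the packet counts as a successful write for this endpoint.
	return nil
}

func (*nullEndpoint) WritePackets(_ RouteInfo, pkts PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, tcpip.Error) {
	// Report every packet as written; all of them are silently dropped.
	n := 0
	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
		n++
	}
	return n, nil
}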
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package tmpfs provides an in-memory filesystem whose contents are
// application-mutable, consistent with Linux's tmpfs.
//
// Lock order:
//
// filesystem.mu
//   inode.mu
//     regularFileFD.offMu
//       *** "memmap.Mappable locks" below this point
//       regularFile.mapsMu
//         *** "memmap.Mappable locks taken by Translate" below this point
//         regularFile.dataMu
//   directory.iterMu
package tmpfs

import (
	"fmt"
	"math"
	"strconv"
	"strings"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
)

// Name is the default filesystem name.
const Name = "tmpfs"

// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
type FilesystemType struct{}

// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	vfsfs vfs.Filesystem

	// mfp is used to allocate memory that stores regular file contents. mfp is
	// immutable.
	mfp pgalloc.MemoryFileProvider

	// clock is a realtime clock used to set timestamps in file operations.
	clock time.Clock

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	devMinor uint32

	// mopts contains the tmpfs-specific mount options passed to this
	// filesystem. Immutable.
	mopts string

	// mu serializes changes to the Dentry tree.
	mu sync.RWMutex `state:"nosave"`

	nextInoMinusOne uint64 // accessed using atomic memory operations

	root *dentry
}

// Name implements vfs.FilesystemType.Name.
func (FilesystemType) Name() string {
	return Name
}

// Release implements vfs.FilesystemType.Release.
func (FilesystemType) Release(ctx context.Context) {}

// FilesystemOpts is used to pass configuration data to tmpfs.
//
// +stateify savable
type FilesystemOpts struct {
	// RootFileType is the FileType of the filesystem root. Valid values
	// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
	RootFileType uint16

	// RootSymlinkTarget is the target of the root symlink. Only valid if
	// RootFileType == S_IFLNK.
	RootSymlinkTarget string

	// FilesystemType allows setting a different FilesystemType for this
	// tmpfs filesystem. This allows tmpfs to "impersonate" other
	// filesystems, like ramdiskfs and cgroupfs.
	FilesystemType vfs.FilesystemType
}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { mfp := pgalloc.MemoryFileProviderFromContext(ctx) if mfp == nil { panic("MemoryFileProviderFromContext returned nil") } rootFileType := uint16(linux.S_IFDIR) newFSType := vfs.FilesystemType(&fstype) tmpfsOpts, ok := opts.InternalData.(FilesystemOpts) if ok { if tmpfsOpts.RootFileType != 0 { rootFileType = tmpfsOpts.RootFileType } if tmpfsOpts.FilesystemType != nil { newFSType = tmpfsOpts.FilesystemType } } mopts := vfs.GenericParseMountOptions(opts.Data) rootMode := linux.FileMode(0777) if rootFileType == linux.S_IFDIR { rootMode = 01777 } modeStr, ok := mopts["mode"] if ok { delete(mopts, "mode") mode, err := strconv.ParseUint(modeStr, 8, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr) return nil, nil, linuxerr.EINVAL } rootMode = linux.FileMode(mode & 07777) } rootKUID := creds.EffectiveKUID uidStr, ok := mopts["uid"] if ok { delete(mopts, "uid") uid, err := strconv.ParseUint(uidStr, 10, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr) return nil, nil, linuxerr.EINVAL } kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) if !kuid.Ok() { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) return nil, nil, linuxerr.EINVAL } rootKUID = kuid } rootKGID := creds.EffectiveKGID gidStr, ok := mopts["gid"] if ok { delete(mopts, "gid") gid, err := strconv.ParseUint(gidStr, 10, 32) if err != nil { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr) return nil, nil, linuxerr.EINVAL } kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) if !kgid.Ok() { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) return nil, nil, linuxerr.EINVAL } rootKGID = kgid } if len(mopts) != 0 { ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) return nil, nil, linuxerr.EINVAL } devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } clock := time.RealtimeClockFromContext(ctx) fs := filesystem{ mfp: mfp, clock: clock, devMinor: devMinor, mopts: opts.Data, } fs.vfsfs.Init(vfsObj, newFSType, &fs) var root *dentry switch rootFileType { case linux.S_IFREG: root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode, nil /* parentDir */)) case linux.S_IFLNK: root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget, nil /* parentDir */)) case linux.S_IFDIR: root = &fs.newDirectory(rootKUID, rootKGID, rootMode, nil /* parentDir */).dentry default: fs.vfsfs.DecRef(ctx) return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) } fs.root = root return &fs.vfsfs, &root.vfsd, nil } // NewFilesystem returns a new tmpfs filesystem. func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*vfs.Filesystem, *vfs.Dentry, error) { return FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "", vfs.GetFilesystemOptions{}) } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.mu.Lock() if fs.root.inode.isDir() { fs.root.releaseChildrenLocked(ctx) } fs.mu.Unlock() } // releaseChildrenLocked is called on the mount point by filesystem.Release() to // destroy all objects in the mount. 
It performs a depth-first walk of the // filesystem and "unlinks" everything by decrementing link counts // appropriately. There should be no open file descriptors when this is called, // so each inode should only have one outstanding reference that is removed once // its link count hits zero. // // Note that we do not update filesystem state precisely while tearing down (for // instance, the child maps are ignored)--we only care to remove all remaining // references so that every filesystem object gets destroyed. Also note that we // do not need to trigger DecRef on the mount point itself or any child mount; // these are taken care of by the destructor of the enclosing MountNamespace. // // Precondition: filesystem.mu is held. func (d *dentry) releaseChildrenLocked(ctx context.Context) { dir := d.inode.impl.(*directory) for _, child := range dir.childMap { if child.inode.isDir() { child.releaseChildrenLocked(ctx) child.inode.decLinksLocked(ctx) // link for child/. dir.inode.decLinksLocked(ctx) // link for child/.. } child.inode.decLinksLocked(ctx) // link for child } } // immutable var globalStatfs = linux.Statfs{ Type: linux.TMPFS_MAGIC, BlockSize: hostarch.PageSize, FragmentSize: hostarch.PageSize, NameLength: linux.NAME_MAX, // tmpfs currently does not support configurable size limits. In Linux, // such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from // statfs(2). However, many applications treat this as having a size limit // of 0. To work around this, claim to have a very large but non-zero size, // chosen to ensure that BlockSize * Blocks does not overflow int64 (which // applications may also handle incorrectly). // TODO(b/29637826): allow configuring a tmpfs size and enforce it. Blocks: math.MaxInt64 / hostarch.PageSize, BlocksFree: math.MaxInt64 / hostarch.PageSize, BlocksAvailable: math.MaxInt64 / hostarch.PageSize, } // dentry implements vfs.DentryImpl. // // +stateify savable type dentry struct { vfsd vfs.Dentry // parent is this dentry's parent directory. Each referenced dentry holds a // reference on parent.dentry. If this dentry is a filesystem root, parent // is nil. parent is protected by filesystem.mu. parent *dentry // name is the name of this dentry in its parent. If this dentry is a // filesystem root, name is the empty string. name is protected by // filesystem.mu. name string // dentryEntry (ugh) links dentries into their parent directory.childList. dentryEntry // inode is the inode represented by this dentry. Multiple Dentries may // share a single non-directory inode (with hard links). inode is // immutable. // // tmpfs doesn't count references on dentries; because the dentry tree is // the sole source of truth, it is by definition always consistent with the // state of the filesystem. However, it does count references on inodes, // because inode resources are released when all references are dropped. // dentry therefore forwards reference counting directly to inode. inode *inode } func (fs *filesystem) newDentry(inode *inode) *dentry { d := &dentry{ inode: inode, } d.vfsd.Init(d) return d } // IncRef implements vfs.DentryImpl.IncRef. func (d *dentry) IncRef() { d.inode.incRef() } // TryIncRef implements vfs.DentryImpl.TryIncRef. func (d *dentry) TryIncRef() bool { return d.inode.tryIncRef() } // DecRef implements vfs.DentryImpl.DecRef. func (d *dentry) DecRef(ctx context.Context) { d.inode.decRef(ctx) } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 
func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
	if d.inode.isDir() {
		events |= linux.IN_ISDIR
	}

	// tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
	// that d was deleted.
	deleted := d.vfsd.IsDead()

	d.inode.fs.mu.RLock()
	// The ordering below is important, Linux always notifies the parent first.
	if d.parent != nil {
		d.parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted)
	}
	d.inode.watches.Notify(ctx, "", events, cookie, et, deleted)
	d.inode.fs.mu.RUnlock()
}

// Watches implements vfs.DentryImpl.Watches.
func (d *dentry) Watches() *vfs.Watches {
	return &d.inode.watches
}

// OnZeroWatches implements vfs.Dentry.OnZeroWatches.
func (d *dentry) OnZeroWatches(context.Context) {}

// inode represents a filesystem object.
//
// +stateify savable
type inode struct {
	// fs is the owning filesystem. fs is immutable.
	fs *filesystem

	// A reference is held on all inodes as long as they are reachable in the
	// filesystem tree, i.e. nlink is nonzero. This reference is dropped when
	// nlink reaches 0.
	refs inodeRefs

	// xattrs implements extended attributes.
	//
	// TODO(b/148380782): Support xattrs other than user.*
	xattrs memxattr.SimpleExtendedAttributes

	// Inode metadata. Writing multiple fields atomically requires holding
	// mu, otherwise atomic operations can be used.
	mu    sync.Mutex `state:"nosave"`
	mode  uint32     // file type and mode
	nlink uint32     // protected by filesystem.mu instead of inode.mu
	uid   uint32     // auth.KUID, but stored as raw uint32 for sync/atomic
	gid   uint32     // auth.KGID, but ...
	ino   uint64     // immutable

	// Linux's tmpfs has no concept of btime.
	atime int64 // nanoseconds
	ctime int64 // nanoseconds
	mtime int64 // nanoseconds

	locks vfs.FileLocks

	// Inotify watches for this inode.
	watches vfs.Watches

	impl interface{} // immutable
}

const maxLinks = math.MaxUint32

func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) {
	if mode.FileType() == 0 {
		panic("file type is required in FileMode")
	}

	// Inherit the group and setgid bit as in fs/inode.c:inode_init_owner().
	if parentDir != nil && parentDir.inode.mode&linux.S_ISGID == linux.S_ISGID {
		kgid = auth.KGID(parentDir.inode.gid)
		if mode&linux.S_IFDIR == linux.S_IFDIR {
			mode |= linux.S_ISGID
		}
	}

	i.fs = fs
	i.mode = uint32(mode)
	i.uid = uint32(kuid)
	i.gid = uint32(kgid)
	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
	// Tmpfs creation sets atime, ctime, and mtime to current time.
	now := fs.clock.Now().Nanoseconds()
	i.atime = now
	i.ctime = now
	i.mtime = now
	// i.nlink initialized by caller
	i.impl = impl
	i.refs.InitRefs()
}

// incLinksLocked increments i's link count.
//
// Preconditions:
// * filesystem.mu must be locked for writing.
// * i.nlink != 0.
// * i.nlink < maxLinks.
func (i *inode) incLinksLocked() {
	if i.nlink == 0 {
		panic("tmpfs.inode.incLinksLocked() called with no existing links")
	}
	if i.nlink == maxLinks {
		panic("tmpfs.inode.incLinksLocked() called with maximum link count")
	}
	atomic.AddUint32(&i.nlink, 1)
}

// decLinksLocked decrements i's link count. If the link count reaches 0, we
// remove a reference on i as well.
//
// Preconditions:
// * filesystem.mu must be locked for writing.
// * i.nlink != 0.
func (i *inode) decLinksLocked(ctx context.Context) { if i.nlink == 0 { panic("tmpfs.inode.decLinksLocked() called with no existing links") } if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 { i.decRef(ctx) } } func (i *inode) incRef() { i.refs.IncRef() } func (i *inode) tryIncRef() bool { return i.refs.TryIncRef() } func (i *inode) decRef(ctx context.Context) { i.refs.DecRef(func() { i.watches.HandleDeletion(ctx) if regFile, ok := i.impl.(*regularFile); ok { // Release memory used by regFile to store data. Since regFile is // no longer usable, we don't need to grab any locks or update any // metadata. regFile.data.DropAll(regFile.memFile) } }) } func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { mode := linux.FileMode(atomic.LoadUint32(&i.mode)) return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) } // Go won't inline this function, and returning linux.Statx (which is quite // big) means spending a lot of time in runtime.duffcopy(), so instead it's an // output parameter. // // Note that Linux does not guarantee to return consistent data (in the case of // a concurrent modification), so we do not require holding inode.mu. func (i *inode) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME | linux.STATX_MTIME stat.Blksize = hostarch.PageSize stat.Nlink = atomic.LoadUint32(&i.nlink) stat.UID = atomic.LoadUint32(&i.uid) stat.GID = atomic.LoadUint32(&i.gid) stat.Mode = uint16(atomic.LoadUint32(&i.mode)) stat.Ino = i.ino stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&i.atime)) stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&i.ctime)) stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&i.mtime)) stat.DevMajor = linux.UNNAMED_MAJOR stat.DevMinor = i.fs.devMinor switch impl := i.impl.(type) { case *regularFile: stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS stat.Size = uint64(atomic.LoadUint64(&impl.size)) // TODO(jamieliu): This should be impl.data.Span() / 512, but this is // too expensive to compute here. Cache it in regularFile. stat.Blocks = allocatedBlocksForSize(stat.Size) case *directory: // "20" is mm/shmem.c:BOGO_DIRENT_SIZE. stat.Size = 20 * (2 + uint64(atomic.LoadInt64(&impl.numChildren))) // stat.Blocks is 0. case *symlink: stat.Size = uint64(len(impl.target)) // stat.Blocks is 0. case *namedPipe, *socketFile: // stat.Size and stat.Blocks are 0. case *deviceFile: // stat.Size and stat.Blocks are 0. 
stat.RdevMajor = impl.major stat.RdevMinor = impl.minor default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } } func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error { stat := &opts.Stat if stat.Mask == 0 { return nil } if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 { return linuxerr.EPERM } mode := linux.FileMode(atomic.LoadUint32(&i.mode)) if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { return err } i.mu.Lock() defer i.mu.Unlock() var ( needsMtimeBump bool needsCtimeBump bool ) clearSID := false mask := stat.Mask if mask&linux.STATX_SIZE != 0 { switch impl := i.impl.(type) { case *regularFile: updated, err := impl.truncateLocked(stat.Size) if err != nil { return err } if updated { clearSID = true needsMtimeBump = true needsCtimeBump = true } case *directory: return syserror.EISDIR default: return linuxerr.EINVAL } } if mask&linux.STATX_UID != 0 { atomic.StoreUint32(&i.uid, stat.UID) needsCtimeBump = true clearSID = true } if mask&linux.STATX_GID != 0 { atomic.StoreUint32(&i.gid, stat.GID) needsCtimeBump = true clearSID = true } if mask&linux.STATX_MODE != 0 { for { old := atomic.LoadUint32(&i.mode) ft := old & linux.S_IFMT newMode := ft | uint32(stat.Mode & ^uint16(linux.S_IFMT)) if clearSID { newMode = vfs.ClearSUIDAndSGID(newMode) } if swapped := atomic.CompareAndSwapUint32(&i.mode, old, newMode); swapped { clearSID = false break } } needsCtimeBump = true } now := i.fs.clock.Now().Nanoseconds() if mask&linux.STATX_ATIME != 0 { if stat.Atime.Nsec == linux.UTIME_NOW { atomic.StoreInt64(&i.atime, now) } else { atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped()) } needsCtimeBump = true } if mask&linux.STATX_MTIME != 0 { if stat.Mtime.Nsec == linux.UTIME_NOW { atomic.StoreInt64(&i.mtime, now) } else { atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) } needsCtimeBump = true // Ignore the mtime bump, since we just set it ourselves. needsMtimeBump = false } if mask&linux.STATX_CTIME != 0 { if stat.Ctime.Nsec == linux.UTIME_NOW { atomic.StoreInt64(&i.ctime, now) } else { atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped()) } // Ignore the ctime bump, since we just set it ourselves. needsCtimeBump = false } // We may have to clear the SUID/SGID bits, but didn't do so as part of // STATX_MODE. if clearSID { for { old := atomic.LoadUint32(&i.mode) newMode := vfs.ClearSUIDAndSGID(old) if swapped := atomic.CompareAndSwapUint32(&i.mode, old, newMode); swapped { break } } needsCtimeBump = true } if needsMtimeBump { atomic.StoreInt64(&i.mtime, now) } if needsCtimeBump { atomic.StoreInt64(&i.ctime, now) } return nil } // allocatedBlocksForSize returns the number of 512B blocks needed to // accommodate the given size in bytes, as appropriate for struct // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block // size is independent of the "preferred block size for I/O", struct // stat::st_blksize and struct statx::stx_blksize.) 
func allocatedBlocksForSize(size uint64) uint64 { return (size + 511) / 512 } func (i *inode) direntType() uint8 { switch impl := i.impl.(type) { case *regularFile: return linux.DT_REG case *directory: return linux.DT_DIR case *symlink: return linux.DT_LNK case *socketFile: return linux.DT_SOCK case *namedPipe: return linux.DT_FIFO case *deviceFile: switch impl.kind { case vfs.BlockDevice: return linux.DT_BLK case vfs.CharDevice: return linux.DT_CHR default: panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind)) } default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } } func (i *inode) isDir() bool { mode := linux.FileMode(atomic.LoadUint32(&i.mode)) return mode.FileType() == linux.S_IFDIR } func (i *inode) touchAtime(mnt *vfs.Mount) { if mnt.Flags.NoATime { return } if err := mnt.CheckBeginWrite(); err != nil { return } now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.atime, now) i.mu.Unlock() mnt.EndWrite() } // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). func (i *inode) touchCtime() { now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.ctime, now) i.mu.Unlock() } // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). func (i *inode) touchCMtime() { now := i.fs.clock.Now().Nanoseconds() i.mu.Lock() atomic.StoreInt64(&i.mtime, now) atomic.StoreInt64(&i.ctime, now) i.mu.Unlock() } // Preconditions: // * The caller has called vfs.Mount.CheckBeginWrite(). // * inode.mu must be locked. func (i *inode) touchCMtimeLocked() { now := i.fs.clock.Now().Nanoseconds() atomic.StoreInt64(&i.mtime, now) atomic.StoreInt64(&i.ctime, now) } func checkXattrName(name string) error { // Linux's tmpfs supports "security" and "trusted" xattr namespaces, and // (depending on build configuration) POSIX ACL xattr namespaces // ("system.posix_acl_access" and "system.posix_acl_default"). We don't // support POSIX ACLs or the "security" namespace (b/148380782). if strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { return nil } // We support the "user" namespace because we have tests that depend on // this feature. 
if strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { return nil } return linuxerr.EOPNOTSUPP } func (i *inode) listXattr(creds *auth.Credentials, size uint64) ([]string, error) { return i.xattrs.ListXattr(creds, size) } func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { if err := checkXattrName(opts.Name); err != nil { return "", err } mode := linux.FileMode(atomic.LoadUint32(&i.mode)) kuid := auth.KUID(atomic.LoadUint32(&i.uid)) kgid := auth.KGID(atomic.LoadUint32(&i.gid)) if err := vfs.GenericCheckPermissions(creds, vfs.MayRead, mode, kuid, kgid); err != nil { return "", err } return i.xattrs.GetXattr(creds, mode, kuid, opts) } func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error { if err := checkXattrName(opts.Name); err != nil { return err } mode := linux.FileMode(atomic.LoadUint32(&i.mode)) kuid := auth.KUID(atomic.LoadUint32(&i.uid)) kgid := auth.KGID(atomic.LoadUint32(&i.gid)) if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil { return err } return i.xattrs.SetXattr(creds, mode, kuid, opts) } func (i *inode) removeXattr(creds *auth.Credentials, name string) error { if err := checkXattrName(name); err != nil { return err } mode := linux.FileMode(atomic.LoadUint32(&i.mode)) kuid := auth.KUID(atomic.LoadUint32(&i.uid)) kgid := auth.KGID(atomic.LoadUint32(&i.gid)) if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil { return err } return i.xattrs.RemoveXattr(creds, mode, kuid, name) } // fileDescription is embedded by tmpfs implementations of // vfs.FileDescriptionImpl. // // +stateify savable type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD } func (fd *fileDescription) filesystem() *filesystem { return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } func (fd *fileDescription) dentry() *dentry { return fd.vfsfd.Dentry().Impl().(*dentry) } func (fd *fileDescription) inode() *inode { return fd.dentry().inode } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx fd.inode().statTo(&stat) return stat, nil } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) d := fd.dentry() if err := d.inode.setStat(ctx, creds, &opts); err != nil { return err } if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) } return nil } // StatFS implements vfs.FileDescriptionImpl.StatFS. func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { return globalStatfs, nil } // ListXattr implements vfs.FileDescriptionImpl.ListXattr. func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { return fd.inode().listXattr(auth.CredentialsFromContext(ctx), size) } // GetXattr implements vfs.FileDescriptionImpl.GetXattr. func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts) } // SetXattr implements vfs.FileDescriptionImpl.SetXattr. 
func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { d := fd.dentry() if err := d.inode.setXattr(auth.CredentialsFromContext(ctx), &opts); err != nil { return err } // Generate inotify events. d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { d := fd.dentry() if err := d.inode.removeXattr(auth.CredentialsFromContext(ctx), name); err != nil { return err } // Generate inotify events. d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) return nil } // Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all // filesystem state is in-memory. func (*fileDescription) Sync(context.Context) error { return nil }
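// Example (illustrative only): mounting a tmpfs instance directly via
// GetFilesystem. This sketch assumes ctx carries a pgalloc.MemoryFileProvider
// and a realtime clock, as GetFilesystem requires; the helper name and the
// option string are hypothetical.
func mountTmpfsExample(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*vfs.Filesystem, *vfs.Dentry, error) {
	opts := vfs.GetFilesystemOptions{
		// Parsed by GetFilesystem above: root mode 0700, owned by UID/GID 1000.
		Data: "mode=0700,uid=1000,gid=1000",
	}
	return FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "" /* source */, opts)
}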
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hostfd

import (
	"io"
	"unsafe"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/safemem"
)

const (
	sizeofIovec  = unsafe.Sizeof(unix.Iovec{})
	sizeofMsghdr = unsafe.Sizeof(unix.Msghdr{})
)

// Preadv2 reads up to dsts.NumBytes() bytes from host file descriptor fd into
// dsts. offset and flags are interpreted as for preadv2(2).
//
// Preconditions: !dsts.IsEmpty().
func Preadv2(fd int32, dsts safemem.BlockSeq, offset int64, flags uint32) (uint64, error) {
	// No buffering is necessary regardless of safecopy; host syscalls will
	// return EFAULT if appropriate, instead of raising SIGBUS.
	var (
		n uintptr
		e unix.Errno
	)
	if flags == 0 && dsts.NumBlocks() == 1 {
		// Use read() or pread() to avoid iovec allocation and copying.
		dst := dsts.Head()
		if offset == -1 {
			n, _, e = unix.Syscall(unix.SYS_READ, uintptr(fd), dst.Addr(), uintptr(dst.Len()))
		} else {
			n, _, e = unix.Syscall6(unix.SYS_PREAD64, uintptr(fd), dst.Addr(), uintptr(dst.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */)
		}
	} else {
		iovs := safemem.IovecsFromBlockSeq(dsts)
		if len(iovs) > MaxReadWriteIov {
			log.Debugf("hostfd.Preadv2: truncating from %d iovecs to %d", len(iovs), MaxReadWriteIov)
			iovs = iovs[:MaxReadWriteIov]
		}
		n, _, e = unix.Syscall6(unix.SYS_PREADV2, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, uintptr(flags))
	}
	if e != 0 {
		return 0, e
	}
	if n == 0 {
		return 0, io.EOF
	}
	return uint64(n), nil
}

// Pwritev2 writes up to srcs.NumBytes() from srcs into host file descriptor
// fd. offset and flags are interpreted as for pwritev2(2).
//
// Preconditions: !srcs.IsEmpty().
func Pwritev2(fd int32, srcs safemem.BlockSeq, offset int64, flags uint32) (uint64, error) {
	// No buffering is necessary regardless of safecopy; host syscalls will
	// return EFAULT if appropriate, instead of raising SIGBUS.
	var (
		n uintptr
		e unix.Errno
	)
	if flags == 0 && srcs.NumBlocks() == 1 {
		// Use write() or pwrite() to avoid iovec allocation and copying.
		src := srcs.Head()
		if offset == -1 {
			n, _, e = unix.Syscall(unix.SYS_WRITE, uintptr(fd), src.Addr(), uintptr(src.Len()))
		} else {
			n, _, e = unix.Syscall6(unix.SYS_PWRITE64, uintptr(fd), src.Addr(), uintptr(src.Len()), uintptr(offset), 0 /* pos_h */, 0 /* unused */)
		}
	} else {
		iovs := safemem.IovecsFromBlockSeq(srcs)
		if len(iovs) > MaxReadWriteIov {
			log.Debugf("hostfd.Pwritev2: truncating from %d iovecs to %d", len(iovs), MaxReadWriteIov)
			iovs = iovs[:MaxReadWriteIov]
		}
		n, _, e = unix.Syscall6(unix.SYS_PWRITEV2, uintptr(fd), uintptr((unsafe.Pointer)(&iovs[0])), uintptr(len(iovs)), uintptr(offset), 0 /* pos_h */, uintptr(flags))
	}
	if e != 0 {
		return 0, e
	}
	return uint64(n), nil
}
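// Example (illustrative only): reading from a host fd into an ordinary byte
// slice. safemem.BlockFromSafeSlice wraps buf as a single Block, so the
// single-Block fast path in Preadv2 above reduces this to a plain
// read(2)/pread(2). The helper name is hypothetical; buf must be non-empty
// per the precondition on Preadv2.
func readAtExample(fd int32, buf []byte, off int64) (uint64, error) {
	dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))
	return Preadv2(fd, dst, off, 0 /* flags */)
}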
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package p9

import (
	"errors"
	"fmt"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/flipcall"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/pool"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/unet"
)

// ErrOutOfTags indicates no tags are available.
var ErrOutOfTags = errors.New("out of tags -- messages lost?")

// ErrOutOfFIDs indicates no more FIDs are available.
var ErrOutOfFIDs = errors.New("out of FIDs -- messages lost?")

// ErrUnexpectedTag indicates a response with an unexpected tag was received.
var ErrUnexpectedTag = errors.New("unexpected tag in response")

// ErrVersionsExhausted indicates that all versions to negotiate have been exhausted.
var ErrVersionsExhausted = errors.New("exhausted all versions to negotiate") // ErrBadVersionString indicates that the version string is malformed or unsupported. var ErrBadVersionString = errors.New("bad version string") // ErrBadResponse indicates the response didn't match the request. type ErrBadResponse struct { Got MsgType Want MsgType } // Error returns a highly descriptive error. func (e *ErrBadResponse) Error() string { return fmt.Sprintf("unexpected message type: got %v, want %v", e.Got, e.Want) } // response is the asynchronous return from recv. // // This is used in the pending map below. type response struct { r message done chan error } var responsePool = sync.Pool{ New: func() interface{} { return &response{ done: make(chan error, 1), } }, } // Client is at least a 9P2000.L client. type Client struct { // socket is the connected socket. socket *unet.Socket // tagPool is the collection of available tags. tagPool pool.Pool // fidPool is the collection of available fids. fidPool pool.Pool // messageSize is the maximum total size of a message. messageSize uint32 // payloadSize is the maximum payload size of a read or write. // // For large reads and writes this means that the read or write is // broken up into buffer-size/payloadSize requests. payloadSize uint32 // version is the agreed upon version X of 9P2000.L.Google.X. // version 0 implies 9P2000.L. version uint32 // closedWg is marked as done when the Client.watch() goroutine, which is // responsible for closing channels and the socket fd, returns. closedWg sync.WaitGroup // sendRecv is the transport function. // // This is determined dynamically based on whether or not the server // supports flipcall channels (preferred as it is faster and more // efficient, and does not require tags). sendRecv func(message, message) error // -- below corresponds to sendRecvChannel -- // channelsMu protects channels. channelsMu sync.Mutex // channelsWg counts the number of channels for which channel.active == // true. channelsWg sync.WaitGroup // channels is the set of all initialized channels. channels []*channel // availableChannels is a FIFO of inactive channels. availableChannels []*channel // -- below corresponds to sendRecvLegacy -- // pending is the set of pending messages. pending map[Tag]*response pendingMu sync.Mutex // sendMu is the lock for sending a request. sendMu sync.Mutex // recvr is essentially a mutex for calling recv. // // Whoever writes to this channel is permitted to call recv. When // finished calling recv, this channel should be emptied. recvr chan bool } // NewClient creates a new client. It performs a Tversion exchange with // the server to assert that messageSize is ok to use. // // If NewClient succeeds, ownership of socket is transferred to the new Client. func NewClient(socket *unet.Socket, messageSize uint32, version string) (*Client, error) { // Need at least one byte of payload. if messageSize <= msgRegistry.largestFixedSize { return nil, &ErrMessageTooLarge{ size: messageSize, msize: msgRegistry.largestFixedSize, } } // Compute a payload size and round to 512 (normal block size) // if it's larger than a single block. 
payloadSize := messageSize - msgRegistry.largestFixedSize if payloadSize > 512 && payloadSize%512 != 0 { payloadSize -= (payloadSize % 512) } c := &Client{ socket: socket, tagPool: pool.Pool{Start: 1, Limit: uint64(NoTag)}, fidPool: pool.Pool{Start: 1, Limit: uint64(NoFID)}, pending: make(map[Tag]*response), recvr: make(chan bool, 1), messageSize: messageSize, payloadSize: payloadSize, } // Agree upon a version. requested, ok := parseVersion(version) if !ok { return nil, ErrBadVersionString } for { // Always exchange the version using the legacy version of the // protocol. If the protocol supports flipcall, then we switch // our sendRecv function to use that functionality. Otherwise, // we stick to sendRecvLegacy. rversion := Rversion{} _, err := c.sendRecvLegacy(&Tversion{ Version: versionString(requested), MSize: messageSize, }, &rversion) // The server told us to try again with a lower version. if err == unix.EAGAIN { if requested == lowestSupportedVersion { return nil, ErrVersionsExhausted } requested-- continue } // We requested an impossible version or our other parameters were bogus. if err != nil { return nil, err } // Parse the version. version, ok := parseVersion(rversion.Version) if !ok { // The server gave us a bad version. We return a generically worrisome error. log.Warningf("server returned bad version string %q", rversion.Version) return nil, ErrBadVersionString } c.version = version break } // Can we switch to use the more advanced channels and create // independent channels for communication? Prefer it if possible. if versionSupportsFlipcall(c.version) { // Attempt to initialize IPC-based communication. for i := 0; i < channelsPerClient; i++ { if err := c.openChannel(i); err != nil { log.Warningf("error opening flipcall channel: %v", err) break // Stop. } } if len(c.channels) >= 1 { // At least one channel created. c.sendRecv = c.sendRecvChannel } else { // Channel setup failed; fallback. c.sendRecv = c.sendRecvLegacySyscallErr } } else { // No channels available: use the legacy mechanism. c.sendRecv = c.sendRecvLegacySyscallErr } // Ensure that the socket and channels are closed when the socket is shut // down. c.closedWg.Add(1) go c.watch(socket) // S/R-SAFE: not relevant. return c, nil } // watch watches the given socket and releases resources on hangup events. // // This is intended to be called as a goroutine. func (c *Client) watch(socket *unet.Socket) { defer c.closedWg.Done() events := []unix.PollFd{ { Fd: int32(socket.FD()), Events: unix.POLLHUP | unix.POLLRDHUP, }, } // Wait for a shutdown event. for { n, err := unix.Ppoll(events, nil, nil) if err == unix.EINTR || err == unix.EAGAIN { continue } if err != nil { log.Warningf("p9.Client.watch(): %v", err) break } if n != 1 { log.Warningf("p9.Client.watch(): got %d events, wanted 1", n) } break } // Set availableChannels to nil so that future calls to c.sendRecvChannel() // don't attempt to activate a channel, and concurrent calls to // c.sendRecvChannel() don't mark released channels as available. c.channelsMu.Lock() c.availableChannels = nil // Shut down all active channels. for _, ch := range c.channels { if ch.active { log.Debugf("shutting down active channel@%p...", ch) ch.Shutdown() } } c.channelsMu.Unlock() // Wait for active channels to become inactive. c.channelsWg.Wait() // Close all channels. c.channelsMu.Lock() for _, ch := range c.channels { ch.Close() } c.channelsMu.Unlock() // Close the main socket. c.socket.Close() } // openChannel attempts to open a client channel. 
//
// Note that this function returns naked errors which should not be propagated
// directly to a caller. It is expected that the errors will be logged and a
// fallback path will be used instead.
func (c *Client) openChannel(id int) error {
	var (
		rchannel0 Rchannel
		rchannel1 Rchannel
		res       = new(channel)
	)

	// Open the data channel.
	if _, err := c.sendRecvLegacy(&Tchannel{
		ID:      uint32(id),
		Control: 0,
	}, &rchannel0); err != nil {
		return fmt.Errorf("error handling Tchannel message: %v", err)
	}
	if rchannel0.FilePayload() == nil {
		return fmt.Errorf("missing file descriptor on primary channel")
	}

	// We don't need to hold this.
	defer rchannel0.FilePayload().Close()

	// Open the channel for file descriptors.
	if _, err := c.sendRecvLegacy(&Tchannel{
		ID:      uint32(id),
		Control: 1,
	}, &rchannel1); err != nil {
		return err
	}
	if rchannel1.FilePayload() == nil {
		return fmt.Errorf("missing file descriptor on file descriptor channel")
	}

	// Construct the endpoints.
	res.desc = flipcall.PacketWindowDescriptor{
		FD:     rchannel0.FilePayload().FD(),
		Offset: int64(rchannel0.Offset),
		Length: int(rchannel0.Length),
	}
	if err := res.data.Init(flipcall.ClientSide, res.desc); err != nil {
		rchannel1.FilePayload().Close()
		return err
	}

	// The fds channel owns the control payload, and it will be closed when
	// the channel object is closed.
	res.fds.Init(rchannel1.FilePayload().Release())

	// Save the channel.
	c.channelsMu.Lock()
	defer c.channelsMu.Unlock()
	c.channels = append(c.channels, res)
	c.availableChannels = append(c.availableChannels, res)
	return nil
}

// handleOne handles a single incoming message.
//
// This should only be called with the token from recvr. Note that the received
// tag will automatically be cleared from pending.
func (c *Client) handleOne() {
	tag, r, err := recv(c.socket, c.messageSize, func(tag Tag, t MsgType) (message, error) {
		c.pendingMu.Lock()
		resp := c.pending[tag]
		c.pendingMu.Unlock()

		// Not expecting this message?
		if resp == nil {
			log.Warningf("client received unexpected tag %v, ignoring", tag)
			return nil, ErrUnexpectedTag
		}

		// Is it an error? We specifically allow this to
		// go through, and then we deserialize below.
		if t == MsgRlerror {
			return &Rlerror{}, nil
		}

		// Does it match expectations?
		if t != resp.r.Type() {
			return nil, &ErrBadResponse{Got: t, Want: resp.r.Type()}
		}

		// Return the response.
		return resp.r, nil
	})

	if err != nil {
		// No tag was extracted (probably a socket error).
		//
		// Likely catastrophic. Notify all waiters and clear pending.
		c.pendingMu.Lock()
		for _, resp := range c.pending {
			resp.done <- err
		}
		c.pending = make(map[Tag]*response)
		c.pendingMu.Unlock()
	} else {
		// Process the tag.
		//
		// We know that it is contained in the map because our lookup function
		// above must have succeeded (found the tag) to return nil err.
		c.pendingMu.Lock()
		resp := c.pending[tag]
		delete(c.pending, tag)
		c.pendingMu.Unlock()
		resp.r = r
		resp.done <- err
	}
}

// waitAndRecv co-ordinates with other receivers to handle responses.
func (c *Client) waitAndRecv(done chan error) error {
	for {
		select {
		case err := <-done:
			return err
		case c.recvr <- true:
			select {
			case err := <-done:
				// It's possible that we got the token, despite
				// done also being available. Check for that.
				<-c.recvr
				return err
			default:
				// Handle receiving one tag.
				c.handleOne()

				// Return the token.
				<-c.recvr
			}
		}
	}
}

// sendRecvLegacySyscallErr is a wrapper for sendRecvLegacy that converts all
// non-syscall errors to EIO.
func (c *Client) sendRecvLegacySyscallErr(t message, r message) error {
	received, err := c.sendRecvLegacy(t, r)
	if !received {
		log.Warningf("p9.Client.sendRecvLegacy: %v", err)
		return unix.EIO
	}
	return err
}

// sendRecvLegacy performs a roundtrip message exchange.
//
// sendRecvLegacy returns true if a message was received. This allows us to
// differentiate between failed receives and successful receives where the
// response was an error message.
//
// This is called by internal functions.
func (c *Client) sendRecvLegacy(t message, r message) (bool, error) {
	tag, ok := c.tagPool.Get()
	if !ok {
		return false, ErrOutOfTags
	}
	defer c.tagPool.Put(tag)

	// Indicate we're expecting a response.
	//
	// Note that the tag will be cleared from pending
	// automatically (see handleOne for details).
	resp := responsePool.Get().(*response)
	defer responsePool.Put(resp)
	resp.r = r
	c.pendingMu.Lock()
	c.pending[Tag(tag)] = resp
	c.pendingMu.Unlock()

	// Send the request over the wire.
	c.sendMu.Lock()
	err := send(c.socket, Tag(tag), t)
	c.sendMu.Unlock()
	if err != nil {
		return false, err
	}

	// Co-ordinate with other receivers.
	if err := c.waitAndRecv(resp.done); err != nil {
		return false, err
	}

	// Is it an error message?
	//
	// For convenience, we transform these directly
	// into errors. Handlers need not handle this case.
	if rlerr, ok := resp.r.(*Rlerror); ok {
		return true, unix.Errno(rlerr.Error)
	}

	// At this point, we know it matches.
	//
	// Per recv call above, we will only allow a type
	// match (and give our r) or an instance of Rlerror.
	return true, nil
}

// sendRecvChannel uses channels to send a message.
func (c *Client) sendRecvChannel(t message, r message) error {
	// Acquire an available channel.
	c.channelsMu.Lock()
	if len(c.availableChannels) == 0 {
		c.channelsMu.Unlock()
		return c.sendRecvLegacySyscallErr(t, r)
	}
	idx := len(c.availableChannels) - 1
	ch := c.availableChannels[idx]
	c.availableChannels = c.availableChannels[:idx]
	ch.active = true
	c.channelsWg.Add(1)
	c.channelsMu.Unlock()

	// Ensure that it's connected.
	if !ch.connected {
		ch.connected = true
		if err := ch.data.Connect(); err != nil {
			// The channel is unusable, so don't return it to
			// c.availableChannels. However, we still have to mark it as
			// inactive so c.watch() doesn't wait for it.
			c.channelsMu.Lock()
			ch.active = false
			c.channelsMu.Unlock()
			c.channelsWg.Done()
			// Map all transport errors to EIO, but ensure that the real error
			// is logged.
			log.Warningf("p9.Client.sendRecvChannel: flipcall.Endpoint.Connect: %v", err)
			return unix.EIO
		}
	}

	// Send the request and receive the server's response.
	rsz, err := ch.send(t)
	if err != nil {
		// See above.
		c.channelsMu.Lock()
		ch.active = false
		c.channelsMu.Unlock()
		c.channelsWg.Done()
		log.Warningf("p9.Client.sendRecvChannel: p9.channel.send: %v", err)
		return unix.EIO
	}

	// Parse the server's response.
	resp, retErr := ch.recv(r, rsz)
	if resp == nil {
		log.Warningf("p9.Client.sendRecvChannel: p9.channel.recv: %v", retErr)
		retErr = unix.EIO
	}

	// Release the channel.
	c.channelsMu.Lock()
	ch.active = false
	// If c.availableChannels is nil, c.watch() has fired and we should not
	// mark this channel as available.
	if c.availableChannels != nil {
		c.availableChannels = append(c.availableChannels, ch)
	}
	c.channelsMu.Unlock()
	c.channelsWg.Done()

	return retErr
}

// Version returns the negotiated 9P2000.L.Google version number.
func (c *Client) Version() uint32 {
	return c.version
}

// Close closes the underlying socket and channels.
func (c *Client) Close() { // unet.Socket.Shutdown() has no effect if unet.Socket.Close() has already // been called (by c.watch()). if err := c.socket.Shutdown(); err != nil { log.Warningf("Socket.Shutdown() failed (FD: %d): %v", c.socket.FD(), err) } c.closedWg.Wait() }
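// Example (editor's sketch, not part of the original source): a minimal
// sketch of constructing and tearing down a client, assuming the NewClient
// constructor whose tail appears above (socket, message size, version string)
// and assuming unet.Connect dials a Unix domain socket; the socket path and
// version string below are illustrative only.
//
//	sock, err := unet.Connect("/tmp/9p.sock", false /* packet */)
//	if err != nil {
//		log.Warningf("dial: %v", err)
//		return
//	}
//	// 1 MiB maximum message size; the server may negotiate the version
//	// downward (see the EAGAIN retry loop in the constructor above).
//	client, err := p9.NewClient(sock, 1024*1024, "9P2000.L.Google.9")
//	if err != nil {
//		log.Warningf("version negotiation: %v", err)
//		return
//	}
//	defer client.Close() // Also shuts down any flipcall channels via watch().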
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
)

// ResolvingPath represents the state of an in-progress path resolution, shared
// between VFS and FilesystemImpl methods that take a path.
//
// From the perspective of FilesystemImpl methods, a ResolvingPath represents a
// starting Dentry on the associated Filesystem (on which a reference is
// already held), a stream of path components relative to that Dentry, and
// elements of the invoking Context that are commonly required by
// FilesystemImpl methods.
//
// ResolvingPath is loosely analogous to Linux's struct nameidata.
//
// +stateify savable
type ResolvingPath struct {
	vfs   *VirtualFilesystem
	root  VirtualDentry // refs borrowed from PathOperation
	mount *Mount
	start *Dentry
	pit   fspath.Iterator

	flags     uint16
	mustBeDir bool  // final file must be a directory?
	symlinks  uint8 // number of symlinks traversed
	curPart   uint8 // index into parts

	creds *auth.Credentials

	// Data associated with resolve*Errors, stored in ResolvingPath so that
	// those errors don't need to allocate.
	nextMount        *Mount  // ref held if not nil
	nextStart        *Dentry // ref held if not nil
	absSymlinkTarget fspath.Path

	// parts tracks the relative paths being resolved; it is updated whenever
	// a relative symlink is encountered.
	parts [1 + linux.MaxSymlinkTraversals]fspath.Iterator
}

const (
	rpflagsHaveMountRef       = 1 << iota // do we hold a reference on mount?
	rpflagsHaveStartRef                   // do we hold a reference on start?
	rpflagsFollowFinalSymlink             // same as PathOperation.FollowFinalSymlink
)

func init() {
	if maxParts := len(ResolvingPath{}.parts); maxParts > 255 {
		panic(fmt.Sprintf("uint8 is insufficient to accommodate len(ResolvingPath.parts) (%d)", maxParts))
	}
}

// Error types that communicate state from the FilesystemImpl-caller,
// VFS-callee side of path resolution (i.e. errors returned by
// ResolvingPath.Resolve*()) to the VFS-caller, FilesystemImpl-callee side
// (i.e. VFS methods => ResolvingPath.handleError()). These are empty structs
// rather than error values because Go doesn't support non-primitive constants,
// so error "constants" are really mutable vars, necessitating somewhat
// expensive interface object comparisons.

// +stateify savable
type resolveMountRootOrJumpError struct{}

// Error implements error.Error.
func (resolveMountRootOrJumpError) Error() string {
	return "resolving mount root or jump"
}

// +stateify savable
type resolveMountPointError struct{}

// Error implements error.Error.
func (resolveMountPointError) Error() string {
	return "resolving mount point"
}

// +stateify savable
type resolveAbsSymlinkError struct{}

// Error implements error.Error.
func (resolveAbsSymlinkError) Error() string {
	return "resolving absolute symlink"
}

var resolvingPathPool = sync.Pool{
	New: func() interface{} {
		return &ResolvingPath{}
	},
}

// getResolvingPath gets a new ResolvingPath from the pool. Caller must call
// ResolvingPath.Release() when done.
func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) *ResolvingPath {
	rp := resolvingPathPool.Get().(*ResolvingPath)
	rp.vfs = vfs
	rp.root = pop.Root
	rp.mount = pop.Start.mount
	rp.start = pop.Start.dentry
	rp.pit = pop.Path.Begin
	rp.flags = 0
	if pop.FollowFinalSymlink {
		rp.flags |= rpflagsFollowFinalSymlink
	}
	rp.mustBeDir = pop.Path.Dir
	rp.symlinks = 0
	rp.curPart = 0
	rp.creds = creds
	rp.parts[0] = pop.Path.Begin
	return rp
}

// Copy creates another ResolvingPath with the same state as the original.
// Copies are independent: using the copy does not change the original, and
// vice versa.
//
// Caller must call Release() when done.
func (rp *ResolvingPath) Copy() *ResolvingPath {
	copy := resolvingPathPool.Get().(*ResolvingPath)
	*copy = *rp // All fields are shallow copyable.

	// Take extra references for the copy if the original held them.
	if copy.flags&rpflagsHaveStartRef != 0 {
		copy.start.IncRef()
	}
	if copy.flags&rpflagsHaveMountRef != 0 {
		copy.mount.IncRef()
	}
	// Reset error state.
	copy.nextStart = nil
	copy.nextMount = nil
	return copy
}

// Release decrements references if needed and returns the object to the pool.
func (rp *ResolvingPath) Release(ctx context.Context) { rp.root = VirtualDentry{} rp.decRefStartAndMount(ctx) rp.mount = nil rp.start = nil rp.releaseErrorState(ctx) resolvingPathPool.Put(rp) } func (rp *ResolvingPath) decRefStartAndMount(ctx context.Context) { if rp.flags&rpflagsHaveStartRef != 0 { rp.start.DecRef(ctx) } if rp.flags&rpflagsHaveMountRef != 0 { rp.mount.DecRef(ctx) } } func (rp *ResolvingPath) releaseErrorState(ctx context.Context) { if rp.nextStart != nil { rp.nextStart.DecRef(ctx) rp.nextStart = nil } if rp.nextMount != nil { rp.nextMount.DecRef(ctx) rp.nextMount = nil } } // VirtualFilesystem returns the containing VirtualFilesystem. func (rp *ResolvingPath) VirtualFilesystem() *VirtualFilesystem { return rp.vfs } // Credentials returns the credentials of rp's provider. func (rp *ResolvingPath) Credentials() *auth.Credentials { return rp.creds } // Mount returns the Mount on which path resolution is currently occurring. It // does not take a reference on the returned Mount. func (rp *ResolvingPath) Mount() *Mount { return rp.mount } // Start returns the starting Dentry represented by rp. It does not take a // reference on the returned Dentry. func (rp *ResolvingPath) Start() *Dentry { return rp.start } // Done returns true if there are no remaining path components in the stream // represented by rp. func (rp *ResolvingPath) Done() bool { // We don't need to check for rp.curPart == 0 because rp.Advance() won't // set rp.pit to a terminal iterator otherwise. return !rp.pit.Ok() } // Final returns true if there is exactly one remaining path component in the // stream represented by rp. // // Preconditions: !rp.Done(). func (rp *ResolvingPath) Final() bool { return rp.curPart == 0 && !rp.pit.NextOk() } // Component returns the current path component in the stream represented by // rp. // // Preconditions: !rp.Done(). func (rp *ResolvingPath) Component() string { if checkInvariants { if !rp.pit.Ok() { panic("ResolvingPath.Component() called at end of relative path") } } return rp.pit.String() } // Advance advances the stream of path components represented by rp. // // Preconditions: !rp.Done(). func (rp *ResolvingPath) Advance() { if checkInvariants { if !rp.pit.Ok() { panic("ResolvingPath.Advance() called at end of relative path") } } next := rp.pit.Next() if next.Ok() || rp.curPart == 0 { // have next component, or at end of path rp.pit = next } else { // at end of path segment, continue with next one rp.curPart-- rp.pit = rp.parts[rp.curPart] } } // CheckRoot is called before resolving the parent of the Dentry d. If the // Dentry is contextually a VFS root, such that path resolution should treat // d's parent as itself, CheckRoot returns (true, nil). If the Dentry is the // root of a non-root mount, such that path resolution should switch to another // Mount, CheckRoot returns (unspecified, non-nil error). Otherwise, path // resolution should resolve d's parent normally, and CheckRoot returns (false, // nil). func (rp *ResolvingPath) CheckRoot(ctx context.Context, d *Dentry) (bool, error) { if d == rp.root.dentry && rp.mount == rp.root.mount { // At contextual VFS root (due to e.g. chroot(2)). return true, nil } else if d == rp.mount.root { // At mount root ... vd := rp.vfs.getMountpointAt(ctx, rp.mount, rp.root) if vd.Ok() { // ... of non-root mount. rp.nextMount = vd.mount rp.nextStart = vd.dentry return false, resolveMountRootOrJumpError{} } // ... of root mount. 
		return true, nil
	}
	return false, nil
}

// CheckMount is called after resolving the parent or child of another Dentry
// to d. If d is a mount point, such that path resolution should switch to
// another Mount, CheckMount returns a non-nil error. Otherwise, CheckMount
// returns nil.
func (rp *ResolvingPath) CheckMount(ctx context.Context, d *Dentry) error {
	if !d.isMounted() {
		return nil
	}
	if mnt := rp.vfs.getMountAt(ctx, rp.mount, d); mnt != nil {
		rp.nextMount = mnt
		return resolveMountPointError{}
	}
	return nil
}

// ShouldFollowSymlink returns true if, supposing that the current path
// component in rp represents a symbolic link, the symbolic link should be
// followed.
//
// If path is terminated with '/', the '/' is considered the last element and
// any symlink before that is followed:
// - For most non-creating walks, the last path component is handled by
//   fs/namei.c:lookup_last(), which sets LOOKUP_FOLLOW if the first byte
//   after the path component is non-NULL (which is only possible if it's '/')
//   and the path component is of type LAST_NORM.
//
// - For open/openat/openat2 without O_CREAT, the last path component is
//   handled by fs/namei.c:do_last(), which does the same, though without the
//   LAST_NORM check.
//
// Preconditions: !rp.Done().
func (rp *ResolvingPath) ShouldFollowSymlink() bool {
	// Non-final symlinks are always followed. Paths terminated with '/' are also
	// always followed.
	return rp.flags&rpflagsFollowFinalSymlink != 0 || !rp.Final() || rp.MustBeDir()
}

// HandleSymlink is called when the current path component is a symbolic link
// to the given target. If the calling Filesystem method should continue path
// traversal, HandleSymlink updates the path component stream to reflect the
// symlink target and returns nil. Otherwise it returns a non-nil error.
//
// Preconditions: !rp.Done().
//
// Postconditions: If HandleSymlink returns a nil error, then !rp.Done().
func (rp *ResolvingPath) HandleSymlink(target string) error {
	if rp.symlinks >= linux.MaxSymlinkTraversals {
		return linuxerr.ELOOP
	}
	if len(target) == 0 {
		return syserror.ENOENT
	}
	rp.symlinks++
	targetPath := fspath.Parse(target)
	if targetPath.Absolute {
		rp.absSymlinkTarget = targetPath
		return resolveAbsSymlinkError{}
	}
	// Consume the path component that represented the symlink.
	rp.Advance()
	// Prepend the symlink target to the relative path.
	if checkInvariants {
		if !targetPath.HasComponents() {
			panic(fmt.Sprintf("non-empty pathname %q parsed to relative path with no components", target))
		}
	}
	rp.relpathPrepend(targetPath)
	return nil
}

// Preconditions: path.HasComponents().
func (rp *ResolvingPath) relpathPrepend(path fspath.Path) {
	if rp.pit.Ok() {
		rp.parts[rp.curPart] = rp.pit
		rp.pit = path.Begin
		rp.curPart++
	} else {
		// The symlink was the final path component, so now the symlink target
		// is the whole path.
		rp.pit = path.Begin
		// Symlink targets can set rp.mustBeDir (if they end in a trailing /),
		// but can't unset it.
		if path.Dir {
			rp.mustBeDir = true
		}
	}
}

// HandleJump is called when the current path component is a "magic" link to
// the given VirtualDentry, like /proc/[pid]/fd/[fd]. If the calling Filesystem
// method should continue path traversal, HandleJump updates the path component
// stream to reflect the magic link target and returns nil. Otherwise it
// returns a non-nil error.
//
// Preconditions: !rp.Done().
func (rp *ResolvingPath) HandleJump(target VirtualDentry) error { if rp.symlinks >= linux.MaxSymlinkTraversals { return linuxerr.ELOOP } rp.symlinks++ // Consume the path component that represented the magic link. rp.Advance() // Unconditionally return a resolveMountRootOrJumpError, even if the Mount // isn't changing, to force restarting at the new Dentry. target.IncRef() rp.nextMount = target.mount rp.nextStart = target.dentry return resolveMountRootOrJumpError{} } func (rp *ResolvingPath) handleError(ctx context.Context, err error) bool { switch err.(type) { case resolveMountRootOrJumpError: // Switch to the new Mount. We hold references on the Mount and Dentry. rp.decRefStartAndMount(ctx) rp.mount = rp.nextMount rp.start = rp.nextStart rp.flags |= rpflagsHaveMountRef | rpflagsHaveStartRef rp.nextMount = nil rp.nextStart = nil // Don't consume the path component that caused us to traverse // through the mount root - i.e. the ".." - because we still need to // resolve the mount point's parent in the new FilesystemImpl. // // Restart path resolution on the new Mount. Don't bother calling // rp.releaseErrorState() since we already set nextMount and nextStart // to nil above. return true case resolveMountPointError: // Switch to the new Mount. We hold a reference on the Mount, but // borrow the reference on the mount root from the Mount. rp.decRefStartAndMount(ctx) rp.mount = rp.nextMount rp.start = rp.nextMount.root rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef rp.nextMount = nil // Consume the path component that represented the mount point. rp.Advance() // Restart path resolution on the new Mount. rp.releaseErrorState(ctx) return true case resolveAbsSymlinkError: // Switch to the new Mount. References are borrowed from rp.root. rp.decRefStartAndMount(ctx) rp.mount = rp.root.mount rp.start = rp.root.dentry rp.flags &^= rpflagsHaveMountRef | rpflagsHaveStartRef // Consume the path component that represented the symlink. rp.Advance() // Prepend the symlink target to the relative path. rp.relpathPrepend(rp.absSymlinkTarget) // Restart path resolution on the new Mount. rp.releaseErrorState(ctx) return true default: // Not an error we can handle. return false } } // canHandleError returns true if err is an error returned by rp.Resolve*() // that rp.handleError() may attempt to handle. func (rp *ResolvingPath) canHandleError(err error) bool { switch err.(type) { case resolveMountRootOrJumpError, resolveMountPointError, resolveAbsSymlinkError: return true default: return false } } // MustBeDir returns true if the file traversed by rp must be a directory. func (rp *ResolvingPath) MustBeDir() bool { return rp.mustBeDir }
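// Example (editor's sketch, not part of the original source): the shape of a
// FilesystemImpl walk loop driven by a ResolvingPath. The dentry type and its
// lookup/isSymlink/target members are hypothetical; Done, Component, Advance,
// ShouldFollowSymlink and HandleSymlink are the methods defined above.
//
//	func walk(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
//		for !rp.Done() {
//			name := rp.Component()
//			child, err := d.lookup(ctx, name) // hypothetical per-FS lookup
//			if err != nil {
//				return nil, err
//			}
//			if child.isSymlink() && rp.ShouldFollowSymlink() {
//				// HandleSymlink consumes the component itself and may
//				// return a resolve*Error for the VFS layer to handle.
//				if err := rp.HandleSymlink(child.target); err != nil {
//					return nil, err
//				}
//				continue
//			}
//			rp.Advance()
//			d = child
//		}
//		return d, nil
//	}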
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package channel provides the implementation of channel-based data-link layer
// endpoints. Such endpoints allow injection of inbound packets and store
// outbound packets in a channel.
package channel

import (
	"context"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// PacketInfo holds all the information about an outbound packet.
type PacketInfo struct {
	Pkt   *stack.PacketBuffer
	Proto tcpip.NetworkProtocolNumber
	Route stack.RouteInfo
}

// Notification is the interface for receiving notifications from the packet
// queue.
type Notification interface {
	// WriteNotify will be called when a write happens to the queue.
	WriteNotify()
}

// NotificationHandle is an opaque handle to the registered notification target.
// It can be used to unregister the notification when no longer interested.
//
// +stateify savable
type NotificationHandle struct {
	n Notification
}

type queue struct {
	// c is the outbound packet channel.
	c chan PacketInfo
	// mu protects fields below.
	mu     sync.RWMutex
	notify []*NotificationHandle
}

func (q *queue) Close() {
	close(q.c)
}

func (q *queue) Read() (PacketInfo, bool) {
	select {
	case p := <-q.c:
		return p, true
	default:
		return PacketInfo{}, false
	}
}

func (q *queue) ReadContext(ctx context.Context) (PacketInfo, bool) {
	select {
	case pkt := <-q.c:
		return pkt, true
	case <-ctx.Done():
		return PacketInfo{}, false
	}
}

func (q *queue) Write(p PacketInfo) bool {
	wrote := false
	select {
	case q.c <- p:
		wrote = true
	default:
	}
	q.mu.Lock()
	notify := q.notify
	q.mu.Unlock()

	if wrote {
		// Send notification outside of lock.
		for _, h := range notify {
			h.n.WriteNotify()
		}
	}
	return wrote
}

func (q *queue) Num() int {
	return len(q.c)
}

func (q *queue) AddNotify(notify Notification) *NotificationHandle {
	q.mu.Lock()
	defer q.mu.Unlock()
	h := &NotificationHandle{n: notify}
	q.notify = append(q.notify, h)
	return h
}

func (q *queue) RemoveNotify(handle *NotificationHandle) {
	q.mu.Lock()
	defer q.mu.Unlock()
	// Make a copy, since we read the array outside of the lock when notifying.
	notify := make([]*NotificationHandle, 0, len(q.notify))
	for _, h := range q.notify {
		if h != handle {
			notify = append(notify, h)
		}
	}
	q.notify = notify
}

var _ stack.LinkEndpoint = (*Endpoint)(nil)
var _ stack.GSOEndpoint = (*Endpoint)(nil)

// Endpoint is a link layer endpoint that stores outbound packets in a channel
// and allows injection of inbound packets.
type Endpoint struct {
	dispatcher         stack.NetworkDispatcher
	mtu                uint32
	linkAddr           tcpip.LinkAddress
	LinkEPCapabilities stack.LinkEndpointCapabilities
	SupportedGSOKind   stack.SupportedGSO

	// Outbound packet queue.
	q *queue
}

// New creates a new channel endpoint.
func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint {
	return &Endpoint{
		q: &queue{
			c: make(chan PacketInfo, size),
		},
		mtu:      mtu,
		linkAddr: linkAddr,
	}
}

// Close closes e. Further packet injections will panic. Reads continue to
// succeed until all packets are read.
func (e *Endpoint) Close() {
	e.q.Close()
}

// Read does a non-blocking read of one packet from the outbound packet queue.
func (e *Endpoint) Read() (PacketInfo, bool) {
	return e.q.Read()
}

// ReadContext does a blocking read of one packet from the outbound packet
// queue. It can be cancelled by ctx, in which case it returns false.
func (e *Endpoint) ReadContext(ctx context.Context) (PacketInfo, bool) {
	return e.q.ReadContext(ctx)
}

// Drain removes all outbound packets from the channel and counts them.
func (e *Endpoint) Drain() int {
	c := 0
	for {
		if _, ok := e.Read(); !ok {
			return c
		}
		c++
	}
}

// NumQueued returns the number of packets queued for outbound.
func (e *Endpoint) NumQueued() int {
	return e.q.Num()
}

// InjectInbound injects an inbound packet.
func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
	e.InjectLinkAddr(protocol, "", pkt)
}

// InjectLinkAddr injects an inbound packet with a remote link address.
func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt *stack.PacketBuffer) {
	e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt)
}

// Attach saves the stack network-layer dispatcher for use later when packets
// are injected.
func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) {
	e.dispatcher = dispatcher
}

// IsAttached implements stack.LinkEndpoint.IsAttached.
func (e *Endpoint) IsAttached() bool {
	return e.dispatcher != nil
}

// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
// during construction.
func (e *Endpoint) MTU() uint32 {
	return e.mtu
}

// Capabilities implements stack.LinkEndpoint.Capabilities.
func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
	return e.LinkEPCapabilities
}

// GSOMaxSize implements stack.GSOEndpoint.
func (*Endpoint) GSOMaxSize() uint32 {
	return 1 << 15
}

// SupportedGSO implements stack.GSOEndpoint.
func (e *Endpoint) SupportedGSO() stack.SupportedGSO {
	return e.SupportedGSOKind
}

// MaxHeaderLength returns the maximum size of the link layer header. Since
// this endpoint doesn't have a header, it just returns 0.
func (*Endpoint) MaxHeaderLength() uint16 {
	return 0
}

// LinkAddress returns the link address of this endpoint.
func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
	return e.linkAddr
}

// WritePacket stores outbound packets into the channel.
func (e *Endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error {
	p := PacketInfo{
		Pkt:   pkt,
		Proto: protocol,
		Route: r,
	}

	e.q.Write(p)

	return nil
}

// WritePackets stores outbound packets into the channel.
func (e *Endpoint) WritePackets(r stack.RouteInfo, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, tcpip.Error) {
	n := 0
	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
		p := PacketInfo{
			Pkt:   pkt,
			Proto: protocol,
			Route: r,
		}

		if !e.q.Write(p) {
			break
		}
		n++
	}

	return n, nil
}

// Wait implements stack.LinkEndpoint.Wait.
func (*Endpoint) Wait() {}

// AddNotify adds a notification target for receiving events about outgoing
// packets.
func (e *Endpoint) AddNotify(notify Notification) *NotificationHandle {
	return e.q.AddNotify(notify)
}

// RemoveNotify removes handle from the list of notification targets.
func (e *Endpoint) RemoveNotify(handle *NotificationHandle) {
	e.q.RemoveNotify(handle)
}

// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType.
func (*Endpoint) ARPHardwareType() header.ARPHardwareType {
	return header.ARPHardwareNone
}

// AddHeader implements stack.LinkEndpoint.AddHeader.
func (*Endpoint) AddHeader(tcpip.LinkAddress, tcpip.LinkAddress, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) {
}
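// Example (editor's sketch, not part of the original source): typical test
// usage of the channel endpoint, reading back a packet that the stack wrote
// via WritePacket. Stack construction and packet contents are elided.
//
//	ep := channel.New(16 /* queue size */, 1500 /* mtu */, "" /* linkAddr */)
//	// ... hand ep to a stack, which calls ep.Attach(dispatcher) ...
//
//	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
//	defer cancel()
//	if pktInfo, ok := ep.ReadContext(ctx); ok {
//		_ = pktInfo.Proto // network protocol number of the outbound packet
//	}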
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package bpf

import (
	"encoding/binary"
)

// InputBytes implements the Input interface by providing access to a byte
// slice. Unaligned loads are supported.
type InputBytes struct {
	// Data is the data accessed through the Input interface.
	Data []byte

	// Order is the byte order the data is accessed with.
	Order binary.ByteOrder
}

// Load32 implements Input.Load32.
func (i InputBytes) Load32(off uint32) (uint32, bool) {
	if uint64(off)+4 > uint64(len(i.Data)) {
		return 0, false
	}
	return i.Order.Uint32(i.Data[int(off):]), true
}

// Load16 implements Input.Load16.
func (i InputBytes) Load16(off uint32) (uint16, bool) {
	if uint64(off)+2 > uint64(len(i.Data)) {
		return 0, false
	}
	return i.Order.Uint16(i.Data[int(off):]), true
}

// Load8 implements Input.Load8.
func (i InputBytes) Load8(off uint32) (uint8, bool) {
	if uint64(off)+1 > uint64(len(i.Data)) {
		return 0, false
	}
	return i.Data[int(off)], true
}

// Length implements Input.Length.
func (i InputBytes) Length() uint32 {
	return uint32(len(i.Data))
}
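// Example (editor's sketch, not part of the original source): loading fields
// from a raw header with InputBytes. Out-of-range loads fail by returning
// ok == false rather than panicking.
//
//	in := bpf.InputBytes{
//		Data:  []byte{0x45, 0x00, 0x00, 0x54, 0xbe, 0xef},
//		Order: binary.BigEndian,
//	}
//	if v, ok := in.Load16(4); ok {
//		fmt.Printf("%#04x\n", v) // 0xbeef
//	}
//	if _, ok := in.Load32(4); !ok {
//		fmt.Println("rejected: only 2 bytes remain at offset 4")
//	}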
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package fragmentation contains the implementation of IP fragmentation.
// It is based on RFC 791, RFC 815 and RFC 8200.
package fragmentation

import (
	"errors"
	"fmt"
	"log"
	"time"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

const (
	// HighFragThreshold is the threshold at which we start trimming old
	// fragmented packets. Linux uses a default value of 4 MB. See
	// net.ipv4.ipfrag_high_thresh for more information.
	HighFragThreshold = 4 << 20 // 4MB

	// LowFragThreshold is the threshold we drop down to once we start dropping
	// older fragmented packets. It's important that we keep enough room for
	// newer packets to be re-assembled. Hence, this needs to be sufficiently
	// lower than HighFragThreshold. Linux uses a default value of 3 MB. See
	// net.ipv4.ipfrag_low_thresh for more information.
	LowFragThreshold = 3 << 20 // 3MB

	// minBlockSize is the minimum block size for fragments.
	minBlockSize = 1
)

var (
	// ErrInvalidArgs indicates to the caller that an invalid argument was
	// provided.
	ErrInvalidArgs = errors.New("invalid args")

	// ErrFragmentOverlap indicates that, during reassembly, a fragment overlaps
	// with another one.
	ErrFragmentOverlap = errors.New("overlapping fragments")

	// ErrFragmentConflict indicates that, during reassembly, some fragments are
	// in conflict with one another.
	ErrFragmentConflict = errors.New("conflicting fragments")
)

// FragmentID is the identifier for a fragment.
type FragmentID struct {
	// Source is the source address of the fragment.
	Source tcpip.Address

	// Destination is the destination address of the fragment.
	Destination tcpip.Address

	// ID is the identification value of the fragment.
	//
	// This is a uint32 because IPv6 uses a 32-bit identification value.
	ID uint32

	// The protocol for the packet.
	Protocol uint8
}

// Fragmentation is the main structure that other modules
// of the stack should use to implement IP Fragmentation.
type Fragmentation struct {
	mu             sync.Mutex
	highLimit      int
	lowLimit       int
	reassemblers   map[FragmentID]*reassembler
	rList          reassemblerList
	memSize        int
	timeout        time.Duration
	blockSize      uint16
	clock          tcpip.Clock
	releaseJob     *tcpip.Job
	timeoutHandler TimeoutHandler
}

// TimeoutHandler is consulted if a packet reassembly has timed out.
type TimeoutHandler interface {
	// OnReassemblyTimeout will be called with the first fragment (or nil, if the
	// first fragment has not been received) of a packet whose reassembly has
	// timed out.
	OnReassemblyTimeout(pkt *stack.PacketBuffer)
}

// NewFragmentation creates a new Fragmentation.
//
// blockSize specifies the fragment block size, in bytes.
//
// highMemoryLimit specifies the limit on the memory consumed
// by the fragments stored by Fragmentation (the overhead of internal
// data structures is not accounted for). Fragments are dropped when the
// limit is reached.
//
// lowMemoryLimit specifies the limit we drop down to, by evicting fragments,
// after highMemoryLimit has been reached.
//
// reassemblingTimeout specifies the maximum time allowed to reassemble a packet.
// Fragments are lazily evicted only when a new packet with an
// already existing fragmentation-id arrives after the timeout.
func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int,
	reassemblingTimeout time.Duration, clock tcpip.Clock, timeoutHandler TimeoutHandler) *Fragmentation {
	if lowMemoryLimit >= highMemoryLimit {
		lowMemoryLimit = highMemoryLimit
	}

	if lowMemoryLimit < 0 {
		lowMemoryLimit = 0
	}

	if blockSize < minBlockSize {
		blockSize = minBlockSize
	}

	f := &Fragmentation{
		reassemblers:   make(map[FragmentID]*reassembler),
		highLimit:      highMemoryLimit,
		lowLimit:       lowMemoryLimit,
		timeout:        reassemblingTimeout,
		blockSize:      blockSize,
		clock:          clock,
		timeoutHandler: timeoutHandler,
	}
	f.releaseJob = tcpip.NewJob(f.clock, &f.mu, f.releaseReassemblersLocked)

	return f
}

// Process processes an incoming fragment belonging to an ID and returns a
// complete packet and its protocol number when all the packets belonging to
// that ID have been received.
//
// [first, last] is the range of the fragment bytes.
//
// first must be a multiple of the block size f is configured with. The size
// of the fragment data must be a multiple of the block size, unless there are
// no fragments following this fragment (more set to false).
//
// proto is the protocol number marked in the fragment being processed. It has
// to be given here outside of the FragmentID struct because IPv6 should not use
// the protocol to identify a fragment.
func (f *Fragmentation) Process( id FragmentID, first, last uint16, more bool, proto uint8, pkt *stack.PacketBuffer) ( *stack.PacketBuffer, uint8, bool, error) { if first > last { return nil, 0, false, fmt.Errorf("first=%d is greater than last=%d: %w", first, last, ErrInvalidArgs) } if first%f.blockSize != 0 { return nil, 0, false, fmt.Errorf("first=%d is not a multiple of block size=%d: %w", first, f.blockSize, ErrInvalidArgs) } fragmentSize := last - first + 1 if more && fragmentSize%f.blockSize != 0 { return nil, 0, false, fmt.Errorf("fragment size=%d bytes is not a multiple of block size=%d on non-final fragment: %w", fragmentSize, f.blockSize, ErrInvalidArgs) } if l := pkt.Data().Size(); l != int(fragmentSize) { return nil, 0, false, fmt.Errorf("got fragment size=%d bytes not equal to the expected fragment size=%d bytes (first=%d last=%d): %w", l, fragmentSize, first, last, ErrInvalidArgs) } f.mu.Lock() r, ok := f.reassemblers[id] if !ok { r = newReassembler(id, f.clock) f.reassemblers[id] = r wasEmpty := f.rList.Empty() f.rList.PushFront(r) if wasEmpty { // If we have just pushed a first reassembler into an empty list, we // should kickstart the release job. The release job will keep // rescheduling itself until the list becomes empty. f.releaseReassemblersLocked() } } f.mu.Unlock() resPkt, firstFragmentProto, done, memConsumed, err := r.process(first, last, more, proto, pkt) if err != nil { // We probably got an invalid sequence of fragments. Just // discard the reassembler and move on. f.mu.Lock() f.release(r, false /* timedOut */) f.mu.Unlock() return nil, 0, false, fmt.Errorf("fragmentation processing error: %w", err) } f.mu.Lock() f.memSize += memConsumed if done { f.release(r, false /* timedOut */) } // Evict reassemblers if we are consuming more memory than highLimit until // we reach lowLimit. if f.memSize > f.highLimit { for f.memSize > f.lowLimit { tail := f.rList.Back() if tail == nil { break } f.release(tail, false /* timedOut */) } } f.mu.Unlock() return resPkt, firstFragmentProto, done, nil } func (f *Fragmentation) release(r *reassembler, timedOut bool) { // Before releasing a fragment we need to check if r is already marked as done. // Otherwise, we would delete it twice. if r.checkDoneOrMark() { return } delete(f.reassemblers, r.id) f.rList.Remove(r) f.memSize -= r.memSize if f.memSize < 0 { log.Printf("memory counter < 0 (%d), this is an accounting bug that requires investigation", f.memSize) f.memSize = 0 } if h := f.timeoutHandler; timedOut && h != nil { h.OnReassemblyTimeout(r.pkt) } } // releaseReassemblersLocked releases already-expired reassemblers, then // schedules the job to call back itself for the remaining reassemblers if // any. This function must be called with f.mu locked. func (f *Fragmentation) releaseReassemblersLocked() { now := f.clock.NowMonotonic() for { // The reassembler at the end of the list is the oldest. r := f.rList.Back() if r == nil { // The list is empty. break } elapsed := now.Sub(r.createdAt) if f.timeout > elapsed { // If the oldest reassembler has not expired, schedule the release // job so that this function is called back when it has expired. f.releaseJob.Schedule(f.timeout - elapsed) break } // If the oldest reassembler has already expired, release it. f.release(r, true /* timedOut*/) } } // PacketFragmenter is the book-keeping struct for packet fragmentation. 
type PacketFragmenter struct {
	transportHeader    buffer.View
	data               buffer.VectorisedView
	reserve            int
	fragmentPayloadLen int
	fragmentCount      int
	currentFragment    int
	fragmentOffset     int
}

// MakePacketFragmenter prepares the struct needed for packet fragmentation.
//
// pkt is the packet to be fragmented.
//
// fragmentPayloadLen is the maximum number of bytes of fragmentable data a
// fragment can have.
//
// reserve is the number of bytes that should be reserved for the headers in
// each generated fragment.
func MakePacketFragmenter(pkt *stack.PacketBuffer, fragmentPayloadLen uint32, reserve int) PacketFragmenter {
	// As per RFC 8200 Section 4.5, some IPv6 extension headers should not be
	// repeated in each fragment. However, we do not currently support any
	// header of that kind, so the following computation is valid for both
	// IPv4 and IPv6.
	// TODO(gvisor.dev/issue/3912): Once Authentication or ESP Headers are
	// supported for outbound packets, the fragmentable data should not include
	// these headers.
	var fragmentableData buffer.VectorisedView
	fragmentableData.AppendView(pkt.TransportHeader().View())
	fragmentableData.Append(pkt.Data().ExtractVV())
	fragmentCount := (uint32(fragmentableData.Size()) + fragmentPayloadLen - 1) / fragmentPayloadLen

	return PacketFragmenter{
		data:               fragmentableData,
		reserve:            reserve,
		fragmentPayloadLen: int(fragmentPayloadLen),
		fragmentCount:      int(fragmentCount),
	}
}

// BuildNextFragment returns a packet with the payload of the next fragment,
// along with the fragment's offset, the number of bytes copied and a boolean
// indicating whether there are more fragments left. If this function is
// called again after it indicated that no more fragments were left, it will
// panic.
//
// Note that the returned packet will not have its network and link headers
// populated, but space for them will be reserved. The transport header will be
// stored in the packet's data.
func (pf *PacketFragmenter) BuildNextFragment() (*stack.PacketBuffer, int, int, bool) {
	if pf.currentFragment >= pf.fragmentCount {
		panic("BuildNextFragment should not be called again after the last fragment was returned")
	}

	fragPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
		ReserveHeaderBytes: pf.reserve,
	})

	// Copy data for the fragment.
	copied := fragPkt.Data().ReadFromVV(&pf.data, pf.fragmentPayloadLen)

	offset := pf.fragmentOffset
	pf.fragmentOffset += copied
	pf.currentFragment++
	more := pf.currentFragment != pf.fragmentCount

	return fragPkt, offset, copied, more
}

// RemainingFragmentCount returns the number of fragments left to be built.
func (pf *PacketFragmenter) RemainingFragmentCount() int {
	return pf.fragmentCount - pf.currentFragment
}
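// Example (editor's sketch, not part of the original source): driving the
// fragmenter. writeHeaderAndSend is hypothetical; a real network protocol
// would fill in its fragment header using offset, copied and more.
//
//	pf := fragmentation.MakePacketFragmenter(pkt, fragmentPayloadLen, reserve)
//	for {
//		fragPkt, offset, copied, more := pf.BuildNextFragment()
//		writeHeaderAndSend(fragPkt, offset, copied, more)
//		if !more {
//			break
//		}
//	}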
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package p9 is a 9P2000.L implementation.
package p9

import (
	"fmt"
	"math"
	"os"
	"strings"
	"sync/atomic"
	"syscall"

	"golang.org/x/sys/unix"
)

// OpenFlags is the mode passed to Open and Create operations.
//
// These correspond to bits sent over the wire.
type OpenFlags uint32

const (
	// ReadOnly is a Tlopen and Tlcreate flag indicating read-only mode.
	ReadOnly OpenFlags = 0

	// WriteOnly is a Tlopen and Tlcreate flag indicating write-only mode.
	WriteOnly OpenFlags = 1

	// ReadWrite is a Tlopen flag indicating read-write mode.
	ReadWrite OpenFlags = 2

	// OpenFlagsModeMask is a mask of valid OpenFlags mode bits.
	OpenFlagsModeMask OpenFlags = 3

	// OpenTruncate is a Tlopen flag indicating that the opened file should be
	// truncated.
	OpenTruncate OpenFlags = 01000
)

// ConnectFlags is the mode passed to Connect operations.
//
// These correspond to bits sent over the wire.
type ConnectFlags uint32

const (
	// StreamSocket is a Tlconnect flag indicating SOCK_STREAM mode.
	StreamSocket ConnectFlags = 0

	// DgramSocket is a Tlconnect flag indicating SOCK_DGRAM mode.
	DgramSocket ConnectFlags = 1

	// SeqpacketSocket is a Tlconnect flag indicating SOCK_SEQPACKET mode.
	SeqpacketSocket ConnectFlags = 2

	// AnonymousSocket is a Tlconnect flag indicating that the mode does not
	// matter and that the requester will accept any socket type.
AnonymousSocket ConnectFlags = 3 ) // OSFlags converts a p9.OpenFlags to an int compatible with open(2). func (o OpenFlags) OSFlags() int { // "flags contains Linux open(2) flags bits" - 9P2000.L return int(o) } // String implements fmt.Stringer. func (o OpenFlags) String() string { var buf strings.Builder switch mode := o & OpenFlagsModeMask; mode { case ReadOnly: buf.WriteString("ReadOnly") case WriteOnly: buf.WriteString("WriteOnly") case ReadWrite: buf.WriteString("ReadWrite") default: fmt.Fprintf(&buf, "%#o", mode) } otherFlags := o &^ OpenFlagsModeMask if otherFlags&OpenTruncate != 0 { buf.WriteString("|OpenTruncate") otherFlags &^= OpenTruncate } if otherFlags != 0 { fmt.Fprintf(&buf, "|%#o", otherFlags) } return buf.String() } // Tag is a message tag. type Tag uint16 // FID is a file identifier. type FID uint64 // FileMode are flags corresponding to file modes. // // These correspond to bits sent over the wire. // These also correspond to mode_t bits. type FileMode uint32 const ( // FileModeMask is a mask of all the file mode bits of FileMode. FileModeMask FileMode = 0170000 // ModeSocket is an (unused) mode bit for a socket. ModeSocket FileMode = 0140000 // ModeSymlink is a mode bit for a symlink. ModeSymlink FileMode = 0120000 // ModeRegular is a mode bit for regular files. ModeRegular FileMode = 0100000 // ModeBlockDevice is a mode bit for block devices. ModeBlockDevice FileMode = 060000 // ModeDirectory is a mode bit for directories. ModeDirectory FileMode = 040000 // ModeCharacterDevice is a mode bit for a character device. ModeCharacterDevice FileMode = 020000 // ModeNamedPipe is a mode bit for a named pipe. ModeNamedPipe FileMode = 010000 // Read is a mode bit indicating read permission. Read FileMode = 04 // Write is a mode bit indicating write permission. Write FileMode = 02 // Exec is a mode bit indicating exec permission. Exec FileMode = 01 // AllPermissions is a mask with rwx bits set for user, group and others. AllPermissions FileMode = 0777 // Sticky is a mode bit indicating sticky directories. Sticky FileMode = 01000 // SetGID is the set group ID bit. SetGID FileMode = 02000 // SetUID is the set user ID bit. SetUID FileMode = 04000 // permissionsMask is the mask to apply to FileModes for permissions. It // includes rwx bits for user, group, and others, as well as the sticky // bit, setuid bit, and setgid bit. permissionsMask FileMode = 07777 ) // QIDType is the most significant byte of the FileMode word, to be used as the // Type field of p9.QID. func (m FileMode) QIDType() QIDType { switch { case m.IsDir(): return TypeDir case m.IsSocket(), m.IsNamedPipe(), m.IsCharacterDevice(): // Best approximation. return TypeAppendOnly case m.IsSymlink(): return TypeSymlink default: return TypeRegular } } // FileType returns the file mode without the permission bits. func (m FileMode) FileType() FileMode { return m & FileModeMask } // Permissions returns just the permission bits of the mode. func (m FileMode) Permissions() FileMode { return m & permissionsMask } // Writable returns the mode with write bits added. func (m FileMode) Writable() FileMode { return m | 0222 } // IsReadable returns true if m represents a file that can be read. func (m FileMode) IsReadable() bool { return m&0444 != 0 } // IsWritable returns true if m represents a file that can be written to. func (m FileMode) IsWritable() bool { return m&0222 != 0 } // IsExecutable returns true if m represents a file that can be executed. 
func (m FileMode) IsExecutable() bool {
	return m&0111 != 0
}

// IsRegular returns true if m is a regular file.
func (m FileMode) IsRegular() bool {
	return m&FileModeMask == ModeRegular
}

// IsDir returns true if m represents a directory.
func (m FileMode) IsDir() bool {
	return m&FileModeMask == ModeDirectory
}

// IsNamedPipe returns true if m represents a named pipe.
func (m FileMode) IsNamedPipe() bool {
	return m&FileModeMask == ModeNamedPipe
}

// IsCharacterDevice returns true if m represents a character device.
func (m FileMode) IsCharacterDevice() bool {
	return m&FileModeMask == ModeCharacterDevice
}

// IsBlockDevice returns true if m represents a block device.
func (m FileMode) IsBlockDevice() bool {
	return m&FileModeMask == ModeBlockDevice
}

// IsSocket returns true if m represents a socket.
func (m FileMode) IsSocket() bool {
	return m&FileModeMask == ModeSocket
}

// IsSymlink returns true if m represents a symlink.
func (m FileMode) IsSymlink() bool {
	return m&FileModeMask == ModeSymlink
}

// ModeFromOS returns a FileMode from an os.FileMode.
func ModeFromOS(mode os.FileMode) FileMode {
	m := FileMode(mode.Perm())
	switch {
	case mode.IsDir():
		m |= ModeDirectory
	case mode&os.ModeSymlink != 0:
		m |= ModeSymlink
	case mode&os.ModeSocket != 0:
		m |= ModeSocket
	case mode&os.ModeNamedPipe != 0:
		m |= ModeNamedPipe
	case mode&os.ModeCharDevice != 0:
		m |= ModeCharacterDevice
	case mode&os.ModeDevice != 0:
		m |= ModeBlockDevice
	default:
		m |= ModeRegular
	}
	return m
}

// OSMode converts a p9.FileMode to an os.FileMode.
func (m FileMode) OSMode() os.FileMode {
	var osMode os.FileMode
	osMode |= os.FileMode(m.Permissions())
	switch {
	case m.IsDir():
		osMode |= os.ModeDir
	case m.IsSymlink():
		osMode |= os.ModeSymlink
	case m.IsSocket():
		osMode |= os.ModeSocket
	case m.IsNamedPipe():
		osMode |= os.ModeNamedPipe
	case m.IsCharacterDevice():
		osMode |= os.ModeCharDevice | os.ModeDevice
	case m.IsBlockDevice():
		osMode |= os.ModeDevice
	}
	return osMode
}

// UID represents a user ID.
type UID uint32

// Ok returns true if uid is not NoUID.
func (uid UID) Ok() bool {
	return uid != NoUID
}

// GID represents a group ID.
type GID uint32

// Ok returns true if gid is not NoGID.
func (gid GID) Ok() bool {
	return gid != NoGID
}

const (
	// NoTag is a sentinel used to indicate no valid tag.
	NoTag Tag = math.MaxUint16

	// NoFID is a sentinel used to indicate no valid FID.
	NoFID FID = math.MaxUint32

	// NoUID is a sentinel used to indicate no valid UID.
	NoUID UID = math.MaxUint32

	// NoGID is a sentinel used to indicate no valid GID.
	NoGID GID = math.MaxUint32
)

// MsgType is a type identifier.
type MsgType uint8

// MsgType declarations.
const ( MsgTlerror MsgType = 6 MsgRlerror MsgType = 7 MsgTstatfs MsgType = 8 MsgRstatfs MsgType = 9 MsgTlopen MsgType = 12 MsgRlopen MsgType = 13 MsgTlcreate MsgType = 14 MsgRlcreate MsgType = 15 MsgTsymlink MsgType = 16 MsgRsymlink MsgType = 17 MsgTmknod MsgType = 18 MsgRmknod MsgType = 19 MsgTrename MsgType = 20 MsgRrename MsgType = 21 MsgTreadlink MsgType = 22 MsgRreadlink MsgType = 23 MsgTgetattr MsgType = 24 MsgRgetattr MsgType = 25 MsgTsetattr MsgType = 26 MsgRsetattr MsgType = 27 MsgTlistxattr MsgType = 28 MsgRlistxattr MsgType = 29 MsgTxattrwalk MsgType = 30 MsgRxattrwalk MsgType = 31 MsgTxattrcreate MsgType = 32 MsgRxattrcreate MsgType = 33 MsgTgetxattr MsgType = 34 MsgRgetxattr MsgType = 35 MsgTsetxattr MsgType = 36 MsgRsetxattr MsgType = 37 MsgTremovexattr MsgType = 38 MsgRremovexattr MsgType = 39 MsgTreaddir MsgType = 40 MsgRreaddir MsgType = 41 MsgTfsync MsgType = 50 MsgRfsync MsgType = 51 MsgTlink MsgType = 70 MsgRlink MsgType = 71 MsgTmkdir MsgType = 72 MsgRmkdir MsgType = 73 MsgTrenameat MsgType = 74 MsgRrenameat MsgType = 75 MsgTunlinkat MsgType = 76 MsgRunlinkat MsgType = 77 MsgTversion MsgType = 100 MsgRversion MsgType = 101 MsgTauth MsgType = 102 MsgRauth MsgType = 103 MsgTattach MsgType = 104 MsgRattach MsgType = 105 MsgTflush MsgType = 108 MsgRflush MsgType = 109 MsgTwalk MsgType = 110 MsgRwalk MsgType = 111 MsgTread MsgType = 116 MsgRread MsgType = 117 MsgTwrite MsgType = 118 MsgRwrite MsgType = 119 MsgTclunk MsgType = 120 MsgRclunk MsgType = 121 MsgTremove MsgType = 122 MsgRremove MsgType = 123 MsgTflushf MsgType = 124 MsgRflushf MsgType = 125 MsgTwalkgetattr MsgType = 126 MsgRwalkgetattr MsgType = 127 MsgTucreate MsgType = 128 MsgRucreate MsgType = 129 MsgTumkdir MsgType = 130 MsgRumkdir MsgType = 131 MsgTumknod MsgType = 132 MsgRumknod MsgType = 133 MsgTusymlink MsgType = 134 MsgRusymlink MsgType = 135 MsgTlconnect MsgType = 136 MsgRlconnect MsgType = 137 MsgTallocate MsgType = 138 MsgRallocate MsgType = 139 MsgTsetattrclunk MsgType = 140 MsgRsetattrclunk MsgType = 141 MsgTmultigetattr MsgType = 142 MsgRmultigetattr MsgType = 143 MsgTchannel MsgType = 250 MsgRchannel MsgType = 251 ) // QIDType represents the file type for QIDs. // // QIDType corresponds to the high 8 bits of a Plan 9 file mode. type QIDType uint8 const ( // TypeDir represents a directory type. TypeDir QIDType = 0x80 // TypeAppendOnly represents an append only file. TypeAppendOnly QIDType = 0x40 // TypeExclusive represents an exclusive-use file. TypeExclusive QIDType = 0x20 // TypeMount represents a mounted channel. TypeMount QIDType = 0x10 // TypeAuth represents an authentication file. TypeAuth QIDType = 0x08 // TypeTemporary represents a temporary file. TypeTemporary QIDType = 0x04 // TypeSymlink represents a symlink. TypeSymlink QIDType = 0x02 // TypeLink represents a hard link. TypeLink QIDType = 0x01 // TypeRegular represents a regular file. TypeRegular QIDType = 0x00 ) // QID is a unique file identifier. // // This may be embedded in other requests and responses. type QID struct { // Type is the highest order byte of the file mode. Type QIDType // Version is an arbitrary server version number. Version uint32 // Path is a unique server identifier for this path (e.g. inode). Path uint64 } // String implements fmt.Stringer. func (q QID) String() string { return fmt.Sprintf("QID{Type: %d, Version: %d, Path: %d}", q.Type, q.Version, q.Path) } // decode implements encoder.decode. 
func (q *QID) decode(b *buffer) { q.Type = b.ReadQIDType() q.Version = b.Read32() q.Path = b.Read64() } // encode implements encoder.encode. func (q *QID) encode(b *buffer) { b.WriteQIDType(q.Type) b.Write32(q.Version) b.Write64(q.Path) } // QIDGenerator is a simple generator for QIDs that atomically increments Path // values. type QIDGenerator struct { // uids is an ever increasing value that can be atomically incremented // to provide unique Path values for QIDs. uids uint64 } // Get returns a new 9P unique ID with a unique Path given a QID type. // // While the 9P spec allows Version to be incremented every time the file is // modified, we currently do not use the Version member for anything. Hence, // it is set to 0. func (q *QIDGenerator) Get(t QIDType) QID { return QID{ Type: t, Version: 0, Path: atomic.AddUint64(&q.uids, 1), } } // FSStat is used by statfs. type FSStat struct { // Type is the filesystem type. Type uint32 // BlockSize is the blocksize. BlockSize uint32 // Blocks is the number of blocks. Blocks uint64 // BlocksFree is the number of free blocks. BlocksFree uint64 // BlocksAvailable is the number of blocks *available*. BlocksAvailable uint64 // Files is the number of files available. Files uint64 // FilesFree is the number of free file nodes. FilesFree uint64 // FSID is the filesystem ID. FSID uint64 // NameLength is the maximum name length. NameLength uint32 } // decode implements encoder.decode. func (f *FSStat) decode(b *buffer) { f.Type = b.Read32() f.BlockSize = b.Read32() f.Blocks = b.Read64() f.BlocksFree = b.Read64() f.BlocksAvailable = b.Read64() f.Files = b.Read64() f.FilesFree = b.Read64() f.FSID = b.Read64() f.NameLength = b.Read32() } // encode implements encoder.encode. func (f *FSStat) encode(b *buffer) { b.Write32(f.Type) b.Write32(f.BlockSize) b.Write64(f.Blocks) b.Write64(f.BlocksFree) b.Write64(f.BlocksAvailable) b.Write64(f.Files) b.Write64(f.FilesFree) b.Write64(f.FSID) b.Write32(f.NameLength) } // AttrMask is a mask of attributes for getattr. type AttrMask struct { Mode bool NLink bool UID bool GID bool RDev bool ATime bool MTime bool CTime bool INo bool Size bool Blocks bool BTime bool Gen bool DataVersion bool } // Contains returns true if a contains all of the attributes masked as b. func (a AttrMask) Contains(b AttrMask) bool { if b.Mode && !a.Mode { return false } if b.NLink && !a.NLink { return false } if b.UID && !a.UID { return false } if b.GID && !a.GID { return false } if b.RDev && !a.RDev { return false } if b.ATime && !a.ATime { return false } if b.MTime && !a.MTime { return false } if b.CTime && !a.CTime { return false } if b.INo && !a.INo { return false } if b.Size && !a.Size { return false } if b.Blocks && !a.Blocks { return false } if b.BTime && !a.BTime { return false } if b.Gen && !a.Gen { return false } if b.DataVersion && !a.DataVersion { return false } return true } // Empty returns true if no fields are masked. func (a AttrMask) Empty() bool { return !a.Mode && !a.NLink && !a.UID && !a.GID && !a.RDev && !a.ATime && !a.MTime && !a.CTime && !a.INo && !a.Size && !a.Blocks && !a.BTime && !a.Gen && !a.DataVersion } // AttrMaskAll returns an AttrMask with all fields masked. func AttrMaskAll() AttrMask { return AttrMask{ Mode: true, NLink: true, UID: true, GID: true, RDev: true, ATime: true, MTime: true, CTime: true, INo: true, Size: true, Blocks: true, BTime: true, Gen: true, DataVersion: true, } } // String implements fmt.Stringer. 
func (a AttrMask) String() string { var masks []string if a.Mode { masks = append(masks, "Mode") } if a.NLink { masks = append(masks, "NLink") } if a.UID { masks = append(masks, "UID") } if a.GID { masks = append(masks, "GID") } if a.RDev { masks = append(masks, "RDev") } if a.ATime { masks = append(masks, "ATime") } if a.MTime { masks = append(masks, "MTime") } if a.CTime { masks = append(masks, "CTime") } if a.INo { masks = append(masks, "INo") } if a.Size { masks = append(masks, "Size") } if a.Blocks { masks = append(masks, "Blocks") } if a.BTime { masks = append(masks, "BTime") } if a.Gen { masks = append(masks, "Gen") } if a.DataVersion { masks = append(masks, "DataVersion") } return fmt.Sprintf("AttrMask{with: %s}", strings.Join(masks, " ")) } // decode implements encoder.decode. func (a *AttrMask) decode(b *buffer) { mask := b.Read64() a.Mode = mask&0x00000001 != 0 a.NLink = mask&0x00000002 != 0 a.UID = mask&0x00000004 != 0 a.GID = mask&0x00000008 != 0 a.RDev = mask&0x00000010 != 0 a.ATime = mask&0x00000020 != 0 a.MTime = mask&0x00000040 != 0 a.CTime = mask&0x00000080 != 0 a.INo = mask&0x00000100 != 0 a.Size = mask&0x00000200 != 0 a.Blocks = mask&0x00000400 != 0 a.BTime = mask&0x00000800 != 0 a.Gen = mask&0x00001000 != 0 a.DataVersion = mask&0x00002000 != 0 } // encode implements encoder.encode. func (a *AttrMask) encode(b *buffer) { var mask uint64 if a.Mode { mask |= 0x00000001 } if a.NLink { mask |= 0x00000002 } if a.UID { mask |= 0x00000004 } if a.GID { mask |= 0x00000008 } if a.RDev { mask |= 0x00000010 } if a.ATime { mask |= 0x00000020 } if a.MTime { mask |= 0x00000040 } if a.CTime { mask |= 0x00000080 } if a.INo { mask |= 0x00000100 } if a.Size { mask |= 0x00000200 } if a.Blocks { mask |= 0x00000400 } if a.BTime { mask |= 0x00000800 } if a.Gen { mask |= 0x00001000 } if a.DataVersion { mask |= 0x00002000 } b.Write64(mask) } // Attr is a set of attributes for getattr. type Attr struct { Mode FileMode UID UID GID GID NLink uint64 RDev uint64 Size uint64 BlockSize uint64 Blocks uint64 ATimeSeconds uint64 ATimeNanoSeconds uint64 MTimeSeconds uint64 MTimeNanoSeconds uint64 CTimeSeconds uint64 CTimeNanoSeconds uint64 BTimeSeconds uint64 BTimeNanoSeconds uint64 Gen uint64 DataVersion uint64 } // String implements fmt.Stringer. func (a Attr) String() string { return fmt.Sprintf("Attr{Mode: 0o%o, UID: %d, GID: %d, NLink: %d, RDev: %d, Size: %d, BlockSize: %d, Blocks: %d, ATime: {Sec: %d, NanoSec: %d}, MTime: {Sec: %d, NanoSec: %d}, CTime: {Sec: %d, NanoSec: %d}, BTime: {Sec: %d, NanoSec: %d}, Gen: %d, DataVersion: %d}", a.Mode, a.UID, a.GID, a.NLink, a.RDev, a.Size, a.BlockSize, a.Blocks, a.ATimeSeconds, a.ATimeNanoSeconds, a.MTimeSeconds, a.MTimeNanoSeconds, a.CTimeSeconds, a.CTimeNanoSeconds, a.BTimeSeconds, a.BTimeNanoSeconds, a.Gen, a.DataVersion) } // encode implements encoder.encode. func (a *Attr) encode(b *buffer) { b.WriteFileMode(a.Mode) b.WriteUID(a.UID) b.WriteGID(a.GID) b.Write64(a.NLink) b.Write64(a.RDev) b.Write64(a.Size) b.Write64(a.BlockSize) b.Write64(a.Blocks) b.Write64(a.ATimeSeconds) b.Write64(a.ATimeNanoSeconds) b.Write64(a.MTimeSeconds) b.Write64(a.MTimeNanoSeconds) b.Write64(a.CTimeSeconds) b.Write64(a.CTimeNanoSeconds) b.Write64(a.BTimeSeconds) b.Write64(a.BTimeNanoSeconds) b.Write64(a.Gen) b.Write64(a.DataVersion) } // decode implements encoder.decode. 
func (a *Attr) decode(b *buffer) { a.Mode = b.ReadFileMode() a.UID = b.ReadUID() a.GID = b.ReadGID() a.NLink = b.Read64() a.RDev = b.Read64() a.Size = b.Read64() a.BlockSize = b.Read64() a.Blocks = b.Read64() a.ATimeSeconds = b.Read64() a.ATimeNanoSeconds = b.Read64() a.MTimeSeconds = b.Read64() a.MTimeNanoSeconds = b.Read64() a.CTimeSeconds = b.Read64() a.CTimeNanoSeconds = b.Read64() a.BTimeSeconds = b.Read64() a.BTimeNanoSeconds = b.Read64() a.Gen = b.Read64() a.DataVersion = b.Read64() } // StatToAttr converts a Linux syscall stat structure to an Attr. func StatToAttr(s *syscall.Stat_t, req AttrMask) (Attr, AttrMask) { attr := Attr{ UID: NoUID, GID: NoGID, } if req.Mode { // p9.FileMode corresponds to Linux mode_t. attr.Mode = FileMode(s.Mode) } if req.NLink { attr.NLink = uint64(s.Nlink) } if req.UID { attr.UID = UID(s.Uid) } if req.GID { attr.GID = GID(s.Gid) } if req.RDev { attr.RDev = s.Dev } if req.ATime { attr.ATimeSeconds = uint64(s.Atim.Sec) attr.ATimeNanoSeconds = uint64(s.Atim.Nsec) } if req.MTime { attr.MTimeSeconds = uint64(s.Mtim.Sec) attr.MTimeNanoSeconds = uint64(s.Mtim.Nsec) } if req.CTime { attr.CTimeSeconds = uint64(s.Ctim.Sec) attr.CTimeNanoSeconds = uint64(s.Ctim.Nsec) } if req.Size { attr.Size = uint64(s.Size) } if req.Blocks { attr.BlockSize = uint64(s.Blksize) attr.Blocks = uint64(s.Blocks) } // Use the req field because we already have it. req.BTime = false req.Gen = false req.DataVersion = false return attr, req } // SetAttrMask specifies a valid mask for setattr. type SetAttrMask struct { Permissions bool UID bool GID bool Size bool ATime bool MTime bool CTime bool ATimeNotSystemTime bool MTimeNotSystemTime bool } // IsSubsetOf returns whether s is a subset of m. func (s SetAttrMask) IsSubsetOf(m SetAttrMask) bool { sb := s.bitmask() sm := m.bitmask() return sm|sb == sm } // String implements fmt.Stringer. func (s SetAttrMask) String() string { var masks []string if s.Permissions { masks = append(masks, "Permissions") } if s.UID { masks = append(masks, "UID") } if s.GID { masks = append(masks, "GID") } if s.Size { masks = append(masks, "Size") } if s.ATime { masks = append(masks, "ATime") } if s.MTime { masks = append(masks, "MTime") } if s.CTime { masks = append(masks, "CTime") } if s.ATimeNotSystemTime { masks = append(masks, "ATimeNotSystemTime") } if s.MTimeNotSystemTime { masks = append(masks, "MTimeNotSystemTime") } return fmt.Sprintf("SetAttrMask{with: %s}", strings.Join(masks, " ")) } // Empty returns true if no fields are masked. func (s SetAttrMask) Empty() bool { return !s.Permissions && !s.UID && !s.GID && !s.Size && !s.ATime && !s.MTime && !s.CTime && !s.ATimeNotSystemTime && !s.MTimeNotSystemTime } // decode implements encoder.decode. func (s *SetAttrMask) decode(b *buffer) { mask := b.Read32() s.Permissions = mask&0x00000001 != 0 s.UID = mask&0x00000002 != 0 s.GID = mask&0x00000004 != 0 s.Size = mask&0x00000008 != 0 s.ATime = mask&0x00000010 != 0 s.MTime = mask&0x00000020 != 0 s.CTime = mask&0x00000040 != 0 s.ATimeNotSystemTime = mask&0x00000080 != 0 s.MTimeNotSystemTime = mask&0x00000100 != 0 } func (s SetAttrMask) bitmask() uint32 { var mask uint32 if s.Permissions { mask |= 0x00000001 } if s.UID { mask |= 0x00000002 } if s.GID { mask |= 0x00000004 } if s.Size { mask |= 0x00000008 } if s.ATime { mask |= 0x00000010 } if s.MTime { mask |= 0x00000020 } if s.CTime { mask |= 0x00000040 } if s.ATimeNotSystemTime { mask |= 0x00000080 } if s.MTimeNotSystemTime { mask |= 0x00000100 } return mask } // encode implements encoder.encode. 
func (s *SetAttrMask) encode(b *buffer) { b.Write32(s.bitmask()) } // SetAttr specifies a set of attributes for a setattr. type SetAttr struct { Permissions FileMode UID UID GID GID Size uint64 ATimeSeconds uint64 ATimeNanoSeconds uint64 MTimeSeconds uint64 MTimeNanoSeconds uint64 } // String implements fmt.Stringer. func (s SetAttr) String() string { return fmt.Sprintf("SetAttr{Permissions: 0o%o, UID: %d, GID: %d, Size: %d, ATime: {Sec: %d, NanoSec: %d}, MTime: {Sec: %d, NanoSec: %d}}", s.Permissions, s.UID, s.GID, s.Size, s.ATimeSeconds, s.ATimeNanoSeconds, s.MTimeSeconds, s.MTimeNanoSeconds) } // decode implements encoder.decode. func (s *SetAttr) decode(b *buffer) { s.Permissions = b.ReadPermissions() s.UID = b.ReadUID() s.GID = b.ReadGID() s.Size = b.Read64() s.ATimeSeconds = b.Read64() s.ATimeNanoSeconds = b.Read64() s.MTimeSeconds = b.Read64() s.MTimeNanoSeconds = b.Read64() } // encode implements encoder.encode. func (s *SetAttr) encode(b *buffer) { b.WritePermissions(s.Permissions) b.WriteUID(s.UID) b.WriteGID(s.GID) b.Write64(s.Size) b.Write64(s.ATimeSeconds) b.Write64(s.ATimeNanoSeconds) b.Write64(s.MTimeSeconds) b.Write64(s.MTimeNanoSeconds) } // Apply applies this to the given Attr. func (a *Attr) Apply(mask SetAttrMask, attr SetAttr) { if mask.Permissions { a.Mode = a.Mode&^permissionsMask | (attr.Permissions & permissionsMask) } if mask.UID { a.UID = attr.UID } if mask.GID { a.GID = attr.GID } if mask.Size { a.Size = attr.Size } if mask.ATime { a.ATimeSeconds = attr.ATimeSeconds a.ATimeNanoSeconds = attr.ATimeNanoSeconds } if mask.MTime { a.MTimeSeconds = attr.MTimeSeconds a.MTimeNanoSeconds = attr.MTimeNanoSeconds } } // Dirent is used for readdir. type Dirent struct { // QID is the entry QID. QID QID // Offset is the offset in the directory. // // This will be communicated back to the original caller. Offset uint64 // Type is the 9P type. Type QIDType // Name is the name of the entry (i.e. basename). Name string } // String implements fmt.Stringer. func (d Dirent) String() string { return fmt.Sprintf("Dirent{QID: %d, Offset: %d, Type: 0x%X, Name: %s}", d.QID, d.Offset, d.Type, d.Name) } // decode implements encoder.decode. func (d *Dirent) decode(b *buffer) { d.QID.decode(b) d.Offset = b.Read64() d.Type = b.ReadQIDType() d.Name = b.ReadString() } // encode implements encoder.encode. func (d *Dirent) encode(b *buffer) { d.QID.encode(b) b.Write64(d.Offset) b.WriteQIDType(d.Type) b.WriteString(d.Name) } // AllocateMode holds the possible modes for p9.File.Allocate(). type AllocateMode struct { KeepSize bool PunchHole bool NoHideStale bool CollapseRange bool ZeroRange bool InsertRange bool Unshare bool } // ToAllocateMode returns an AllocateMode from a fallocate(2) mode. func ToAllocateMode(mode uint64) AllocateMode { return AllocateMode{ KeepSize: mode&unix.FALLOC_FL_KEEP_SIZE != 0, PunchHole: mode&unix.FALLOC_FL_PUNCH_HOLE != 0, NoHideStale: mode&unix.FALLOC_FL_NO_HIDE_STALE != 0, CollapseRange: mode&unix.FALLOC_FL_COLLAPSE_RANGE != 0, ZeroRange: mode&unix.FALLOC_FL_ZERO_RANGE != 0, InsertRange: mode&unix.FALLOC_FL_INSERT_RANGE != 0, Unshare: mode&unix.FALLOC_FL_UNSHARE_RANGE != 0, } } // ToLinux converts to a value compatible with fallocate(2)'s mode. 
func (a *AllocateMode) ToLinux() uint32 { rv := uint32(0) if a.KeepSize { rv |= unix.FALLOC_FL_KEEP_SIZE } if a.PunchHole { rv |= unix.FALLOC_FL_PUNCH_HOLE } if a.NoHideStale { rv |= unix.FALLOC_FL_NO_HIDE_STALE } if a.CollapseRange { rv |= unix.FALLOC_FL_COLLAPSE_RANGE } if a.ZeroRange { rv |= unix.FALLOC_FL_ZERO_RANGE } if a.InsertRange { rv |= unix.FALLOC_FL_INSERT_RANGE } if a.Unshare { rv |= unix.FALLOC_FL_UNSHARE_RANGE } return rv } // decode implements encoder.decode. func (a *AllocateMode) decode(b *buffer) { mask := b.Read32() a.KeepSize = mask&0x01 != 0 a.PunchHole = mask&0x02 != 0 a.NoHideStale = mask&0x04 != 0 a.CollapseRange = mask&0x08 != 0 a.ZeroRange = mask&0x10 != 0 a.InsertRange = mask&0x20 != 0 a.Unshare = mask&0x40 != 0 } // encode implements encoder.encode. func (a *AllocateMode) encode(b *buffer) { mask := uint32(0) if a.KeepSize { mask |= 0x01 } if a.PunchHole { mask |= 0x02 } if a.NoHideStale { mask |= 0x04 } if a.CollapseRange { mask |= 0x08 } if a.ZeroRange { mask |= 0x10 } if a.InsertRange { mask |= 0x20 } if a.Unshare { mask |= 0x40 } b.Write32(mask) } // FullStat is used in the result of a MultiGetAttr call. type FullStat struct { QID QID Valid AttrMask Attr Attr } // String implements fmt.Stringer. func (f *FullStat) String() string { return fmt.Sprintf("FullStat{QID: %v, Valid: %v, Attr: %v}", f.QID, f.Valid, f.Attr) } // decode implements encoder.decode. func (f *FullStat) decode(b *buffer) { f.QID.decode(b) f.Valid.decode(b) f.Attr.decode(b) } // encode implements encoder.encode. func (f *FullStat) encode(b *buffer) { f.QID.encode(b) f.Valid.encode(b) f.Attr.encode(b) }
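// The following is an illustrative sketch, not part of the original file: it
// shows how the conversion helpers and masks above compose. It assumes only
// identifiers defined in this file plus the os and fmt packages the file
// already imports; exampleTypesUsage is a hypothetical name.
func exampleTypesUsage() {
	// os.FileMode -> p9.FileMode keeps the permission bits and maps the type
	// bits; OSMode converts back.
	m := ModeFromOS(os.ModeDir | 0755)
	fmt.Println(m.IsDir(), m.OSMode()) // true drwxr-xr-x

	// QIDGenerator atomically hands out QIDs with unique Path values.
	var gen QIDGenerator
	q1, q2 := gen.Get(TypeDir), gen.Get(TypeRegular)
	fmt.Println(q1.Path != q2.Path) // true

	// A getattr request mask is satisfied when the response mask contains it.
	req := AttrMask{Mode: true, Size: true}
	fmt.Println(AttrMaskAll().Contains(req)) // true
}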
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package metric provides primitives for collecting metrics. package metric import ( "errors" "fmt" "sort" "sync/atomic" "time" "google.golang.org/protobuf/types/known/timestamppb" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/log" pb "gvisor.dev/gvisor/pkg/metric/metric_go_proto" "gvisor.dev/gvisor/pkg/sync" ) var ( // ErrNameInUse indicates that another metric is already defined for // the given name. ErrNameInUse = errors.New("metric name already in use") // ErrInitializationDone indicates that the caller tried to create a // new metric after initialization. ErrInitializationDone = errors.New("metric cannot be created after initialization is complete") // WeirdnessMetric is a metric with fields created to track the number // of weird occurrences such as time fallback, partial_result, vsyscall // count, watchdog startup timeouts and stuck tasks. 
WeirdnessMetric = MustCreateNewUint64Metric("/weirdness", true /* sync */, "Increment for weird occurrences of problems such as time fallback, partial result, vsyscalls invoked in the sandbox, watchdog startup timeouts and stuck tasks.", Field{ name: "weirdness_type", allowedValues: []string{"time_fallback", "partial_result", "vsyscall_count", "watchdog_stuck_startup", "watchdog_stuck_tasks"}, }) // SuspiciousOperationsMetric is a metric with fields created to detect // operations such as opening an executable file to write from a gofer. SuspiciousOperationsMetric = MustCreateNewUint64Metric("/suspicious_operations", true /* sync */, "Increment for suspicious operations such as opening an executable file to write from a gofer.", Field{ name: "operation_type", allowedValues: []string{"opened_write_execute_file"}, }) ) // InitStage is the name of a Sentry initialization stage. type InitStage string // List of all Sentry initialization stages. var ( InitRestoreConfig InitStage = "restore_config" InitExecConfig InitStage = "exec_config" InitRestore InitStage = "restore" InitCreateProcess InitStage = "create_process" InitTaskStart InitStage = "task_start" // allStages is the list of allowed stages. allStages = []InitStage{ InitRestoreConfig, InitExecConfig, InitRestore, InitCreateProcess, InitTaskStart, } ) // Uint64Metric encapsulates a uint64 that represents some kind of metric to be // monitored. We currently support metrics with at most one field. // // Metrics are not saved across save/restore and thus reset to zero on restore. // // TODO(b/67298427): Support metric fields. type Uint64Metric struct { // value is the actual value of the metric. It must be accessed atomically. value uint64 // numFields is the number of metric fields. It is immutable once // initialized. numFields int // mu protects the below fields. mu sync.RWMutex `state:"nosave"` // fields is the map of fields in the metric. fields map[string]uint64 } var ( // initialized indicates that all metrics are registered. allMetrics is // immutable once initialized is true. initialized bool // allMetrics are the registered metrics. allMetrics = makeMetricSet() ) // Initialize sends a metric registration event over the event channel. // // Precondition: // * All metrics are registered. // * Initialize/Disable has not been called. func Initialize() error { if initialized { return errors.New("metric.Initialize called after metric.Initialize or metric.Disable") } m := pb.MetricRegistration{} for _, v := range allMetrics.m { m.Metrics = append(m.Metrics, v.metadata) } m.Stages = make([]string, 0, len(allStages)) for _, s := range allStages { m.Stages = append(m.Stages, string(s)) } if err := eventchannel.Emit(&m); err != nil { return fmt.Errorf("unable to emit metric initialize event: %w", err) } initialized = true return nil } // Disable sends an empty metric registration event over the event channel, // disabling metric collection. // // Precondition: // * All metrics are registered. // * Initialize/Disable has not been called. func Disable() error { if initialized { return errors.New("metric.Disable called after metric.Initialize or metric.Disable") } m := pb.MetricRegistration{} if err := eventchannel.Emit(&m); err != nil { return fmt.Errorf("unable to emit metric disable event: %w", err) } initialized = true return nil } type customUint64Metric struct { // metadata describes the metric. It is immutable. metadata *pb.MetricMetadata // value returns the current value of the metric for the given set of // fields. 
It takes a variadic number of field values as arguments. value func(fieldValues ...string) uint64 } // Field contains the field name and allowed values for the metric; it is // used when registering the metric. type Field struct { // name is the metric field name. name string // allowedValues is the list of allowed values for the field. allowedValues []string } // RegisterCustomUint64Metric registers a metric with the given name. // // RegisterCustomUint64Metric must only be called at init and will return an // error if called after Initialize. // // Preconditions: // * name must be globally unique. // * Initialize/Disable have not been called. // * value is expected to accept exactly len(fields) arguments. func RegisterCustomUint64Metric(name string, cumulative, sync bool, units pb.MetricMetadata_Units, description string, value func(...string) uint64, fields ...Field) error { if initialized { return ErrInitializationDone } if _, ok := allMetrics.m[name]; ok { return ErrNameInUse } allMetrics.m[name] = customUint64Metric{ metadata: &pb.MetricMetadata{ Name: name, Description: description, Cumulative: cumulative, Sync: sync, Type: pb.MetricMetadata_TYPE_UINT64, Units: units, }, value: value, } // Metrics can exist without fields. if l := len(fields); l > 1 { return fmt.Errorf("%d fields provided, must be <= 1", l) } for _, field := range fields { allMetrics.m[name].metadata.Fields = append(allMetrics.m[name].metadata.Fields, &pb.MetricMetadata_Field{ FieldName: field.name, AllowedValues: field.allowedValues, }) } return nil } // MustRegisterCustomUint64Metric calls RegisterCustomUint64Metric for metrics // without fields and panics if it returns an error. func MustRegisterCustomUint64Metric(name string, cumulative, sync bool, description string, value func(...string) uint64, fields ...Field) { if err := RegisterCustomUint64Metric(name, cumulative, sync, pb.MetricMetadata_UNITS_NONE, description, value, fields...); err != nil { panic(fmt.Sprintf("Unable to register metric %q: %s", name, err)) } } // NewUint64Metric creates and registers a new cumulative metric with the given // name. // // Metrics must be statically defined (i.e., at init). func NewUint64Metric(name string, sync bool, units pb.MetricMetadata_Units, description string, fields ...Field) (*Uint64Metric, error) { m := Uint64Metric{ numFields: len(fields), } if m.numFields == 1 { m.fields = make(map[string]uint64) for _, fieldValue := range fields[0].allowedValues { m.fields[fieldValue] = 0 } } return &m, RegisterCustomUint64Metric(name, true /* cumulative */, sync, units, description, m.Value, fields...) } // MustCreateNewUint64Metric calls NewUint64Metric and panics if it returns an // error. func MustCreateNewUint64Metric(name string, sync bool, description string, fields ...Field) *Uint64Metric { m, err := NewUint64Metric(name, sync, pb.MetricMetadata_UNITS_NONE, description, fields...) if err != nil { panic(fmt.Sprintf("Unable to create metric %q: %s", name, err)) } return m } // MustCreateNewUint64NanosecondsMetric calls NewUint64Metric and panics if it // returns an error. func MustCreateNewUint64NanosecondsMetric(name string, sync bool, description string) *Uint64Metric { m, err := NewUint64Metric(name, sync, pb.MetricMetadata_UNITS_NANOSECONDS, description) if err != nil { panic(fmt.Sprintf("Unable to create metric %q: %s", name, err)) } return m } // Value returns the current value of the metric for the given set of fields. 
func (m *Uint64Metric) Value(fieldValues ...string) uint64 { if m.numFields != len(fieldValues) { panic(fmt.Sprintf("Number of fieldValues %d is not equal to the number of metric fields %d", len(fieldValues), m.numFields)) } switch m.numFields { case 0: return atomic.LoadUint64(&m.value) case 1: m.mu.RLock() defer m.mu.RUnlock() fieldValue := fieldValues[0] if _, ok := m.fields[fieldValue]; !ok { panic(fmt.Sprintf("Metric does not allow field value %s", fieldValue)) } return m.fields[fieldValue] default: panic("Sentry metrics do not support more than one field") } } // Increment increments the metric field by 1. func (m *Uint64Metric) Increment(fieldValues ...string) { m.IncrementBy(1, fieldValues...) } // IncrementBy increments the metric by v. func (m *Uint64Metric) IncrementBy(v uint64, fieldValues ...string) { if m.numFields != len(fieldValues) { panic(fmt.Sprintf("Number of fieldValues %d is not equal to the number of metric fields %d", len(fieldValues), m.numFields)) } switch m.numFields { case 0: atomic.AddUint64(&m.value, v) return case 1: fieldValue := fieldValues[0] m.mu.Lock() defer m.mu.Unlock() if _, ok := m.fields[fieldValue]; !ok { panic(fmt.Sprintf("Metric does not allow field value %s", fieldValue)) } m.fields[fieldValue] += v default: panic("Sentry metrics do not support more than one field") } } // stageTiming contains timing data for an initialization stage. type stageTiming struct { stage InitStage started time.Time // ended is the zero time when the stage has not ended yet. ended time.Time } // inProgress returns whether this stage hasn't ended yet. func (s stageTiming) inProgress() bool { return !s.started.IsZero() && s.ended.IsZero() } // metricSet holds metric data. type metricSet struct { // Map of metrics. m map[string]customUint64Metric // mu protects the fields below. mu sync.RWMutex // Information about the stages reached by the Sentry. Only appended to, so // reading a shallow copy of the slice header concurrently is safe. finished []stageTiming // The current stage in progress. currentStage stageTiming } // makeMetricSet returns a new metricSet. func makeMetricSet() metricSet { return metricSet{ m: make(map[string]customUint64Metric), finished: make([]stageTiming, 0, len(allStages)), } } // Values returns a snapshot of all values in m. func (m *metricSet) Values() metricValues { m.mu.Lock() stages := m.finished[:] m.mu.Unlock() vals := metricValues{ m: make(map[string]interface{}, len(m.m)), stages: stages, } for k, v := range m.m { fields := v.metadata.GetFields() switch len(fields) { case 0: vals.m[k] = v.value() case 1: values := fields[0].GetAllowedValues() fieldsMap := make(map[string]uint64) for _, fieldValue := range values { fieldsMap[fieldValue] = v.value(fieldValue) } vals.m[k] = fieldsMap default: panic(fmt.Sprintf("Unsupported number of metric fields: %d", len(fields))) } } return vals } // metricValues contains a copy of the values of all metrics. type metricValues struct { // m maps each metric name to its value, which is either a uint64 or a // map[string]uint64, the latter supporting metrics with one field. m map[string]interface{} // Information on when initialization stages were reached. Does not include // the currently-ongoing stage, if any. stages []stageTiming } var ( // emitMu protects metricsAtLastEmit and ensures that all emitted // metrics are strongly ordered (older metrics are never emitted after // newer metrics). emitMu sync.Mutex // metricsAtLastEmit contains the state of the metrics at the last emit event. 
metricsAtLastEmit metricValues ) // EmitMetricUpdate emits a MetricUpdate over the event channel. // // Only metrics that have changed since the last call are emitted. // // EmitMetricUpdate is thread-safe. // // Preconditions: // * Initialize has been called. func EmitMetricUpdate() { emitMu.Lock() defer emitMu.Unlock() snapshot := allMetrics.Values() m := pb.MetricUpdate{} // On the first call metricsAtLastEmit will be empty. Include all // metrics then. for k, v := range snapshot.m { prev, ok := metricsAtLastEmit.m[k] switch t := v.(type) { case uint64: // Metric exists and value did not change. if ok && prev.(uint64) == t { continue } m.Metrics = append(m.Metrics, &pb.MetricValue{ Name: k, Value: &pb.MetricValue_Uint64Value{Uint64Value: t}, }) case map[string]uint64: for fieldValue, metricValue := range t { // Emit data on the first call only if the field // value has been incremented. For all other // calls, emit data if the field value has been // changed from the previous emit. if (!ok && metricValue == 0) || (ok && prev.(map[string]uint64)[fieldValue] == metricValue) { continue } m.Metrics = append(m.Metrics, &pb.MetricValue{ Name: k, FieldValues: []string{fieldValue}, Value: &pb.MetricValue_Uint64Value{Uint64Value: metricValue}, }) } } } for s := len(metricsAtLastEmit.stages); s < len(snapshot.stages); s++ { newStage := snapshot.stages[s] m.StageTiming = append(m.StageTiming, &pb.StageTiming{ Stage: string(newStage.stage), Started: &timestamppb.Timestamp{ Seconds: newStage.started.Unix(), Nanos: int32(newStage.started.Nanosecond()), }, Ended: &timestamppb.Timestamp{ Seconds: newStage.ended.Unix(), Nanos: int32(newStage.ended.Nanosecond()), }, }) } metricsAtLastEmit = snapshot if len(m.Metrics) == 0 && len(m.StageTiming) == 0 { return } if log.IsLogging(log.Debug) { sort.Slice(m.Metrics, func(i, j int) bool { return m.Metrics[i].Name < m.Metrics[j].Name }) log.Debugf("Emitting metrics:") for _, metric := range m.Metrics { log.Debugf("%s: %+v", metric.Name, metric.Value) } for _, stage := range m.StageTiming { duration := time.Duration(stage.Ended.Seconds-stage.Started.Seconds)*time.Second + time.Duration(stage.Ended.Nanos-stage.Started.Nanos)*time.Nanosecond log.Debugf("Stage %s took %v", stage.GetStage(), duration) } } if err := eventchannel.Emit(&m); err != nil { log.Warningf("Unable to emit metrics: %s", err) } } // StartStage should be called when an initialization stage is started. // It returns a function that must be called to indicate that the stage ended. // Alternatively, future calls to StartStage will implicitly indicate that the // previous stage ended. // Stage information will be emitted in the next call to EmitMetricUpdate after // a stage has ended. // // This function may (and is expected to) be called prior to final // initialization of this metric library, as it has to capture early stages // of Sentry initialization. func StartStage(stage InitStage) func() { now := time.Now() allMetrics.mu.Lock() defer allMetrics.mu.Unlock() if allMetrics.currentStage.inProgress() { endStage(now) } allMetrics.currentStage.stage = stage allMetrics.currentStage.started = now return func() { now := time.Now() allMetrics.mu.Lock() defer allMetrics.mu.Unlock() // The current stage may have been ended by another call to StartStage, so // double-check prior to clearing the current stage. if allMetrics.currentStage.inProgress() && allMetrics.currentStage.stage == stage { endStage(now) } } } // endStage marks allMetrics.currentStage as ended, adding it to the list of // finished stages. 
It assumes allMetrics.mu is locked. func endStage(when time.Time) { allMetrics.currentStage.ended = when allMetrics.finished = append(allMetrics.finished, allMetrics.currentStage) allMetrics.currentStage = stageTiming{} }
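// An illustrative sketch, not part of the original file: how a caller
// typically defines a one-field metric and brackets an initialization stage.
// The metric name, field values, and function name below are hypothetical.
var exampleOpens = MustCreateNewUint64Metric("/example/opens", true /* sync */,
	"Example counter of open operations, by result.",
	Field{
		name:          "result",
		allowedValues: []string{"ok", "error"},
	})

func exampleMetricUsage() {
	// Field values must be drawn from allowedValues; anything else panics.
	exampleOpens.Increment("ok")
	exampleOpens.IncrementBy(3, "error")

	// StartStage returns a closure that ends the stage; the timing is
	// emitted by the next EmitMetricUpdate after the stage ends.
	done := StartStage(InitExecConfig)
	// ... the stage's work runs here ...
	done()
}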
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "bytes" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserror" ) // Brk implements linux syscall brk(2). func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr, _ := t.MemoryManager().Brk(t, args[0].Pointer()) // "However, the actual Linux system call returns the new program break on // success. On failure, the system call returns the current break." - // brk(2) return uintptr(addr), nil, nil } // LINT.IfChange // Mmap implements linux syscall mmap(2). func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { prot := args[2].Int() flags := args[3].Int() fd := args[4].Int() fixed := flags&linux.MAP_FIXED != 0 private := flags&linux.MAP_PRIVATE != 0 shared := flags&linux.MAP_SHARED != 0 anon := flags&linux.MAP_ANONYMOUS != 0 map32bit := flags&linux.MAP_32BIT != 0 // Require exactly one of MAP_PRIVATE and MAP_SHARED. 
if private == shared { return 0, nil, linuxerr.EINVAL } opts := memmap.MMapOpts{ Length: args[1].Uint64(), Offset: args[5].Uint64(), Addr: args[0].Pointer(), Fixed: fixed, Unmap: fixed, Map32Bit: map32bit, Private: private, Perms: hostarch.AccessType{ Read: linux.PROT_READ&prot != 0, Write: linux.PROT_WRITE&prot != 0, Execute: linux.PROT_EXEC&prot != 0, }, MaxPerms: hostarch.AnyAccess, GrowsDown: linux.MAP_GROWSDOWN&flags != 0, Precommit: linux.MAP_POPULATE&flags != 0, } if linux.MAP_LOCKED&flags != 0 { opts.MLockMode = memmap.MLockEager } defer func() { if opts.MappingIdentity != nil { opts.MappingIdentity.DecRef(t) } }() if !anon { // Convert the passed FD to a file reference. file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) flags := file.Flags() // mmap unconditionally requires that the FD is readable. if !flags.Read { return 0, nil, linuxerr.EACCES } // MAP_SHARED requires that the FD be writable for PROT_WRITE. if shared && !flags.Write { opts.MaxPerms.Write = false } if err := file.ConfigureMMap(t, &opts); err != nil { return 0, nil, err } } else if shared { // Back shared anonymous mappings with a special mappable. opts.Offset = 0 m, err := mm.NewSharedAnonMappable(opts.Length, t.Kernel()) if err != nil { return 0, nil, err } opts.MappingIdentity = m // transfers ownership of m to opts opts.Mappable = m } rv, err := t.MemoryManager().MMap(t, opts) return uintptr(rv), nil, err } // LINT.ThenChange(vfs2/mmap.go) // Munmap implements linux syscall munmap(2). func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64()) } // Mremap implements linux syscall mremap(2). func Mremap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldAddr := args[0].Pointer() oldSize := args[1].Uint64() newSize := args[2].Uint64() flags := args[3].Uint64() newAddr := args[4].Pointer() if flags&^(linux.MREMAP_MAYMOVE|linux.MREMAP_FIXED) != 0 { return 0, nil, linuxerr.EINVAL } mayMove := flags&linux.MREMAP_MAYMOVE != 0 fixed := flags&linux.MREMAP_FIXED != 0 var moveMode mm.MRemapMoveMode switch { case !mayMove && !fixed: moveMode = mm.MRemapNoMove case mayMove && !fixed: moveMode = mm.MRemapMayMove case mayMove && fixed: moveMode = mm.MRemapMustMove case !mayMove && fixed: // "If MREMAP_FIXED is specified, then MREMAP_MAYMOVE must also be // specified." - mremap(2) return 0, nil, linuxerr.EINVAL } rv, err := t.MemoryManager().MRemap(t, oldAddr, oldSize, newSize, mm.MRemapOpts{ Move: moveMode, NewAddr: newAddr, }) return uintptr(rv), nil, err } // Mprotect implements linux syscall mprotect(2). func Mprotect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { length := args[1].Uint64() prot := args[2].Int() err := t.MemoryManager().MProtect(args[0].Pointer(), length, hostarch.AccessType{ Read: linux.PROT_READ&prot != 0, Write: linux.PROT_WRITE&prot != 0, Execute: linux.PROT_EXEC&prot != 0, }, linux.PROT_GROWSDOWN&prot != 0) return 0, nil, err } // Madvise implements linux syscall madvise(2). func Madvise(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := uint64(args[1].SizeT()) adv := args[2].Int() // "The Linux implementation requires that the address addr be // page-aligned, and allows length to be zero." 
- madvise(2) if addr.RoundDown() != addr { return 0, nil, linuxerr.EINVAL } if length == 0 { return 0, nil, nil } // Not explicitly stated: length need not be page-aligned. lenAddr, ok := hostarch.Addr(length).RoundUp() if !ok { return 0, nil, linuxerr.EINVAL } length = uint64(lenAddr) switch adv { case linux.MADV_DONTNEED: return 0, nil, t.MemoryManager().Decommit(addr, length) case linux.MADV_DOFORK: return 0, nil, t.MemoryManager().SetDontFork(addr, length, false) case linux.MADV_DONTFORK: return 0, nil, t.MemoryManager().SetDontFork(addr, length, true) case linux.MADV_HUGEPAGE, linux.MADV_NOHUGEPAGE: fallthrough case linux.MADV_MERGEABLE, linux.MADV_UNMERGEABLE: fallthrough case linux.MADV_DONTDUMP, linux.MADV_DODUMP: // TODO(b/72045799): Core dumping isn't implemented, so these are // no-ops. fallthrough case linux.MADV_NORMAL, linux.MADV_RANDOM, linux.MADV_SEQUENTIAL, linux.MADV_WILLNEED: // Do nothing; we totally ignore the suggestions above. return 0, nil, nil case linux.MADV_REMOVE: // These "suggestions" have application-visible side effects, so we // have to indicate that we don't support them. return 0, nil, syserror.ENOSYS case linux.MADV_HWPOISON: // Only privileged processes are allowed to poison pages. return 0, nil, linuxerr.EPERM default: // If adv is not a valid value, tell the caller. return 0, nil, linuxerr.EINVAL } } // Mincore implements the syscall mincore(2). func Mincore(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].SizeT() vec := args[2].Pointer() if addr != addr.RoundDown() { return 0, nil, linuxerr.EINVAL } // "The length argument need not be a multiple of the page size, but since // residency information is returned for whole pages, length is effectively // rounded up to the next multiple of the page size." - mincore(2) la, ok := hostarch.Addr(length).RoundUp() if !ok { return 0, nil, syserror.ENOMEM } ar, ok := addr.ToRange(uint64(la)) if !ok { return 0, nil, syserror.ENOMEM } // Pretend that all mapped pages are "resident in core". mapped := t.MemoryManager().VirtualMemorySizeRange(ar) // "ENOMEM: addr to addr + length contained unmapped memory." if mapped != uint64(la) { return 0, nil, syserror.ENOMEM } resident := bytes.Repeat([]byte{1}, int(mapped/hostarch.PageSize)) _, err := t.CopyOutBytes(vec, resident) return 0, nil, err } // Msync implements Linux syscall msync(2). func Msync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].SizeT() flags := args[2].Int() // "The flags argument should specify exactly one of MS_ASYNC and MS_SYNC, // and may additionally include the MS_INVALIDATE bit. ... However, Linux // permits a call to msync() that specifies neither of these flags, with // semantics that are (currently) equivalent to specifying MS_ASYNC." - // msync(2) if flags&^(linux.MS_ASYNC|linux.MS_SYNC|linux.MS_INVALIDATE) != 0 { return 0, nil, linuxerr.EINVAL } sync := flags&linux.MS_SYNC != 0 if sync && flags&linux.MS_ASYNC != 0 { return 0, nil, linuxerr.EINVAL } err := t.MemoryManager().MSync(t, addr, uint64(length), mm.MSyncOpts{ Sync: sync, Invalidate: flags&linux.MS_INVALIDATE != 0, }) // MSync calls fsync; the same interrupt conversion rules apply. See // mm/msync.c and fsync in POSIX.1-2008. return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) } // Mlock implements linux syscall mlock(2). 
func Mlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].SizeT() return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockEager) } // Mlock2 implements linux syscall mlock2(2). func Mlock2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].SizeT() flags := args[2].Int() if flags&^(linux.MLOCK_ONFAULT) != 0 { return 0, nil, linuxerr.EINVAL } mode := memmap.MLockEager if flags&linux.MLOCK_ONFAULT != 0 { mode = memmap.MLockLazy } return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), mode) } // Munlock implements linux syscall munlock(2). func Munlock(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].SizeT() return 0, nil, t.MemoryManager().MLock(t, addr, uint64(length), memmap.MLockNone) } // Mlockall implements linux syscall mlockall(2). func Mlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() if flags&^(linux.MCL_CURRENT|linux.MCL_FUTURE|linux.MCL_ONFAULT) != 0 { return 0, nil, linuxerr.EINVAL } mode := memmap.MLockEager if flags&linux.MCL_ONFAULT != 0 { mode = memmap.MLockLazy } return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ Current: flags&linux.MCL_CURRENT != 0, Future: flags&linux.MCL_FUTURE != 0, Mode: mode, }) } // Munlockall implements linux syscall munlockall(2). func Munlockall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, t.MemoryManager().MLockAll(t, mm.MLockAllOpts{ Current: true, Future: true, Mode: memmap.MLockNone, }) }
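// An illustrative sketch, not part of the original file: the page-rounding
// pattern shared by Madvise and Mincore above, with a hypothetical helper
// name. addr must already be page-aligned, while length is rounded up to
// whole pages; RoundUp's ok result guards against overflow at the top of the
// address space.
func examplePageAlign(addr hostarch.Addr, length uint64) (uint64, error) {
	if addr.RoundDown() != addr {
		return 0, linuxerr.EINVAL // addr is not page-aligned.
	}
	lenAddr, ok := hostarch.Addr(length).RoundUp()
	if !ok {
		return 0, linuxerr.EINVAL // length overflows when rounded up.
	}
	return uint64(lenAddr), nil
}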
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "fmt" "io" "math" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) func (d *dentry) isRegularFile() bool { return d.fileType() == linux.S_IFREG } // +stateify savable type regularFileFD struct { fileDescription // off is the file offset. off is protected by mu. mu sync.Mutex `state:"nosave"` off int64 } func newRegularFileFD(mnt *vfs.Mount, d *dentry, flags uint32) (*regularFileFD, error) { fd := &regularFileFD{} fd.LockFD.Init(&d.locks) if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{ AllowDirectIO: true, }); err != nil { return nil, err } if fd.vfsfd.IsWritable() && (atomic.LoadUint32(&d.mode)&0111 != 0) { metric.SuspiciousOperationsMetric.Increment("opened_write_execute_file") } if atomic.LoadInt32(&d.mmapFD) >= 0 { fsmetric.GoferOpensHost.Increment() } else { fsmetric.GoferOpens9P.Increment() } return fd, nil } // Release implements vfs.FileDescriptionImpl.Release. func (fd *regularFileFD) Release(context.Context) { } // OnClose implements vfs.FileDescriptionImpl.OnClose. func (fd *regularFileFD) OnClose(ctx context.Context) error { if !fd.vfsfd.IsWritable() { return nil } // Skip flushing if there are client-buffered writes, since (as with the // VFS1 client) we don't flush buffered writes on close anyway. d := fd.dentry() if d.fs.opts.interop != InteropModeExclusive { return nil } d.dataMu.RLock() haveDirtyPages := !d.dirty.IsEmpty() d.dataMu.RUnlock() if haveDirtyPages { return nil } d.handleMu.RLock() defer d.handleMu.RUnlock() if d.writeFile.isNil() { return nil } return d.writeFile.flush(ctx) } // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error { d := fd.dentry() return d.doAllocate(ctx, offset, length, func() error { d.handleMu.RLock() defer d.handleMu.RUnlock() return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length) }) } // PRead implements vfs.FileDescriptionImpl.PRead. 
func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { start := fsmetric.StartReadWait() d := fd.dentry() defer func() { if atomic.LoadInt32(&d.readFD) >= 0 { fsmetric.GoferReadsHost.Increment() fsmetric.FinishReadWait(fsmetric.GoferReadWaitHost, start) } else { fsmetric.GoferReads9P.Increment() fsmetric.FinishReadWait(fsmetric.GoferReadWait9P, start) } }() if offset < 0 { return 0, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select preadv2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, linuxerr.EOPNOTSUPP } // Check for reading at EOF before calling into MM (but not under // InteropModeShared, which makes d.size unreliable). if d.cachedMetadataAuthoritative() && uint64(offset) >= atomic.LoadUint64(&d.size) { return 0, io.EOF } var ( n int64 readErr error ) if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { // Lock d.metadataMu for the rest of the read to prevent d.size from // changing. d.metadataMu.Lock() defer d.metadataMu.Unlock() // Write dirty cached pages that will be touched by the read back to // the remote file. if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil { return 0, err } rw := getDentryReadWriter(ctx, d, offset) // Require the read to go to the remote file. rw.direct = true n, readErr = dst.CopyOutFrom(ctx, rw) putDentryReadWriter(rw) if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). d.touchAtimeLocked(fd.vfsfd.Mount()) } } else { rw := getDentryReadWriter(ctx, d, offset) n, readErr = dst.CopyOutFrom(ctx, rw) putDentryReadWriter(rw) if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed(). d.touchAtime(fd.vfsfd.Mount()) } } return n, readErr } // Read implements vfs.FileDescriptionImpl.Read. func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { fd.mu.Lock() n, err := fd.PRead(ctx, dst, fd.off, opts) fd.off += n fd.mu.Unlock() return n, err } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { n, _, err := fd.pwrite(ctx, src, offset, opts) return n, err } // pwrite returns the number of bytes written, final offset, error. The final // offset should be ignored by PWrite. func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) { if offset < 0 { return 0, offset, linuxerr.EINVAL } // Check that flags are supported. // // TODO(gvisor.dev/issue/2601): Support select pwritev2 flags. if opts.Flags&^linux.RWF_HIPRI != 0 { return 0, offset, linuxerr.EOPNOTSUPP } d := fd.dentry() d.metadataMu.Lock() defer d.metadataMu.Unlock() // If the fd was opened with O_APPEND, make sure the file size is updated. // There is a possible race here if size is modified externally after // metadata cache is updated. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() { if err := d.refreshSizeLocked(ctx); err != nil { return 0, offset, err } } // Set offset to file size if the fd was opened with O_APPEND. if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 { // Holding d.metadataMu is sufficient for reading d.size. 
offset = int64(d.size) } limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes()) if err != nil { return 0, offset, err } src = src.TakeFirst64(limit) if d.fs.opts.interop != InteropModeShared { // Compare Linux's mm/filemap.c:__generic_file_write_iter() => // file_update_time(). This is d.touchCMtime(), but without locking // d.metadataMu (recursively). d.touchCMtimeLocked() } rw := getDentryReadWriter(ctx, d, offset) defer putDentryReadWriter(rw) if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 { if err := fd.writeCache(ctx, d, offset, src); err != nil { return 0, offset, err } // Require the write to go to the remote file. rw.direct = true } n, err := src.CopyInTo(ctx, rw) if err != nil { return n, offset + n, err } if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 { // Note that if any of the following fail, then we can't guarantee that // any data was actually written with the semantics of O_DSYNC or // O_SYNC, so we return zero bytes written. Compare Linux's // mm/filemap.c:generic_file_write_iter() => // include/linux/fs.h:generic_write_sync(). // // Write dirty cached pages touched by the write back to the remote // file. if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { return 0, offset, err } // Request the remote filesystem to sync the remote file. if err := d.syncRemoteFile(ctx); err != nil { return 0, offset, err } } // As with Linux, writing clears the setuid and setgid bits. if n > 0 { oldMode := atomic.LoadUint32(&d.mode) // If setuid or setgid were set, update d.mode and propagate // changes to the host. if newMode := vfs.ClearSUIDAndSGID(oldMode); newMode != oldMode { atomic.StoreUint32(&d.mode, newMode) if err := d.file.setAttr(ctx, p9.SetAttrMask{Permissions: true}, p9.SetAttr{Permissions: p9.FileMode(newMode)}); err != nil { return 0, offset, err } } } return n, offset + n, nil } func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64, src usermem.IOSequence) error { // Write dirty cached pages that will be touched by the write back to // the remote file. if err := d.writeback(ctx, offset, src.NumBytes()); err != nil { return err } // Remove touched pages from the cache. pgstart := hostarch.PageRoundDown(uint64(offset)) pgend, ok := hostarch.PageRoundUp(uint64(offset + src.NumBytes())) if !ok { return linuxerr.EINVAL } mr := memmap.MappableRange{pgstart, pgend} var freed []memmap.FileRange d.dataMu.Lock() cseg := d.cache.LowerBoundSegment(mr.Start) for cseg.Ok() && cseg.Start() < mr.End { cseg = d.cache.Isolate(cseg, mr) freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()}) cseg = d.cache.Remove(cseg).NextSegment() } d.dataMu.Unlock() // Invalidate mappings of removed pages. d.mapsMu.Lock() d.mappings.Invalidate(mr, memmap.InvalidateOpts{}) d.mapsMu.Unlock() // Finally free pages removed from the cache. mf := d.fs.mfp.MemoryFile() for _, freedFR := range freed { mf.DecRef(freedFR) } return nil } // Write implements vfs.FileDescriptionImpl.Write. 
func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.mu.Lock() n, off, err := fd.pwrite(ctx, src, fd.off, opts) fd.off = off fd.mu.Unlock() return n, err } type dentryReadWriter struct { ctx context.Context d *dentry off uint64 direct bool } var dentryReadWriterPool = sync.Pool{ New: func() interface{} { return &dentryReadWriter{} }, } func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter { rw := dentryReadWriterPool.Get().(*dentryReadWriter) rw.ctx = ctx rw.d = d rw.off = uint64(offset) rw.direct = false return rw } func putDentryReadWriter(rw *dentryReadWriter) { rw.ctx = nil rw.d = nil dentryReadWriterPool.Put(rw) } // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { if dsts.IsEmpty() { return 0, nil } // If we have a mmappable host FD (which must be used here to ensure // coherence with memory-mapped I/O), or if InteropModeShared is in effect // (which prevents us from caching file contents and makes dentry.size // unreliable), or if the file was opened O_DIRECT, read directly from // dentry.readHandleLocked() without locking dentry.dataMu. rw.d.handleMu.RLock() h := rw.d.readHandleLocked() if (rw.d.mmapFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off) rw.d.handleMu.RUnlock() rw.off += n return n, err } // Otherwise read from/through the cache. mf := rw.d.fs.mfp.MemoryFile() fillCache := mf.ShouldCacheEvictable() var dataMuUnlock func() if fillCache { rw.d.dataMu.Lock() dataMuUnlock = rw.d.dataMu.Unlock } else { rw.d.dataMu.RLock() dataMuUnlock = rw.d.dataMu.RUnlock } // Compute the range to read (limited by file size and overflow-checked). if rw.off >= rw.d.size { dataMuUnlock() rw.d.handleMu.RUnlock() return 0, io.EOF } end := rw.d.size if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end { end = rend } var done uint64 seg, gap := rw.d.cache.Find(rw.off) for rw.off < end { mr := memmap.MappableRange{rw.off, end} switch { case seg.Ok(): // Get internal mappings from the cache. ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read) if err != nil { dataMuUnlock() rw.d.handleMu.RUnlock() return done, err } // Copy from internal mappings. n, err := safemem.CopySeq(dsts, ims) done += n rw.off += n dsts = dsts.DropFirst64(n) if err != nil { dataMuUnlock() rw.d.handleMu.RUnlock() return done, err } // Continue. seg, gap = seg.NextNonEmpty() case gap.Ok(): gapMR := gap.Range().Intersect(mr) if fillCache { // Read into the cache, then re-enter the loop to read from the // cache. gapEnd, _ := hostarch.PageRoundUp(gapMR.End) reqMR := memmap.MappableRange{ Start: hostarch.PageRoundDown(gapMR.Start), End: gapEnd, } optMR := gap.Range() err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size, mf, usage.PageCache, h.readToBlocksAt) mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End}) seg, gap = rw.d.cache.Find(rw.off) if !seg.Ok() { dataMuUnlock() rw.d.handleMu.RUnlock() return done, err } // err might have occurred in part of gap.Range() outside gapMR // (in particular, gap.End() might be beyond EOF). Forget about // it for now; if the error matters and persists, we'll run // into it again in a later iteration of this loop. } else { // Read directly from the file. 
gapDsts := dsts.TakeFirst64(gapMR.Length()) n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start) done += n rw.off += n dsts = dsts.DropFirst64(n) // Partial reads are fine. But we must stop reading. if n != gapDsts.NumBytes() || err != nil { dataMuUnlock() rw.d.handleMu.RUnlock() return done, err } // Continue. seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} } } } dataMuUnlock() rw.d.handleMu.RUnlock() return done, nil } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. // // Preconditions: rw.d.metadataMu must be locked. func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { if srcs.IsEmpty() { return 0, nil } // If we have a mmappable host FD (which must be used here to ensure // coherence with memory-mapped I/O), or if InteropModeShared is in effect // (which prevents us from caching file contents), or if the file was // opened with O_DIRECT, write directly to dentry.writeHandleLocked() // without locking dentry.dataMu. rw.d.handleMu.RLock() h := rw.d.writeHandleLocked() if (rw.d.mmapFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct { n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off) rw.off += n rw.d.dataMu.Lock() if rw.off > rw.d.size { atomic.StoreUint64(&rw.d.size, rw.off) // The remote file's size will implicitly be extended to the correct // value when we write back to it. } rw.d.dataMu.Unlock() rw.d.handleMu.RUnlock() return n, err } // Otherwise write to/through the cache. mf := rw.d.fs.mfp.MemoryFile() rw.d.dataMu.Lock() // Compute the range to write (overflow-checked). start := rw.off end := rw.off + srcs.NumBytes() if end <= rw.off { end = math.MaxInt64 } var ( done uint64 retErr error ) seg, gap := rw.d.cache.Find(rw.off) for rw.off < end { mr := memmap.MappableRange{rw.off, end} switch { case seg.Ok(): // Get internal mappings from the cache. segMR := seg.Range().Intersect(mr) ims, err := mf.MapInternal(seg.FileRangeOf(segMR), hostarch.Write) if err != nil { retErr = err goto exitLoop } // Copy to internal mappings. n, err := safemem.CopySeq(ims, srcs) done += n rw.off += n srcs = srcs.DropFirst64(n) rw.d.dirty.MarkDirty(segMR) if err != nil { retErr = err goto exitLoop } // Continue. seg, gap = seg.NextNonEmpty() case gap.Ok(): // Write directly to the file. At present, we never fill the cache // when writing, since doing so can convert small writes into // inefficient read-modify-write cycles, and we have no mechanism // for detecting or avoiding this. gapMR := gap.Range().Intersect(mr) gapSrcs := srcs.TakeFirst64(gapMR.Length()) n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start) done += n rw.off += n srcs = srcs.DropFirst64(n) // Partial writes are fine. But we must stop writing. if n != gapSrcs.NumBytes() || err != nil { retErr = err goto exitLoop } // Continue. seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{} } } exitLoop: if rw.off > rw.d.size { atomic.StoreUint64(&rw.d.size, rw.off) // The remote file's size will implicitly be extended to the correct // value when we write back to it. } // If InteropModeWritethrough is in effect, flush written data back to the // remote filesystem. if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 { if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{ Start: start, End: rw.off, }, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, h.writeFromBlocksAt); err != nil { // We have no idea how many bytes were actually flushed. 
rw.off = start done = 0 retErr = err } } rw.d.dataMu.Unlock() rw.d.handleMu.RUnlock() return done, retErr } func (d *dentry) writeback(ctx context.Context, offset, size int64) error { if size == 0 { return nil } d.handleMu.RLock() defer d.handleMu.RUnlock() h := d.writeHandleLocked() d.dataMu.Lock() defer d.dataMu.Unlock() // Compute the range of valid bytes (overflow-checked). if uint64(offset) >= d.size { return nil } end := int64(d.size) if rend := offset + size; rend > offset && rend < end { end = rend } return fsutil.SyncDirty(ctx, memmap.MappableRange{ Start: uint64(offset), End: uint64(end), }, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt) } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence) if err != nil { return 0, err } fd.off = newOffset return newOffset, nil } // Calculate the new offset for a seek operation on a regular file. func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) { switch whence { case linux.SEEK_SET: // Use offset as specified. case linux.SEEK_CUR: offset += fdOffset case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE: // Ensure file size is up to date. if !d.cachedMetadataAuthoritative() { if err := d.updateFromGetattr(ctx); err != nil { return 0, err } } size := int64(atomic.LoadUint64(&d.size)) // For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous // block of data. switch whence { case linux.SEEK_END: offset += size case linux.SEEK_DATA: if offset > size { return 0, linuxerr.ENXIO } // Use offset as specified. case linux.SEEK_HOLE: if offset > size { return 0, linuxerr.ENXIO } offset = size } default: return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } return offset, nil } // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *regularFileFD) Sync(ctx context.Context) error { return fd.dentry().syncCachedFile(ctx, false /* lowSyncExpectations */) } // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap. func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { d := fd.dentry() // Force sentry page caching at your own risk. if !d.fs.opts.forcePageCache { switch d.fs.opts.interop { case InteropModeExclusive: // Any mapping is fine. case InteropModeWritethrough: // Shared writable mappings require a host FD, since otherwise we // can't synchronously flush memory-mapped writes to the remote // file. if opts.Private || !opts.MaxPerms.Write { break } fallthrough case InteropModeShared: // All mappings require a host FD to be coherent with other // filesystem users. if atomic.LoadInt32(&d.mmapFD) < 0 { return linuxerr.ENODEV } default: panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop)) } } // After this point, d may be used as a memmap.Mappable. d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init) opts.SentryOwnedContent = d.fs.opts.forcePageCache return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts) } func (d *dentry) mayCachePages() bool { if d.fs.opts.forcePageCache { return true } if d.fs.opts.interop == InteropModeShared { return false } return atomic.LoadInt32(&d.mmapFD) >= 0 } // AddMapping implements memmap.Mappable.AddMapping. 
func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error { d.mapsMu.Lock() mapped := d.mappings.AddMapping(ms, ar, offset, writable) // Do this unconditionally since whether we have a host FD can change // across save/restore. for _, r := range mapped { d.pf.hostFileMapper.IncRefOn(r) } if d.mayCachePages() { // d.Evict() will refuse to evict memory-mapped pages, so tell the // MemoryFile to not bother trying. mf := d.fs.mfp.MemoryFile() for _, r := range mapped { mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End}) } } d.mapsMu.Unlock() return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) { d.mapsMu.Lock() unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable) for _, r := range unmapped { d.pf.hostFileMapper.DecRefOn(r) } if d.mayCachePages() { // Pages that are no longer referenced by any application memory // mappings are now considered unused; allow MemoryFile to evict them // when necessary. mf := d.fs.mfp.MemoryFile() d.dataMu.Lock() for _, r := range unmapped { // Since these pages are no longer mapped, they are no longer // concurrently dirtyable by a writable memory mapping. d.dirty.AllowClean(r) mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End}) } d.dataMu.Unlock() } d.mapsMu.Unlock() } // CopyMapping implements memmap.Mappable.CopyMapping. func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error { return d.AddMapping(ctx, ms, dstAR, offset, writable) } // Translate implements memmap.Mappable.Translate. func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { d.handleMu.RLock() if d.mmapFD >= 0 && !d.fs.opts.forcePageCache { d.handleMu.RUnlock() mr := optional if d.fs.opts.limitHostFDTranslation { mr = maxFillRange(required, optional) } return []memmap.Translation{ { Source: mr, File: &d.pf, Offset: mr.Start, Perms: hostarch.AnyAccess, }, }, nil } d.dataMu.Lock() // Constrain translations to d.size (rounded up) to prevent translation to // pages that may be concurrently truncated. pgend, _ := hostarch.PageRoundUp(d.size) var beyondEOF bool if required.End > pgend { if required.Start >= pgend { d.dataMu.Unlock() d.handleMu.RUnlock() return nil, &memmap.BusError{io.EOF} } beyondEOF = true required.End = pgend } if optional.End > pgend { optional.End = pgend } mf := d.fs.mfp.MemoryFile() h := d.readHandleLocked() cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size, mf, usage.PageCache, h.readToBlocksAt) var ts []memmap.Translation var translatedEnd uint64 for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() { segMR := seg.Range().Intersect(optional) // TODO(jamieliu): Make Translations writable even if writability is // not required if already kept-dirty by another writable translation. perms := hostarch.AccessType{ Read: true, Execute: true, } if at.Write { // From this point forward, this memory can be dirtied through the // mapping at any time. 
d.dirty.KeepDirty(segMR)
			perms.Write = true
		}
		ts = append(ts, memmap.Translation{
			Source: segMR,
			File:   mf,
			Offset: seg.FileRangeOf(segMR).Start,
			Perms:  perms,
		})
		translatedEnd = segMR.End
	}
	d.dataMu.Unlock()
	d.handleMu.RUnlock()
	// Don't return the error returned by d.cache.Fill if it occurred outside
	// of required.
	if translatedEnd < required.End && cerr != nil {
		return ts, &memmap.BusError{cerr}
	}
	if beyondEOF {
		return ts, &memmap.BusError{io.EOF}
	}
	return ts, nil
}

func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
	const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
	if required.Length() >= maxReadahead {
		return required
	}
	if optional.Length() <= maxReadahead {
		return optional
	}
	optional.Start = required.Start
	if optional.Length() <= maxReadahead {
		return optional
	}
	optional.End = optional.Start + maxReadahead
	return optional
}

// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
	// Whether we have a host fd (and consequently what memmap.File is
	// mapped) can change across save/restore, so invalidate all translations
	// unconditionally.
	d.mapsMu.Lock()
	defer d.mapsMu.Unlock()
	d.mappings.InvalidateAll(memmap.InvalidateOpts{})

	// Write the cache's contents back to the remote file so that if we have a
	// host fd after restore, the remote file's contents are coherent.
	mf := d.fs.mfp.MemoryFile()
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	h := d.writeHandleLocked()
	d.dataMu.Lock()
	defer d.dataMu.Unlock()
	if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
		return err
	}

	// Discard the cache so that it's not stored in saved state. This is safe
	// because per InvalidateUnsavable invariants, no new translations can have
	// been returned after we invalidated all existing translations above.
	d.cache.DropAll(mf)
	d.dirty.RemoveAll()

	return nil
}

// Evict implements pgalloc.EvictableMemoryUser.Evict.
func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
	mr := memmap.MappableRange{er.Start, er.End}
	mf := d.fs.mfp.MemoryFile()
	d.mapsMu.Lock()
	defer d.mapsMu.Unlock()
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	h := d.writeHandleLocked()
	d.dataMu.Lock()
	defer d.dataMu.Unlock()

	// Only allow pages that are no longer memory-mapped to be evicted.
	for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
		mgapMR := mgap.Range().Intersect(mr)
		if mgapMR.Length() == 0 {
			continue
		}
		if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
			log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
		}
		d.cache.Drop(mgapMR, mf)
		d.dirty.KeepClean(mgapMR)
	}
}

// dentryPlatformFile implements memmap.File. It exists solely because dentry
// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef.
//
// dentryPlatformFile is only used when a host FD representing the remote file
// is available (i.e. dentry.mmapFD >= 0), and that FD is used for application
// memory mappings (i.e. !filesystem.opts.forcePageCache).
//
// +stateify savable
type dentryPlatformFile struct {
	*dentry

	// fdRefs counts references on memmap.File offsets. fdRefs is protected
	// by dentry.dataMu.
	fdRefs fsutil.FrameRefSet

	// If this dentry represents a regular file, and dentry.mmapFD >= 0,
	// hostFileMapper caches mappings of dentry.mmapFD.
hostFileMapper fsutil.HostFileMapper // hostFileMapperInitOnce is used to lazily initialize hostFileMapper. hostFileMapperInitOnce sync.Once `state:"nosave"` } // IncRef implements memmap.File.IncRef. func (d *dentryPlatformFile) IncRef(fr memmap.FileRange) { d.dataMu.Lock() d.fdRefs.IncRefAndAccount(fr) d.dataMu.Unlock() } // DecRef implements memmap.File.DecRef. func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) { d.dataMu.Lock() d.fdRefs.DecRefAndAccount(fr) d.dataMu.Unlock() } // MapInternal implements memmap.File.MapInternal. func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at hostarch.AccessType) (safemem.BlockSeq, error) { d.handleMu.RLock() defer d.handleMu.RUnlock() return d.hostFileMapper.MapInternal(fr, int(d.mmapFD), at.Write) } // FD implements memmap.File.FD. func (d *dentryPlatformFile) FD() int { d.handleMu.RLock() defer d.handleMu.RUnlock() return int(d.mmapFD) }
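// Illustrative sketch, not part of the gVisor sources: the sync.Once
// lazy-initialization pattern used by dentryPlatformFile.hostFileMapperInitOnce
// above. Initialization is deferred until the first mmap-related use, and
// sync.Once guarantees it runs exactly once even under concurrent callers.
// The mapper and platformFile types below are invented for this example.
package main

import (
	"fmt"
	"sync"
)

// mapper stands in for fsutil.HostFileMapper: something with a non-trivial
// Init that we only want to pay for if memory mapping actually happens.
type mapper struct {
	tables map[uint64]uintptr
}

func (m *mapper) Init() {
	m.tables = make(map[uint64]uintptr)
	fmt.Println("mapper initialized")
}

type platformFile struct {
	mapperInitOnce sync.Once
	mapper         mapper
}

// configureMMap mirrors the shape of regularFileFD.ConfigureMMap: every
// caller funnels through the Once, so the first caller initializes the
// mapper and the rest simply use it.
func (f *platformFile) configureMMap() {
	f.mapperInitOnce.Do(f.mapper.Init)
	// f.mapper is now safe to use.
}

func main() {
	var f platformFile
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			f.configureMMap()
		}()
	}
	wg.Wait() // "mapper initialized" is printed exactly once.
}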
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package semaphore implements System V semaphores.
package semaphore

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
)

const (
	// Maximum semaphore value.
	valueMax = linux.SEMVMX

	// Maximum number of semaphore sets.
	setsMax = linux.SEMMNI

	// Maximum number of semaphores in a semaphore set.
	semsMax = linux.SEMMSL

	// Maximum number of semaphores in all semaphore sets.
	semsTotalMax = linux.SEMMNS
)

// Registry maintains a set of semaphores that can be found by key or ID.
//
// +stateify savable
type Registry struct {
	// userNS owning the ipc namespace this registry belongs to. Immutable.
	userNS *auth.UserNamespace
	// mu protects all fields below.
	mu         sync.Mutex `state:"nosave"`
	semaphores map[int32]*Set
	lastIDUsed int32
	// indexes maintains a mapping between a set's index in the virtual array
	// and its identifier.
	indexes map[int32]int32
}

// Set represents a set of semaphores that can be operated atomically.
//
// +stateify savable
type Set struct {
	// registry owning this sem set. Immutable.
	registry *Registry

	// ID is a handle that identifies the set.
	ID int32

	// key is a user-provided key that can be shared between processes.
	key int32

	// creator is the user that created the set. Immutable.
	creator fs.FileOwner

	// mu protects all fields below.
	mu         sync.Mutex `state:"nosave"`
	owner      fs.FileOwner
	perms      fs.FilePermissions
	opTime     ktime.Time
	changeTime ktime.Time

	// sems holds all semaphores in the set. The slice itself is immutable
	// after it's been set; however, each 'sem' object in the slice requires
	// the 'mu' lock.
	sems []sem

	// dead is set to true when the set is removed and can't be reached
	// anymore. All waiters must wake up and fail when the set is dead.
	dead bool
}

// sem represents a single semaphore from a set.
//
// +stateify savable
type sem struct {
	value   int16
	waiters waiterList `state:"zerovalue"`
	pid     int32
}

// waiter represents a caller that is waiting for the semaphore value to
// become positive or zero.
//
// +stateify savable
type waiter struct {
	waiterEntry

	// value represents how much resource the waiter needs to wake up.
	// The value is either 0 or negative.
	value int16
	ch    chan struct{}
}

// NewRegistry creates a new semaphore set registry.
func NewRegistry(userNS *auth.UserNamespace) *Registry {
	return &Registry{
		userNS:     userNS,
		semaphores: make(map[int32]*Set),
		indexes:    make(map[int32]int32),
	}
}

// FindOrCreate searches for a semaphore set that matches 'key'. If not found,
// it may create a new one if requested. If private is true, key is ignored and
// a new set is always created. If create is false, it fails if a set cannot
// be found. If exclusive is true, it fails if a set with the same key already
// exists.
func (r *Registry) FindOrCreate(ctx context.Context, key, nsems int32, mode linux.FileMode, private, create, exclusive bool) (*Set, error) {
	if nsems < 0 || nsems > semsMax {
		return nil, linuxerr.EINVAL
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	if !private {
		// Look up an existing semaphore.
		if set := r.findByKey(key); set != nil {
			set.mu.Lock()
			defer set.mu.Unlock()

			// Check that the caller can access the semaphore set.
			creds := auth.CredentialsFromContext(ctx)
			if !set.checkPerms(creds, fs.PermsFromMode(mode)) {
				return nil, linuxerr.EACCES
			}

			// Validate parameters.
			if nsems > int32(set.Size()) {
				return nil, linuxerr.EINVAL
			}
			if create && exclusive {
				return nil, linuxerr.EEXIST
			}
			return set, nil
		}

		if !create {
			// Semaphore not found and should not be created.
			return nil, syserror.ENOENT
		}
	}

	// Zero is only valid if an existing set is found.
	if nsems == 0 {
		return nil, linuxerr.EINVAL
	}

	// Apply system limits.
	//
	// The semaphores and indexes maps in a registry always have the same
	// size, so it suffices to check the semaphores map against the system
	// limit here.
	if len(r.semaphores) >= setsMax {
		return nil, syserror.ENOSPC
	}
	if r.totalSems() > int(semsTotalMax-nsems) {
		return nil, syserror.ENOSPC
	}

	// Finally create a new set.
	owner := fs.FileOwnerFromContext(ctx)
	perms := fs.FilePermsFromMode(mode)
	return r.newSet(ctx, key, owner, owner, perms, nsems)
}

// IPCInfo returns information about system-wide semaphore limits and parameters.
func (r *Registry) IPCInfo() *linux.SemInfo {
	return &linux.SemInfo{
		SemMap: linux.SEMMAP,
		SemMni: linux.SEMMNI,
		SemMns: linux.SEMMNS,
		SemMnu: linux.SEMMNU,
		SemMsl: linux.SEMMSL,
		SemOpm: linux.SEMOPM,
		SemUme: linux.SEMUME,
		SemUsz: linux.SEMUSZ,
		SemVmx: linux.SEMVMX,
		SemAem: linux.SEMAEM,
	}
}

// SemInfo returns a seminfo structure containing the same information as
// for IPC_INFO, except that the SemUsz field returns the number of existing
// semaphore sets, and the SemAem field returns the number of existing
// semaphores.
func (r *Registry) SemInfo() *linux.SemInfo {
	r.mu.Lock()
	defer r.mu.Unlock()

	info := r.IPCInfo()
	info.SemUsz = uint32(len(r.semaphores))
	info.SemAem = uint32(r.totalSems())

	return info
}

// HighestIndex returns the index of the highest used entry in
// the kernel's array.
func (r *Registry) HighestIndex() int32 {
	r.mu.Lock()
	defer r.mu.Unlock()

	// By default, the highest used index is 0 even when there are no
	// semaphore sets.
	var highestIndex int32
	for index := range r.indexes {
		if index > highestIndex {
			highestIndex = index
		}
	}
	return highestIndex
}

// RemoveID removes the set with the given 'id' from the registry and marks
// the set as dead. All waiters will be awakened and fail.
func (r *Registry) RemoveID(id int32, creds *auth.Credentials) error {
	r.mu.Lock()
	defer r.mu.Unlock()

	set := r.semaphores[id]
	if set == nil {
		return linuxerr.EINVAL
	}
	index, found := r.findIndexByID(id)
	if !found {
		// Inconsistent state.
		panic(fmt.Sprintf("unable to find an index for ID: %d", id))
	}

	set.mu.Lock()
	defer set.mu.Unlock()

	// "The effective user ID of the calling process must match the creator or
	// owner of the semaphore set, or the caller must be privileged."
	if !set.checkCredentials(creds) && !set.checkCapability(creds) {
		return linuxerr.EACCES
	}

	delete(r.semaphores, set.ID)
	delete(r.indexes, index)
	set.destroy()
	return nil
}

func (r *Registry) newSet(ctx context.Context, key int32, owner, creator fs.FileOwner, perms fs.FilePermissions, nsems int32) (*Set, error) {
	set := &Set{
		registry:   r,
		key:        key,
		owner:      owner,
		creator:    creator,
		perms:      perms,
		changeTime: ktime.NowFromContext(ctx),
		sems:       make([]sem, nsems),
	}

	// Find the next available ID.
	for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ {
		// Handle wrap around.
		if id < 0 {
			id = 0
			continue
		}
		if r.semaphores[id] == nil {
			index, found := r.findFirstAvailableIndex()
			if !found {
				panic("unable to find an available index")
			}
			r.indexes[index] = id
			r.lastIDUsed = id
			r.semaphores[id] = set
			set.ID = id
			return set, nil
		}
	}

	log.Warningf("Semaphore map is full, they must be leaking")
	return nil, syserror.ENOMEM
}

// FindByID looks up a set given an ID.
func (r *Registry) FindByID(id int32) *Set {
	r.mu.Lock()
	defer r.mu.Unlock()
	return r.semaphores[id]
}

// FindByIndex looks up a set given an index.
func (r *Registry) FindByIndex(index int32) *Set {
	r.mu.Lock()
	defer r.mu.Unlock()

	id, present := r.indexes[index]
	if !present {
		return nil
	}
	return r.semaphores[id]
}

func (r *Registry) findByKey(key int32) *Set {
	for _, v := range r.semaphores {
		if v.key == key {
			return v
		}
	}
	return nil
}

func (r *Registry) findIndexByID(id int32) (int32, bool) {
	for k, v := range r.indexes {
		if v == id {
			return k, true
		}
	}
	return 0, false
}

func (r *Registry) findFirstAvailableIndex() (int32, bool) {
	for index := int32(0); index < setsMax; index++ {
		if _, present := r.indexes[index]; !present {
			return index, true
		}
	}
	return 0, false
}

func (r *Registry) totalSems() int {
	totalSems := 0
	for _, v := range r.semaphores {
		totalSems += v.Size()
	}
	return totalSems
}

func (s *Set) findSem(num int32) *sem {
	if num < 0 || int(num) >= s.Size() {
		return nil
	}
	return &s.sems[num]
}

// Size returns the number of semaphores in the set. Size is immutable.
func (s *Set) Size() int {
	return len(s.sems)
}

// Change changes some fields from the set atomically.
func (s *Set) Change(ctx context.Context, creds *auth.Credentials, owner fs.FileOwner, perms fs.FilePermissions) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	// "The effective UID of the calling process must match the owner or creator
	// of the semaphore set, or the caller must be privileged."
	if !s.checkCredentials(creds) && !s.checkCapability(creds) {
		return linuxerr.EACCES
	}

	s.owner = owner
	s.perms = perms
	s.changeTime = ktime.NowFromContext(ctx)
	return nil
}

// GetStat extracts semid_ds information from the set.
func (s *Set) GetStat(creds *auth.Credentials) (*linux.SemidDS, error) {
	// "The calling process must have read permission on the semaphore set."
	return s.semStat(creds, fs.PermMask{Read: true})
}

// GetStatAny extracts semid_ds information from the set without requiring
// read access.
func (s *Set) GetStatAny(creds *auth.Credentials) (*linux.SemidDS, error) {
	return s.semStat(creds, fs.PermMask{})
}

func (s *Set) semStat(creds *auth.Credentials, permMask fs.PermMask) (*linux.SemidDS, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	if !s.checkPerms(creds, permMask) {
		return nil, linuxerr.EACCES
	}

	return &linux.SemidDS{
		SemPerm: linux.IPCPerm{
			Key:  uint32(s.key),
			UID:  uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)),
			GID:  uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)),
			CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)),
			CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)),
			Mode: uint16(s.perms.LinuxMode()),
			Seq:  0, // IPC sequence not supported.
		},
		SemOTime: s.opTime.TimeT(),
		SemCTime: s.changeTime.TimeT(),
		SemNSems: uint64(s.Size()),
	}, nil
}

// SetVal overrides a semaphore value, waking up waiters as needed.
func (s *Set) SetVal(ctx context.Context, num int32, val int16, creds *auth.Credentials, pid int32) error {
	if val < 0 || val > valueMax {
		return linuxerr.ERANGE
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	// "The calling process must have alter permission on the semaphore set."
	if !s.checkPerms(creds, fs.PermMask{Write: true}) {
		return linuxerr.EACCES
	}

	sem := s.findSem(num)
	if sem == nil {
		return linuxerr.ERANGE
	}

	// TODO(gvisor.dev/issue/137): Clear undo entries in all processes.
	sem.value = val
	sem.pid = pid
	s.changeTime = ktime.NowFromContext(ctx)
	sem.wakeWaiters()
	return nil
}

// SetValAll overrides all semaphore values, waking up waiters as needed. It
// also sets each semaphore's PID, behavior that Linux only gained in 4.6.
//
// 'len(vals)' must be equal to 's.Size()'.
func (s *Set) SetValAll(ctx context.Context, vals []uint16, creds *auth.Credentials, pid int32) error {
	if len(vals) != s.Size() {
		panic(fmt.Sprintf("vals length (%d) different than Set.Size() (%d)", len(vals), s.Size()))
	}

	for _, val := range vals {
		if val > valueMax {
			return linuxerr.ERANGE
		}
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	// "The calling process must have alter permission on the semaphore set."
	if !s.checkPerms(creds, fs.PermMask{Write: true}) {
		return linuxerr.EACCES
	}

	for i, val := range vals {
		sem := &s.sems[i]

		// TODO(gvisor.dev/issue/137): Clear undo entries in all processes.
		sem.value = int16(val)
		sem.pid = pid
		sem.wakeWaiters()
	}
	s.changeTime = ktime.NowFromContext(ctx)
	return nil
}

// GetVal returns a semaphore value.
func (s *Set) GetVal(num int32, creds *auth.Credentials) (int16, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// "The calling process must have read permission on the semaphore set."
	if !s.checkPerms(creds, fs.PermMask{Read: true}) {
		return 0, linuxerr.EACCES
	}

	sem := s.findSem(num)
	if sem == nil {
		return 0, linuxerr.ERANGE
	}
	return sem.value, nil
}

// GetValAll returns the values of all semaphores in the set.
func (s *Set) GetValAll(creds *auth.Credentials) ([]uint16, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// "The calling process must have read permission on the semaphore set."
	if !s.checkPerms(creds, fs.PermMask{Read: true}) {
		return nil, linuxerr.EACCES
	}

	vals := make([]uint16, s.Size())
	for i, sem := range s.sems {
		vals[i] = uint16(sem.value)
	}
	return vals, nil
}

// GetPID returns the PID set when performing operations in the semaphore.
func (s *Set) GetPID(num int32, creds *auth.Credentials) (int32, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// "The calling process must have read permission on the semaphore set."
	if !s.checkPerms(creds, fs.PermMask{Read: true}) {
		return 0, linuxerr.EACCES
	}

	sem := s.findSem(num)
	if sem == nil {
		return 0, linuxerr.ERANGE
	}
	return sem.pid, nil
}

func (s *Set) countWaiters(num int32, creds *auth.Credentials, pred func(w *waiter) bool) (uint16, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// The calling process must have read permission on the semaphore set.
	if !s.checkPerms(creds, fs.PermMask{Read: true}) {
		return 0, linuxerr.EACCES
	}

	sem := s.findSem(num)
	if sem == nil {
		return 0, linuxerr.ERANGE
	}
	var cnt uint16
	for w := sem.waiters.Front(); w != nil; w = w.Next() {
		if pred(w) {
			cnt++
		}
	}
	return cnt, nil
}

// CountZeroWaiters returns the number of waiters waiting for the sem's value
// to become zero.
func (s *Set) CountZeroWaiters(num int32, creds *auth.Credentials) (uint16, error) {
	return s.countWaiters(num, creds, func(w *waiter) bool { return w.value == 0 })
}

// CountNegativeWaiters returns the number of waiters waiting for the sem's
// value to increase.
func (s *Set) CountNegativeWaiters(num int32, creds *auth.Credentials) (uint16, error) {
	return s.countWaiters(num, creds, func(w *waiter) bool { return w.value < 0 })
}

// ExecuteOps attempts to execute a list of operations on the set. It only
// succeeds when all operations can be applied. No changes are made if it
// fails.
//
// On failure, it may return an error (retries are hopeless) or it may return
// a channel that can be waited on before attempting again.
func (s *Set) ExecuteOps(ctx context.Context, ops []linux.Sembuf, creds *auth.Credentials, pid int32) (chan struct{}, int32, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Did it race with a removal operation?
	if s.dead {
		return nil, 0, syserror.EIDRM
	}

	// Validate the operations.
	readOnly := true
	for _, op := range ops {
		if s.findSem(int32(op.SemNum)) == nil {
			return nil, 0, linuxerr.EFBIG
		}
		if op.SemOp != 0 {
			readOnly = false
		}
	}

	if !s.checkPerms(creds, fs.PermMask{Read: readOnly, Write: !readOnly}) {
		return nil, 0, linuxerr.EACCES
	}

	ch, num, err := s.executeOps(ctx, ops, pid)
	if err != nil {
		return nil, 0, err
	}
	return ch, num, nil
}

func (s *Set) executeOps(ctx context.Context, ops []linux.Sembuf, pid int32) (chan struct{}, int32, error) {
	// Changes to semaphores go to this slice temporarily until they all
	// succeed.
	tmpVals := make([]int16, len(s.sems))
	for i := range s.sems {
		tmpVals[i] = s.sems[i].value
	}

	for _, op := range ops {
		sem := &s.sems[op.SemNum]
		if op.SemOp == 0 {
			// Handle 'wait for zero' operation.
			if tmpVals[op.SemNum] != 0 {
				// Semaphore isn't 0, must wait.
				if op.SemFlg&linux.IPC_NOWAIT != 0 {
					return nil, 0, syserror.ErrWouldBlock
				}

				w := newWaiter(op.SemOp)
				sem.waiters.PushBack(w)
				return w.ch, int32(op.SemNum), nil
			}
		} else {
			if op.SemOp < 0 {
				// Handle 'wait' operation.
				if -op.SemOp > valueMax {
					return nil, 0, linuxerr.ERANGE
				}
				if -op.SemOp > tmpVals[op.SemNum] {
					// Not enough resources, must wait.
					if op.SemFlg&linux.IPC_NOWAIT != 0 {
						return nil, 0, syserror.ErrWouldBlock
					}

					w := newWaiter(op.SemOp)
					sem.waiters.PushBack(w)
					return w.ch, int32(op.SemNum), nil
				}
			} else {
				// op.SemOp > 0: Handle 'signal' operation.
				if tmpVals[op.SemNum] > valueMax-op.SemOp {
					return nil, 0, linuxerr.ERANGE
				}
			}

			tmpVals[op.SemNum] += op.SemOp
		}
	}

	// All operations succeeded, apply them.
	// TODO(gvisor.dev/issue/137): handle undo operations.
	for i, v := range tmpVals {
		s.sems[i].value = v
		s.sems[i].wakeWaiters()
		s.sems[i].pid = pid
	}
	s.opTime = ktime.NowFromContext(ctx)
	return nil, 0, nil
}

// AbortWait notifies that a waiter is giving up and will not wait on the
// channel anymore.
func (s *Set) AbortWait(num int32, ch chan struct{}) {
	s.mu.Lock()
	defer s.mu.Unlock()

	sem := &s.sems[num]
	for w := sem.waiters.Front(); w != nil; w = w.Next() {
		if w.ch == ch {
			sem.waiters.Remove(w)
			return
		}
	}
	// Waiter may not be found in case it raced with wakeWaiters().
}

func (s *Set) checkCredentials(creds *auth.Credentials) bool {
	return s.owner.UID == creds.EffectiveKUID ||
		s.owner.GID == creds.EffectiveKGID ||
		s.creator.UID == creds.EffectiveKUID ||
		s.creator.GID == creds.EffectiveKGID
}

func (s *Set) checkCapability(creds *auth.Credentials) bool {
	return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) && creds.UserNamespace.MapFromKUID(s.owner.UID).Ok()
}

func (s *Set) checkPerms(creds *auth.Credentials, reqPerms fs.PermMask) bool {
	// Are we owner, or in group, or other?
	p := s.perms.Other
	if s.owner.UID == creds.EffectiveKUID {
		p = s.perms.User
	} else if creds.InGroup(s.owner.GID) {
		p = s.perms.Group
	}

	// Are permissions satisfied without capability checks?
	if p.SupersetOf(reqPerms) {
		return true
	}

	return s.checkCapability(creds)
}

// destroy destroys the set.
//
// Preconditions: Caller must hold 's.mu'.
func (s *Set) destroy() {
	// Notify all waiters. They will fail on the next attempt to execute
	// operations and return an error.
	s.dead = true
	for i := range s.sems {
		// Iterate by index so that we operate on the semaphores themselves
		// rather than on copies; otherwise waiters.Reset() would have no
		// effect on the stored lists.
		sem := &s.sems[i]
		for w := sem.waiters.Front(); w != nil; w = w.Next() {
			w.ch <- struct{}{}
		}
		sem.waiters.Reset()
	}
}

func abs(val int16) int16 {
	if val < 0 {
		return -val
	}
	return val
}

// wakeWaiters goes over all waiters and checks which of them can be notified.
func (s *sem) wakeWaiters() {
	// Note that this will release all waiters waiting for 0 too.
	for w := s.waiters.Front(); w != nil; {
		if s.value < abs(w.value) {
			// Still blocked, skip it.
			w = w.Next()
			continue
		}
		w.ch <- struct{}{}
		old := w
		w = w.Next()
		s.waiters.Remove(old)
	}
}

func newWaiter(val int16) *waiter {
	return &waiter{
		value: val,
		ch:    make(chan struct{}, 1),
	}
}
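// Illustrative sketch, not part of the gVisor sources: the wait/retry
// protocol that callers of Set.ExecuteOps are expected to follow. When
// ExecuteOps returns a channel, the caller blocks on it and retries the
// whole operation list from scratch, since the set may have changed by the
// time it is woken. The buffered channel of size 1 (as in newWaiter) lets
// wakers signal without blocking. The trySemOps type and semOpLoop function
// below are invented stand-ins for the real *semaphore.Set plumbing.
package main

import (
	"fmt"
	"sync"
)

// trySemOps mimics the ExecuteOps contract: on success it returns a nil
// channel; when the caller must block it returns a non-nil channel that is
// signaled once a retry may succeed.
type trySemOps func() (chan struct{}, error)

// semOpLoop retries until the operations apply atomically, mirroring how a
// semop syscall path would wait on the channel returned by ExecuteOps.
func semOpLoop(try trySemOps) error {
	for {
		ch, err := try()
		if err != nil {
			return err // Retries are hopeless (e.g. EACCES, ERANGE, EIDRM).
		}
		if ch == nil {
			return nil // All operations were applied atomically.
		}
		<-ch // Block until a waker signals, then retry from scratch.
	}
}

func main() {
	var mu sync.Mutex
	sem := 0                           // A toy one-semaphore "set".
	wake := make(chan struct{}, 1)     // Buffered, so wakers never block.

	// A "signal" from elsewhere: bump the semaphore and notify the waiter.
	go func() {
		mu.Lock()
		sem++
		mu.Unlock()
		wake <- struct{}{}
	}()

	err := semOpLoop(func() (chan struct{}, error) {
		mu.Lock()
		defer mu.Unlock()
		if sem < 1 {
			return wake, nil // Would block: wait for a wakeup, then retry.
		}
		sem-- // The 'wait' operation succeeds.
		return nil, nil
	})
	fmt.Println("semop done:", err)
}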
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package auth

import (
	"gvisor.dev/gvisor/pkg/context"
)

// contextID is the auth package's type for context.Context.Value keys.
type contextID int

const (
	// CtxCredentials is a Context.Value key for Credentials.
	CtxCredentials contextID = iota
)

// CredentialsFromContext returns a copy of the Credentials used by ctx, or a
// set of Credentials with no capabilities if ctx does not have Credentials.
func CredentialsFromContext(ctx context.Context) *Credentials {
	if v := ctx.Value(CtxCredentials); v != nil {
		return v.(*Credentials)
	}
	return NewAnonymousCredentials()
}

// ContextWithCredentials returns a copy of ctx carrying creds.
func ContextWithCredentials(ctx context.Context, creds *Credentials) context.Context {
	return &authContext{ctx, creds}
}

type authContext struct {
	context.Context
	creds *Credentials
}

// Value implements context.Context.
func (ac *authContext) Value(key interface{}) interface{} {
	switch key {
	case CtxCredentials:
		return ac.creds
	default:
		return ac.Context.Value(key)
	}
}
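// Illustrative sketch, not part of the gVisor sources: the context-wrapping
// pattern used by ContextWithCredentials above, shown with the standard
// library context package. The wrapper embeds the parent context and
// overrides Value for a single package-private key, delegating every other
// lookup to the parent. The demoContext, userKey, and user names are
// invented for this example.
package main

import (
	"context"
	"fmt"
)

// demoKey is an unexported key type, so no other package can collide with it.
type demoKey int

const userKey demoKey = 0

type user struct{ name string }

// demoContext overrides Value for userKey and defers to the parent otherwise,
// just as authContext does for CtxCredentials.
type demoContext struct {
	context.Context
	u *user
}

func (c *demoContext) Value(key interface{}) interface{} {
	if key == userKey {
		return c.u
	}
	return c.Context.Value(key)
}

func withUser(ctx context.Context, u *user) context.Context {
	return &demoContext{ctx, u}
}

func userFromContext(ctx context.Context) *user {
	if v := ctx.Value(userKey); v != nil {
		return v.(*user)
	}
	// Fallback, analogous to NewAnonymousCredentials.
	return &user{name: "anonymous"}
}

func main() {
	ctx := withUser(context.Background(), &user{name: "alice"})
	fmt.Println(userFromContext(ctx).name)                  // "alice"
	fmt.Println(userFromContext(context.Background()).name) // "anonymous"
}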
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package buffer provides the implementation of a buffer view.
package buffer

import (
	"bytes"
	"fmt"
	"io"
)

// View is a slice of a buffer, with convenience methods.
type View []byte

// NewView allocates a new buffer and returns an initialized view that covers
// the whole buffer.
func NewView(size int) View {
	return make(View, size)
}

// NewViewFromBytes allocates a new buffer and copies in the given bytes.
func NewViewFromBytes(b []byte) View {
	return append(View(nil), b...)
}

// TrimFront removes the first "count" bytes from the visible section of the
// buffer.
func (v *View) TrimFront(count int) {
	*v = (*v)[count:]
}

// CapLength irreversibly reduces the length of the visible section of the
// buffer to the value specified.
func (v *View) CapLength(length int) {
	// We also set the slice cap because if we don't, one would be able to
	// expand the view back to include the region just excluded. We want to
	// prevent that to avoid a potential data leak if there is uninitialized
	// data in the excluded region.
	*v = (*v)[:length:length]
}

// Reader returns a bytes.Reader for v.
func (v *View) Reader() bytes.Reader {
	var r bytes.Reader
	r.Reset(*v)
	return r
}

// ToVectorisedView returns a VectorisedView containing the receiver.
func (v View) ToVectorisedView() VectorisedView {
	if len(v) == 0 {
		return VectorisedView{}
	}
	return NewVectorisedView(len(v), []View{v})
}

// IsEmpty returns whether v is of length zero.
func (v View) IsEmpty() bool {
	return len(v) == 0
}

// Size returns the length of v.
func (v View) Size() int {
	return len(v)
}

// VectorisedView is a vectorised version of View using non-contiguous memory.
// It supports all the convenience methods supported by View.
//
// +stateify savable
type VectorisedView struct {
	views []View
	size  int
}

// NewVectorisedView creates a new vectorised view from an already-allocated
// slice of View and sets its size.
func NewVectorisedView(size int, views []View) VectorisedView {
	return VectorisedView{views: views, size: size}
}

// TrimFront removes the first "count" bytes of the vectorised view. Trimming
// more than vv.Size() bytes simply empties the view.
func (vv *VectorisedView) TrimFront(count int) {
	for count > 0 && len(vv.views) > 0 {
		if count < len(vv.views[0]) {
			vv.size -= count
			vv.views[0].TrimFront(count)
			return
		}
		count -= len(vv.views[0])
		vv.removeFirst()
	}
}

// Read implements io.Reader.
func (vv *VectorisedView) Read(b []byte) (copied int, err error) {
	count := len(b)
	for count > 0 && len(vv.views) > 0 {
		if count < len(vv.views[0]) {
			vv.size -= count
			copy(b[copied:], vv.views[0][:count])
			vv.views[0].TrimFront(count)
			copied += count
			return copied, nil
		}
		count -= len(vv.views[0])
		copy(b[copied:], vv.views[0])
		copied += len(vv.views[0])
		vv.removeFirst()
	}
	if copied == 0 {
		return 0, io.EOF
	}
	return copied, nil
}

// ReadToVV reads up to count bytes from vv to dstVV and removes them from vv.
// It returns the number of bytes copied.
func (vv *VectorisedView) ReadToVV(dstVV *VectorisedView, count int) (copied int) {
	for count > 0 && len(vv.views) > 0 {
		if count < len(vv.views[0]) {
			vv.size -= count
			dstVV.AppendView(vv.views[0][:count])
			vv.views[0].TrimFront(count)
			copied += count
			return
		}
		count -= len(vv.views[0])
		dstVV.AppendView(vv.views[0])
		copied += len(vv.views[0])
		vv.removeFirst()
	}
	return copied
}

// ReadTo writes the contents of vv to dst. The written bytes are also
// removed from vv unless peek is true.
func (vv *VectorisedView) ReadTo(dst io.Writer, peek bool) (int, error) {
	var err error
	done := 0
	for _, v := range vv.Views() {
		var n int
		n, err = dst.Write(v)
		done += n
		if err != nil {
			break
		}
		if n != len(v) {
			panic(fmt.Sprintf("io.Writer.Write succeeded with incomplete write: %d != %d", n, len(v)))
		}
	}
	if !peek {
		vv.TrimFront(done)
	}
	return done, err
}

// CapLength irreversibly reduces the length of the vectorised view.
func (vv *VectorisedView) CapLength(length int) {
	if length < 0 {
		length = 0
	}
	if vv.size < length {
		return
	}
	vv.size = length
	for i := range vv.views {
		v := &vv.views[i]
		if len(*v) >= length {
			if length == 0 {
				vv.views = vv.views[:i]
			} else {
				v.CapLength(length)
				vv.views = vv.views[:i+1]
			}
			return
		}
		length -= len(*v)
	}
}

// Clone returns a clone of this VectorisedView.
// If the buffer argument is large enough to contain all the Views of this
// VectorisedView, the method will avoid allocations and use the buffer to
// store the Views of the clone.
func (vv VectorisedView) Clone(buffer []View) VectorisedView {
	return VectorisedView{views: append(buffer[:0], vv.views...), size: vv.size}
}

// PullUp returns the first "count" bytes of the vectorised view. If those
// bytes aren't already contiguous inside the vectorised view, PullUp will
// reallocate as needed to make them contiguous. PullUp fails and returns false
// when count > vv.Size().
func (vv *VectorisedView) PullUp(count int) (View, bool) {
	if len(vv.views) == 0 {
		return nil, count == 0
	}
	if count <= len(vv.views[0]) {
		return vv.views[0][:count], true
	}
	if count > vv.size {
		return nil, false
	}

	newFirst := NewView(count)
	i := 0
	for offset := 0; offset < count; i++ {
		copy(newFirst[offset:], vv.views[i])
		if count-offset < len(vv.views[i]) {
			vv.views[i].TrimFront(count - offset)
			break
		}
		offset += len(vv.views[i])
		vv.views[i] = nil
	}

	// We're guaranteed that i > 0, since count is too large for the first
	// view.
	vv.views[i-1] = newFirst
	vv.views = vv.views[i-1:]
	return newFirst, true
}

// Size returns the size in bytes of the entire content stored in the
// vectorised view.
func (vv *VectorisedView) Size() int {
	return vv.size
}

// MemSize returns the estimated size of the vv in memory, including backing
// buffer data.
func (vv *VectorisedView) MemSize() int {
	var size int
	for _, v := range vv.views {
		size += cap(v)
	}
	return size + cap(vv.views)*viewStructSize + vectorisedViewStructSize
}

// ToView returns a single view containing the content of the vectorised view.
//
// If the vectorised view contains a single view, that view will be returned
// directly.
func (vv *VectorisedView) ToView() View {
	if len(vv.views) == 1 {
		return vv.views[0]
	}
	return vv.ToOwnedView()
}

// ToOwnedView returns a single, newly-allocated view containing the content
// of the vectorised view. The result is owned by the caller; vv keeps no
// reference to it.
func (vv *VectorisedView) ToOwnedView() View {
	u := make([]byte, 0, vv.size)
	for _, v := range vv.views {
		u = append(u, v...)
	}
	return u
}

// Views returns the slice containing all the views.
func (vv *VectorisedView) Views() []View {
	return vv.views
}

// Append appends the views in a vectorised view to this vectorised view.
func (vv *VectorisedView) Append(vv2 VectorisedView) {
	vv.views = append(vv.views, vv2.views...)
	vv.size += vv2.size
}

// AppendView appends the given view into this vectorised view.
func (vv *VectorisedView) AppendView(v View) {
	if len(v) == 0 {
		return
	}
	vv.views = append(vv.views, v)
	vv.size += len(v)
}

// AppendViews appends views to vv.
func (vv *VectorisedView) AppendViews(views []View) {
	vv.views = append(vv.views, views...)
	for _, v := range views {
		vv.size += len(v)
	}
}

// Readers returns a bytes.Reader for each of vv's views.
func (vv *VectorisedView) Readers() []bytes.Reader {
	readers := make([]bytes.Reader, 0, len(vv.views))
	for _, v := range vv.views {
		readers = append(readers, v.Reader())
	}
	return readers
}

// removeFirst panics when len(vv.views) < 1.
func (vv *VectorisedView) removeFirst() {
	vv.size -= len(vv.views[0])
	vv.views[0] = nil
	vv.views = vv.views[1:]
}
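// Illustrative usage sketch, not part of the package: it exercises only the
// VectorisedView methods defined above (AppendView, Size, PullUp, TrimFront,
// Read) to show their semantics. Views are appended without copying,
// TrimFront consumes bytes across view boundaries, and PullUp only copies
// when the requested prefix spans more than one view.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip/buffer"
)

func main() {
	var vv buffer.VectorisedView
	vv.AppendView(buffer.NewViewFromBytes([]byte("hello, ")))
	vv.AppendView(buffer.NewViewFromBytes([]byte("world")))
	fmt.Println(vv.Size()) // 12

	// The first 7 bytes live entirely in the first view, so this PullUp
	// returns a sub-slice without allocating.
	if v, ok := vv.PullUp(7); ok {
		fmt.Printf("%q\n", string(v)) // "hello, "
	}

	// The first 9 bytes span both views, so this PullUp allocates a new
	// contiguous view and makes it the first view of vv.
	if v, ok := vv.PullUp(9); ok {
		fmt.Printf("%q\n", string(v)) // "hello, wo"
	}

	// TrimFront consumes bytes from the front, across view boundaries.
	vv.TrimFront(7)
	fmt.Println(vv.Size()) // 5

	// Read drains the remaining bytes like an io.Reader.
	out := make([]byte, 16)
	n, _ := vv.Read(out)
	fmt.Printf("%q\n", string(out[:n])) // "world"
}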
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

// renoState stores the variables related to the TCP New Reno congestion
// control algorithm.
//
// +stateify savable
type renoState struct {
	s *sender
}

// newRenoCC initializes the state for the NewReno congestion control
// algorithm.
func newRenoCC(s *sender) *renoState {
	return &renoState{s: s}
}

// updateSlowStart will update the congestion window as per the slow-start
// algorithm used by NewReno. If adjusting the congestion window crosses the
// ssthresh, it returns the number of packets that must be consumed in
// congestion avoidance mode.
func (r *renoState) updateSlowStart(packetsAcked int) int {
	// Don't let the congestion window cross into the congestion
	// avoidance range.
	newcwnd := r.s.SndCwnd + packetsAcked
	if newcwnd >= r.s.Ssthresh {
		newcwnd = r.s.Ssthresh
		r.s.SndCAAckCount = 0
	}

	packetsAcked -= newcwnd - r.s.SndCwnd
	r.s.SndCwnd = newcwnd
	return packetsAcked
}

// updateCongestionAvoidance will update the congestion window in congestion
// avoidance mode as described in RFC 5681, Section 3.1.
func (r *renoState) updateCongestionAvoidance(packetsAcked int) {
	// Consume the packets in congestion avoidance mode.
	r.s.SndCAAckCount += packetsAcked
	if r.s.SndCAAckCount >= r.s.SndCwnd {
		r.s.SndCwnd += r.s.SndCAAckCount / r.s.SndCwnd
		r.s.SndCAAckCount = r.s.SndCAAckCount % r.s.SndCwnd
	}
}

// reduceSlowStartThreshold reduces the slow-start threshold per RFC 5681,
// page 6, eq. 4. It is called when we detect congestion in the network.
func (r *renoState) reduceSlowStartThreshold() {
	r.s.Ssthresh = r.s.Outstanding / 2
	if r.s.Ssthresh < 2 {
		r.s.Ssthresh = 2
	}
}

// Update implements congestionControl.Update. It updates the congestion
// state based on the number of packets that were acknowledged.
func (r *renoState) Update(packetsAcked int) {
	if r.s.SndCwnd < r.s.Ssthresh {
		packetsAcked = r.updateSlowStart(packetsAcked)
		if packetsAcked == 0 {
			return
		}
	}
	r.updateCongestionAvoidance(packetsAcked)
}

// HandleLossDetected implements congestionControl.HandleLossDetected.
func (r *renoState) HandleLossDetected() {
	// A retransmit was triggered due to nDupAckThreshold or when RACK
	// detected loss. Reduce our slow start threshold.
	r.reduceSlowStartThreshold()
}

// HandleRTOExpired implements congestionControl.HandleRTOExpired.
func (r *renoState) HandleRTOExpired() {
	// We lost a packet, so reduce ssthresh.
	r.reduceSlowStartThreshold()

	// Reduce the congestion window to 1, i.e., enter slow-start. Per
	// RFC 5681, page 7, we must use 1 regardless of the value of the
	// initial congestion window.
	r.s.SndCwnd = 1
}

// PostRecovery implements congestionControl.PostRecovery.
func (r *renoState) PostRecovery() { // noop. }
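// Illustrative sketch, not part of the gVisor sources: a self-contained
// simulation of the NewReno arithmetic implemented above. It replays the
// updateSlowStart and updateCongestionAvoidance logic with plain ints to
// show the two growth regimes: roughly exponential growth below ssthresh,
// then one extra segment per window's worth of ACKs afterwards. All variable
// names here are local to the example.
package main

import "fmt"

func main() {
	cwnd, ssthresh, caAckCount := 1, 8, 0

	// Each round pretends a full window of data was acknowledged.
	for round := 1; round <= 10; round++ {
		acked := cwnd

		// Slow start: grow cwnd by the number of packets acked, but do not
		// cross ssthresh (mirrors updateSlowStart).
		if cwnd < ssthresh {
			newcwnd := cwnd + acked
			if newcwnd >= ssthresh {
				newcwnd = ssthresh
				caAckCount = 0
			}
			acked -= newcwnd - cwnd
			cwnd = newcwnd
		}

		// Congestion avoidance: one extra segment per cwnd's worth of acks
		// (mirrors updateCongestionAvoidance, including its order of
		// operations when taking the remainder).
		if acked > 0 {
			caAckCount += acked
			if caAckCount >= cwnd {
				cwnd += caAckCount / cwnd
				caAckCount %= cwnd
			}
		}

		fmt.Printf("round %d: cwnd=%d\n", round, cwnd)
	}
}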
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package host

import (
	"fmt"
	"sync/atomic"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fdnotifier"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/socket/control"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/unet"
	"gvisor.dev/gvisor/pkg/waiter"
)

// newEndpoint creates a new host-backed endpoint from the given fd and its
// corresponding notification queue.
func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transport.Endpoint, error) {
	// Set up an external transport.Endpoint using the host fd.
	addr := fmt.Sprintf("hostfd:[%d]", hostFD)
	e, err := NewConnectedEndpoint(hostFD, addr)
	if err != nil {
		return nil, err.ToError()
	}
	ep := transport.NewExternal(ctx, e.stype, uniqueid.GlobalProviderFromContext(ctx), queue, e, e)
	return ep, nil
}

// ConnectedEndpoint is an implementation of transport.ConnectedEndpoint and
// transport.Receiver. It is backed by a host fd that was imported at sentry
// startup. This fd is shared with a hostfs inode, which retains ownership of
// it.
//
// ConnectedEndpoint is saveable, since we expect that the host will provide
// the same fd upon restore.
//
// As of this writing, we only allow Unix sockets to be imported.
//
// +stateify savable
type ConnectedEndpoint struct {
	ConnectedEndpointRefs

	// mu protects fd below.
	mu sync.RWMutex `state:"nosave"`

	// fd is the host fd backing this endpoint.
	fd int

	// addr is the address at which this endpoint is bound.
	addr string

	// sndbuf is the size of the send buffer.
	//
	// N.B. When this is smaller than the host size, we present it via
	// GetSockOpt and message splitting/rejection in SendMsg, but do not
	// prevent lots of small messages from filling the real send buffer
	// size on the host.
	sndbuf int64 `state:"nosave"`

	// stype is the type of Unix socket.
	stype linux.SockType
}

// init performs initialization required for creating new ConnectedEndpoints
// and for restoring them.
func (c *ConnectedEndpoint) init() *syserr.Error {
	c.InitRefs()
	return c.initFromOptions()
}

func (c *ConnectedEndpoint) initFromOptions() *syserr.Error {
	family, err := unix.GetsockoptInt(c.fd, unix.SOL_SOCKET, unix.SO_DOMAIN)
	if err != nil {
		return syserr.FromError(err)
	}

	if family != unix.AF_UNIX {
		// We only allow Unix sockets.
		return syserr.ErrInvalidEndpointState
	}

	stype, err := unix.GetsockoptInt(c.fd, unix.SOL_SOCKET, unix.SO_TYPE)
	if err != nil {
		return syserr.FromError(err)
	}

	if err := unix.SetNonblock(c.fd, true); err != nil {
		return syserr.FromError(err)
	}

	sndbuf, err := unix.GetsockoptInt(c.fd, unix.SOL_SOCKET, unix.SO_SNDBUF)
	if err != nil {
		return syserr.FromError(err)
	}

	c.stype = linux.SockType(stype)
	atomic.StoreInt64(&c.sndbuf, int64(sndbuf))

	return nil
}

// NewConnectedEndpoint creates a new ConnectedEndpoint backed by a host fd
// imported at sentry startup.
//
// The caller is responsible for calling Init(). Additionally, Release needs to
// be called twice because ConnectedEndpoint is both a transport.Receiver and
// transport.ConnectedEndpoint.
func NewConnectedEndpoint(hostFD int, addr string) (*ConnectedEndpoint, *syserr.Error) {
	e := ConnectedEndpoint{
		fd:   hostFD,
		addr: addr,
	}

	if err := e.init(); err != nil {
		return nil, err
	}

	// ConnectedEndpointRefs start off with a single reference. We need two.
	e.IncRef()
	return &e, nil
}

// Send implements transport.ConnectedEndpoint.Send.
func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	if !controlMessages.Empty() {
		return 0, false, syserr.ErrInvalidEndpointState
	}

	// Since stream sockets don't preserve message boundaries, we can write
	// only as much of the message as fits in the send buffer.
	truncate := c.stype == linux.SOCK_STREAM

	n, totalLen, err := fdWriteVec(c.fd, data, c.SendMaxQueueSize(), truncate)
	if n < totalLen && err == nil {
		// The host only returns a short write if it would otherwise
		// block (and only for stream sockets).
		err = linuxerr.EAGAIN
	}
	if n > 0 && !linuxerr.Equals(linuxerr.EAGAIN, err) {
		// The caller may need to block to send more data, but
		// otherwise there isn't anything that can be done about an
		// error with a partial write.
		err = nil
	}

	// There is no need for the callee to call SendNotify because fdWriteVec
	// uses the host's sendmsg(2) and the host kernel's queue.
	return n, false, syserr.FromError(err)
}

// SendNotify implements transport.ConnectedEndpoint.SendNotify.
func (c *ConnectedEndpoint) SendNotify() {}

// CloseSend implements transport.ConnectedEndpoint.CloseSend.
func (c *ConnectedEndpoint) CloseSend() {
	c.mu.Lock()
	defer c.mu.Unlock()

	if err := unix.Shutdown(c.fd, unix.SHUT_WR); err != nil {
		// A well-formed UDS shutdown can't fail. See
		// net/unix/af_unix.c:unix_shutdown.
		panic(fmt.Sprintf("failed write shutdown on host socket %+v: %v", c, err))
	}
}

// CloseNotify implements transport.ConnectedEndpoint.CloseNotify.
func (c *ConnectedEndpoint) CloseNotify() {}

// Writable implements transport.ConnectedEndpoint.Writable.
func (c *ConnectedEndpoint) Writable() bool {
	c.mu.RLock()
	defer c.mu.RUnlock()

	return fdnotifier.NonBlockingPoll(int32(c.fd), waiter.WritableEvents)&waiter.WritableEvents != 0
}

// Passcred implements transport.ConnectedEndpoint.Passcred.
func (c *ConnectedEndpoint) Passcred() bool {
	// We don't support credential passing for host sockets.
	return false
}

// GetLocalAddress implements transport.ConnectedEndpoint.GetLocalAddress.
func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
	return tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, nil
}

// EventUpdate implements transport.ConnectedEndpoint.EventUpdate.
func (c *ConnectedEndpoint) EventUpdate() {
	c.mu.RLock()
	defer c.mu.RUnlock()

	if c.fd != -1 {
		fdnotifier.UpdateFD(int32(c.fd))
	}
}

// Recv implements transport.Receiver.Recv.
func (c *ConnectedEndpoint) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	var cm unet.ControlMessage
	if numRights > 0 {
		cm.EnableFDs(int(numRights))
	}

	// N.B. Unix sockets don't have a receive buffer; the send buffer
	// serves both purposes.
	rl, ml, cl, cTrunc, err := fdReadVec(c.fd, data, []byte(cm), peek, c.RecvMaxQueueSize())
	if rl > 0 && err != nil {
		// We got some data, so all we need to do on error is return
		// the data that we got. Short reads are fine, no need to
		// block.
		err = nil
	}
	if err != nil {
		return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err)
	}

	// There is no need for the callee to call RecvNotify because fdReadVec
	// uses the host's recvmsg(2) and the host kernel's queue.

	// Trim the control data if we received less than the full amount.
	if cl < uint64(len(cm)) {
		cm = cm[:cl]
	}

	// Avoid extra allocations in the case where there isn't any control data.
	if len(cm) == 0 {
		return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil
	}

	fds, err := cm.ExtractFDs()
	if err != nil {
		return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err)
	}

	if len(fds) == 0 {
		return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil
	}
	return rl, ml, control.NewVFS2(nil, nil, newSCMRights(fds)), cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil
}

// RecvNotify implements transport.Receiver.RecvNotify.
func (c *ConnectedEndpoint) RecvNotify() {}

// CloseRecv implements transport.Receiver.CloseRecv.
func (c *ConnectedEndpoint) CloseRecv() {
	c.mu.Lock()
	defer c.mu.Unlock()

	if err := unix.Shutdown(c.fd, unix.SHUT_RD); err != nil {
		// A well-formed UDS shutdown can't fail. See
		// net/unix/af_unix.c:unix_shutdown.
		panic(fmt.Sprintf("failed read shutdown on host socket %+v: %v", c, err))
	}
}

// Readable implements transport.Receiver.Readable.
func (c *ConnectedEndpoint) Readable() bool {
	c.mu.RLock()
	defer c.mu.RUnlock()

	return fdnotifier.NonBlockingPoll(int32(c.fd), waiter.ReadableEvents)&waiter.ReadableEvents != 0
}

// SendQueuedSize implements transport.Receiver.SendQueuedSize.
func (c *ConnectedEndpoint) SendQueuedSize() int64 {
	// TODO(gvisor.dev/issue/273): SendQueuedSize isn't supported for host
	// sockets because we don't allow the sentry to call ioctl(2).
	return -1
}

// RecvQueuedSize implements transport.Receiver.RecvQueuedSize.
func (c *ConnectedEndpoint) RecvQueuedSize() int64 {
	// TODO(gvisor.dev/issue/273): RecvQueuedSize isn't supported for host
	// sockets because we don't allow the sentry to call ioctl(2).
	return -1
}

// SendMaxQueueSize implements transport.Receiver.SendMaxQueueSize.
func (c *ConnectedEndpoint) SendMaxQueueSize() int64 {
	return atomic.LoadInt64(&c.sndbuf)
}

// RecvMaxQueueSize implements transport.Receiver.RecvMaxQueueSize.
func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 {
	// N.B. Unix sockets don't use the receive buffer. We'll claim it is
	// the same size as the send buffer.
	return atomic.LoadInt64(&c.sndbuf)
}

func (c *ConnectedEndpoint) destroyLocked() {
	c.fd = -1
}

// Release implements transport.ConnectedEndpoint.Release and
// transport.Receiver.Release.
func (c *ConnectedEndpoint) Release(ctx context.Context) {
	c.DecRef(func() {
		c.mu.Lock()
		c.destroyLocked()
		c.mu.Unlock()
	})
}

// CloseUnread implements transport.ConnectedEndpoint.CloseUnread.
func (c *ConnectedEndpoint) CloseUnread() {}

// SetSendBufferSize implements transport.ConnectedEndpoint.SetSendBufferSize.
func (c *ConnectedEndpoint) SetSendBufferSize(v int64) (newSz int64) {
	// gVisor does not permit setting of SO_SNDBUF for host backed unix
	// domain sockets.
	return atomic.LoadInt64(&c.sndbuf)
}

// SetReceiveBufferSize implements transport.ConnectedEndpoint.SetReceiveBufferSize.
func (c *ConnectedEndpoint) SetReceiveBufferSize(v int64) (newSz int64) {
	// gVisor does not permit setting of SO_RCVBUF for host backed unix
	// domain sockets. The receive buffer has no effect for unix sockets,
	// so we claim it is the same size as the send buffer.
	return atomic.LoadInt64(&c.sndbuf)
}

// SCMConnectedEndpoint represents an endpoint backed by a host fd that was
// passed through a gofer Unix socket. It resembles ConnectedEndpoint, with the
// following differences:
//   - SCMConnectedEndpoint is not saveable, because the host cannot guarantee
//     the same descriptor number across S/R.
//   - SCMConnectedEndpoint holds ownership of its fd and notification queue.
type SCMConnectedEndpoint struct {
	ConnectedEndpoint

	queue *waiter.Queue
}

// Init will do the initialization required without holding other locks.
func (e *SCMConnectedEndpoint) Init() error {
	return fdnotifier.AddFD(int32(e.fd), e.queue)
}

// Release implements transport.ConnectedEndpoint.Release and
// transport.Receiver.Release.
func (e *SCMConnectedEndpoint) Release(ctx context.Context) {
	e.DecRef(func() {
		e.mu.Lock()
		fdnotifier.RemoveFD(int32(e.fd))
		if err := unix.Close(e.fd); err != nil {
			log.Warningf("Failed to close host fd %d: %v", e.fd, err)
		}
		e.destroyLocked()
		e.mu.Unlock()
	})
}

// NewSCMEndpoint creates a new SCMConnectedEndpoint backed by a host fd that
// was passed through a Unix socket.
//
// The caller is responsible for calling Init(). Additionally, Release needs to
// be called twice because ConnectedEndpoint is both a transport.Receiver and
// transport.ConnectedEndpoint.
func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr string) (*SCMConnectedEndpoint, *syserr.Error) {
	e := SCMConnectedEndpoint{
		ConnectedEndpoint: ConnectedEndpoint{
			fd:   hostFD,
			addr: addr,
		},
		queue: queue,
	}

	if err := e.init(); err != nil {
		return nil, err
	}

	// e starts off with a single reference. We need two.
	e.IncRef()
	return &e, nil
}
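The double-Release contract above is easy to get wrong, so here is a minimal illustrative sketch of honoring it. The socketpair setup is a stand-in for a real imported fd, and the import path for this package is an assumption (not stated in the original text):

package main

import (
	"fmt"

	"golang.org/x/sys/unix"

	"gvisor.dev/gvisor/pkg/context"
	// Assumed import path for the package defining ConnectedEndpoint.
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
)

func main() {
	// Create a host AF_UNIX socketpair; one end will be wrapped as a
	// ConnectedEndpoint (a hypothetical stand-in for an imported fd).
	fds, err := unix.Socketpair(unix.AF_UNIX, unix.SOCK_STREAM, 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fds[1])

	ep, serr := host.NewConnectedEndpoint(fds[0], fmt.Sprintf("hostfd:[%d]", fds[0]))
	if serr != nil {
		panic(serr)
	}

	ctx := context.Background()
	// The endpoint serves as both transport.Receiver and
	// transport.ConnectedEndpoint, so it starts with two references
	// and Release must be called once for each role.
	ep.Release(ctx)
	ep.Release(ctx)
}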
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs2

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

const allFlags = linux.IN_NONBLOCK | linux.IN_CLOEXEC

// InotifyInit1 implements the inotify_init1() syscall.
func InotifyInit1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	flags := args[0].Int()
	if flags&^allFlags != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	ino, err := vfs.NewInotifyFD(t, t.Kernel().VFS(), uint32(flags))
	if err != nil {
		return 0, nil, err
	}
	defer ino.DecRef(t)

	fd, err := t.NewFDFromVFS2(0, ino, kernel.FDFlags{
		CloseOnExec: flags&linux.IN_CLOEXEC != 0,
	})
	if err != nil {
		return 0, nil, err
	}

	return uintptr(fd), nil, nil
}

// InotifyInit implements the inotify_init() syscall.
func InotifyInit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	args[0].Value = 0
	return InotifyInit1(t, args)
}

// fdToInotify resolves an fd to an inotify object. If successful, the file
// will have an extra ref and the caller is responsible for releasing the ref.
func fdToInotify(t *kernel.Task, fd int32) (*vfs.Inotify, *vfs.FileDescription, error) {
	f := t.GetFileVFS2(fd)
	if f == nil {
		// Invalid fd.
		return nil, nil, linuxerr.EBADF
	}

	ino, ok := f.Impl().(*vfs.Inotify)
	if !ok {
		// Not an inotify fd.
		f.DecRef(t)
		return nil, nil, linuxerr.EINVAL
	}

	return ino, f, nil
}

// InotifyAddWatch implements the inotify_add_watch() syscall.
func InotifyAddWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	addr := args[1].Pointer()
	mask := args[2].Uint()

	// "EINVAL: The given event mask contains no valid events."
	// -- inotify_add_watch(2)
	if mask&linux.ALL_INOTIFY_BITS == 0 {
		return 0, nil, linuxerr.EINVAL
	}

	// "IN_DONT_FOLLOW: Don't dereference pathname if it is a symbolic link."
	// -- inotify(7)
	//
	// By default the final symlink is followed; IN_DONT_FOLLOW disables that.
	follow := followFinalSymlink
	if mask&linux.IN_DONT_FOLLOW != 0 {
		follow = nofollowFinalSymlink
	}

	ino, f, err := fdToInotify(t, fd)
	if err != nil {
		return 0, nil, err
	}
	defer f.DecRef(t)

	path, err := copyInPath(t, addr)
	if err != nil {
		return 0, nil, err
	}
	if mask&linux.IN_ONLYDIR != 0 {
		path.Dir = true
	}
	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, follow)
	if err != nil {
		return 0, nil, err
	}
	defer tpop.Release(t)
	d, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{})
	if err != nil {
		return 0, nil, err
	}
	defer d.DecRef(t)

	fd, err = ino.AddWatch(d.Dentry(), mask)
	if err != nil {
		return 0, nil, err
	}
	return uintptr(fd), nil, nil
}

// InotifyRmWatch implements the inotify_rm_watch() syscall.
func InotifyRmWatch(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	wd := args[1].Int()

	ino, f, err := fdToInotify(t, fd)
	if err != nil {
		return 0, nil, err
	}
	defer f.DecRef(t)
	return 0, nil, ino.RmWatch(t, wd)
}
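For context, a short userspace sketch of the syscall sequence these wrappers implement, using golang.org/x/sys/unix (the "/tmp" path is an arbitrary example):

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// inotify_init1(2): handled above by InotifyInit1.
	fd, err := unix.InotifyInit1(unix.IN_CLOEXEC | unix.IN_NONBLOCK)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fd)

	// inotify_add_watch(2): IN_ONLYDIR forces directory resolution, and
	// IN_DONT_FOLLOW would suppress final-symlink dereferencing,
	// mirroring the mask handling in InotifyAddWatch above.
	wd, err := unix.InotifyAddWatch(fd, "/tmp", unix.IN_CREATE|unix.IN_ONLYDIR)
	if err != nil {
		panic(err)
	}
	fmt.Println("watch descriptor:", wd)

	// inotify_rm_watch(2): handled above by InotifyRmWatch.
	if _, err := unix.InotifyRmWatch(fd, uint32(wd)); err != nil {
		panic(err)
	}
}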
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcpip

import (
	"time"

	"gvisor.dev/gvisor/pkg/sync"
)

// jobInstance is a specific instance of Job.
//
// Different instances are created each time Job is scheduled so each timer has
// its own earlyReturn signal. This is to address a bug when a Job is stopped
// and reset in quick succession resulting in a timer instance's earlyReturn
// signal being affected or seen by another timer instance.
//
// Consider the following scenario where timer instances share a common
// earlyReturn signal (T1 creates, stops and resets a Cancellable timer under a
// lock L; T2, T3, T4 and T5 are goroutines that handle the first (A), second
// (B), third (C), and fourth (D) instance of the timer firing, respectively):
//	T1: Obtain L
//	T1: Create a new Job w/ lock L (create instance A)
//	T2: instance A fires, blocked trying to obtain L.
//	T1: Attempt to stop instance A (set earlyReturn = true)
//	T1: Schedule timer (create instance B)
//	T3: instance B fires, blocked trying to obtain L.
//	T1: Attempt to stop instance B (set earlyReturn = true)
//	T1: Schedule timer (create instance C)
//	T4: instance C fires, blocked trying to obtain L.
//	T1: Attempt to stop instance C (set earlyReturn = true)
//	T1: Schedule timer (create instance D)
//	T5: instance D fires, blocked trying to obtain L.
//	T1: Release L
//
// Now that T1 has released L, any of the 4 timer instances can take L and
// check earlyReturn. If the timers simply check earlyReturn and then do
// nothing further, then instance D will never early return even though it was
// not requested to stop. If the timers reset earlyReturn before early
// returning, then all but one of the timers will do work when only one was
// expected to. If Job resets earlyReturn when resetting, then all the timers
// will fire (again, when only one was expected to).
//
// To address the above concerns the simplest solution was to give each timer
// its own earlyReturn signal.
type jobInstance struct {
	timer Timer

	// Used to inform the timer to early return when it gets stopped while the
	// lock the timer tries to obtain when fired is held (T1 is a goroutine that
	// tries to cancel the timer and T2 is the goroutine that handles the timer
	// firing):
	//	T1: Obtain the lock, then call Cancel()
	//	T2: timer fires, and gets blocked on obtaining the lock
	//	T1: Releases lock
	//	T2: Obtains lock, does unintended work
	//
	// To resolve this, T1 will check to see if the timer already fired, and
	// inform the timer using earlyReturn to return early so that once T2 obtains
	// the lock, it will see that it is set to true and do nothing further.
	earlyReturn *bool
}

// stop stops the job instance j from firing if it hasn't fired already. If it
// has fired and is blocked at obtaining the lock, earlyReturn will be set to
// true so that it will early return when it obtains the lock.
func (j *jobInstance) stop() {
	if j.timer != nil {
		j.timer.Stop()
		*j.earlyReturn = true
	}
}

// Job represents some work that can be scheduled for execution. The work can
// be safely cancelled when it fires at the same time some "related work" is
// being done.
//
// The term "related work" is defined as some work that needs to be done while
// holding some lock that the timer must also hold while doing some work.
//
// Note, it is not safe to copy a Job as its timer instance creates
// a closure over the address of the Job.
type Job struct {
	_ sync.NoCopy

	// The clock used to schedule the backing timer.
	clock Clock

	// The active instance of a cancellable timer.
	instance jobInstance

	// locker is the lock taken by the timer immediately after it fires and must
	// be held when attempting to stop the timer.
	//
	// Must never change after being assigned.
	locker sync.Locker

	// fn is the function that will be called when a timer fires and has not been
	// signaled to early return.
	//
	// fn MUST NOT attempt to lock locker.
	//
	// Must never change after being assigned.
	fn func()
}

// Cancel prevents the Job from executing if it has not executed already.
//
// Cancel requires appropriate locking to be in place for any resources managed
// by the Job. If the Job is blocked on obtaining the lock when Cancel is
// called, it will early return.
//
// Note, j will be modified.
//
// j.locker MUST be locked.
func (j *Job) Cancel() {
	j.instance.stop()

	// Nothing to do with the stopped instance anymore.
	j.instance = jobInstance{}
}

// Schedule schedules the Job for execution after duration d. This can be
// called on cancelled or completed Jobs to schedule them again.
//
// Schedule should be invoked only on unscheduled, cancelled, or completed
// Jobs. To be safe, callers should always call Cancel before calling Schedule.
//
// Note, j will be modified.
func (j *Job) Schedule(d time.Duration) {
	// Create a new instance.
	earlyReturn := false

	// Capture the locker so that updating the timer does not cause a data race
	// when a timer fires and tries to obtain the lock (read the timer's locker).
	locker := j.locker
	j.instance = jobInstance{
		timer: j.clock.AfterFunc(d, func() {
			locker.Lock()
			defer locker.Unlock()

			if earlyReturn {
				// If we reach this point, it means that the timer fired while
				// another goroutine called Cancel while it had the lock. Simply
				// return here and do nothing further.
				earlyReturn = false
				return
			}

			j.fn()
		}),
		earlyReturn: &earlyReturn,
	}
}

// NewJob returns a new Job that can be used to schedule f to run in its own
// goroutine. l will be locked before calling f then unlocked after f returns.
//
//	var clock tcpip.StdClock
//	var mu sync.Mutex
//	message := "foo"
//	job := tcpip.NewJob(&clock, &mu, func() {
//		fmt.Println(message)
//	})
//	job.Schedule(time.Second)
//
//	mu.Lock()
//	message = "bar"
//	mu.Unlock()
//
//	// Output: bar
//
// f MUST NOT attempt to lock l.
//
// l MUST be locked prior to calling the returned job's Cancel().
//
//	var clock tcpip.StdClock
//	var mu sync.Mutex
//	message := "foo"
//	job := tcpip.NewJob(&clock, &mu, func() {
//		fmt.Println(message)
//	})
//	job.Schedule(time.Second)
//
//	mu.Lock()
//	job.Cancel()
//	mu.Unlock()
func NewJob(c Clock, l sync.Locker, f func()) *Job {
	return &Job{
		clock:  c,
		locker: l,
		fn:     f,
	}
}
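As a self-contained variant of the doc example above (same API, just wrapped in a runnable main):

package main

import (
	"fmt"
	"time"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
)

func main() {
	var clock tcpip.StdClock
	var mu sync.Mutex
	message := "foo"

	job := tcpip.NewJob(&clock, &mu, func() {
		fmt.Println(message)
	})
	job.Schedule(time.Second)

	// Mutating shared state under mu is safe: if the job has already
	// fired, its goroutine blocks on mu until we release it, so it
	// observes the updated value.
	mu.Lock()
	message = "bar"
	mu.Unlock()

	time.Sleep(2 * time.Second) // Let the job run; prints "bar".
}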
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package stack

import (
	"math/rand"

	"gvisor.dev/gvisor/pkg/sync"
)

// lockedRandomSource provides a threadsafe rand.Source.
type lockedRandomSource struct {
	mu  sync.Mutex
	src rand.Source
}

func (r *lockedRandomSource) Int63() (n int64) {
	r.mu.Lock()
	n = r.src.Int63()
	r.mu.Unlock()
	return n
}

func (r *lockedRandomSource) Seed(seed int64) {
	r.mu.Lock()
	r.src.Seed(seed)
	r.mu.Unlock()
}
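Since lockedRandomSource is unexported, here is a standalone sketch of the same pattern using only the standard library, showing why the lock matters when one source backs multiple goroutines:

package main

import (
	"fmt"
	"math/rand"
	"sync"
)

// lockedSource mirrors lockedRandomSource above: a rand.Source whose
// Int63 and Seed are serialized by a mutex.
type lockedSource struct {
	mu  sync.Mutex
	src rand.Source
}

func (s *lockedSource) Int63() int64 {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.src.Int63()
}

func (s *lockedSource) Seed(seed int64) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.src.Seed(seed)
}

func main() {
	src := &lockedSource{src: rand.NewSource(42)}
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			_ = src.Int63() // Safe concurrently, unlike a bare rand.Source.
		}()
	}
	wg.Wait()
	fmt.Println("done")
}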
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"gvisor.dev/gvisor/pkg/marshal"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
)

// This file contains structures required to support IPv6 netfilter and
// ip6tables. Some constants and structs are equal to their IPv4 analogues, and
// are only distinguished by context (e.g. whether used on an IPv4 or IPv6
// socket).

// Socket options for SOL_SOCKET. These correspond to values in
// include/uapi/linux/netfilter_ipv6/ip6_tables.h.
const (
	IP6T_BASE_CTL            = 64
	IP6T_SO_SET_REPLACE      = IPT_BASE_CTL
	IP6T_SO_SET_ADD_COUNTERS = IPT_BASE_CTL + 1
	IP6T_SO_SET_MAX          = IPT_SO_SET_ADD_COUNTERS

	IP6T_SO_GET_INFO            = IPT_BASE_CTL
	IP6T_SO_GET_ENTRIES         = IPT_BASE_CTL + 1
	IP6T_SO_GET_REVISION_MATCH  = IPT_BASE_CTL + 4
	IP6T_SO_GET_REVISION_TARGET = IPT_BASE_CTL + 5
	IP6T_SO_GET_MAX             = IP6T_SO_GET_REVISION_TARGET
)

// IP6T_ORIGINAL_DST is the ip6tables SOL_IPV6 socket option. Corresponds to
// the value in include/uapi/linux/netfilter_ipv6/ip6_tables.h.
const IP6T_ORIGINAL_DST = 80

// IP6TReplace is the argument for the IP6T_SO_SET_REPLACE sockopt. It
// corresponds to struct ip6t_replace in
// include/uapi/linux/netfilter_ipv6/ip6_tables.h.
//
// +marshal
type IP6TReplace struct {
	Name        TableName
	ValidHooks  uint32
	NumEntries  uint32
	Size        uint32
	HookEntry   [NF_INET_NUMHOOKS]uint32
	Underflow   [NF_INET_NUMHOOKS]uint32
	NumCounters uint32
	Counters    uint64 // This is really a *XTCounters.
	// Entries is omitted here because it would cause IP6TReplace to be an
	// extra byte longer (see http://www.catb.org/esr/structure-packing/).
	// Entries [0]IP6TEntry
}

// SizeOfIP6TReplace is the size of an IP6TReplace.
const SizeOfIP6TReplace = 96

// KernelIP6TGetEntries is identical to IP6TGetEntries, but includes the
// Entrytable field.
//
// +marshal dynamic
type KernelIP6TGetEntries struct {
	IPTGetEntries
	Entrytable []KernelIP6TEntry
}

// SizeBytes implements marshal.Marshallable.SizeBytes.
func (ke *KernelIP6TGetEntries) SizeBytes() int {
	res := ke.IPTGetEntries.SizeBytes()
	for _, entry := range ke.Entrytable {
		res += entry.SizeBytes()
	}
	return res
}

// MarshalBytes implements marshal.Marshallable.MarshalBytes.
func (ke *KernelIP6TGetEntries) MarshalBytes(dst []byte) {
	ke.IPTGetEntries.MarshalUnsafe(dst)
	marshalledUntil := ke.IPTGetEntries.SizeBytes()
	for i := range ke.Entrytable {
		ke.Entrytable[i].MarshalBytes(dst[marshalledUntil:])
		marshalledUntil += ke.Entrytable[i].SizeBytes()
	}
}

// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
func (ke *KernelIP6TGetEntries) UnmarshalBytes(src []byte) {
	ke.IPTGetEntries.UnmarshalUnsafe(src)
	unmarshalledUntil := ke.IPTGetEntries.SizeBytes()
	for i := range ke.Entrytable {
		ke.Entrytable[i].UnmarshalBytes(src[unmarshalledUntil:])
		unmarshalledUntil += ke.Entrytable[i].SizeBytes()
	}
}

var _ marshal.Marshallable = (*KernelIP6TGetEntries)(nil)

// IP6TEntry is an ip6tables rule. It corresponds to struct ip6t_entry in
// include/uapi/linux/netfilter_ipv6/ip6_tables.h.
//
// +marshal
type IP6TEntry struct {
	// IPv6 is used to filter packets based on the IPv6 header.
	IPv6 IP6TIP

	// NFCache relates to kernel-internal caching and isn't used by
	// userspace.
	NFCache uint32

	// TargetOffset is the byte offset from the beginning of this IP6TEntry
	// to the start of the entry's target.
	TargetOffset uint16

	// NextOffset is the byte offset from the beginning of this IP6TEntry to
	// the start of the next entry. It is thus also the size of the entry.
	NextOffset uint16

	// Comeback is a return pointer. It is not used by userspace.
	Comeback uint32

	_ [4]byte

	// Counters holds the packet and byte counts for this rule.
	Counters XTCounters

	// Elems holds the data for all this rule's matches followed by the
	// target. It is variable length -- users have to iterate over any
	// matches and use TargetOffset and NextOffset to make sense of the
	// data.
	//
	// Elems is omitted here because it would cause IP6TEntry to be an extra
	// byte larger (see http://www.catb.org/esr/structure-packing/).
	//
	// Elems [0]byte
}

// SizeOfIP6TEntry is the size of an IP6TEntry.
const SizeOfIP6TEntry = 168

// KernelIP6TEntry is identical to IP6TEntry, but includes the Elems field.
//
// +marshal dynamic
type KernelIP6TEntry struct {
	Entry IP6TEntry

	// Elems holds the data for all this rule's matches followed by the
	// target. It is variable length -- users have to iterate over any
	// matches and use TargetOffset and NextOffset to make sense of the
	// data.
	Elems primitive.ByteSlice
}

// SizeBytes implements marshal.Marshallable.SizeBytes.
func (ke *KernelIP6TEntry) SizeBytes() int {
	return ke.Entry.SizeBytes() + ke.Elems.SizeBytes()
}

// MarshalBytes implements marshal.Marshallable.MarshalBytes.
func (ke *KernelIP6TEntry) MarshalBytes(dst []byte) {
	ke.Entry.MarshalUnsafe(dst)
	ke.Elems.MarshalBytes(dst[ke.Entry.SizeBytes():])
}

// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
func (ke *KernelIP6TEntry) UnmarshalBytes(src []byte) {
	ke.Entry.UnmarshalUnsafe(src)
	ke.Elems.UnmarshalBytes(src[ke.Entry.SizeBytes():])
}

// IP6TIP contains information for matching a packet's IP header.
// It corresponds to struct ip6t_ip6 in
// include/uapi/linux/netfilter_ipv6/ip6_tables.h.
//
// +marshal
type IP6TIP struct {
	// Src is the source IP address.
	Src Inet6Addr

	// Dst is the destination IP address.
	Dst Inet6Addr

	// SrcMask is the source IP mask.
	SrcMask Inet6Addr

	// DstMask is the destination IP mask.
	DstMask Inet6Addr

	// InputInterface is the input network interface.
	InputInterface [IFNAMSIZ]byte

	// OutputInterface is the output network interface.
	OutputInterface [IFNAMSIZ]byte

	// InputInterfaceMask is the input interface mask.
	InputInterfaceMask [IFNAMSIZ]byte

	// OutputInterfaceMask is the output interface mask.
	OutputInterfaceMask [IFNAMSIZ]byte

	// Protocol is the transport protocol.
	Protocol uint16

	// TOS matches TOS flags when Flags indicates filtering by TOS.
	TOS uint8

	// Flags define matching behavior for the IP header.
	Flags uint8

	// InverseFlags invert the meaning of fields in struct IPTIP. See the
	// IP6T_INV_* flags.
	InverseFlags uint8

	// Linux defines in6_addr (Inet6Addr for us) as the union of a
	// 16-element byte array and a 4-element 32-bit integer array, so the
	// whole struct is 4-byte aligned.
	_ [3]byte
}

// SizeOfIP6TIP is the size of an IP6TIP.
const SizeOfIP6TIP = 136

// Flags in IP6TIP.Flags. Corresponding constants are in
// include/uapi/linux/netfilter_ipv6/ip6_tables.h.
const (
	// Whether to check the Protocol field.
	IP6T_F_PROTO = 0x01
	// Whether to match the TOS field.
	IP6T_F_TOS = 0x02
	// Indicates that the jump target is an absolute GOTO, not an offset.
	IP6T_F_GOTO = 0x04
	// Enables all flags.
	IP6T_F_MASK = 0x07
)

// Flags in IP6TIP.InverseFlags. Corresponding constants are in
// include/uapi/linux/netfilter_ipv6/ip6_tables.h.
const (
	// Invert the meaning of InputInterface.
	IP6T_INV_VIA_IN = 0x01
	// Invert the meaning of OutputInterface.
	IP6T_INV_VIA_OUT = 0x02
	// Invert the meaning of TOS.
	IP6T_INV_TOS = 0x04
	// Invert the meaning of Src.
	IP6T_INV_SRCIP = 0x08
	// Invert the meaning of Dst.
	IP6T_INV_DSTIP = 0x10
	// Invert the meaning of the IPT_F_FRAG flag.
	IP6T_INV_FRAG = 0x20
	// Enable all flags.
	IP6T_INV_MASK = 0x7F
)

// NFNATRange corresponds to struct nf_nat_range in
// include/uapi/linux/netfilter/nf_nat.h.
//
// +marshal
type NFNATRange struct {
	Flags    uint32
	MinAddr  Inet6Addr
	MaxAddr  Inet6Addr
	MinProto uint16 // Network byte order.
	MaxProto uint16 // Network byte order.
}

// SizeOfNFNATRange is the size of NFNATRange.
const SizeOfNFNATRange = 40
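A hypothetical sketch of the fixed-header-plus-dynamic-tail marshalling pattern used by KernelIP6TEntry above (the 8-byte Elems payload and offsets are made up for illustration):

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
)

func main() {
	// A rule whose variable-length Elems carries 8 bytes of match/target
	// data; TargetOffset/NextOffset are measured from the start of the
	// fixed struct, so the tail begins at SizeOfIP6TEntry.
	entry := linux.KernelIP6TEntry{
		Entry: linux.IP6TEntry{
			TargetOffset: linux.SizeOfIP6TEntry,
			NextOffset:   linux.SizeOfIP6TEntry + 8,
		},
		Elems: primitive.ByteSlice(make([]byte, 8)),
	}

	// SizeBytes covers the fixed header plus the dynamic tail, so one
	// buffer fits the whole serialized rule.
	buf := make([]byte, entry.SizeBytes())
	entry.MarshalBytes(buf)
	fmt.Printf("serialized %d bytes\n", len(buf))
}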
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package kernel

import (
	"fmt"
	"math"
	"strings"
	"sync/atomic"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
)

// FDFlags define flags for an individual descriptor.
//
// +stateify savable
type FDFlags struct {
	// CloseOnExec indicates the descriptor should be closed on exec.
	CloseOnExec bool
}

// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags
// representation.
func (f FDFlags) ToLinuxFileFlags() (mask uint) {
	if f.CloseOnExec {
		mask |= linux.O_CLOEXEC
	}
	return
}

// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags
// representation.
func (f FDFlags) ToLinuxFDFlags() (mask uint) {
	if f.CloseOnExec {
		mask |= linux.FD_CLOEXEC
	}
	return
}

// descriptor holds the details about a file descriptor, namely a pointer to
// the file itself and the descriptor flags.
//
// Note that this is immutable and can only be changed via operations on the
// descriptorTable.
//
// It contains both VFS1 and VFS2 file types, but only one of them can be set.
//
// +stateify savable
type descriptor struct {
	// TODO(gvisor.dev/issue/1624): Remove fs.File.
	file     *fs.File
	fileVFS2 *vfs.FileDescription
	flags    FDFlags
}

// FDTable is used to manage File references and flags.
//
// +stateify savable
type FDTable struct {
	FDTableRefs

	k *Kernel

	// mu protects below.
	mu sync.Mutex `state:"nosave"`

	// next is start position to find fd.
	next int32

	// used contains the number of non-nil entries. It must be accessed
	// atomically. It may be read atomically without holding mu (but not
	// written).
	used int32

	// descriptorTable holds descriptors.
	descriptorTable `state:".(map[int32]descriptor)"`
}

func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
	m := make(map[int32]descriptor)
	f.forEach(context.Background(), func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
		m[fd] = descriptor{
			file:     file,
			fileVFS2: fileVFS2,
			flags:    flags,
		}
	})
	return m
}

func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
	ctx := context.Background()
	f.initNoLeakCheck() // Initialize table.
	f.used = 0
	for fd, d := range m {
		if file, fileVFS2 := f.setAll(ctx, fd, d.file, d.fileVFS2, d.flags); file != nil || fileVFS2 != nil {
			panic("VFS1 or VFS2 files set")
		}

		// Note that we do _not_ need to acquire an extra table reference here.
		// The table reference will already be accounted for in the file, so we
		// drop the reference taken by set above.
		switch {
		case d.file != nil:
			d.file.DecRef(ctx)
		case d.fileVFS2 != nil:
			d.fileVFS2.DecRef(ctx)
		}
	}
}

// drop drops the table reference.
func (f *FDTable) drop(ctx context.Context, file *fs.File) {
	// Release locks.
	file.Dirent.Inode.LockCtx.Posix.UnlockRegion(f, lock.LockRange{0, lock.LockEOF})

	// Send inotify events.
	d := file.Dirent
	var ev uint32
	if fs.IsDir(d.Inode.StableAttr) {
		ev |= linux.IN_ISDIR
	}
	if file.Flags().Write {
		ev |= linux.IN_CLOSE_WRITE
	} else {
		ev |= linux.IN_CLOSE_NOWRITE
	}
	d.InotifyEvent(ev, 0)

	// Drop the table reference.
	file.DecRef(ctx)
}

// dropVFS2 drops the table reference.
func (f *FDTable) dropVFS2(ctx context.Context, file *vfs.FileDescription) {
	// Release any POSIX lock possibly held by the FDTable.
	if file.SupportsLocks() {
		err := file.UnlockPOSIX(ctx, f, lock.LockRange{0, lock.LockEOF})
		if err != nil && !linuxerr.Equals(linuxerr.ENOLCK, err) {
			panic(fmt.Sprintf("UnlockPOSIX failed: %v", err))
		}
	}

	// Drop the table's reference.
	file.DecRef(ctx)
}

// NewFDTable allocates a new FDTable that may be used by tasks in k.
func (k *Kernel) NewFDTable() *FDTable {
	f := &FDTable{k: k}
	f.init()
	return f
}

// DecRef implements RefCounter.DecRef.
//
// If f reaches zero references, all of its file descriptors are removed.
func (f *FDTable) DecRef(ctx context.Context) {
	f.FDTableRefs.DecRef(func() {
		f.RemoveIf(ctx, func(*fs.File, *vfs.FileDescription, FDFlags) bool {
			return true
		})
	})
}

// forEach iterates over all non-nil files in sorted order.
//
// It is the caller's responsibility to acquire an appropriate lock.
func (f *FDTable) forEach(ctx context.Context, fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
	// retries tracks the number of failed TryIncRef attempts for the same FD.
	retries := 0
	fd := int32(0)
	for {
		file, fileVFS2, flags, ok := f.getAll(fd)
		if !ok {
			break
		}
		switch {
		case file != nil:
			if !file.TryIncRef() {
				retries++
				if retries > 1000 {
					panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, FileOps: %+v", fd, file, file.FileOperations))
				}
				continue // Race caught.
			}
			fn(fd, file, nil, flags)
			file.DecRef(ctx)
		case fileVFS2 != nil:
			if !fileVFS2.TryIncRef() {
				retries++
				if retries > 1000 {
					panic(fmt.Sprintf("File in FD table has been destroyed. FD: %d, File: %+v, Impl: %+v", fd, fileVFS2, fileVFS2.Impl()))
				}
				continue // Race caught.
			}
			fn(fd, nil, fileVFS2, flags)
			fileVFS2.DecRef(ctx)
		}
		retries = 0
		fd++
	}
}

// String is a stringer for FDTable.
func (f *FDTable) String() string {
	var buf strings.Builder
	ctx := context.Background()
	f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
		switch {
		case file != nil:
			n, _ := file.Dirent.FullName(nil /* root */)
			fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, n)
		case fileVFS2 != nil:
			vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem()
			vd := fileVFS2.VirtualDentry()
			if vd.Dentry() == nil {
				panic(fmt.Sprintf("fd %d (type %T) has nil dentry: %#v", fd, fileVFS2.Impl(), fileVFS2))
			}
			name, err := vfsObj.PathnameWithDeleted(ctx, vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
			if err != nil {
				fmt.Fprintf(&buf, "<err: %v>\n", err)
				return
			}
			fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, name)
		}
	})
	return buf.String()
}

// NewFDs allocates new FDs guaranteed to be the lowest number available
// greater than or equal to the fd parameter. All files will share the set
// flags. Success is guaranteed to be all or none.
func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags FDFlags) (fds []int32, err error) {
	if fd < 0 {
		// Don't accept negative FDs.
		return nil, unix.EINVAL
	}

	// Default limit.
	end := int32(math.MaxInt32)

	// Ensure we don't get past the provided limit.
	if limitSet := limits.FromContext(ctx); limitSet != nil {
		lim := limitSet.Get(limits.NumberOfFiles)
		if lim.Cur != limits.Infinity {
			end = int32(lim.Cur)
		}
		if fd >= end {
			return nil, unix.EMFILE
		}
	}

	f.mu.Lock()

	// Start searching from f.next for an available fd.
	if fd < f.next {
		fd = f.next
	}

	// Install all entries.
	for i := fd; i < end && len(fds) < len(files); i++ {
		if d, _, _ := f.get(i); d == nil {
			// Set the descriptor.
			f.set(ctx, i, files[len(fds)], flags)
			fds = append(fds, i) // Record the file descriptor.
		}
	}

	// Failure? Unwind existing FDs.
	if len(fds) < len(files) {
		for _, i := range fds {
			f.set(ctx, i, nil, FDFlags{})
		}
		f.mu.Unlock()

		// Drop the reference taken by the call to f.set() that
		// originally installed the file. Don't call f.drop()
		// (generating inotify events, etc.) since the file should
		// appear to have never been inserted into f.
		for _, file := range files[:len(fds)] {
			file.DecRef(ctx)
		}
		return nil, unix.EMFILE
	}

	if fd == f.next {
		// Update next search start position.
		f.next = fds[len(fds)-1] + 1
	}

	f.mu.Unlock()
	return fds, nil
}

// NewFDsVFS2 allocates new FDs guaranteed to be the lowest number available
// greater than or equal to the fd parameter. All files will share the set
// flags. Success is guaranteed to be all or none.
func (f *FDTable) NewFDsVFS2(ctx context.Context, fd int32, files []*vfs.FileDescription, flags FDFlags) (fds []int32, err error) {
	if fd < 0 {
		// Don't accept negative FDs.
		return nil, unix.EINVAL
	}

	// Default limit.
	end := int32(math.MaxInt32)

	// Ensure we don't get past the provided limit.
	if limitSet := limits.FromContext(ctx); limitSet != nil {
		lim := limitSet.Get(limits.NumberOfFiles)
		if lim.Cur != limits.Infinity {
			end = int32(lim.Cur)
		}
		if fd >= end {
			return nil, unix.EMFILE
		}
	}

	f.mu.Lock()

	// Start searching from f.next for an available fd.
	if fd < f.next {
		fd = f.next
	}

	// Install all entries.
	for i := fd; i < end && len(fds) < len(files); i++ {
		if d, _, _ := f.getVFS2(i); d == nil {
			// Set the descriptor.
			f.setVFS2(ctx, i, files[len(fds)], flags)
			fds = append(fds, i) // Record the file descriptor.
		}
	}

	// Failure? Unwind existing FDs.
	if len(fds) < len(files) {
		for _, i := range fds {
			f.setVFS2(ctx, i, nil, FDFlags{})
		}
		f.mu.Unlock()

		// Drop the reference taken by the call to f.setVFS2() that
		// originally installed the file. Don't call f.dropVFS2()
		// (generating inotify events, etc.) since the file should
		// appear to have never been inserted into f.
		for _, file := range files[:len(fds)] {
			file.DecRef(ctx)
		}
		return nil, unix.EMFILE
	}

	if fd == f.next {
		// Update next search start position.
		f.next = fds[len(fds)-1] + 1
	}

	f.mu.Unlock()
	return fds, nil
}

// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for
// the given file description. If it succeeds, it takes a reference on file.
func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
	if minfd < 0 {
		// Don't accept negative FDs.
		return -1, unix.EINVAL
	}

	// Default limit.
	end := int32(math.MaxInt32)

	// Ensure we don't get past the provided limit.
	if limitSet := limits.FromContext(ctx); limitSet != nil {
		lim := limitSet.Get(limits.NumberOfFiles)
		if lim.Cur != limits.Infinity {
			end = int32(lim.Cur)
		}
		if minfd >= end {
			return -1, unix.EMFILE
		}
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	// Start searching from f.next for an available fd.
	fd := minfd
	if fd < f.next {
		fd = f.next
	}
	for fd < end {
		if d, _, _ := f.getVFS2(fd); d == nil {
			f.setVFS2(ctx, fd, file, flags)
			if fd == f.next {
				// Update next search start position.
				f.next = fd + 1
			}
			return fd, nil
		}
		fd++
	}
	return -1, unix.EMFILE
}

// NewFDAt sets the file reference for the given FD. If there is an active
// reference for that FD, the ref count for that existing reference is
// decremented.
func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
	df, _, err := f.newFDAt(ctx, fd, file, nil, flags)
	if err != nil {
		return err
	}
	if df != nil {
		f.drop(ctx, df)
	}
	return nil
}

// NewFDAtVFS2 sets the file reference for the given FD.
// If there is an active reference for that FD, the ref count for that
// existing reference is decremented.
func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error {
	_, dfVFS2, err := f.newFDAt(ctx, fd, nil, file, flags)
	if err != nil {
		return err
	}
	if dfVFS2 != nil {
		f.dropVFS2(ctx, dfVFS2)
	}
	return nil
}

func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription, error) {
	if fd < 0 {
		// Don't accept negative FDs.
		return nil, nil, unix.EBADF
	}

	// Check the limit for the provided file.
	if limitSet := limits.FromContext(ctx); limitSet != nil {
		if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
			return nil, nil, unix.EMFILE
		}
	}

	// Install the entry.
	f.mu.Lock()
	defer f.mu.Unlock()

	df, dfVFS2 := f.setAll(ctx, fd, file, fileVFS2, flags)
	return df, dfVFS2, nil
}

// SetFlags sets the flags for the given file descriptor.
func (f *FDTable) SetFlags(ctx context.Context, fd int32, flags FDFlags) error {
	if fd < 0 {
		// Don't accept negative FDs.
		return unix.EBADF
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	file, _, _ := f.get(fd)
	if file == nil {
		// No file found.
		return unix.EBADF
	}

	// Update the flags.
	f.set(ctx, fd, file, flags)
	return nil
}

// SetFlagsVFS2 sets the flags for the given file descriptor.
func (f *FDTable) SetFlagsVFS2(ctx context.Context, fd int32, flags FDFlags) error {
	if fd < 0 {
		// Don't accept negative FDs.
		return unix.EBADF
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	file, _, _ := f.getVFS2(fd)
	if file == nil {
		// No file found.
		return unix.EBADF
	}

	// Update the flags.
	f.setVFS2(ctx, fd, file, flags)
	return nil
}

// Get returns a reference to the file and the flags for the FD or nil if no
// file is defined for the given fd.
//
// N.B. Callers are required to use DecRef when they are done.
//
//go:nosplit
func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) {
	if fd < 0 {
		return nil, FDFlags{}
	}

	for {
		file, flags, _ := f.get(fd)
		if file != nil {
			if !file.TryIncRef() {
				continue // Race caught.
			}
			// Reference acquired.
			return file, flags
		}
		// No file available.
		return nil, FDFlags{}
	}
}

// GetVFS2 returns a reference to the file and the flags for the FD or nil if
// no file is defined for the given fd.
//
// N.B. Callers are required to use DecRef when they are done.
//
//go:nosplit
func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) {
	if fd < 0 {
		return nil, FDFlags{}
	}

	for {
		file, flags, _ := f.getVFS2(fd)
		if file != nil {
			if !file.TryIncRef() {
				continue // Race caught.
			}
			// Reference acquired.
			return file, flags
		}
		// No file available.
		return nil, FDFlags{}
	}
}

// GetFDs returns a sorted list of valid fds.
//
// Precondition: The caller must be running on the task goroutine, or Task.mu
// must be locked.
func (f *FDTable) GetFDs(ctx context.Context) []int32 {
	fds := make([]int32, 0, int(atomic.LoadInt32(&f.used)))
	f.forEach(ctx, func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
		fds = append(fds, fd)
	})
	return fds
}

// Fork returns an independent FDTable.
func (f *FDTable) Fork(ctx context.Context) *FDTable {
	clone := f.k.NewFDTable()

	f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
		// The set function here will acquire an appropriate table
		// reference for the clone. We don't need anything else.
		if df, dfVFS2 := clone.setAll(ctx, fd, file, fileVFS2, flags); df != nil || dfVFS2 != nil {
			panic("VFS1 or VFS2 files set")
		}
	})
	return clone
}

// Remove removes an FD from f and returns the removed file, if any; a non-nil
// file is returned iff the removal was successful.
//
// N.B. Callers are required to use DecRef when they are done.
func (f *FDTable) Remove(ctx context.Context, fd int32) (*fs.File, *vfs.FileDescription) {
	if fd < 0 {
		return nil, nil
	}

	f.mu.Lock()

	// Update current available position.
	if fd < f.next {
		f.next = fd
	}

	orig, orig2, _, _ := f.getAll(fd)

	// Add reference for caller.
	switch {
	case orig != nil:
		orig.IncRef()
	case orig2 != nil:
		orig2.IncRef()
	}

	if orig != nil || orig2 != nil {
		orig, orig2 = f.setAll(ctx, fd, nil, nil, FDFlags{}) // Zap entry.
	}
	f.mu.Unlock()

	if orig != nil {
		f.drop(ctx, orig)
	}
	if orig2 != nil {
		f.dropVFS2(ctx, orig2)
	}
	return orig, orig2
}

// RemoveIf removes all FDs where cond is true.
func (f *FDTable) RemoveIf(ctx context.Context, cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
	// TODO(gvisor.dev/issue/1624): Remove fs.File slice.
	var files []*fs.File
	var filesVFS2 []*vfs.FileDescription

	f.mu.Lock()
	f.forEach(ctx, func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
		if cond(file, fileVFS2, flags) {
			df, dfVFS2 := f.setAll(ctx, fd, nil, nil, FDFlags{}) // Clear from table.
			if df != nil {
				files = append(files, df)
			}
			if dfVFS2 != nil {
				filesVFS2 = append(filesVFS2, dfVFS2)
			}
			// Update current available position.
			if fd < f.next {
				f.next = fd
			}
		}
	})
	f.mu.Unlock()

	for _, file := range files {
		f.drop(ctx, file)
	}
	for _, file := range filesVFS2 {
		f.dropVFS2(ctx, file)
	}
}
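A standalone, hypothetical model of the lowest-available-fd search performed by NewFDs/NewFDVFS2 above, useful for reasoning about the f.next optimization (the map stands in for descriptorTable and "end" models RLIMIT_NOFILE):

package main

import "fmt"

// allocFD scans upward from max(minfd, next) for the first free slot,
// mirroring the loop in NewFDVFS2.
func allocFD(used map[int32]bool, next, minfd, end int32) (int32, bool) {
	fd := minfd
	if fd < next {
		fd = next
	}
	for ; fd < end; fd++ {
		if !used[fd] {
			used[fd] = true
			return fd, true
		}
	}
	return -1, false
}

func main() {
	used := map[int32]bool{0: true, 1: true, 2: true}
	fd, ok := allocFD(used, 0, 0, 1024)
	fmt.Println(fd, ok) // 3 true: the lowest free fd >= 0.
}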
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"math"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
)

// IntervalTimer represents a POSIX interval timer as described by
// timer_create(2).
//
// +stateify savable
type IntervalTimer struct {
	timer *ktime.Timer

	// If target is not nil, it receives signo from timer expirations. If group
	// is true, these signals are thread-group-directed. These fields are
	// immutable.
	target *Task
	signo  linux.Signal
	id     linux.TimerID
	sigval uint64
	group  bool

	// If sigpending is true, a signal to target is already queued, and timer
	// expirations should increment overrunCur instead of sending another
	// signal. sigpending is protected by target's signal mutex. (If target is
	// nil, the timer will never send signals, so sigpending will be unused.)
	sigpending bool

	// If sigorphan is true, timer's setting has been changed since sigpending
	// last became true, such that overruns should no longer be counted in the
	// pending signal's si_overrun. sigorphan is protected by target's signal
	// mutex.
	sigorphan bool

	// overrunCur is the number of overruns that have occurred since the last
	// time a signal was sent. overrunCur is protected by target's signal
	// mutex.
	overrunCur uint64

	// Consider the last signal sent by this timer that has been dequeued.
	// overrunLast is the number of overruns that occurred between when this
	// signal was sent and when it was dequeued. Equivalently, overrunLast was
	// the value of overrunCur when this signal was dequeued. overrunLast is
	// protected by target's signal mutex.
	overrunLast uint64
}

// DestroyTimer releases its resources.
func (it *IntervalTimer) DestroyTimer() {
	it.timer.Destroy()
	it.timerSettingChanged()
	// A destroyed IntervalTimer is still potentially reachable via a
	// pendingSignal; nil out timer so that it won't be saved.
	it.timer = nil
}

func (it *IntervalTimer) timerSettingChanged() {
	if it.target == nil {
		return
	}
	it.target.tg.pidns.owner.mu.RLock()
	defer it.target.tg.pidns.owner.mu.RUnlock()
	it.target.tg.signalHandlers.mu.Lock()
	defer it.target.tg.signalHandlers.mu.Unlock()
	it.sigorphan = true
	it.overrunCur = 0
	it.overrunLast = 0
}

// PauseTimer pauses the associated Timer.
func (it *IntervalTimer) PauseTimer() {
	it.timer.Pause()
}

// ResumeTimer resumes the associated Timer.
func (it *IntervalTimer) ResumeTimer() {
	it.timer.Resume()
}

// Preconditions: it.target's signal mutex must be locked.
func (it *IntervalTimer) updateDequeuedSignalLocked(si *linux.SignalInfo) {
	it.sigpending = false
	if it.sigorphan {
		return
	}
	it.overrunLast = it.overrunCur
	it.overrunCur = 0
	si.SetOverrun(saturateI32FromU64(it.overrunLast))
}

// Preconditions: it.target's signal mutex must be locked.
func (it *IntervalTimer) signalRejectedLocked() {
	it.sigpending = false
	if it.sigorphan {
		return
	}
	it.overrunCur++
}

// Notify implements ktime.TimerListener.Notify.
func (it *IntervalTimer) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
	if it.target == nil {
		return ktime.Setting{}, false
	}

	it.target.tg.pidns.owner.mu.RLock()
	defer it.target.tg.pidns.owner.mu.RUnlock()
	it.target.tg.signalHandlers.mu.Lock()
	defer it.target.tg.signalHandlers.mu.Unlock()

	if it.sigpending {
		it.overrunCur += exp
		return ktime.Setting{}, false
	}

	// sigpending must be set before sendSignalTimerLocked() so that it can be
	// unset if the signal is discarded (in which case sendSignalTimerLocked()
	// will return nil).
	it.sigpending = true
	it.sigorphan = false
	it.overrunCur += exp - 1
	si := &linux.SignalInfo{
		Signo: int32(it.signo),
		Code:  linux.SI_TIMER,
	}
	si.SetTimerID(it.id)
	si.SetSigval(it.sigval)
	// si_overrun is set when the signal is dequeued.
	if err := it.target.sendSignalTimerLocked(si, it.group, it); err != nil {
		it.signalRejectedLocked()
	}

	return ktime.Setting{}, false
}

// Destroy implements ktime.TimerListener.Destroy. Users of Timer should call
// DestroyTimer instead.
func (it *IntervalTimer) Destroy() {
}

// IntervalTimerCreate implements timer_create(2).
func (t *Task) IntervalTimerCreate(c ktime.Clock, sigev *linux.Sigevent) (linux.TimerID, error) {
	t.tg.timerMu.Lock()
	defer t.tg.timerMu.Unlock()

	// Allocate a timer ID.
	var id linux.TimerID
	end := t.tg.nextTimerID
	for {
		id = t.tg.nextTimerID
		_, ok := t.tg.timers[id]
		t.tg.nextTimerID++
		if t.tg.nextTimerID < 0 {
			t.tg.nextTimerID = 0
		}
		if !ok {
			break
		}
		if t.tg.nextTimerID == end {
			return 0, linuxerr.EAGAIN
		}
	}

	// "The implementation of the default case where evp [sic] is NULL is
	// handled inside glibc, which invokes the underlying system call with a
	// suitably populated sigevent structure." - timer_create(2). This is
	// misleading; the timer_create syscall also handles a NULL sevp as
	// described by the man page
	// (kernel/time/posix-timers.c:sys_timer_create(), do_timer_create()). This
	// must be handled here instead of the syscall wrapper since sigval is the
	// timer ID, which isn't available until we allocate it in this function.
	if sigev == nil {
		sigev = &linux.Sigevent{
			Signo:  int32(linux.SIGALRM),
			Notify: linux.SIGEV_SIGNAL,
			Value:  uint64(id),
		}
	}

	// Construct the timer.
it := &IntervalTimer{ id: id, sigval: sigev.Value, } switch sigev.Notify { case linux.SIGEV_NONE: // leave it.target = nil case linux.SIGEV_SIGNAL, linux.SIGEV_THREAD: // POSIX SIGEV_THREAD semantics are implemented in userspace by libc; // to the kernel, SIGEV_THREAD and SIGEV_SIGNAL are equivalent. (See // Linux's kernel/time/posix-timers.c:good_sigevent().) it.target = t.tg.leader it.group = true case linux.SIGEV_THREAD_ID: t.tg.pidns.owner.mu.RLock() target, ok := t.tg.pidns.tasks[ThreadID(sigev.Tid)] t.tg.pidns.owner.mu.RUnlock() if !ok || target.tg != t.tg { return 0, linuxerr.EINVAL } it.target = target default: return 0, linuxerr.EINVAL } if sigev.Notify != linux.SIGEV_NONE { it.signo = linux.Signal(sigev.Signo) if !it.signo.IsValid() { return 0, linuxerr.EINVAL } } it.timer = ktime.NewTimer(c, it) t.tg.timers[id] = it return id, nil } // IntervalTimerDelete implements timer_delete(2). func (t *Task) IntervalTimerDelete(id linux.TimerID) error { t.tg.timerMu.Lock() defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { return linuxerr.EINVAL } delete(t.tg.timers, id) it.DestroyTimer() return nil } // IntervalTimerSettime implements timer_settime(2). func (t *Task) IntervalTimerSettime(id linux.TimerID, its linux.Itimerspec, abs bool) (linux.Itimerspec, error) { t.tg.timerMu.Lock() defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { return linux.Itimerspec{}, linuxerr.EINVAL } newS, err := ktime.SettingFromItimerspec(its, abs, it.timer.Clock()) if err != nil { return linux.Itimerspec{}, err } tm, oldS := it.timer.SwapAnd(newS, it.timerSettingChanged) its = ktime.ItimerspecFromSetting(tm, oldS) return its, nil } // IntervalTimerGettime implements timer_gettime(2). func (t *Task) IntervalTimerGettime(id linux.TimerID) (linux.Itimerspec, error) { t.tg.timerMu.Lock() defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { return linux.Itimerspec{}, linuxerr.EINVAL } tm, s := it.timer.Get() its := ktime.ItimerspecFromSetting(tm, s) return its, nil } // IntervalTimerGetoverrun implements timer_getoverrun(2). // // Preconditions: The caller must be running on the task goroutine. func (t *Task) IntervalTimerGetoverrun(id linux.TimerID) (int32, error) { t.tg.timerMu.Lock() defer t.tg.timerMu.Unlock() it := t.tg.timers[id] if it == nil { return 0, linuxerr.EINVAL } // By timer_create(2) invariant, either it.target == nil (in which case // it.overrunLast is immutably 0) or t.tg == it.target.tg; and the fact // that t is executing timer_getoverrun(2) means that t.tg can't be // completing execve, so t.tg.signalHandlers can't be changing, allowing us // to lock t.tg.signalHandlers.mu without holding the TaskSet mutex. t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() // This is consistent with Linux after 78c9c4dfbf8c ("posix-timers: // Sanitize overrun handling"). return saturateI32FromU64(it.overrunLast), nil } func saturateI32FromU64(x uint64) int32 { if x > math.MaxInt32 { return math.MaxInt32 } return int32(x) }
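The ID-allocation loop in IntervalTimerCreate scans forward from nextTimerID, wraps negative values back to zero, and reports EAGAIN only once it has revisited its starting point. A minimal standalone sketch of that scan-and-wrap pattern (the allocator type and allocateID helper are hypothetical, with plain int32 IDs standing in for linux.TimerID):

package main

import (
    "errors"
    "fmt"
)

// allocator mirrors the scan-and-wrap allocation in IntervalTimerCreate:
// start at next, skip IDs already in use, wrap negative values back to 0,
// and give up only after revisiting the starting point.
type allocator struct {
    next int32
    used map[int32]bool
}

func (a *allocator) allocateID() (int32, error) {
    end := a.next
    for {
        id := a.next
        inUse := a.used[id]
        a.next++
        if a.next < 0 { // wrapped past MaxInt32
            a.next = 0
        }
        if !inUse {
            a.used[id] = true
            return id, nil
        }
        if a.next == end {
            return 0, errors.New("EAGAIN: all IDs in use")
        }
    }
}

func main() {
    a := &allocator{used: map[int32]bool{0: true, 1: true}}
    id, err := a.allocateID()
    fmt.Println(id, err) // 2 <nil>: first free ID at or after next
}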
413 21 403 394 233 5 231 71 10 61 382 383 222 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 // Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) func copyInPath(t *kernel.Task, addr hostarch.Addr) (fspath.Path, error) { pathname, err := t.CopyInString(addr, linux.PATH_MAX) if err != nil { return fspath.Path{}, err } return fspath.Parse(pathname), nil } type taskPathOperation struct { pop vfs.PathOperation haveStartRef bool } func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink) (taskPathOperation, error) { root := t.FSContext().RootDirectoryVFS2() start := root haveStartRef := false if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { root.DecRef(t) return taskPathOperation{}, syserror.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() haveStartRef = true } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { root.DecRef(t) return taskPathOperation{}, linuxerr.EBADF } start = dirfile.VirtualDentry() start.IncRef() haveStartRef = true dirfile.DecRef(t) } } return taskPathOperation{ pop: vfs.PathOperation{ Root: root, Start: start, Path: path, FollowFinalSymlink: bool(shouldFollowFinalSymlink), }, haveStartRef: haveStartRef, }, nil } func (tpop *taskPathOperation) Release(t *kernel.Task) { tpop.pop.Root.DecRef(t) if tpop.haveStartRef { tpop.pop.Start.DecRef(t) tpop.haveStartRef = false } } type shouldAllowEmptyPath bool const ( disallowEmptyPath shouldAllowEmptyPath = false allowEmptyPath shouldAllowEmptyPath = true ) type shouldFollowFinalSymlink bool const ( nofollowFinalSymlink shouldFollowFinalSymlink = false followFinalSymlink shouldFollowFinalSymlink = true )
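The two typed booleans above exist so call sites of getTaskPathOperation read as named flags rather than bare true/false. A self-contained sketch of the same pattern under hypothetical names (allowEmpty, followSymlink), showing how the distinct types also prevent transposed arguments:

package main

import "fmt"

// Distinct bool types make call sites self-documenting and prevent
// accidentally swapping two boolean parameters.
type allowEmpty bool
type followSymlink bool

const (
    disallowEmpty allowEmpty    = false
    allowEmptyOK  allowEmpty    = true
    nofollow      followSymlink = false
    follow        followSymlink = true
)

func resolve(path string, empty allowEmpty, f followSymlink) string {
    return fmt.Sprintf("resolve(%q, allowEmpty=%v, follow=%v)", path, bool(empty), bool(f))
}

func main() {
    // resolve("/tmp/x", nofollow, allowEmptyOK) would not compile:
    // the argument types don't match the parameter order.
    fmt.Println(resolve("/tmp/x", allowEmptyOK, nofollow))
}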
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
    "bytes"
    "fmt"
    "sync/atomic"

    "gvisor.dev/gvisor/pkg/abi/linux"
    "gvisor.dev/gvisor/pkg/context"
    "gvisor.dev/gvisor/pkg/errors/linuxerr"
    "gvisor.dev/gvisor/pkg/hostarch"
    "gvisor.dev/gvisor/pkg/sentry/arch"
    "gvisor.dev/gvisor/pkg/sentry/uniqueid"
    "gvisor.dev/gvisor/pkg/sync"
    "gvisor.dev/gvisor/pkg/syserror"
    "gvisor.dev/gvisor/pkg/usermem"
    "gvisor.dev/gvisor/pkg/waiter"
)

// inotifyEventBaseSize is the base size of linux's struct inotify_event. This
// must be a power of 2 for rounding below.
const inotifyEventBaseSize = 16

// EventType defines different kinds of inotify events.
//
// The way events are labelled appears somewhat arbitrary, but they must match
// Linux so that IN_EXCL_UNLINK behaves as it does in Linux.
//
// +stateify savable
type EventType uint8

// PathEvent and InodeEvent correspond to FSNOTIFY_EVENT_PATH and
// FSNOTIFY_EVENT_INODE in Linux.
const (
    PathEvent  EventType = iota
    InodeEvent EventType = iota
)

// Inotify represents an inotify instance created by inotify_init(2) or
// inotify_init1(2). Inotify implements FileDescriptionImpl.
//
// +stateify savable
type Inotify struct {
    vfsfd FileDescription
    FileDescriptionDefaultImpl
    DentryMetadataFileDescriptionImpl
    NoLockFD

    // Unique identifier for this inotify instance. We don't just reuse the
    // inotify fd because fds can be duped. These should not be exposed to the
    // user, since we may aggressively reuse an id on S/R.
    id uint64

    // queue is used to notify interested parties when the inotify instance
    // becomes readable or writable.
    queue waiter.Queue

    // evMu *only* protects the events list. We need a separate lock while
    // queuing events: using mu may violate lock ordering, since at that point
    // the calling goroutine may already hold Watches.mu.
    evMu sync.Mutex `state:"nosave"`

    // A list of pending events for this inotify instance. Protected by evMu.
    events eventList

    // A scratch buffer, used to serialize inotify events. Allocate this
    // ahead of time for the sake of performance. Protected by evMu.
    scratch []byte

    // mu protects the fields below.
    mu sync.Mutex `state:"nosave"`

    // nextWatchMinusOne is used to allocate watch descriptors on this Inotify
    // instance. Note that Linux starts numbering watch descriptors from 1.
    nextWatchMinusOne int32

    // Map from watch descriptors to watch objects.
    watches map[int32]*Watch
}

var _ FileDescriptionImpl = (*Inotify)(nil)

// NewInotifyFD constructs a new Inotify instance.
func NewInotifyFD(ctx context.Context, vfsObj *VirtualFilesystem, flags uint32) (*FileDescription, error) {
    // O_CLOEXEC affects file descriptors, so it must be handled outside of vfs.
flags &^= linux.O_CLOEXEC if flags&^linux.O_NONBLOCK != 0 { return nil, linuxerr.EINVAL } id := uniqueid.GlobalFromContext(ctx) vd := vfsObj.NewAnonVirtualDentry(fmt.Sprintf("[inotifyfd:%d]", id)) defer vd.DecRef(ctx) fd := &Inotify{ id: id, scratch: make([]byte, inotifyEventBaseSize), watches: make(map[int32]*Watch), } if err := fd.vfsfd.Init(fd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ UseDentryMetadata: true, DenyPRead: true, DenyPWrite: true, }); err != nil { return nil, err } return &fd.vfsfd, nil } // Release implements FileDescriptionImpl.Release. Release removes all // watches and frees all resources for an inotify instance. func (i *Inotify) Release(ctx context.Context) { var ds []*Dentry // We need to hold i.mu to avoid a race with concurrent calls to // Inotify.handleDeletion from Watches. There's no risk of Watches // accessing this Inotify after the destructor ends, because we remove all // references to it below. i.mu.Lock() for _, w := range i.watches { // Remove references to the watch from the watches set on the target. We // don't need to worry about the references from i.watches, since this // file description is about to be destroyed. d := w.target ws := d.Watches() // Watchable dentries should never return a nil watch set. if ws == nil { panic("Cannot remove watch from an unwatchable dentry") } ws.Remove(i.id) if ws.Size() == 0 { ds = append(ds, d) } } i.mu.Unlock() for _, d := range ds { d.OnZeroWatches(ctx) } } // Allocate implements FileDescription.Allocate. func (i *Inotify) Allocate(ctx context.Context, mode, offset, length uint64) error { panic("Allocate should not be called on read-only inotify fds") } // EventRegister implements waiter.Waitable. func (i *Inotify) EventRegister(e *waiter.Entry, mask waiter.EventMask) { i.queue.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable. func (i *Inotify) EventUnregister(e *waiter.Entry) { i.queue.EventUnregister(e) } // Readiness implements waiter.Waitable.Readiness. // // Readiness indicates whether there are pending events for an inotify instance. func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { ready := waiter.EventMask(0) i.evMu.Lock() defer i.evMu.Unlock() if !i.events.Empty() { ready |= waiter.ReadableEvents } return mask & ready } // PRead implements FileDescriptionImpl.PRead. func (*Inotify) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, linuxerr.ESPIPE } // PWrite implements FileDescriptionImpl.PWrite. func (*Inotify) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { return 0, linuxerr.ESPIPE } // Write implements FileDescriptionImpl.Write. func (*Inotify) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { return 0, linuxerr.EBADF } // Read implements FileDescriptionImpl.Read. func (i *Inotify) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { if dst.NumBytes() < inotifyEventBaseSize { return 0, linuxerr.EINVAL } i.evMu.Lock() defer i.evMu.Unlock() if i.events.Empty() { // Nothing to read yet, tell caller to block. return 0, syserror.ErrWouldBlock } var writeLen int64 for it := i.events.Front(); it != nil; { // Advance `it` before the element is removed from the list, or else // it.Next() will always be nil. event := it it = it.Next() // Does the buffer have enough remaining space to hold the event we're // about to write out? 
if dst.NumBytes() < int64(event.sizeOf()) { if writeLen > 0 { // Buffer wasn't big enough for all pending events, but we did // write some events out. return writeLen, nil } return 0, linuxerr.EINVAL } // Linux always dequeues an available event as long as there's enough // buffer space to copy it out, even if the copy below fails. Emulate // this behaviour. i.events.Remove(event) // Buffer has enough space, copy event to the read buffer. n, err := event.CopyTo(ctx, i.scratch, dst) if err != nil { return 0, err } writeLen += n dst = dst.DropFirst64(n) } return writeLen, nil } // Ioctl implements FileDescriptionImpl.Ioctl. func (i *Inotify) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch args[1].Int() { case linux.FIONREAD: i.evMu.Lock() defer i.evMu.Unlock() var n uint32 for e := i.events.Front(); e != nil; e = e.Next() { n += uint32(e.sizeOf()) } var buf [4]byte hostarch.ByteOrder.PutUint32(buf[:], n) _, err := uio.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{}) return 0, err default: return 0, linuxerr.ENOTTY } } func (i *Inotify) queueEvent(ev *Event) { i.evMu.Lock() // Check if we should coalesce the event we're about to queue with the last // one currently in the queue. Events are coalesced if they are identical. if last := i.events.Back(); last != nil { if ev.equals(last) { // "Coalesce" the two events by simply not queuing the new one. We // don't need to raise a waiter.EventIn notification because no new // data is available for reading. i.evMu.Unlock() return } } i.events.PushBack(ev) // Release mutex before notifying waiters because we don't control what they // can do. i.evMu.Unlock() i.queue.Notify(waiter.ReadableEvents) } // newWatchLocked creates and adds a new watch to target. // // Precondition: i.mu must be locked. ws must be the watch set for target d. func (i *Inotify) newWatchLocked(d *Dentry, ws *Watches, mask uint32) *Watch { w := &Watch{ owner: i, wd: i.nextWatchIDLocked(), target: d, mask: mask, } // Hold the watch in this inotify instance as well as the watch set on the // target. i.watches[w.wd] = w ws.Add(w) return w } // newWatchIDLocked allocates and returns a new watch descriptor. // // Precondition: i.mu must be locked. func (i *Inotify) nextWatchIDLocked() int32 { i.nextWatchMinusOne++ return i.nextWatchMinusOne } // AddWatch constructs a new inotify watch and adds it to the target. It // returns the watch descriptor returned by inotify_add_watch(2). // // The caller must hold a reference on target. func (i *Inotify) AddWatch(target *Dentry, mask uint32) (int32, error) { // Note: Locking this inotify instance protects the result returned by // Lookup() below. With the lock held, we know for sure the lookup result // won't become stale because it's impossible for *this* instance to // add/remove watches on target. i.mu.Lock() defer i.mu.Unlock() ws := target.Watches() if ws == nil { // While Linux supports inotify watches on all filesystem types, watches on // filesystems like kernfs are not generally useful, so we do not. return 0, linuxerr.EPERM } // Does the target already have a watch from this inotify instance? if existing := ws.Lookup(i.id); existing != nil { newmask := mask if mask&linux.IN_MASK_ADD != 0 { // "Add (OR) events to watch mask for this pathname if it already // exists (instead of replacing mask)." -- inotify(7) newmask |= atomic.LoadUint32(&existing.mask) } atomic.StoreUint32(&existing.mask, newmask) return existing.wd, nil } // No existing watch, create a new watch. 
w := i.newWatchLocked(target, ws, mask) return w.wd, nil } // RmWatch looks up an inotify watch for the given 'wd' and configures the // target to stop sending events to this inotify instance. func (i *Inotify) RmWatch(ctx context.Context, wd int32) error { i.mu.Lock() // Find the watch we were asked to removed. w, ok := i.watches[wd] if !ok { i.mu.Unlock() return linuxerr.EINVAL } // Remove the watch from this instance. delete(i.watches, wd) // Remove the watch from the watch target. ws := w.target.Watches() // AddWatch ensures that w.target has a non-nil watch set. if ws == nil { panic("Watched dentry cannot have nil watch set") } ws.Remove(w.OwnerID()) remaining := ws.Size() i.mu.Unlock() if remaining == 0 { w.target.OnZeroWatches(ctx) } // Generate the event for the removal. i.queueEvent(newEvent(wd, "", linux.IN_IGNORED, 0)) return nil } // Watches is the collection of all inotify watches on a single file. // // +stateify savable type Watches struct { // mu protects the fields below. mu sync.RWMutex `state:"nosave"` // ws is the map of active watches in this collection, keyed by the inotify // instance id of the owner. ws map[uint64]*Watch } // Size returns the number of watches held by w. func (w *Watches) Size() int { w.mu.Lock() defer w.mu.Unlock() return len(w.ws) } // Lookup returns the watch owned by an inotify instance with the given id. // Returns nil if no such watch exists. // // Precondition: the inotify instance with the given id must be locked to // prevent the returned watch from being concurrently modified or replaced in // Inotify.watches. func (w *Watches) Lookup(id uint64) *Watch { w.mu.Lock() defer w.mu.Unlock() return w.ws[id] } // Add adds watch into this set of watches. // // Precondition: the inotify instance with the given id must be locked. func (w *Watches) Add(watch *Watch) { w.mu.Lock() defer w.mu.Unlock() owner := watch.OwnerID() // Sanity check, we should never have two watches for one owner on the // same target. if _, exists := w.ws[owner]; exists { panic(fmt.Sprintf("Watch collision with ID %+v", owner)) } if w.ws == nil { w.ws = make(map[uint64]*Watch) } w.ws[owner] = watch } // Remove removes a watch with the given id from this set of watches and // releases it. The caller is responsible for generating any watch removal // event, as appropriate. The provided id must match an existing watch in this // collection. // // Precondition: the inotify instance with the given id must be locked. func (w *Watches) Remove(id uint64) { w.mu.Lock() defer w.mu.Unlock() if w.ws == nil { // This watch set is being destroyed. The thread executing the // destructor is already in the process of deleting all our watches. We // got here with no references on the target because we raced with the // destructor notifying all the watch owners of destruction. See the // comment in Watches.HandleDeletion for why this race exists. return } // It is possible for w.Remove() to be called for the same watch multiple // times. See the treatment of one-shot watches in Watches.Notify(). if _, ok := w.ws[id]; ok { delete(w.ws, id) } } // Notify queues a new event with watches in this set. Watches with // IN_EXCL_UNLINK are skipped if the event is coming from a child that has been // unlinked. 
func (w *Watches) Notify(ctx context.Context, name string, events, cookie uint32, et EventType, unlinked bool) { var hasExpired bool w.mu.RLock() for _, watch := range w.ws { if unlinked && watch.ExcludeUnlinked() && et == PathEvent { continue } if watch.Notify(name, events, cookie) { hasExpired = true } } w.mu.RUnlock() if hasExpired { w.cleanupExpiredWatches(ctx) } } // This function is relatively expensive and should only be called where there // are expired watches. func (w *Watches) cleanupExpiredWatches(ctx context.Context) { // Because of lock ordering, we cannot acquire Inotify.mu for each watch // owner while holding w.mu. As a result, store expired watches locally // before removing. var toRemove []*Watch w.mu.RLock() for _, watch := range w.ws { if atomic.LoadInt32(&watch.expired) == 1 { toRemove = append(toRemove, watch) } } w.mu.RUnlock() for _, watch := range toRemove { watch.owner.RmWatch(ctx, watch.wd) } } // HandleDeletion is called when the watch target is destroyed. Clear the // watch set, detach watches from the inotify instances they belong to, and // generate the appropriate events. func (w *Watches) HandleDeletion(ctx context.Context) { w.Notify(ctx, "", linux.IN_DELETE_SELF, 0, InodeEvent, true /* unlinked */) // As in Watches.Notify, we can't hold w.mu while acquiring Inotify.mu for // the owner of each watch being deleted. Instead, atomically store the // watches map in a local variable and set it to nil so we can iterate over // it with the assurance that there will be no concurrent accesses. var ws map[uint64]*Watch w.mu.Lock() ws = w.ws w.ws = nil w.mu.Unlock() // Remove each watch from its owner's watch set, and generate a corresponding // watch removal event. for _, watch := range ws { i := watch.owner i.mu.Lock() _, found := i.watches[watch.wd] delete(i.watches, watch.wd) // Release mutex before notifying waiters because we don't control what // they can do. i.mu.Unlock() // If watch was not found, it was removed from the inotify instance before // we could get to it, in which case we should not generate an event. if found { i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0)) } } } // Watch represent a particular inotify watch created by inotify_add_watch. // // +stateify savable type Watch struct { // Inotify instance which owns this watch. // // This field is immutable after creation. owner *Inotify // Descriptor for this watch. This is unique across an inotify instance. // // This field is immutable after creation. wd int32 // target is a dentry representing the watch target. Its watch set contains this watch. // // This field is immutable after creation. target *Dentry // Events being monitored via this watch. Must be accessed with atomic // memory operations. mask uint32 // expired is set to 1 to indicate that this watch is a one-shot that has // already sent a notification and therefore can be removed. Must be accessed // with atomic memory operations. expired int32 } // OwnerID returns the id of the inotify instance that owns this watch. func (w *Watch) OwnerID() uint64 { return w.owner.id } // ExcludeUnlinked indicates whether the watched object should continue to be // notified of events originating from a path that has been unlinked. // // For example, if "foo/bar" is opened and then unlinked, operations on the // open fd may be ignored by watches on "foo" and "foo/bar" with IN_EXCL_UNLINK. func (w *Watch) ExcludeUnlinked() bool { return atomic.LoadUint32(&w.mask)&linux.IN_EXCL_UNLINK != 0 } // Notify queues a new event on this watch. 
// Returns true if this is a one-shot
// watch that should be deleted, after this event was successfully queued.
func (w *Watch) Notify(name string, events uint32, cookie uint32) bool {
    if atomic.LoadInt32(&w.expired) == 1 {
        // This is a one-shot watch that is already in the process of being
        // removed. This may happen if a second event reaches the watch target
        // before this watch has been removed.
        return false
    }

    mask := atomic.LoadUint32(&w.mask)
    if mask&events == 0 {
        // We weren't watching for this event.
        return false
    }

    // Event mask should include bits matched from the watch plus all control
    // event bits.
    unmaskableBits := ^uint32(0) &^ linux.IN_ALL_EVENTS
    effectiveMask := unmaskableBits | mask
    matchedEvents := effectiveMask & events
    w.owner.queueEvent(newEvent(w.wd, name, matchedEvents, cookie))
    if mask&linux.IN_ONESHOT != 0 {
        atomic.StoreInt32(&w.expired, 1)
        return true
    }
    return false
}

// Event represents a struct inotify_event from linux.
//
// +stateify savable
type Event struct {
    eventEntry

    wd     int32
    mask   uint32
    cookie uint32

    // len is computed based on the name field and is set automatically by
    // Event.setName. It should be 0 when no name is set; otherwise it is the
    // length of the name slice.
    len uint32

    // The name field has special padding requirements and should only be set by
    // calling Event.setName.
    name []byte
}

func newEvent(wd int32, name string, events, cookie uint32) *Event {
    e := &Event{
        wd:     wd,
        mask:   events,
        cookie: cookie,
    }
    if name != "" {
        e.setName(name)
    }
    return e
}

// paddedBytes converts a go string to a null-terminated c-string, padded with
// null bytes to a total size of 'l'. 'l' must be large enough for all the bytes
// in 's' plus at least one null byte.
func paddedBytes(s string, l uint32) []byte {
    if l < uint32(len(s)+1) {
        panic("Converting string to byte array results in truncation, this can lead to buffer-overflow due to the missing null-byte!")
    }
    b := make([]byte, l)
    copy(b, s)

    // b was zero-value initialized during make(), so the rest of the slice is
    // already filled with null bytes.

    return b
}

// setName sets the optional name for this event.
func (e *Event) setName(name string) {
    // We need to pad the name such that the entire event length ends up a
    // multiple of inotifyEventBaseSize.
    unpaddedLen := len(name) + 1
    // Round up to nearest multiple of inotifyEventBaseSize.
    e.len = uint32((unpaddedLen + inotifyEventBaseSize - 1) & ^(inotifyEventBaseSize - 1))
    // Make sure we haven't overflowed and wrapped around when rounding.
    if unpaddedLen > int(e.len) {
        panic("Overflow when rounding inotify event size, the 'name' field was too big.")
    }
    e.name = paddedBytes(name, e.len)
}

func (e *Event) sizeOf() int {
    s := inotifyEventBaseSize + int(e.len)
    if s < inotifyEventBaseSize {
        panic("Overflowed event size")
    }
    return s
}

// CopyTo serializes this event to dst. buf is used as a scratch buffer to
// construct the output. We use a buffer allocated ahead of time for
// performance. buf must be at least inotifyEventBaseSize bytes.
func (e *Event) CopyTo(ctx context.Context, buf []byte, dst usermem.IOSequence) (int64, error) {
    hostarch.ByteOrder.PutUint32(buf[0:], uint32(e.wd))
    hostarch.ByteOrder.PutUint32(buf[4:], e.mask)
    hostarch.ByteOrder.PutUint32(buf[8:], e.cookie)
    hostarch.ByteOrder.PutUint32(buf[12:], e.len)

    writeLen := 0

    n, err := dst.CopyOut(ctx, buf)
    if err != nil {
        return 0, err
    }
    writeLen += n
    dst = dst.DropFirst(n)

    if e.len > 0 {
        n, err = dst.CopyOut(ctx, e.name)
        if err != nil {
            return 0, err
        }
        writeLen += n
    }

    // Sanity check.
    if writeLen != e.sizeOf() {
        panic(fmt.Sprintf("Serialized unexpected amount of data for an event, expected %d, wrote %d.", e.sizeOf(), writeLen))
    }

    return int64(writeLen), nil
}

func (e *Event) equals(other *Event) bool {
    return e.wd == other.wd &&
        e.mask == other.mask &&
        e.cookie == other.cookie &&
        e.len == other.len &&
        bytes.Equal(e.name, other.name)
}

// InotifyEventFromStatMask generates the appropriate events for an operation
// that sets the stats specified in mask.
func InotifyEventFromStatMask(mask uint32) uint32 {
    var ev uint32
    if mask&(linux.STATX_UID|linux.STATX_GID|linux.STATX_MODE) != 0 {
        ev |= linux.IN_ATTRIB
    }
    if mask&linux.STATX_SIZE != 0 {
        ev |= linux.IN_MODIFY
    }
    if (mask & (linux.STATX_ATIME | linux.STATX_MTIME)) == (linux.STATX_ATIME | linux.STATX_MTIME) {
        // Both times indicates a utime(s) call.
        ev |= linux.IN_ATTRIB
    } else if mask&linux.STATX_ATIME != 0 {
        ev |= linux.IN_ACCESS
    } else if mask&linux.STATX_MTIME != 0 {
        ev |= linux.IN_MODIFY
    }
    return ev
}

// InotifyRemoveChild sends the appropriate notifications to the watch sets of
// the child being removed and its parent. Note that unlike most pairs of
// parent/child notifications, the child is notified first in this case.
func InotifyRemoveChild(ctx context.Context, self, parent *Watches, name string) {
    if self != nil {
        self.Notify(ctx, "", linux.IN_ATTRIB, 0, InodeEvent, true /* unlinked */)
    }
    if parent != nil {
        parent.Notify(ctx, name, linux.IN_DELETE, 0, InodeEvent, true /* unlinked */)
    }
}

// InotifyRename sends the appropriate notifications to the watch sets of the
// file being renamed and its old/new parents.
func InotifyRename(ctx context.Context, renamed, oldParent, newParent *Watches, oldName, newName string, isDir bool) {
    var dirEv uint32
    if isDir {
        dirEv = linux.IN_ISDIR
    }
    cookie := uniqueid.InotifyCookie(ctx)
    if oldParent != nil {
        oldParent.Notify(ctx, oldName, dirEv|linux.IN_MOVED_FROM, cookie, InodeEvent, false /* unlinked */)
    }
    if newParent != nil {
        newParent.Notify(ctx, newName, dirEv|linux.IN_MOVED_TO, cookie, InodeEvent, false /* unlinked */)
    }
    // Somewhat surprisingly, self move events do not have a cookie.
    if renamed != nil {
        renamed.Notify(ctx, "", linux.IN_MOVE_SELF, 0, InodeEvent, false /* unlinked */)
    }
}
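As a rough illustration of the wire format that setName and CopyTo produce, here is a standalone sketch that rounds the name to a 16-byte boundary and lays out the four header fields. It assumes little-endian byte order for the demo (the sentry uses hostarch.ByteOrder), and eventBaseSize, paddedLen, and serialize are hypothetical stand-ins:

package main

import (
    "encoding/binary"
    "fmt"
)

const eventBaseSize = 16 // sizeof(struct inotify_event) without the name

// paddedLen mirrors Event.setName's rounding: name plus a NUL terminator,
// rounded up to the next multiple of eventBaseSize so every serialized
// event stays 16-byte aligned in the read buffer.
func paddedLen(name string) uint32 {
    unpadded := len(name) + 1
    return uint32((unpadded + eventBaseSize - 1) &^ (eventBaseSize - 1))
}

// serialize lays out wd, mask, cookie, and len, followed by the NUL-padded
// name, in the same order Event.CopyTo writes them.
func serialize(wd int32, mask, cookie uint32, name string) []byte {
    l := paddedLen(name)
    buf := make([]byte, eventBaseSize+int(l))
    binary.LittleEndian.PutUint32(buf[0:], uint32(wd))
    binary.LittleEndian.PutUint32(buf[4:], mask)
    binary.LittleEndian.PutUint32(buf[8:], cookie)
    binary.LittleEndian.PutUint32(buf[12:], l)
    copy(buf[16:], name) // remainder of buf is already zero (NUL padding)
    return buf
}

func main() {
    b := serialize(1, 0x100 /* IN_CREATE */, 0, "hello.txt")
    fmt.Println(len(b)) // 32: 16-byte header + "hello.txt\0" padded to 16
}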
591 50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sync" ) // UTSNamespace represents a UTS namespace, a holder of two system identifiers: // the hostname and domain name. // // +stateify savable type UTSNamespace struct { // mu protects all fields below. mu sync.Mutex `state:"nosave"` hostName string domainName string // userns is the user namespace associated with the UTSNamespace. // Privileged operations on this UTSNamespace must have appropriate // capabilities in userns. // // userns is immutable. userns *auth.UserNamespace } // NewUTSNamespace creates a new UTS namespace. func NewUTSNamespace(hostName, domainName string, userns *auth.UserNamespace) *UTSNamespace { return &UTSNamespace{ hostName: hostName, domainName: domainName, userns: userns, } } // UTSNamespace returns the task's UTS namespace. func (t *Task) UTSNamespace() *UTSNamespace { t.mu.Lock() defer t.mu.Unlock() return t.utsns } // HostName returns the host name of this UTS namespace. func (u *UTSNamespace) HostName() string { u.mu.Lock() defer u.mu.Unlock() return u.hostName } // SetHostName sets the host name of this UTS namespace. func (u *UTSNamespace) SetHostName(host string) { u.mu.Lock() defer u.mu.Unlock() u.hostName = host } // DomainName returns the domain name of this UTS namespace. func (u *UTSNamespace) DomainName() string { u.mu.Lock() defer u.mu.Unlock() return u.domainName } // SetDomainName sets the domain name of this UTS namespace. func (u *UTSNamespace) SetDomainName(domain string) { u.mu.Lock() defer u.mu.Unlock() u.domainName = domain } // UserNamespace returns the user namespace associated with this UTS namespace. func (u *UTSNamespace) UserNamespace() *auth.UserNamespace { u.mu.Lock() defer u.mu.Unlock() return u.userns } // Clone makes a copy of this UTS namespace, associating the given user // namespace. func (u *UTSNamespace) Clone(userns *auth.UserNamespace) *UTSNamespace { u.mu.Lock() defer u.mu.Unlock() return &UTSNamespace{ hostName: u.hostName, domainName: u.domainName, userns: userns, } }
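Because a single mutex guards both identifiers, Clone can never observe a hostname/domainname pair mid-update. A stripped-down sketch of that invariant (miniUTS is a hypothetical stand-in for UTSNamespace, with the auth.UserNamespace field omitted):

package main

import (
    "fmt"
    "sync"
)

// miniUTS is a reduced stand-in for UTSNamespace: one mutex guards both
// identifiers so a clone can never observe a half-updated pair.
type miniUTS struct {
    mu        sync.Mutex
    host, dom string
}

func (u *miniUTS) set(host, dom string) {
    u.mu.Lock()
    defer u.mu.Unlock()
    u.host, u.dom = host, dom
}

func (u *miniUTS) clone() *miniUTS {
    u.mu.Lock()
    defer u.mu.Unlock()
    return &miniUTS{host: u.host, dom: u.dom}
}

func main() {
    u := &miniUTS{host: "sandbox", dom: "local"}
    c := u.clone()
    u.set("other", "example") // does not affect the clone
    fmt.Println(c.host, c.dom)
}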
39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 // Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "bytes" "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // A FilesystemType constructs filesystems. // // FilesystemType is analogous to Linux's struct file_system_type. type FilesystemType interface { // GetFilesystem returns a Filesystem configured by the given options, // along with its mount root. A reference is taken on the returned // Filesystem and Dentry. GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error) // Name returns the name of this FilesystemType. Name() string // Release releases all resources held by this FilesystemType. Release(ctx context.Context) } // GetFilesystemOptions contains options to FilesystemType.GetFilesystem. type GetFilesystemOptions struct { // Data is the string passed as the 5th argument to mount(2), which is // usually a comma-separated list of filesystem-specific mount options. Data string // InternalData holds opaque FilesystemType-specific data. There is // intentionally no way for applications to specify InternalData; if it is // not nil, the call to GetFilesystem originates from within the sentry. InternalData interface{} } // +stateify savable type registeredFilesystemType struct { fsType FilesystemType opts RegisterFilesystemTypeOptions } // RegisterFilesystemTypeOptions contains options to // VirtualFilesystem.RegisterFilesystem(). // // +stateify savable type RegisterFilesystemTypeOptions struct { // AllowUserMount determines whether users are allowed to mount a file system // of this type, i.e. through mount(2). If AllowUserMount is true, allow calls // to VirtualFilesystem.MountAt() for which MountOptions.InternalMount == false // to use this filesystem type. AllowUserMount bool // If AllowUserList is true, make this filesystem type visible in // /proc/filesystems. AllowUserList bool // If RequiresDevice is true, indicate that mounting this filesystem // requires a block device as the mount source in /proc/filesystems. RequiresDevice bool } // RegisterFilesystemType registers the given FilesystemType in vfs with the // given name. 
func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) error { vfs.fsTypesMu.Lock() defer vfs.fsTypesMu.Unlock() if existing, ok := vfs.fsTypes[name]; ok { return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing.fsType) } vfs.fsTypes[name] = &registeredFilesystemType{ fsType: fsType, opts: *opts, } return nil } // MustRegisterFilesystemType is equivalent to RegisterFilesystemType but // panics on failure. func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) { if err := vfs.RegisterFilesystemType(name, fsType, opts); err != nil { panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err)) } } func (vfs *VirtualFilesystem) getFilesystemType(name string) *registeredFilesystemType { vfs.fsTypesMu.RLock() defer vfs.fsTypesMu.RUnlock() return vfs.fsTypes[name] } // GenerateProcFilesystems emits the contents of /proc/filesystems for vfs to // buf. func (vfs *VirtualFilesystem) GenerateProcFilesystems(buf *bytes.Buffer) { vfs.fsTypesMu.RLock() defer vfs.fsTypesMu.RUnlock() for name, rft := range vfs.fsTypes { if !rft.opts.AllowUserList { continue } var nodev string if !rft.opts.RequiresDevice { nodev = "nodev" } fmt.Fprintf(buf, "%s\t%s\n", nodev, name) } }
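A rough standalone sketch of the registry-to-/proc/filesystems flow above, using a plain map in place of vfs.fsTypes (fsOpts is a hypothetical stand-in for RegisterFilesystemTypeOptions; note that, as in GenerateProcFilesystems, map iteration order is unspecified):

package main

import (
    "bytes"
    "fmt"
)

// fsOpts holds the two listing-related options from
// RegisterFilesystemTypeOptions that matter for /proc/filesystems.
type fsOpts struct {
    allowUserList  bool
    requiresDevice bool
}

func main() {
    fsTypes := map[string]fsOpts{
        "tmpfs": {allowUserList: true},
        "ext4":  {allowUserList: true, requiresDevice: true},
        "sock":  {}, // not listed in /proc/filesystems
    }

    // Mirrors GenerateProcFilesystems: listable types are emitted one per
    // line, prefixed with "nodev" when no backing block device is required.
    var buf bytes.Buffer
    for name, opts := range fsTypes {
        if !opts.allowUserList {
            continue
        }
        nodev := ""
        if !opts.requiresDevice {
            nodev = "nodev"
        }
        fmt.Fprintf(&buf, "%s\t%s\n", nodev, name)
    }
    fmt.Print(buf.String())
}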
159 159 159 159 158 159 159 159 159 159 159 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 // Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package host import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/hostfd" ) // copyToMulti copies as many bytes from src to dst as possible. func copyToMulti(dst [][]byte, src []byte) { for _, d := range dst { done := copy(d, src) src = src[done:] if len(src) == 0 { break } } } // copyFromMulti copies as many bytes from src to dst as possible. func copyFromMulti(dst []byte, src [][]byte) { for _, s := range src { done := copy(dst, s) dst = dst[done:] if len(dst) == 0 { break } } } // buildIovec builds an iovec slice from the given []byte slice. // // If truncate, truncate bufs > maxlen. Otherwise, immediately return an error. // // If length < the total length of bufs, err indicates why, even when returning // a truncated iovec. // // If intermediate != nil, iovecs references intermediate rather than bufs and // the caller must copy to/from bufs as necessary. func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovecs []unix.Iovec, intermediate []byte, err error) { var iovsRequired int for _, b := range bufs { length += int64(len(b)) if len(b) > 0 { iovsRequired++ } } stopLen := length if length > maxlen { if truncate { stopLen = maxlen err = linuxerr.EAGAIN } else { return 0, nil, nil, linuxerr.EMSGSIZE } } if iovsRequired > hostfd.MaxSendRecvMsgIov { // The kernel will reject our call if we pass this many iovs. // Use a single intermediate buffer instead. b := make([]byte, stopLen) return stopLen, []unix.Iovec{{ Base: &b[0], Len: uint64(stopLen), }}, b, err } var total int64 iovecs = make([]unix.Iovec, 0, iovsRequired) for i := range bufs { l := len(bufs[i]) if l == 0 { continue } stop := int64(l) if total+stop > stopLen { stop = stopLen - total } iovecs = append(iovecs, unix.Iovec{ Base: &bufs[i][0], Len: uint64(stop), }) total += stop if total >= stopLen { break } } return total, iovecs, nil, err }
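For a feel of how the scatter helpers behave when the single intermediate buffer is in play, this runnable snippet reuses copyToMulti verbatim and spreads one contiguous buffer across two destination slices:

package main

import "fmt"

// copyToMulti spreads src across the destination buffers in order, exactly
// like the helper above: each buffer is filled before moving to the next.
func copyToMulti(dst [][]byte, src []byte) {
    for _, d := range dst {
        done := copy(d, src)
        src = src[done:]
        if len(src) == 0 {
            break
        }
    }
}

func main() {
    dst := [][]byte{make([]byte, 3), make([]byte, 4)}
    copyToMulti(dst, []byte("scatter"))
    fmt.Printf("%q %q\n", dst[0], dst[1]) // "sca" "tter"
}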
5 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "fmt" "strings" "github.com/google/btree" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" ) const ( // maxSACKBlocks is the maximum number of distinct SACKBlocks the // scoreboard will track. Once there are 100 distinct blocks, new // insertions will fail. maxSACKBlocks = 100 // defaultBtreeDegree is set to 2 as btree.New(2) results in a 2-3-4 // tree. defaultBtreeDegree = 2 ) // SACKScoreboard stores a set of disjoint SACK ranges. // // +stateify savable type SACKScoreboard struct { // smss is defined in RFC5681 as following: // // The SMSS is the size of the largest segment that the sender can // transmit. This value can be based on the maximum transmission unit // of the network, the path MTU discovery [RFC1191, RFC4821] algorithm, // RMSS (see next item), or other factors. The size does not include // the TCP/IP headers and options. smss uint16 maxSACKED seqnum.Value sacked seqnum.Size `state:"nosave"` ranges *btree.BTree `state:"nosave"` } // NewSACKScoreboard returns a new SACK Scoreboard. func NewSACKScoreboard(smss uint16, iss seqnum.Value) *SACKScoreboard { return &SACKScoreboard{ smss: smss, ranges: btree.New(defaultBtreeDegree), maxSACKED: iss, } } // Reset erases all known range information from the SACK scoreboard. func (s *SACKScoreboard) Reset() { s.ranges = btree.New(defaultBtreeDegree) s.sacked = 0 } // Insert inserts/merges the provided SACKBlock into the scoreboard. func (s *SACKScoreboard) Insert(r header.SACKBlock) { if s.ranges.Len() >= maxSACKBlocks { return } // Check if we can merge the new range with a range before or after it. 
var toDelete []btree.Item if s.maxSACKED.LessThan(r.End - 1) { s.maxSACKED = r.End - 1 } s.ranges.AscendGreaterOrEqual(r, func(i btree.Item) bool { if i == r { return true } sacked := i.(header.SACKBlock) // There is a hole between these two SACK blocks, so we can't // merge anymore. if r.End.LessThan(sacked.Start) { return false } // There is some overlap at this point, merge the blocks and // delete the other one. // // ----sS--------sE // r.S---------------rE // -------sE if sacked.End.LessThan(r.End) { // sacked is contained in the newly inserted range. // Delete this block. toDelete = append(toDelete, i) return true } // sacked covers a range past end of the newly inserted // block. r.End = sacked.End toDelete = append(toDelete, i) return true }) s.ranges.DescendLessOrEqual(r, func(i btree.Item) bool { if i == r { return true } sacked := i.(header.SACKBlock) // sA------sE // rA----rE if sacked.End.LessThan(r.Start) { return false } // The previous range extends into the current block. Merge it // into the newly inserted range and delete the other one. // // <-rA---rE----<---rE---> // sA--------------sE r.Start = sacked.Start // Extend r to cover sacked if sacked extends past r. if r.End.LessThan(sacked.End) { r.End = sacked.End } toDelete = append(toDelete, i) return true }) for _, i := range toDelete { if sb := s.ranges.Delete(i); sb != nil { sb := i.(header.SACKBlock) s.sacked -= sb.Start.Size(sb.End) } } replaced := s.ranges.ReplaceOrInsert(r) if replaced == nil { s.sacked += r.Start.Size(r.End) } } // IsSACKED returns true if the a given range of sequence numbers denoted by r // are already covered by SACK information in the scoreboard. func (s *SACKScoreboard) IsSACKED(r header.SACKBlock) bool { if s.Empty() { return false } found := false s.ranges.DescendLessOrEqual(r, func(i btree.Item) bool { sacked := i.(header.SACKBlock) if sacked.End.LessThan(r.Start) { return false } if sacked.Contains(r) { found = true return false } return true }) return found } // String returns human-readable state of the scoreboard structure. func (s *SACKScoreboard) String() string { var str strings.Builder str.WriteString("SACKScoreboard: {") s.ranges.Ascend(func(i btree.Item) bool { str.WriteString(fmt.Sprintf("%v,", i)) return true }) str.WriteString("}\n") return str.String() } // Delete removes all SACK information prior to seq. func (s *SACKScoreboard) Delete(seq seqnum.Value) { if s.Empty() { return } toDelete := []btree.Item{} toInsert := []btree.Item{} r := header.SACKBlock{seq, seq.Add(1)} s.ranges.DescendLessOrEqual(r, func(i btree.Item) bool { if i == r { return true } sb := i.(header.SACKBlock) toDelete = append(toDelete, i) if sb.End.LessThanEq(seq) { s.sacked -= sb.Start.Size(sb.End) } else { newSB := header.SACKBlock{seq, sb.End} toInsert = append(toInsert, newSB) s.sacked -= sb.Start.Size(seq) } return true }) for _, sb := range toDelete { s.ranges.Delete(sb) } for _, sb := range toInsert { s.ranges.ReplaceOrInsert(sb) } } // Copy provides a copy of the SACK scoreboard. 
func (s *SACKScoreboard) Copy() (sackBlocks []header.SACKBlock, maxSACKED seqnum.Value) {
    s.ranges.Ascend(func(i btree.Item) bool {
        sackBlocks = append(sackBlocks, i.(header.SACKBlock))
        return true
    })
    return sackBlocks, s.maxSACKED
}

// IsRangeLost implements the IsLost(SeqNum) operation defined in RFC 6675
// section 4 but operates on a range of sequence numbers and returns true if
// there are at least nDupAckThreshold SACK blocks greater than the range being
// checked or if at least (nDupAckThreshold-1)*s.smss bytes have been SACKED
// with sequence numbers greater than the block being checked.
func (s *SACKScoreboard) IsRangeLost(r header.SACKBlock) bool {
    if s.Empty() {
        return false
    }
    nDupSACK := 0
    nDupSACKBytes := seqnum.Size(0)
    isLost := false

    // We need to check if the immediate lower (if any) sacked
    // range contains or partially overlaps with r.
    searchMore := true
    s.ranges.DescendLessOrEqual(r, func(i btree.Item) bool {
        sacked := i.(header.SACKBlock)
        if sacked.Contains(r) {
            searchMore = false
            return false
        }
        if sacked.End.LessThanEq(r.Start) {
            // All sequence numbers covered by sacked are below
            // r, so we continue searching.
            return false
        }
        // There is a partial overlap. In this case, r.Start is
        // between sacked.Start & sacked.End and r.End extends beyond
        // sacked.End.
        // Move r.Start to sacked.End and continue searching blocks
        // above r.Start.
        r.Start = sacked.End
        return false
    })
    if !searchMore {
        return isLost
    }
    s.ranges.AscendGreaterOrEqual(r, func(i btree.Item) bool {
        sacked := i.(header.SACKBlock)
        if sacked.Contains(r) {
            return false
        }
        nDupSACKBytes += sacked.Start.Size(sacked.End)
        nDupSACK++
        if nDupSACK >= nDupAckThreshold || nDupSACKBytes >= seqnum.Size((nDupAckThreshold-1)*s.smss) {
            isLost = true
            return false
        }
        return true
    })
    return isLost
}

// IsLost implements the IsLost(SeqNum) operation defined in RFC 3517 section
// 4.
//
// This routine returns whether the given sequence number is considered to be
// lost. The routine returns true when either nDupAckThreshold discontiguous
// SACKed sequences have arrived above 'SeqNum' or (nDupAckThreshold * SMSS)
// bytes with sequence numbers greater than 'SeqNum' have been SACKed.
// Otherwise, the routine returns false.
func (s *SACKScoreboard) IsLost(seq seqnum.Value) bool {
    return s.IsRangeLost(header.SACKBlock{seq, seq.Add(1)})
}

// Empty returns true if the SACK scoreboard has no entries, false otherwise.
func (s *SACKScoreboard) Empty() bool {
    return s.ranges.Len() == 0
}

// Sacked returns the current number of bytes held in the SACK scoreboard.
func (s *SACKScoreboard) Sacked() seqnum.Size {
    return s.sacked
}

// MaxSACKED returns the highest sequence number ever inserted in the SACK
// scoreboard.
func (s *SACKScoreboard) MaxSACKED() seqnum.Value {
    return s.maxSACKED
}

// SMSS returns the sender's MSS as held by the SACK scoreboard.
func (s *SACKScoreboard) SMSS() uint16 {
    return s.smss
}
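The btree callbacks in Insert amount to "grow r over every overlapping or adjacent neighbour, then store the merged block". A simplified sorted-slice version of the same merge, sketched with a hypothetical block type in place of header.SACKBlock and the btree:

package main

import (
    "fmt"
    "sort"
)

type block struct{ start, end uint32 } // [start, end), like header.SACKBlock

// insert merges r into a sorted list of disjoint blocks: any block that
// overlaps or touches r is absorbed into r, everything else is kept.
func insert(blocks []block, r block) []block {
    var out []block
    for _, b := range blocks {
        switch {
        case b.end < r.start: // entirely before r: keep as-is
            out = append(out, b)
        case r.end < b.start: // entirely after r: keep as-is
            out = append(out, b)
        default: // overlap or adjacency: grow r to cover b
            if b.start < r.start {
                r.start = b.start
            }
            if b.end > r.end {
                r.end = b.end
            }
        }
    }
    out = append(out, r)
    sort.Slice(out, func(i, j int) bool { return out[i].start < out[j].start })
    return out
}

func main() {
    sb := []block{{10, 20}, {30, 40}}
    sb = insert(sb, block{18, 32}) // bridges the two existing blocks
    fmt.Println(sb)                // [{10 40}]
}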
31 19 23 15 15 14 25 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package udp contains the implementation of the UDP transport protocol. package udp import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" "gvisor.dev/gvisor/pkg/waiter" ) const ( // ProtocolNumber is the udp protocol number. ProtocolNumber = header.UDPProtocolNumber // MinBufferSize is the smallest size of a receive or send buffer. MinBufferSize = 4 << 10 // 4KiB bytes. // DefaultSendBufferSize is the default size of the send buffer for // an endpoint. DefaultSendBufferSize = 32 << 10 // 32KiB // DefaultReceiveBufferSize is the default size of the receive buffer // for an endpoint. DefaultReceiveBufferSize = 32 << 10 // 32KiB // MaxBufferSize is the largest size a receive/send buffer can grow to. MaxBufferSize = 4 << 20 // 4MiB ) type protocol struct { stack *stack.Stack } // Number returns the udp protocol number. func (*protocol) Number() tcpip.TransportProtocolNumber { return ProtocolNumber } // NewEndpoint creates a new udp endpoint. func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { return newEndpoint(p.stack, netProto, waiterQueue), nil } // NewRawEndpoint creates a new raw UDP endpoint. It implements // stack.TransportProtocol.NewRawEndpoint. func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { return raw.NewEndpoint(p.stack, netProto, header.UDPProtocolNumber, waiterQueue) } // MinimumPacketSize returns the minimum valid udp packet size. func (*protocol) MinimumPacketSize() int { return header.UDPMinimumSize } // ParsePorts returns the source and destination ports stored in the given udp // packet. func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err tcpip.Error) { h := header.UDP(v) return h.SourcePort(), h.DestinationPort(), nil } // HandleUnknownDestinationPacket handles packets that are targeted at this // protocol but don't match any existing endpoint. 
func (p *protocol) HandleUnknownDestinationPacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition { hdr := header.UDP(pkt.TransportHeader().View()) if int(hdr.Length()) > pkt.Data().Size()+header.UDPMinimumSize { p.stack.Stats().UDP.MalformedPacketsReceived.Increment() return stack.UnknownDestinationPacketMalformed } if !verifyChecksum(hdr, pkt) { p.stack.Stats().UDP.ChecksumErrors.Increment() return stack.UnknownDestinationPacketMalformed } return stack.UnknownDestinationPacketUnhandled } // SetOption implements stack.TransportProtocol.SetOption. func (*protocol) SetOption(tcpip.SettableTransportProtocolOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // Option implements stack.TransportProtocol.Option. func (*protocol) Option(tcpip.GettableTransportProtocolOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // Close implements stack.TransportProtocol.Close. func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} // Parse implements stack.TransportProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) bool { return parse.UDP(pkt) } // NewProtocol returns a UDP transport protocol. func NewProtocol(s *stack.Stack) stack.TransportProtocol { return &protocol{stack: s} }
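ParsePorts above relies on header.UDP's fixed layout. A self-contained approximation of those accessors over raw bytes (parsePorts is a hypothetical stand-in; the 8-byte minimum corresponds to header.UDPMinimumSize):

package main

import (
    "encoding/binary"
    "fmt"
)

// parsePorts reads the first two fields of a UDP header (RFC 768): source
// port and destination port, each 16 bits in network (big-endian) order.
func parsePorts(b []byte) (src, dst uint16, ok bool) {
    if len(b) < 8 { // src, dst, length, checksum: 4 x 16 bits
        return 0, 0, false
    }
    return binary.BigEndian.Uint16(b[0:]), binary.BigEndian.Uint16(b[2:]), true
}

func main() {
    hdr := []byte{0x30, 0x39, 0x00, 0x35, 0x00, 0x08, 0x00, 0x00} // 12345 -> 53
    src, dst, ok := parsePorts(hdr)
    fmt.Println(src, dst, ok) // 12345 53 true
}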
32 32 32 24 1 1 1 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 // Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ipv6 import ( "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/network/internal/ip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) const ( // UnsolicitedReportIntervalMax is the maximum delay between sending // unsolicited MLD reports. // // Obtained from RFC 2710 Section 7.10. UnsolicitedReportIntervalMax = 10 * time.Second ) // MLDOptions holds options for MLD. type MLDOptions struct { // Enabled indicates whether MLD will be performed. // // When enabled, MLD may transmit MLD report and done messages when // joining and leaving multicast groups respectively, and handle incoming // MLD packets. // // This field is ignored and is always assumed to be false for interfaces // without neighbouring nodes (e.g. loopback). Enabled bool } var _ ip.MulticastGroupProtocol = (*mldState)(nil) // mldState is the per-interface MLD state. // // mldState.init MUST be called to initialize the MLD state. type mldState struct { // The IPv6 endpoint this mldState is for. ep *endpoint genericMulticastProtocol ip.GenericMulticastProtocolState } // Enabled implements ip.MulticastGroupProtocol. func (mld *mldState) Enabled() bool { // No need to perform MLD on loopback interfaces since they don't have // neighbouring nodes. return mld.ep.protocol.options.MLD.Enabled && !mld.ep.nic.IsLoopback() && mld.ep.Enabled() } // SendReport implements ip.MulticastGroupProtocol. // // Precondition: mld.ep.mu must be read locked. func (mld *mldState) SendReport(groupAddress tcpip.Address) (bool, tcpip.Error) { return mld.writePacket(groupAddress, groupAddress, header.ICMPv6MulticastListenerReport) } // SendLeave implements ip.MulticastGroupProtocol. // // Precondition: mld.ep.mu must be read locked. 
func (mld *mldState) SendLeave(groupAddress tcpip.Address) tcpip.Error { _, err := mld.writePacket(header.IPv6AllRoutersLinkLocalMulticastAddress, groupAddress, header.ICMPv6MulticastListenerDone) return err } // ShouldPerformProtocol implements ip.MulticastGroupProtocol. func (mld *mldState) ShouldPerformProtocol(groupAddress tcpip.Address) bool { // As per RFC 2710 section 5 page 10, // // The link-scope all-nodes address (FF02::1) is handled as a special // case. The node starts in Idle Listener state for that address on // every interface, never transitions to another state, and never sends // a Report or Done for that address. // // MLD messages are never sent for multicast addresses whose scope is 0 // (reserved) or 1 (node-local). if groupAddress == header.IPv6AllNodesMulticastAddress { return false } scope := header.V6MulticastScope(groupAddress) return scope != header.IPv6Reserved0MulticastScope && scope != header.IPv6InterfaceLocalMulticastScope } // init sets up an mldState struct, and is required to be called before using // a new mldState. // // Must only be called once for the lifetime of mld. func (mld *mldState) init(ep *endpoint) { mld.ep = ep mld.genericMulticastProtocol.Init(&ep.mu.RWMutex, ip.GenericMulticastProtocolOptions{ Rand: ep.protocol.stack.Rand(), Clock: ep.protocol.stack.Clock(), Protocol: mld, MaxUnsolicitedReportDelay: UnsolicitedReportIntervalMax, }) } // handleMulticastListenerQuery handles a query message. // // Precondition: mld.ep.mu must be locked. func (mld *mldState) handleMulticastListenerQuery(mldHdr header.MLD) { mld.genericMulticastProtocol.HandleQueryLocked(mldHdr.MulticastAddress(), mldHdr.MaximumResponseDelay()) } // handleMulticastListenerReport handles a report message. // // Precondition: mld.ep.mu must be locked. func (mld *mldState) handleMulticastListenerReport(mldHdr header.MLD) { mld.genericMulticastProtocol.HandleReportLocked(mldHdr.MulticastAddress()) } // joinGroup handles joining a new group and sending and scheduling the required // messages. // // If the group is already joined, returns *tcpip.ErrDuplicateAddress. // // Precondition: mld.ep.mu must be locked. func (mld *mldState) joinGroup(groupAddress tcpip.Address) { mld.genericMulticastProtocol.JoinGroupLocked(groupAddress) } // isInGroup returns true if the specified group has been joined locally. // // Precondition: mld.ep.mu must be read locked. func (mld *mldState) isInGroup(groupAddress tcpip.Address) bool { return mld.genericMulticastProtocol.IsLocallyJoinedRLocked(groupAddress) } // leaveGroup handles removing the group from the membership map, cancels any // delay timers associated with that group, and sends the Done message, if // required. // // Precondition: mld.ep.mu must be locked. func (mld *mldState) leaveGroup(groupAddress tcpip.Address) tcpip.Error { // LeaveGroup returns false only if the group was not joined. if mld.genericMulticastProtocol.LeaveGroupLocked(groupAddress) { return nil } return &tcpip.ErrBadLocalAddress{} } // softLeaveAll leaves all groups from the perspective of MLD, but remains // joined locally. // // Precondition: mld.ep.mu must be locked. func (mld *mldState) softLeaveAll() { mld.genericMulticastProtocol.MakeAllNonMemberLocked() } // initializeAll attempts to initialize the MLD state for each group that has // been joined locally. // // Precondition: mld.ep.mu must be locked.
func (mld *mldState) initializeAll() { mld.genericMulticastProtocol.InitializeGroupsLocked() } // sendQueuedReports attempts to send any reports that are queued for sending. // // Precondition: mld.ep.mu must be locked. func (mld *mldState) sendQueuedReports() { mld.genericMulticastProtocol.SendQueuedReportsLocked() } // writePacket assembles and sends an MLD packet. // // Precondition: mld.ep.mu must be read locked. func (mld *mldState) writePacket(destAddress, groupAddress tcpip.Address, mldType header.ICMPv6Type) (bool, tcpip.Error) { sentStats := mld.ep.stats.icmp.packetsSent var mldStat tcpip.MultiCounterStat switch mldType { case header.ICMPv6MulticastListenerReport: mldStat = sentStats.multicastListenerReport case header.ICMPv6MulticastListenerDone: mldStat = sentStats.multicastListenerDone default: panic(fmt.Sprintf("unrecognized mld type = %d", mldType)) } icmp := header.ICMPv6(buffer.NewView(header.ICMPv6HeaderSize + header.MLDMinimumSize)) icmp.SetType(mldType) header.MLD(icmp.MessageBody()).SetMulticastAddress(groupAddress) // As per RFC 2710 section 3, // // All MLD messages described in this document are sent with a link-local // IPv6 Source Address, an IPv6 Hop Limit of 1, and an IPv6 Router Alert // option in a Hop-by-Hop Options header. // // However, this would cause problems with Duplicate Address Detection with // the first address as MLD snooping switches may not send multicast traffic // that DAD depends on to the node performing DAD without the MLD report, as // documented in RFC 4816: // // Note that when a node joins a multicast address, it typically sends a // Multicast Listener Discovery (MLD) report message [RFC2710] [RFC3810] // for the multicast address. In the case of Duplicate Address // Detection, the MLD report message is required in order to inform MLD- // snooping switches, rather than routers, to forward multicast packets. // In the above description, the delay for joining the multicast address // thus means delaying transmission of the corresponding MLD report // message. Since the MLD specifications do not request a random delay // to avoid race conditions, just delaying Neighbor Solicitation would // cause congestion by the MLD report messages. The congestion would // then prevent the MLD-snooping switches from working correctly and, as // a result, prevent Duplicate Address Detection from working. The // requirement to include the delay for the MLD report in this case // avoids this scenario. [RFC3590] also talks about some interaction // issues between Duplicate Address Detection and MLD, and specifies // which source address should be used for the MLD report in this case. // // As per RFC 3590 section 4, we should still send out MLD reports with an // unspecified source address if we do not have an assigned link-local // address to use as the source address to ensure DAD works as expected on // networks with MLD snooping switches: // // MLD Report and Done messages are sent with a link-local address as // the IPv6 source address, if a valid address is available on the // interface. If a valid link-local address is not available (e.g., one // has not been configured), the message is sent with the unspecified // address (::) as the IPv6 source address. // // Once a valid link-local address is available, a node SHOULD generate // new MLD Report messages for all multicast addresses joined on the // interface. 
// // Routers receiving an MLD Report or Done message with the unspecified // address as the IPv6 source address MUST silently discard the packet // without taking any action on the packets contents. // // Snooping switches MUST manage multicast forwarding state based on MLD // Report and Done messages sent with the unspecified address as the // IPv6 source address. localAddress := mld.ep.getLinkLocalAddressRLocked() if len(localAddress) == 0 { localAddress = header.IPv6Any } icmp.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: icmp, Src: localAddress, Dst: destAddress, })) extensionHeaders := header.IPv6ExtHdrSerializer{ header.IPv6SerializableHopByHopExtHdr{ &header.IPv6RouterAlertOption{Value: header.IPv6RouterAlertMLD}, }, } pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(mld.ep.MaxHeaderLength()) + extensionHeaders.Length(), Data: buffer.View(icmp).ToVectorisedView(), }) if err := addIPHeader(localAddress, destAddress, pkt, stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: header.MLDHopLimit, }, extensionHeaders); err != nil { panic(fmt.Sprintf("failed to add IP header: %s", err)) } if err := mld.ep.nic.WritePacketToRemote(header.EthernetAddressFromMulticastIPv6Address(destAddress), ProtocolNumber, pkt); err != nil { sentStats.dropped.Increment() return false, err } mldStat.Increment() return localAddress != header.IPv6Any, nil }
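As a hedged usage sketch: MLDOptions is plumbed in through ipv6.Options when the protocol factory is built, so opting in to MLD looks roughly like the following (NIC, address, and route setup omitted).

package main

import (
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

func main() {
	// Enabled: true opts in to sending reports/dones; note that
	// mldState.Enabled() still reports false on loopback NICs
	// regardless of this setting.
	s := stack.New(stack.Options{
		NetworkProtocols: []stack.NetworkProtocolFactory{
			ipv6.NewProtocolWithOptions(ipv6.Options{
				MLD: ipv6.MLDOptions{Enabled: true},
			}),
		},
	})
	_ = s
}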
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel // Accounting, limits, timers. import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" ) // Getitimer implements getitimer(2). // // Preconditions: The caller must be running on the task goroutine. func (t *Task) Getitimer(id int32) (linux.ItimerVal, error) { var tm ktime.Time var s ktime.Setting switch id { case linux.ITIMER_REAL: tm, s = t.tg.itimerRealTimer.Get() case linux.ITIMER_VIRTUAL: tm = t.tg.UserCPUClock().Now() t.tg.signalHandlers.mu.Lock() s, _ = t.tg.itimerVirtSetting.At(tm) t.tg.signalHandlers.mu.Unlock() case linux.ITIMER_PROF: tm = t.tg.CPUClock().Now() t.tg.signalHandlers.mu.Lock() s, _ = t.tg.itimerProfSetting.At(tm) t.tg.signalHandlers.mu.Unlock() default: return linux.ItimerVal{}, linuxerr.EINVAL } val, iv := ktime.SpecFromSetting(tm, s) return linux.ItimerVal{ Value: linux.DurationToTimeval(val), Interval: linux.DurationToTimeval(iv), }, nil } // Setitimer implements setitimer(2). // // Preconditions: The caller must be running on the task goroutine.
func (t *Task) Setitimer(id int32, newitv linux.ItimerVal) (linux.ItimerVal, error) { var tm ktime.Time var olds ktime.Setting switch id { case linux.ITIMER_REAL: news, err := ktime.SettingFromSpec(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), t.tg.itimerRealTimer.Clock()) if err != nil { return linux.ItimerVal{}, err } tm, olds = t.tg.itimerRealTimer.Swap(news) case linux.ITIMER_VIRTUAL: c := t.tg.UserCPUClock() var err error t.k.cpuClockTicker.Atomically(func() { tm = c.Now() var news ktime.Setting news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm) if err != nil { return } t.tg.signalHandlers.mu.Lock() olds = t.tg.itimerVirtSetting t.tg.itimerVirtSetting = news t.tg.updateCPUTimersEnabledLocked() t.tg.signalHandlers.mu.Unlock() }) if err != nil { return linux.ItimerVal{}, err } case linux.ITIMER_PROF: c := t.tg.CPUClock() var err error t.k.cpuClockTicker.Atomically(func() { tm = c.Now() var news ktime.Setting news, err = ktime.SettingFromSpecAt(newitv.Value.ToDuration(), newitv.Interval.ToDuration(), tm) if err != nil { return } t.tg.signalHandlers.mu.Lock() olds = t.tg.itimerProfSetting t.tg.itimerProfSetting = news t.tg.updateCPUTimersEnabledLocked() t.tg.signalHandlers.mu.Unlock() }) if err != nil { return linux.ItimerVal{}, err } default: return linux.ItimerVal{}, linuxerr.EINVAL } oldval, oldiv := ktime.SpecFromSetting(tm, olds) return linux.ItimerVal{ Value: linux.DurationToTimeval(oldval), Interval: linux.DurationToTimeval(oldiv), }, nil } // IOUsage returns the IO usage of the thread. func (t *Task) IOUsage() *usage.IO { return t.ioUsage } // IOUsage returns the total IO usage of all dead and live threads in the group. func (tg *ThreadGroup) IOUsage() *usage.IO { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() io := *tg.ioUsage // Account for active tasks. for t := tg.tasks.Front(); t != nil; t = t.Next() { io.Accumulate(t.IOUsage()) } return &io } // Name returns t's name. func (t *Task) Name() string { t.mu.Lock() defer t.mu.Unlock() return t.image.Name } // SetName changes t's name. func (t *Task) SetName(name string) { t.mu.Lock() defer t.mu.Unlock() t.image.Name = name t.Debugf("Set thread name to %q", name) } // Limits implements context.Context.Limits. func (t *Task) Limits() *limits.LimitSet { return t.ThreadGroup().Limits() } // StartTime returns t's start time. func (t *Task) StartTime() ktime.Time { t.mu.Lock() defer t.mu.Unlock() return t.startTime } // MaxRSS returns the maximum resident set size of the task in bytes. The // given which should be one of RUSAGE_SELF, RUSAGE_CHILDREN, RUSAGE_THREAD, or // RUSAGE_BOTH. See getrusage(2) for documentation on the behavior of these // flags. func (t *Task) MaxRSS(which int32) uint64 { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() switch which { case linux.RUSAGE_SELF, linux.RUSAGE_THREAD: // If there's an active mm we can use its value. if mm := t.MemoryManager(); mm != nil { if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > t.tg.maxRSS { return mmMaxRSS } } return t.tg.maxRSS case linux.RUSAGE_CHILDREN: return t.tg.childMaxRSS case linux.RUSAGE_BOTH: maxRSS := t.tg.maxRSS if maxRSS < t.tg.childMaxRSS { maxRSS = t.tg.childMaxRSS } if mm := t.MemoryManager(); mm != nil { if mmMaxRSS := mm.MaxResidentSetSize(); mmMaxRSS > maxRSS { return mmMaxRSS } } return maxRSS default: // We'll only get here if which is invalid. return 0 } }
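To make the Getitimer/Setitimer plumbing concrete, here is a small self-contained sketch of the conversion that ktime.SpecFromSetting performs; the names and types below are illustrative stand-ins, not the sentry's. An armed timer reports the time remaining until its next expiration as the value and its reload period as the interval.

package main

import (
	"fmt"
	"time"
)

// specFromSetting is a toy mirror of ktime.SpecFromSetting: it derives a
// getitimer-style (value, interval) pair from a timer setting sampled at now.
func specFromSetting(now, next time.Time, period time.Duration, enabled bool) (value, interval time.Duration) {
	if !enabled {
		return 0, 0 // A disarmed timer reads as all zeros.
	}
	return next.Sub(now), period
}

func main() {
	now := time.Now()
	val, iv := specFromSetting(now, now.Add(3*time.Second), time.Second, true)
	fmt.Println(val, iv) // ~3s until the first expiration, then every 1s.
}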
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) // TableID identifies a specific table. type TableID int // Each value identifies a specific table. const ( NATID TableID = iota MangleID FilterID NumTables ) // HookUnset indicates that there is no hook set for an entrypoint or // underflow. const HookUnset = -1 // reaperDelay is how long to wait before starting to reap connections. const reaperDelay = 5 * time.Second // DefaultTables returns a default set of tables. Each chain is set to accept // all packets.
func DefaultTables(seed uint32) *IPTables { return &IPTables{ v4Tables: [NumTables]Table{ NATID: { Rules: []Rule{ {Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Target: &ErrorTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: 0, Input: 1, Forward: HookUnset, Output: 2, Postrouting: 3, }, Underflows: [NumHooks]int{ Prerouting: 0, Input: 1, Forward: HookUnset, Output: 2, Postrouting: 3, }, }, MangleID: { Rules: []Rule{ {Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Target: &ErrorTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: 0, Output: 1, }, Underflows: [NumHooks]int{ Prerouting: 0, Input: HookUnset, Forward: HookUnset, Output: 1, Postrouting: HookUnset, }, }, FilterID: { Rules: []Rule{ {Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Target: &AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, {Target: &ErrorTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: HookUnset, Input: 0, Forward: 1, Output: 2, Postrouting: HookUnset, }, Underflows: [NumHooks]int{ Prerouting: HookUnset, Input: 0, Forward: 1, Output: 2, Postrouting: HookUnset, }, }, }, v6Tables: [NumTables]Table{ NATID: { Rules: []Rule{ {Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Target: &ErrorTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: 0, Input: 1, Forward: HookUnset, Output: 2, Postrouting: 3, }, Underflows: [NumHooks]int{ Prerouting: 0, Input: 1, Forward: HookUnset, Output: 2, Postrouting: 3, }, }, MangleID: { Rules: []Rule{ {Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Target: &ErrorTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: 0, Output: 1, }, Underflows: [NumHooks]int{ Prerouting: 0, Input: HookUnset, Forward: HookUnset, Output: 1, Postrouting: HookUnset, }, }, FilterID: { Rules: []Rule{ {Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Target: &AcceptTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, {Target: &ErrorTarget{NetworkProtocol: header.IPv6ProtocolNumber}}, }, BuiltinChains: [NumHooks]int{ Prerouting: HookUnset, Input: 0, Forward: 1, Output: 2, Postrouting: HookUnset, }, Underflows: [NumHooks]int{ Prerouting: HookUnset, Input: 0, Forward: 1, Output: 2, Postrouting: HookUnset, }, }, }, priorities: [NumHooks][]TableID{ Prerouting: {MangleID, NATID}, Input: {NATID, FilterID}, Forward: {FilterID}, Output: {MangleID, NATID, FilterID}, Postrouting: {MangleID, NATID}, }, connections: ConnTrack{ seed: seed, }, reaperDone: make(chan struct{}, 1), } } // EmptyFilterTable returns a Table with no rules and the filter table chains // mapped to HookUnset. 
func EmptyFilterTable() Table { return Table{ Rules: []Rule{}, BuiltinChains: [NumHooks]int{ Prerouting: HookUnset, Postrouting: HookUnset, }, Underflows: [NumHooks]int{ Prerouting: HookUnset, Postrouting: HookUnset, }, } } // EmptyNATTable returns a Table with no rules and the NAT table chains // mapped to HookUnset. func EmptyNATTable() Table { return Table{ Rules: []Rule{}, BuiltinChains: [NumHooks]int{ Forward: HookUnset, }, Underflows: [NumHooks]int{ Forward: HookUnset, }, } } // GetTable returns a table with the given id and IP version. It panics when an // invalid id is provided. func (it *IPTables) GetTable(id TableID, ipv6 bool) Table { it.mu.RLock() defer it.mu.RUnlock() if ipv6 { return it.v6Tables[id] } return it.v4Tables[id] } // ReplaceTable replaces or inserts a table by id. It panics when an invalid id // is provided. func (it *IPTables) ReplaceTable(id TableID, table Table, ipv6 bool) tcpip.Error { it.mu.Lock() defer it.mu.Unlock() // If iptables is being enabled, initialize the conntrack table and // reaper. if !it.modified { it.connections.init() it.startReaper(reaperDelay) } it.modified = true if ipv6 { it.v6Tables[id] = table } else { it.v4Tables[id] = table } return nil } // A chainVerdict is what a table decides should be done with a packet. type chainVerdict int const ( // chainAccept indicates the packet should continue through netstack. chainAccept chainVerdict = iota // chainDrop indicates the packet should be dropped. chainDrop // chainReturn indicates the packet should return to the calling chain // or the underflow rule of a builtin chain. chainReturn ) // Check runs pkt through the rules for hook. It returns true when the packet // should continue traversing the network stack and false when it should be // dropped. // // Precondition: pkt.NetworkHeader is set. func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, r *Route, preroutingAddr tcpip.Address, inNicName, outNicName string) bool { if pkt.NetworkProtocolNumber != header.IPv4ProtocolNumber && pkt.NetworkProtocolNumber != header.IPv6ProtocolNumber { return true } // Many users never configure iptables. Spare them the cost of rule // traversal if rules have never been set. it.mu.RLock() defer it.mu.RUnlock() if !it.modified { return true } // Packets are manipulated only if a connection and a matching NAT rule // exist. shouldTrack := it.connections.handlePacket(pkt, hook, r) // Go through each table containing the hook. priorities := it.priorities[hook] for _, tableID := range priorities { // If handlePacket already NATed the packet, we don't need to // check the NAT table. if tableID == NATID && pkt.NatDone { continue } var table Table if pkt.NetworkProtocolNumber == header.IPv6ProtocolNumber { table = it.v6Tables[tableID] } else { table = it.v4Tables[tableID] } ruleIdx := table.BuiltinChains[hook] switch verdict := it.checkChain(hook, pkt, table, ruleIdx, r, preroutingAddr, inNicName, outNicName); verdict { // If the table returns Accept, move on to the next table. case chainAccept: continue // The Drop verdict is final. case chainDrop: return false case chainReturn: // Any Return from a built-in chain means we have to // call the underflow.
underflow := table.Rules[table.Underflows[hook]] switch v, _ := underflow.Target.Action(pkt, &it.connections, hook, r, preroutingAddr); v { case RuleAccept: continue case RuleDrop: return false case RuleJump, RuleReturn: panic("Underflows should only return RuleAccept or RuleDrop.") default: panic(fmt.Sprintf("Unknown verdict: %d", v)) } default: panic(fmt.Sprintf("Unknown verdict %v.", verdict)) } } // If this connection should be tracked, try to add an entry for it. If // traversing the nat table didn't end in adding an entry, // maybeInsertNoop will add a no-op entry for the connection. This is // needed when establishing connections so that the SYN/ACK reply to an // outgoing SYN is delivered to the correct endpoint rather than being // redirected by a prerouting rule. // // From the iptables documentation: "If there is no rule, a `null' // binding is created: this usually does not map the packet, but exists // to ensure we don't map another stream over an existing one." if shouldTrack { it.connections.maybeInsertNoop(pkt, hook) } // Every table returned Accept. return true } // beforeSave is invoked by stateify. func (it *IPTables) beforeSave() { // Ensure the reaper exits cleanly. it.reaperDone <- struct{}{} // Prevent others from modifying the connection table. it.connections.mu.Lock() } // afterLoad is invoked by stateify. func (it *IPTables) afterLoad() { it.startReaper(reaperDelay) } // startReaper starts a goroutine that wakes up periodically to reap timed out // connections. func (it *IPTables) startReaper(interval time.Duration) { go func() { // S/R-SAFE: reaperDone is signalled when iptables is saved. bucket := 0 for { select { case <-it.reaperDone: return // TODO(gvisor.dev/issue/5939): do not use the ambient clock. case <-time.After(interval): bucket, interval = it.connections.reapUnused(bucket, interval) } } }() } // CheckPackets runs pkts through the rules for hook and returns a map of packets that // should not go forward. // // Preconditions: // * pkt is an IPv4 packet of at least length header.IPv4MinimumSize. // * pkt.NetworkHeader is not nil. // // NOTE: unlike the Check API the returned map contains packets that should be // dropped. func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, r *Route, inNicName, outNicName string) (drop map[*PacketBuffer]struct{}, natPkts map[*PacketBuffer]struct{}) { for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { if !pkt.NatDone { if ok := it.Check(hook, pkt, r, "", inNicName, outNicName); !ok { if drop == nil { drop = make(map[*PacketBuffer]struct{}) } drop[pkt] = struct{}{} } if pkt.NatDone { if natPkts == nil { natPkts = make(map[*PacketBuffer]struct{}) } natPkts[pkt] = struct{}{} } } } return drop, natPkts } // Preconditions: // * pkt is an IPv4 packet of at least length header.IPv4MinimumSize. // * pkt.NetworkHeader is not nil. func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, r *Route, preroutingAddr tcpip.Address, inNicName, outNicName string) chainVerdict { // Start from ruleIdx and walk the list of rules until a rule gives us // a verdict. for ruleIdx < len(table.Rules) { switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, r, preroutingAddr, inNicName, outNicName); verdict { case RuleAccept: return chainAccept case RuleDrop: return chainDrop case RuleReturn: return chainReturn case RuleJump: // "Jumping" to the next rule just means we're // continuing on down the list.
if jumpTo == ruleIdx+1 { ruleIdx++ continue } switch verdict := it.checkChain(hook, pkt, table, jumpTo, r, preroutingAddr, inNicName, outNicName); verdict { case chainAccept: return chainAccept case chainDrop: return chainDrop case chainReturn: ruleIdx++ continue default: panic(fmt.Sprintf("Unknown verdict: %d", verdict)) } default: panic(fmt.Sprintf("Unknown verdict: %d", verdict)) } } // We got through the entire table without a decision. Default to DROP // for safety. return chainDrop } // Preconditions: // * pkt is an IPv4 packet of at least length header.IPv4MinimumSize. // * pkt.NetworkHeader is not nil. func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, r *Route, preroutingAddr tcpip.Address, inNicName, outNicName string) (RuleVerdict, int) { rule := table.Rules[ruleIdx] // Check whether the packet matches the IP header filter. if !rule.Filter.match(pkt, hook, inNicName, outNicName) { // Continue on to the next rule. return RuleJump, ruleIdx + 1 } // Go through each rule matcher. If they all match, run // the rule target. for _, matcher := range rule.Matchers { matches, hotdrop := matcher.Match(hook, pkt, inNicName, outNicName) if hotdrop { return RuleDrop, 0 } if !matches { // Continue on to the next rule. return RuleJump, ruleIdx + 1 } } // All the matchers matched, so run the target. return rule.Target.Action(pkt, &it.connections, hook, r, preroutingAddr) } // OriginalDst returns the original destination of redirected connections. It // returns an error if the connection doesn't exist or isn't redirected. func (it *IPTables) OriginalDst(epID TransportEndpointID, netProto tcpip.NetworkProtocolNumber) (tcpip.Address, uint16, tcpip.Error) { it.mu.RLock() defer it.mu.RUnlock() if !it.modified { return "", 0, &tcpip.ErrNotConnected{} } return it.connections.originalDst(epID, netProto) }
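A hedged sketch of driving this API from outside the package: build a filter table whose Input chain drops everything and install it with ReplaceTable (which, as shown above, also initializes conntrack and starts the reaper on the first modification). The chain/underflow layout copies what DefaultTables uses for the filter table; ipt is assumed to come from Stack.IPTables() on an already configured stack.

package main

import (
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// installDropAllInput replaces the IPv4 filter table with one whose Input
// chain unconditionally drops. The trailing error target is the usual
// catch-all that should never be reached.
func installDropAllInput(ipt *stack.IPTables) {
	table := stack.Table{
		Rules: []stack.Rule{
			{Target: &stack.DropTarget{NetworkProtocol: header.IPv4ProtocolNumber}},   // Input
			{Target: &stack.AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, // Forward
			{Target: &stack.AcceptTarget{NetworkProtocol: header.IPv4ProtocolNumber}}, // Output
			{Target: &stack.ErrorTarget{NetworkProtocol: header.IPv4ProtocolNumber}},
		},
		BuiltinChains: [stack.NumHooks]int{
			stack.Prerouting:  stack.HookUnset,
			stack.Input:       0,
			stack.Forward:     1,
			stack.Output:      2,
			stack.Postrouting: stack.HookUnset,
		},
		Underflows: [stack.NumHooks]int{
			stack.Prerouting:  stack.HookUnset,
			stack.Input:       0,
			stack.Forward:     1,
			stack.Output:      2,
			stack.Postrouting: stack.HookUnset,
		},
	}
	if err := ipt.ReplaceTable(stack.FilterID, table, false /* ipv6 */); err != nil {
		panic(err)
	}
}

func main() {}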
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netfilter import ( "encoding/binary" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // ErrorTargetName is used to mark targets as error targets.
Error targets // shouldn't be reached - an error has occurred if we fall through to one. const ErrorTargetName = "ERROR" // RedirectTargetName is used to mark targets as redirect targets. Redirect // targets should be reached for only NAT and Mangle tables. These targets will // change the destination port and/or IP for packets. const RedirectTargetName = "REDIRECT" // SNATTargetName is used to mark targets as SNAT targets. SNAT targets should // be reached for only NAT table. These targets will change the source port // and/or IP for packets. const SNATTargetName = "SNAT" func init() { // Standard targets include ACCEPT, DROP, RETURN, and JUMP. registerTargetMaker(&standardTargetMaker{ NetworkProtocol: header.IPv4ProtocolNumber, }) registerTargetMaker(&standardTargetMaker{ NetworkProtocol: header.IPv6ProtocolNumber, }) // Both user chains and actual errors are represented in iptables by // error targets. registerTargetMaker(&errorTargetMaker{ NetworkProtocol: header.IPv4ProtocolNumber, }) registerTargetMaker(&errorTargetMaker{ NetworkProtocol: header.IPv6ProtocolNumber, }) registerTargetMaker(&redirectTargetMaker{ NetworkProtocol: header.IPv4ProtocolNumber, }) registerTargetMaker(&nfNATTargetMaker{ NetworkProtocol: header.IPv6ProtocolNumber, }) registerTargetMaker(&snatTargetMakerV4{ NetworkProtocol: header.IPv4ProtocolNumber, }) registerTargetMaker(&snatTargetMakerV6{ NetworkProtocol: header.IPv6ProtocolNumber, }) } // The stack package provides some basic, useful targets for us. The following // types wrap them for compatibility with the extension system. type acceptTarget struct { stack.AcceptTarget } func (at *acceptTarget) id() targetID { return targetID{ networkProtocol: at.NetworkProtocol, } } type dropTarget struct { stack.DropTarget } func (dt *dropTarget) id() targetID { return targetID{ networkProtocol: dt.NetworkProtocol, } } type errorTarget struct { stack.ErrorTarget } func (et *errorTarget) id() targetID { return targetID{ name: ErrorTargetName, networkProtocol: et.NetworkProtocol, } } type userChainTarget struct { stack.UserChainTarget } func (uc *userChainTarget) id() targetID { return targetID{ name: ErrorTargetName, networkProtocol: uc.NetworkProtocol, } } type returnTarget struct { stack.ReturnTarget } func (rt *returnTarget) id() targetID { return targetID{ networkProtocol: rt.NetworkProtocol, } } type redirectTarget struct { stack.RedirectTarget // addr must be (un)marshalled when reading and writing the target to // userspace, but does not affect behavior. addr tcpip.Address } func (rt *redirectTarget) id() targetID { return targetID{ name: RedirectTargetName, networkProtocol: rt.NetworkProtocol, } } type snatTarget struct { stack.SNATTarget } func (st *snatTarget) id() targetID { return targetID{ name: SNATTargetName, networkProtocol: st.NetworkProtocol, } } type standardTargetMaker struct { NetworkProtocol tcpip.NetworkProtocolNumber } func (sm *standardTargetMaker) id() targetID { // Standard targets have the empty string as a name and no revisions. return targetID{ networkProtocol: sm.NetworkProtocol, } } func (*standardTargetMaker) marshal(target target) []byte { // Translate verdicts the same way as the iptables tool. var verdict int32 switch tg := target.(type) { case *acceptTarget: verdict = -linux.NF_ACCEPT - 1 case *dropTarget: verdict = -linux.NF_DROP - 1 case *returnTarget: verdict = linux.NF_RETURN case *JumpTarget: verdict = int32(tg.Offset) default: panic(fmt.Errorf("unknown target of type %T", target)) } // The target's name will be the empty string. 
xt := linux.XTStandardTarget{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTStandardTarget, }, Verdict: verdict, } return marshal.Marshal(&xt) } func (*standardTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { if len(buf) != linux.SizeOfXTStandardTarget { nflog("buf has wrong size for standard target %d", len(buf)) return nil, syserr.ErrInvalidArgument } var standardTarget linux.XTStandardTarget standardTarget.UnmarshalUnsafe(buf[:standardTarget.SizeBytes()]) if standardTarget.Verdict < 0 { // A Verdict < 0 indicates a non-jump verdict. return translateToStandardTarget(standardTarget.Verdict, filter.NetworkProtocol()) } // A verdict >= 0 indicates a jump. return &JumpTarget{ Offset: uint32(standardTarget.Verdict), NetworkProtocol: filter.NetworkProtocol(), }, nil } type errorTargetMaker struct { NetworkProtocol tcpip.NetworkProtocolNumber } func (em *errorTargetMaker) id() targetID { // Error targets have no revision. return targetID{ name: ErrorTargetName, networkProtocol: em.NetworkProtocol, } } func (*errorTargetMaker) marshal(target target) []byte { var errorName string switch tg := target.(type) { case *errorTarget: errorName = ErrorTargetName case *userChainTarget: errorName = tg.Name default: panic(fmt.Sprintf("errorMakerTarget cannot marshal unknown type %T", target)) } // This is an error target named error xt := linux.XTErrorTarget{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTErrorTarget, }, } copy(xt.Name[:], errorName) copy(xt.Target.Name[:], ErrorTargetName) return marshal.Marshal(&xt) } func (*errorTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { if len(buf) != linux.SizeOfXTErrorTarget { nflog("buf has insufficient size for error target %d", len(buf)) return nil, syserr.ErrInvalidArgument } var errTgt linux.XTErrorTarget buf = buf[:linux.SizeOfXTErrorTarget] errTgt.UnmarshalUnsafe(buf) // Error targets are used in 2 cases: // * An actual error case. These rules have an error named // ErrorTargetName. The last entry of the table is usually an error // case to catch any packets that somehow fall through every rule. // * To mark the start of a user defined chain. These // rules have an error with the name of the chain. switch name := errTgt.Name.String(); name { case ErrorTargetName: return &errorTarget{stack.ErrorTarget{ NetworkProtocol: filter.NetworkProtocol(), }}, nil default: // User defined chain. 
return &userChainTarget{stack.UserChainTarget{ Name: name, NetworkProtocol: filter.NetworkProtocol(), }}, nil } } type redirectTargetMaker struct { NetworkProtocol tcpip.NetworkProtocolNumber } func (rm *redirectTargetMaker) id() targetID { return targetID{ name: RedirectTargetName, networkProtocol: rm.NetworkProtocol, } } func (*redirectTargetMaker) marshal(target target) []byte { rt := target.(*redirectTarget) // This is a redirect target named redirect xt := linux.XTRedirectTarget{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTRedirectTarget, }, } copy(xt.Target.Name[:], RedirectTargetName) xt.NfRange.RangeSize = 1 xt.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_PROTO_SPECIFIED xt.NfRange.RangeIPV4.MinPort = htons(rt.Port) xt.NfRange.RangeIPV4.MaxPort = xt.NfRange.RangeIPV4.MinPort return marshal.Marshal(&xt) } func (*redirectTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { if len(buf) < linux.SizeOfXTRedirectTarget { nflog("redirectTargetMaker: buf has insufficient size for redirect target %d", len(buf)) return nil, syserr.ErrInvalidArgument } if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { nflog("redirectTargetMaker: bad proto %d", p) return nil, syserr.ErrInvalidArgument } var rt linux.XTRedirectTarget buf = buf[:linux.SizeOfXTRedirectTarget] rt.UnmarshalUnsafe(buf) // Copy linux.XTRedirectTarget to stack.RedirectTarget. target := redirectTarget{RedirectTarget: stack.RedirectTarget{ NetworkProtocol: filter.NetworkProtocol(), }} // RangeSize should be 1. nfRange := rt.NfRange if nfRange.RangeSize != 1 { nflog("redirectTargetMaker: bad rangesize %d", nfRange.RangeSize) return nil, syserr.ErrInvalidArgument } // Also check if we need to map ports or IP. // For now, redirect target only supports destination port change. // Port range and IP range are not supported yet. 
if nfRange.RangeIPV4.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED { nflog("redirectTargetMaker: invalid range flags %d", nfRange.RangeIPV4.Flags) return nil, syserr.ErrInvalidArgument } if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort { nflog("redirectTargetMaker: MinPort != MaxPort (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort) return nil, syserr.ErrInvalidArgument } if nfRange.RangeIPV4.MinIP != nfRange.RangeIPV4.MaxIP { nflog("redirectTargetMaker: MinIP != MaxIP (%d, %d)", nfRange.RangeIPV4.MinIP, nfRange.RangeIPV4.MaxIP) return nil, syserr.ErrInvalidArgument } target.addr = tcpip.Address(nfRange.RangeIPV4.MinIP[:]) target.Port = ntohs(nfRange.RangeIPV4.MinPort) return &target, nil } // +marshal type nfNATTarget struct { Target linux.XTEntryTarget Range linux.NFNATRange } const nfNATMarshalledSize = linux.SizeOfXTEntryTarget + linux.SizeOfNFNATRange type nfNATTargetMaker struct { NetworkProtocol tcpip.NetworkProtocolNumber } func (rm *nfNATTargetMaker) id() targetID { return targetID{ name: RedirectTargetName, networkProtocol: rm.NetworkProtocol, } } func (*nfNATTargetMaker) marshal(target target) []byte { rt := target.(*redirectTarget) nt := nfNATTarget{ Target: linux.XTEntryTarget{ TargetSize: nfNATMarshalledSize, }, Range: linux.NFNATRange{ Flags: linux.NF_NAT_RANGE_PROTO_SPECIFIED, }, } copy(nt.Target.Name[:], RedirectTargetName) copy(nt.Range.MinAddr[:], rt.addr) copy(nt.Range.MaxAddr[:], rt.addr) nt.Range.MinProto = htons(rt.Port) nt.Range.MaxProto = nt.Range.MinProto return marshal.Marshal(&nt) } func (*nfNATTargetMaker) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { if size := nfNATMarshalledSize; len(buf) < size { nflog("nfNATTargetMaker: buf has insufficient size (%d) for nfNAT target (%d)", len(buf), size) return nil, syserr.ErrInvalidArgument } if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { nflog("nfNATTargetMaker: bad proto %d", p) return nil, syserr.ErrInvalidArgument } var natRange linux.NFNATRange buf = buf[linux.SizeOfXTEntryTarget:nfNATMarshalledSize] natRange.UnmarshalUnsafe(buf) // We don't support port or address ranges. if natRange.MinAddr != natRange.MaxAddr { nflog("nfNATTargetMaker: MinAddr and MaxAddr are different") return nil, syserr.ErrInvalidArgument } if natRange.MinProto != natRange.MaxProto { nflog("nfNATTargetMaker: MinProto and MaxProto are different") return nil, syserr.ErrInvalidArgument } // For now, redirect target only supports destination change. if natRange.Flags != linux.NF_NAT_RANGE_PROTO_SPECIFIED { nflog("nfNATTargetMaker: invalid range flags %d", natRange.Flags) return nil, syserr.ErrInvalidArgument } target := redirectTarget{ RedirectTarget: stack.RedirectTarget{ NetworkProtocol: filter.NetworkProtocol(), Port: ntohs(natRange.MinProto), }, addr: tcpip.Address(natRange.MinAddr[:]), } return &target, nil } type snatTargetMakerV4 struct { NetworkProtocol tcpip.NetworkProtocolNumber } func (st *snatTargetMakerV4) id() targetID { return targetID{ name: SNATTargetName, networkProtocol: st.NetworkProtocol, } } func (*snatTargetMakerV4) marshal(target target) []byte { st := target.(*snatTarget) // This is a snat target named snat.
xt := linux.XTSNATTarget{ Target: linux.XTEntryTarget{ TargetSize: linux.SizeOfXTSNATTarget, }, } copy(xt.Target.Name[:], SNATTargetName) xt.NfRange.RangeSize = 1 xt.NfRange.RangeIPV4.Flags |= linux.NF_NAT_RANGE_MAP_IPS | linux.NF_NAT_RANGE_PROTO_SPECIFIED xt.NfRange.RangeIPV4.MinPort = htons(st.Port) xt.NfRange.RangeIPV4.MaxPort = xt.NfRange.RangeIPV4.MinPort copy(xt.NfRange.RangeIPV4.MinIP[:], st.Addr) copy(xt.NfRange.RangeIPV4.MaxIP[:], st.Addr) return marshal.Marshal(&xt) } func (*snatTargetMakerV4) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { if len(buf) < linux.SizeOfXTSNATTarget { nflog("snatTargetMakerV4: buf has insufficient size for snat target %d", len(buf)) return nil, syserr.ErrInvalidArgument } if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { nflog("snatTargetMakerV4: bad proto %d", p) return nil, syserr.ErrInvalidArgument } var st linux.XTSNATTarget buf = buf[:linux.SizeOfXTSNATTarget] st.UnmarshalUnsafe(buf) // Copy linux.XTSNATTarget to stack.SNATTarget. target := snatTarget{SNATTarget: stack.SNATTarget{ NetworkProtocol: filter.NetworkProtocol(), }} // RangeSize should be 1. nfRange := st.NfRange if nfRange.RangeSize != 1 { nflog("snatTargetMakerV4: bad rangesize %d", nfRange.RangeSize) return nil, syserr.ErrInvalidArgument } // TODO(gvisor.dev/issue/5772): If the rule doesn't specify the source port, // choose one automatically. if nfRange.RangeIPV4.MinPort == 0 { nflog("snatTargetMakerV4: snat target needs to specify a non-zero port") return nil, syserr.ErrInvalidArgument } if nfRange.RangeIPV4.MinPort != nfRange.RangeIPV4.MaxPort { nflog("snatTargetMakerV4: MinPort != MaxPort (%d, %d)", nfRange.RangeIPV4.MinPort, nfRange.RangeIPV4.MaxPort) return nil, syserr.ErrInvalidArgument } if nfRange.RangeIPV4.MinIP != nfRange.RangeIPV4.MaxIP { nflog("snatTargetMakerV4: MinIP != MaxIP (%d, %d)", nfRange.RangeIPV4.MinIP, nfRange.RangeIPV4.MaxIP) return nil, syserr.ErrInvalidArgument } target.Addr = tcpip.Address(nfRange.RangeIPV4.MinIP[:]) target.Port = ntohs(nfRange.RangeIPV4.MinPort) return &target, nil } type snatTargetMakerV6 struct { NetworkProtocol tcpip.NetworkProtocolNumber } func (st *snatTargetMakerV6) id() targetID { return targetID{ name: SNATTargetName, networkProtocol: st.NetworkProtocol, revision: 1, } } func (*snatTargetMakerV6) marshal(target target) []byte { st := target.(*snatTarget) nt := nfNATTarget{ Target: linux.XTEntryTarget{ TargetSize: nfNATMarshalledSize, }, Range: linux.NFNATRange{ Flags: linux.NF_NAT_RANGE_MAP_IPS | linux.NF_NAT_RANGE_PROTO_SPECIFIED, }, } copy(nt.Target.Name[:], SNATTargetName) copy(nt.Range.MinAddr[:], st.Addr) copy(nt.Range.MaxAddr[:], st.Addr) nt.Range.MinProto = htons(st.Port) nt.Range.MaxProto = nt.Range.MinProto return marshal.Marshal(&nt) } func (*snatTargetMakerV6) unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) { if size := nfNATMarshalledSize; len(buf) < size { nflog("snatTargetMakerV6: buf has insufficient size (%d) for SNAT V6 target (%d)", len(buf), size) return nil, syserr.ErrInvalidArgument } if p := filter.Protocol; p != header.TCPProtocolNumber && p != header.UDPProtocolNumber { nflog("snatTargetMakerV6: bad proto %d", p) return nil, syserr.ErrInvalidArgument } var natRange linux.NFNATRange buf = buf[linux.SizeOfXTEntryTarget:nfNATMarshalledSize] natRange.UnmarshalUnsafe(buf) // TODO(gvisor.dev/issue/5697): Support port or address ranges.
if natRange.MinAddr != natRange.MaxAddr { nflog("snatTargetMakerV6: MinAddr and MaxAddr are different") return nil, syserr.ErrInvalidArgument } if natRange.MinProto != natRange.MaxProto { nflog("snatTargetMakerV6: MinProto and MaxProto are different") return nil, syserr.ErrInvalidArgument } // TODO(gvisor.dev/issue/5698): Support other NF_NAT_RANGE flags. if natRange.Flags != linux.NF_NAT_RANGE_MAP_IPS|linux.NF_NAT_RANGE_PROTO_SPECIFIED { nflog("snatTargetMakerV6: invalid range flags %d", natRange.Flags) return nil, syserr.ErrInvalidArgument } target := snatTarget{ SNATTarget: stack.SNATTarget{ NetworkProtocol: filter.NetworkProtocol(), Addr: tcpip.Address(natRange.MinAddr[:]), Port: ntohs(natRange.MinProto), }, } return &target, nil } // translateToStandardTarget translates from the value in a // linux.XTStandardTarget to a stack.Verdict. func translateToStandardTarget(val int32, netProto tcpip.NetworkProtocolNumber) (target, *syserr.Error) { switch val { case -linux.NF_ACCEPT - 1: return &acceptTarget{stack.AcceptTarget{ NetworkProtocol: netProto, }}, nil case -linux.NF_DROP - 1: return &dropTarget{stack.DropTarget{ NetworkProtocol: netProto, }}, nil case -linux.NF_QUEUE - 1: nflog("unsupported iptables verdict QUEUE") return nil, syserr.ErrInvalidArgument case linux.NF_RETURN: return &returnTarget{stack.ReturnTarget{ NetworkProtocol: netProto, }}, nil default: nflog("unknown iptables verdict %d", val) return nil, syserr.ErrInvalidArgument } } // parseTarget parses a target from optVal. optVal should contain only the // target. func parseTarget(filter stack.IPHeaderFilter, optVal []byte, ipv6 bool) (stack.Target, *syserr.Error) { nflog("set entries: parsing target of size %d", len(optVal)) if len(optVal) < linux.SizeOfXTEntryTarget { nflog("optVal has insufficient size for entry target %d", len(optVal)) return nil, syserr.ErrInvalidArgument } var target linux.XTEntryTarget target.UnmarshalUnsafe(optVal[:target.SizeBytes()]) return unmarshalTarget(target, filter, optVal) } // JumpTarget implements stack.Target. type JumpTarget struct { // Offset is the byte offset of the rule to jump to. It is used for // marshaling and unmarshaling. Offset uint32 // RuleNum is the rule to jump to. RuleNum int // NetworkProtocol is the network protocol the target is used with. NetworkProtocol tcpip.NetworkProtocolNumber } // id returns the JumpTarget's targetID. func (jt *JumpTarget) id() targetID { return targetID{ networkProtocol: jt.NetworkProtocol, } } // Action implements stack.Target.Action. func (jt *JumpTarget) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) { return stack.RuleJump, jt.RuleNum } func ntohs(port uint16) uint16 { buf := make([]byte, 2) binary.BigEndian.PutUint16(buf, port) return hostarch.ByteOrder.Uint16(buf) } func htons(port uint16) uint16 { buf := make([]byte, 2) hostarch.ByteOrder.PutUint16(buf, port) return binary.BigEndian.Uint16(buf) }
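The htons/ntohs helpers above are symmetric byte swaps built from two byte-order views of the same two-byte buffer. A standalone sketch of the same trick, hard-coding little-endian as a stand-in for hostarch.ByteOrder (an assumption about the host; on a big-endian host both views agree and the functions are the identity):

package main

import (
	"encoding/binary"
	"fmt"
)

// swap16 writes the value in one byte order and reads it back in the other,
// the same pattern htons/ntohs use above via hostarch.ByteOrder.
func swap16(v uint16) uint16 {
	buf := make([]byte, 2)
	binary.LittleEndian.PutUint16(buf, v) // host order on little-endian machines
	return binary.BigEndian.Uint16(buf)   // network order
}

func main() {
	fmt.Printf("%#04x -> %#04x\n", uint16(0x1234), swap16(0x1234)) // 0x1234 -> 0x3412
}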
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "runtime/trace" "sort" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/usermem" ) const ( // maxStackDebugBytes is the maximum number of user stack bytes that may be // printed by debugDumpStack. maxStackDebugBytes = 1024 // maxCodeDebugBytes is the maximum number of user code bytes that may be // printed by debugDumpCode. maxCodeDebugBytes = 128 ) // Infof logs a formatted info message by calling log.Infof. func (t *Task) Infof(fmt string, v ...interface{}) { if log.IsLogging(log.Info) { log.InfofAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) } } // Warningf logs a warning string by calling log.Warningf. func (t *Task) Warningf(fmt string, v ...interface{}) { if log.IsLogging(log.Warning) { log.WarningfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) } } // Debugf creates a debug string that includes the task ID. func (t *Task) Debugf(fmt string, v ...interface{}) { if log.IsLogging(log.Debug) { log.DebugfAtDepth(1, t.logPrefix.Load().(string)+fmt, v...) } } // IsLogging returns true iff this level is being logged. func (t *Task) IsLogging(level log.Level) bool { return log.IsLogging(level) } // DebugDumpState logs task state at log level debug. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) DebugDumpState() { t.debugDumpRegisters() t.debugDumpStack() t.debugDumpCode() if mm := t.MemoryManager(); mm != nil { t.Debugf("Mappings:\n%s", mm) } t.Debugf("FDTable:\n%s", t.fdTable) } // debugDumpRegisters logs register state at log level debug. // // Preconditions: The caller must be running on the task goroutine.
func (t *Task) debugDumpRegisters() { if !t.IsLogging(log.Debug) { return } regmap, err := t.Arch().RegisterMap() if err != nil { t.Debugf("Registers: %v", err) } else { t.Debugf("Registers:") var regs []string for reg := range regmap { regs = append(regs, reg) } sort.Strings(regs) for _, reg := range regs { t.Debugf("%-8s = %016x", reg, regmap[reg]) } } } // debugDumpStack logs user stack contents at log level debug. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) debugDumpStack() { if !t.IsLogging(log.Debug) { return } m := t.MemoryManager() if m == nil { t.Debugf("Memory manager for task is gone, skipping application stack dump.") return } t.Debugf("Stack:") start := hostarch.Addr(t.Arch().Stack()) // Round addr down to a 16-byte boundary. start &= ^hostarch.Addr(15) // Print 16 bytes per line, one byte at a time. for offset := uint64(0); offset < maxStackDebugBytes; offset += 16 { addr, ok := start.AddLength(offset) if !ok { break } var data [16]byte n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{ IgnorePermissions: true, }) // Print as much of the line as we can, even if an error was // encountered. if n > 0 { t.Debugf("%x: % x", addr, data[:n]) } if err != nil { t.Debugf("Error reading stack at address %x: %v", addr+hostarch.Addr(n), err) break } } } // debugDumpCode logs user code contents at log level debug. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) debugDumpCode() { if !t.IsLogging(log.Debug) { return } m := t.MemoryManager() if m == nil { t.Debugf("Memory manager for task is gone, skipping application code dump.") return } t.Debugf("Code:") // Print code on both sides of the instruction register. start := hostarch.Addr(t.Arch().IP()) - maxCodeDebugBytes/2 // Round addr down to a 16-byte boundary. start &= ^hostarch.Addr(15) // Print 16 bytes per line, one byte at a time. for offset := uint64(0); offset < maxCodeDebugBytes; offset += 16 { addr, ok := start.AddLength(offset) if !ok { break } var data [16]byte n, err := m.CopyIn(t, addr, data[:], usermem.IOOpts{ IgnorePermissions: true, }) // Print as much of the line as we can, even if an error was // encountered. if n > 0 { t.Debugf("%x: % x", addr, data[:n]) } if err != nil { t.Debugf("Error reading code at address %x: %v", addr+hostarch.Addr(n), err) break } } } // trace definitions. // // Note that all region names are prefixed by ':' in order to ensure that they // are lexically ordered before all system calls, which use the naked system // call name (e.g. "read") for maximum clarity. const ( traceCategory = "task" runRegion = ":run" blockRegion = ":block" cpuidRegion = ":cpuid" faultRegion = ":fault" ) // updateInfoLocked updates the task's cached log prefix and tracing // information to reflect its current thread ID. // // Preconditions: The task's owning TaskSet.mu must be locked. func (t *Task) updateInfoLocked() { // Use the task's TID in the root PID namespace for logging. tid := t.tg.pidns.owner.Root.tids[t] t.logPrefix.Store(fmt.Sprintf("[% 4d] ", tid)) t.rebuildTraceContext(tid) } // rebuildTraceContext rebuilds the trace context. // // Precondition: the passed tid must be the tid in the root namespace. func (t *Task) rebuildTraceContext(tid ThreadID) { // Re-initialize the trace context. if t.traceTask != nil { t.traceTask.End() } // Note that we define the "task type" to be the dynamic TID. This does // not align perfectly with the documentation for "tasks" in the // tracing package.
	// Tasks may be assumed to be bounded by analysis tools. However, if we
	// just use a generic "task" type here, then the "user-defined tasks"
	// page on the tracing dashboard becomes nearly unusable, as it loads
	// all traces from all tasks.
	//
	// We can assume that the number of tasks in the system is not
	// arbitrarily large (in general it won't be, especially for cases
	// where we're collecting a brief profile), so using the TID is a
	// reasonable compromise in this case.
	t.traceContext, t.traceTask = trace.NewTask(context.Background(), fmt.Sprintf("tid:%d", tid))
}

// traceCloneEvent is called when a new task is spawned.
//
// ntid must be the new task's ThreadID in the root namespace.
func (t *Task) traceCloneEvent(ntid ThreadID) {
	if !trace.IsEnabled() {
		return
	}
	trace.Logf(t.traceContext, traceCategory, "spawn: %d", ntid)
}

// traceExitEvent is called when a task exits.
func (t *Task) traceExitEvent() {
	if !trace.IsEnabled() {
		return
	}
	trace.Logf(t.traceContext, traceCategory, "exit status: %s", t.exitStatus)
}

// traceExecEvent is called when a task calls exec.
func (t *Task) traceExecEvent(image *TaskImage) {
	if !trace.IsEnabled() {
		return
	}
	file := image.MemoryManager.Executable()
	if file == nil {
		trace.Logf(t.traceContext, traceCategory, "exec: << unknown >>")
		return
	}
	defer file.DecRef(t)
	trace.Logf(t.traceContext, traceCategory, "exec: %s", file.PathnameWithDeleted(t))
}
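// The standalone sketch below is illustrative only (not part of the file
// above): it shows how the runtime/trace primitives used here (NewTask,
// StartRegion, Logf) fit together, following the same ':'-prefixed region
// naming convention. The task name and region work are hypothetical.
package main

import (
	"context"
	"fmt"
	"runtime/trace"
)

func main() {
	// Group related events under one trace task, analogous to
	// rebuildTraceContext's per-TID task.
	ctx, task := trace.NewTask(context.Background(), fmt.Sprintf("tid:%d", 42))
	defer task.End()

	// Mark a region, analogous to runRegion/blockRegion above. The ':'
	// prefix sorts regions lexically before bare syscall names.
	region := trace.StartRegion(ctx, ":run")
	// ... work attributed to this region would go here ...
	region.End()

	// Log a categorized event, analogous to traceCloneEvent.
	trace.Logf(ctx, "task", "spawn: %d", 43)
}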
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fragmentation

import (
	"math"
	"sort"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

type hole struct {
	first  uint16
	last   uint16
	filled bool
	final  bool
	// pkt is the fragment packet if hole is filled. We keep the whole pkt
	// rather than the fragmented payload to prevent binding to specific
	// buffer types.
	pkt *stack.PacketBuffer
}

type reassembler struct {
	reassemblerEntry
	id        FragmentID
	memSize   int
	proto     uint8
	mu        sync.Mutex
	holes     []hole
	filled    int
	done      bool
	createdAt tcpip.MonotonicTime
	pkt       *stack.PacketBuffer
}

func newReassembler(id FragmentID, clock tcpip.Clock) *reassembler {
	r := &reassembler{
		id:        id,
		createdAt: clock.NowMonotonic(),
	}
	r.holes = append(r.holes, hole{
		first:  0,
		last:   math.MaxUint16,
		filled: false,
		final:  true,
	})
	return r
}

func (r *reassembler) process(first, last uint16, more bool, proto uint8, pkt *stack.PacketBuffer) (*stack.PacketBuffer, uint8, bool, int, error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.done {
		// A concurrent goroutine might have already reassembled
		// the packet and emptied the heap while this goroutine
		// was waiting on the mutex. We don't have to do anything in this case.
		return nil, 0, false, 0, nil
	}

	var holeFound bool
	var memConsumed int
	for i := range r.holes {
		currentHole := &r.holes[i]

		if last < currentHole.first || currentHole.last < first {
			continue
		}
		// For IPv6, overlaps with an existing fragment are explicitly forbidden by
		// RFC 8200 section 4.5:
		//   If any of the fragments being reassembled overlap with any other
		//   fragments being reassembled for the same packet, reassembly of that
		//   packet must be abandoned and all the fragments that have been received
		//   for that packet must be discarded, and no ICMP error messages should be
		//   sent.
		//
		// It is not explicitly forbidden for IPv4, but to keep parity with Linux we
		// disallow it as well:
		// https://github.com/torvalds/linux/blob/38525c6/net/ipv4/inet_fragment.c#L349
		if first < currentHole.first || currentHole.last < last {
			// Incoming fragment only partially fits in the free hole.
			return nil, 0, false, 0, ErrFragmentOverlap
		}
		if !more {
			if !currentHole.final || currentHole.filled && currentHole.last != last {
				// We have another final fragment, which does not perfectly overlap.
				return nil, 0, false, 0, ErrFragmentConflict
			}
		}

		holeFound = true
		if currentHole.filled {
			// Incoming fragment is a duplicate.
			continue
		}

		// We are populating the current hole with the payload and creating a new
		// hole for any unfilled ranges on either end.
		if first > currentHole.first {
			r.holes = append(r.holes, hole{
				first:  currentHole.first,
				last:   first - 1,
				filled: false,
				final:  false,
			})
		}
		if last < currentHole.last && more {
			r.holes = append(r.holes, hole{
				first:  last + 1,
				last:   currentHole.last,
				filled: false,
				final:  currentHole.final,
			})
			currentHole.final = false
		}
		memConsumed = pkt.MemSize()
		r.memSize += memConsumed
		// Update the current hole to precisely match the incoming fragment.
		r.holes[i] = hole{
			first:  first,
			last:   last,
			filled: true,
			final:  currentHole.final,
			pkt:    pkt,
		}
		r.filled++
		// For IPv6, it is possible to have different Protocol values between
		// fragments of a packet (because, unlike IPv4, the Protocol is not used to
		// identify a fragment). In this case, only the Protocol of the first
		// fragment must be used as per RFC 8200 Section 4.5.
		//
		// TODO(gvisor.dev/issue/3648): During reassembly of an IPv6 packet, IP
		// options received in the first fragment should be used - and they should
		// override options from following fragments.
		if first == 0 {
			r.pkt = pkt
			r.proto = proto
		}
		break
	}
	if !holeFound {
		// Incoming fragment is beyond end.
		return nil, 0, false, 0, ErrFragmentConflict
	}

	// Check if all the holes have been filled and we are ready to reassemble.
	if r.filled < len(r.holes) {
		return nil, 0, false, memConsumed, nil
	}

	sort.Slice(r.holes, func(i, j int) bool {
		return r.holes[i].first < r.holes[j].first
	})

	resPkt := r.holes[0].pkt
	for i := 1; i < len(r.holes); i++ {
		stack.MergeFragment(resPkt, r.holes[i].pkt)
	}
	return resPkt, r.proto, true, memConsumed, nil
}

func (r *reassembler) checkDoneOrMark() bool {
	r.mu.Lock()
	prev := r.done
	r.done = true
	r.mu.Unlock()
	return prev
}
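// The fragment bookkeeping above is easiest to see on a toy example. The
// standalone sketch below (illustrative only, not part of the file above)
// replays the hole-splitting rule from process() on plain integers: filling
// the middle of [0, 99] leaves two new holes, one on either side.
package main

import "fmt"

type span struct{ first, last int }

// fill splits hole h around the fragment [first, last], returning the
// remaining unfilled spans, mirroring how process() appends new holes.
func fill(h span, first, last int) []span {
	var holes []span
	if first > h.first {
		holes = append(holes, span{h.first, first - 1})
	}
	if last < h.last {
		holes = append(holes, span{last + 1, h.last})
	}
	return holes
}

func main() {
	// A fragment covering bytes 40..59 of a 100-byte reassembly range.
	fmt.Println(fill(span{0, 99}, 40, 59)) // [{0 39} {60 99}]
}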
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // limitations under the License. package ipv4 import ( "fmt" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // icmpv4DestinationUnreachableSockError is a general ICMPv4 Destination // Unreachable error. // // +stateify savable type icmpv4DestinationUnreachableSockError struct{} // Origin implements tcpip.SockErrorCause. func (*icmpv4DestinationUnreachableSockError) Origin() tcpip.SockErrOrigin { return tcpip.SockExtErrorOriginICMP } // Type implements tcpip.SockErrorCause. func (*icmpv4DestinationUnreachableSockError) Type() uint8 { return uint8(header.ICMPv4DstUnreachable) } // Info implements tcpip.SockErrorCause. func (*icmpv4DestinationUnreachableSockError) Info() uint32 { return 0 } var _ stack.TransportError = (*icmpv4DestinationHostUnreachableSockError)(nil) // icmpv4DestinationHostUnreachableSockError is an ICMPv4 Destination Host // Unreachable error. // // It indicates that a packet was not able to reach the destination host. // // +stateify savable type icmpv4DestinationHostUnreachableSockError struct { icmpv4DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv4DestinationHostUnreachableSockError) Code() uint8 { return uint8(header.ICMPv4HostUnreachable) } // Kind implements stack.TransportError. func (*icmpv4DestinationHostUnreachableSockError) Kind() stack.TransportErrorKind { return stack.DestinationHostUnreachableTransportError } var _ stack.TransportError = (*icmpv4DestinationPortUnreachableSockError)(nil) // icmpv4DestinationPortUnreachableSockError is an ICMPv4 Destination Port // Unreachable error. // // It indicates that a packet reached the destination host, but the transport // protocol was not active on the destination port. // // +stateify savable type icmpv4DestinationPortUnreachableSockError struct { icmpv4DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv4DestinationPortUnreachableSockError) Code() uint8 { return uint8(header.ICMPv4PortUnreachable) } // Kind implements stack.TransportError. func (*icmpv4DestinationPortUnreachableSockError) Kind() stack.TransportErrorKind { return stack.DestinationPortUnreachableTransportError } var _ stack.TransportError = (*icmpv4FragmentationNeededSockError)(nil) // icmpv4FragmentationNeededSockError is an ICMPv4 Destination Unreachable error // due to fragmentation being required but the packet was set to not be // fragmented. // // It indicates that a link exists on the path to the destination with an MTU // that is too small to carry the packet. // // +stateify savable type icmpv4FragmentationNeededSockError struct { icmpv4DestinationUnreachableSockError mtu uint32 } // Code implements tcpip.SockErrorCause. func (*icmpv4FragmentationNeededSockError) Code() uint8 { return uint8(header.ICMPv4FragmentationNeeded) } // Info implements tcpip.SockErrorCause. func (e *icmpv4FragmentationNeededSockError) Info() uint32 { return e.mtu } // Kind implements stack.TransportError. 
func (*icmpv4FragmentationNeededSockError) Kind() stack.TransportErrorKind { return stack.PacketTooBigTransportError } func (e *endpoint) checkLocalAddress(addr tcpip.Address) bool { if e.nic.Spoofing() { return true } if addressEndpoint := e.AcquireAssignedAddress(addr, false, stack.NeverPrimaryEndpoint); addressEndpoint != nil { addressEndpoint.DecRef() return true } return false } // handleControl handles the case when an ICMP error packet contains the headers // of the original packet that caused the ICMP one to be sent. This information // is used to find out which transport endpoint must be notified about the ICMP // packet. We only expect the payload, not the enclosing ICMP packet. func (e *endpoint) handleControl(errInfo stack.TransportError, pkt *stack.PacketBuffer) { h, ok := pkt.Data().PullUp(header.IPv4MinimumSize) if !ok { return } hdr := header.IPv4(h) // We don't use IsValid() here because ICMP only requires that the IP // header plus 8 bytes of the transport header be included. So it's // likely that it is truncated, which would cause IsValid to return // false. // // Drop packet if it doesn't have the basic IPv4 header or if the // original source address doesn't match an address we own. srcAddr := hdr.SourceAddress() if !e.checkLocalAddress(srcAddr) { return } hlen := int(hdr.HeaderLength()) if pkt.Data().Size() < hlen || hdr.FragmentOffset() != 0 { // We won't be able to handle this if it doesn't contain the // full IPv4 header, or if it's a fragment not at offset 0 // (because it won't have the transport header). return } // Keep needed information before trimming header. p := hdr.TransportProtocol() dstAddr := hdr.DestinationAddress() // Skip the ip header, then deliver the error. pkt.Data().DeleteFront(hlen) e.dispatcher.DeliverTransportError(srcAddr, dstAddr, ProtocolNumber, p, errInfo, pkt) } func (e *endpoint) handleICMP(pkt *stack.PacketBuffer) { received := e.stats.icmp.packetsReceived // ICMP packets don't have their TransportHeader fields set. See // icmp/protocol.go:protocol.Parse for a full explanation. v, ok := pkt.Data().PullUp(header.ICMPv4MinimumSize) if !ok { received.invalid.Increment() return } h := header.ICMPv4(v) // Only do in-stack processing if the checksum is correct. if pkt.Data().AsRange().Checksum() != 0xffff { received.invalid.Increment() // It's possible that a raw socket expects to receive this regardless // of checksum errors. If it's an echo request we know it's safe because // we are the only handler, however other types do not cope well with // packets with checksum errors. switch h.Type() { case header.ICMPv4Echo: e.dispatcher.DeliverTransportPacket(header.ICMPv4ProtocolNumber, pkt) } return } iph := header.IPv4(pkt.NetworkHeader().View()) var newOptions header.IPv4Options if opts := iph.Options(); len(opts) != 0 { // RFC 1122 section 3.2.2.6 (page 43) (and similar for other round trip // type ICMP packets): // If a Record Route and/or Time Stamp option is received in an // ICMP Echo Request, this option (these options) SHOULD be // updated to include the current host and included in the IP // header of the Echo Reply message, without "truncation". // Thus, the recorded route will be for the entire round trip. // // So we need to let the option processor know how it should handle them. 
		var op optionsUsage
		if h.Type() == header.ICMPv4Echo {
			op = &optionUsageEcho{}
		} else {
			op = &optionUsageReceive{}
		}
		var optProblem *header.IPv4OptParameterProblem
		newOptions, _, optProblem = e.processIPOptions(pkt, opts, op)
		if optProblem != nil {
			if optProblem.NeedICMP {
				_ = e.protocol.returnError(&icmpReasonParamProblem{
					pointer: optProblem.Pointer,
				}, pkt)
				e.stats.ip.MalformedPacketsReceived.Increment()
			}
			return
		}
		copied := copy(opts, newOptions)
		if copied != len(newOptions) {
			panic(fmt.Sprintf("copied %d bytes of new options, expected %d bytes", copied, len(newOptions)))
		}
		for i := copied; i < len(opts); i++ {
			// Pad with 0 (EOL). RFC 791 page 23 says "The padding is zero".
			opts[i] = byte(header.IPv4OptionListEndType)
		}
	}

	// TODO(b/112892170): Meaningfully handle all ICMP types.
	switch h.Type() {
	case header.ICMPv4Echo:
		received.echoRequest.Increment()

		sent := e.stats.icmp.packetsSent
		if !e.protocol.stack.AllowICMPMessage() {
			sent.rateLimited.Increment()
			return
		}

		// DeliverTransportPacket will take ownership of pkt so don't use it beyond
		// this point. Make a deep copy of the data before pkt gets sent as we will
		// be modifying fields.
		//
		// TODO(gvisor.dev/issue/4399): The copy may not be needed if there are no
		// waiting endpoints. Consider moving responsibility for doing the copy to
		// DeliverTransportPacket so that it is only done when needed.
		replyData := pkt.Data().AsRange().ToOwnedView()
		ipHdr := header.IPv4(pkt.NetworkHeader().View())
		localAddressBroadcast := pkt.NetworkPacketInfo.LocalAddressBroadcast

		// It's possible that a raw socket expects to receive this.
		e.dispatcher.DeliverTransportPacket(header.ICMPv4ProtocolNumber, pkt)
		pkt = nil

		// Take the base of the incoming request IP header but replace the options.
		replyHeaderLength := uint8(header.IPv4MinimumSize + len(newOptions))
		replyIPHdr := header.IPv4(append(iph[:header.IPv4MinimumSize:header.IPv4MinimumSize], newOptions...))
		replyIPHdr.SetHeaderLength(replyHeaderLength)

		// As per RFC 1122 section 3.2.1.3, when a host sends any datagram, the IP
		// source address MUST be one of its own IP addresses (but not a broadcast
		// or multicast address).
		localAddr := ipHdr.DestinationAddress()
		if localAddressBroadcast || header.IsV4MulticastAddress(localAddr) {
			localAddr = ""
		}

		r, err := e.protocol.stack.FindRoute(e.nic.ID(), localAddr, ipHdr.SourceAddress(), ProtocolNumber, false /* multicastLoop */)
		if err != nil {
			// If we cannot find a route to the destination, silently drop the packet.
			return
		}
		defer r.Release()

		// TODO(gvisor.dev/issue/3810): When adding protocol numbers into the
		// header information, we may have to change this code to handle the
		// ICMP header no longer being in the data buffer.

		// Because IP and ICMP are so closely intertwined, we need to handcraft our
		// IP header to be able to follow RFC 792. The wording on page 13 is as
		// follows:
		//   IP Fields:
		//   Addresses
		//     The address of the source in an echo message will be the
		//     destination of the echo reply message. To form an echo reply
		//     message, the source and destination addresses are simply reversed,
		//     the type code changed to 0, and the checksum recomputed.
		//
		// This was interpreted by early implementors to mean that all options must
		// be copied from the echo request IP header to the echo reply IP header
		// and this behaviour is still relied upon by some applications.
		//
		// Create a copy of the IP header we received, options and all, and change
		// the fields we need to alter.
// // We need to produce the entire packet in the data segment in order to // use WriteHeaderIncludedPacket(). WriteHeaderIncludedPacket sets the // total length and the header checksum so we don't need to set those here. replyIPHdr.SetSourceAddress(r.LocalAddress()) replyIPHdr.SetDestinationAddress(r.RemoteAddress()) replyIPHdr.SetTTL(r.DefaultTTL()) replyICMPHdr := header.ICMPv4(replyData) replyICMPHdr.SetType(header.ICMPv4EchoReply) replyICMPHdr.SetChecksum(0) replyICMPHdr.SetChecksum(^header.Checksum(replyData, 0)) replyVV := buffer.View(replyIPHdr).ToVectorisedView() replyVV.AppendView(replyData) replyPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(r.MaxHeaderLength()), Data: replyVV, }) replyPkt.TransportProtocolNumber = header.ICMPv4ProtocolNumber if err := r.WriteHeaderIncludedPacket(replyPkt); err != nil { sent.dropped.Increment() return } sent.echoReply.Increment() case header.ICMPv4EchoReply: received.echoReply.Increment() e.dispatcher.DeliverTransportPacket(header.ICMPv4ProtocolNumber, pkt) case header.ICMPv4DstUnreachable: received.dstUnreachable.Increment() mtu := h.MTU() code := h.Code() pkt.Data().DeleteFront(header.ICMPv4MinimumSize) switch code { case header.ICMPv4HostUnreachable: e.handleControl(&icmpv4DestinationHostUnreachableSockError{}, pkt) case header.ICMPv4PortUnreachable: e.handleControl(&icmpv4DestinationPortUnreachableSockError{}, pkt) case header.ICMPv4FragmentationNeeded: networkMTU, err := calculateNetworkMTU(uint32(mtu), header.IPv4MinimumSize) if err != nil { networkMTU = 0 } e.handleControl(&icmpv4FragmentationNeededSockError{mtu: networkMTU}, pkt) } case header.ICMPv4SrcQuench: received.srcQuench.Increment() case header.ICMPv4Redirect: received.redirect.Increment() case header.ICMPv4TimeExceeded: received.timeExceeded.Increment() case header.ICMPv4ParamProblem: received.paramProblem.Increment() case header.ICMPv4Timestamp: received.timestamp.Increment() case header.ICMPv4TimestampReply: received.timestampReply.Increment() case header.ICMPv4InfoRequest: received.infoRequest.Increment() case header.ICMPv4InfoReply: received.infoReply.Increment() default: received.invalid.Increment() } } // ======= ICMP Error packet generation ========= // icmpReason is a marker interface for IPv4 specific ICMP errors. type icmpReason interface { isICMPReason() // isForwarding indicates whether or not the error arose while attempting to // forward a packet. isForwarding() bool } // icmpReasonPortUnreachable is an error where the transport protocol has no // listener and no alternative means to inform the sender. type icmpReasonPortUnreachable struct{} func (*icmpReasonPortUnreachable) isICMPReason() {} func (*icmpReasonPortUnreachable) isForwarding() bool { return false } // icmpReasonProtoUnreachable is an error where the transport protocol is // not supported. type icmpReasonProtoUnreachable struct{} func (*icmpReasonProtoUnreachable) isICMPReason() {} func (*icmpReasonProtoUnreachable) isForwarding() bool { return false } // icmpReasonTTLExceeded is an error where a packet's time to live exceeded in // transit to its final destination, as per RFC 792 page 6, Time Exceeded // Message. type icmpReasonTTLExceeded struct{} func (*icmpReasonTTLExceeded) isICMPReason() {} func (*icmpReasonTTLExceeded) isForwarding() bool { // If we hit a TTL Exceeded error, then we know we are operating as a router. 
// As per RFC 792 page 6, Time Exceeded Message, // // If the gateway processing a datagram finds the time to live field // is zero it must discard the datagram. The gateway may also notify // the source host via the time exceeded message. return true } // icmpReasonReassemblyTimeout is an error where insufficient fragments are // received to complete reassembly of a packet within a configured time after // the reception of the first-arriving fragment of that packet. type icmpReasonReassemblyTimeout struct{} func (*icmpReasonReassemblyTimeout) isICMPReason() {} func (*icmpReasonReassemblyTimeout) isForwarding() bool { return false } // icmpReasonParamProblem is an error to use to request a Parameter Problem // message to be sent. type icmpReasonParamProblem struct { pointer byte forwarding bool } func (*icmpReasonParamProblem) isICMPReason() {} func (r *icmpReasonParamProblem) isForwarding() bool { return r.forwarding } // icmpReasonNetworkUnreachable is an error in which the network specified in // the internet destination field of the datagram is unreachable. type icmpReasonNetworkUnreachable struct{} func (*icmpReasonNetworkUnreachable) isICMPReason() {} func (*icmpReasonNetworkUnreachable) isForwarding() bool { // If we hit a Net Unreachable error, then we know we are operating as // a router. As per RFC 792 page 5, Destination Unreachable Message, // // If, according to the information in the gateway's routing tables, // the network specified in the internet destination field of a // datagram is unreachable, e.g., the distance to the network is // infinity, the gateway may send a destination unreachable message to // the internet source host of the datagram. return true } // icmpReasonFragmentationNeeded is an error where a packet requires // fragmentation while also having the Don't Fragment flag set, as per RFC 792 // page 3, Destination Unreachable Message. type icmpReasonFragmentationNeeded struct{} func (*icmpReasonFragmentationNeeded) isICMPReason() {} func (*icmpReasonFragmentationNeeded) isForwarding() bool { // If we hit a Don't Fragment error, then we know we are operating as a router. // As per RFC 792 page 4, Destination Unreachable Message, // // Another case is when a datagram must be fragmented to be forwarded by a // gateway yet the Don't Fragment flag is on. In this case the gateway must // discard the datagram and may return a destination unreachable message. return true } // icmpReasonHostUnreachable is an error in which the host specified in the // internet destination field of the datagram is unreachable. type icmpReasonHostUnreachable struct{} func (*icmpReasonHostUnreachable) isICMPReason() {} func (*icmpReasonHostUnreachable) isForwarding() bool { // If we hit a Host Unreachable error, then we know we are operating as a // router. As per RFC 792 page 5, Destination Unreachable Message, // // In addition, in some networks, the gateway may be able to determine // if the internet destination host is unreachable. Gateways in these // networks may send destination unreachable messages to the source host // when the destination host is unreachable. return true } // returnError takes an error descriptor and generates the appropriate ICMP // error packet for IPv4 and sends it back to the remote device that sent // the problematic packet. It incorporates as much of that packet as // possible as well as any error metadata as is available. returnError // expects pkt to hold a valid IPv4 packet as per the wire format. 
func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer) tcpip.Error {
	origIPHdr := header.IPv4(pkt.NetworkHeader().View())
	origIPHdrSrc := origIPHdr.SourceAddress()
	origIPHdrDst := origIPHdr.DestinationAddress()

	// We check we are responding only when we are allowed to.
	// See RFC 1812 section 4.3.2.7 (shown below).
	//
	// =========
	// 4.3.2.7 When Not to Send ICMP Errors
	//
	//  An ICMP error message MUST NOT be sent as the result of receiving:
	//
	//  o An ICMP error message, or
	//
	//  o A packet which fails the IP header validation tests described in
	//    Section [5.2.2] (except where that section specifically permits
	//    the sending of an ICMP error message), or
	//
	//  o A packet destined to an IP broadcast or IP multicast address, or
	//
	//  o A packet sent as a Link Layer broadcast or multicast, or
	//
	//  o Any fragment of a datagram other then the first fragment (i.e., a
	//    packet for which the fragment offset in the IP header is nonzero).
	//
	// TODO(gvisor.dev/issues/4058): Make sure we don't send ICMP errors in
	// response to a non-initial fragment, but it currently cannot happen.
	if pkt.NetworkPacketInfo.LocalAddressBroadcast || header.IsV4MulticastAddress(origIPHdrDst) || origIPHdrSrc == header.IPv4Any {
		return nil
	}

	// If we are operating as a router/gateway, don't use the packet's destination
	// address as the response's source address, as we do not own the destination
	// address of a packet we are forwarding.
	localAddr := origIPHdrDst
	if reason.isForwarding() {
		localAddr = ""
	}

	// Even if we were able to receive a packet from some remote, we may not have
	// a route to it - the remote may be blocked via routing rules. We must always
	// consult our routing table and find a route to the remote before sending any
	// packet.
	route, err := p.stack.FindRoute(pkt.NICID, localAddr, origIPHdrSrc, ProtocolNumber, false /* multicastLoop */)
	if err != nil {
		return err
	}
	defer route.Release()

	p.mu.Lock()
	// We retrieve an endpoint using the newly constructed route's NICID rather
	// than the packet's NICID. The packet's NICID corresponds to the NIC on
	// which it arrived, which isn't necessarily the same as the NIC on which it
	// will be transmitted. On the other hand, the route's NIC *is* guaranteed
	// to be the NIC on which the packet will be transmitted.
	netEP, ok := p.mu.eps[route.NICID()]
	p.mu.Unlock()
	if !ok {
		return &tcpip.ErrNotConnected{}
	}

	sent := netEP.stats.icmp.packetsSent

	if !p.stack.AllowICMPMessage() {
		sent.rateLimited.Increment()
		return nil
	}

	transportHeader := pkt.TransportHeader().View()

	// Don't respond to icmp error packets.
	if origIPHdr.Protocol() == uint8(header.ICMPv4ProtocolNumber) {
		// TODO(gvisor.dev/issue/3810):
		// Unfortunately the current stack pretty much always has ICMPv4 headers
		// in the Data section of the packet but there is no guarantee that is the
		// case. If this is the case grab the header to make it like all other
		// packet types. When this is cleaned up the Consume should be removed.
		if transportHeader.IsEmpty() {
			var ok bool
			transportHeader, ok = pkt.TransportHeader().Consume(header.ICMPv4MinimumSize)
			if !ok {
				return nil
			}
		} else if transportHeader.Size() < header.ICMPv4MinimumSize {
			return nil
		}

		// We need to decide to explicitly name the packets we can respond to or
		// the ones we can not respond to. The decision is somewhat arbitrary and
		// if problems arise this could be reversed.
		// It was judged less of a breach of protocol to not respond to unknown
		// non-error packets than to respond to unknown error packets, so we take
		// the first approach.
		switch header.ICMPv4(transportHeader).Type() {
		case
			header.ICMPv4EchoReply,
			header.ICMPv4Echo,
			header.ICMPv4Timestamp,
			header.ICMPv4TimestampReply,
			header.ICMPv4InfoRequest,
			header.ICMPv4InfoReply:
		default:
			// Assume any type we don't know about may be an error type.
			return nil
		}
	}

	// Now work out how much of the triggering packet we should return.
	// As per RFC 1812 Section 4.3.2.3
	//
	//   ICMP datagram SHOULD contain as much of the original
	//   datagram as possible without the length of the ICMP
	//   datagram exceeding 576 bytes.
	//
	// NOTE: The above RFC referenced is different from the original
	// recommendation in RFC 1122 and RFC 792 where it mentioned that at
	// least 8 bytes of the payload must be included. Today Linux and other
	// systems implement the RFC 1812 definition and not the original
	// requirement. We treat 8 bytes as the minimum but will try to send more.
	mtu := int(route.MTU())
	const maxIPData = header.IPv4MinimumProcessableDatagramSize - header.IPv4MinimumSize
	if mtu > maxIPData {
		mtu = maxIPData
	}
	available := mtu - header.ICMPv4MinimumSize

	if available < len(origIPHdr)+header.ICMPv4MinimumErrorPayloadSize {
		return nil
	}

	payloadLen := len(origIPHdr) + transportHeader.Size() + pkt.Data().Size()
	if payloadLen > available {
		payloadLen = available
	}

	// The buffers used by pkt may be used elsewhere in the system.
	// For example, an AF_RAW or AF_PACKET socket may use what the transport
	// protocol considers an unreachable destination. Thus we deep copy pkt to
	// prevent multiple ownership and SR errors. The new copy is a vectorized
	// view with the entire incoming IP packet reassembled and truncated as
	// required. This is now the payload of the new ICMP packet and no longer
	// considered a packet in its own right.
	newHeader := append(buffer.View(nil), origIPHdr...)
	newHeader = append(newHeader, transportHeader...)
payload := newHeader.ToVectorisedView() if dataCap := payloadLen - payload.Size(); dataCap > 0 { payload.AppendView(pkt.Data().AsRange().Capped(dataCap).ToOwnedView()) } else { payload.CapLength(payloadLen) } icmpPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(route.MaxHeaderLength()) + header.ICMPv4MinimumSize, Data: payload, }) icmpPkt.TransportProtocolNumber = header.ICMPv4ProtocolNumber icmpHdr := header.ICMPv4(icmpPkt.TransportHeader().Push(header.ICMPv4MinimumSize)) var counter tcpip.MultiCounterStat switch reason := reason.(type) { case *icmpReasonPortUnreachable: icmpHdr.SetType(header.ICMPv4DstUnreachable) icmpHdr.SetCode(header.ICMPv4PortUnreachable) counter = sent.dstUnreachable case *icmpReasonProtoUnreachable: icmpHdr.SetType(header.ICMPv4DstUnreachable) icmpHdr.SetCode(header.ICMPv4ProtoUnreachable) counter = sent.dstUnreachable case *icmpReasonNetworkUnreachable: icmpHdr.SetType(header.ICMPv4DstUnreachable) icmpHdr.SetCode(header.ICMPv4NetUnreachable) counter = sent.dstUnreachable case *icmpReasonHostUnreachable: icmpHdr.SetType(header.ICMPv4DstUnreachable) icmpHdr.SetCode(header.ICMPv4HostUnreachable) counter = sent.dstUnreachable case *icmpReasonFragmentationNeeded: icmpHdr.SetType(header.ICMPv4DstUnreachable) icmpHdr.SetCode(header.ICMPv4FragmentationNeeded) counter = sent.dstUnreachable case *icmpReasonTTLExceeded: icmpHdr.SetType(header.ICMPv4TimeExceeded) icmpHdr.SetCode(header.ICMPv4TTLExceeded) counter = sent.timeExceeded case *icmpReasonReassemblyTimeout: icmpHdr.SetType(header.ICMPv4TimeExceeded) icmpHdr.SetCode(header.ICMPv4ReassemblyTimeout) counter = sent.timeExceeded case *icmpReasonParamProblem: icmpHdr.SetType(header.ICMPv4ParamProblem) icmpHdr.SetCode(header.ICMPv4UnusedCode) icmpHdr.SetPointer(reason.pointer) counter = sent.paramProblem default: panic(fmt.Sprintf("unsupported ICMP type %T", reason)) } icmpHdr.SetChecksum(header.ICMPv4Checksum(icmpHdr, icmpPkt.Data().AsRange().Checksum())) if err := route.WritePacket( stack.NetworkHeaderParams{ Protocol: header.ICMPv4ProtocolNumber, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS, }, icmpPkt, ); err != nil { sent.dropped.Increment() return err } counter.Increment() return nil } // OnReassemblyTimeout implements fragmentation.TimeoutHandler. func (p *protocol) OnReassemblyTimeout(pkt *stack.PacketBuffer) { // OnReassemblyTimeout sends a Time Exceeded Message, as per RFC 792: // // If a host reassembling a fragmented datagram cannot complete the // reassembly due to missing fragments within its time limit it discards the // datagram, and it may send a time exceeded message. // // If fragment zero is not available then no time exceeded need be sent at // all. if pkt != nil { p.returnError(&icmpReasonReassemblyTimeout{}, pkt) } }
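// The checksum handling above (validate by checking that the data sums to
// 0xffff, emit by storing the complement of the sum) follows the standard
// Internet checksum of RFC 1071. The standalone sketch below is illustrative
// only and independent of the gVisor header package; it implements the same
// ones' complement algorithm from scratch.
package main

import "fmt"

// checksum computes the 16-bit ones' complement sum of data, the core of
// header.Checksum. A packet whose stored checksum field holds the complement
// of this sum will itself sum to 0xffff, which is what handleICMP checks.
func checksum(data []byte) uint16 {
	var sum uint32
	for i := 0; i+1 < len(data); i += 2 {
		sum += uint32(data[i])<<8 | uint32(data[i+1])
	}
	if len(data)%2 == 1 {
		sum += uint32(data[len(data)-1]) << 8 // pad the odd trailing byte
	}
	for sum > 0xffff {
		sum = (sum >> 16) + (sum & 0xffff) // fold end-around carries
	}
	return uint16(sum)
}

func main() {
	// A toy echo request with a zeroed checksum field (bytes 2-3).
	msg := []byte{8, 0, 0, 0, 0x12, 0x34, 0x00, 0x01}
	csum := ^checksum(msg) // value to store in the header
	msg[2], msg[3] = byte(csum>>8), byte(csum)
	fmt.Printf("verify: %#04x\n", checksum(msg)) // 0xffff for a valid packet
}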
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package netstack

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
)

// Stack implements inet.Stack for netstack/tcpip/stack.Stack.
//
// +stateify savable
type Stack struct {
	Stack *stack.Stack `state:"manual"`
}

// SupportsIPv6 implements Stack.SupportsIPv6.
func (s *Stack) SupportsIPv6() bool {
	return s.Stack.CheckNetworkProtocol(ipv6.ProtocolNumber)
}

// toLinuxARPHardwareType converts Netstack's ARPHardwareType to the
// equivalent Linux constants.
func toLinuxARPHardwareType(t header.ARPHardwareType) uint16 { switch t { case header.ARPHardwareNone: return linux.ARPHRD_NONE case header.ARPHardwareLoopback: return linux.ARPHRD_LOOPBACK case header.ARPHardwareEther: return linux.ARPHRD_ETHER default: panic(fmt.Sprintf("unknown ARPHRD type: %d", t)) } } // Interfaces implements inet.Stack.Interfaces. func (s *Stack) Interfaces() map[int32]inet.Interface { is := make(map[int32]inet.Interface) for id, ni := range s.Stack.NICInfo() { is[int32(id)] = inet.Interface{ Name: ni.Name, Addr: []byte(ni.LinkAddress), Flags: uint32(nicStateFlagsToLinux(ni.Flags)), DeviceType: toLinuxARPHardwareType(ni.ARPHardwareType), MTU: ni.MTU, } } return is } // InterfaceAddrs implements inet.Stack.InterfaceAddrs. func (s *Stack) InterfaceAddrs() map[int32][]inet.InterfaceAddr { nicAddrs := make(map[int32][]inet.InterfaceAddr) for id, ni := range s.Stack.NICInfo() { var addrs []inet.InterfaceAddr for _, a := range ni.ProtocolAddresses { var family uint8 switch a.Protocol { case ipv4.ProtocolNumber: family = linux.AF_INET case ipv6.ProtocolNumber: family = linux.AF_INET6 default: log.Warningf("Unknown network protocol in %+v", a) continue } addrs = append(addrs, inet.InterfaceAddr{ Family: family, PrefixLen: uint8(a.AddressWithPrefix.PrefixLen), Addr: []byte(a.AddressWithPrefix.Address), // TODO(b/68878065): Other fields. }) } nicAddrs[int32(id)] = addrs } return nicAddrs } // convertAddr converts an InterfaceAddr to a ProtocolAddress. func convertAddr(addr inet.InterfaceAddr) (tcpip.ProtocolAddress, error) { var ( protocol tcpip.NetworkProtocolNumber address tcpip.Address protocolAddress tcpip.ProtocolAddress ) switch addr.Family { case linux.AF_INET: if len(addr.Addr) != header.IPv4AddressSize { return protocolAddress, linuxerr.EINVAL } if addr.PrefixLen > header.IPv4AddressSize*8 { return protocolAddress, linuxerr.EINVAL } protocol = ipv4.ProtocolNumber address = tcpip.Address(addr.Addr) case linux.AF_INET6: if len(addr.Addr) != header.IPv6AddressSize { return protocolAddress, linuxerr.EINVAL } if addr.PrefixLen > header.IPv6AddressSize*8 { return protocolAddress, linuxerr.EINVAL } protocol = ipv6.ProtocolNumber address = tcpip.Address(addr.Addr) default: return protocolAddress, linuxerr.ENOTSUP } protocolAddress = tcpip.ProtocolAddress{ Protocol: protocol, AddressWithPrefix: tcpip.AddressWithPrefix{ Address: address, PrefixLen: int(addr.PrefixLen), }, } return protocolAddress, nil } // AddInterfaceAddr implements inet.Stack.AddInterfaceAddr. func (s *Stack) AddInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { protocolAddress, err := convertAddr(addr) if err != nil { return err } // Attach address to interface. nicID := tcpip.NICID(idx) if err := s.Stack.AddProtocolAddressWithOptions(nicID, protocolAddress, stack.CanBePrimaryEndpoint); err != nil { return syserr.TranslateNetstackError(err).ToError() } // Add route for local network if it doesn't exist already. localRoute := tcpip.Route{ Destination: protocolAddress.AddressWithPrefix.Subnet(), Gateway: "", // No gateway for local network. NIC: nicID, } for _, rt := range s.Stack.GetRouteTable() { if rt.Equal(localRoute) { return nil } } // Local route does not exist yet. Add it. s.Stack.AddRoute(localRoute) return nil } // RemoveInterfaceAddr implements inet.Stack.RemoveInterfaceAddr. func (s *Stack) RemoveInterfaceAddr(idx int32, addr inet.InterfaceAddr) error { protocolAddress, err := convertAddr(addr) if err != nil { return err } // Remove addresses matching the address and prefix. 
nicID := tcpip.NICID(idx) if err := s.Stack.RemoveAddress(nicID, protocolAddress.AddressWithPrefix.Address); err != nil { return syserr.TranslateNetstackError(err).ToError() } // Remove the corresponding local network route if it exists. localRoute := tcpip.Route{ Destination: protocolAddress.AddressWithPrefix.Subnet(), Gateway: "", // No gateway for local network. NIC: nicID, } s.Stack.RemoveRoutes(func(rt tcpip.Route) bool { return rt.Equal(localRoute) }) return nil } // TCPReceiveBufferSize implements inet.Stack.TCPReceiveBufferSize. func (s *Stack) TCPReceiveBufferSize() (inet.TCPBufferSize, error) { var rs tcpip.TCPReceiveBufferSizeRangeOption err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &rs) return inet.TCPBufferSize{ Min: rs.Min, Default: rs.Default, Max: rs.Max, }, syserr.TranslateNetstackError(err).ToError() } // SetTCPReceiveBufferSize implements inet.Stack.SetTCPReceiveBufferSize. func (s *Stack) SetTCPReceiveBufferSize(size inet.TCPBufferSize) error { rs := tcpip.TCPReceiveBufferSizeRangeOption{ Min: size.Min, Default: size.Default, Max: size.Max, } return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &rs)).ToError() } // TCPSendBufferSize implements inet.Stack.TCPSendBufferSize. func (s *Stack) TCPSendBufferSize() (inet.TCPBufferSize, error) { var ss tcpip.TCPSendBufferSizeRangeOption err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &ss) return inet.TCPBufferSize{ Min: ss.Min, Default: ss.Default, Max: ss.Max, }, syserr.TranslateNetstackError(err).ToError() } // SetTCPSendBufferSize implements inet.Stack.SetTCPSendBufferSize. func (s *Stack) SetTCPSendBufferSize(size inet.TCPBufferSize) error { ss := tcpip.TCPSendBufferSizeRangeOption{ Min: size.Min, Default: size.Default, Max: size.Max, } return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &ss)).ToError() } // TCPSACKEnabled implements inet.Stack.TCPSACKEnabled. func (s *Stack) TCPSACKEnabled() (bool, error) { var sack tcpip.TCPSACKEnabled err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &sack) return bool(sack), syserr.TranslateNetstackError(err).ToError() } // SetTCPSACKEnabled implements inet.Stack.SetTCPSACKEnabled. func (s *Stack) SetTCPSACKEnabled(enabled bool) error { opt := tcpip.TCPSACKEnabled(enabled) return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError() } // TCPRecovery implements inet.Stack.TCPRecovery. func (s *Stack) TCPRecovery() (inet.TCPLossRecovery, error) { var recovery tcpip.TCPRecovery if err := s.Stack.TransportProtocolOption(tcp.ProtocolNumber, &recovery); err != nil { return 0, syserr.TranslateNetstackError(err).ToError() } return inet.TCPLossRecovery(recovery), nil } // SetTCPRecovery implements inet.Stack.SetTCPRecovery. func (s *Stack) SetTCPRecovery(recovery inet.TCPLossRecovery) error { opt := tcpip.TCPRecovery(recovery) return syserr.TranslateNetstackError(s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, &opt)).ToError() } // Statistics implements inet.Stack.Statistics. func (s *Stack) Statistics(stat interface{}, arg string) error { switch stats := stat.(type) { case *inet.StatDev: for _, ni := range s.Stack.NICInfo() { if ni.Name != arg { continue } // TODO(gvisor.dev/issue/2103) Support stubbed stats. *stats = inet.StatDev{ // Receive section. ni.Stats.Rx.Bytes.Value(), // bytes. ni.Stats.Rx.Packets.Value(), // packets. 0, // errs. 0, // drop. 0, // fifo. 0, // frame. 0, // compressed. 0, // multicast. // Transmit section. 
ni.Stats.Tx.Bytes.Value(), // bytes. ni.Stats.Tx.Packets.Value(), // packets. 0, // errs. 0, // drop. 0, // fifo. 0, // colls. 0, // carrier. 0, // compressed. } break } case *inet.StatSNMPIP: ip := Metrics.IP // TODO(gvisor.dev/issue/969) Support stubbed stats. *stats = inet.StatSNMPIP{ 0, // Ip/Forwarding. 0, // Ip/DefaultTTL. ip.PacketsReceived.Value(), // InReceives. 0, // Ip/InHdrErrors. ip.InvalidDestinationAddressesReceived.Value(), // InAddrErrors. 0, // Ip/ForwDatagrams. 0, // Ip/InUnknownProtos. 0, // Ip/InDiscards. ip.PacketsDelivered.Value(), // InDelivers. ip.PacketsSent.Value(), // OutRequests. ip.OutgoingPacketErrors.Value(), // OutDiscards. 0, // Ip/OutNoRoutes. 0, // Support Ip/ReasmTimeout. 0, // Support Ip/ReasmReqds. 0, // Support Ip/ReasmOKs. 0, // Support Ip/ReasmFails. 0, // Support Ip/FragOKs. 0, // Support Ip/FragFails. 0, // Support Ip/FragCreates. } case *inet.StatSNMPICMP: in := Metrics.ICMP.V4.PacketsReceived.ICMPv4PacketStats out := Metrics.ICMP.V4.PacketsSent.ICMPv4PacketStats // TODO(gvisor.dev/issue/969) Support stubbed stats. *stats = inet.StatSNMPICMP{ 0, // Icmp/InMsgs. Metrics.ICMP.V4.PacketsSent.Dropped.Value(), // InErrors. 0, // Icmp/InCsumErrors. in.DstUnreachable.Value(), // InDestUnreachs. in.TimeExceeded.Value(), // InTimeExcds. in.ParamProblem.Value(), // InParmProbs. in.SrcQuench.Value(), // InSrcQuenchs. in.Redirect.Value(), // InRedirects. in.EchoRequest.Value(), // InEchos. in.EchoReply.Value(), // InEchoReps. in.Timestamp.Value(), // InTimestamps. in.TimestampReply.Value(), // InTimestampReps. in.InfoRequest.Value(), // InAddrMasks. in.InfoReply.Value(), // InAddrMaskReps. 0, // Icmp/OutMsgs. Metrics.ICMP.V4.PacketsReceived.Invalid.Value(), // OutErrors. out.DstUnreachable.Value(), // OutDestUnreachs. out.TimeExceeded.Value(), // OutTimeExcds. out.ParamProblem.Value(), // OutParmProbs. out.SrcQuench.Value(), // OutSrcQuenchs. out.Redirect.Value(), // OutRedirects. out.EchoRequest.Value(), // OutEchos. out.EchoReply.Value(), // OutEchoReps. out.Timestamp.Value(), // OutTimestamps. out.TimestampReply.Value(), // OutTimestampReps. out.InfoRequest.Value(), // OutAddrMasks. out.InfoReply.Value(), // OutAddrMaskReps. } case *inet.StatSNMPTCP: tcp := Metrics.TCP // RFC 2012 (updates 1213): SNMPv2-MIB-TCP. *stats = inet.StatSNMPTCP{ 1, // RtoAlgorithm. 200, // RtoMin. 120000, // RtoMax. (1<<64 - 1), // MaxConn. tcp.ActiveConnectionOpenings.Value(), // ActiveOpens. tcp.PassiveConnectionOpenings.Value(), // PassiveOpens. tcp.FailedConnectionAttempts.Value(), // AttemptFails. tcp.EstablishedResets.Value(), // EstabResets. tcp.CurrentEstablished.Value(), // CurrEstab. tcp.ValidSegmentsReceived.Value(), // InSegs. tcp.SegmentsSent.Value(), // OutSegs. tcp.Retransmits.Value(), // RetransSegs. tcp.InvalidSegmentsReceived.Value(), // InErrs. tcp.ResetsSent.Value(), // OutRsts. tcp.ChecksumErrors.Value(), // InCsumErrors. } case *inet.StatSNMPUDP: udp := Metrics.UDP // TODO(gvisor.dev/issue/969) Support stubbed stats. *stats = inet.StatSNMPUDP{ udp.PacketsReceived.Value(), // InDatagrams. udp.UnknownPortErrors.Value(), // NoPorts. 0, // Udp/InErrors. udp.PacketsSent.Value(), // OutDatagrams. udp.ReceiveBufferErrors.Value(), // RcvbufErrors. 0, // Udp/SndbufErrors. udp.ChecksumErrors.Value(), // Udp/InCsumErrors. 0, // Udp/IgnoredMulti. } default: return syserr.ErrEndpointOperation.ToError() } return nil } // RouteTable implements inet.Stack.RouteTable. 
func (s *Stack) RouteTable() []inet.Route { var routeTable []inet.Route for _, rt := range s.Stack.GetRouteTable() { var family uint8 switch len(rt.Destination.ID()) { case header.IPv4AddressSize: family = linux.AF_INET case header.IPv6AddressSize: family = linux.AF_INET6 default: log.Warningf("Unknown network protocol in route %+v", rt) continue } routeTable = append(routeTable, inet.Route{ Family: family, DstLen: uint8(rt.Destination.Prefix()), // The CIDR prefix for the destination. // Always return unspecified protocol since we have no notion of // protocol for routes. Protocol: linux.RTPROT_UNSPEC, // Set statically to LINK scope for now. // // TODO(gvisor.dev/issue/595): Set scope for routes. Scope: linux.RT_SCOPE_LINK, Type: linux.RTN_UNICAST, DstAddr: []byte(rt.Destination.ID()), OutputInterface: int32(rt.NIC), GatewayAddr: []byte(rt.Gateway), }) } return routeTable } // IPTables returns the stack's iptables. func (s *Stack) IPTables() (*stack.IPTables, error) { return s.Stack.IPTables(), nil } // Resume implements inet.Stack.Resume. func (s *Stack) Resume() { s.Stack.Resume() } // RegisteredEndpoints implements inet.Stack.RegisteredEndpoints. func (s *Stack) RegisteredEndpoints() []stack.TransportEndpoint { return s.Stack.RegisteredEndpoints() } // CleanupEndpoints implements inet.Stack.CleanupEndpoints. func (s *Stack) CleanupEndpoints() []stack.TransportEndpoint { return s.Stack.CleanupEndpoints() } // RestoreCleanupEndpoints implements inet.Stack.RestoreCleanupEndpoints. func (s *Stack) RestoreCleanupEndpoints(es []stack.TransportEndpoint) { s.Stack.RestoreCleanupEndpoints(es) } // SetForwarding implements inet.Stack.SetForwarding. func (s *Stack) SetForwarding(protocol tcpip.NetworkProtocolNumber, enable bool) error { if err := s.Stack.SetForwardingDefaultAndAllNICs(protocol, enable); err != nil { return fmt.Errorf("SetForwardingDefaultAndAllNICs(%d, %t): %s", protocol, enable, err) } return nil } // PortRange implements inet.Stack.PortRange. func (s *Stack) PortRange() (uint16, uint16) { return s.Stack.PortRange() } // SetPortRange implements inet.Stack.SetPortRange. func (s *Stack) SetPortRange(start uint16, end uint16) error { return syserr.TranslateNetstackError(s.Stack.SetPortRange(start, end)).ToError() }
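// convertAddr above rejects any prefix longer than the address width (32
// bits for AF_INET, 128 for AF_INET6). The standalone sketch below is
// illustrative only; checkPrefix is a hypothetical helper that mirrors the
// same bounds checks on plain byte slices.
package main

import (
	"errors"
	"fmt"
)

// checkPrefix mirrors the validation in convertAddr: the address must have
// the exact size for its family, and the prefix must fit within it.
func checkPrefix(addr []byte, prefixLen uint8) error {
	if len(addr) != 4 && len(addr) != 16 {
		return errors.New("address must be 4 (IPv4) or 16 (IPv6) bytes")
	}
	if int(prefixLen) > len(addr)*8 {
		return fmt.Errorf("prefix %d exceeds %d-bit address", prefixLen, len(addr)*8)
	}
	return nil
}

func main() {
	fmt.Println(checkPrefix([]byte{192, 168, 0, 1}, 24)) // <nil>
	fmt.Println(checkPrefix([]byte{192, 168, 0, 1}, 33)) // error: prefix too long
}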
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs2

import (
	"bytes"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/gohacks"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// ListXattr implements Linux syscall listxattr(2).
func ListXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return listxattr(t, args, followFinalSymlink)
}

// Llistxattr implements Linux syscall llistxattr(2).
func Llistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return listxattr(t, args, nofollowFinalSymlink) } func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() listAddr := args[1].Pointer() size := args[2].SizeT() path, err := copyInPath(t, pathAddr) if err != nil { return 0, nil, err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) names, err := t.Kernel().VFS().ListXattrAt(t, t.Credentials(), &tpop.pop, uint64(size)) if err != nil { return 0, nil, err } n, err := copyOutXattrNameList(t, listAddr, size, names) if err != nil { return 0, nil, err } return uintptr(n), nil, nil } // Flistxattr implements Linux syscall flistxattr(2). func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() listAddr := args[1].Pointer() size := args[2].SizeT() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) names, err := file.ListXattr(t, uint64(size)) if err != nil { return 0, nil, err } n, err := copyOutXattrNameList(t, listAddr, size, names) if err != nil { return 0, nil, err } return uintptr(n), nil, nil } // GetXattr implements Linux syscall getxattr(2). func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return getxattr(t, args, followFinalSymlink) } // Lgetxattr implements Linux syscall lgetxattr(2). func Lgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return getxattr(t, args, nofollowFinalSymlink) } func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() nameAddr := args[1].Pointer() valueAddr := args[2].Pointer() size := args[3].SizeT() path, err := copyInPath(t, pathAddr) if err != nil { return 0, nil, err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return 0, nil, err } value, err := t.Kernel().VFS().GetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.GetXattrOptions{ Name: name, Size: uint64(size), }) if err != nil { return 0, nil, err } n, err := copyOutXattrValue(t, valueAddr, size, value) if err != nil { return 0, nil, err } return uintptr(n), nil, nil } // Fgetxattr implements Linux syscall fgetxattr(2). func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() nameAddr := args[1].Pointer() valueAddr := args[2].Pointer() size := args[3].SizeT() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return 0, nil, err } value, err := file.GetXattr(t, &vfs.GetXattrOptions{Name: name, Size: uint64(size)}) if err != nil { return 0, nil, err } n, err := copyOutXattrValue(t, valueAddr, size, value) if err != nil { return 0, nil, err } return uintptr(n), nil, nil } // SetXattr implements Linux syscall setxattr(2). 
func SetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, setxattr(t, args, followFinalSymlink) } // Lsetxattr implements Linux syscall lsetxattr(2). func Lsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, setxattr(t, args, nofollowFinalSymlink) } func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error { pathAddr := args[0].Pointer() nameAddr := args[1].Pointer() valueAddr := args[2].Pointer() size := args[3].SizeT() flags := args[4].Int() if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { return linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) if err != nil { return err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) if err != nil { return err } defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return err } value, err := copyInXattrValue(t, valueAddr, size) if err != nil { return err } return t.Kernel().VFS().SetXattrAt(t, t.Credentials(), &tpop.pop, &vfs.SetXattrOptions{ Name: name, Value: value, Flags: uint32(flags), }) } // Fsetxattr implements Linux syscall fsetxattr(2). func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() nameAddr := args[1].Pointer() valueAddr := args[2].Pointer() size := args[3].SizeT() flags := args[4].Int() if flags&^(linux.XATTR_CREATE|linux.XATTR_REPLACE) != 0 { return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return 0, nil, err } value, err := copyInXattrValue(t, valueAddr, size) if err != nil { return 0, nil, err } return 0, nil, file.SetXattr(t, &vfs.SetXattrOptions{ Name: name, Value: value, Flags: uint32(flags), }) } // RemoveXattr implements Linux syscall removexattr(2). func RemoveXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, removexattr(t, args, followFinalSymlink) } // Lremovexattr implements Linux syscall lremovexattr(2). func Lremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, removexattr(t, args, nofollowFinalSymlink) } func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error { pathAddr := args[0].Pointer() nameAddr := args[1].Pointer() path, err := copyInPath(t, pathAddr) if err != nil { return err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink) if err != nil { return err } defer tpop.Release(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return err } return t.Kernel().VFS().RemoveXattrAt(t, t.Credentials(), &tpop.pop, name) } // Fremovexattr implements Linux syscall fremovexattr(2). 
func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() nameAddr := args[1].Pointer() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) name, err := copyInXattrName(t, nameAddr) if err != nil { return 0, nil, err } return 0, nil, file.RemoveXattr(t, name) } func copyInXattrName(t *kernel.Task, nameAddr hostarch.Addr) (string, error) { name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1) if err != nil { if linuxerr.Equals(linuxerr.ENAMETOOLONG, err) { return "", linuxerr.ERANGE } return "", err } if len(name) == 0 { return "", linuxerr.ERANGE } return name, nil } func copyOutXattrNameList(t *kernel.Task, listAddr hostarch.Addr, size uint, names []string) (int, error) { if size > linux.XATTR_LIST_MAX { size = linux.XATTR_LIST_MAX } var buf bytes.Buffer for _, name := range names { buf.WriteString(name) buf.WriteByte(0) } if size == 0 { // Return the size that would be required to accomodate the list. return buf.Len(), nil } if buf.Len() > int(size) { if size >= linux.XATTR_LIST_MAX { return 0, linuxerr.E2BIG } return 0, linuxerr.ERANGE } return t.CopyOutBytes(listAddr, buf.Bytes()) } func copyInXattrValue(t *kernel.Task, valueAddr hostarch.Addr, size uint) (string, error) { if size > linux.XATTR_SIZE_MAX { return "", linuxerr.E2BIG } buf := make([]byte, size) if _, err := t.CopyInBytes(valueAddr, buf); err != nil { return "", err } return gohacks.StringFromImmutableBytes(buf), nil } func copyOutXattrValue(t *kernel.Task, valueAddr hostarch.Addr, size uint, value string) (int, error) { if size > linux.XATTR_SIZE_MAX { size = linux.XATTR_SIZE_MAX } if size == 0 { // Return the size that would be required to accomodate the value. return len(value), nil } if len(value) > int(size) { if size >= linux.XATTR_SIZE_MAX { return 0, linuxerr.E2BIG } return 0, linuxerr.ERANGE } return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value)) }
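The size-probe convention that copyOutXattrNameList and copyOutXattrValue implement (size == 0 returns the required length without copying; a too-small buffer yields ERANGE) is the same two-call pattern userspace uses against these syscalls. A minimal sketch of that pattern with golang.org/x/sys/unix; the path and attribute name are placeholders:

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	const path = "/tmp/example" // hypothetical file carrying an xattr

	// First call with a nil buffer: only the required size is returned,
	// exactly the size == 0 branch of copyOutXattrValue above.
	sz, err := unix.Getxattr(path, "user.comment", nil)
	if err != nil {
		fmt.Println("probe failed:", err)
		return
	}

	// Second call with a buffer of the reported size. If the value grew in
	// between, this fails with ERANGE and the caller must retry.
	buf := make([]byte, sz)
	if _, err := unix.Getxattr(path, "user.comment", buf); err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	fmt.Printf("user.comment = %q\n", buf)
}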
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package loader

import (
	"bytes"
	"io"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

const (
	// interpreterScriptMagic identifies an interpreter script.
	interpreterScriptMagic = "#!"

	// interpMaxLineLength is the maximum length for the first line of an
	// interpreter script.
	//
	// From execve(2): "A maximum line length of 127 characters is allowed
	// for the first line in a #! executable shell script."
	interpMaxLineLength = 127
)

// parseInterpreterScript returns the interpreter path and argv.
func parseInterpreterScript(ctx context.Context, filename string, f fsbridge.File, argv []string) (newpath string, newargv []string, err error) {
	line := make([]byte, interpMaxLineLength)
	n, err := f.ReadFull(ctx, usermem.BytesIOSequence(line), 0)
	// Short read is OK.
	if err != nil && err != io.ErrUnexpectedEOF {
		if err == io.EOF {
			err = syserror.ENOEXEC
		}
		return "", []string{}, err
	}
	line = line[:n]

	if !bytes.Equal(line[:2], []byte(interpreterScriptMagic)) {
		return "", []string{}, syserror.ENOEXEC
	}
	// Ignore #!.
	line = line[2:]

	// Ignore everything after newline.
	// Linux silently truncates the remainder of the line if it exceeds
	// interpMaxLineLength.
	i := bytes.IndexByte(line, '\n')
	if i >= 0 {
		line = line[:i]
	}

	// Skip any whitespace before the interpreter.
	line = bytes.TrimLeft(line, " \t")

	// Linux only looks for spaces or tabs delimiting the interpreter and
	// arg.
	//
	// execve(2): "On Linux, the entire string following the interpreter
	// name is passed as a single argument to the interpreter, and this
	// string can include white space."
	interp := line
	var arg []byte
	i = bytes.IndexAny(line, " \t")
	if i >= 0 {
		interp = line[:i]
		arg = bytes.TrimLeft(line[i:], " \t")
	}

	if string(interp) == "" {
		ctx.Infof("Interpreter script contains no interpreter: %v", line)
		return "", []string{}, syserror.ENOEXEC
	}

	// Build the new argument list:
	//
	// 1. The interpreter.
	newargv = append(newargv, string(interp))

	// 2. The optional interpreter argument.
	if len(arg) > 0 {
		newargv = append(newargv, string(arg))
	}

	// 3. The original arguments. The original argv[0] is replaced with the
	// full script filename.
	if len(argv) > 0 {
		argv[0] = filename
	} else {
		argv = []string{filename}
	}
	newargv = append(newargv, argv...)

	return string(interp), newargv, nil
}
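The shebang-splitting rules above (strip #!, truncate at the first newline, trim leading blanks, treat everything after the first space/tab run as a single argument) can be exercised outside the loader. A self-contained sketch of the same rules; parseShebang is a hypothetical helper, not part of the package:

package main

import (
	"bytes"
	"fmt"
)

// parseShebang applies the same splitting rules as parseInterpreterScript:
// everything after the first space/tab run becomes one argument.
func parseShebang(line []byte) (interp, arg string, ok bool) {
	if !bytes.HasPrefix(line, []byte("#!")) {
		return "", "", false
	}
	line = line[2:]
	if i := bytes.IndexByte(line, '\n'); i >= 0 {
		line = line[:i]
	}
	line = bytes.TrimLeft(line, " \t")
	var rest []byte
	if i := bytes.IndexAny(line, " \t"); i >= 0 {
		rest = bytes.TrimLeft(line[i:], " \t")
		line = line[:i]
	}
	if len(line) == 0 {
		return "", "", false
	}
	return string(line), string(rest), true
}

func main() {
	interp, arg, ok := parseShebang([]byte("#! /usr/bin/env  python3 -u\nprint()"))
	fmt.Println(interp, arg, ok) // /usr/bin/env "python3 -u" true
}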
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build linux

package fdnotifier

import (
	"unsafe"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/waiter"
)

// NonBlockingPoll polls the given FD in non-blocking fashion. It is used just
// to query the FD's current state.
func NonBlockingPoll(fd int32, mask waiter.EventMask) waiter.EventMask {
	e := struct {
		fd      int32
		events  int16
		revents int16
	}{
		fd:     fd,
		events: int16(mask.ToLinux()),
	}

	ts := unix.Timespec{
		Sec:  0,
		Nsec: 0,
	}

	for {
		n, _, err := unix.RawSyscall6(unix.SYS_PPOLL, uintptr(unsafe.Pointer(&e)), 1, uintptr(unsafe.Pointer(&ts)), 0, 0, 0)
		// Interrupted by signal, try again.
		if err == unix.EINTR {
			continue
		}
		// If an error occurs we'll conservatively say the FD is ready for
		// whatever is being checked.
		if err != 0 {
			return mask
		}

		// If no FDs were returned, it wasn't ready for anything.
		if n == 0 {
			return 0
		}

		// Otherwise we got the ready events in the revents field.
		return waiter.EventMaskFromLinux(uint32(e.revents))
	}
}

// epollWait performs a blocking wait on epfd.
//
// Preconditions: len(events) > 0
func epollWait(epfd int, events []unix.EpollEvent, msec int) (int, error) {
	if len(events) == 0 {
		panic("Empty events passed to EpollWait")
	}

	// We actually use epoll_pwait with NULL sigmask instead of epoll_wait
	// since that is what the Go >= 1.11 runtime prefers.
	r, _, e := unix.Syscall6(unix.SYS_EPOLL_PWAIT, uintptr(epfd), uintptr(unsafe.Pointer(&events[0])), uintptr(len(events)), uintptr(msec), 0, 0)
	if e != 0 {
		return 0, e
	}
	return int(r), nil
}
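NonBlockingPoll is essentially ppoll(2) with a zero timeout. The same readiness query can be made from ordinary Go with unix.Poll; a minimal sketch (the EINTR retry loop above is omitted here for brevity):

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// Poll stdin with a zero timeout: this returns immediately with the
	// FD's current readiness, like NonBlockingPoll above.
	fds := []unix.PollFd{{Fd: 0, Events: unix.POLLIN}}
	n, err := unix.Poll(fds, 0)
	if err != nil {
		fmt.Println("poll:", err)
		return
	}
	if n == 0 {
		fmt.Println("stdin not ready")
		return
	}
	fmt.Printf("stdin revents: %#x\n", fds[0].Revents)
}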
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package fspath provides efficient tools for working with file paths in
// Linux-compatible filesystem implementations.
package fspath

import (
	"strings"
)

const pathSep = '/'

// Parse parses a pathname as described by path_resolution(7), except that
// empty pathnames will be parsed successfully to a Path for which
// Path.Absolute == Path.Dir == Path.HasComponents() == false. (This is
// necessary to support AT_EMPTY_PATH.)
func Parse(pathname string) Path {
	if len(pathname) == 0 {
		return Path{}
	}
	// Skip leading path separators.
	i := 0
	for pathname[i] == pathSep {
		i++
		if i == len(pathname) {
			// pathname consists entirely of path separators.
			return Path{
				Absolute: true,
				Dir:      true,
			}
		}
	}
	// Skip trailing path separators. This is required by Iterator.Next. This
	// loop is guaranteed to terminate with j >= 0 because otherwise the
	// pathname would consist entirely of path separators, so we would have
	// returned above.
	j := len(pathname) - 1
	for pathname[j] == pathSep {
		j--
	}
	// Find the end of the first path component.
	firstEnd := i + 1
	for firstEnd != len(pathname) && pathname[firstEnd] != pathSep {
		firstEnd++
	}
	return Path{
		Begin: Iterator{
			partialPathname: pathname[i : j+1],
			end:             firstEnd - i,
		},
		Absolute: i != 0,
		Dir:      j != len(pathname)-1,
	}
}

// Path contains the information contained in a pathname string.
//
// Path is copyable by value. The zero value for Path is equivalent to
// fspath.Parse(""), i.e. the empty path.
type Path struct {
	// Begin is an iterator to the first path component in the relative part of
	// the path.
	//
	// Path doesn't store information about path components after the first
	// since this would require allocation.
	Begin Iterator

	// If true, the path is absolute, such that lookup should begin at the
	// filesystem root. If false, the path is relative, such that where lookup
	// begins is unspecified.
	Absolute bool

	// If true, the pathname contains trailing path separators, so the last
	// path component must exist and resolve to a directory.
	Dir bool
}

// String returns a pathname string equivalent to p. Note that the returned
// string is not necessarily equal to the string p was parsed from; in
// particular, redundant path separators will not be present.
func (p Path) String() string {
	var b strings.Builder
	if p.Absolute {
		b.WriteByte(pathSep)
	}
	sep := false
	for pit := p.Begin; pit.Ok(); pit = pit.Next() {
		if sep {
			b.WriteByte(pathSep)
		}
		b.WriteString(pit.String())
		sep = true
	}
	// Don't return "//" for Parse("/").
	if p.Dir && p.Begin.Ok() {
		b.WriteByte(pathSep)
	}
	return b.String()
}

// HasComponents returns true if p contains a non-zero number of path
// components.
func (p Path) HasComponents() bool {
	return p.Begin.Ok()
}

// An Iterator represents either a path component in a Path or a terminal
// iterator indicating that the end of the path has been reached.
//
// Iterator is immutable and copyable by value. The zero value of Iterator is
// valid, and represents a terminal iterator.
type Iterator struct {
	// partialPathname is a substring of the original pathname beginning at the
	// start of the represented path component and ending immediately after the
	// end of the last path component in the pathname. If partialPathname is
	// empty, the Iterator is terminal.
	//
	// See TestParseIteratorPartialPathnames in fspath_test.go for a worked
	// example.
	partialPathname string

	// end is the offset into partialPathname of the first byte after the end
	// of the represented path component.
	end int
}

// Ok returns true if it is not terminal.
func (it Iterator) Ok() bool {
	return len(it.partialPathname) != 0
}

// String returns the path component represented by it.
//
// Preconditions: it.Ok().
func (it Iterator) String() string {
	return it.partialPathname[:it.end]
}

// Next returns an iterator to the path component after it. If it is the last
// component in the path, Next returns a terminal iterator.
//
// Preconditions: it.Ok().
func (it Iterator) Next() Iterator {
	if it.end == len(it.partialPathname) {
		// End of the path.
		return Iterator{}
	}
	// Skip path separators. Since Parse trims trailing path separators, if we
	// aren't at the end of the path, there is definitely another path
	// component.
	i := it.end + 1
	for {
		if it.partialPathname[i] != pathSep {
			break
		}
		i++
	}
	nextPartialPathname := it.partialPathname[i:]
	// Find the end of this path component.
	nextEnd := 1
	for nextEnd < len(nextPartialPathname) && nextPartialPathname[nextEnd] != pathSep {
		nextEnd++
	}
	return Iterator{
		partialPathname: nextPartialPathname,
		end:             nextEnd,
	}
}

// NextOk is equivalent to it.Next().Ok(), but is faster.
//
// Preconditions: it.Ok().
func (it Iterator) NextOk() bool {
	return it.end != len(it.partialPathname)
}
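A usage sketch of the iterator API above, assuming the package is importable as gvisor.dev/gvisor/pkg/fspath; note how redundant separators disappear while Absolute and Dir are preserved:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/fspath"
)

func main() {
	p := fspath.Parse("//usr/local//bin/")
	fmt.Println(p.Absolute, p.Dir) // true true

	// Walk the components without allocating: usr, local, bin.
	for it := p.Begin; it.Ok(); it = it.Next() {
		fmt.Println(it.String())
	}

	// Re-serialized form drops redundant separators: "/usr/local/bin/".
	fmt.Println(p.String())
}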
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package safecopy provides an efficient implementation of functions to access
// memory that may result in SIGSEGV or SIGBUS being sent to the accessor.
package safecopy

import (
	"fmt"
	"runtime"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/syserror"
)

// SegvError is returned when a safecopy function receives SIGSEGV.
type SegvError struct {
	// Addr is the address at which the SIGSEGV occurred.
	Addr uintptr
}

// Error implements error.Error.
func (e SegvError) Error() string {
	return fmt.Sprintf("SIGSEGV at %#x", e.Addr)
}

// BusError is returned when a safecopy function receives SIGBUS.
type BusError struct {
	// Addr is the address at which the SIGBUS occurred.
	Addr uintptr
}

// Error implements error.Error.
func (e BusError) Error() string {
	return fmt.Sprintf("SIGBUS at %#x", e.Addr)
}

// AlignmentError is returned when a safecopy function is passed an address
// that does not meet alignment requirements.
type AlignmentError struct {
	// Addr is the invalid address.
	Addr uintptr

	// Alignment is the required alignment.
	Alignment uintptr
}

// Error implements error.Error.
func (e AlignmentError) Error() string {
	return fmt.Sprintf("address %#x is not aligned to a %d-byte boundary", e.Addr, e.Alignment)
}

var (
	// The begin and end addresses below are for the functions that are
	// checked by the signal handler.
	memcpyBegin               uintptr
	memcpyEnd                 uintptr
	memclrBegin               uintptr
	memclrEnd                 uintptr
	swapUint32Begin           uintptr
	swapUint32End             uintptr
	swapUint64Begin           uintptr
	swapUint64End             uintptr
	compareAndSwapUint32Begin uintptr
	compareAndSwapUint32End   uintptr
	loadUint32Begin           uintptr
	loadUint32End             uintptr

	// savedSigSegVHandler is a pointer to the SIGSEGV handler that was
	// configured before we replaced it with our own. We still call into it
	// when we get a SIGSEGV that is not interesting to us.
	savedSigSegVHandler uintptr

	// Same as above, but for SIGBUS signals.
	savedSigBusHandler uintptr
)

// signalHandler is our replacement signal handler for SIGSEGV and SIGBUS
// signals.
func signalHandler()

// addrOfSignalHandler returns the start address of signalHandler.
//
// See comment on addrOfMemcpy for more details.
func addrOfSignalHandler() uintptr

// FindEndAddress returns the end address (one byte beyond the last) of the
// function that contains the specified address (begin).
func FindEndAddress(begin uintptr) uintptr {
	f := runtime.FuncForPC(begin)
	if f != nil {
		for p := begin; ; p++ {
			g := runtime.FuncForPC(p)
			if f != g {
				return p
			}
		}
	}
	return begin
}

// initializeAddresses initializes the addresses used by the signal handler.
func initializeAddresses() {
	// The following functions are written in assembly language, so they won't
	// be inlined by the existing compiler/linker. Tests will fail if this
	// assumption is violated.
	memcpyBegin = addrOfMemcpy()
	memcpyEnd = FindEndAddress(memcpyBegin)
	memclrBegin = addrOfMemclr()
	memclrEnd = FindEndAddress(memclrBegin)
	swapUint32Begin = addrOfSwapUint32()
	swapUint32End = FindEndAddress(swapUint32Begin)
	swapUint64Begin = addrOfSwapUint64()
	swapUint64End = FindEndAddress(swapUint64Begin)
	compareAndSwapUint32Begin = addrOfCompareAndSwapUint32()
	compareAndSwapUint32End = FindEndAddress(compareAndSwapUint32Begin)
	loadUint32Begin = addrOfLoadUint32()
	loadUint32End = FindEndAddress(loadUint32Begin)
}

func init() {
	initializeAddresses()
	if err := ReplaceSignalHandler(unix.SIGSEGV, addrOfSignalHandler(), &savedSigSegVHandler); err != nil {
		panic(fmt.Sprintf("Unable to set handler for SIGSEGV: %v", err))
	}
	if err := ReplaceSignalHandler(unix.SIGBUS, addrOfSignalHandler(), &savedSigBusHandler); err != nil {
		panic(fmt.Sprintf("Unable to set handler for SIGBUS: %v", err))
	}
	syserror.AddErrorUnwrapper(func(e error) (unix.Errno, bool) {
		switch e.(type) {
		case SegvError, BusError, AlignmentError:
			return unix.EFAULT, true
		default:
			return 0, false
		}
	})
}
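FindEndAddress leans on runtime.FuncForPC changing its answer at the first byte past the function. A standalone sketch of the same probe on an arbitrary function; illustrative only, since scanning byte-by-byte assumes another function's metadata follows in the text segment:

package main

import (
	"fmt"
	"reflect"
	"runtime"
)

//go:noinline
func target(x int) int { return x + 1 }

// funcEnd scans forward from begin until FuncForPC reports a different
// function, the same probe FindEndAddress uses.
func funcEnd(begin uintptr) uintptr {
	f := runtime.FuncForPC(begin)
	if f == nil {
		return begin
	}
	for p := begin; ; p++ {
		if runtime.FuncForPC(p) != f {
			return p
		}
	}
}

func main() {
	begin := reflect.ValueOf(target).Pointer()
	fmt.Printf("target occupies [%#x, %#x)\n", begin, funcEnd(begin))
}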
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import "fmt"

// Seccomp constants taken from <linux/seccomp.h>.
const (
	SECCOMP_MODE_NONE   = 0
	SECCOMP_MODE_FILTER = 2

	SECCOMP_RET_ACTION_FULL = 0xffff0000
	SECCOMP_RET_ACTION      = 0x7fff0000
	SECCOMP_RET_DATA        = 0x0000ffff

	SECCOMP_SET_MODE_FILTER   = 1
	SECCOMP_FILTER_FLAG_TSYNC = 1
	SECCOMP_GET_ACTION_AVAIL  = 2
)

// BPFAction is an action for a BPF filter.
type BPFAction uint32

// BPFAction definitions.
const (
	SECCOMP_RET_KILL_PROCESS BPFAction = 0x80000000
	SECCOMP_RET_KILL_THREAD  BPFAction = 0x00000000
	SECCOMP_RET_TRAP         BPFAction = 0x00030000
	SECCOMP_RET_ERRNO        BPFAction = 0x00050000
	SECCOMP_RET_TRACE        BPFAction = 0x7ff00000
	SECCOMP_RET_ALLOW        BPFAction = 0x7fff0000
)

func (a BPFAction) String() string {
	switch a & SECCOMP_RET_ACTION_FULL {
	case SECCOMP_RET_KILL_PROCESS:
		return "kill process"
	case SECCOMP_RET_KILL_THREAD:
		return "kill thread"
	case SECCOMP_RET_TRAP:
		return fmt.Sprintf("trap (%d)", a.Data())
	case SECCOMP_RET_ERRNO:
		return fmt.Sprintf("errno (%d)", a.Data())
	case SECCOMP_RET_TRACE:
		return fmt.Sprintf("trace (%d)", a.Data())
	case SECCOMP_RET_ALLOW:
		return "allow"
	}
	return fmt.Sprintf("invalid action: %#x", a)
}

// Data returns the SECCOMP_RET_DATA portion of the action.
func (a BPFAction) Data() uint16 {
	return uint16(a & SECCOMP_RET_DATA)
}

// WithReturnCode sets the lower 16 bits of the SECCOMP_RET_ERRNO or
// SECCOMP_RET_TRACE actions to the provided return code, overwriting the
// previous return code, and returns a new BPFAction. If the action is not
// SECCOMP_RET_ERRNO or SECCOMP_RET_TRACE, this panics.
func (a BPFAction) WithReturnCode(code uint16) BPFAction {
	// Mask out the previous return value.
	baseAction := a & SECCOMP_RET_ACTION_FULL
	if baseAction == SECCOMP_RET_ERRNO || baseAction == SECCOMP_RET_TRACE {
		return BPFAction(uint32(baseAction) | uint32(code))
	}
	panic("WithReturnCode only valid for SECCOMP_RET_ERRNO and SECCOMP_RET_TRACE")
}

// SockFprog is sock_fprog taken from <linux/filter.h>.
type SockFprog struct {
	Len    uint16
	pad    [6]byte
	Filter *BPFInstruction
}

// SeccompData is equivalent to struct seccomp_data, which contains the data
// passed to seccomp-bpf filters.
//
// +marshal
type SeccompData struct {
	// Nr is the system call number.
	Nr int32

	// Arch is an AUDIT_ARCH_* value indicating the system call convention.
	Arch uint32

	// InstructionPointer is the value of the instruction pointer at the time
	// of the system call.
	InstructionPointer uint64

	// Args contains the first 6 system call arguments.
	Args [6]uint64
}
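WithReturnCode and Data are just masking against SECCOMP_RET_ACTION_FULL and SECCOMP_RET_DATA. A self-contained sketch of that arithmetic, with the constants copied from above (the EPERM value is assumed to be 1 as on Linux):

package main

import "fmt"

const (
	SECCOMP_RET_ACTION_FULL = 0xffff0000
	SECCOMP_RET_DATA        = 0x0000ffff
	SECCOMP_RET_ERRNO       = 0x00050000
)

func main() {
	const EPERM = 1
	// Compose an "errno" action carrying EPERM in its data bits, the same
	// arithmetic WithReturnCode performs.
	act := uint32(SECCOMP_RET_ERRNO&SECCOMP_RET_ACTION_FULL) | EPERM
	fmt.Printf("action=%#x data=%d\n", act&SECCOMP_RET_ACTION_FULL, act&SECCOMP_RET_DATA)
	// Output: action=0x50000 data=1
}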
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/bits"
)

const (
	// stdSignalCap is the maximum number of instances of a given standard
	// signal that may be pending. ("[If] multiple instances of a standard
	// signal are delivered while that signal is currently blocked, then only
	// one instance is queued.") - signal(7)
	stdSignalCap = 1

	// rtSignalCap is the maximum number of instances of a given realtime
	// signal that may be pending.
	//
	// TODO(igudger): In Linux, the minimum signal queue size is
	// RLIMIT_SIGPENDING, which is by default max_threads/2.
	rtSignalCap = 32
)

// pendingSignals holds a collection of pending signals. The zero value of
// pendingSignals is a valid empty collection. pendingSignals is thread-unsafe;
// users must provide synchronization.
//
// +stateify savable
type pendingSignals struct {
	// signals contains all pending signals.
	//
	// Note that signals is zero-indexed, but signal 1 is the first valid
	// signal, so signals[0] contains signals with signo 1 etc. This offset is
	// usually handled by using Signal.index().
	signals [linux.SignalMaximum]pendingSignalQueue `state:".([]savedPendingSignal)"`

	// Bit i of pendingSet is set iff there is at least one signal with signo
	// i+1 pending.
	pendingSet linux.SignalSet `state:"manual"`
}

// pendingSignalQueue holds a pendingSignalList for a single signal number.
//
// +stateify savable
type pendingSignalQueue struct {
	pendingSignalList
	length int
}

// +stateify savable
type pendingSignal struct {
	// pendingSignalEntry links into a pendingSignalList.
	pendingSignalEntry

	*linux.SignalInfo

	// If timer is not nil, it is the IntervalTimer which sent this signal.
	timer *IntervalTimer
}

// enqueue enqueues the given signal. enqueue returns true on success and false
// on failure (if the given signal's queue is full).
//
// Preconditions: info represents a valid signal.
func (p *pendingSignals) enqueue(info *linux.SignalInfo, timer *IntervalTimer) bool {
	sig := linux.Signal(info.Signo)
	q := &p.signals[sig.Index()]
	if sig.IsStandard() {
		if q.length >= stdSignalCap {
			return false
		}
	} else if q.length >= rtSignalCap {
		return false
	}
	q.pendingSignalList.PushBack(&pendingSignal{SignalInfo: info, timer: timer})
	q.length++
	p.pendingSet |= linux.SignalSetOf(sig)
	return true
}

// dequeue dequeues and returns any pending signal not masked by mask. If no
// unmasked signals are pending, dequeue returns nil.
func (p *pendingSignals) dequeue(mask linux.SignalSet) *linux.SignalInfo {
	// "Real-time signals are delivered in a guaranteed order. Multiple
	// real-time signals of the same type are delivered in the order they were
	// sent. If different real-time signals are sent to a process, they are
	// delivered starting with the lowest-numbered signal. (I.e., low-numbered
	// signals have highest priority.) By contrast, if multiple standard
	// signals are pending for a process, the order in which they are delivered
	// is unspecified. If both standard and real-time signals are pending for a
	// process, POSIX leaves it unspecified which is delivered first. Linux,
	// like many other implementations, gives priority to standard signals in
	// this case." - signal(7)
	lowestPendingUnblockedBit := bits.TrailingZeros64(uint64(p.pendingSet &^ mask))
	if lowestPendingUnblockedBit >= linux.SignalMaximum {
		return nil
	}
	return p.dequeueSpecific(linux.Signal(lowestPendingUnblockedBit + 1))
}

func (p *pendingSignals) dequeueSpecific(sig linux.Signal) *linux.SignalInfo {
	q := &p.signals[sig.Index()]
	ps := q.pendingSignalList.Front()
	if ps == nil {
		return nil
	}
	q.pendingSignalList.Remove(ps)
	q.length--
	if q.length == 0 {
		p.pendingSet &^= linux.SignalSetOf(sig)
	}
	if ps.timer != nil {
		ps.timer.updateDequeuedSignalLocked(ps.SignalInfo)
	}
	return ps.SignalInfo
}

// discardSpecific causes all pending signals with number sig to be discarded.
func (p *pendingSignals) discardSpecific(sig linux.Signal) {
	q := &p.signals[sig.Index()]
	for ps := q.pendingSignalList.Front(); ps != nil; ps = ps.Next() {
		if ps.timer != nil {
			ps.timer.signalRejectedLocked()
		}
	}
	q.pendingSignalList.Reset()
	q.length = 0
	p.pendingSet &^= linux.SignalSetOf(sig)
}
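The priority rule in dequeue reduces to finding the lowest set bit of pendingSet &^ mask. A standalone sketch of that selection with math/bits; the signal numbers here are made up for illustration:

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// Bit i set means signal i+1 is pending, as in pendingSignals.pendingSet.
	pending := uint64(1<<(10-1) | 1<<(35-1)) // SIGUSR1 (10) and realtime signal 35
	mask := uint64(1 << (10 - 1))            // SIGUSR1 is blocked

	if unblocked := pending &^ mask; unblocked != 0 {
		sig := bits.TrailingZeros64(unblocked) + 1
		fmt.Println("deliver signal", sig) // 35: lowest pending unblocked signal
	}
}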
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package raw

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/tcpip/transport/packet"
	"gvisor.dev/gvisor/pkg/waiter"
)

// EndpointFactory implements stack.RawFactory.
type EndpointFactory struct{}

// NewUnassociatedEndpoint implements stack.RawFactory.NewUnassociatedEndpoint.
func (EndpointFactory) NewUnassociatedEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) {
	return newEndpoint(stack, netProto, transProto, waiterQueue, false /* associated */)
}

// NewPacketEndpoint implements stack.RawFactory.NewPacketEndpoint.
func (EndpointFactory) NewPacketEndpoint(stack *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) {
	return packet.NewEndpoint(stack, cooked, netProto, waiterQueue)
}
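A sketch of how such a factory is typically installed when constructing a netstack instance, assuming the stack.Options field is named RawFactory as in contemporary gVisor releases:

package main

import (
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/tcpip/transport/raw"
)

func main() {
	// Installing the factory is what allows SOCK_RAW and packet endpoints
	// to be created by this stack at all.
	s := stack.New(stack.Options{
		NetworkProtocols: []stack.NetworkProtocolFactory{ipv4.NewProtocol},
		RawFactory:       raw.EndpointFactory{},
	})
	_ = s
}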
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
)

// DeviceKind indicates whether a device is a block or character device.
//
// +stateify savable
type DeviceKind uint32

const (
	// BlockDevice indicates a block device.
	BlockDevice DeviceKind = iota

	// CharDevice indicates a character device.
	CharDevice
)

// String implements fmt.Stringer.String.
func (kind DeviceKind) String() string {
	switch kind {
	case BlockDevice:
		return "block"
	case CharDevice:
		return "character"
	default:
		return fmt.Sprintf("invalid device kind %d", kind)
	}
}

// +stateify savable
type devTuple struct {
	kind  DeviceKind
	major uint32
	minor uint32
}

// A Device backs device special files.
type Device interface {
	// Open returns a FileDescription representing this device.
	Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error)
}

// +stateify savable
type registeredDevice struct {
	dev  Device
	opts RegisterDeviceOptions
}

// RegisterDeviceOptions contains options to
// VirtualFilesystem.RegisterDevice().
//
// +stateify savable
type RegisterDeviceOptions struct {
	// GroupName is the name shown for this device registration in
	// /proc/devices. If GroupName is empty, this registration will not be
	// shown in /proc/devices.
	GroupName string
}

// RegisterDevice registers the given Device in vfs with the given major and
// minor device numbers.
func (vfs *VirtualFilesystem) RegisterDevice(kind DeviceKind, major, minor uint32, dev Device, opts *RegisterDeviceOptions) error {
	tup := devTuple{kind, major, minor}
	vfs.devicesMu.Lock()
	defer vfs.devicesMu.Unlock()
	if existing, ok := vfs.devices[tup]; ok {
		return fmt.Errorf("%s device number (%d, %d) is already registered to device type %T", kind, major, minor, existing.dev)
	}
	vfs.devices[tup] = &registeredDevice{
		dev:  dev,
		opts: *opts,
	}
	return nil
}

// OpenDeviceSpecialFile returns a FileDescription representing the given
// device.
func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mount, d *Dentry, kind DeviceKind, major, minor uint32, opts *OpenOptions) (*FileDescription, error) {
	tup := devTuple{kind, major, minor}
	vfs.devicesMu.RLock()
	defer vfs.devicesMu.RUnlock()
	rd, ok := vfs.devices[tup]
	if !ok {
		return nil, linuxerr.ENXIO
	}
	return rd.dev.Open(ctx, mnt, d, *opts)
}

// GetAnonBlockDevMinor allocates and returns an unused minor device number for
// an "anonymous" block device with major number UNNAMED_MAJOR.
func (vfs *VirtualFilesystem) GetAnonBlockDevMinor() (uint32, error) {
	vfs.anonBlockDevMinorMu.Lock()
	defer vfs.anonBlockDevMinorMu.Unlock()
	minor := vfs.anonBlockDevMinorNext
	const maxDevMinor = (1 << 20) - 1
	for minor < maxDevMinor {
		if _, ok := vfs.anonBlockDevMinor[minor]; !ok {
			vfs.anonBlockDevMinor[minor] = struct{}{}
			vfs.anonBlockDevMinorNext = minor + 1
			return minor, nil
		}
		minor++
	}
	return 0, linuxerr.EMFILE
}

// PutAnonBlockDevMinor deallocates a minor device number returned by a
// previous call to GetAnonBlockDevMinor.
func (vfs *VirtualFilesystem) PutAnonBlockDevMinor(minor uint32) {
	vfs.anonBlockDevMinorMu.Lock()
	defer vfs.anonBlockDevMinorMu.Unlock()
	delete(vfs.anonBlockDevMinor, minor)
	if minor < vfs.anonBlockDevMinorNext {
		vfs.anonBlockDevMinorNext = minor
	}
}
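A sketch of implementing and registering a Device against the API above; nullDevice, its device numbers, and the group name are illustrative, and a real implementation would construct a FileDescription in Open:

package main

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// nullDevice is a hypothetical character device with a trivial Open.
type nullDevice struct{}

// Open implements vfs.Device.Open. A real device would build and return a
// *vfs.FileDescription here; this stub only shows the required signature.
func (nullDevice) Open(ctx context.Context, mnt *vfs.Mount, d *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	return nil, nil
}

// registerNull registers the device as (char, 1, 3), the numbers Linux uses
// for /dev/null, under the "mem" group in /proc/devices.
func registerNull(v *vfs.VirtualFilesystem) error {
	return v.RegisterDevice(vfs.CharDevice, 1, 3, nullDevice{}, &vfs.RegisterDeviceOptions{
		GroupName: "mem",
	})
}

func main() {}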
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package syserr contains sandbox-internal errors. These errors are distinct
// from both the errors returned by host system calls and the errors returned
// to sandboxed applications.
package syserr

import (
	"fmt"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux/errno"
	"gvisor.dev/gvisor/pkg/errors"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/syserror"
)

// Error represents an internal error.
type Error struct {
	// message is the human readable form of this Error.
	message string

	// noTranslation indicates that this Error cannot be translated to a
	// errno.Errno.
	noTranslation bool

	// errno is the errno.Errno this Error should be translated to.
	errno errno.Errno
}

// New creates a new Error and adds a translation for it.
//
// New must only be called at init.
func New(message string, linuxTranslation errno.Errno) *Error {
	err := &Error{message: message, errno: linuxTranslation}

	// TODO(b/34162363): Remove this.
	if int(err.errno) >= len(linuxBackwardsTranslations) {
		panic(fmt.Sprint("invalid errno: ", err.errno))
	}

	e := error(unix.Errno(err.errno))
	// syserror.ErrWouldBlock gets translated to linuxerr.EWOULDBLOCK and
	// enables proper blocking semantics. This should temporarily address the
	// class of blocking bugs that keep popping up with the current state of
	// the error space.
	if err.errno == linuxerr.EWOULDBLOCK.Errno() {
		e = syserror.ErrWouldBlock
	}
	linuxBackwardsTranslations[err.errno] = linuxBackwardsTranslation{err: e, ok: true}

	return err
}

// NewDynamic creates a new error with a dynamic error message and an errno
// translation.
//
// NewDynamic should only be used sparingly and not be used for static error
// messages. Errors with static error messages should be declared with New as
// global variables.
func NewDynamic(message string, linuxTranslation errno.Errno) *Error {
	return &Error{message: message, errno: linuxTranslation}
}

// NewWithoutTranslation creates a new Error. If translation is attempted on
// the error, translation will fail.
//
// NewWithoutTranslation may be called at any time, but static errors should
// be declared as global variables and dynamic errors should be used sparingly.
func NewWithoutTranslation(message string) *Error {
	return &Error{message: message, noTranslation: true}
}

func newWithHost(message string, linuxTranslation errno.Errno, hostErrno unix.Errno) *Error {
	e := New(message, linuxTranslation)
	addLinuxHostTranslation(hostErrno, e)
	return e
}

// String implements fmt.Stringer.String.
func (e *Error) String() string {
	if e == nil {
		return "<nil>"
	}
	return e.message
}

type linuxBackwardsTranslation struct {
	err error
	ok  bool
}

// TODO(b/34162363): Remove this.
var linuxBackwardsTranslations [maxErrno]linuxBackwardsTranslation

// ToError translates an Error to a corresponding error value.
//
// TODO(b/34162363): Remove this.
func (e *Error) ToError() error {
	if e == nil {
		return nil
	}
	if e.noTranslation {
		panic(fmt.Sprintf("error %q does not support translation", e.message))
	}
	err := int(e.errno)
	if err == errno.NOERRNO {
		return nil
	}
	if err >= len(linuxBackwardsTranslations) || !linuxBackwardsTranslations[err].ok {
		panic(fmt.Sprintf("unknown error %q (%d)", e.message, err))
	}
	return linuxBackwardsTranslations[err].err
}

// ToLinux converts the Error to a Linux ABI error that can be returned to the
// application.
func (e *Error) ToLinux() errno.Errno {
	if e.noTranslation {
		panic(fmt.Sprintf("No Linux ABI translation available for %q", e.message))
	}
	return e.errno
}

// TODO(b/34162363): Remove or replace most of these errors.
//
// Some of the errors should be replaced with package specific errors and
// others should be removed entirely.
var (
	ErrNotPermitted = newWithHost("operation not permitted", errno.EPERM, unix.EPERM)
	ErrNoFileOrDir = newWithHost("no such file or directory", errno.ENOENT, unix.ENOENT)
	ErrNoProcess = newWithHost("no such process", errno.ESRCH, unix.ESRCH)
	ErrInterrupted = newWithHost("interrupted system call", errno.EINTR, unix.EINTR)
	ErrIO = newWithHost("I/O error", errno.EIO, unix.EIO)
	ErrDeviceOrAddress = newWithHost("no such device or address", errno.ENXIO, unix.ENXIO)
	ErrTooManyArgs = newWithHost("argument list too long", errno.E2BIG, unix.E2BIG)
	ErrEcec = newWithHost("exec format error", errno.ENOEXEC, unix.ENOEXEC)
	ErrBadFD = newWithHost("bad file number", errno.EBADF, unix.EBADF)
	ErrNoChild = newWithHost("no child processes", errno.ECHILD, unix.ECHILD)
	ErrTryAgain = newWithHost("try again", errno.EAGAIN, unix.EAGAIN)
	ErrNoMemory = newWithHost("out of memory", errno.ENOMEM, unix.ENOMEM)
	ErrPermissionDenied = newWithHost("permission denied", errno.EACCES, unix.EACCES)
	ErrBadAddress = newWithHost("bad address", errno.EFAULT, unix.EFAULT)
	ErrNotBlockDevice = newWithHost("block device required", errno.ENOTBLK, unix.ENOTBLK)
	ErrBusy = newWithHost("device or resource busy", errno.EBUSY, unix.EBUSY)
	ErrExists = newWithHost("file exists", errno.EEXIST, unix.EEXIST)
	ErrCrossDeviceLink = newWithHost("cross-device link", errno.EXDEV, unix.EXDEV)
	ErrNoDevice = newWithHost("no such device", errno.ENODEV, unix.ENODEV)
	ErrNotDir = newWithHost("not a directory", errno.ENOTDIR, unix.ENOTDIR)
	ErrIsDir = newWithHost("is a directory", errno.EISDIR, unix.EISDIR)
	ErrInvalidArgument = newWithHost("invalid argument", errno.EINVAL, unix.EINVAL)
	ErrFileTableOverflow = newWithHost("file table overflow", errno.ENFILE, unix.ENFILE)
	ErrTooManyOpenFiles = newWithHost("too many open files", errno.EMFILE, unix.EMFILE)
	ErrNotTTY = newWithHost("not a typewriter", errno.ENOTTY, unix.ENOTTY)
	ErrTestFileBusy = newWithHost("text file busy", errno.ETXTBSY, unix.ETXTBSY)
	ErrFileTooBig = newWithHost("file too large", errno.EFBIG, unix.EFBIG)
	ErrNoSpace = newWithHost("no space left on device", errno.ENOSPC, unix.ENOSPC)
	ErrIllegalSeek = newWithHost("illegal seek", errno.ESPIPE, unix.ESPIPE)
	ErrReadOnlyFS = newWithHost("read-only file system", errno.EROFS, unix.EROFS)
	ErrTooManyLinks = newWithHost("too many links", errno.EMLINK, unix.EMLINK)
	ErrBrokenPipe = newWithHost("broken pipe", errno.EPIPE, unix.EPIPE)
	ErrDomain = newWithHost("math argument out of domain of func", errno.EDOM, unix.EDOM)
	ErrRange = newWithHost("math result not representable", errno.ERANGE, unix.ERANGE)
	ErrDeadlock = newWithHost("resource deadlock would occur", errno.EDEADLOCK, unix.EDEADLOCK)
	ErrNameTooLong = newWithHost("file name too long", errno.ENAMETOOLONG, unix.ENAMETOOLONG)
	ErrNoLocksAvailable = newWithHost("no record locks available", errno.ENOLCK, unix.ENOLCK)
	ErrInvalidSyscall = newWithHost("invalid system call number", errno.ENOSYS, unix.ENOSYS)
	ErrDirNotEmpty = newWithHost("directory not empty", errno.ENOTEMPTY, unix.ENOTEMPTY)
	ErrLinkLoop = newWithHost("too many symbolic links encountered", errno.ELOOP, unix.ELOOP)
	ErrNoMessage = newWithHost("no message of desired type", errno.ENOMSG, unix.ENOMSG)
	ErrIdentifierRemoved = newWithHost("identifier removed", errno.EIDRM, unix.EIDRM)
	ErrChannelOutOfRange = newWithHost("channel number out of range", errno.ECHRNG, unix.ECHRNG)
	ErrLevelTwoNotSynced = newWithHost("level 2 not synchronized", errno.EL2NSYNC, unix.EL2NSYNC)
	ErrLevelThreeHalted = newWithHost("level 3 halted", errno.EL3HLT, unix.EL3HLT)
	ErrLevelThreeReset = newWithHost("level 3 reset", errno.EL3RST, unix.EL3RST)
	ErrLinkNumberOutOfRange = newWithHost("link number out of range", errno.ELNRNG, unix.ELNRNG)
	ErrProtocolDriverNotAttached = newWithHost("protocol driver not attached", errno.EUNATCH, unix.EUNATCH)
	ErrNoCSIAvailable = newWithHost("no CSI structure available", errno.ENOCSI, unix.ENOCSI)
	ErrLevelTwoHalted = newWithHost("level 2 halted", errno.EL2HLT, unix.EL2HLT)
	ErrInvalidExchange = newWithHost("invalid exchange", errno.EBADE, unix.EBADE)
	ErrInvalidRequestDescriptor = newWithHost("invalid request descriptor", errno.EBADR, unix.EBADR)
	ErrExchangeFull = newWithHost("exchange full", errno.EXFULL, unix.EXFULL)
	ErrNoAnode = newWithHost("no anode", errno.ENOANO, unix.ENOANO)
	ErrInvalidRequestCode = newWithHost("invalid request code", errno.EBADRQC, unix.EBADRQC)
	ErrInvalidSlot = newWithHost("invalid slot", errno.EBADSLT, unix.EBADSLT)
	ErrBadFontFile = newWithHost("bad font file format", errno.EBFONT, unix.EBFONT)
	ErrNotStream = newWithHost("device not a stream", errno.ENOSTR, unix.ENOSTR)
	ErrNoDataAvailable = newWithHost("no data available", errno.ENODATA, unix.ENODATA)
	ErrTimerExpired = newWithHost("timer expired", errno.ETIME, unix.ETIME)
	ErrStreamsResourceDepleted = newWithHost("out of streams resources", errno.ENOSR, unix.ENOSR)
	ErrMachineNotOnNetwork = newWithHost("machine is not on the network", errno.ENONET, unix.ENONET)
	ErrPackageNotInstalled = newWithHost("package not installed", errno.ENOPKG, unix.ENOPKG)
	ErrIsRemote = newWithHost("object is remote", errno.EREMOTE, unix.EREMOTE)
	ErrNoLink = newWithHost("link has been severed", errno.ENOLINK, unix.ENOLINK)
	ErrAdvertise = newWithHost("advertise error", errno.EADV, unix.EADV)
	ErrSRMount = newWithHost("srmount error", errno.ESRMNT, unix.ESRMNT)
	ErrSendCommunication = newWithHost("communication error on send", errno.ECOMM, unix.ECOMM)
	ErrProtocol = newWithHost("protocol error", errno.EPROTO, unix.EPROTO)
	ErrMultihopAttempted = newWithHost("multihop attempted", errno.EMULTIHOP, unix.EMULTIHOP)
	ErrRFS = newWithHost("RFS specific error", errno.EDOTDOT, unix.EDOTDOT)
	ErrInvalidDataMessage = newWithHost("not a data message", errno.EBADMSG, unix.EBADMSG)
	ErrOverflow = newWithHost("value too large for defined data type", errno.EOVERFLOW, unix.EOVERFLOW)
	ErrNetworkNameNotUnique = newWithHost("name not unique on network", errno.ENOTUNIQ, unix.ENOTUNIQ)
	ErrFDInBadState = newWithHost("file descriptor in bad state", errno.EBADFD, unix.EBADFD)
	ErrRemoteAddressChanged = newWithHost("remote address changed", errno.EREMCHG, unix.EREMCHG)
	ErrSharedLibraryInaccessible = newWithHost("can not access a needed shared library", errno.ELIBACC, unix.ELIBACC)
	ErrCorruptedSharedLibrary = newWithHost("accessing a corrupted shared library", errno.ELIBBAD, unix.ELIBBAD)
	ErrLibSectionCorrupted = newWithHost(".lib section in a.out corrupted", errno.ELIBSCN, unix.ELIBSCN)
	ErrTooManySharedLibraries = newWithHost("attempting to link in too many shared libraries", errno.ELIBMAX, unix.ELIBMAX)
	ErrSharedLibraryExeced = newWithHost("cannot exec a shared library directly", errno.ELIBEXEC, unix.ELIBEXEC)
	ErrIllegalByteSequence = newWithHost("illegal byte sequence", errno.EILSEQ, unix.EILSEQ)
	ErrShouldRestart = newWithHost("interrupted system call should be restarted", errno.ERESTART, unix.ERESTART)
	ErrStreamPipe = newWithHost("streams pipe error", errno.ESTRPIPE, unix.ESTRPIPE)
	ErrTooManyUsers = newWithHost("too many users", errno.EUSERS, unix.EUSERS)
	ErrNotASocket = newWithHost("socket operation on non-socket", errno.ENOTSOCK, unix.ENOTSOCK)
	ErrDestinationAddressRequired = newWithHost("destination address required", errno.EDESTADDRREQ, unix.EDESTADDRREQ)
	ErrMessageTooLong = newWithHost("message too long", errno.EMSGSIZE, unix.EMSGSIZE)
	ErrWrongProtocolForSocket = newWithHost("protocol wrong type for socket", errno.EPROTOTYPE, unix.EPROTOTYPE)
	ErrProtocolNotAvailable = newWithHost("protocol not available", errno.ENOPROTOOPT, unix.ENOPROTOOPT)
	ErrProtocolNotSupported = newWithHost("protocol not supported", errno.EPROTONOSUPPORT, unix.EPROTONOSUPPORT)
	ErrSocketNotSupported = newWithHost("socket type not supported", errno.ESOCKTNOSUPPORT, unix.ESOCKTNOSUPPORT)
	ErrEndpointOperation = newWithHost("operation not supported on transport endpoint", errno.EOPNOTSUPP, unix.EOPNOTSUPP)
	ErrProtocolFamilyNotSupported = newWithHost("protocol family not supported", errno.EPFNOSUPPORT, unix.EPFNOSUPPORT)
	ErrAddressFamilyNotSupported = newWithHost("address family not supported by protocol", errno.EAFNOSUPPORT, unix.EAFNOSUPPORT)
	ErrAddressInUse = newWithHost("address already in use", errno.EADDRINUSE, unix.EADDRINUSE)
	ErrAddressNotAvailable = newWithHost("cannot assign requested address", errno.EADDRNOTAVAIL, unix.EADDRNOTAVAIL)
	ErrNetworkDown = newWithHost("network is down", errno.ENETDOWN, unix.ENETDOWN)
	ErrNetworkUnreachable = newWithHost("network is unreachable", errno.ENETUNREACH, unix.ENETUNREACH)
	ErrNetworkReset = newWithHost("network dropped connection because of reset", errno.ENETRESET, unix.ENETRESET)
	ErrConnectionAborted = newWithHost("software caused connection abort", errno.ECONNABORTED, unix.ECONNABORTED)
	ErrConnectionReset = newWithHost("connection reset by peer", errno.ECONNRESET, unix.ECONNRESET)
	ErrNoBufferSpace = newWithHost("no buffer space available", errno.ENOBUFS, unix.ENOBUFS)
	ErrAlreadyConnected = newWithHost("transport endpoint is already connected", errno.EISCONN, unix.EISCONN)
	ErrNotConnected = newWithHost("transport endpoint is not connected", errno.ENOTCONN, unix.ENOTCONN)
	ErrShutdown = newWithHost("cannot send after transport endpoint shutdown", errno.ESHUTDOWN, unix.ESHUTDOWN)
	ErrTooManyRefs = newWithHost("too many references: cannot splice", errno.ETOOMANYREFS, unix.ETOOMANYREFS)
	ErrTimedOut = newWithHost("connection timed out", errno.ETIMEDOUT, unix.ETIMEDOUT)
	ErrConnectionRefused = newWithHost("connection refused", errno.ECONNREFUSED, unix.ECONNREFUSED)
	ErrHostDown = newWithHost("host is down", errno.EHOSTDOWN, unix.EHOSTDOWN)
	ErrNoRoute = newWithHost("no route to host", errno.EHOSTUNREACH, unix.EHOSTUNREACH)
	ErrAlreadyInProgress = newWithHost("operation already in progress", errno.EALREADY, unix.EALREADY)
	ErrInProgress = newWithHost("operation now in progress", errno.EINPROGRESS, unix.EINPROGRESS)
	ErrStaleFileHandle = newWithHost("stale file handle", errno.ESTALE, unix.ESTALE)
	ErrStructureNeedsCleaning = newWithHost("structure needs cleaning", errno.EUCLEAN, unix.EUCLEAN)
	ErrIsNamedFile = newWithHost("is a named type file", errno.ENOTNAM, unix.ENOTNAM)
	ErrRemoteIO = newWithHost("remote I/O error", errno.EREMOTEIO, unix.EREMOTEIO)
	ErrQuotaExceeded = newWithHost("quota exceeded", errno.EDQUOT, unix.EDQUOT)
	ErrNoMedium = newWithHost("no medium found", errno.ENOMEDIUM, unix.ENOMEDIUM)
	ErrWrongMediumType = newWithHost("wrong medium type", errno.EMEDIUMTYPE, unix.EMEDIUMTYPE)
	ErrCanceled = newWithHost("operation canceled", errno.ECANCELED, unix.ECANCELED)
	ErrNoKey = newWithHost("required key not available", errno.ENOKEY, unix.ENOKEY)
	ErrKeyExpired = newWithHost("key has expired", errno.EKEYEXPIRED, unix.EKEYEXPIRED)
	ErrKeyRevoked = newWithHost("key has been revoked", errno.EKEYREVOKED, unix.EKEYREVOKED)
	ErrKeyRejected = newWithHost("key was rejected by service", errno.EKEYREJECTED, unix.EKEYREJECTED)
	ErrOwnerDied = newWithHost("owner died", errno.EOWNERDEAD, unix.EOWNERDEAD)
	ErrNotRecoverable = newWithHost("state not recoverable", errno.ENOTRECOVERABLE, unix.ENOTRECOVERABLE)

	// ErrWouldBlock translates to EWOULDBLOCK which is the same as EAGAIN
	// on Linux.
	ErrWouldBlock = New("operation would block", errno.EWOULDBLOCK)
)

// FromError converts a generic error to an *Error.
//
// TODO(b/34162363): Remove this function.
func FromError(err error) *Error {
	if err == nil {
		return nil
	}
	if errno, ok := err.(unix.Errno); ok {
		return FromHost(errno)
	}
	if linuxErr, ok := err.(*errors.Error); ok {
		return FromHost(unix.Errno(linuxErr.Errno()))
	}
	if errno, ok := syserror.TranslateError(err); ok {
		return FromHost(errno)
	}
	panic("unknown error: " + err.Error())
}
32 243 243 243 1 56 30 23 23 1645 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" ) // SpecialMappable implements memmap.MappingIdentity and memmap.Mappable with // semantics similar to Linux's mm/mmap.c:_install_special_mapping(), except // that SpecialMappable takes ownership of the memory that it represents // (_install_special_mapping() does not.) // // +stateify savable type SpecialMappable struct { SpecialMappableRefs mfp pgalloc.MemoryFileProvider fr memmap.FileRange name string } // NewSpecialMappable returns a SpecialMappable that owns fr, which represents // offsets in mfp.MemoryFile() that contain the SpecialMappable's data. The // SpecialMappable will use the given name in /proc/[pid]/maps. // // Preconditions: fr.Length() != 0. func NewSpecialMappable(name string, mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *SpecialMappable { m := SpecialMappable{mfp: mfp, fr: fr, name: name} m.InitRefs() return &m } // DecRef implements refs.RefCounter.DecRef. func (m *SpecialMappable) DecRef(ctx context.Context) { m.SpecialMappableRefs.DecRef(func() { m.mfp.MemoryFile().DecRef(m.fr) }) } // MappedName implements memmap.MappingIdentity.MappedName. func (m *SpecialMappable) MappedName(ctx context.Context) string { return m.name } // DeviceID implements memmap.MappingIdentity.DeviceID. func (m *SpecialMappable) DeviceID() uint64 { return 0 } // InodeID implements memmap.MappingIdentity.InodeID. func (m *SpecialMappable) InodeID() uint64 { return 0 } // Msync implements memmap.MappingIdentity.Msync. func (m *SpecialMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { // Linux: vm_file is NULL, causing msync to skip it entirely. return nil } // AddMapping implements memmap.Mappable.AddMapping. func (*SpecialMappable) AddMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, uint64, bool) error { return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (*SpecialMappable) RemoveMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, uint64, bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. 
func (*SpecialMappable) CopyMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, hostarch.AddrRange, uint64, bool) error { return nil } // Translate implements memmap.Mappable.Translate. func (m *SpecialMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { var err error if required.End > m.fr.Length() { err = &memmap.BusError{linuxerr.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ { Source: source, File: m.mfp.MemoryFile(), Offset: m.fr.Start + source.Start, Perms: hostarch.AnyAccess, }, }, err } return nil, err } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (m *SpecialMappable) InvalidateUnsavable(ctx context.Context) error { // Since data is stored in pgalloc.MemoryFile, the contents of which are // preserved across save/restore, we don't need to do anything. return nil } // MemoryFileProvider returns the MemoryFileProvider whose MemoryFile stores // the SpecialMappable's contents. func (m *SpecialMappable) MemoryFileProvider() pgalloc.MemoryFileProvider { return m.mfp } // FileRange returns the offsets into MemoryFileProvider().MemoryFile() that // store the SpecialMappable's contents. func (m *SpecialMappable) FileRange() memmap.FileRange { return m.fr } // Length returns the length of the SpecialMappable. func (m *SpecialMappable) Length() uint64 { return m.fr.Length() } // NewSharedAnonMappable returns a SpecialMappable that implements the // semantics of mmap(MAP_SHARED|MAP_ANONYMOUS) and mappings of /dev/zero. // // TODO(gvisor.dev/issue/1624): Linux uses an ephemeral file created by // mm/shmem.c:shmem_zero_setup(), and VFS2 does something analogous. VFS1 uses // a SpecialMappable instead, incorrectly getting device and inode IDs of zero // and causing memory for shared anonymous mappings to be allocated up-front // instead of on first touch; this is to avoid exacerbating the fs.MountSource // leak (b/143656263). Delete this function along with VFS1. func NewSharedAnonMappable(length uint64, mfp pgalloc.MemoryFileProvider) (*SpecialMappable, error) { if length == 0 { return nil, linuxerr.EINVAL } alignedLen, ok := hostarch.Addr(length).RoundUp() if !ok { return nil, linuxerr.EINVAL } fr, err := mfp.MemoryFile().Allocate(uint64(alignedLen), usage.Anonymous) if err != nil { return nil, err } return NewSpecialMappable("/dev/zero (deleted)", mfp, fr), nil }
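// Example (editor's sketch, not part of the original sources): a hypothetical
// helper showing how a caller holding a pgalloc.MemoryFileProvider might back
// a shared anonymous mapping with a SpecialMappable and probe a translation.
// The names mapSharedAnon, mfp, and length are illustrative assumptions.
//
//	func mapSharedAnon(ctx context.Context, mfp pgalloc.MemoryFileProvider, length uint64) (*mm.SpecialMappable, error) {
//		m, err := mm.NewSharedAnonMappable(length, mfp)
//		if err != nil {
//			return nil, err
//		}
//		// Translate returns offsets into mfp.MemoryFile(); ask for the
//		// first page with any access type.
//		page := memmap.MappableRange{Start: 0, End: hostarch.PageSize}
//		if _, err := m.Translate(ctx, page, page, hostarch.AnyAccess); err != nil {
//			m.DecRef(ctx)
//			return nil, err
//		}
//		return m, nil
//	}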
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "sync/atomic" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sync" ) // maxSyscallNum is the highest supported syscall number. // // The types below create fast lookup slices for all syscalls. This maximum // serves as a sanity check that we don't allocate huge slices for a very large // syscall number. This is checked during registration. const maxSyscallNum = 2000 // SyscallSupportLevel is a syscall support level. type SyscallSupportLevel int // String returns a human readable representation of the support level. func (l SyscallSupportLevel) String() string { switch l { case SupportUnimplemented: return "Unimplemented" case SupportPartial: return "Partial Support" case SupportFull: return "Full Support" default: return "Undocumented" } } const ( // SupportUndocumented indicates the syscall is not documented yet. SupportUndocumented = iota // SupportUnimplemented indicates the syscall is unimplemented. SupportUnimplemented // SupportPartial indicates the syscall is partially supported. SupportPartial // SupportFull indicates the syscall is fully supported. SupportFull ) // Syscall includes the syscall implementation and compatibility information. type Syscall struct { // Name is the syscall name. Name string // Fn is the implementation of the syscall. Fn SyscallFn // SupportLevel is the level of support implemented in gVisor.
SupportLevel SyscallSupportLevel // Note describes the compatibility of the syscall. Note string // URLs is a set of URLs to any relevant bugs or issues. URLs []string } // SyscallFn is a syscall implementation. type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error) // MissingFn is a syscall to be called when an implementation is missing. type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) // Possible flags for SyscallFlagsTable.enable. const ( // syscallPresent indicates that this is not a missing syscall. // // This flag is used internally in SyscallFlagsTable. syscallPresent = 1 << iota // StraceEnableLog enables syscall log tracing. StraceEnableLog // StraceEnableEvent enables syscall event tracing. StraceEnableEvent // ExternalBeforeEnable enables the external hook before syscall execution. ExternalBeforeEnable // ExternalAfterEnable enables the external hook after syscall execution. ExternalAfterEnable ) // StraceEnableBits combines both strace log and event flags. const StraceEnableBits = StraceEnableLog | StraceEnableEvent // SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall // basis. type SyscallFlagsTable struct { // mu protects writes to the fields below. // // Atomic loads are always allowed. Atomic stores are allowed only // while mu is held. mu sync.Mutex // enable contains the enable bits for each syscall. // // Missing syscalls have the same value in enable as missingEnable to // avoid an extra branch in Word. enable []uint32 // missingEnable contains the enable bits for missing syscalls. missingEnable uint32 } // init initializes the struct, with all syscalls in table set to enable. // // max is the largest syscall number in table. func (e *SyscallFlagsTable) init(table map[uintptr]Syscall, max uintptr) { e.enable = make([]uint32, max+1) for num := range table { e.enable[num] = syscallPresent } } // Word returns the enable bitfield for sysno. func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 { if sysno < uintptr(len(e.enable)) { return atomic.LoadUint32(&e.enable[sysno]) } return atomic.LoadUint32(&e.missingEnable) } // Enable sets the given enable bit for all syscalls based on s. // // Syscalls missing from s are disabled. // // Syscalls missing from the initial table passed to init cannot be added as // individual syscalls. If present in s they will be ignored. // // Callers to Word may see either the old or new value while this function // is executing. func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) { e.mu.Lock() defer e.mu.Unlock() missingVal := atomic.LoadUint32(&e.missingEnable) if missingEnable { missingVal |= bit } else { missingVal &^= bit } atomic.StoreUint32(&e.missingEnable, missingVal) for num := range e.enable { val := atomic.LoadUint32(&e.enable[num]) if !bits.IsOn32(val, syscallPresent) { // Missing. atomic.StoreUint32(&e.enable[num], missingVal) continue } if s[uintptr(num)] { val |= bit } else { val &^= bit } atomic.StoreUint32(&e.enable[num], val) } } // EnableAll sets the given enable bit for all syscalls, present and missing. func (e *SyscallFlagsTable) EnableAll(bit uint32) { e.mu.Lock() defer e.mu.Unlock() missingVal := atomic.LoadUint32(&e.missingEnable) missingVal |= bit atomic.StoreUint32(&e.missingEnable, missingVal) for num := range e.enable { val := atomic.LoadUint32(&e.enable[num]) if !bits.IsOn32(val, syscallPresent) { // Missing.
atomic.StoreUint32(&e.enable[num], missingVal) continue } val |= bit atomic.StoreUint32(&e.enable[num], val) } } // Stracer traces syscall execution. type Stracer interface { // SyscallEnter is called on syscall entry. // // The returned private data is passed to SyscallExit. SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} // SyscallExit is called on syscall exit. SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error) } // SyscallTable is a lookup table of system calls. // // Note that a SyscallTable is not savable directly. Instead, it is saved as // an OS/Arch pair and the lookup happens again on restore. type SyscallTable struct { // OS is the operating system that this syscall table implements. OS abi.OS // Arch is the architecture that this syscall table targets. Arch arch.Arch // The OS version that this syscall table implements. Version Version // AuditNumber is a numeric constant that represents the syscall table. If // non-zero, AuditNumber must be one of the AUDIT_ARCH_* values defined by // linux/audit.h. AuditNumber uint32 // Table is the collection of functions. Table map[uintptr]Syscall // lookup is a fixed-size slice that holds the syscalls (indexed by // their numbers). It is used for fast lookups. lookup []SyscallFn // Emulate is a collection of instruction addresses to emulate. The // keys are addresses, and the values are system call numbers. Emulate map[hostarch.Addr]uintptr // The function to call in case of a missing system call. Missing MissingFn // Stracer traces this syscall table. Stracer Stracer // External is used to handle an external callback. External func(*Kernel) // ExternalFilterBefore is called before External, before the syscall is executed. // External is not called if it returns false. ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool // ExternalFilterAfter is called before External, after the syscall is executed. // External is not called if it returns false. ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool // FeatureEnable stores the strace and one-shot enable bits. FeatureEnable SyscallFlagsTable } // MaxSysno returns the largest system call number. func (s *SyscallTable) MaxSysno() (max uintptr) { for num := range s.Table { if num > max { max = num } } return max } // allSyscallTables contains all known tables. var allSyscallTables []*SyscallTable // SyscallTables returns a read-only slice of registered SyscallTables. func SyscallTables() []*SyscallTable { return allSyscallTables } // LookupSyscallTable returns the SyscallTable for the OS/Arch combination. func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { for _, s := range allSyscallTables { if s.OS == os && s.Arch == a { return s, true } } return nil, false } // RegisterSyscallTable registers a new syscall table for use by a Kernel. func RegisterSyscallTable(s *SyscallTable) { if max := s.MaxSysno(); max > maxSyscallNum { panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) } if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) } allSyscallTables = append(allSyscallTables, s) s.Init() } // Init initializes the system call table. // // This should normally be called only during registration. func (s *SyscallTable) Init() { if s.Table == nil { // Ensure a non-nil syscall table.
s.Table = make(map[uintptr]Syscall) } if s.Emulate == nil { // Ensure non-nil emulate table. s.Emulate = make(map[hostarch.Addr]uintptr) } max := s.MaxSysno() // Checked during RegisterSyscallTable. // Initialize the fast-lookup table. s.lookup = make([]SyscallFn, max+1) for num, sc := range s.Table { s.lookup[num] = sc.Fn } // Initialize all features. s.FeatureEnable.init(s.Table, max) } // Lookup returns the syscall implementation, if one exists. func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { if sysno < uintptr(len(s.lookup)) { return s.lookup[sysno] } return nil } // LookupName looks up a syscall name. func (s *SyscallTable) LookupName(sysno uintptr) string { if sc, ok := s.Table[sysno]; ok { return sc.Name } return fmt.Sprintf("sys_%d", sysno) // Unlikely. } // LookupNo looks up a syscall number by name. func (s *SyscallTable) LookupNo(name string) (uintptr, error) { for i, syscall := range s.Table { if syscall.Name == name { return uintptr(i), nil } } return 0, fmt.Errorf("syscall %q not found", name) } // LookupEmulate looks up an emulation syscall number. func (s *SyscallTable) LookupEmulate(addr hostarch.Addr) (uintptr, bool) { sysno, ok := s.Emulate[addr] return sysno, ok } // mapLookup is similar to Lookup, except that it only uses the syscall table, // that is, it skips the fast-lookup slice. This is available for benchmarking. func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { if sc, ok := s.Table[sysno]; ok { return sc.Fn } return nil }
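// Example (editor's sketch, not part of the original sources): registering a
// minimal syscall table at init time. The OS/Arch/AuditNumber values, the
// syscall number 0, and the sysRead implementation are illustrative
// assumptions; RegisterSyscallTable panics on a duplicate OS/Arch pair or a
// syscall number above maxSyscallNum.
//
//	func init() {
//		kernel.RegisterSyscallTable(&kernel.SyscallTable{
//			OS:          abi.Linux,
//			Arch:        arch.AMD64,
//			AuditNumber: linux.AUDIT_ARCH_X86_64,
//			Table: map[uintptr]kernel.Syscall{
//				0: {Name: "read", Fn: sysRead, SupportLevel: kernel.SupportFull},
//			},
//		})
//	}
//
// After registration, LookupSyscallTable(abi.Linux, arch.AMD64) finds the
// table, and its Lookup(0) resolves sysRead through the fast-lookup slice.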
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "container/list" "encoding/binary" "fmt" "io" "math" "math/rand" "runtime" "strings" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) // EndpointState represents the state of a TCP endpoint. type EndpointState tcpip.EndpointState // Endpoint states. Note that these are represented in a netstack-specific manner and // may not be meaningful externally. Specifically, they need to be translated to // Linux's representation of these states if presented to userspace. const ( _ EndpointState = iota // TCP protocol states in sync with the definitions in // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13 StateEstablished StateSynSent StateSynRecv StateFinWait1 StateFinWait2 StateTimeWait StateClose StateCloseWait StateLastAck StateListen StateClosing // Endpoint states internal to netstack. StateInitial StateBound StateConnecting // Connect() called, but the initial SYN hasn't been sent. StateError ) const ( // rcvAdvWndScale is used to split the available socket buffer into // application buffer and the window to be advertised to the peer. This is // currently hard coded to split the available space equally. rcvAdvWndScale = 1 // SegOverheadFactor is used to multiply the value provided by the // user on a SetSockOpt for setting the socket send/receive buffer sizes. SegOverheadFactor = 2 ) // connected returns true when s is one of the states representing an // endpoint connected to a peer.
func (s EndpointState) connected() bool { switch s { case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: return true default: return false } } // connecting returns true when s is one of the states representing a // connection in progress, but not yet fully established. func (s EndpointState) connecting() bool { switch s { case StateConnecting, StateSynSent, StateSynRecv: return true default: return false } } // internal returns true when the state is netstack internal. func (s EndpointState) internal() bool { switch s { case StateInitial, StateBound, StateConnecting, StateError: return true default: return false } } // handshake returns true when s is one of the states representing an endpoint // in the middle of a TCP handshake. func (s EndpointState) handshake() bool { switch s { case StateSynSent, StateSynRecv: return true default: return false } } // closed returns true when s is one of the states an endpoint transitions to // when closed or when it encounters an error. This is distinct from a newly // initialized endpoint that was never connected. func (s EndpointState) closed() bool { switch s { case StateClose, StateError: return true default: return false } } // String implements fmt.Stringer.String. func (s EndpointState) String() string { switch s { case StateInitial: return "INITIAL" case StateBound: return "BOUND" case StateConnecting: return "CONNECTING" case StateError: return "ERROR" case StateEstablished: return "ESTABLISHED" case StateSynSent: return "SYN-SENT" case StateSynRecv: return "SYN-RCVD" case StateFinWait1: return "FIN-WAIT1" case StateFinWait2: return "FIN-WAIT2" case StateTimeWait: return "TIME-WAIT" case StateClose: return "CLOSED" case StateCloseWait: return "CLOSE-WAIT" case StateLastAck: return "LAST-ACK" case StateListen: return "LISTEN" case StateClosing: return "CLOSING" default: panic("unreachable") } } // Reasons for notifying the protocol goroutine. const ( notifyNonZeroReceiveWindow = 1 << iota notifyClose notifyMTUChanged notifyDrain notifyReset notifyResetByPeer // notifyAbort is a request for an expedited teardown. notifyAbort notifyKeepaliveChanged notifyMSSChanged // notifyTickleWorker is used to tickle the protocol main loop during a // restore after we update the endpoint state to the correct one. This // ensures the loop terminates if the final state of the endpoint is, // say, TIME_WAIT. notifyTickleWorker notifyError ) // SACKInfo holds TCP SACK related information for a given endpoint. // // +stateify savable type SACKInfo struct { // Blocks stores the SACK blocks we track per endpoint, up to // MaxSACKBlocks of them. Blocks [MaxSACKBlocks]header.SACKBlock // NumBlocks is the number of valid SACK blocks stored in the // blocks array above. NumBlocks int } // ReceiveErrors collects segment receive errors within the transport layer. type ReceiveErrors struct { tcpip.ReceiveErrors // SegmentQueueDropped is the number of segments dropped due to // a full segment queue. SegmentQueueDropped tcpip.StatCounter // ChecksumErrors is the number of segments dropped due to bad checksums. ChecksumErrors tcpip.StatCounter // ListenOverflowSynDrop is the number of times the listen queue overflowed // and a SYN was dropped. ListenOverflowSynDrop tcpip.StatCounter // ListenOverflowAckDrop is the number of times the final ACK // in the handshake was dropped due to overflow. ListenOverflowAckDrop tcpip.StatCounter // ZeroRcvWindowState is the number of times we advertised // a zero receive window when rcvQueue is full.
ZeroRcvWindowState tcpip.StatCounter // WantZeroRcvWindow is the number of times we wanted to advertise a // zero receive window but couldn't because it would have caused // the receive window's right edge to shrink. WantZeroRcvWindow tcpip.StatCounter } // SendErrors collects segment send errors within the transport layer. type SendErrors struct { tcpip.SendErrors // SegmentSendToNetworkFailed is the number of TCP segments that failed to be // sent to the network endpoint. SegmentSendToNetworkFailed tcpip.StatCounter // SynSendToNetworkFailed is the number of TCP SYNs that failed to be sent // to the network endpoint. SynSendToNetworkFailed tcpip.StatCounter // Retransmits is the number of TCP segments retransmitted. Retransmits tcpip.StatCounter // FastRetransmit is the number of segments retransmitted in fast // recovery. FastRetransmit tcpip.StatCounter // Timeouts is the number of times the RTO expired. Timeouts tcpip.StatCounter } // Stats holds statistics about the endpoint. type Stats struct { // SegmentsReceived is the number of TCP segments received that // the transport layer successfully parsed. SegmentsReceived tcpip.StatCounter // SegmentsSent is the number of TCP segments sent. SegmentsSent tcpip.StatCounter // FailedConnectionAttempts is the number of times we saw Connect and // Accept errors. FailedConnectionAttempts tcpip.StatCounter // ReceiveErrors collects segment receive errors within the // transport layer. ReceiveErrors ReceiveErrors // ReadErrors collects segment read errors from an endpoint read call. ReadErrors tcpip.ReadErrors // SendErrors collects segment send errors within the transport layer. SendErrors SendErrors // WriteErrors collects segment write errors from an endpoint write call. WriteErrors tcpip.WriteErrors } // IsEndpointStats is an empty method to implement the tcpip.EndpointStats // marker interface. func (*Stats) IsEndpointStats() {} // sndQueueInfo implements a send queue. // // +stateify savable type sndQueueInfo struct { sndQueueMu sync.Mutex `state:"nosave"` stack.TCPSndBufState // sndWaker is used to signal the protocol goroutine when there may be // segments that need to be sent. sndWaker sleep.Waker `state:"manual"` } // rcvQueueInfo contains the endpoint's rcvQueue and associated metadata. // // +stateify savable type rcvQueueInfo struct { rcvQueueMu sync.Mutex `state:"nosave"` stack.TCPRcvBufState // rcvQueue is the queue for ready-for-delivery segments. This struct's // mutex must be held in order to append segments to the list. rcvQueue segmentList `state:"wait"` } // +stateify savable type accepted struct { // NB: this could be an endpointList, but ilist only permits endpoints to // belong to one list at a time, and endpoints are already stored in the // dispatcher's list. endpoints list.List `state:".([]*endpoint)"` cap int } // endpoint represents a TCP endpoint. This struct serves as the interface // between users of the endpoint and the protocol implementation; it is legal to // have concurrent goroutines make calls into the endpoint, as they are properly // synchronized. The protocol implementation, however, runs in a single // goroutine. // // Each endpoint has a few mutexes: // // e.mu -> Primary mutex for an endpoint must be held for all operations except // in e.Readiness where acquiring it will result in a deadlock in the epoll // implementation. // // The following mutexes can be acquired independently of e.mu, but if // acquired together with e.mu then e.mu must be acquired first. // // e.acceptMu -> protects accepted.
// e.rcvQueueMu -> Protects e.rcvQueue and associated fields. // e.sndQueueMu -> Protects the e.sndQueue and associated fields. // e.lastErrorMu -> Protects the lastError field. // // LOCKING/UNLOCKING of the endpoint. The locking of an endpoint is different // based on the context in which the lock is acquired. In the syscall context // e.LockUser/e.UnlockUser should be used and when doing background processing // e.mu.Lock/e.mu.Unlock should be used. The distinction is described below // in brief. // // The reason for this locking behaviour is to avoid wakeups to handle packets. // In cases where the endpoint is already locked the background processor can // queue the packet up and go its merry way, and the lock owner will eventually // process the backlog when releasing the lock. Similarly, when acquiring the // lock from, say, a syscall goroutine, we can implement a bit of spinning if we // know that the lock is not held by another syscall goroutine. Background // processors should never hold the lock for long and we can avoid an expensive // sleep/wakeup by spinning for a short while. // // For more details please see the detailed documentation on // e.LockUser/e.UnlockUser methods. // // +stateify savable type endpoint struct { stack.TCPEndpointStateInner stack.TransportEndpointInfo tcpip.DefaultSocketOptionsHandler // endpointEntry is used to queue endpoints for processing to // a given TCP processor goroutine. // // Precondition: epQueue.mu must be held to read/write this field. endpointEntry `state:"nosave"` // pendingProcessing is true if this endpoint is queued for processing // to a TCP processor. // // Precondition: epQueue.mu must be held to read/write this field. pendingProcessing bool `state:"nosave"` // The following fields are initialized at creation time and do not // change throughout the lifetime of the endpoint. stack *stack.Stack `state:"manual"` waiterQueue *waiter.Queue `state:"wait"` uniqueID uint64 // hardError is meaningful only when state is StateError. It stores the // error to be returned when read/write syscalls are called and the // endpoint is in this state. hardError is protected by endpoint mu. hardError tcpip.Error // lastError represents the last error that the endpoint reported; // access to it is protected by the following mutex. lastErrorMu sync.Mutex `state:"nosave"` lastError tcpip.Error // rcvReadMu synchronizes calls to Read. // // mu and rcvQueueMu are temporarily released during data copying. rcvReadMu // must be held during each read to ensure atomicity, so that multiple reads // do not interleave. // // rcvReadMu should be held before holding mu. rcvReadMu sync.Mutex `state:"nosave"` // rcvQueueInfo holds the implementation of the endpoint's receive buffer. // The data within rcvQueueInfo should only be accessed while rcvReadMu, mu, // and rcvQueueMu are held, in that stated order. While processing the segment // range, you can determine a range and then temporarily release mu and // rcvQueueMu, which allows new segments to be appended to the queue while // processing. rcvQueueInfo rcvQueueInfo // rcvMemUsed tracks the total amount of memory in use by received segments // held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to // compute the window and the actual available buffer space. This is distinct // from rcvBufUsed above which is the actual number of payload bytes held in // the buffer not including any segment overheads. // // rcvMemUsed must be accessed atomically.
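//
// For example (editor's illustration), a reader outside the lock would use an
// atomic load rather than reading the field directly:
//
//	curMem := atomic.LoadInt32(&e.rcvMemUsed)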
rcvMemUsed int32 // mu protects all endpoint fields unless documented otherwise. mu must // be acquired before interacting with the endpoint fields. // // During handshake, mu is locked by the protocol listen goroutine and // released by the handshake completion goroutine. mu sync.CrossGoroutineMutex `state:"nosave"` ownedByUser uint32 // state must be read/set using the EndpointState()/setEndpointState() // methods. state uint32 `state:".(EndpointState)"` // origEndpointState is only used during a restore phase to save the // endpoint state at restore time as the socket is moved to its correct // state. origEndpointState uint32 `state:"nosave"` isPortReserved bool `state:"manual"` isRegistered bool `state:"manual"` boundNICID tcpip.NICID route *stack.Route `state:"manual"` ttl uint8 isConnectNotified bool // h stores a reference to the current handshake state if the endpoint is in // the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep; // it is nil otherwise. h *handshake `state:"nosave"` // portFlags stores the current values of port related flags. portFlags ports.Flags // Values used to reserve a port or register a transport endpoint // (whichever happens first). boundBindToDevice tcpip.NICID boundPortFlags ports.Flags boundDest tcpip.FullAddress // effectiveNetProtos contains the network protocols actually in use. In // most cases it will only contain "netProto", but in cases like IPv6 // endpoints with v6only set to false, this could include multiple // protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g., // IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped // address). effectiveNetProtos []tcpip.NetworkProtocolNumber // workerRunning specifies if a worker goroutine is running. workerRunning bool // workerCleanup specifies if the worker goroutine must perform cleanup // before exiting. This can only be set to true when workerRunning is // also true, and they're both protected by the mutex. workerCleanup bool // recentTSTime is the time when we last updated // TCPEndpointStateInner.RecentTS. recentTSTime tcpip.MonotonicTime // shutdownFlags represent the current shutdown state of the endpoint. shutdownFlags tcpip.ShutdownFlags // tcpRecovery is the loss detection algorithm used by TCP. tcpRecovery tcpip.TCPRecovery // sack holds TCP SACK related information for this endpoint. sack SACKInfo // delay enables Nagle's algorithm. // // delay is a boolean (0 is false) and must be accessed atomically. delay uint32 // scoreboard holds TCP SACK Scoreboard information for this endpoint. scoreboard *SACKScoreboard // segmentQueue is used to hand received segments to the protocol // goroutine. Segments are queued as long as the queue is not full, // and dropped when it is. segmentQueue segmentQueue `state:"wait"` // synRcvdCount is the number of connections for this endpoint that are // in SYN-RCVD state; this is only accessed atomically. synRcvdCount int32 // userMSS if non-zero is the MSS value explicitly set by the user // for this endpoint using the TCP_MAXSEG setsockopt. userMSS uint16 // maxSynRetries is the maximum number of SYN retransmits that TCP should // send before aborting the attempt to connect. It cannot exceed 255. // // NOTE: This is currently a no-op and does not change the SYN // retransmissions. maxSynRetries uint8 // windowClamp is used to bound the size of the advertised window to // this value. windowClamp uint32 // sndQueueInfo contains the implementation of the endpoint's send queue.
sndQueueInfo sndQueueInfo // cc stores the name of the Congestion Control algorithm to use for // this endpoint. cc tcpip.CongestionControlOption // newSegmentWaker is used to indicate to the protocol goroutine that // it needs to wake up and handle new segments queued to it. newSegmentWaker sleep.Waker `state:"manual"` // notificationWaker is used to indicate to the protocol goroutine that // it needs to wake up and check for notifications. notificationWaker sleep.Waker `state:"manual"` // notifyFlags is a bitmask of flags used to indicate to the protocol // goroutine what it was notified about; this is only accessed atomically. notifyFlags uint32 `state:"nosave"` // keepalive manages TCP keepalive state. When the connection is idle // (no data sent or received) for keepalive.idle, we start sending // keepalives every keepalive.interval. If we send keepalive.count // keepalives without hearing a response, the connection is closed. keepalive keepalive // userTimeout if non-zero specifies a user-specified timeout for // a connection with pending data to send. A connection that has pending // unacked data will be forcibly aborted if the timeout is reached // without any data being acked. userTimeout time.Duration // deferAccept if non-zero specifies a user-specified time during // which the final ACK of a handshake will be dropped provided the // ACK is a bare ACK and carries no data. If the timeout is crossed then // the bare ACK is accepted and the connection is delivered to the // listener. deferAccept time.Duration // pendingAccepted tracks connections queued to be accepted. It is used to // ensure such queued connections are terminated before the accepted queue is // marked closed (by setting its capacity to zero). pendingAccepted sync.WaitGroup `state:"nosave"` // acceptMu protects accepted. acceptMu sync.Mutex `state:"nosave"` // acceptCond is a condition variable that can be used to block on when // accepted is full and an endpoint is ready to be delivered. // // We use this condition variable to block/unblock goroutines which // tried to deliver an endpoint but couldn't because the accept backlog was // full (see endpoint.deliverAccepted). acceptCond *sync.Cond `state:"nosave"` // accepted is used by a listening endpoint protocol goroutine to // send newly accepted connections to the endpoint so that they can be // read by Accept() calls. accepted accepted // The following are only used from the protocol goroutine, and // therefore don't need locks to protect them. rcv *receiver `state:"wait"` snd *sender `state:"wait"` // The goroutine drain completion notification channel. drainDone chan struct{} `state:"nosave"` // The goroutine undrain notification channel. This is currently used as // a way to block the worker goroutines. Today nothing closes/writes // this channel and this causes any goroutines waiting on this to just // block. This is used during save/restore to prevent worker goroutines // from mutating state as it's being saved. undrain chan struct{} `state:"nosave"` // probe if not nil is invoked on every received segment. It is passed // a copy of the current state of the endpoint. probe stack.TCPProbeFunc `state:"nosave"` // The following are only used to assist the restore run to re-connect. connectingAddress tcpip.Address // amss is the advertised MSS to the peer by this endpoint. amss uint16 // sendTOS represents IPv4 TOS or IPv6 TrafficClass, // applied while sending packets. Defaults to 0 as on Linux.
sendTOS uint8 gso stack.GSO // TODO(b/142022063): Add ability to save and restore per endpoint stats. stats Stats `state:"nosave"` // tcpLingerTimeout is the maximum amount of time a socket stays in // TIME_WAIT state before being marked closed. tcpLingerTimeout time.Duration // closed indicates that the user has called Close on the // endpoint; at this point the endpoint is only around // to complete the TCP shutdown. closed bool // txHash is the transport layer hash to be set on outbound packets // emitted by this endpoint. txHash uint32 // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner // ops is used to get socket level options. ops tcpip.SocketOptions // lastOutOfWindowAckTime is the time at which an ACK was sent in response // to an out-of-window segment received by this endpoint. lastOutOfWindowAckTime tcpip.MonotonicTime } // UniqueID implements stack.TransportEndpoint.UniqueID. func (e *endpoint) UniqueID() uint64 { return e.uniqueID } // calculateAdvertisedMSS calculates the MSS to advertise. // // If userMSS is non-zero and is not greater than the maximum possible MSS for // r, it will be used; otherwise, the maximum possible MSS will be used. func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 { // The maximum possible MSS is dependent on the route. // TODO(b/143359391): Respect TCP Min and Max size. maxMSS := uint16(r.MTU() - header.TCPMinimumSize) if userMSS != 0 && userMSS < maxMSS { return userMSS } return maxMSS } // LockUser tries to lock e.mu; if that fails, it checks whether the lock is held // by another syscall goroutine. If so, it goes to sleep waiting for the // lock to be released; if not, it spins until it acquires the lock or // another syscall goroutine acquires it, in which case it goes to sleep as // described above. // // The assumption behind spinning here is that background packet processing // should not hold the lock for long, and spinning reduces latency by // avoiding an expensive sleep/wakeup of the syscall goroutine. // +checklocksacquire:e.mu func (e *endpoint) LockUser() { for { // Try the lock first. If the sock is locked, check if it's owned // by another user goroutine; if not then we spin, otherwise // we just go to sleep on the Lock() and wait. if !e.mu.TryLock() { // If the socket is owned by the user then just go to sleep // as the lock could be held for a reasonably long time. if atomic.LoadUint32(&e.ownedByUser) == 1 { e.mu.Lock() atomic.StoreUint32(&e.ownedByUser, 1) return } // Spin but yield the processor since the lower half // should yield the lock soon. runtime.Gosched() continue } atomic.StoreUint32(&e.ownedByUser, 1) return // +checklocksforce } } // UnlockUser will check if there are any segments already queued for processing // and process any such segments before unlocking e.mu. This is required because, // when packets arrive while the endpoint lock is already held, such packets // are queued up to be processed. If the lock is held by the endpoint goroutine // then it will process these packets, but if the lock is instead held by a // syscall goroutine then we can have the syscall goroutine process the backlog // before unlocking. // // This avoids an unnecessary wakeup of the endpoint protocol goroutine for the // endpoint. It's also required eventually when we get rid of the endpoint // protocol goroutine altogether.
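//
// A typical syscall-context pairing looks like this (editor's illustration):
//
//	e.LockUser()
//	defer e.UnlockUser()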
// // Precondition: e.LockUser() must have been called before calling e.UnlockUser() // +checklocksrelease:e.mu func (e *endpoint) UnlockUser() { // Lock segment queue before checking so that we avoid a race where // segments can be queued between the time we check if queue is empty // and actually unlock the endpoint mutex. for { e.segmentQueue.mu.Lock() if e.segmentQueue.emptyLocked() { if atomic.SwapUint32(&e.ownedByUser, 0) != 1 { panic("e.UnlockUser() called without calling e.LockUser()") } e.mu.Unlock() e.segmentQueue.mu.Unlock() return } e.segmentQueue.mu.Unlock() switch e.EndpointState() { case StateEstablished: if err := e.handleSegmentsLocked(true /* fastPath */); err != nil { e.notifyProtocolGoroutine(notifyTickleWorker) } default: // Since we are waking the endpoint goroutine here just unlock // and let it process the queued segments. e.newSegmentWaker.Assert() if atomic.SwapUint32(&e.ownedByUser, 0) != 1 { panic("e.UnlockUser() called without calling e.LockUser()") } e.mu.Unlock() return } } } // StopWork halts packet processing. Only to be used in tests. // +checklocksacquire:e.mu func (e *endpoint) StopWork() { e.mu.Lock() } // ResumeWork resumes packet processing. Only to be used in tests. // +checklocksrelease:e.mu func (e *endpoint) ResumeWork() { e.mu.Unlock() } // setEndpointState updates the state of the endpoint to state atomically. This // method is unexported as the only place we should update the state is in this // package but we allow the state to be read freely without holding e.mu. // // Precondition: e.mu must be held to call this method. func (e *endpoint) setEndpointState(state EndpointState) { oldstate := EndpointState(atomic.LoadUint32(&e.state)) switch state { case StateEstablished: e.stack.Stats().TCP.CurrentEstablished.Increment() e.stack.Stats().TCP.CurrentConnected.Increment() case StateError: fallthrough case StateClose: if oldstate == StateCloseWait || oldstate == StateEstablished { e.stack.Stats().TCP.EstablishedResets.Increment() } fallthrough default: if oldstate == StateEstablished { e.stack.Stats().TCP.CurrentEstablished.Decrement() } } atomic.StoreUint32(&e.state, uint32(state)) } // EndpointState returns the current state of the endpoint. func (e *endpoint) EndpointState() EndpointState { return EndpointState(atomic.LoadUint32(&e.state)) } // setRecentTimestamp sets the recentTS field to the provided value. func (e *endpoint) setRecentTimestamp(recentTS uint32) { e.RecentTS = recentTS e.recentTSTime = e.stack.Clock().NowMonotonic() } // recentTimestamp returns the value of the recentTS field. func (e *endpoint) recentTimestamp() uint32 { return e.RecentTS } // keepalive is a synchronization wrapper used to appease stateify. See the // comment in endpoint, where it is used. // // +stateify savable type keepalive struct { sync.Mutex `state:"nosave"` idle time.Duration interval time.Duration count int unacked int timer timer `state:"nosave"` waker sleep.Waker `state:"nosave"` } func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { e := &endpoint{ stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{ NetProto: netProto, TransProto: header.TCPProtocolNumber, }, sndQueueInfo: sndQueueInfo{ TCPSndBufState: stack.TCPSndBufState{ SndMTU: math.MaxInt32, }, }, waiterQueue: waiterQueue, state: uint32(StateInitial), keepalive: keepalive{ // Linux defaults. 
idle: 2 * time.Hour, interval: 75 * time.Second, count: 9, }, uniqueID: s.UniqueID(), txHash: s.Rand().Uint32(), windowClamp: DefaultReceiveBufferSize, maxSynRetries: DefaultSynRetries, } e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits) e.ops.SetMulticastLoop(true) e.ops.SetQuickAck(true) e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */) e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */) var ss tcpip.TCPSendBufferSizeRangeOption if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil { e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) } var rs tcpip.TCPReceiveBufferSizeRangeOption if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil { e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) } var cs tcpip.CongestionControlOption if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil { e.cc = cs } var mrb tcpip.TCPModerateReceiveBufferOption if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil { e.rcvQueueInfo.RcvAutoParams.Disabled = !bool(mrb) } var de tcpip.TCPDelayEnabled if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de { e.ops.SetDelayOption(true) } var tcpLT tcpip.TCPLingerTimeoutOption if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil { e.tcpLingerTimeout = time.Duration(tcpLT) } var synRetries tcpip.TCPSynRetriesOption if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil { e.maxSynRetries = uint8(synRetries) } s.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery) if p := s.GetTCPProbe(); p != nil { e.probe = p } e.segmentQueue.ep = e e.TSOffset = timeStampOffset(e.stack.Rand()) e.acceptCond = sync.NewCond(&e.acceptMu) e.keepalive.timer.init(e.stack.Clock(), &e.keepalive.waker) return e } // Readiness returns the current readiness of the endpoint. For example, if // waiter.EventIn is set, the endpoint is immediately readable. func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { result := waiter.EventMask(0) switch e.EndpointState() { case StateInitial, StateBound: // This prevents blocking of new sockets which are not // connected when SO_LINGER is set. result |= waiter.EventHUp case StateConnecting, StateSynSent, StateSynRecv: // Ready for nothing. case StateClose, StateError, StateTimeWait: // Ready for anything. result = mask case StateListen: // Check if there's anything in the accepted queue. if (mask & waiter.ReadableEvents) != 0 { e.acceptMu.Lock() if e.accepted.endpoints.Len() != 0 { result |= waiter.ReadableEvents } e.acceptMu.Unlock() } } if e.EndpointState().connected() { // Determine if the endpoint is writable if requested. if (mask & waiter.WritableEvents) != 0 { e.sndQueueInfo.sndQueueMu.Lock() sndBufSize := e.getSendBufferSize() if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize { result |= waiter.WritableEvents } e.sndQueueInfo.sndQueueMu.Unlock() } // Determine if the endpoint is readable if requested. if (mask & waiter.ReadableEvents) != 0 { e.rcvQueueInfo.rcvQueueMu.Lock() if e.rcvQueueInfo.RcvBufUsed > 0 || e.rcvQueueInfo.RcvClosed { result |= waiter.ReadableEvents } e.rcvQueueInfo.rcvQueueMu.Unlock() } } return result } func (e *endpoint) fetchNotifications() uint32 { return atomic.SwapUint32(&e.notifyFlags, 0) } func (e *endpoint) notifyProtocolGoroutine(n uint32) { for { v := atomic.LoadUint32(&e.notifyFlags) if v&n == n { // The flags are already set. 
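// Nothing more to do: every bit in n is already pending, so the notification waker was already asserted when the flags were first set.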
return } if atomic.CompareAndSwapUint32(&e.notifyFlags, v, v|n) { if v == 0 { // We are causing a transition from no flags to // at least one flag set, so we must cause the // protocol goroutine to wake up. e.notificationWaker.Assert() } return } } } // Abort implements stack.TransportEndpoint.Abort. func (e *endpoint) Abort() { // The abort notification is not processed synchronously, so no // synchronization is needed. // // If the endpoint becomes connected after this check, we still close // the endpoint. This worst case results in a slower abort. // // If the endpoint disconnected after the check, nothing needs to be // done, so sending a notification which will potentially be ignored is // fine. // // If the endpoint connecting finishes after the check, the endpoint // is either in a connected state (where we would notifyAbort anyway), // SYN-RECV (where we would also notifyAbort anyway), or in an error // state where nothing is required and the notification can be safely // ignored. // // Endpoints where a Close during connecting or SYN-RECV state would be // problematic are set to state connecting before being registered (and // thus possible to be Aborted). They are never available in initial // state. // // Endpoints transitioning from initial to connecting state may be // safely either closed or sent notifyAbort. if s := e.EndpointState(); s == StateConnecting || s == StateSynRecv || s.connected() { e.notifyProtocolGoroutine(notifyAbort) return } e.Close() } // Close puts the endpoint in a closed state and frees all resources associated // with it. It must be called only once and with no other concurrent calls to // the endpoint. func (e *endpoint) Close() { e.LockUser() defer e.UnlockUser() if e.closed { return } linger := e.SocketOptions().GetLinger() if linger.Enabled && linger.Timeout == 0 { s := e.EndpointState() isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv if isResetState { // Close the endpoint without doing full shutdown and // send a RST. e.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) e.closeNoShutdownLocked() // Wake up worker to close the endpoint. switch s { case StateSynRecv: e.notifyProtocolGoroutine(notifyClose) default: e.notifyProtocolGoroutine(notifyTickleWorker) } return } } // Issue a shutdown so that the peer knows we won't send any more data // if we're connected, or stop accepting if we're listening. e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) e.closeNoShutdownLocked() } // closeNoShutdown closes the endpoint without doing a full shutdown. func (e *endpoint) closeNoShutdownLocked() { // For listening sockets, we always release ports inline so that they // are immediately available for reuse after Close() is called. If also // registered, we unregister as well otherwise the next user would fail // in Listen() when trying to register. 
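	// Note the ordering below: the endpoint is first unregistered from the
	// demuxer and only then is its port reservation released, so a
	// concurrent Listen() cannot claim the port while this endpoint is
	// still reachable through the demuxer.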
if e.EndpointState() == StateListen && e.isPortReserved { if e.isRegistered { e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) e.isRegistered = false } portRes := ports.Reservation{ Networks: e.effectiveNetProtos, Transport: ProtocolNumber, Addr: e.TransportEndpointInfo.ID.LocalAddress, Port: e.TransportEndpointInfo.ID.LocalPort, Flags: e.boundPortFlags, BindToDevice: e.boundBindToDevice, Dest: e.boundDest, } e.stack.ReleasePort(portRes) e.isPortReserved = false e.boundBindToDevice = 0 e.boundPortFlags = ports.Flags{} e.boundDest = tcpip.FullAddress{} } // Mark endpoint as closed. e.closed = true switch e.EndpointState() { case StateClose, StateError: return } eventMask := waiter.ReadableEvents | waiter.WritableEvents // Either perform the local cleanup or kick the worker to make sure it // knows it needs to cleanup. if e.workerRunning { e.workerCleanup = true tcpip.AddDanglingEndpoint(e) // Worker will remove the dangling endpoint when the endpoint // goroutine terminates. e.notifyProtocolGoroutine(notifyClose) } else { e.transitionToStateCloseLocked() // Notify that the endpoint is closed. eventMask |= waiter.EventHUp } // The TCP closing state-machine would eventually notify EventHUp, but we // notify EventIn|EventOut immediately to unblock any blocked waiters. e.waiterQueue.Notify(eventMask) } // closePendingAcceptableConnections closes all connections that have completed // handshake but not yet been delivered to the application. func (e *endpoint) closePendingAcceptableConnectionsLocked() { e.acceptMu.Lock() acceptedCopy := e.accepted e.accepted = accepted{} e.acceptMu.Unlock() if acceptedCopy == (accepted{}) { return } e.acceptCond.Broadcast() // Reset all connections that are waiting to be accepted. for n := acceptedCopy.endpoints.Front(); n != nil; n = n.Next() { n.Value.(*endpoint).notifyProtocolGoroutine(notifyReset) } // Wait for reset of all endpoints that are still waiting to be delivered to // the now closed accepted. e.pendingAccepted.Wait() } // cleanupLocked frees all resources associated with the endpoint. It is called // after Close() is called and the worker goroutine (if any) is done with its // work. func (e *endpoint) cleanupLocked() { // Close all endpoints that might have been accepted by TCP but not by // the client. e.closePendingAcceptableConnectionsLocked() e.keepalive.timer.cleanup() e.workerCleanup = false if e.isRegistered { e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) e.isRegistered = false } if e.isPortReserved { portRes := ports.Reservation{ Networks: e.effectiveNetProtos, Transport: ProtocolNumber, Addr: e.TransportEndpointInfo.ID.LocalAddress, Port: e.TransportEndpointInfo.ID.LocalPort, Flags: e.boundPortFlags, BindToDevice: e.boundBindToDevice, Dest: e.boundDest, } e.stack.ReleasePort(portRes) e.isPortReserved = false } e.boundBindToDevice = 0 e.boundPortFlags = ports.Flags{} e.boundDest = tcpip.FullAddress{} if e.route != nil { e.route.Release() e.route = nil } e.stack.CompleteTransportEndpointCleanup(e) tcpip.DeleteDanglingEndpoint(e) } // wndFromSpace returns the window that we can advertise based on the available // receive buffer space. func wndFromSpace(space int) int { return space >> rcvAdvWndScale } // initialReceiveWindow returns the initial receive window to advertise in the // SYN/SYN-ACK. 
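// For illustration, with hypothetical numbers: an advertised MSS of 1460 and
// an InitialCwnd of 10 bound the window to 10*1460*2 = 29200 bytes, so even a
// much larger receive buffer yields a SYN advertising 29200 (rounded down to
// a multiple of the window scale).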
func (e *endpoint) initialReceiveWindow() int {
	rcvWnd := wndFromSpace(e.receiveBufferAvailable())
	if rcvWnd > math.MaxUint16 {
		rcvWnd = math.MaxUint16
	}

	// Use the user supplied MSS, if available.
	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
	if rcvWnd > routeWnd {
		rcvWnd = routeWnd
	}
	rcvWndScale := e.rcvWndScaleForHandshake()

	// Round down the rcvWnd to a multiple of wndScale. This ensures that the
	// window offered in SYN won't be reduced due to the loss of precision if
	// window scaling is enabled after the handshake.
	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)

	// Ensure we can always accept at least 1 byte if the scale specified
	// was too high for the provided rcvWnd.
	if rcvWnd == 0 {
		rcvWnd = 1
	}

	return rcvWnd
}

// ModerateRecvBuf adjusts the receive buffer and the advertised window
// based on the number of bytes copied to userspace.
func (e *endpoint) ModerateRecvBuf(copied int) {
	e.LockUser()
	defer e.UnlockUser()

	e.rcvQueueInfo.rcvQueueMu.Lock()
	if e.rcvQueueInfo.RcvAutoParams.Disabled {
		e.rcvQueueInfo.rcvQueueMu.Unlock()
		return
	}
	now := e.stack.Clock().NowMonotonic()
	if rtt := e.rcvQueueInfo.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.rcvQueueInfo.RcvAutoParams.MeasureTime) < rtt {
		e.rcvQueueInfo.RcvAutoParams.CopiedBytes += copied
		e.rcvQueueInfo.rcvQueueMu.Unlock()
		return
	}
	prevRTTCopied := e.rcvQueueInfo.RcvAutoParams.CopiedBytes + copied
	prevCopied := e.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes
	rcvWnd := 0
	if prevRTTCopied > prevCopied {
		// The minimal receive window based on what was copied by the app
		// in the immediately preceding RTT, plus some extra buffer for 16
		// segments to account for variations.
		// We multiply by 2 to account for packet losses.
		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)

		// Scale for slow start based on bytes copied in this RTT vs previous.
		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied

		// Multiply the growth factor by 2 again to account for the sender
		// being in slow start, where the sender grows its congestion window
		// by 100% per RTT.
		rcvWnd += grow * 2

		// Make sure the auto-tuned buffer size can always receive up to 2x
		// the initial window of 10 segments.
		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
			rcvWnd = minRcvWnd
		}

		// Cap the auto-tuned buffer size at the maximum permissible
		// receive buffer size.
		if max := e.maxReceiveBufferSize(); rcvWnd > max {
			rcvWnd = max
		}

		// We do not adjust downwards as that can cause the receiver to
		// reject valid data that might already be in flight as the
		// acceptable window will shrink.
		rcvBufSize := int(e.ops.GetReceiveBufferSize())
		if rcvWnd > rcvBufSize {
			availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
			e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */)
			availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd))
			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above {
				e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
			}
		}

		// We only update PrevCopiedBytes when we grow the buffer because in
		// cases where PrevCopiedBytes > prevRTTCopied the existing buffer is
		// already big enough to handle the current rate and we don't need to
		// do any adjustments.
		e.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = prevRTTCopied
	}
	e.rcvQueueInfo.RcvAutoParams.MeasureTime = now
	e.rcvQueueInfo.RcvAutoParams.CopiedBytes = 0
	e.rcvQueueInfo.rcvQueueMu.Unlock()
}

// SetOwner implements tcpip.Endpoint.SetOwner.
func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } // Preconditions: e.mu must be held to call this function. func (e *endpoint) hardErrorLocked() tcpip.Error { err := e.hardError e.hardError = nil return err } // Preconditions: e.mu must be held to call this function. func (e *endpoint) lastErrorLocked() tcpip.Error { e.lastErrorMu.Lock() defer e.lastErrorMu.Unlock() err := e.lastError e.lastError = nil return err } // LastError implements tcpip.Endpoint.LastError. func (e *endpoint) LastError() tcpip.Error { e.LockUser() defer e.UnlockUser() if err := e.hardErrorLocked(); err != nil { return err } return e.lastErrorLocked() } // LastErrorLocked reads and clears lastError with e.mu held. // Only to be used in tests. func (e *endpoint) LastErrorLocked() tcpip.Error { return e.lastErrorLocked() } // UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError. func (e *endpoint) UpdateLastError(err tcpip.Error) { e.LockUser() e.lastErrorMu.Lock() e.lastError = err e.lastErrorMu.Unlock() e.UnlockUser() } // Read implements tcpip.Endpoint.Read. func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { e.rcvReadMu.Lock() defer e.rcvReadMu.Unlock() // N.B. Here we get a range of segments to be processed. It is safe to not // hold rcvQueueMu when processing, since we hold rcvReadMu to ensure only we // can remove segments from the list through commitRead(). first, last, serr := e.startRead() if serr != nil { if _, ok := serr.(*tcpip.ErrClosedForReceive); ok { e.stats.ReadErrors.ReadClosed.Increment() } return tcpip.ReadResult{}, serr } var err error done := 0 s := first for s != nil { var n int n, err = s.data.ReadTo(dst, opts.Peek) // Book keeping first then error handling. done += n if opts.Peek { // For peek, we use the (first, last) range of segment returned from // startRead. We don't consume the receive buffer, so commitRead should // not be called. // // N.B. It is important to use `last` to determine the last segment, since // appending can happen while we process, and will lead to data race. if s == last { break } s = s.Next() } else { // N.B. commitRead() conveniently returns the next segment to read, after // removing the data/segment that is read. s = e.commitRead(n) } if err != nil { break } } // If something is read, we must report it. Report error when nothing is read. if done == 0 && err != nil { return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{} } return tcpip.ReadResult{ Count: done, Total: done, }, nil } // startRead checks that endpoint is in a readable state, and return the // inclusive range of segments that can be read. // // Precondition: e.rcvReadMu must be held. func (e *endpoint) startRead() (first, last *segment, err tcpip.Error) { e.LockUser() defer e.UnlockUser() // When in SYN-SENT state, let the caller block on the receive. // An application can initiate a non-blocking connect and then block // on a receive. It can expect to read any data after the handshake // is complete. RFC793, section 3.9, p58. if e.EndpointState() == StateSynSent { return nil, nil, &tcpip.ErrWouldBlock{} } // The endpoint can be read if it's connected, or if it's already closed // but has some pending unread data. Also note that a RST being received // would cause the state to become StateError so we should allow the // reads to proceed before returning a ECONNRESET. 
e.rcvQueueInfo.rcvQueueMu.Lock() defer e.rcvQueueInfo.rcvQueueMu.Unlock() bufUsed := e.rcvQueueInfo.RcvBufUsed if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 { if s == StateError { if err := e.hardErrorLocked(); err != nil { return nil, nil, err } return nil, nil, &tcpip.ErrClosedForReceive{} } e.stats.ReadErrors.NotConnected.Increment() return nil, nil, &tcpip.ErrNotConnected{} } if e.rcvQueueInfo.RcvBufUsed == 0 { if e.rcvQueueInfo.RcvClosed || !e.EndpointState().connected() { return nil, nil, &tcpip.ErrClosedForReceive{} } return nil, nil, &tcpip.ErrWouldBlock{} } return e.rcvQueueInfo.rcvQueue.Front(), e.rcvQueueInfo.rcvQueue.Back(), nil } // commitRead commits a read of done bytes and returns the next non-empty // segment to read. Data read from the segment must have also been removed from // the segment in order for this method to work correctly. // // It is performance critical to call commitRead frequently when servicing a big // Read request, so TCP can make progress timely. Right now, it is designed to // do this per segment read, hence this method conveniently returns the next // segment to read while holding the lock. // // Precondition: e.rcvReadMu must be held. func (e *endpoint) commitRead(done int) *segment { e.LockUser() defer e.UnlockUser() e.rcvQueueInfo.rcvQueueMu.Lock() defer e.rcvQueueInfo.rcvQueueMu.Unlock() memDelta := 0 s := e.rcvQueueInfo.rcvQueue.Front() for s != nil && s.data.Size() == 0 { e.rcvQueueInfo.rcvQueue.Remove(s) // Memory is only considered released when the whole segment has been // read. memDelta += s.segMemSize() s.decRef() s = e.rcvQueueInfo.rcvQueue.Front() } e.rcvQueueInfo.RcvBufUsed -= done if memDelta > 0 { // If the window was small before this read and if the read freed up // enough buffer space, to either fit an aMSS or half a receive buffer // (whichever smaller), then notify the protocol goroutine to send a // window update. if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above { e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow) } } return e.rcvQueueInfo.rcvQueue.Front() } // isEndpointWritableLocked checks if a given endpoint is writable // and also returns the number of bytes that can be written at this // moment. If the endpoint is not writable then it returns an error // indicating the reason why it's not writable. // Caller must hold e.mu and e.sndQueueMu func (e *endpoint) isEndpointWritableLocked() (int, tcpip.Error) { // The endpoint cannot be written to if it's not connected. switch s := e.EndpointState(); { case s == StateError: if err := e.hardErrorLocked(); err != nil { return 0, err } return 0, &tcpip.ErrClosedForSend{} case !s.connecting() && !s.connected(): return 0, &tcpip.ErrClosedForSend{} case s.connecting(): // As per RFC793, page 56, a send request arriving when in connecting // state, can be queued to be completed after the state becomes // connected. Return an error code for the caller of endpoint Write to // try again, until the connection handshake is complete. return 0, &tcpip.ErrWouldBlock{} } // Check if the connection has already been closed for sends. if e.sndQueueInfo.SndClosed { return 0, &tcpip.ErrClosedForSend{} } sndBufSize := e.getSendBufferSize() avail := sndBufSize - e.sndQueueInfo.SndBufUsed if avail <= 0 { return 0, &tcpip.ErrWouldBlock{} } return avail, nil } // readFromPayloader reads a slice from the Payloader. 
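// When opts.Atomic is false, both e.mu and sndQueueMu are released while the
// payload is being copied in, so the writable window may change underneath
// us; queueSegment re-checks writability after the locks are re-acquired.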
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) ([]byte, tcpip.Error) {
	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	if !opts.Atomic {
		e.sndQueueInfo.sndQueueMu.Unlock()
		defer e.sndQueueInfo.sndQueueMu.Lock()

		e.UnlockUser()
		defer e.LockUser()
	}

	// Fetch data.
	if l := p.Len(); l < avail {
		avail = l
	}
	if avail == 0 {
		return nil, nil
	}
	v := make([]byte, avail)
	n, err := p.Read(v)
	if err != nil && err != io.EOF {
		return nil, &tcpip.ErrBadBuffer{}
	}
	return v[:n], nil
}

// queueSegment reads data from the payloader and returns a segment to be sent.
// +checklocks:e.mu
func (e *endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) {
	e.sndQueueInfo.sndQueueMu.Lock()
	defer e.sndQueueInfo.sndQueueMu.Unlock()

	avail, err := e.isEndpointWritableLocked()
	if err != nil {
		e.stats.WriteErrors.WriteClosed.Increment()
		return nil, 0, err
	}

	v, err := e.readFromPayloader(p, opts, avail)
	if err != nil {
		return nil, 0, err
	}

	// Do not queue zero length segments.
	if len(v) == 0 {
		return nil, 0, nil
	}

	if !opts.Atomic {
		// Since we released the locks in between, it's possible that the
		// endpoint transitioned to a CLOSED/ERROR state, so make sure the
		// endpoint is still writable before trying to write.
		avail, err := e.isEndpointWritableLocked()
		if err != nil {
			e.stats.WriteErrors.WriteClosed.Increment()
			return nil, 0, err
		}

		// Discard any excess data copied in because avail was reduced by
		// a simultaneous write call to the socket.
		if avail < len(v) {
			v = v[:avail]
		}
	}

	// Add data to the send queue.
	s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), v)
	e.sndQueueInfo.SndBufUsed += len(v)
	e.snd.writeList.PushBack(s)

	return s, len(v), nil
}

// Write writes data to the endpoint's peer.
func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
	// Linux completely ignores any address passed to sendto(2) for TCP sockets
	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
	// and opts.EndOfRecord are also ignored.
	e.LockUser()
	defer e.UnlockUser()

	// Return if either we didn't queue anything or if an error occurred while
	// attempting to queue data.
	nextSeg, n, err := e.queueSegment(p, opts)
	if n == 0 || err != nil {
		return 0, err
	}

	e.sendData(nextSeg)
	return int64(n), nil
}

// selectWindowLocked returns the new window to advertise, without checking
// for shrinking and without window scaling applied.
//
// Precondition: e.mu and e.rcvQueueMu must be held.
func (e *endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) {
	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
	maxWindow := wndFromSpace(rcvBufSize)
	wndFromUsedBytes := maxWindow - e.rcvQueueInfo.RcvBufUsed

	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because
	// in cases where we receive a lot of small segments the segment overhead
	// is a lot higher and we can run out of socket buffer space before we can
	// fill the previous window we advertised. In cases where we receive
	// MSS-sized or close to MSS-sized segments we will probably run out of
	// window space before we exhaust the receive buffer.
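	// In short: the advertised window is
	// max(0, min(wndFromAvailable, wndFromUsedBytes)).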
	newWnd := wndFromAvailable
	if newWnd > wndFromUsedBytes {
		newWnd = wndFromUsedBytes
	}
	if newWnd < 0 {
		newWnd = 0
	}
	return seqnum.Size(newWnd)
}

// selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu.
func (e *endpoint) selectWindow() (wnd seqnum.Size) {
	e.rcvQueueInfo.rcvQueueMu.Lock()
	wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize()))
	e.rcvQueueInfo.rcvQueueMu.Unlock()
	return wnd
}

// windowCrossedACKThresholdLocked checks if the receive window to be announced
// would be under aMSS or under the window derived from half the receive
// buffer, whichever is smaller. This is useful as a receive-side silly window
// syndrome prevention mechanism. If the window grows to a reasonable value, we
// should send an ACK to inform the sender that the receive space is now large.
// We also want to ensure that a series of small reads won't trigger a flood of
// spurious tiny ACKs.
//
// For large receive buffers, the threshold is aMSS - once the reader reads
// more than aMSS we'll send an ACK. For tiny receive buffers, the threshold
// is half of the receive buffer size. This is chosen arbitrarily.
// crossed will be true if the window size crossed the ACK threshold.
// above will be true if the new window is >= ACK threshold and false
// otherwise.
//
// Precondition: e.mu and e.rcvQueueMu must be held.
func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) {
	newAvail := int(e.selectWindowLocked(rcvBufSize))
	oldAvail := newAvail - deltaBefore
	if oldAvail < 0 {
		oldAvail = 0
	}
	threshold := int(e.amss)
	// rcvBufFraction is the inverse of the fraction of receive buffer size
	// that is used to decide if the available buffer space is now above it.
	const rcvBufFraction = 2
	if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold {
		threshold = wndThreshold
	}

	switch {
	case oldAvail < threshold && newAvail >= threshold:
		return true, true
	case oldAvail >= threshold && newAvail < threshold:
		return true, false
	}
	return false, false
}

// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
func (e *endpoint) OnReuseAddressSet(v bool) {
	e.LockUser()
	e.portFlags.TupleOnly = v
	e.UnlockUser()
}

// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
func (e *endpoint) OnReusePortSet(v bool) {
	e.LockUser()
	e.portFlags.LoadBalanced = v
	e.UnlockUser()
}

// OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
func (e *endpoint) OnKeepAliveSet(bool) {
	e.notifyProtocolGoroutine(notifyKeepaliveChanged)
}

// OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet.
func (e *endpoint) OnDelayOptionSet(v bool) {
	if !v {
		// Handle delayed data.
		e.sndQueueInfo.sndWaker.Assert()
	}
}

// OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet.
func (e *endpoint) OnCorkOptionSet(v bool) {
	if !v {
		// Handle the corked data.
		e.sndQueueInfo.sndWaker.Assert()
	}
}

func (e *endpoint) getSendBufferSize() int {
	return int(e.ops.GetSendBufferSize())
}

// OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize.
func (e *endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64) {
	e.LockUser()
	e.rcvQueueInfo.rcvQueueMu.Lock()

	// Make sure the receive buffer size allows us to send a
	// non-zero window size.
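	// For example (hypothetical numbers): with a window scale of 7, any
	// buffer smaller than 1<<7 = 128 bytes would advertise a zero window
	// forever, so the size is clamped to at least 1<<scale below.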
	scale := uint8(0)
	if e.rcv != nil {
		scale = e.rcv.RcvWndScale
	}
	if rcvBufSz>>scale == 0 {
		rcvBufSz = 1 << scale
	}

	availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz)))
	availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz)))
	e.rcvQueueInfo.RcvAutoParams.Disabled = true

	// Immediately send an ACK to uncork the sender's silly window syndrome
	// prevention once our available space grows above aMSS or half the
	// receive buffer, whichever is smaller.
	if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above {
		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
	}

	e.rcvQueueInfo.rcvQueueMu.Unlock()
	e.UnlockUser()
	return rcvBufSz
}

// SetSockOptInt sets a socket option.
func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
	// The lower 2 bits represent the ECN bits. RFC 3168, section 23.1.
	const inetECNMask = 3

	switch opt {
	case tcpip.KeepaliveCountOption:
		e.keepalive.Lock()
		e.keepalive.count = v
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)

	case tcpip.IPv4TOSOption:
		e.LockUser()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.IPv6TrafficClassOption:
		e.LockUser()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.MaxSegOption:
		userMSS := v
		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.userMSS = uint16(userMSS)
		e.UnlockUser()
		e.notifyProtocolGoroutine(notifyMSSChanged)

	case tcpip.MTUDiscoverOption:
		// Return not supported if attempting to set this option to
		// anything other than path MTU discovery disabled.
		if v != tcpip.PMTUDiscoveryDont {
			return &tcpip.ErrNotSupported{}
		}

	case tcpip.TTLOption:
		e.LockUser()
		e.ttl = uint8(v)
		e.UnlockUser()

	case tcpip.TCPSynCountOption:
		if v < 1 || v > 255 {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.maxSynRetries = uint8(v)
		e.UnlockUser()

	case tcpip.TCPWindowClampOption:
		if v == 0 {
			e.LockUser()

			switch e.EndpointState() {
			case StateClose, StateInitial:
				e.windowClamp = 0
				e.UnlockUser()
				return nil
			default:
				e.UnlockUser()
				return &tcpip.ErrInvalidOptionValue{}
			}
		}
		var rs tcpip.TCPReceiveBufferSizeRangeOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
			if v < rs.Min/2 {
				v = rs.Min / 2
			}
		}
		e.LockUser()
		e.windowClamp = uint32(v)
		e.UnlockUser()
	}
	return nil
}

func (e *endpoint) HasNIC(id int32) bool {
	return id == 0 || e.stack.HasNIC(tcpip.NICID(id))
}

// SetSockOpt sets a socket option.
func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
	switch v := opt.(type) {
	case *tcpip.KeepaliveIdleOption:
		e.keepalive.Lock()
		e.keepalive.idle = time.Duration(*v)
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)

	case *tcpip.KeepaliveIntervalOption:
		e.keepalive.Lock()
		e.keepalive.interval = time.Duration(*v)
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)

	case *tcpip.TCPUserTimeoutOption:
		e.LockUser()
		e.userTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.CongestionControlOption:
		// Query the available cc algorithms in the stack and
		// validate that the specified algorithm is actually
		// supported in the stack.
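		// At the time of writing the stack registers "reno" and "cubic";
		// any other name fails with ErrNoSuchFile below, mirroring Linux's
		// ENOENT for an unknown algorithm.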
var avail tcpip.TCPAvailableCongestionControlOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil { return err } availCC := strings.Split(string(avail), " ") for _, cc := range availCC { if *v == tcpip.CongestionControlOption(cc) { e.LockUser() state := e.EndpointState() e.cc = *v switch state { case StateEstablished: if e.EndpointState() == state { e.snd.cc = e.snd.initCongestionControl(e.cc) } } e.UnlockUser() return nil } } // Linux returns ENOENT when an invalid congestion // control algorithm is specified. return &tcpip.ErrNoSuchFile{} case *tcpip.TCPLingerTimeoutOption: e.LockUser() switch { case *v < 0: // Same as effectively disabling TCPLinger timeout. *v = -1 case *v == 0: // Same as the stack default. var stackLingerTimeout tcpip.TCPLingerTimeoutOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil { panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err)) } *v = stackLingerTimeout case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout): // Cap it to Stack's default TCP_LINGER2 timeout. *v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout) default: } e.tcpLingerTimeout = time.Duration(*v) e.UnlockUser() case *tcpip.TCPDeferAcceptOption: e.LockUser() if time.Duration(*v) > MaxRTO { *v = tcpip.TCPDeferAcceptOption(MaxRTO) } e.deferAccept = time.Duration(*v) e.UnlockUser() case *tcpip.SocketDetachFilterOption: return nil default: return nil } return nil } // readyReceiveSize returns the number of bytes ready to be received. func (e *endpoint) readyReceiveSize() (int, tcpip.Error) { e.LockUser() defer e.UnlockUser() // The endpoint cannot be in listen state. if e.EndpointState() == StateListen { return 0, &tcpip.ErrInvalidEndpointState{} } e.rcvQueueInfo.rcvQueueMu.Lock() defer e.rcvQueueInfo.rcvQueueMu.Unlock() return e.rcvQueueInfo.RcvBufUsed, nil } // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.KeepaliveCountOption: e.keepalive.Lock() v := e.keepalive.count e.keepalive.Unlock() return v, nil case tcpip.IPv4TOSOption: e.LockUser() v := int(e.sendTOS) e.UnlockUser() return v, nil case tcpip.IPv6TrafficClassOption: e.LockUser() v := int(e.sendTOS) e.UnlockUser() return v, nil case tcpip.MaxSegOption: // This is just stubbed out. Linux never returns the user_mss // value as it either returns the defaultMSS or returns the // actual current MSS. Netstack just returns the defaultMSS // always for now. v := header.TCPDefaultMSS return v, nil case tcpip.MTUDiscoverOption: // Always return the path MTU discovery disabled setting since // it's the only one supported. 
return tcpip.PMTUDiscoveryDont, nil case tcpip.ReceiveQueueSizeOption: return e.readyReceiveSize() case tcpip.TTLOption: e.LockUser() v := int(e.ttl) e.UnlockUser() return v, nil case tcpip.TCPSynCountOption: e.LockUser() v := int(e.maxSynRetries) e.UnlockUser() return v, nil case tcpip.TCPWindowClampOption: e.LockUser() v := int(e.windowClamp) e.UnlockUser() return v, nil case tcpip.MulticastTTLOption: return 1, nil default: return -1, &tcpip.ErrUnknownProtocolOption{} } } func (e *endpoint) getTCPInfo() tcpip.TCPInfoOption { info := tcpip.TCPInfoOption{} e.LockUser() if state := e.EndpointState(); state.internal() { info.State = tcpip.EndpointState(StateClose) } else { info.State = tcpip.EndpointState(state) } snd := e.snd if snd != nil { // We do not calculate RTT before sending the data packets. If // the connection did not send and receive data, then RTT will // be zero. snd.rtt.Lock() info.RTT = snd.rtt.TCPRTTState.SRTT info.RTTVar = snd.rtt.TCPRTTState.RTTVar snd.rtt.Unlock() info.RTO = snd.RTO info.CcState = snd.state info.SndSsthresh = uint32(snd.Ssthresh) info.SndCwnd = uint32(snd.SndCwnd) info.ReorderSeen = snd.rc.Reord } e.UnlockUser() return info } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { switch o := opt.(type) { case *tcpip.TCPInfoOption: *o = e.getTCPInfo() case *tcpip.KeepaliveIdleOption: e.keepalive.Lock() *o = tcpip.KeepaliveIdleOption(e.keepalive.idle) e.keepalive.Unlock() case *tcpip.KeepaliveIntervalOption: e.keepalive.Lock() *o = tcpip.KeepaliveIntervalOption(e.keepalive.interval) e.keepalive.Unlock() case *tcpip.TCPUserTimeoutOption: e.LockUser() *o = tcpip.TCPUserTimeoutOption(e.userTimeout) e.UnlockUser() case *tcpip.CongestionControlOption: e.LockUser() *o = e.cc e.UnlockUser() case *tcpip.TCPLingerTimeoutOption: e.LockUser() *o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout) e.UnlockUser() case *tcpip.TCPDeferAcceptOption: e.LockUser() *o = tcpip.TCPDeferAcceptOption(e.deferAccept) e.UnlockUser() case *tcpip.OriginalDestinationOption: e.LockUser() ipt := e.stack.IPTables() addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto) e.UnlockUser() if err != nil { return err } *o = tcpip.OriginalDestinationOption{ Addr: addr, Port: port, } default: return &tcpip.ErrUnknownProtocolOption{} } return nil } // checkV4MappedLocked determines the effective network protocol and converts // addr to its canonical form. func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only()) if err != nil { return tcpip.FullAddress{}, 0, err } return unwrapped, netProto, nil } // Disconnect implements tcpip.Endpoint.Disconnect. func (*endpoint) Disconnect() tcpip.Error { return &tcpip.ErrNotSupported{} } // Connect connects the endpoint to its peer. func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { err := e.connect(addr, true, true) if err != nil { if !err.IgnoreStats() { // Connect failed. Let's wake up any waiters. e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() } } return err } // connect connects the endpoint to its peer. In the normal non-S/R case, the // new connection is expected to run the main goroutine and perform handshake. 
// In restore of previously connected endpoints, both ends will be passively // created (so no new handshaking is done); for stack-accepted connections not // yet accepted by the app, they are restored without running the main goroutine // here. func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) tcpip.Error { e.LockUser() defer e.UnlockUser() connectingAddr := addr.Addr addr, netProto, err := e.checkV4MappedLocked(addr) if err != nil { return err } if e.EndpointState().connected() { // The endpoint is already connected. If caller hasn't been // notified yet, return success. if !e.isConnectNotified { e.isConnectNotified = true return nil } // Otherwise return that it's already connected. return &tcpip.ErrAlreadyConnected{} } nicID := addr.NIC switch e.EndpointState() { case StateBound: // If we're already bound to a NIC but the caller is requesting // that we use a different one now, we cannot proceed. if e.boundNICID == 0 { break } if nicID != 0 && nicID != e.boundNICID { return &tcpip.ErrNoRoute{} } nicID = e.boundNICID case StateInitial: // Nothing to do. We'll eventually fill-in the gaps in the ID (if any) // when we find a route. case StateConnecting, StateSynSent, StateSynRecv: // A connection request has already been issued but hasn't completed // yet. return &tcpip.ErrAlreadyConnecting{} case StateError: if err := e.hardErrorLocked(); err != nil { return err } return &tcpip.ErrConnectionAborted{} default: return &tcpip.ErrInvalidEndpointState{} } // Find a route to the desired destination. r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */) if err != nil { return err } defer r.Release() netProtos := []tcpip.NetworkProtocolNumber{netProto} e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress() e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress() e.TransportEndpointInfo.ID.RemotePort = addr.Port if e.TransportEndpointInfo.ID.LocalPort != 0 { // The endpoint is bound to a port, attempt to register it. err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice) if err != nil { return err } } else { // The endpoint doesn't have a local port yet, so try to get // one. Make sure that it isn't one that will result in the same // address/port for both local and remote (otherwise this // endpoint would be trying to connect to itself). sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress // Calculate a port offset based on the destination IP/port and // src IP to ensure that for a given tuple (srcIP, destIP, // destPort) the offset used as a starting point is the same to // ensure that we can cycle through the port space effectively. portBuf := make([]byte, 2) binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort) h := jenkins.Sum32(e.stack.Seed()) for _, s := range [][]byte{ []byte(e.ID.LocalAddress), []byte(e.ID.RemoteAddress), portBuf, } { // Per io.Writer.Write: // // Write must return a non-nil error if it returns n < len(p). 
if _, err := h.Write(s); err != nil { panic(err) } } portOffset := h.Sum32() var twReuse tcpip.TCPTimeWaitReuseOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil { panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err)) } reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly { switch netProto { case header.IPv4ProtocolNumber: reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress) case header.IPv6ProtocolNumber: reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback } } bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, tcpip.Error) { if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort { return false, nil } portRes := ports.Reservation{ Networks: netProtos, Transport: ProtocolNumber, Addr: e.TransportEndpointInfo.ID.LocalAddress, Port: p, Flags: e.portFlags, BindToDevice: bindToDevice, Dest: addr, } if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil { if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse { return false, nil } transEPID := e.TransportEndpointInfo.ID transEPID.LocalPort = p // Check if an endpoint is registered with demuxer in TIME-WAIT and if // we can reuse it. If we can't find a transport endpoint then we just // skip using this port as it's possible that either an endpoint has // bound the port but not registered with demuxer yet (no listen/connect // done yet) or the reservation was freed between the check above and // the FindTransportEndpoint below. But rather than retry the same port // we just skip it and move on. transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, r.NICID()) if transEP == nil { // ReservePort failed but there is no registered endpoint with // demuxer. Which indicates there is at least some endpoint that has // bound the port. return false, nil } tcpEP := transEP.(*endpoint) tcpEP.LockUser() // If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but // less than 1 second has elapsed since its recentTS was updated then // we cannot reuse the port. if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second { tcpEP.UnlockUser() return false, nil } // Since the endpoint is in TIME-WAIT it should be safe to acquire its // Lock while holding the lock for this endpoint as endpoints in // TIME-WAIT do not acquire locks on other endpoints. tcpEP.workerCleanup = false tcpEP.cleanupLocked() tcpEP.notifyProtocolGoroutine(notifyAbort) tcpEP.UnlockUser() // Now try and Reserve again if it fails then we skip. 
portRes := ports.Reservation{ Networks: netProtos, Transport: ProtocolNumber, Addr: e.TransportEndpointInfo.ID.LocalAddress, Port: p, Flags: e.portFlags, BindToDevice: bindToDevice, Dest: addr, } if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil { return false, nil } } id := e.TransportEndpointInfo.ID id.LocalPort = p if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil { portRes := ports.Reservation{ Networks: netProtos, Transport: ProtocolNumber, Addr: e.TransportEndpointInfo.ID.LocalAddress, Port: p, Flags: e.portFlags, BindToDevice: bindToDevice, Dest: addr, } e.stack.ReleasePort(portRes) if _, ok := err.(*tcpip.ErrPortInUse); ok { return false, nil } return false, err } // Port picking successful. Save the details of // the selected port. e.TransportEndpointInfo.ID = id e.isPortReserved = true e.boundBindToDevice = bindToDevice e.boundPortFlags = e.portFlags e.boundDest = addr return true, nil }); err != nil { e.stack.Stats().TCP.FailedPortReservations.Increment() return err } } e.isRegistered = true e.setEndpointState(StateConnecting) r.Acquire() e.route = r e.boundNICID = nicID e.effectiveNetProtos = netProtos e.connectingAddress = connectingAddr e.initGSO() // Connect in the restore phase does not perform handshake. Restore its // connection setting here. if !handshake { e.segmentQueue.mu.Lock() for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} { for s := l.Front(); s != nil; s = s.Next() { s.id = e.TransportEndpointInfo.ID e.sndQueueInfo.sndWaker.Assert() } } e.segmentQueue.mu.Unlock() e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0) e.setEndpointState(StateEstablished) } if run { if handshake { h := e.newHandshake() e.setEndpointState(StateSynSent) h.start() } e.stack.Stats().TCP.ActiveConnectionOpenings.Increment() e.workerRunning = true go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save. } return &tcpip.ErrConnectStarted{} } // ConnectEndpoint is not supported. func (*endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error { return &tcpip.ErrInvalidEndpointState{} } // Shutdown closes the read and/or write end of the endpoint connection to its // peer. func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error { e.LockUser() defer e.UnlockUser() return e.shutdownLocked(flags) } func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error { e.shutdownFlags |= flags switch { case e.EndpointState().connected(): // Close for read. if e.shutdownFlags&tcpip.ShutdownRead != 0 { // Mark read side as closed. e.rcvQueueInfo.rcvQueueMu.Lock() e.rcvQueueInfo.RcvClosed = true rcvBufUsed := e.rcvQueueInfo.RcvBufUsed e.rcvQueueInfo.rcvQueueMu.Unlock() // If we're fully closed and we have unread data we need to abort // the connection with a RST. if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 { e.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) // Wake up worker to terminate loop. e.notifyProtocolGoroutine(notifyTickleWorker) return nil } // Wake up any readers that maybe waiting for the stream to become // readable. e.waiterQueue.Notify(waiter.ReadableEvents) } // Close for write. if e.shutdownFlags&tcpip.ShutdownWrite != 0 { e.sndQueueInfo.sndQueueMu.Lock() if e.sndQueueInfo.SndClosed { // Already closed. e.sndQueueInfo.sndQueueMu.Unlock() if e.EndpointState() == StateTimeWait { return &tcpip.ErrNotConnected{} } return nil } // Queue fin segment. 
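			// The zero-length segment queued below marks the end of the
			// stream: once SndClosed is set, the sender emits it with the
			// FIN flag.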
s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), nil) e.snd.writeList.PushBack(s) // Mark endpoint as closed. e.sndQueueInfo.SndClosed = true e.sndQueueInfo.sndQueueMu.Unlock() // Drain the send queue. e.sendData(s) // Mark send side as closed. e.snd.Closed = true // Wake up any writers that maybe waiting for the stream to become // writable. e.waiterQueue.Notify(waiter.WritableEvents) } return nil case e.EndpointState() == StateListen: if e.shutdownFlags&tcpip.ShutdownRead != 0 { // Reset all connections from the accept queue and keep the // worker running so that it can continue handling incoming // segments by replying with RST. // // By not removing this endpoint from the demuxer mapping, we // ensure that any other bind to the same port fails, as on Linux. e.rcvQueueInfo.rcvQueueMu.Lock() e.rcvQueueInfo.RcvClosed = true e.rcvQueueInfo.rcvQueueMu.Unlock() e.closePendingAcceptableConnectionsLocked() // Notify waiters that the endpoint is shutdown. e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr) } return nil default: return &tcpip.ErrNotConnected{} } } // Listen puts the endpoint in "listen" mode, which allows it to accept // new connections. func (e *endpoint) Listen(backlog int) tcpip.Error { err := e.listen(backlog) if err != nil { if !err.IgnoreStats() { e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() } } return err } func (e *endpoint) listen(backlog int) tcpip.Error { e.LockUser() defer e.UnlockUser() if e.EndpointState() == StateListen && !e.closed { e.acceptMu.Lock() defer e.acceptMu.Unlock() if e.accepted == (accepted{}) { // listen is called after shutdown. e.accepted.cap = backlog e.shutdownFlags = 0 e.rcvQueueInfo.rcvQueueMu.Lock() e.rcvQueueInfo.RcvClosed = false e.rcvQueueInfo.rcvQueueMu.Unlock() } else { // Adjust the size of the backlog iff we can fit // existing pending connections into the new one. if e.accepted.endpoints.Len() > backlog { return &tcpip.ErrInvalidEndpointState{} } e.accepted.cap = backlog } // Notify any blocked goroutines that they can attempt to // deliver endpoints again. e.acceptCond.Broadcast() return nil } if e.EndpointState() == StateInitial { // The listen is called on an unbound socket, the socket is // automatically bound to a random free port with the local // address set to INADDR_ANY. if err := e.bindLocked(tcpip.FullAddress{}); err != nil { return err } } // Endpoint must be bound before it can transition to listen mode. if e.EndpointState() != StateBound { e.stats.ReadErrors.InvalidEndpointState.Increment() return &tcpip.ErrInvalidEndpointState{} } // Register the endpoint. if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil { return err } e.isRegistered = true e.setEndpointState(StateListen) // The queue may be non-zero when we're restoring the endpoint, and it // may be pre-populated with some previously accepted (but not Accepted) // endpoints. e.acceptMu.Lock() if e.accepted == (accepted{}) { e.accepted.cap = backlog } e.acceptMu.Unlock() e.workerRunning = true go e.protocolListenLoop( // S/R-SAFE: drained on save. seqnum.Size(e.receiveBufferAvailable())) return nil } // startAcceptedLoop sets up required state and starts a goroutine with the // main loop for accepted connections. 
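// It is entered with e.mu held and releases it (hence the
// +checklocksrelease annotation below) before protocolMainLoop takes
// ownership of the endpoint.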
// +checklocksrelease:e.mu func (e *endpoint) startAcceptedLoop() { e.workerRunning = true e.mu.Unlock() wakerInitDone := make(chan struct{}) go e.protocolMainLoop(false, wakerInitDone) // S/R-SAFE: drained on save. <-wakerInitDone } // Accept returns a new endpoint if a peer has established a connection // to an endpoint previously set to listen mode. // // addr if not-nil will contain the peer address of the returned endpoint. func (e *endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { e.LockUser() defer e.UnlockUser() e.rcvQueueInfo.rcvQueueMu.Lock() rcvClosed := e.rcvQueueInfo.RcvClosed e.rcvQueueInfo.rcvQueueMu.Unlock() // Endpoint must be in listen state before it can accept connections. if rcvClosed || e.EndpointState() != StateListen { return nil, nil, &tcpip.ErrInvalidEndpointState{} } // Get the new accepted endpoint. var n *endpoint e.acceptMu.Lock() if element := e.accepted.endpoints.Front(); element != nil { n = e.accepted.endpoints.Remove(element).(*endpoint) } e.acceptMu.Unlock() if n == nil { return nil, nil, &tcpip.ErrWouldBlock{} } e.acceptCond.Signal() if peerAddr != nil { *peerAddr = n.getRemoteAddress() } return n, n.waiterQueue, nil } // Bind binds the endpoint to a specific local port and optionally address. func (e *endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) { e.LockUser() defer e.UnlockUser() return e.bindLocked(addr) } func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) { // Don't allow binding once endpoint is not in the initial state // anymore. This is because once the endpoint goes into a connected or // listen state, it is already bound. if e.EndpointState() != StateInitial { return &tcpip.ErrAlreadyBound{} } e.BindAddr = addr.Addr addr, netProto, err := e.checkV4MappedLocked(addr) if err != nil { return err } netProtos := []tcpip.NetworkProtocolNumber{netProto} // Expand netProtos to include v4 and v6 under dual-stack if the caller is // binding to a wildcard (empty) address, and this is an IPv6 endpoint with // v6only set to false. if netProto == header.IPv6ProtocolNumber { stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber) alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == "" && stackHasV4 if alsoBindToV4 { netProtos = append(netProtos, header.IPv4ProtocolNumber) } } var nic tcpip.NICID // If an address is specified, we must ensure that it's one of our // local addresses. if len(addr.Addr) != 0 { nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) if nic == 0 { return &tcpip.ErrBadLocalAddress{} } e.TransportEndpointInfo.ID.LocalAddress = addr.Addr } bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) portRes := ports.Reservation{ Networks: netProtos, Transport: ProtocolNumber, Addr: addr.Addr, Port: addr.Port, Flags: e.portFlags, BindToDevice: bindToDevice, Dest: tcpip.FullAddress{}, } port, err := e.stack.ReservePort(e.stack.Rand(), portRes, func(p uint16) (bool, tcpip.Error) { id := e.TransportEndpointInfo.ID id.LocalPort = p // CheckRegisterTransportEndpoint should only return an error if there is a // listening endpoint bound with the same id and portFlags and bindToDevice // options. // // NOTE: Only listening and connected endpoint register with // demuxer. Further connected endpoints always have a remote // address/port. Hence this will only return an error if there is a matching // listening endpoint. 
		if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil {
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return err
	}

	e.boundBindToDevice = bindToDevice
	e.boundPortFlags = e.portFlags
	// TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct.
	e.boundNICID = nic
	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.TransportEndpointInfo.ID.LocalPort = port

	// Mark endpoint as bound.
	e.setEndpointState(StateBound)

	return nil
}

// GetLocalAddress returns the address to which the endpoint is bound.
func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.LocalAddress,
		Port: e.TransportEndpointInfo.ID.LocalPort,
		NIC:  e.boundNICID,
	}, nil
}

// GetRemoteAddress returns the address to which the endpoint is connected.
func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if !e.EndpointState().connected() {
		return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
	}

	return e.getRemoteAddress(), nil
}

func (e *endpoint) getRemoteAddress() tcpip.FullAddress {
	return tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
		Port: e.TransportEndpointInfo.ID.RemotePort,
		NIC:  e.boundNICID,
	}
}

func (*endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) {
	// TCP HandlePacket is not required anymore as inbound packets first
	// land at the Dispatcher, which can then either deliver them using the
	// worker goroutine or invoke the TCP processing inline based on the
	// state of the endpoint.
}

func (e *endpoint) enqueueSegment(s *segment) bool {
	// Send packet to worker goroutine.
	if !e.segmentQueue.enqueue(s) {
		// The queue is full, so we drop the segment.
		e.stack.Stats().DroppedPackets.Increment()
		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
		return false
	}
	return true
}

func (e *endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) {
	// Update last error first.
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()

	// Update the error queue if IP_RECVERR is enabled.
	if e.SocketOptions().GetRecvError() {
		e.SocketOptions().QueueErr(&tcpip.SockError{
			Err:   err,
			Cause: transErr,
			// Linux passes the payload with the TCP header. We don't know if the TCP
			// header even exists, it may not for fragmented packets.
			Payload: pkt.Data().AsRange().ToOwnedView(),
			Dst: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.RemoteAddress,
				Port: e.TransportEndpointInfo.ID.RemotePort,
			},
			Offender: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.TransportEndpointInfo.ID.LocalAddress,
				Port: e.TransportEndpointInfo.ID.LocalPort,
			},
			NetProto: pkt.NetworkProtocolNumber,
		})
	}

	// Notify of the error.
	e.notifyProtocolGoroutine(notifyError)
}

// HandleError implements stack.TransportEndpoint.
func (e *endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) {
	handlePacketTooBig := func(mtu uint32) {
		e.sndQueueInfo.sndQueueMu.Lock()
		e.sndQueueInfo.PacketTooBigCount++
		if v := int(mtu); v < e.sndQueueInfo.SndMTU {
			e.sndQueueInfo.SndMTU = v
		}
		e.sndQueueInfo.sndQueueMu.Unlock()
		e.notifyProtocolGoroutine(notifyMTUChanged)
	}

	// TODO(gvisor.dev/issues/5270): Handle all transport errors.
switch transErr.Kind() { case stack.PacketTooBigTransportError: handlePacketTooBig(transErr.Info()) case stack.DestinationHostUnreachableTransportError: e.onICMPError(&tcpip.ErrNoRoute{}, transErr, pkt) case stack.DestinationNetworkUnreachableTransportError: e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt) } } // updateSndBufferUsage is called by the protocol goroutine when room opens up // in the send buffer. The number of newly available bytes is v. func (e *endpoint) updateSndBufferUsage(v int) { sendBufferSize := e.getSendBufferSize() e.sndQueueInfo.sndQueueMu.Lock() notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1 e.sndQueueInfo.SndBufUsed -= v // We only notify when there is half the sendBufferSize available after // a full buffer event occurs. This ensures that we don't wake up // writers to queue just 1-2 segments and go back to sleep. notify = notify && e.sndQueueInfo.SndBufUsed < sendBufferSize>>1 e.sndQueueInfo.sndQueueMu.Unlock() if notify { e.waiterQueue.Notify(waiter.WritableEvents) } } // readyToRead is called by the protocol goroutine when a new segment is ready // to be read, or when the connection is closed for receiving (in which case // s will be nil). func (e *endpoint) readyToRead(s *segment) { e.rcvQueueInfo.rcvQueueMu.Lock() if s != nil { e.rcvQueueInfo.RcvBufUsed += s.payloadSize() s.incRef() e.rcvQueueInfo.rcvQueue.PushBack(s) } else { e.rcvQueueInfo.RcvClosed = true } e.rcvQueueInfo.rcvQueueMu.Unlock() e.waiterQueue.Notify(waiter.ReadableEvents) } // receiveBufferAvailableLocked calculates how many bytes are still available // in the receive buffer. // rcvQueueMu must be held when this function is called. func (e *endpoint) receiveBufferAvailableLocked(rcvBufSize int) int { // We may use more bytes than the buffer size when the receive buffer // shrinks. memUsed := e.receiveMemUsed() if memUsed >= rcvBufSize { return 0 } return rcvBufSize - memUsed } // receiveBufferAvailable calculates how many bytes are still available in the // receive buffer based on the actual memory used by all segments held in // receive buffer/pending and segment queue. func (e *endpoint) receiveBufferAvailable() int { e.rcvQueueInfo.rcvQueueMu.Lock() available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize())) e.rcvQueueInfo.rcvQueueMu.Unlock() return available } // receiveBufferUsed returns the amount of in-use receive buffer. func (e *endpoint) receiveBufferUsed() int { e.rcvQueueInfo.rcvQueueMu.Lock() used := e.rcvQueueInfo.RcvBufUsed e.rcvQueueInfo.rcvQueueMu.Unlock() return used } // receiveMemUsed returns the total memory in use by segments held by this // endpoint. func (e *endpoint) receiveMemUsed() int { return int(atomic.LoadInt32(&e.rcvMemUsed)) } // updateReceiveMemUsed adds the provided delta to e.rcvMemUsed. func (e *endpoint) updateReceiveMemUsed(delta int) { atomic.AddInt32(&e.rcvMemUsed, int32(delta)) } // maxReceiveBufferSize returns the stack wide maximum receive buffer size for // an endpoint. func (e *endpoint) maxReceiveBufferSize() int { var rs tcpip.TCPReceiveBufferSizeRangeOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil { // As a fallback return the hardcoded max buffer size. return MaxBufferSize } return rs.Max } // rcvWndScaleForHandshake computes the receive window scale to offer to the // peer when window scaling is enabled (true by default). 
// If auto-tuning is disabled then the window scaling factor is based on the
// size of the receive buffer; otherwise we use the max permissible receive
// buffer size to compute the scale.
func (e *endpoint) rcvWndScaleForHandshake() int {
	bufSizeForScale := e.ops.GetReceiveBufferSize()

	e.rcvQueueInfo.rcvQueueMu.Lock()
	autoTuningDisabled := e.rcvQueueInfo.RcvAutoParams.Disabled
	e.rcvQueueInfo.rcvQueueMu.Unlock()
	if autoTuningDisabled {
		return FindWndScale(seqnum.Size(bufSizeForScale))
	}

	return FindWndScale(seqnum.Size(e.maxReceiveBufferSize()))
}

// updateRecentTimestamp updates the recent timestamp using the algorithm
// described in https://tools.ietf.org/html/rfc7323#section-4.3
func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
	if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
		e.setRecentTimestamp(tsVal)
	}
}

// maybeEnableTimestamp marks the timestamp option enabled for this endpoint if
// the SYN options indicate that the timestamp option was negotiated. It also
// initializes the recentTS with the value provided in synOpts.TSVal.
func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) {
	if synOpts.TS {
		e.SendTSOk = true
		e.setRecentTimestamp(synOpts.TSVal)
	}
}

// timestamp returns the timestamp value to be used in the TSVal field of the
// timestamp option for outgoing TCP segments for a given endpoint.
func (e *endpoint) timestamp() uint32 {
	return tcpTimeStamp(e.stack.Clock().NowMonotonic(), e.TSOffset)
}

// tcpTimeStamp returns a timestamp offset by the provided offset. This is
// not inlined above as it's used when SYN cookies are in use and the endpoint
// is not created at the time when the SYN cookie is sent.
func tcpTimeStamp(curTime tcpip.MonotonicTime, offset uint32) uint32 {
	d := curTime.Sub(tcpip.MonotonicTime{})
	return uint32(d.Milliseconds()) + offset
}

// timeStampOffset returns a randomized timestamp offset to be used when
// sending timestamp values in a timestamp option for a TCP segment.
func timeStampOffset(rng *rand.Rand) uint32 {
	// Initialize a random tsOffset that will be added to the recentTS
	// every time the timestamp is sent when the Timestamp option is enabled.
	//
	// See https://tools.ietf.org/html/rfc7323#section-5.4 for details on
	// why this is required.
	//
	// NOTE: This is not completely to spec as normally this should be
	// initialized in a manner analogous to how sequence numbers are
	// randomized, on a per-connection basis. But for now this is sufficient.
	return rng.Uint32()
}

// maybeEnableSACKPermitted marks the SACKPermitted option enabled for this
// endpoint if the SYN options indicate that the SACK option was negotiated
// and the TCP stack is configured to enable the TCP SACK option.
func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) {
	var v tcpip.TCPSACKEnabled
	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
		// Stack doesn't support SACK. So just return.
		return
	}
	if bool(v) && synOpts.SACKPermitted {
		e.SACKPermitted = true
	}
}

// maxOptionSize returns the maximum size of TCP options.
func (e *endpoint) maxOptionSize() (size int) {
	var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock
	options := e.makeOptions(maxSackBlocks[:])
	size = len(options)
	putOptions(options)

	return size
}

// completeStateLocked makes a full copy of the endpoint and returns it. This
// is used before invoking the probe.
//
// Precondition: e.mu must be held.
func (e *endpoint) completeStateLocked() stack.TCPEndpointState { s := stack.TCPEndpointState{ TCPEndpointStateInner: e.TCPEndpointStateInner, ID: stack.TCPEndpointID(e.TransportEndpointInfo.ID), SegTime: e.stack.Clock().NowMonotonic(), Receiver: e.rcv.TCPReceiverState, Sender: e.snd.TCPSenderState, } sndBufSize := e.getSendBufferSize() // Copy the send buffer atomically. e.sndQueueInfo.sndQueueMu.Lock() s.SndBufState = e.sndQueueInfo.TCPSndBufState s.SndBufState.SndBufSize = sndBufSize e.sndQueueInfo.sndQueueMu.Unlock() // Copy the receive buffer atomically. e.rcvQueueInfo.rcvQueueMu.Lock() s.RcvBufState = e.rcvQueueInfo.TCPRcvBufState e.rcvQueueInfo.rcvQueueMu.Unlock() // Copy the endpoint TCP Option state. s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks) copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks]) s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy() e.snd.rtt.Lock() s.Sender.RTTState = e.snd.rtt.TCPRTTState e.snd.rtt.Unlock() if cubic, ok := e.snd.cc.(*cubicState); ok { s.Sender.Cubic = cubic.TCPCubicState s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T) } s.Sender.RACKState = e.snd.rc.TCPRACKState return s } func (e *endpoint) initHardwareGSO() { switch e.route.NetProto() { case header.IPv4ProtocolNumber: e.gso.Type = stack.GSOTCPv4 e.gso.L3HdrLen = header.IPv4MinimumSize case header.IPv6ProtocolNumber: e.gso.Type = stack.GSOTCPv6 e.gso.L3HdrLen = header.IPv6MinimumSize default: panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto)) } e.gso.NeedsCsum = true e.gso.CsumOffset = header.TCPChecksumOffset e.gso.MaxSize = e.route.GSOMaxSize() } func (e *endpoint) initGSO() { if e.route.HasHardwareGSOCapability() { e.initHardwareGSO() } else if e.route.HasSoftwareGSOCapability() { e.gso = stack.GSO{ MaxSize: e.route.GSOMaxSize(), Type: stack.GSOSW, NeedsCsum: false, } } } // State implements tcpip.Endpoint.State. It exports the endpoint's protocol // state for diagnostics. func (e *endpoint) State() uint32 { return uint32(e.EndpointState()) } // Info returns a copy of the endpoint info. func (e *endpoint) Info() tcpip.EndpointInfo { e.LockUser() // Make a copy of the endpoint info. ret := e.TransportEndpointInfo e.UnlockUser() return &ret } // Stats returns a pointer to the endpoint stats. func (e *endpoint) Stats() tcpip.EndpointStats { return &e.stats } // Wait implements stack.TransportEndpoint.Wait. func (e *endpoint) Wait() { waitEntry, notifyCh := waiter.NewChannelEntry(nil) e.waiterQueue.EventRegister(&waitEntry, waiter.EventHUp) defer e.waiterQueue.EventUnregister(&waitEntry) for { e.LockUser() running := e.workerRunning e.UnlockUser() if !running { break } <-notifyCh } } // SocketOptions implements tcpip.Endpoint.SocketOptions. func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } // GetTCPSendBufferLimits is used to get send buffer size limits for TCP. func GetTCPSendBufferLimits(s tcpip.StackHandler) tcpip.SendBufferSizeOption { var ss tcpip.TCPSendBufferSizeRangeOption if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil { panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err)) } return tcpip.SendBufferSizeOption{ Min: ss.Min, Default: ss.Default, Max: ss.Max, } } // allowOutOfWindowAck returns true if an out-of-window ACK can be sent now. 
func (e *endpoint) allowOutOfWindowAck() bool {
	now := e.stack.Clock().NowMonotonic()

	if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) {
		var limit stack.TCPInvalidRateLimitOption
		if err := e.stack.Option(&limit); err != nil {
			panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err))
		}
		if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) {
			return false
		}
	}

	e.lastOutOfWindowAckTime = now
	return true
}

// GetTCPReceiveBufferLimits is used to get receive buffer size limits for
// TCP.
func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption {
	var ss tcpip.TCPReceiveBufferSizeRangeOption
	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
	}

	return tcpip.ReceiveBufferSizeOption{
		Min:     ss.Min,
		Default: ss.Default,
		Max:     ss.Max,
	}
}
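// Illustrative sketch (not part of the gVisor source): how the TSVal carried
// in the TCP timestamp option is composed, mirroring tcpTimeStamp and
// timeStampOffset above. A per-connection random offset is added to a
// monotonic millisecond clock, as recommended by RFC 7323 Section 5.4. The
// names below (tsOffset, tsVal) are local to this example.
package main

import (
	"fmt"
	"math/rand"
	"time"
)

func main() {
	start := time.Now() // stand-in for the stack's monotonic clock zero
	tsOffset := rand.New(rand.NewSource(42)).Uint32()

	// TSVal = elapsed milliseconds + random offset (wraps modulo 2^32).
	tsVal := uint32(time.Since(start).Milliseconds()) + tsOffset
	fmt.Printf("tsOffset=%d tsVal=%d\n", tsOffset, tsVal)
}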
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package gofer

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

func (d *dentry) isSymlink() bool {
	return d.fileType() == linux.S_IFLNK
}

// Precondition: d.isSymlink().
func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
	if d.fs.opts.interop != InteropModeShared {
		d.touchAtime(mnt)
		d.dataMu.Lock()
		if d.haveTarget {
			target := d.target
			d.dataMu.Unlock()
			return target, nil
		}
	}
	target, err := d.file.readlink(ctx)
	if d.fs.opts.interop != InteropModeShared {
		if err == nil {
			d.haveTarget = true
			d.target = target
		}
		d.dataMu.Unlock() // +checklocksforce: guaranteed locked from above.
	}
	return target, err
}
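// Illustrative sketch (assumed local names, not the gofer API): the caching
// pattern used by dentry.readlink above. When the filesystem is not in shared
// interop mode the symlink target is immutable, so the first fetch is
// memoized under a mutex and later calls avoid the remote round trip.
package main

import (
	"fmt"
	"sync"
)

type cachedLink struct {
	mu         sync.Mutex
	haveTarget bool
	target     string
}

// readlink returns the cached target, invoking fetch at most once.
func (c *cachedLink) readlink(fetch func() (string, error)) (string, error) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.haveTarget {
		return c.target, nil
	}
	target, err := fetch()
	if err == nil {
		c.haveTarget = true
		c.target = target
	}
	return target, err
}

func main() {
	calls := 0
	c := &cachedLink{}
	fetch := func() (string, error) { calls++; return "/tmp/target", nil }
	c.readlink(fetch)
	t, _ := c.readlink(fetch)
	fmt.Printf("target=%s remote calls=%d\n", t, calls) // remote calls=1
}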
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package pipefs provides the filesystem implementation backing
// Kernel.PipeMount.
package pipefs

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// +stateify savable
type filesystemType struct{}

// Name implements vfs.FilesystemType.Name.
func (filesystemType) Name() string {
	return "pipefs"
}

// Release implements vfs.FilesystemType.Release.
func (filesystemType) Release(ctx context.Context) {}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
	panic("pipefs.filesystemType.GetFilesystem should never be called")
}

// +stateify savable
type filesystem struct {
	kernfs.Filesystem

	devMinor uint32
}

// NewFilesystem sets up and returns a new vfs.Filesystem implemented by
// pipefs.
func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
	devMinor, err := vfsObj.GetAnonBlockDevMinor()
	if err != nil {
		return nil, err
	}
	fs := &filesystem{
		devMinor: devMinor,
	}
	fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs)
	return fs.Filesystem.VFSFilesystem(), nil
}

// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
	fs.Filesystem.Release(ctx)
}

// PrependPath implements vfs.FilesystemImpl.PrependPath.
func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
	inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode)
	b.PrependComponent(fmt.Sprintf("pipe:[%d]", inode.ino))
	return vfs.PrependPathSyntheticError{}
}

// MountOptions implements vfs.FilesystemImpl.MountOptions.
func (fs *filesystem) MountOptions() string {
	return ""
}

// inode implements kernfs.Inode.
// // +stateify savable type inode struct { kernfs.InodeNotDirectory kernfs.InodeNotSymlink kernfs.InodeNoopRefCount locks vfs.FileLocks pipe *pipe.VFSPipe ino uint64 uid auth.KUID gid auth.KGID // We use the creation timestamp for all of atime, mtime, and ctime. ctime ktime.Time } func newInode(ctx context.Context, fs *filesystem) *inode { creds := auth.CredentialsFromContext(ctx) return &inode{ pipe: pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize), ino: fs.Filesystem.NextIno(), uid: creds.EffectiveKUID, gid: creds.EffectiveKGID, ctime: ktime.NowFromContext(ctx), } } const pipeMode = 0600 | linux.S_IFIFO // CheckPermissions implements kernfs.Inode.CheckPermissions. func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { return vfs.GenericCheckPermissions(creds, ats, pipeMode, i.uid, i.gid) } // Mode implements kernfs.Inode.Mode. func (i *inode) Mode() linux.FileMode { return pipeMode } // Stat implements kernfs.Inode.Stat. func (i *inode) Stat(_ context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds()) return linux.Statx{ Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, Blksize: hostarch.PageSize, Nlink: 1, UID: uint32(i.uid), GID: uint32(i.gid), Mode: pipeMode, Ino: i.ino, Size: 0, Blocks: 0, Atime: ts, Ctime: ts, Mtime: ts, DevMajor: linux.UNNAMED_MAJOR, DevMinor: vfsfs.Impl().(*filesystem).devMinor, }, nil } // SetStat implements kernfs.Inode.SetStat. func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask == 0 { return nil } return linuxerr.EPERM } // Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return i.pipe.Open(ctx, rp.Mount(), d.VFSDentry(), opts.Flags, &i.locks) } // StatFS implements kernfs.Inode.StatFS. func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.PIPEFS_MAGIC), nil } // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read // and write ends of a newly-created pipe, as for pipe(2) and pipe2(2). // // Preconditions: mnt.Filesystem() must have been returned by NewFilesystem(). func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, *vfs.FileDescription, error) { fs := mnt.Filesystem().Impl().(*filesystem) inode := newInode(ctx, fs) var d kernfs.Dentry d.Init(&fs.Filesystem, inode) defer d.DecRef(ctx) return inode.pipe.ReaderWriterPair(ctx, mnt, d.VFSDentry(), flags) }
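// Illustrative sketch: pipefs dentries have no real path, so PrependPath
// above synthesizes the Linux-style name "pipe:[<ino>]" that applications
// observe in /proc/<pid>/fd, and returns vfs.PrependPathSyntheticError to
// mark the path as synthetic. A minimal reproduction of that formatting:
package main

import "fmt"

// pipeName mimics the component that pipefs prepends for an inode number.
func pipeName(ino uint64) string {
	return fmt.Sprintf("pipe:[%d]", ino)
}

func main() {
	fmt.Println(pipeName(3)) // pipe:[3]
}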
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sys

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/usermem"
)

func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
	k := &kcovInode{}
	k.InodeAttrs.Init(ctx, creds, 0, 0, fs.NextIno(), linux.S_IFREG|0600)
	return k
}

// kcovInode implements kernfs.Inode.
//
// +stateify savable
type kcovInode struct {
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeNotDirectory
	kernfs.InodeNotSymlink
	implStatFS
}

func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	k := kernel.KernelFromContext(ctx)
	if k == nil {
		panic("KernelFromContext returned nil")
	}
	fd := &kcovFD{
		inode: i,
		kcov:  k.NewKcov(),
	}

	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{
		DenyPRead:  true,
		DenyPWrite: true,
	}); err != nil {
		return nil, err
	}
	return &fd.vfsfd, nil
}

// +stateify savable
type kcovFD struct {
	vfs.FileDescriptionDefaultImpl
	vfs.NoLockFD

	vfsfd vfs.FileDescription
	inode *kcovInode
	kcov  *kernel.Kcov
}

// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
func (fd *kcovFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	cmd := uint32(args[1].Int())
	arg := args[2].Uint64()
	switch cmd {
	case linux.KCOV_INIT_TRACE:
		return 0, fd.kcov.InitTrace(arg)
	case linux.KCOV_ENABLE:
		return 0, fd.kcov.EnableTrace(ctx, uint8(arg))
	case linux.KCOV_DISABLE:
		if arg != 0 {
			// This arg is unused; it should be 0.
			return 0, linuxerr.EINVAL
		}
		return 0, fd.kcov.DisableTrace(ctx)
	default:
		return 0, linuxerr.ENOTTY
	}
}

// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *kcovFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
	return fd.kcov.ConfigureMMap(ctx, opts)
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *kcovFD) Release(ctx context.Context) {
	// kcov instances have reference counts in Linux, but this seems sufficient
	// for our purposes.
	fd.kcov.Clear(ctx)
}

// SetStat implements vfs.FileDescriptionImpl.SetStat.
func (fd *kcovFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() return fd.inode.SetStat(ctx, fs, creds, opts) } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *kcovFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { return fd.inode.Stat(ctx, fd.vfsfd.Mount().Filesystem(), opts) }
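// Illustrative sketch of the userspace side of the ioctl protocol that
// kcovFD.Ioctl above implements (the standard Linux KCOV flow, cf.
// Documentation/dev-tools/kcov.rst). The constants below are my computation
// of the amd64 uapi ioctl values and should be verified against kernel
// headers; error handling is abbreviated. This is a usage sketch, not gVisor
// code.
package main

import (
	"fmt"
	"os"
	"unsafe"

	"golang.org/x/sys/unix"
)

const (
	kcovInitTrace = 0x80086301 // KCOV_INIT_TRACE: _IOR('c', 1, unsigned long)
	kcovEnable    = 0x6364     // KCOV_ENABLE: _IO('c', 100)
	kcovDisable   = 0x6365     // KCOV_DISABLE: _IO('c', 101)
	kcovTracePC   = 0          // KCOV_TRACE_PC mode
	coverSize     = 64 << 10   // number of 8-byte PC slots
)

func main() {
	fd, err := os.OpenFile("/sys/kernel/debug/kcov", os.O_RDWR, 0)
	if err != nil {
		fmt.Println("kcov unavailable:", err)
		return
	}
	defer fd.Close()

	// Size the trace buffer, map it, then enable PC tracing for this task.
	unix.Syscall(unix.SYS_IOCTL, fd.Fd(), kcovInitTrace, coverSize)
	mem, _ := unix.Mmap(int(fd.Fd()), 0, coverSize*8,
		unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED)
	unix.Syscall(unix.SYS_IOCTL, fd.Fd(), kcovEnable, kcovTracePC)

	// ... run the code to be traced ...

	// Slot 0 holds the number of PCs recorded since enabling.
	n := *(*uint64)(unsafe.Pointer(&mem[0]))
	unix.Syscall(unix.SYS_IOCTL, fd.Fd(), kcovDisable, 0)
	fmt.Printf("recorded %d PCs\n", n)
}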
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package header

import (
	"encoding/binary"

	"gvisor.dev/gvisor/pkg/tcpip"
)

// ICMPv6 represents an ICMPv6 header stored in a byte array.
type ICMPv6 []byte

const (
	// ICMPv6HeaderSize is the size of the ICMPv6 header. That is, the
	// sum of the size of the ICMPv6 Type, Code and Checksum fields, as
	// per RFC 4443 section 2.1. After the ICMPv6 header, the ICMPv6
	// message body begins.
	ICMPv6HeaderSize = 4

	// ICMPv6MinimumSize is the minimum size of a valid ICMP packet.
	ICMPv6MinimumSize = 8

	// ICMPv6PayloadOffset is the offset of the payload in an
	// ICMP packet.
	ICMPv6PayloadOffset = 8

	// ICMPv6ProtocolNumber is the ICMP transport protocol number.
	ICMPv6ProtocolNumber tcpip.TransportProtocolNumber = 58

	// ICMPv6NeighborSolicitMinimumSize is the minimum size of a
	// neighbor solicitation packet.
	ICMPv6NeighborSolicitMinimumSize = ICMPv6HeaderSize + NDPNSMinimumSize

	// ICMPv6NeighborAdvertMinimumSize is the minimum size of a
	// neighbor advertisement packet.
	ICMPv6NeighborAdvertMinimumSize = ICMPv6HeaderSize + NDPNAMinimumSize

	// ICMPv6EchoMinimumSize is the minimum size of a valid echo packet.
	ICMPv6EchoMinimumSize = 8

	// ICMPv6ErrorHeaderSize is the size of an ICMP error packet header,
	// as per RFC 4443, Appendix A, item 4 and the errata.
	//   ... all ICMP error messages shall have exactly
	//   32 bits of type-specific data, so that receivers can reliably find
	//   the embedded invoking packet even when they don't recognize the
	//   ICMP message Type.
	ICMPv6ErrorHeaderSize = 8

	// ICMPv6DstUnreachableMinimumSize is the minimum size of a valid ICMP
	// destination unreachable packet.
	ICMPv6DstUnreachableMinimumSize = ICMPv6MinimumSize

	// ICMPv6PacketTooBigMinimumSize is the minimum size of a valid ICMP
	// packet-too-big packet.
	ICMPv6PacketTooBigMinimumSize = ICMPv6MinimumSize

	// icmpv6ChecksumOffset is the offset of the checksum field
	// in an ICMPv6 message.
	icmpv6ChecksumOffset = 2

	// icmpv6PointerOffset is the offset of the pointer
	// in an ICMPv6 Parameter problem message.
	icmpv6PointerOffset = 4

	// icmpv6MTUOffset is the offset of the MTU field in an ICMPv6
	// PacketTooBig message.
	icmpv6MTUOffset = 4

	// icmpv6IdentOffset is the offset of the ident field
	// in an ICMPv6 Echo Request/Reply message.
	icmpv6IdentOffset = 4

	// icmpv6SequenceOffset is the offset of the sequence field
	// in an ICMPv6 Echo Request/Reply message.
	icmpv6SequenceOffset = 6

	// NDPHopLimit is the expected IP hop limit value of 255 for received
	// NDP packets, as per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1,
	// 7.1.2 and 8.1. If the hop limit value is not 255, nodes MUST silently
	// drop the NDP packet. All outgoing NDP packets must use this value for
	// their IP hop limit field.
	NDPHopLimit = 255
)

// ICMPv6Type is the ICMP type field described in RFC 4443.
type ICMPv6Type byte

// Values for use in the Type field of ICMPv6 packets from RFC 4443.
const (
	ICMPv6DstUnreachable ICMPv6Type = 1
	ICMPv6PacketTooBig   ICMPv6Type = 2
	ICMPv6TimeExceeded   ICMPv6Type = 3
	ICMPv6ParamProblem   ICMPv6Type = 4
	ICMPv6EchoRequest    ICMPv6Type = 128
	ICMPv6EchoReply      ICMPv6Type = 129

	// Neighbor Discovery Protocol (NDP) messages, see RFC 4861.
	ICMPv6RouterSolicit   ICMPv6Type = 133
	ICMPv6RouterAdvert    ICMPv6Type = 134
	ICMPv6NeighborSolicit ICMPv6Type = 135
	ICMPv6NeighborAdvert  ICMPv6Type = 136
	ICMPv6RedirectMsg     ICMPv6Type = 137

	// Multicast Listener Discovery (MLD) messages, see RFC 2710.
	ICMPv6MulticastListenerQuery  ICMPv6Type = 130
	ICMPv6MulticastListenerReport ICMPv6Type = 131
	ICMPv6MulticastListenerDone   ICMPv6Type = 132
)

// IsErrorType returns true if the receiver is an ICMP error type.
func (typ ICMPv6Type) IsErrorType() bool {
	// Per RFC 4443 section 2.1:
	//   ICMPv6 messages are grouped into two classes: error messages and
	//   informational messages. Error messages are identified as such by a
	//   zero in the high-order bit of their message Type field values. Thus,
	//   error messages have message types from 0 to 127; informational
	//   messages have message types from 128 to 255.
	return typ&0x80 == 0
}

// ICMPv6Code is the ICMP Code field described in RFC 4443.
type ICMPv6Code byte

// ICMP codes used with Destination Unreachable (Type 1). As per RFC 4443
// section 3.1.
const (
	ICMPv6NetworkUnreachable ICMPv6Code = 0
	ICMPv6Prohibited         ICMPv6Code = 1
	ICMPv6BeyondScope        ICMPv6Code = 2
	ICMPv6AddressUnreachable ICMPv6Code = 3
	ICMPv6PortUnreachable    ICMPv6Code = 4
	ICMPv6Policy             ICMPv6Code = 5
	ICMPv6RejectRoute        ICMPv6Code = 6
)

// ICMP codes used with Time Exceeded (Type 3). As per RFC 4443 section 3.3.
const (
	ICMPv6HopLimitExceeded  ICMPv6Code = 0
	ICMPv6ReassemblyTimeout ICMPv6Code = 1
)

// ICMP codes used with Parameter Problem (Type 4). As per RFC 4443 section 3.4.
const (
	// ICMPv6ErroneousHeader indicates an erroneous header field was encountered.
	ICMPv6ErroneousHeader ICMPv6Code = 0

	// ICMPv6UnknownHeader indicates an unrecognized Next Header type was
	// encountered.
	ICMPv6UnknownHeader ICMPv6Code = 1

	// ICMPv6UnknownOption indicates an unrecognized IPv6 option was encountered.
	ICMPv6UnknownOption ICMPv6Code = 2
)

// ICMPv6UnusedCode is the code value used with ICMPv6 messages which don't use
// the code field. (Types not mentioned above.)
const ICMPv6UnusedCode ICMPv6Code = 0

// Type is the ICMP type field.
func (b ICMPv6) Type() ICMPv6Type { return ICMPv6Type(b[0]) }

// SetType sets the ICMP type field.
func (b ICMPv6) SetType(t ICMPv6Type) { b[0] = byte(t) }

// Code is the ICMP code field. Its meaning depends on the value of Type.
func (b ICMPv6) Code() ICMPv6Code { return ICMPv6Code(b[1]) }

// SetCode sets the ICMP code field.
func (b ICMPv6) SetCode(c ICMPv6Code) { b[1] = byte(c) }

// TypeSpecific returns the type-specific data field.
func (b ICMPv6) TypeSpecific() uint32 {
	return binary.BigEndian.Uint32(b[icmpv6PointerOffset:])
}

// SetTypeSpecific sets the type-specific data field.
func (b ICMPv6) SetTypeSpecific(val uint32) {
	binary.BigEndian.PutUint32(b[icmpv6PointerOffset:], val)
}

// Checksum is the ICMP checksum field.
func (b ICMPv6) Checksum() uint16 {
	return binary.BigEndian.Uint16(b[icmpv6ChecksumOffset:])
}

// SetChecksum sets the ICMP checksum field.
func (b ICMPv6) SetChecksum(checksum uint16) {
	binary.BigEndian.PutUint16(b[icmpv6ChecksumOffset:], checksum)
}

// SourcePort implements Transport.SourcePort.
func (ICMPv6) SourcePort() uint16 {
	return 0
}

// DestinationPort implements Transport.DestinationPort.
func (ICMPv6) DestinationPort() uint16 {
	return 0
}

// SetSourcePort implements Transport.SetSourcePort.
func (ICMPv6) SetSourcePort(uint16) {
}

// SetDestinationPort implements Transport.SetDestinationPort.
func (ICMPv6) SetDestinationPort(uint16) {
}

// MTU retrieves the MTU field from an ICMPv6 message.
func (b ICMPv6) MTU() uint32 {
	return binary.BigEndian.Uint32(b[icmpv6MTUOffset:])
}

// SetMTU sets the MTU field in an ICMPv6 message.
func (b ICMPv6) SetMTU(mtu uint32) {
	binary.BigEndian.PutUint32(b[icmpv6MTUOffset:], mtu)
}

// Ident retrieves the Ident field from an ICMPv6 message.
func (b ICMPv6) Ident() uint16 {
	return binary.BigEndian.Uint16(b[icmpv6IdentOffset:])
}

// SetIdent sets the Ident field in an ICMPv6 message.
func (b ICMPv6) SetIdent(ident uint16) {
	binary.BigEndian.PutUint16(b[icmpv6IdentOffset:], ident)
}

// Sequence retrieves the Sequence field from an ICMPv6 message.
func (b ICMPv6) Sequence() uint16 {
	return binary.BigEndian.Uint16(b[icmpv6SequenceOffset:])
}

// SetSequence sets the Sequence field in an ICMPv6 message.
func (b ICMPv6) SetSequence(sequence uint16) {
	binary.BigEndian.PutUint16(b[icmpv6SequenceOffset:], sequence)
}

// MessageBody returns the message body as defined by RFC 4443 section 2.1; the
// portion of the ICMPv6 buffer after the first ICMPv6HeaderSize bytes.
func (b ICMPv6) MessageBody() []byte {
	return b[ICMPv6HeaderSize:]
}

// Payload implements Transport.Payload.
func (b ICMPv6) Payload() []byte {
	return b[ICMPv6PayloadOffset:]
}

// ICMPv6ChecksumParams contains parameters to calculate the ICMPv6 checksum.
type ICMPv6ChecksumParams struct {
	Header      ICMPv6
	Src         tcpip.Address
	Dst         tcpip.Address
	PayloadCsum uint16
	PayloadLen  int
}

// ICMPv6Checksum calculates the ICMP checksum over the provided ICMPv6 header,
// IPv6 src/dst addresses and the payload.
func ICMPv6Checksum(params ICMPv6ChecksumParams) uint16 {
	h := params.Header

	xsum := PseudoHeaderChecksum(ICMPv6ProtocolNumber, params.Src, params.Dst, uint16(len(h)+params.PayloadLen))
	xsum = ChecksumCombine(xsum, params.PayloadCsum)

	// h[2:4] is the checksum itself, skip it to avoid checksumming the checksum.
	xsum = Checksum(h[:2], xsum)
	xsum = Checksum(h[4:], xsum)

	return ^xsum
}
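// Illustrative sketch using the accessors above: build a minimal ICMPv6 Echo
// Request and fill in its checksum with ICMPv6Checksum. This assumes the era
// of the tcpip API in which tcpip.Address is a 16-byte string for IPv6; the
// link-local addresses below are examples only.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
)

func main() {
	src := tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
	dst := tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")

	icmp := header.ICMPv6(make([]byte, header.ICMPv6EchoMinimumSize))
	icmp.SetType(header.ICMPv6EchoRequest)
	icmp.SetCode(header.ICMPv6UnusedCode)
	icmp.SetIdent(42)
	icmp.SetSequence(1)

	// No payload beyond the 8-byte echo header, so PayloadCsum and
	// PayloadLen stay zero; the checksum field itself is skipped by the
	// function.
	icmp.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{
		Header: icmp,
		Src:    src,
		Dst:    dst,
	}))
	fmt.Printf("checksum=%#04x\n", icmp.Checksum())
}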
open /syzkaller/managers/ci-gvisor-ptrace-1-race-cover/kernel/pkg/sentry/fs/lock/lock_range.go: no such file or directory
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64 i386

package fpu

import (
	"io"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/cpuid"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sync"
)

// initX86FPState (defined in asm files) sets up initial state.
func initX86FPState(data *byte, useXsave bool)

func newX86FPStateSlice() State {
	size, align := cpuid.HostFeatureSet().ExtendedStateSize()
	capacity := size
	// Always use at least 4096 bytes.
	//
	// For the KVM platform, this state is a fixed 4096 bytes, so make sure
	// that the underlying array is at _least_ that size otherwise we will
	// corrupt random memory. This is not a pleasant thing to debug.
	if capacity < 4096 {
		capacity = 4096
	}
	return alignedBytes(capacity, align)[:size]
}

// NewState returns an initialized floating point state.
//
// The returned state is large enough to store all floating point state
// supported by host, even if the app won't use much of it due to a restricted
// FeatureSet. Since they may still be able to see state not advertised by
// CPUID we must ensure it does not contain any sentry state.
func NewState() State {
	f := newX86FPStateSlice()
	initX86FPState(&f[0], cpuid.HostFeatureSet().UseXsave())
	return f
}

// Fork creates and returns an identical copy of the x86 floating point state.
func (s *State) Fork() State {
	n := newX86FPStateSlice()
	copy(n, *s)
	return n
}

// ptraceFPRegsSize is the size in bytes of Linux's user_i387_struct, the type
// manipulated by PTRACE_GETFPREGS and PTRACE_SETFPREGS on x86. Equivalently,
// ptraceFPRegsSize is the size in bytes of the x86 FXSAVE area.
const ptraceFPRegsSize = 512

// PtraceGetFPRegs implements Context.PtraceGetFPRegs.
func (s *State) PtraceGetFPRegs(dst io.Writer, maxlen int) (int, error) { if maxlen < ptraceFPRegsSize { return 0, linuxerr.EFAULT } return dst.Write((*s)[:ptraceFPRegsSize]) } // PtraceSetFPRegs implements Context.PtraceSetFPRegs. func (s *State) PtraceSetFPRegs(src io.Reader, maxlen int) (int, error) { if maxlen < ptraceFPRegsSize { return 0, linuxerr.EFAULT } var f [ptraceFPRegsSize]byte n, err := io.ReadFull(src, f[:]) if err != nil { return 0, err } // Force reserved bits in MXCSR to 0. This is consistent with Linux. sanitizeMXCSR(State(f[:])) // N.B. this only copies the beginning of the FP state, which // corresponds to the FXSAVE area. copy(*s, f[:]) return n, nil } const ( // mxcsrOffset is the offset in bytes of the MXCSR field from the start of // the FXSAVE area. (Intel SDM Vol. 1, Table 10-2 "Format of an FXSAVE // Area") mxcsrOffset = 24 // mxcsrMaskOffset is the offset in bytes of the MXCSR_MASK field from the // start of the FXSAVE area. mxcsrMaskOffset = 28 ) var ( mxcsrMask uint32 initMXCSRMask sync.Once ) const ( // minXstateBytes is the minimum size in bytes of an x86 XSAVE area, equal // to the size of the XSAVE legacy area (512 bytes) plus the size of the // XSAVE header (64 bytes). Equivalently, minXstateBytes is GDB's // X86_XSTATE_SSE_SIZE. minXstateBytes = 512 + 64 // userXstateXCR0Offset is the offset in bytes of the USER_XSTATE_XCR0_WORD // field in Linux's struct user_xstateregs, which is the type manipulated // by ptrace(PTRACE_GET/SETREGSET, NT_X86_XSTATE). Equivalently, // userXstateXCR0Offset is GDB's I386_LINUX_XSAVE_XCR0_OFFSET. userXstateXCR0Offset = 464 // xstateBVOffset is the offset in bytes of the XSTATE_BV field in an x86 // XSAVE area. xstateBVOffset = 512 // xsaveHeaderZeroedOffset and xsaveHeaderZeroedBytes indicate parts of the // XSAVE header that we coerce to zero: "Bytes 15:8 of the XSAVE header is // a state-component bitmap called XCOMP_BV. ... Bytes 63:16 of the XSAVE // header are reserved." - Intel SDM Vol. 1, Section 13.4.2 "XSAVE Header". // Linux ignores XCOMP_BV, but it's able to recover from XRSTOR #GP // exceptions resulting from invalid values; we aren't. Linux also never // uses the compacted format when doing XSAVE and doesn't even define the // compaction extensions to XSAVE as a CPU feature, so for simplicity we // assume no one is using them. xsaveHeaderZeroedOffset = 512 + 8 xsaveHeaderZeroedBytes = 64 - 8 ) // sanitizeMXCSR coerces reserved bits in the MXCSR field of f to 0. ("FXRSTOR // generates a general-protection fault (#GP) in response to an attempt to set // any of the reserved bits of the MXCSR register." - Intel SDM Vol. 1, Section // 10.5.1.2 "SSE State") func sanitizeMXCSR(f State) { mxcsr := hostarch.ByteOrder.Uint32(f[mxcsrOffset:]) initMXCSRMask.Do(func() { temp := State(alignedBytes(uint(ptraceFPRegsSize), 16)) initX86FPState(&temp[0], false /* useXsave */) mxcsrMask = hostarch.ByteOrder.Uint32(temp[mxcsrMaskOffset:]) if mxcsrMask == 0 { // "If the value of the MXCSR_MASK field is 00000000H, then the // MXCSR_MASK value is the default value of 0000FFBFH." - Intel SDM // Vol. 1, Section 11.6.6 "Guidelines for Writing to the MXCSR // Register" mxcsrMask = 0xffbf } }) mxcsr &= mxcsrMask hostarch.ByteOrder.PutUint32(f[mxcsrOffset:], mxcsr) } // PtraceGetXstateRegs implements ptrace(PTRACE_GETREGS, NT_X86_XSTATE) by // writing the floating point registers from this state to dst and returning the // number of bytes written, which must be less than or equal to maxlen. 
func (s *State) PtraceGetXstateRegs(dst io.Writer, maxlen int, featureSet *cpuid.FeatureSet) (int, error) {
	// N.B. s.x86FPState may contain more state than the application
	// expects. We only copy the subset that would be in their XSAVE area.
	ess, _ := featureSet.ExtendedStateSize()
	f := make([]byte, ess)
	copy(f, *s)
	// "The XSAVE feature set does not use bytes 511:416; bytes 463:416 are
	// reserved." - Intel SDM Vol 1., Section 13.4.1 "Legacy Region of an XSAVE
	// Area". Linux uses the first 8 bytes of this area to store the OS XSTATE
	// mask. GDB relies on this: see
	// gdb/x86-linux-nat.c:x86_linux_read_description().
	hostarch.ByteOrder.PutUint64(f[userXstateXCR0Offset:], featureSet.ValidXCR0Mask())
	if len(f) > maxlen {
		f = f[:maxlen]
	}
	return dst.Write(f)
}

// PtraceSetXstateRegs implements ptrace(PTRACE_SETREGS, NT_X86_XSTATE) by
// reading floating point registers from src and returning the number of bytes
// read, which must be less than or equal to maxlen.
func (s *State) PtraceSetXstateRegs(src io.Reader, maxlen int, featureSet *cpuid.FeatureSet) (int, error) {
	// Allow users to pass an xstate register set smaller than ours (they can
	// mask bits out of XSTATE_BV), as long as it's at least minXstateBytes.
	// Also allow users to pass a register set larger than ours; anything after
	// their ExtendedStateSize will be ignored. (I think Linux technically
	// permits setting a register set smaller than minXstateBytes, but it has
	// the same silent truncation behavior in kernel/ptrace.c:ptrace_regset().)
	if maxlen < minXstateBytes {
		return 0, unix.EFAULT
	}
	ess, _ := featureSet.ExtendedStateSize()
	if maxlen > int(ess) {
		maxlen = int(ess)
	}
	f := make([]byte, maxlen)
	if _, err := io.ReadFull(src, f); err != nil {
		return 0, err
	}
	// Force reserved bits in MXCSR to 0. This is consistent with Linux.
	sanitizeMXCSR(State(f))
	// Users can't enable *more* XCR0 bits than what we, and the CPU, support.
	xstateBV := hostarch.ByteOrder.Uint64(f[xstateBVOffset:])
	xstateBV &= featureSet.ValidXCR0Mask()
	hostarch.ByteOrder.PutUint64(f[xstateBVOffset:], xstateBV)
	// Force XCOMP_BV and reserved bytes in the XSAVE header to 0.
	reserved := f[xsaveHeaderZeroedOffset : xsaveHeaderZeroedOffset+xsaveHeaderZeroedBytes]
	for i := range reserved {
		reserved[i] = 0
	}
	return copy(*s, f), nil
}

// SetMXCSR sets the MXCSR control/status register in the state.
func (s *State) SetMXCSR(mxcsr uint32) {
	hostarch.ByteOrder.PutUint32((*s)[mxcsrOffset:], mxcsr)
}

// BytePointer returns a pointer to the first byte of the state.
//
//go:nosplit
func (s *State) BytePointer() *byte {
	return &(*s)[0]
}

// XSTATE_BV does not exist if FXSAVE is used, but FXSAVE implicitly saves x87
// and SSE state, so this is the equivalent XSTATE_BV value.
const fxsaveBV uint64 = cpuid.XSAVEFeatureX87 | cpuid.XSAVEFeatureSSE

// AfterLoad converts the loaded state to a format that is compatible with the
// current processor.
func (s *State) AfterLoad() {
	old := *s

	// Recreate the slice. This is done to ensure that it is aligned
	// appropriately in memory, and large enough to accommodate any new
	// state that may be saved by the new CPU. Even if extraneous new state
	// is saved, the state we care about is guaranteed to be a subset of
	// new state. Later optimizations can use less space when using a
	// smaller state component bitmap. Intel SDM Volume 1 Chapter 13 has
	// more info.
	*s = NewState()

	// x86FPState always contains all the FP state supported by the host.
// We may have come from a newer machine that supports additional state // which we cannot restore. // // The x86 FP state areas are backwards compatible, so we can simply // truncate the additional floating point state. // // Applications should not depend on the truncated state because it // should relate only to features that were not exposed in the app // FeatureSet. However, because we do not *prevent* them from using // this state, we must verify here that there is no in-use state // (according to XSTATE_BV) which we do not support. if len(*s) < len(old) { // What do we support? supportedBV := fxsaveBV if fs := cpuid.HostFeatureSet(); fs.UseXsave() { supportedBV = fs.ValidXCR0Mask() } // What was in use? savedBV := fxsaveBV if len(old) >= xstateBVOffset+8 { savedBV = hostarch.ByteOrder.Uint64(old[xstateBVOffset:]) } // Supported features must be a superset of saved features. if savedBV&^supportedBV != 0 { panic(ErrLoadingState{supportedFeatures: supportedBV, savedFeatures: savedBV}) } } // Copy to the new, aligned location. copy(*s, old) }
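// Illustrative sketch of the feature-compatibility test in AfterLoad above:
// restoring is safe only when the saved XSTATE_BV bits are a subset of what
// the current host can restore. The x87/SSE bit values mirror the XSAVE
// feature bits (0x1 and 0x2); the AVX bit is included only as an example.
package main

import "fmt"

const (
	featX87 = 1 << 0
	featSSE = 1 << 1
	featAVX = 1 << 2
)

// canRestore reports whether every saved state component is supported,
// i.e. whether savedBV has no bits outside supportedBV.
func canRestore(savedBV, supportedBV uint64) bool {
	return savedBV&^supportedBV == 0
}

func main() {
	fmt.Println(canRestore(featX87|featSSE, featX87|featSSE))         // true
	fmt.Println(canRestore(featX87|featSSE|featAVX, featX87|featSSE)) // false: AVX state saved but unsupported
}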
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/bpf"
	"gvisor.dev/gvisor/pkg/cleanup"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/usermem"
)

// SharingOptions controls what resources are shared by a new task created by
// Task.Clone, or an existing task affected by Task.Unshare.
type SharingOptions struct { // If NewAddressSpace is true, the task should have an independent virtual // address space. NewAddressSpace bool // If NewSignalHandlers is true, the task should use an independent set of // signal handlers. NewSignalHandlers bool // If NewThreadGroup is true, the task should be the leader of its own // thread group. TerminationSignal is the signal that the thread group // will send to its parent when it exits. If NewThreadGroup is false, // TerminationSignal is ignored. NewThreadGroup bool TerminationSignal linux.Signal // If NewPIDNamespace is true: // // - In the context of Task.Clone, the new task should be the init task // (TID 1) in a new PID namespace. // // - In the context of Task.Unshare, the task should create a new PID // namespace, and all subsequent clones of the task should be members of // the new PID namespace. NewPIDNamespace bool // If NewUserNamespace is true, the task should have an independent user // namespace. NewUserNamespace bool // If NewNetworkNamespace is true, the task should have an independent // network namespace. NewNetworkNamespace bool // If NewFiles is true, the task should use an independent file descriptor // table. NewFiles bool // If NewFSContext is true, the task should have an independent FSContext. NewFSContext bool // If NewUTSNamespace is true, the task should have an independent UTS // namespace. NewUTSNamespace bool // If NewIPCNamespace is true, the task should have an independent IPC // namespace. NewIPCNamespace bool } // CloneOptions controls the behavior of Task.Clone. type CloneOptions struct { // SharingOptions defines the set of resources that the new task will share // with its parent. SharingOptions // Stack is the initial stack pointer of the new task. If Stack is 0, the // new task will start with the same stack pointer as its parent. Stack hostarch.Addr // If SetTLS is true, set the new task's TLS (thread-local storage) // descriptor to TLS. If SetTLS is false, TLS is ignored. SetTLS bool TLS hostarch.Addr // If ChildClearTID is true, when the child exits, 0 is written to the // address ChildTID in the child's memory, and if the write is successful a // futex wake on the same address is performed. // // If ChildSetTID is true, the child's thread ID (in the child's PID // namespace) is written to address ChildTID in the child's memory. (As in // Linux, failed writes are silently ignored.) ChildClearTID bool ChildSetTID bool ChildTID hostarch.Addr // If ParentSetTID is true, the child's thread ID (in the parent's PID // namespace) is written to address ParentTID in the parent's memory. (As // in Linux, failed writes are silently ignored.) // // Older versions of the clone(2) man page state that CLONE_PARENT_SETTID // causes the child's thread ID to be written to ptid in both the parent // and child's memory, but this is a documentation error fixed by // 87ab04792ced ("clone.2: Fix description of CLONE_PARENT_SETTID"). ParentSetTID bool ParentTID hostarch.Addr // If Vfork is true, place the parent in vforkStop until the cloned task // releases its TaskImage. Vfork bool // If Untraced is true, do not report PTRACE_EVENT_CLONE/FORK/VFORK for // this clone(), and do not ptrace-attach the caller's tracer to the new // task. (PTRACE_EVENT_VFORK_DONE will still be reported if appropriate). Untraced bool // If InheritTracer is true, ptrace-attach the caller's tracer to the new // task, even if no PTRACE_EVENT_CLONE/FORK/VFORK event would be reported // for it. 
If both Untraced and InheritTracer are true, no event will be // reported, but tracer inheritance will still occur. InheritTracer bool } // Clone implements the clone(2) syscall and returns the thread ID of the new // task in t's PID namespace. Clone may return both a non-zero thread ID and a // non-nil error. // // Preconditions: The caller must be running Task.doSyscallInvoke on the task // goroutine. func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { // Since signal actions may refer to application signal handlers by virtual // address, any set of signal handlers must refer to the same address // space. if !opts.NewSignalHandlers && opts.NewAddressSpace { return 0, nil, linuxerr.EINVAL } // In order for the behavior of thread-group-directed signals to be sane, // all tasks in a thread group must share signal handlers. if !opts.NewThreadGroup && opts.NewSignalHandlers { return 0, nil, linuxerr.EINVAL } // All tasks in a thread group must be in the same PID namespace. if !opts.NewThreadGroup && (opts.NewPIDNamespace || t.childPIDNamespace != nil) { return 0, nil, linuxerr.EINVAL } // The two different ways of specifying a new PID namespace are // incompatible. if opts.NewPIDNamespace && t.childPIDNamespace != nil { return 0, nil, linuxerr.EINVAL } // Thread groups and FS contexts cannot span user namespaces. if opts.NewUserNamespace && (!opts.NewThreadGroup || !opts.NewFSContext) { return 0, nil, linuxerr.EINVAL } // Pull task registers and FPU state, a cloned task will inherit the // state of the current task. t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) // "If CLONE_NEWUSER is specified along with other CLONE_NEW* flags in a // single clone(2) or unshare(2) call, the user namespace is guaranteed to // be created first, giving the child (clone(2)) or caller (unshare(2)) // privileges over the remaining namespaces created by the call." - // user_namespaces(7) creds := t.Credentials() userns := creds.UserNamespace if opts.NewUserNamespace { var err error // "EPERM (since Linux 3.9): CLONE_NEWUSER was specified in flags and // the caller is in a chroot environment (i.e., the caller's root // directory does not match the root directory of the mount namespace // in which it resides)." - clone(2). Neither chroot(2) nor // user_namespaces(7) document this. if t.IsChrooted() { return 0, nil, linuxerr.EPERM } userns, err = creds.NewChildUserNamespace() if err != nil { return 0, nil, err } } if (opts.NewPIDNamespace || opts.NewNetworkNamespace || opts.NewUTSNamespace) && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, userns) { return 0, nil, linuxerr.EPERM } utsns := t.UTSNamespace() if opts.NewUTSNamespace { // Note that this must happen after NewUserNamespace so we get // the new userns if there is one. utsns = t.UTSNamespace().Clone(userns) } ipcns := t.IPCNamespace() if opts.NewIPCNamespace { // Note that "If CLONE_NEWIPC is set, then create the process in a new IPC // namespace" ipcns = NewIPCNamespace(userns) } else { ipcns.IncRef() } cu := cleanup.Make(func() { ipcns.DecRef(t) }) defer cu.Clean() netns := t.NetworkNamespace() if opts.NewNetworkNamespace { netns = inet.NewNamespace(netns) } // TODO(b/63601033): Implement CLONE_NEWNS. mntnsVFS2 := t.mountNamespaceVFS2 if mntnsVFS2 != nil { mntnsVFS2.IncRef() cu.Add(func() { mntnsVFS2.DecRef(t) }) } image, err := t.image.Fork(t, t.k, !opts.NewAddressSpace) if err != nil { return 0, nil, err } cu.Add(func() { image.release() }) // clone() returns 0 in the child. 
image.Arch.SetReturn(0) if opts.Stack != 0 { image.Arch.SetStack(uintptr(opts.Stack)) } if opts.SetTLS { if !image.Arch.SetTLS(uintptr(opts.TLS)) { return 0, nil, linuxerr.EPERM } } var fsContext *FSContext if opts.NewFSContext { fsContext = t.fsContext.Fork() } else { fsContext = t.fsContext fsContext.IncRef() } var fdTable *FDTable if opts.NewFiles { fdTable = t.fdTable.Fork(t) } else { fdTable = t.fdTable fdTable.IncRef() } pidns := t.tg.pidns if t.childPIDNamespace != nil { pidns = t.childPIDNamespace } else if opts.NewPIDNamespace { pidns = pidns.NewChild(userns) } tg := t.tg rseqAddr := hostarch.Addr(0) rseqSignature := uint32(0) if opts.NewThreadGroup { if tg.mounts != nil { tg.mounts.IncRef() } sh := t.tg.signalHandlers if opts.NewSignalHandlers { sh = sh.Fork() } tg = t.k.NewThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy()) tg.oomScoreAdj = atomic.LoadInt32(&t.tg.oomScoreAdj) rseqAddr = t.rseqAddr rseqSignature = t.rseqSignature } cfg := &TaskConfig{ Kernel: t.k, ThreadGroup: tg, SignalMask: t.SignalMask(), TaskImage: image, FSContext: fsContext, FDTable: fdTable, Credentials: creds, Niceness: t.Niceness(), NetworkNamespace: netns, AllowedCPUMask: t.CPUMask(), UTSNamespace: utsns, IPCNamespace: ipcns, AbstractSocketNamespace: t.abstractSockets, MountNamespaceVFS2: mntnsVFS2, RSeqAddr: rseqAddr, RSeqSignature: rseqSignature, ContainerID: t.ContainerID(), } if opts.NewThreadGroup { cfg.Parent = t } else { cfg.InheritParent = t } nt, err := t.tg.pidns.owner.NewTask(t, cfg) // If NewTask succeeds, we transfer references to nt. If NewTask fails, it does // the cleanup for us. cu.Release() if err != nil { return 0, nil, err } // "A child process created via fork(2) inherits a copy of its parent's // alternate signal stack settings" - sigaltstack(2). // // However kernel/fork.c:copy_process() adds a limitation to this: // "sigaltstack should be cleared when sharing the same VM". if opts.NewAddressSpace || opts.Vfork { nt.SetSignalStack(t.SignalStack()) } if userns != creds.UserNamespace { if err := nt.SetUserNamespace(userns); err != nil { // This shouldn't be possible: userns was created from nt.creds, so // nt should have CAP_SYS_ADMIN in userns. panic("Task.Clone: SetUserNamespace failed: " + err.Error()) } } // This has to happen last, because e.g. ptraceClone may send a SIGSTOP to // nt that it must receive before its task goroutine starts running. tid := nt.k.tasks.Root.IDOfTask(nt) defer nt.Start(tid) t.traceCloneEvent(tid) // "If fork/clone and execve are allowed by @prog, any child processes will // be constrained to the same filters and system call ABI as the parent." - // Documentation/prctl/seccomp_filter.txt if f := t.syscallFilters.Load(); f != nil { copiedFilters := append([]bpf.Program(nil), f.([]bpf.Program)...) 
nt.syscallFilters.Store(copiedFilters) } if opts.Vfork { nt.vforkParent = t } if opts.ChildClearTID { nt.SetClearTID(opts.ChildTID) } if opts.ChildSetTID { ctid := nt.ThreadID() ctid.CopyOut(nt.CopyContext(t, usermem.IOOpts{AddressSpaceActive: false}), opts.ChildTID) } ntid := t.tg.pidns.IDOfTask(nt) if opts.ParentSetTID { ntid.CopyOut(t, opts.ParentTID) } kind := ptraceCloneKindClone if opts.Vfork { kind = ptraceCloneKindVfork } else if opts.TerminationSignal == linux.SIGCHLD { kind = ptraceCloneKindFork } if t.ptraceClone(kind, nt, opts) { if opts.Vfork { return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{vforkChild: nt, vforkChildTID: ntid}}, nil } return ntid, &SyscallControl{next: &runSyscallAfterPtraceEventClone{}}, nil } if opts.Vfork { t.maybeBeginVforkStop(nt) return ntid, &SyscallControl{next: &runSyscallAfterVforkStop{childTID: ntid}}, nil } return ntid, nil, nil } // maybeBeginVforkStop checks if a previously-started vfork child is still // running and has not yet released its MM, such that its parent t should enter // a vforkStop. // // Preconditions: The caller must be running on t's task goroutine. func (t *Task) maybeBeginVforkStop(child *Task) { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() if t.killedLocked() { child.vforkParent = nil return } if child.vforkParent == t { t.beginInternalStopLocked((*vforkStop)(nil)) } } func (t *Task) unstopVforkParent() { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if p := t.vforkParent; p != nil { p.tg.signalHandlers.mu.Lock() defer p.tg.signalHandlers.mu.Unlock() if _, ok := p.stop.(*vforkStop); ok { p.endInternalStopLocked() } // Parent no longer needs to be unstopped. t.vforkParent = nil } } // +stateify savable type runSyscallAfterPtraceEventClone struct { vforkChild *Task // If vforkChild is not nil, vforkChildTID is its thread ID in the parent's // PID namespace. vforkChildTID must be stored since the child may exit and // release its TID before the PTRACE_EVENT stop ends. vforkChildTID ThreadID } func (r *runSyscallAfterPtraceEventClone) execute(t *Task) taskRunState { if r.vforkChild != nil { t.maybeBeginVforkStop(r.vforkChild) return &runSyscallAfterVforkStop{r.vforkChildTID} } return (*runSyscallExit)(nil) } // +stateify savable type runSyscallAfterVforkStop struct { // childTID has the same meaning as // runSyscallAfterPtraceEventClone.vforkChildTID. childTID ThreadID } func (r *runSyscallAfterVforkStop) execute(t *Task) taskRunState { t.ptraceVforkDone(r.childTID) return (*runSyscallExit)(nil) } // Unshare changes the set of resources t shares with other tasks, as specified // by opts. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) Unshare(opts *SharingOptions) error { // In Linux unshare(2), NewThreadGroup implies NewSignalHandlers and // NewSignalHandlers implies NewAddressSpace. All three flags are no-ops if // t is the only task using its MM, which due to clone(2)'s rules imply // that it is also the only task using its signal handlers / in its thread // group, and cause EINVAL to be returned otherwise. // // Since we don't count the number of tasks using each address space or set // of signal handlers, we reject NewSignalHandlers and NewAddressSpace // altogether, and interpret NewThreadGroup as requiring that t be the only // member of its thread group. 
	// This seems to be logically coherent, in the sense that clone(2)
	// allows a task to share signal handlers and address spaces with tasks
	// in other thread groups.
	if opts.NewAddressSpace || opts.NewSignalHandlers {
		return linuxerr.EINVAL
	}
	creds := t.Credentials()
	if opts.NewThreadGroup {
		t.tg.signalHandlers.mu.Lock()
		if t.tg.tasksCount != 1 {
			t.tg.signalHandlers.mu.Unlock()
			return linuxerr.EINVAL
		}
		t.tg.signalHandlers.mu.Unlock()
		// This isn't racy because we're the only living task, and therefore
		// the only task capable of creating new ones, in our thread group.
	}
	if opts.NewUserNamespace {
		if t.IsChrooted() {
			return linuxerr.EPERM
		}
		newUserNS, err := creds.NewChildUserNamespace()
		if err != nil {
			return err
		}
		err = t.SetUserNamespace(newUserNS)
		if err != nil {
			return err
		}
		// Need to reload creds, because t.SetUserNamespace() changed task
		// credentials.
		creds = t.Credentials()
	}
	haveCapSysAdmin := t.HasCapability(linux.CAP_SYS_ADMIN)
	if opts.NewPIDNamespace {
		if !haveCapSysAdmin {
			return linuxerr.EPERM
		}
		t.childPIDNamespace = t.tg.pidns.NewChild(t.UserNamespace())
	}
	t.mu.Lock()
	// Can't defer unlock: DecRefs must occur without holding t.mu.
	if opts.NewNetworkNamespace {
		if !haveCapSysAdmin {
			t.mu.Unlock()
			return linuxerr.EPERM
		}
		t.netns = inet.NewNamespace(t.netns)
	}
	if opts.NewUTSNamespace {
		if !haveCapSysAdmin {
			t.mu.Unlock()
			return linuxerr.EPERM
		}
		// Note that this must happen after NewUserNamespace, so the
		// new user namespace is used if there is one.
		t.utsns = t.utsns.Clone(creds.UserNamespace)
	}
	if opts.NewIPCNamespace {
		if !haveCapSysAdmin {
			t.mu.Unlock()
			return linuxerr.EPERM
		}
		// Note that "If CLONE_NEWIPC is set, then create the process in a new IPC
		// namespace".
		t.ipcns.DecRef(t)
		t.ipcns = NewIPCNamespace(creds.UserNamespace)
	}
	var oldFDTable *FDTable
	if opts.NewFiles {
		oldFDTable = t.fdTable
		t.fdTable = oldFDTable.Fork(t)
	}
	var oldFSContext *FSContext
	if opts.NewFSContext {
		oldFSContext = t.fsContext
		t.fsContext = oldFSContext.Fork()
	}
	t.mu.Unlock()
	if oldFDTable != nil {
		oldFDTable.DecRef(t)
	}
	if oldFSContext != nil {
		oldFSContext.DecRef(t)
	}
	return nil
}

// vforkStop is a TaskStop imposed on a task that creates a child with
// CLONE_VFORK or vfork(2), that ends when the child task ceases to use its
// current MM. (Normally, CLONE_VFORK is used in conjunction with CLONE_VM, so
// that the child and parent share mappings until the child execve()s into a
// new process image or exits.)
//
// +stateify savable
type vforkStop struct{}

// StopIgnoresKill implements TaskStop.Killable.
func (*vforkStop) Killable() bool { return true }
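// Illustrative sketch (local types, not the kernel package): the flag
// invariants that Task.Clone above enforces before creating a task. Each rule
// mirrors one of the EINVAL checks near the top of Clone.
package main

import (
	"errors"
	"fmt"
)

type opts struct {
	newAddressSpace   bool
	newSignalHandlers bool
	newThreadGroup    bool
	newPIDNamespace   bool
	newUserNamespace  bool
	newFSContext      bool
}

var errInval = errors.New("EINVAL")

func validate(o opts) error {
	// Signal handlers hold application addresses, so tasks sharing signal
	// handlers must share an address space.
	if !o.newSignalHandlers && o.newAddressSpace {
		return errInval
	}
	// All tasks in a thread group must share signal handlers...
	if !o.newThreadGroup && o.newSignalHandlers {
		return errInval
	}
	// ...and live in the same PID namespace.
	if !o.newThreadGroup && o.newPIDNamespace {
		return errInval
	}
	// Thread groups and FS contexts cannot span user namespaces.
	if o.newUserNamespace && (!o.newThreadGroup || !o.newFSContext) {
		return errInval
	}
	return nil
}

func main() {
	fmt.Println(validate(opts{newAddressSpace: true})) // EINVAL
	fmt.Println(validate(opts{newThreadGroup: true, newSignalHandlers: true,
		newAddressSpace: true, newFSContext: true})) // <nil>
}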
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"gvisor.dev/gvisor/pkg/marshal"
)

// Socket error origin codes as defined in include/uapi/linux/errqueue.h.
const (
	SO_EE_ORIGIN_NONE  = 0
	SO_EE_ORIGIN_LOCAL = 1
	SO_EE_ORIGIN_ICMP  = 2
	SO_EE_ORIGIN_ICMP6 = 3
)

// SockExtendedErr represents struct sock_extended_err in Linux defined in
// include/uapi/linux/errqueue.h.
//
// +marshal
type SockExtendedErr struct {
	Errno  uint32
	Origin uint8
	Type   uint8
	Code   uint8
	Pad    uint8
	Info   uint32
	Data   uint32
}

// SockErrCMsg represents the IP*_RECVERR control message.
type SockErrCMsg interface {
	marshal.Marshallable

	CMsgLevel() uint32
	CMsgType() uint32
}

// SockErrCMsgIPv4 is the IP_RECVERR control message used in
// recvmsg(MSG_ERRQUEUE) by ipv4 sockets. This is equivalent to `struct errhdr`
// defined in net/ipv4/ip_sockglue.c:ip_recv_error().
//
// +marshal
type SockErrCMsgIPv4 struct {
	SockExtendedErr
	Offender SockAddrInet
}

var _ SockErrCMsg = (*SockErrCMsgIPv4)(nil)

// CMsgLevel implements SockErrCMsg.CMsgLevel.
func (*SockErrCMsgIPv4) CMsgLevel() uint32 {
	return SOL_IP
}

// CMsgType implements SockErrCMsg.CMsgType.
func (*SockErrCMsgIPv4) CMsgType() uint32 {
	return IP_RECVERR
}

// SockErrCMsgIPv6 is the IPV6_RECVERR control message used in
// recvmsg(MSG_ERRQUEUE) by ipv6 sockets. This is equivalent to `struct errhdr`
// defined in net/ipv6/datagram.c:ipv6_recv_error().
//
// +marshal
type SockErrCMsgIPv6 struct {
	SockExtendedErr
	Offender SockAddrInet6
}

var _ SockErrCMsg = (*SockErrCMsgIPv6)(nil)

// CMsgLevel implements SockErrCMsg.CMsgLevel.
func (*SockErrCMsgIPv6) CMsgLevel() uint32 {
	return SOL_IPV6
}

// CMsgType implements SockErrCMsg.CMsgType.
func (*SockErrCMsgIPv6) CMsgType() uint32 {
	return IPV6_RECVERR
}
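
// As a rough usage sketch (field values are illustrative, not taken from this
// file), an ICMP-origin IPv4 socket error could be represented and tagged as:
//
//	cmsg := &SockErrCMsgIPv4{
//		SockExtendedErr: SockExtendedErr{
//			Errno:  113, // EHOSTUNREACH on Linux, assumed for illustration.
//			Origin: SO_EE_ORIGIN_ICMP,
//		},
//	}
//	level := cmsg.CMsgLevel() // SOL_IP
//	typ := cmsg.CMsgType()    // IP_RECVERR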
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package tcpip provides the interfaces and related types that users of the
// tcpip stack will use in order to create endpoints used to send and receive
// data over the network stack.
//
// The starting point is the creation and configuration of a stack. A stack can
// be created by calling the New() function of the tcpip/stack/stack package;
// configuring a stack involves creating NICs (via calls to Stack.CreateNIC()),
// adding network addresses (via calls to Stack.AddAddress()), and
// setting a route table (via a call to Stack.SetRouteTable()).
//
// Once a stack is configured, endpoints can be created by calling
// Stack.NewEndpoint(). Such endpoints can be used to send/receive data, connect
// to peers, listen for connections, accept connections, etc., depending on the
// transport protocol selected.
package tcpip

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"math/bits"
	"reflect"
	"strconv"
	"strings"
	"time"

	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/waiter"
)

// Using header.IPv4AddressSize would cause an import cycle.
const ipv4AddressSize = 4

// Errors related to Subnet
var (
	errSubnetLengthMismatch = errors.New("subnet length of address and mask differ")
	errSubnetAddressMasked  = errors.New("subnet address has bits set outside the mask")
)

// ErrSaveRejection indicates a failed save due to unsupported networking state.
// This type of error is only used for save logic.
type ErrSaveRejection struct {
	Err error
}

// Error returns a sensible description of the save rejection error.
func (e *ErrSaveRejection) Error() string {
	return "save rejected due to unsupported networking state: " + e.Err.Error()
}

// MonotonicTime is a monotonic clock reading.
//
// +stateify savable
type MonotonicTime struct {
	nanoseconds int64
}

// Before reports whether the monotonic clock reading mt is before u.
func (mt MonotonicTime) Before(u MonotonicTime) bool {
	return mt.nanoseconds < u.nanoseconds
}

// After reports whether the monotonic clock reading mt is after u.
func (mt MonotonicTime) After(u MonotonicTime) bool {
	return mt.nanoseconds > u.nanoseconds
}

// Add returns the monotonic clock reading mt+d.
func (mt MonotonicTime) Add(d time.Duration) MonotonicTime {
	return MonotonicTime{
		nanoseconds: time.Unix(0, mt.nanoseconds).Add(d).Sub(time.Unix(0, 0)).Nanoseconds(),
	}
}

// Sub returns the duration mt-u. If the result exceeds the maximum (or minimum)
// value that can be stored in a Duration, the maximum (or minimum) duration
// will be returned. To compute t-d for a duration d, use t.Add(-d).
func (mt MonotonicTime) Sub(u MonotonicTime) time.Duration {
	return time.Unix(0, mt.nanoseconds).Sub(time.Unix(0, u.nanoseconds))
}

// A Clock provides the current time and schedules work for execution.
//
// Times returned by a Clock should always be used for application-visible
// time. Only monotonic times should be used for netstack internal timekeeping.
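//
// As a rough sketch of monotonic bookkeeping, with the concrete Clock
// implementation (here named clock) assumed to come from elsewhere:
//
//	start := clock.NowMonotonic()
//	deadline := start.Add(5 * time.Second)
//	// ... later ...
//	if clock.NowMonotonic().After(deadline) {
//		// The 5-second window has elapsed.
//	}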
type Clock interface {
	// Now returns the current local time.
	Now() time.Time

	// NowMonotonic returns the current monotonic clock reading.
	NowMonotonic() MonotonicTime

	// AfterFunc waits for the duration to elapse and then calls f in its own
	// goroutine. It returns a Timer that can be used to cancel the call using
	// its Stop method.
	AfterFunc(d time.Duration, f func()) Timer
}

// Timer represents a single event. A Timer must be created with
// Clock.AfterFunc.
type Timer interface {
	// Stop prevents the Timer from firing. It returns true if the call stops the
	// timer, false if the timer has already expired or been stopped.
	//
	// If Stop returns false, then the timer has already expired and the function
	// f of Clock.AfterFunc(d, f) has been started in its own goroutine; Stop
	// does not wait for f to complete before returning. If the caller needs to
	// know whether f is completed, it must coordinate with f explicitly.
	Stop() bool

	// Reset changes the timer to expire after duration d.
	//
	// Reset should be invoked only on stopped or expired timers. If the timer is
	// known to have expired, Reset can be used directly. Otherwise, the caller
	// must coordinate with the function f of Clock.AfterFunc(d, f).
	Reset(d time.Duration)
}

// Address is a byte slice cast as a string that represents the address of a
// network node. Or, in the case of unix endpoints, it may represent a path.
type Address string

// WithPrefix returns the address with a prefix that represents a point subnet.
func (a Address) WithPrefix() AddressWithPrefix {
	return AddressWithPrefix{
		Address:   a,
		PrefixLen: len(a) * 8,
	}
}

// Unspecified returns true if the address is unspecified.
func (a Address) Unspecified() bool {
	for _, b := range a {
		if b != 0 {
			return false
		}
	}
	return true
}

// MatchingPrefix returns the matching prefix length in bits.
//
// Panics if b and a have different lengths.
func (a Address) MatchingPrefix(b Address) uint8 {
	const bitsInAByte = 8

	if len(a) != len(b) {
		panic(fmt.Sprintf("addresses %s and %s do not have the same length", a, b))
	}

	var prefix uint8
	for i := range a {
		aByte := a[i]
		bByte := b[i]

		if aByte == bByte {
			prefix += bitsInAByte
			continue
		}

		// Count the remaining matching bits in the byte from MSB to LSB.
		mask := uint8(1) << (bitsInAByte - 1)
		for {
			if aByte&mask == bByte&mask {
				prefix++
				mask >>= 1
				continue
			}

			break
		}

		break
	}

	return prefix
}

// AddressMask is a bitmask for an address.
type AddressMask string

// String implements Stringer.
func (m AddressMask) String() string {
	return Address(m).String()
}

// Prefix returns the number of bits before the first host bit.
func (m AddressMask) Prefix() int {
	p := 0
	for _, b := range []byte(m) {
		p += bits.LeadingZeros8(^b)
	}
	return p
}

// Subnet is a subnet defined by its address and mask.
type Subnet struct {
	address Address
	mask    AddressMask
}

// NewSubnet creates a new Subnet, checking that the address and mask are the same length.
func NewSubnet(a Address, m AddressMask) (Subnet, error) {
	if len(a) != len(m) {
		return Subnet{}, errSubnetLengthMismatch
	}
	for i := 0; i < len(a); i++ {
		if a[i]&^m[i] != 0 {
			return Subnet{}, errSubnetAddressMasked
		}
	}
	return Subnet{a, m}, nil
}

// String implements Stringer.
func (s Subnet) String() string {
	return fmt.Sprintf("%s/%d", s.ID(), s.Prefix())
}

// Contains returns true iff the address is of the same length and matches the
// subnet address and mask.
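//
// For example, with IPv4 addresses written as raw 4-byte strings, 10.1.2.3
// is contained in 10.0.0.0/8:
//
//	s, _ := NewSubnet("\x0a\x00\x00\x00", AddressMask("\xff\x00\x00\x00"))
//	_ = s.Contains("\x0a\x01\x02\x03") // true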
func (s *Subnet) Contains(a Address) bool {
	if len(a) != len(s.address) {
		return false
	}
	for i := 0; i < len(a); i++ {
		if a[i]&s.mask[i] != s.address[i] {
			return false
		}
	}
	return true
}

// ID returns the subnet ID.
func (s *Subnet) ID() Address {
	return s.address
}

// Bits returns the number of ones (network bits) and zeros (host bits) in the
// subnet mask.
func (s *Subnet) Bits() (ones int, zeros int) {
	ones = s.mask.Prefix()
	return ones, len(s.mask)*8 - ones
}

// Prefix returns the number of bits before the first host bit.
func (s *Subnet) Prefix() int {
	return s.mask.Prefix()
}

// Mask returns the subnet mask.
func (s *Subnet) Mask() AddressMask {
	return s.mask
}

// Broadcast returns the subnet's broadcast address.
func (s *Subnet) Broadcast() Address {
	addr := []byte(s.address)
	for i := range addr {
		addr[i] |= ^s.mask[i]
	}
	return Address(addr)
}

// IsBroadcast returns true if the address is considered a broadcast address.
func (s *Subnet) IsBroadcast(address Address) bool {
	// Only IPv4 supports the notion of a broadcast address.
	if len(address) != ipv4AddressSize {
		return false
	}

	// Normally, we would just compare address with the subnet's broadcast
	// address but there is an exception where a simple comparison is not
	// correct. This exception is for /31 and /32 IPv4 subnets where all
	// addresses are considered valid host addresses.
	//
	// For /31 subnets, the case is easy. RFC 3021 Section 2.1 states that
	// both addresses in a /31 subnet "MUST be interpreted as host addresses."
	//
	// For /32, the case is a bit more vague. RFC 3021 makes no mention of /32
	// subnets. However, the same reasoning applies - if an exception is not
	// made, then there do not exist any host addresses in a /32 subnet. RFC
	// 4632 Section 3.1 also vaguely implies this interpretation by referring
	// to addresses in /32 subnets as "host routes."
	return s.Prefix() <= 30 && s.Broadcast() == address
}

// Equal returns true if this Subnet is equal to the given Subnet.
func (s Subnet) Equal(o Subnet) bool {
	// If this changes, update Route.Equal accordingly.
	return s == o
}

// NICID is a number that uniquely identifies a NIC.
type NICID int32

// ShutdownFlags represents flags that can be passed to the Shutdown() method
// of the Endpoint interface.
type ShutdownFlags int

// Values of the flags that can be passed to the Shutdown() method. They can
// be OR'ed together.
const (
	ShutdownRead ShutdownFlags = 1 << iota
	ShutdownWrite
)

// PacketType is used to indicate the destination of the packet.
type PacketType uint8

const (
	// PacketHost indicates a packet addressed to the local host.
	PacketHost PacketType = iota

	// PacketOtherHost indicates an outgoing packet addressed to
	// another host caught by a NIC in promiscuous mode.
	PacketOtherHost

	// PacketOutgoing indicates a packet originating from the local host
	// that is looped back to a packet socket.
	PacketOutgoing

	// PacketBroadcast indicates a link layer broadcast packet.
	PacketBroadcast

	// PacketMulticast indicates a link layer multicast packet.
	PacketMulticast
)

// FullAddress represents a full transport node address, as required by the
// Connect() and Bind() methods.
//
// +stateify savable
type FullAddress struct {
	// NIC is the ID of the NIC this address refers to.
	//
	// This may not be used by all endpoint types.
	NIC NICID

	// Addr is the network or link layer address.
	Addr Address

	// Port is the transport port.
	//
	// This may not be used by all endpoint types.
	Port uint16
}
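
// As a sketch, dialing 10.0.0.1:80 through NIC 1 with an assumed Endpoint ep
// (the address written as a raw 4-byte string):
//
//	addr := FullAddress{NIC: 1, Addr: "\x0a\x00\x00\x01", Port: 80}
//	err := ep.Connect(addr)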
// Payloader is an interface that provides data.
//
// This interface allows the endpoint to request the amount of data it needs
// based on internal buffers without exposing them.
type Payloader interface {
	io.Reader

	// Len returns the number of bytes of the unread portion of the
	// Reader.
	Len() int
}

var _ Payloader = (*bytes.Buffer)(nil)
var _ Payloader = (*bytes.Reader)(nil)

var _ io.Writer = (*SliceWriter)(nil)

// SliceWriter implements io.Writer for slices.
type SliceWriter []byte

// Write implements io.Writer.Write.
func (s *SliceWriter) Write(b []byte) (int, error) {
	n := copy(*s, b)
	*s = (*s)[n:]
	var err error
	if n != len(b) {
		err = io.ErrShortWrite
	}
	return n, err
}

var _ io.Writer = (*LimitedWriter)(nil)

// A LimitedWriter writes to W but limits the amount of data copied to just N
// bytes. Each call to Write updates N to reflect the new amount remaining.
type LimitedWriter struct {
	W io.Writer
	N int64
}

func (l *LimitedWriter) Write(p []byte) (int, error) {
	pLen := int64(len(p))
	if pLen > l.N {
		p = p[:l.N]
	}
	n, err := l.W.Write(p)
	n64 := int64(n)
	if err == nil && n64 != pLen {
		err = io.ErrShortWrite
	}
	l.N -= n64
	return n, err
}
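
// As a small, self-contained sketch of the two writers above:
//
//	buf := make([]byte, 4)
//	sw := SliceWriter(buf)
//	n, err := sw.Write([]byte("abcdef")) // n == 4, err == io.ErrShortWrite
//
//	var out bytes.Buffer
//	lw := LimitedWriter{W: &out, N: 3}
//	n, err = lw.Write([]byte("abcdef")) // writes "abc"; err == io.ErrShortWrite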
// A ControlMessages contains socket control messages for IP sockets.
//
// +stateify savable
type ControlMessages struct {
	// HasTimestamp indicates whether Timestamp is valid/set.
	HasTimestamp bool

	// Timestamp is the time (in ns) that the last packet used to create
	// the read data was received.
	Timestamp int64

	// HasInq indicates whether Inq is valid/set.
	HasInq bool

	// Inq is the number of bytes ready to be received.
	Inq int32

	// HasTOS indicates whether TOS is valid/set.
	HasTOS bool

	// TOS is the IPv4 type of service of the associated packet.
	TOS uint8

	// HasTClass indicates whether TClass is valid/set.
	HasTClass bool

	// TClass is the IPv6 traffic class of the associated packet.
	TClass uint32

	// HasIPPacketInfo indicates whether PacketInfo is set.
	HasIPPacketInfo bool

	// PacketInfo holds interface and address data on an incoming packet.
	PacketInfo IPPacketInfo

	// HasOriginalDstAddress indicates whether OriginalDstAddress is
	// set.
	HasOriginalDstAddress bool

	// OriginalDstAddress holds the original destination address
	// and port of the incoming packet.
	OriginalDstAddress FullAddress

	// SockErr is the dequeued socket error on recvmsg(MSG_ERRQUEUE).
	SockErr *SockError
}

// PacketOwner is used to get UID and GID of the packet.
type PacketOwner interface {
	// KUID returns the KUID of the packet.
	KUID() uint32

	// KGID returns the KGID of the packet.
	KGID() uint32
}

// ReadOptions contains options for Endpoint.Read.
type ReadOptions struct {
	// Peek indicates whether this read is a peek.
	Peek bool

	// NeedRemoteAddr indicates whether to return the remote address, if
	// supported.
	NeedRemoteAddr bool

	// NeedLinkPacketInfo indicates whether to return the link-layer information,
	// if supported.
	NeedLinkPacketInfo bool
}

// ReadResult represents result for a successful Endpoint.Read.
type ReadResult struct {
	// Count is the number of bytes received and written to the buffer.
	Count int

	// Total is the number of bytes of the received packet. This can be used to
	// determine whether the read is truncated.
	Total int

	// ControlMessages is the control messages received.
	ControlMessages ControlMessages

	// RemoteAddr is the remote address if ReadOptions.NeedRemoteAddr is true.
	RemoteAddr FullAddress

	// LinkPacketInfo is the link-layer information of the received packet if
	// ReadOptions.NeedLinkPacketInfo is true.
	LinkPacketInfo LinkPacketInfo
}

// Endpoint is the interface implemented by transport protocols (e.g., tcp, udp)
// that exposes functionality like read, write, connect, etc. to users of the
// networking stack.
type Endpoint interface {
	// Close puts the endpoint in a closed state and frees all resources
	// associated with it. Close initiates the teardown process; the
	// Endpoint may not be fully closed when Close returns.
	Close()

	// Abort initiates an expedited endpoint teardown. As compared to
	// Close, Abort prioritizes closing the Endpoint quickly over cleanly.
	// Abort is best effort; implementing Abort with Close is acceptable.
	Abort()

	// Read reads data from the endpoint and optionally writes to dst.
	//
	// This method does not block if there is no data pending; in this case,
	// ErrWouldBlock is returned.
	//
	// If a non-zero number of bytes is successfully read and written to dst,
	// err must be nil. Otherwise, if dst failed to write anything,
	// ErrBadBuffer should be returned.
	Read(io.Writer, ReadOptions) (ReadResult, Error)

	// Write writes data to the endpoint's peer. This method does not block if
	// the data cannot be written.
	//
	// Unlike io.Writer.Write, Endpoint.Write transfers ownership of any bytes
	// successfully written to the Endpoint. That is, if a call to
	// Write(SlicePayload{data}) returns (n, err), it may retain data[:n], and
	// the caller should not use data[:n] after Write returns.
	//
	// Note that unlike io.Writer.Write, it is not an error for Write to
	// perform a partial write (if n > 0, no error may be returned). Only
	// stream (TCP) Endpoints may return partial writes, and even then only
	// in the case where writing additional data would block. Other Endpoints
	// will either write the entire message or return an error.
	Write(Payloader, WriteOptions) (int64, Error)

	// Connect connects the endpoint to its peer. Specifying a NIC is
	// optional.
	//
	// There are three classes of return values:
	// nil -- the attempt to connect succeeded.
	// ErrConnectStarted/ErrAlreadyConnecting -- the connect attempt started
	// but hasn't completed yet. In this case, the caller must call Connect
	// or GetSockOpt(ErrorOption) when the endpoint becomes writable to
	// get the actual result. The first call to Connect after the socket has
	// connected returns nil. Calling connect again results in ErrAlreadyConnected.
	// Anything else -- the attempt to connect failed.
	//
	// If address.Addr is empty, this means that Endpoint has to be
	// disconnected if this is supported, otherwise
	// ErrAddressFamilyNotSupported must be returned.
	Connect(address FullAddress) Error

	// Disconnect disconnects the endpoint from its peer.
	Disconnect() Error

	// Shutdown closes the read and/or write end of the endpoint connection
	// to its peer.
	Shutdown(flags ShutdownFlags) Error

	// Listen puts the endpoint in "listen" mode, which allows it to accept
	// new connections.
	Listen(backlog int) Error

	// Accept returns a new endpoint if a peer has established a connection
	// to an endpoint previously set to listen mode. This method does not
	// block if no new connections are available.
	//
	// The returned Queue is the wait queue for the newly created endpoint.
	//
	// If peerAddr is not nil then it is populated with the peer address of the
	// returned endpoint.
	Accept(peerAddr *FullAddress) (Endpoint, *waiter.Queue, Error)

	// Bind binds the endpoint to a specific local address and port.
	// Specifying a NIC is optional.
	Bind(address FullAddress) Error

	// GetLocalAddress returns the address to which the endpoint is bound.
	GetLocalAddress() (FullAddress, Error)

	// GetRemoteAddress returns the address to which the endpoint is
	// connected.
	GetRemoteAddress() (FullAddress, Error)

	// Readiness returns the current readiness of the endpoint. For example,
	// if waiter.EventIn is set, the endpoint is immediately readable.
	Readiness(mask waiter.EventMask) waiter.EventMask

	// SetSockOpt sets a socket option.
	SetSockOpt(opt SettableSocketOption) Error

	// SetSockOptInt sets a socket option, for simple cases where a value
	// has the int type.
	SetSockOptInt(opt SockOptInt, v int) Error

	// GetSockOpt gets a socket option.
	GetSockOpt(opt GettableSocketOption) Error

	// GetSockOptInt gets a socket option for simple cases where a return
	// value has the int type.
	GetSockOptInt(SockOptInt) (int, Error)

	// State returns a socket's lifecycle state. The returned value is
	// protocol-specific and is primarily used for diagnostics.
	State() uint32

	// ModerateRecvBuf should be called every time data is copied to user
	// space. This allows for dynamic tuning of recv buffer space for a
	// given socket.
	//
	// NOTE: This method is a no-op for sockets other than TCP.
	ModerateRecvBuf(copied int)

	// Info returns a copy of the transport endpoint info.
	Info() EndpointInfo

	// Stats returns a reference to the endpoint stats.
	Stats() EndpointStats

	// SetOwner sets the task owner to the endpoint owner.
	SetOwner(owner PacketOwner)

	// LastError clears and returns the last error reported by the endpoint.
	LastError() Error

	// SocketOptions returns the structure which contains all the socket
	// level options.
	SocketOptions() *SocketOptions
}
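
// As a rough sketch of the read/write contract above, with an assumed
// connected Endpoint ep (waiting and error handling elided):
//
//	var w bytes.Buffer
//	res, err := ep.Read(&w, ReadOptions{NeedRemoteAddr: true})
//	if err != nil {
//		return // e.g. ErrWouldBlock: wait on the endpoint's waiter.Queue and retry.
//	}
//	_, err = ep.Write(bytes.NewReader(w.Bytes()), WriteOptions{To: &res.RemoteAddr})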
// LinkPacketInfo holds Link layer information for a received packet.
//
// +stateify savable
type LinkPacketInfo struct {
	// Protocol is the NetworkProtocolNumber for the packet.
	Protocol NetworkProtocolNumber

	// PktType is used to indicate the destination of the packet.
	PktType PacketType
}

// EndpointInfo is the interface implemented by each endpoint info struct.
type EndpointInfo interface {
	// IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
	// marker interface.
	IsEndpointInfo()
}

// EndpointStats is the interface implemented by each endpoint stats struct.
type EndpointStats interface {
	// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
	// marker interface.
	IsEndpointStats()
}

// WriteOptions contains options for Endpoint.Write.
type WriteOptions struct {
	// If To is not nil, write to the given address instead of the endpoint's
	// peer.
	To *FullAddress

	// More has the same semantics as Linux's MSG_MORE.
	More bool

	// EndOfRecord has the same semantics as Linux's MSG_EOR.
	EndOfRecord bool

	// Atomic means that all data fetched from Payloader must be written to the
	// endpoint. If Atomic is false, then data fetched from the Payloader may be
	// discarded if available endpoint buffer space is insufficient.
	Atomic bool
}

// SockOptInt represents socket options whose values have the int type.
type SockOptInt int

const (
	// KeepaliveCountOption is used by SetSockOptInt/GetSockOptInt to
	// specify the number of un-ACKed TCP keepalives that will be sent
	// before the connection is closed.
	KeepaliveCountOption SockOptInt = iota

	// IPv4TOSOption is used by SetSockOptInt/GetSockOptInt to specify TOS
	// for all subsequent outgoing IPv4 packets from the endpoint.
	IPv4TOSOption

	// IPv6TrafficClassOption is used by SetSockOptInt/GetSockOptInt to
	// specify TOS for all subsequent outgoing IPv6 packets from the
	// endpoint.
	IPv6TrafficClassOption

	// MaxSegOption is used by SetSockOptInt/GetSockOptInt to set/get the
	// current Maximum Segment Size (MSS) value as specified using the
	// TCP_MAXSEG option.
	MaxSegOption

	// MTUDiscoverOption is used to set/get the path MTU discovery setting.
	//
	// NOTE: Setting this option to any other value than PMTUDiscoveryDont
	// is not supported and will fail as such, and getting this option will
	// always return PMTUDiscoveryDont.
	MTUDiscoverOption

	// MulticastTTLOption is used by SetSockOptInt/GetSockOptInt to control
	// the default TTL value for multicast messages. The default is 1.
	MulticastTTLOption

	// ReceiveQueueSizeOption is used in GetSockOptInt to specify that the
	// number of unread bytes in the input buffer should be returned.
	ReceiveQueueSizeOption

	// SendQueueSizeOption is used in GetSockOptInt to specify that the
	// number of unread bytes in the output buffer should be returned.
	SendQueueSizeOption

	// TTLOption is used by SetSockOptInt/GetSockOptInt to control the
	// default TTL/hop limit value for unicast messages. The default is
	// protocol specific.
	//
	// A zero value indicates the default.
	TTLOption

	// TCPSynCountOption is used by SetSockOptInt/GetSockOptInt to specify
	// the number of SYN retransmits that TCP should send before aborting
	// the attempt to connect. It cannot exceed 255.
	//
	// NOTE: This option is currently only stubbed out and is a no-op.
	TCPSynCountOption

	// TCPWindowClampOption is used by SetSockOptInt/GetSockOptInt to bound
	// the size of the advertised window to this value.
	//
	// NOTE: This option is currently only stubbed out and is a no-op.
	TCPWindowClampOption
)
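
// As a sketch, adjusting the multicast TTL on an assumed Endpoint ep:
//
//	if err := ep.SetSockOptInt(MulticastTTLOption, 4); err != nil {
//		// handle err
//	}
//	ttl, err := ep.GetSockOptInt(MulticastTTLOption)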
const (
	// PMTUDiscoveryWant is a setting of the MTUDiscoverOption to use
	// per-route settings.
	PMTUDiscoveryWant int = iota

	// PMTUDiscoveryDont is a setting of the MTUDiscoverOption to disable
	// path MTU discovery.
	PMTUDiscoveryDont

	// PMTUDiscoveryDo is a setting of the MTUDiscoverOption to always do
	// path MTU discovery.
	PMTUDiscoveryDo

	// PMTUDiscoveryProbe is a setting of the MTUDiscoverOption to set DF
	// but ignore path MTU.
	PMTUDiscoveryProbe
)

// GettableNetworkProtocolOption is a marker interface for network protocol
// options that may be queried.
type GettableNetworkProtocolOption interface {
	isGettableNetworkProtocolOption()
}

// SettableNetworkProtocolOption is a marker interface for network protocol
// options that may be set.
type SettableNetworkProtocolOption interface {
	isSettableNetworkProtocolOption()
}

// DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify
// a default TTL.
type DefaultTTLOption uint8

func (*DefaultTTLOption) isGettableNetworkProtocolOption() {}

func (*DefaultTTLOption) isSettableNetworkProtocolOption() {}

// GettableTransportProtocolOption is a marker interface for transport protocol
// options that may be queried.
type GettableTransportProtocolOption interface {
	isGettableTransportProtocolOption()
}

// SettableTransportProtocolOption is a marker interface for transport protocol
// options that may be set.
type SettableTransportProtocolOption interface {
	isSettableTransportProtocolOption()
}

// TCPSACKEnabled enables the SACK option for TCP.
//
// See: https://tools.ietf.org/html/rfc2018.
type TCPSACKEnabled bool

func (*TCPSACKEnabled) isGettableTransportProtocolOption() {}

func (*TCPSACKEnabled) isSettableTransportProtocolOption() {}
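
// As a sketch, enabling SACK stack-wide through an assumed *stack.Stack s
// (SetTransportProtocolOption and tcp.ProtocolNumber live in the stack and
// transport/tcp packages respectively):
//
//	opt := TCPSACKEnabled(true)
//	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
//		// handle err
//	}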
// TCPRecovery is the loss detection algorithm used by TCP.
type TCPRecovery int32

func (*TCPRecovery) isGettableTransportProtocolOption() {}

func (*TCPRecovery) isSettableTransportProtocolOption() {}

// TCPAlwaysUseSynCookies indicates unconditional usage of syncookies.
type TCPAlwaysUseSynCookies bool

func (*TCPAlwaysUseSynCookies) isGettableTransportProtocolOption() {}

func (*TCPAlwaysUseSynCookies) isSettableTransportProtocolOption() {}

const (
	// TCPRACKLossDetection indicates RACK is used for loss detection and
	// recovery.
	TCPRACKLossDetection TCPRecovery = 1 << iota

	// TCPRACKStaticReoWnd indicates the reordering window should not be
	// adjusted when DSACK is received.
	TCPRACKStaticReoWnd

	// TCPRACKNoDupTh indicates RACK should not consider the classic three
	// duplicate acknowledgements rule to mark the segments as lost. This
	// is used when reordering is not detected.
	TCPRACKNoDupTh
)

// TCPDelayEnabled enables/disables Nagle's algorithm in TCP.
type TCPDelayEnabled bool

func (*TCPDelayEnabled) isGettableTransportProtocolOption() {}

func (*TCPDelayEnabled) isSettableTransportProtocolOption() {}

// TCPSendBufferSizeRangeOption is the send buffer size range for TCP.
type TCPSendBufferSizeRangeOption struct {
	Min     int
	Default int
	Max     int
}

func (*TCPSendBufferSizeRangeOption) isGettableTransportProtocolOption() {}

func (*TCPSendBufferSizeRangeOption) isSettableTransportProtocolOption() {}

// TCPReceiveBufferSizeRangeOption is the receive buffer size range for TCP.
type TCPReceiveBufferSizeRangeOption struct {
	Min     int
	Default int
	Max     int
}

func (*TCPReceiveBufferSizeRangeOption) isGettableTransportProtocolOption() {}

func (*TCPReceiveBufferSizeRangeOption) isSettableTransportProtocolOption() {}

// TCPAvailableCongestionControlOption is the supported congestion control
// algorithms for TCP.
type TCPAvailableCongestionControlOption string

func (*TCPAvailableCongestionControlOption) isGettableTransportProtocolOption() {}

func (*TCPAvailableCongestionControlOption) isSettableTransportProtocolOption() {}

// TCPModerateReceiveBufferOption enables/disables receive buffer moderation
// for TCP.
type TCPModerateReceiveBufferOption bool

func (*TCPModerateReceiveBufferOption) isGettableTransportProtocolOption() {}

func (*TCPModerateReceiveBufferOption) isSettableTransportProtocolOption() {}

// GettableSocketOption is a marker interface for socket options that may be
// queried.
type GettableSocketOption interface {
	isGettableSocketOption()
}

// SettableSocketOption is a marker interface for socket options that may be
// configured.
type SettableSocketOption interface {
	isSettableSocketOption()
}

// EndpointState represents the state of an endpoint.
type EndpointState uint8

// CongestionControlState indicates the current congestion control state for
// TCP sender.
type CongestionControlState int

const (
	// Open indicates that the sender is receiving acks in order and
	// no loss or dupACKs etc have been detected.
	Open CongestionControlState = iota

	// RTORecovery indicates that an RTO has occurred and the sender
	// has entered an RTO based recovery phase.
	RTORecovery

	// FastRecovery indicates that the sender has entered FastRecovery
	// based on receiving nDupAcks. This state is entered only when
	// SACK is not in use.
	FastRecovery

	// SACKRecovery indicates that the sender has entered SACK based
	// recovery.
	SACKRecovery

	// Disorder indicates the sender either received some SACK blocks
	// or dupACKs.
	Disorder
)

// TCPInfoOption is used by GetSockOpt to expose TCP statistics.
//
// TODO(b/64800844): Add and populate stat fields.
type TCPInfoOption struct {
	// RTT is the smoothed round trip time.
	RTT time.Duration

	// RTTVar is the round trip time variation.
	RTTVar time.Duration

	// RTO is the retransmission timeout for the endpoint.
	RTO time.Duration

	// State is the current endpoint protocol state.
	State EndpointState

	// CcState is the congestion control state.
	CcState CongestionControlState

	// SndCwnd is the congestion window, in packets.
	SndCwnd uint32

	// SndSsthresh is the threshold between slow start and congestion
	// avoidance.
	SndSsthresh uint32

	// ReorderSeen indicates if reordering is seen in the endpoint.
	ReorderSeen bool
}

func (*TCPInfoOption) isGettableSocketOption() {}

// KeepaliveIdleOption is used by SetSockOpt/GetSockOpt to specify the time a
// connection must remain idle before the first TCP keepalive packet is sent.
// Once this time is reached, KeepaliveIntervalOption is used instead.
type KeepaliveIdleOption time.Duration

func (*KeepaliveIdleOption) isGettableSocketOption() {}

func (*KeepaliveIdleOption) isSettableSocketOption() {}

// KeepaliveIntervalOption is used by SetSockOpt/GetSockOpt to specify the
// interval between sending TCP keepalive packets.
type KeepaliveIntervalOption time.Duration

func (*KeepaliveIntervalOption) isGettableSocketOption() {}

func (*KeepaliveIntervalOption) isSettableSocketOption() {}
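
// As a sketch, configuring keepalives on an assumed Endpoint ep: idle for 30s
// before the first probe, then probing every 10s:
//
//	idle := KeepaliveIdleOption(30 * time.Second)
//	interval := KeepaliveIntervalOption(10 * time.Second)
//	_ = ep.SetSockOpt(&idle)
//	_ = ep.SetSockOpt(&interval)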
// TCPUserTimeoutOption is used by SetSockOpt/GetSockOpt to specify a
// user-specified timeout for a given TCP connection.
// See: RFC5482 for details.
type TCPUserTimeoutOption time.Duration

func (*TCPUserTimeoutOption) isGettableSocketOption() {}

func (*TCPUserTimeoutOption) isSettableSocketOption() {}

// CongestionControlOption is used by SetSockOpt/GetSockOpt to set/get
// the current congestion control algorithm.
type CongestionControlOption string

func (*CongestionControlOption) isGettableSocketOption() {}

func (*CongestionControlOption) isSettableSocketOption() {}

func (*CongestionControlOption) isGettableTransportProtocolOption() {}

func (*CongestionControlOption) isSettableTransportProtocolOption() {}

// TCPLingerTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
// maximum duration for which a socket lingers in the TCP_FIN_WAIT_2 state
// before being marked closed.
type TCPLingerTimeoutOption time.Duration

func (*TCPLingerTimeoutOption) isGettableSocketOption() {}

func (*TCPLingerTimeoutOption) isSettableSocketOption() {}

func (*TCPLingerTimeoutOption) isGettableTransportProtocolOption() {}

func (*TCPLingerTimeoutOption) isSettableTransportProtocolOption() {}

// TCPTimeWaitTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
// maximum duration for which a socket lingers in the TIME_WAIT state
// before being marked closed.
type TCPTimeWaitTimeoutOption time.Duration

func (*TCPTimeWaitTimeoutOption) isGettableSocketOption() {}

func (*TCPTimeWaitTimeoutOption) isSettableSocketOption() {}

func (*TCPTimeWaitTimeoutOption) isGettableTransportProtocolOption() {}

func (*TCPTimeWaitTimeoutOption) isSettableTransportProtocolOption() {}

// TCPDeferAcceptOption is used by SetSockOpt/GetSockOpt to allow an accept
// to return a completed connection only when there is data to be read. This
// usually means the listening socket will drop the final ACK for a handshake
// until the specified timeout passes or a segment with data arrives.
type TCPDeferAcceptOption time.Duration

func (*TCPDeferAcceptOption) isGettableSocketOption() {}

func (*TCPDeferAcceptOption) isSettableSocketOption() {}

// TCPMinRTOOption is used by SetSockOpt/GetSockOpt to allow overriding
// default MinRTO used by the Stack.
type TCPMinRTOOption time.Duration

func (*TCPMinRTOOption) isGettableSocketOption() {}

func (*TCPMinRTOOption) isSettableSocketOption() {}

func (*TCPMinRTOOption) isGettableTransportProtocolOption() {}

func (*TCPMinRTOOption) isSettableTransportProtocolOption() {}

// TCPMaxRTOOption is used by SetSockOpt/GetSockOpt to allow overriding
// default MaxRTO used by the Stack.
type TCPMaxRTOOption time.Duration

func (*TCPMaxRTOOption) isGettableSocketOption() {}

func (*TCPMaxRTOOption) isSettableSocketOption() {}

func (*TCPMaxRTOOption) isGettableTransportProtocolOption() {}

func (*TCPMaxRTOOption) isSettableTransportProtocolOption() {}

// TCPMaxRetriesOption is used by SetSockOpt/GetSockOpt to set/get the
// maximum number of retransmits after which we time out the connection.
type TCPMaxRetriesOption uint64

func (*TCPMaxRetriesOption) isGettableSocketOption() {}

func (*TCPMaxRetriesOption) isSettableSocketOption() {}

func (*TCPMaxRetriesOption) isGettableTransportProtocolOption() {}

func (*TCPMaxRetriesOption) isSettableTransportProtocolOption() {}

// TCPSynRetriesOption is used by SetSockOpt/GetSockOpt to specify stack-wide
// default for number of times SYN is retransmitted before aborting a connect.
type TCPSynRetriesOption uint8

func (*TCPSynRetriesOption) isGettableSocketOption() {}

func (*TCPSynRetriesOption) isSettableSocketOption() {}

func (*TCPSynRetriesOption) isGettableTransportProtocolOption() {}

func (*TCPSynRetriesOption) isSettableTransportProtocolOption() {}

// MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a
// default interface for multicast.
type MulticastInterfaceOption struct {
	NIC           NICID
	InterfaceAddr Address
}

func (*MulticastInterfaceOption) isGettableSocketOption() {}

func (*MulticastInterfaceOption) isSettableSocketOption() {}

// MembershipOption is used to identify a multicast membership on an interface.
type MembershipOption struct {
	NIC           NICID
	InterfaceAddr Address
	MulticastAddr Address
}

// AddMembershipOption identifies a multicast group to join on some interface.
type AddMembershipOption MembershipOption

func (*AddMembershipOption) isSettableSocketOption() {}

// RemoveMembershipOption identifies a multicast group to leave on some
// interface.
type RemoveMembershipOption MembershipOption

func (*RemoveMembershipOption) isSettableSocketOption() {}
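
// As a sketch, joining the all-hosts group 224.0.0.1 on NIC 1 with an assumed
// Endpoint ep (the address written as a raw 4-byte string):
//
//	opt := AddMembershipOption{NIC: 1, MulticastAddr: "\xe0\x00\x00\x01"}
//	_ = ep.SetSockOpt(&opt)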
// SocketDetachFilterOption is used by SetSockOpt to detach a previously attached
// classic BPF filter on a given endpoint.
type SocketDetachFilterOption int

func (*SocketDetachFilterOption) isSettableSocketOption() {}

// OriginalDestinationOption is used to get the original destination address
// and port of a redirected packet.
type OriginalDestinationOption FullAddress

func (*OriginalDestinationOption) isGettableSocketOption() {}

// TCPTimeWaitReuseOption is used by stack.(*Stack).TransportProtocolOption to
// specify if the stack can reuse the port bound by an endpoint in TIME-WAIT for
// new connections when it is safe from a protocol viewpoint.
type TCPTimeWaitReuseOption uint8

func (*TCPTimeWaitReuseOption) isGettableSocketOption() {}

func (*TCPTimeWaitReuseOption) isSettableSocketOption() {}

func (*TCPTimeWaitReuseOption) isGettableTransportProtocolOption() {}

func (*TCPTimeWaitReuseOption) isSettableTransportProtocolOption() {}

const (
	// TCPTimeWaitReuseDisabled indicates a port bound by an endpoint in
	// TIME-WAIT cannot be reused for new connections.
	TCPTimeWaitReuseDisabled TCPTimeWaitReuseOption = iota

	// TCPTimeWaitReuseGlobal indicates a port bound by an endpoint in
	// TIME-WAIT can be reused for new connections irrespective of the
	// src/dest addresses.
	TCPTimeWaitReuseGlobal

	// TCPTimeWaitReuseLoopbackOnly indicates a port bound by an endpoint in
	// TIME-WAIT can only be reused if the connection was a connection over
	// loopback, i.e. the src/dest addresses are loopback addresses.
	TCPTimeWaitReuseLoopbackOnly
)

// LingerOption is used by SetSockOpt/GetSockOpt to set/get the
// duration for which a socket lingers before returning from Close.
//
// +marshal
// +stateify savable
type LingerOption struct {
	Enabled bool
	Timeout time.Duration
}

// IPPacketInfo is the message structure for IP_PKTINFO.
//
// +stateify savable
type IPPacketInfo struct {
	// NIC is the ID of the NIC to be used.
	NIC NICID

	// LocalAddr is the local address.
	LocalAddr Address

	// DestinationAddr is the destination address found in the IP header.
	DestinationAddr Address
}

// SendBufferSizeOption is used by stack.(*Stack).Option/SetOption to
// get/set the default, min and max send buffer sizes.
type SendBufferSizeOption struct {
	// Min is the minimum size for send buffer.
	Min int

	// Default is the default size for send buffer.
	Default int

	// Max is the maximum size for send buffer.
	Max int
}

// ReceiveBufferSizeOption is used by stack.(*Stack).Option/SetOption to
// get/set the default, min and max receive buffer sizes.
type ReceiveBufferSizeOption struct {
	// Min is the minimum size for receive buffer.
	Min int

	// Default is the default size for receive buffer.
	Default int

	// Max is the maximum size for receive buffer.
	Max int
}

// GetSendBufferLimits is used to get the send buffer size limits.
type GetSendBufferLimits func(StackHandler) SendBufferSizeOption

// GetStackSendBufferLimits is used to get default, min and max send buffer
// size.
func GetStackSendBufferLimits(so StackHandler) SendBufferSizeOption {
	var ss SendBufferSizeOption
	if err := so.Option(&ss); err != nil {
		panic(fmt.Sprintf("s.Option(%#v) = %s", ss, err))
	}
	return ss
}

// GetReceiveBufferLimits is used to get the receive buffer size limits.
type GetReceiveBufferLimits func(StackHandler) ReceiveBufferSizeOption

// GetStackReceiveBufferLimits is used to get default, min and max receive
// buffer size.
func GetStackReceiveBufferLimits(so StackHandler) ReceiveBufferSizeOption {
	var ss ReceiveBufferSizeOption
	if err := so.Option(&ss); err != nil {
		panic(fmt.Sprintf("s.Option(%#v) = %s", ss, err))
	}
	return ss
}

// Route is a row in the routing table. It specifies through which NIC (and
// gateway) sets of packets should be routed. A row is considered viable if the
// masked target address matches the destination address in the row.
type Route struct {
	// Destination must contain the target address for this row to be viable.
	Destination Subnet

	// Gateway is the gateway to be used if this row is viable.
	Gateway Address

	// NIC is the id of the nic to be used if this row is viable.
	NIC NICID
}
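
// As a sketch, a default IPv4 route via gateway 10.0.0.1 on NIC 1 (addresses
// written as raw 4-byte strings):
//
//	defaultSubnet, _ := NewSubnet("\x00\x00\x00\x00", AddressMask("\x00\x00\x00\x00"))
//	r := Route{Destination: defaultSubnet, Gateway: "\x0a\x00\x00\x01", NIC: 1}
//	_ = r.String() // e.g. "0.0.0.0/0 via 10.0.0.1 nic 1"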
// String implements the fmt.Stringer interface.
func (r Route) String() string {
	var out strings.Builder
	fmt.Fprintf(&out, "%s", r.Destination)
	if len(r.Gateway) > 0 {
		fmt.Fprintf(&out, " via %s", r.Gateway)
	}
	fmt.Fprintf(&out, " nic %d", r.NIC)
	return out.String()
}

// Equal returns true if the given Route is equal to this Route.
func (r Route) Equal(to Route) bool {
	// NOTE: This relies on the fact that r.Destination == to.Destination
	return r == to
}

// TransportProtocolNumber is the number of a transport protocol.
type TransportProtocolNumber uint32

// NetworkProtocolNumber is the EtherType of a network protocol in an Ethernet
// frame.
//
// See: https://www.iana.org/assignments/ieee-802-numbers/ieee-802-numbers.xhtml
type NetworkProtocolNumber uint32

// A StatCounter keeps track of a statistic.
type StatCounter struct {
	count atomicbitops.AlignedAtomicUint64
}

// Increment adds one to the counter.
func (s *StatCounter) Increment() {
	s.IncrementBy(1)
}

// Decrement subtracts one from the counter.
func (s *StatCounter) Decrement() {
	s.IncrementBy(^uint64(0))
}

// Value returns the current value of the counter.
func (s *StatCounter) Value(name ...string) uint64 {
	return s.count.Load()
}

// IncrementBy increments the counter by v.
func (s *StatCounter) IncrementBy(v uint64) {
	s.count.Add(v)
}

func (s *StatCounter) String() string {
	return strconv.FormatUint(s.Value(), 10)
}

// A MultiCounterStat keeps track of two counters at once.
type MultiCounterStat struct {
	a, b *StatCounter
}

// Init sets both internal counters to point to a and b.
func (m *MultiCounterStat) Init(a, b *StatCounter) {
	m.a = a
	m.b = b
}

// Increment adds one to the counters.
func (m *MultiCounterStat) Increment() {
	m.a.Increment()
	m.b.Increment()
}

// IncrementBy increments the counters by v.
func (m *MultiCounterStat) IncrementBy(v uint64) {
	m.a.IncrementBy(v)
	m.b.IncrementBy(v)
}
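
// As a small sketch of the counter types above:
//
//	var total, perNIC StatCounter
//	var m MultiCounterStat
//	m.Init(&total, &perNIC)
//	m.IncrementBy(3)
//	_ = total.Value() // 3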
// ICMPv4PacketStats enumerates counts for all ICMPv4 packet types.
type ICMPv4PacketStats struct {
	// LINT.IfChange(ICMPv4PacketStats)

	// EchoRequest is the number of ICMPv4 echo packets counted.
	EchoRequest *StatCounter

	// EchoReply is the number of ICMPv4 echo reply packets counted.
	EchoReply *StatCounter

	// DstUnreachable is the number of ICMPv4 destination unreachable packets
	// counted.
	DstUnreachable *StatCounter

	// SrcQuench is the number of ICMPv4 source quench packets counted.
	SrcQuench *StatCounter

	// Redirect is the number of ICMPv4 redirect packets counted.
	Redirect *StatCounter

	// TimeExceeded is the number of ICMPv4 time exceeded packets counted.
	TimeExceeded *StatCounter

	// ParamProblem is the number of ICMPv4 parameter problem packets counted.
	ParamProblem *StatCounter

	// Timestamp is the number of ICMPv4 timestamp packets counted.
	Timestamp *StatCounter

	// TimestampReply is the number of ICMPv4 timestamp reply packets counted.
	TimestampReply *StatCounter

	// InfoRequest is the number of ICMPv4 information request packets counted.
	InfoRequest *StatCounter

	// InfoReply is the number of ICMPv4 information reply packets counted.
	InfoReply *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4PacketStats)
}

// ICMPv4SentPacketStats collects outbound ICMPv4-specific stats.
type ICMPv4SentPacketStats struct {
	// LINT.IfChange(ICMPv4SentPacketStats)

	ICMPv4PacketStats

	// Dropped is the number of ICMPv4 packets dropped due to link layer errors.
	Dropped *StatCounter

	// RateLimited is the number of ICMPv4 packets dropped due to rate limit being
	// exceeded.
	RateLimited *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4SentPacketStats)
}

// ICMPv4ReceivedPacketStats collects inbound ICMPv4-specific stats.
type ICMPv4ReceivedPacketStats struct {
	// LINT.IfChange(ICMPv4ReceivedPacketStats)

	ICMPv4PacketStats

	// Invalid is the number of invalid ICMPv4 packets received.
	Invalid *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4ReceivedPacketStats)
}

// ICMPv4Stats collects ICMPv4-specific stats.
type ICMPv4Stats struct {
	// LINT.IfChange(ICMPv4Stats)

	// PacketsSent contains statistics about sent packets.
	PacketsSent ICMPv4SentPacketStats

	// PacketsReceived contains statistics about received packets.
	PacketsReceived ICMPv4ReceivedPacketStats

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterICMPv4Stats)
}

// ICMPv6PacketStats enumerates counts for all ICMPv6 packet types.
type ICMPv6PacketStats struct {
	// LINT.IfChange(ICMPv6PacketStats)

	// EchoRequest is the number of ICMPv6 echo request packets counted.
	EchoRequest *StatCounter

	// EchoReply is the number of ICMPv6 echo reply packets counted.
	EchoReply *StatCounter

	// DstUnreachable is the number of ICMPv6 destination unreachable packets
	// counted.
	DstUnreachable *StatCounter

	// PacketTooBig is the number of ICMPv6 packet too big packets counted.
	PacketTooBig *StatCounter

	// TimeExceeded is the number of ICMPv6 time exceeded packets counted.
	TimeExceeded *StatCounter

	// ParamProblem is the number of ICMPv6 parameter problem packets counted.
	ParamProblem *StatCounter

	// RouterSolicit is the number of ICMPv6 router solicit packets counted.
	RouterSolicit *StatCounter

	// RouterAdvert is the number of ICMPv6 router advert packets counted.
	RouterAdvert *StatCounter

	// NeighborSolicit is the number of ICMPv6 neighbor solicit packets counted.
	NeighborSolicit *StatCounter

	// NeighborAdvert is the number of ICMPv6 neighbor advert packets counted.
	NeighborAdvert *StatCounter

	// RedirectMsg is the number of ICMPv6 redirect message packets counted.
	RedirectMsg *StatCounter

	// MulticastListenerQuery is the number of Multicast Listener Query messages
	// counted.
	MulticastListenerQuery *StatCounter

	// MulticastListenerReport is the number of Multicast Listener Report messages
	// counted.
	MulticastListenerReport *StatCounter

	// MulticastListenerDone is the number of Multicast Listener Done messages
	// counted.
	MulticastListenerDone *StatCounter

	// LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6PacketStats)
}

// ICMPv6SentPacketStats collects outbound ICMPv6-specific stats.
type ICMPv6SentPacketStats struct {
	// LINT.IfChange(ICMPv6SentPacketStats)

	ICMPv6PacketStats

	// Dropped is the number of ICMPv6 packets dropped due to link layer errors.
	Dropped *StatCounter

	// RateLimited is the number of ICMPv6 packets dropped due to rate limit being
	// exceeded.
	RateLimited *StatCounter

	// LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6SentPacketStats)
}
	RouterOnlyPacketsDroppedByHost *StatCounter

	// LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6ReceivedPacketStats)
}

// ICMPv6Stats collects ICMPv6-specific stats.
type ICMPv6Stats struct {
	// LINT.IfChange(ICMPv6Stats)

	// PacketsSent contains statistics about sent packets.
	PacketsSent ICMPv6SentPacketStats

	// PacketsReceived contains statistics about received packets.
	PacketsReceived ICMPv6ReceivedPacketStats

	// LINT.ThenChange(network/ipv6/stats.go:multiCounterICMPv6Stats)
}

// ICMPStats collects ICMP-specific stats (both v4 and v6).
type ICMPStats struct {
	// V4 contains the ICMPv4-specific stats.
	V4 ICMPv4Stats

	// V6 contains the ICMPv6-specific stats.
	V6 ICMPv6Stats
}

// IGMPPacketStats enumerates counts for all IGMP packet types.
type IGMPPacketStats struct {
	// LINT.IfChange(IGMPPacketStats)

	// MembershipQuery is the number of Membership Query messages counted.
	MembershipQuery *StatCounter

	// V1MembershipReport is the number of Version 1 Membership Report messages
	// counted.
	V1MembershipReport *StatCounter

	// V2MembershipReport is the number of Version 2 Membership Report messages
	// counted.
	V2MembershipReport *StatCounter

	// LeaveGroup is the number of Leave Group messages counted.
	LeaveGroup *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPPacketStats)
}

// IGMPSentPacketStats collects outbound IGMP-specific stats.
type IGMPSentPacketStats struct {
	// LINT.IfChange(IGMPSentPacketStats)

	IGMPPacketStats

	// Dropped is the number of IGMP packets dropped.
	Dropped *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPSentPacketStats)
}

// IGMPReceivedPacketStats collects inbound IGMP-specific stats.
type IGMPReceivedPacketStats struct {
	// LINT.IfChange(IGMPReceivedPacketStats)

	IGMPPacketStats

	// Invalid is the number of invalid IGMP packets received.
	Invalid *StatCounter

	// ChecksumErrors is the number of IGMP packets dropped due to bad checksums.
	ChecksumErrors *StatCounter

	// Unrecognized is the number of unrecognized messages counted; these are
	// silently ignored for forward-compatibility.
	Unrecognized *StatCounter

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPReceivedPacketStats)
}

// IGMPStats collects IGMP-specific stats.
type IGMPStats struct {
	// LINT.IfChange(IGMPStats)

	// PacketsSent contains statistics about sent packets.
	PacketsSent IGMPSentPacketStats

	// PacketsReceived contains statistics about received packets.
	PacketsReceived IGMPReceivedPacketStats

	// LINT.ThenChange(network/ipv4/stats.go:multiCounterIGMPStats)
}

// IPForwardingStats collects stats related to IP forwarding (both v4 and v6).
type IPForwardingStats struct {
	// LINT.IfChange(IPForwardingStats)

	// Unrouteable is the number of IP packets received which were dropped
	// because a route to their destination could not be constructed.
	Unrouteable *StatCounter

	// ExhaustedTTL is the number of IP packets received which were dropped
	// because their TTL was exhausted.
	ExhaustedTTL *StatCounter

	// LinkLocalSource is the number of IP packets which were dropped
	// because they contained a link-local source address.
	LinkLocalSource *StatCounter

	// LinkLocalDestination is the number of IP packets which were dropped
	// because they contained a link-local destination address.
	LinkLocalDestination *StatCounter

	// PacketTooBig is the number of IP packets which were dropped because they
	// were too big for the outgoing MTU.
PacketTooBig *StatCounter // HostUnreachable is the number of IP packets received which could not be // successfully forwarded due to an unresolvable next hop. HostUnreachable *StatCounter // ExtensionHeaderProblem is the number of IP packets which were dropped // because of a problem encountered when processing an IPv6 extension // header. ExtensionHeaderProblem *StatCounter // Errors is the number of IP packets received which could not be // successfully forwarded. Errors *StatCounter // LINT.ThenChange(network/internal/ip/stats.go:multiCounterIPForwardingStats) } // IPStats collects IP-specific stats (both v4 and v6). type IPStats struct { // LINT.IfChange(IPStats) // PacketsReceived is the number of IP packets received from the link layer. PacketsReceived *StatCounter // ValidPacketsReceived is the number of valid IP packets that reached the IP // layer. ValidPacketsReceived *StatCounter // DisabledPacketsReceived is the number of IP packets received from the link // layer when the IP layer is disabled. DisabledPacketsReceived *StatCounter // InvalidDestinationAddressesReceived is the number of IP packets received // with an unknown or invalid destination address. InvalidDestinationAddressesReceived *StatCounter // InvalidSourceAddressesReceived is the number of IP packets received with a // source address that should never have been received on the wire. InvalidSourceAddressesReceived *StatCounter // PacketsDelivered is the number of incoming IP packets that are successfully // delivered to the transport layer. PacketsDelivered *StatCounter // PacketsSent is the number of IP packets sent via WritePacket. PacketsSent *StatCounter // OutgoingPacketErrors is the number of IP packets which failed to write to a // link-layer endpoint. OutgoingPacketErrors *StatCounter // MalformedPacketsReceived is the number of IP Packets that were dropped due // to the IP packet header failing validation checks. MalformedPacketsReceived *StatCounter // MalformedFragmentsReceived is the number of IP Fragments that were dropped // due to the fragment failing validation checks. MalformedFragmentsReceived *StatCounter // IPTablesPreroutingDropped is the number of IP packets dropped in the // Prerouting chain. IPTablesPreroutingDropped *StatCounter // IPTablesInputDropped is the number of IP packets dropped in the Input // chain. IPTablesInputDropped *StatCounter // IPTablesForwardDropped is the number of IP packets dropped in the Forward // chain. IPTablesForwardDropped *StatCounter // IPTablesOutputDropped is the number of IP packets dropped in the Output // chain. IPTablesOutputDropped *StatCounter // IPTablesPostroutingDropped is the number of IP packets dropped in the // Postrouting chain. IPTablesPostroutingDropped *StatCounter // TODO(https://gvisor.dev/issues/5529): Move the IPv4-only option stats out // of IPStats. // OptionTimestampReceived is the number of Timestamp options seen. OptionTimestampReceived *StatCounter // OptionRecordRouteReceived is the number of Record Route options seen. OptionRecordRouteReceived *StatCounter // OptionRouterAlertReceived is the number of Router Alert options seen. OptionRouterAlertReceived *StatCounter // OptionUnknownReceived is the number of unknown IP options seen. OptionUnknownReceived *StatCounter // Forwarding collects stats related to IP forwarding. Forwarding IPForwardingStats // LINT.ThenChange(network/internal/ip/stats.go:MultiCounterIPStats) } // ARPStats collects ARP-specific stats. 
type ARPStats struct {
	// LINT.IfChange(ARPStats)

	// PacketsReceived is the number of ARP packets received from the link layer.
	PacketsReceived *StatCounter

	// DisabledPacketsReceived is the number of ARP packets received from the link
	// layer when the ARP layer is disabled.
	DisabledPacketsReceived *StatCounter

	// MalformedPacketsReceived is the number of ARP packets that were dropped due
	// to being malformed.
	MalformedPacketsReceived *StatCounter

	// RequestsReceived is the number of ARP requests received.
	RequestsReceived *StatCounter

	// RequestsReceivedUnknownTargetAddress is the number of ARP requests that
	// were targeted to an interface different from the one on which they were
	// received.
	RequestsReceivedUnknownTargetAddress *StatCounter

	// OutgoingRequestInterfaceHasNoLocalAddressErrors is the number of failures
	// to send an ARP request because the interface has no network address
	// assigned to it.
	OutgoingRequestInterfaceHasNoLocalAddressErrors *StatCounter

	// OutgoingRequestBadLocalAddressErrors is the number of failures to send an
	// ARP request with a bad local address.
	OutgoingRequestBadLocalAddressErrors *StatCounter

	// OutgoingRequestsDropped is the number of ARP requests which failed to write
	// to a link-layer endpoint.
	OutgoingRequestsDropped *StatCounter

	// OutgoingRequestsSent is the number of ARP requests successfully written to
	// a link-layer endpoint.
	OutgoingRequestsSent *StatCounter

	// RepliesReceived is the number of ARP replies received.
	RepliesReceived *StatCounter

	// OutgoingRepliesDropped is the number of ARP replies which failed to write
	// to a link-layer endpoint.
	OutgoingRepliesDropped *StatCounter

	// OutgoingRepliesSent is the number of ARP replies successfully written to a
	// link-layer endpoint.
	OutgoingRepliesSent *StatCounter

	// LINT.ThenChange(network/arp/stats.go:multiCounterARPStats)
}

// TCPStats collects TCP-specific stats.
type TCPStats struct {
	// ActiveConnectionOpenings is the number of connections opened
	// successfully via Connect.
	ActiveConnectionOpenings *StatCounter

	// PassiveConnectionOpenings is the number of connections opened
	// successfully via Listen.
	PassiveConnectionOpenings *StatCounter

	// CurrentEstablished is the number of TCP connections for which the
	// current state is ESTABLISHED.
	CurrentEstablished *StatCounter

	// CurrentConnected is the number of TCP connections that
	// are in connected state.
	CurrentConnected *StatCounter

	// EstablishedResets is the number of times TCP connections have made
	// a direct transition to the CLOSED state from either the
	// ESTABLISHED state or the CLOSE-WAIT state.
	EstablishedResets *StatCounter

	// EstablishedClosed is the number of times established TCP connections
	// made a transition to CLOSED state.
	EstablishedClosed *StatCounter

	// EstablishedTimedout is the number of times an established connection
	// was reset because of keep-alive time out.
	EstablishedTimedout *StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop *StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop *StatCounter

	// ListenOverflowSynCookieSent is the number of times a SYN cookie was sent.
	ListenOverflowSynCookieSent *StatCounter

	// ListenOverflowSynCookieRcvd is the number of times a valid SYN
	// cookie was received.
	ListenOverflowSynCookieRcvd *StatCounter

	// ListenOverflowInvalidSynCookieRcvd is the number of times an invalid SYN
	// cookie was received.
ListenOverflowInvalidSynCookieRcvd *StatCounter // FailedConnectionAttempts is the number of calls to Connect or Listen // (active and passive openings, respectively) that end in an error. FailedConnectionAttempts *StatCounter // ValidSegmentsReceived is the number of TCP segments received that // the transport layer successfully parsed. ValidSegmentsReceived *StatCounter // InvalidSegmentsReceived is the number of TCP segments received that // the transport layer could not parse. InvalidSegmentsReceived *StatCounter // SegmentsSent is the number of TCP segments sent. SegmentsSent *StatCounter // SegmentSendErrors is the number of TCP segments failed to be sent. SegmentSendErrors *StatCounter // ResetsSent is the number of TCP resets sent. ResetsSent *StatCounter // ResetsReceived is the number of TCP resets received. ResetsReceived *StatCounter // Retransmits is the number of TCP segments retransmitted. Retransmits *StatCounter // FastRecovery is the number of times Fast Recovery was used to // recover from packet loss. FastRecovery *StatCounter // SACKRecovery is the number of times SACK Recovery was used to // recover from packet loss. SACKRecovery *StatCounter // TLPRecovery is the number of times recovery was accomplished by the tail // loss probe. TLPRecovery *StatCounter // SlowStartRetransmits is the number of segments retransmitted in slow // start. SlowStartRetransmits *StatCounter // FastRetransmit is the number of segments retransmitted in fast // recovery. FastRetransmit *StatCounter // Timeouts is the number of times the RTO expired. Timeouts *StatCounter // ChecksumErrors is the number of segments dropped due to bad checksums. ChecksumErrors *StatCounter // FailedPortReservations is the number of times TCP failed to reserve // a port. FailedPortReservations *StatCounter } // UDPStats collects UDP-specific stats. type UDPStats struct { // PacketsReceived is the number of UDP datagrams received via // HandlePacket. PacketsReceived *StatCounter // UnknownPortErrors is the number of incoming UDP datagrams dropped // because they did not have a known destination port. UnknownPortErrors *StatCounter // ReceiveBufferErrors is the number of incoming UDP datagrams dropped // due to the receiving buffer being in an invalid state. ReceiveBufferErrors *StatCounter // MalformedPacketsReceived is the number of incoming UDP datagrams // dropped due to the UDP header being in a malformed state. MalformedPacketsReceived *StatCounter // PacketsSent is the number of UDP datagrams sent via sendUDP. PacketsSent *StatCounter // PacketSendErrors is the number of datagrams failed to be sent. PacketSendErrors *StatCounter // ChecksumErrors is the number of datagrams dropped due to bad checksums. ChecksumErrors *StatCounter } // NICNeighborStats holds metrics for the neighbor table. type NICNeighborStats struct { // LINT.IfChange(NICNeighborStats) // UnreachableEntryLookups counts the number of lookups performed on an // entry in Unreachable state. UnreachableEntryLookups *StatCounter // LINT.ThenChange(stack/nic_stats.go:multiCounterNICNeighborStats) } // NICPacketStats holds basic packet statistics. type NICPacketStats struct { // LINT.IfChange(NICPacketStats) // Packets is the number of packets counted. Packets *StatCounter // Bytes is the number of bytes counted. Bytes *StatCounter // LINT.ThenChange(stack/nic_stats.go:multiCounterNICPacketStats) } // NICStats holds NIC statistics. 
type NICStats struct { // LINT.IfChange(NICStats) // UnknownL3ProtocolRcvdPackets is the number of packets received that were // for an unknown or unsupported network protocol. UnknownL3ProtocolRcvdPackets *StatCounter // UnknownL4ProtocolRcvdPackets is the number of packets received that were // for an unknown or unsupported transport protocol. UnknownL4ProtocolRcvdPackets *StatCounter // MalformedL4RcvdPackets is the number of packets received by a NIC that // could not be delivered to a transport endpoint because the L4 header could // not be parsed. MalformedL4RcvdPackets *StatCounter // Tx contains statistics about transmitted packets. Tx NICPacketStats // Rx contains statistics about received packets. Rx NICPacketStats // DisabledRx contains statistics about received packets on disabled NICs. DisabledRx NICPacketStats // Neighbor contains statistics about neighbor entries. Neighbor NICNeighborStats // LINT.ThenChange(stack/nic_stats.go:multiCounterNICStats) } // FillIn returns a copy of s with nil fields initialized to new StatCounters. func (s NICStats) FillIn() NICStats { InitStatCounters(reflect.ValueOf(&s).Elem()) return s } // Stats holds statistics about the networking stack. type Stats struct { // TODO(https://gvisor.dev/issues/5986): Make the DroppedPackets stat less // ambiguous. // DroppedPackets is the number of packets dropped at the transport layer. DroppedPackets *StatCounter // NICs is an aggregation of every NIC's statistics. These should not be // incremented using this field, but using the relevant NIC multicounters. NICs NICStats // ICMP is an aggregation of every NetworkEndpoint's ICMP statistics (both v4 // and v6). These should not be incremented using this field, but using the // relevant NetworkEndpoint ICMP multicounters. ICMP ICMPStats // IGMP is an aggregation of every NetworkEndpoint's IGMP statistics. These // should not be incremented using this field, but using the relevant // NetworkEndpoint IGMP multicounters. IGMP IGMPStats // IP is an aggregation of every NetworkEndpoint's IP statistics. These should // not be incremented using this field, but using the relevant NetworkEndpoint // IP multicounters. IP IPStats // ARP is an aggregation of every NetworkEndpoint's ARP statistics. These // should not be incremented using this field, but using the relevant // NetworkEndpoint ARP multicounters. ARP ARPStats // TCP holds TCP-specific stats. TCP TCPStats // UDP holds UDP-specific stats. UDP UDPStats } // ReceiveErrors collects packet receive errors within transport endpoint. type ReceiveErrors struct { // ReceiveBufferOverflow is the number of received packets dropped // due to the receive buffer being full. ReceiveBufferOverflow StatCounter // MalformedPacketsReceived is the number of incoming packets // dropped due to the packet header being in a malformed state. MalformedPacketsReceived StatCounter // ClosedReceiver is the number of received packets dropped because // of receiving endpoint state being closed. ClosedReceiver StatCounter // ChecksumErrors is the number of packets dropped due to bad checksums. ChecksumErrors StatCounter } // SendErrors collects packet send errors within the transport layer for // an endpoint. type SendErrors struct { // SendToNetworkFailed is the number of packets failed to be written to // the network endpoint. SendToNetworkFailed StatCounter // NoRoute is the number of times we failed to resolve IP route. NoRoute StatCounter } // ReadErrors collects segment read errors from an endpoint read call. 
type ReadErrors struct { // ReadClosed is the number of received packet drops because the endpoint // was shutdown for read. ReadClosed StatCounter // InvalidEndpointState is the number of times we found the endpoint state // to be unexpected. InvalidEndpointState StatCounter // NotConnected is the number of times we tried to read but found that the // endpoint was not connected. NotConnected StatCounter } // WriteErrors collects packet write errors from an endpoint write call. type WriteErrors struct { // WriteClosed is the number of packet drops because the endpoint // was shutdown for write. WriteClosed StatCounter // InvalidEndpointState is the number of times we found the endpoint state // to be unexpected. InvalidEndpointState StatCounter // InvalidArgs is the number of times invalid input arguments were // provided for endpoint Write call. InvalidArgs StatCounter } // TransportEndpointStats collects statistics about the endpoint. type TransportEndpointStats struct { // PacketsReceived is the number of successful packet receives. PacketsReceived StatCounter // PacketsSent is the number of successful packet sends. PacketsSent StatCounter // ReceiveErrors collects packet receive errors within transport layer. ReceiveErrors ReceiveErrors // ReadErrors collects packet read errors from an endpoint read call. ReadErrors ReadErrors // SendErrors collects packet send errors within the transport layer. SendErrors SendErrors // WriteErrors collects packet write errors from an endpoint write call. WriteErrors WriteErrors } // IsEndpointStats is an empty method to implement the tcpip.EndpointStats // marker interface. func (*TransportEndpointStats) IsEndpointStats() {} // InitStatCounters initializes v's fields with nil StatCounter fields to new // StatCounters. func InitStatCounters(v reflect.Value) { for i := 0; i < v.NumField(); i++ { v := v.Field(i) if s, ok := v.Addr().Interface().(**StatCounter); ok { if *s == nil { *s = new(StatCounter) } } else { InitStatCounters(v) } } } // FillIn returns a copy of s with nil fields initialized to new StatCounters. func (s Stats) FillIn() Stats { InitStatCounters(reflect.ValueOf(&s).Elem()) return s } // Clone returns a copy of the TransportEndpointStats by atomically reading // each field. func (src *TransportEndpointStats) Clone() TransportEndpointStats { var dst TransportEndpointStats clone(reflect.ValueOf(&dst).Elem(), reflect.ValueOf(src).Elem()) return dst } func clone(dst reflect.Value, src reflect.Value) { for i := 0; i < dst.NumField(); i++ { d := dst.Field(i) s := src.Field(i) if c, ok := s.Addr().Interface().(*StatCounter); ok { d.Addr().Interface().(*StatCounter).IncrementBy(c.Value()) } else { clone(d, s) } } } // String implements the fmt.Stringer interface. func (a Address) String() string { switch len(a) { case 4: return fmt.Sprintf("%d.%d.%d.%d", int(a[0]), int(a[1]), int(a[2]), int(a[3])) case 16: // Find the longest subsequence of hexadecimal zeros. 
start, end := -1, -1 for i := 0; i < len(a); i += 2 { j := i for j < len(a) && a[j] == 0 && a[j+1] == 0 { j += 2 } if j > i+2 && j-i > end-start { start, end = i, j } } var b strings.Builder for i := 0; i < len(a); i += 2 { if i == start { b.WriteString("::") i = end if end >= len(a) { break } } else if i > 0 { b.WriteByte(':') } v := uint16(a[i+0])<<8 | uint16(a[i+1]) if v == 0 { b.WriteByte('0') } else { const digits = "0123456789abcdef" for i := uint(3); i < 4; i-- { if v := v >> (i * 4); v != 0 { b.WriteByte(digits[v&0xf]) } } } } return b.String() default: return fmt.Sprintf("%x", []byte(a)) } } // To4 converts the IPv4 address to a 4-byte representation. // If the address is not an IPv4 address, To4 returns "". func (a Address) To4() Address { const ( ipv4len = 4 ipv6len = 16 ) if len(a) == ipv4len { return a } if len(a) == ipv6len && isZeros(a[0:10]) && a[10] == 0xff && a[11] == 0xff { return a[12:16] } return "" } // isZeros reports whether a is all zeros. func isZeros(a Address) bool { for i := 0; i < len(a); i++ { if a[i] != 0 { return false } } return true } // LinkAddress is a byte slice cast as a string that represents a link address. // It is typically a 6-byte MAC address. type LinkAddress string // String implements the fmt.Stringer interface. func (a LinkAddress) String() string { switch len(a) { case 6: return fmt.Sprintf("%02x:%02x:%02x:%02x:%02x:%02x", a[0], a[1], a[2], a[3], a[4], a[5]) default: return fmt.Sprintf("%x", []byte(a)) } } // ParseMACAddress parses an IEEE 802 address. // // It must be in the format aa:bb:cc:dd:ee:ff or aa-bb-cc-dd-ee-ff. func ParseMACAddress(s string) (LinkAddress, error) { parts := strings.FieldsFunc(s, func(c rune) bool { return c == ':' || c == '-' }) if len(parts) != 6 { return "", fmt.Errorf("inconsistent parts: %s", s) } addr := make([]byte, 0, len(parts)) for _, part := range parts { u, err := strconv.ParseUint(part, 16, 8) if err != nil { return "", fmt.Errorf("invalid hex digits: %s", s) } addr = append(addr, byte(u)) } return LinkAddress(addr), nil } // AddressWithPrefix is an address with its subnet prefix length. type AddressWithPrefix struct { // Address is a network address. Address Address // PrefixLen is the subnet prefix length. PrefixLen int } // String implements the fmt.Stringer interface. func (a AddressWithPrefix) String() string { return fmt.Sprintf("%s/%d", a.Address, a.PrefixLen) } // Subnet converts the address and prefix into a Subnet value and returns it. func (a AddressWithPrefix) Subnet() Subnet { addrLen := len(a.Address) if a.PrefixLen <= 0 { return Subnet{ address: Address(strings.Repeat("\x00", addrLen)), mask: AddressMask(strings.Repeat("\x00", addrLen)), } } if a.PrefixLen >= addrLen*8 { return Subnet{ address: a.Address, mask: AddressMask(strings.Repeat("\xff", addrLen)), } } sa := make([]byte, addrLen) sm := make([]byte, addrLen) n := uint(a.PrefixLen) for i := 0; i < addrLen; i++ { if n >= 8 { sa[i] = a.Address[i] sm[i] = 0xff n -= 8 continue } sm[i] = ^byte(0xff >> n) sa[i] = a.Address[i] & sm[i] n = 0 } // For extra caution, call NewSubnet rather than directly creating the Subnet // value. If that fails it indicates a serious bug in this code, so panic is // in order. s, err := NewSubnet(Address(sa), AddressMask(sm)) if err != nil { panic("invalid subnet: " + err.Error()) } return s } // ProtocolAddress is an address and the network protocol it is associated // with. type ProtocolAddress struct { // Protocol is the protocol of the address. 
Protocol NetworkProtocolNumber // AddressWithPrefix is a network address with its subnet prefix length. AddressWithPrefix AddressWithPrefix } var ( // danglingEndpointsMu protects access to danglingEndpoints. danglingEndpointsMu sync.Mutex // danglingEndpoints tracks all dangling endpoints no longer owned by the app. danglingEndpoints = make(map[Endpoint]struct{}) ) // GetDanglingEndpoints returns all dangling endpoints. func GetDanglingEndpoints() []Endpoint { danglingEndpointsMu.Lock() es := make([]Endpoint, 0, len(danglingEndpoints)) for e := range danglingEndpoints { es = append(es, e) } danglingEndpointsMu.Unlock() return es } // AddDanglingEndpoint adds a dangling endpoint. func AddDanglingEndpoint(e Endpoint) { danglingEndpointsMu.Lock() danglingEndpoints[e] = struct{}{} danglingEndpointsMu.Unlock() } // DeleteDanglingEndpoint removes a dangling endpoint. func DeleteDanglingEndpoint(e Endpoint) { danglingEndpointsMu.Lock() delete(danglingEndpoints, e) danglingEndpointsMu.Unlock() } // AsyncLoading is the global barrier for asynchronous endpoint loading // activities. var AsyncLoading sync.WaitGroup
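// Example (editor's sketch, not part of the original source): the StatCounter
// and Stats types above are typically used by first calling FillIn, which
// replaces nil *StatCounter fields with fresh counters, and then updating
// individual counters; updates use atomic operations and are safe for
// concurrent use. A minimal, self-contained illustration:
//
//	package main
//
//	import (
//		"fmt"
//
//		"gvisor.dev/gvisor/pkg/tcpip"
//	)
//
//	func main() {
//		// FillIn makes a zero-value Stats usable immediately.
//		stats := tcpip.Stats{}.FillIn()
//
//		stats.TCP.SegmentsSent.IncrementBy(3)
//		stats.TCP.SegmentsSent.Increment()
//		fmt.Println(stats.TCP.SegmentsSent.Value()) // Prints 4.
//	}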
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

// This file implements task stops, which represent the equivalent of Linux's
// uninterruptible sleep states in a way that is compatible with save/restore.
// Task stops comprise both internal stops (which form part of the task's
// "normal" control flow) and external stops (which do not); see README.md for
// details.
//
// There are multiple interfaces for interacting with stops because there are
// multiple cases to consider:
//
// - A task goroutine can begin a stop on its associated task (e.g. a
// vfork() syscall stopping the calling task until the child task releases its
// MM). In this case, calling Task.interrupt is both unnecessary (the task
// goroutine obviously cannot be blocked in Task.block or executing application
// code) and undesirable (as it may spuriously interrupt an in-progress
// syscall).
//
// Beginning internal stops in this case is implemented by
// Task.beginInternalStop / Task.beginInternalStopLocked. As of this writing,
// there are no instances of this case that begin external stops, except for
// autosave; however, autosave terminates the sentry without ending the
// external stop, so the spurious interrupt is moot.
//
// - An arbitrary goroutine can begin a stop on an unrelated task (e.g. all
// tasks being stopped in preparation for state checkpointing). If the task
// goroutine may be in Task.block or executing application code, it must be
// interrupted by Task.interrupt for it to actually enter the stop; since,
// strictly speaking, we have no way of determining this, we call
// Task.interrupt unconditionally.
//
// Beginning external stops in this case is implemented by
// Task.BeginExternalStop. As of this writing, there are no instances of this
// case that begin internal stops.
//
// - An arbitrary goroutine can end a stop on an unrelated task (e.g. an
// exiting task resuming a sibling task that has been blocked in an execve()
// syscall waiting for other tasks to exit). In this case, Task.endStopCond
// must be notified to kick the task goroutine out of Task.doStop.
// // Ending internal stops in this case is implemented by // Task.endInternalStopLocked. Ending external stops in this case is // implemented by Task.EndExternalStop. // // - Hypothetically, a task goroutine can end an internal stop on its // associated task. As of this writing, there are no instances of this case. // However, any instances of this case could still use the above functions, // since notifying Task.endStopCond would be unnecessary but harmless. import ( "fmt" "sync/atomic" ) // A TaskStop is a condition visible to the task control flow graph that // prevents a task goroutine from running or exiting, i.e. an internal stop. // // NOTE(b/30793614): Most TaskStops don't contain any data; they're // distinguished by their type. The obvious way to implement such a TaskStop // is: // // type groupStop struct{} // func (groupStop) Killable() bool { return true } // ... // t.beginInternalStop(groupStop{}) // // However, this doesn't work because the state package can't serialize values, // only pointers. Furthermore, the correctness of save/restore depends on the // ability to pass a TaskStop to endInternalStop that will compare equal to the // TaskStop that was passed to beginInternalStop, even if a save/restore cycle // occurred between the two. As a result, the current idiom is to always use a // typecast nil for data-free TaskStops: // // type groupStop struct{} // func (*groupStop) Killable() bool { return true } // ... // t.beginInternalStop((*groupStop)(nil)) // // This is pretty gross, but the alternatives seem grosser. type TaskStop interface { // Killable returns true if Task.Kill should end the stop prematurely. // Killable is analogous to Linux's TASK_WAKEKILL. Killable() bool } // beginInternalStop indicates the start of an internal stop that applies to t. // // Preconditions: // * The caller must be running on the task goroutine. // * The task must not already be in an internal stop (i.e. t.stop == nil). func (t *Task) beginInternalStop(s TaskStop) { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.beginInternalStopLocked(s) } // Preconditions: Same as beginInternalStop, plus: // * The signal mutex must be locked. func (t *Task) beginInternalStopLocked(s TaskStop) { if t.stop != nil { panic(fmt.Sprintf("Attempting to enter internal stop %#v when already in internal stop %#v", s, t.stop)) } t.Debugf("Entering internal stop %#v", s) t.stop = s t.beginStopLocked() } // endInternalStopLocked indicates the end of an internal stop that applies to // t. endInternalStopLocked does not wait for the task to resume. // // The caller is responsible for ensuring that the internal stop they expect // actually applies to t; this requires holding the signal mutex which protects // t.stop, which is why there is no endInternalStop that locks the signal mutex // for you. // // Preconditions: // * The signal mutex must be locked. // * The task must be in an internal stop (i.e. t.stop != nil). func (t *Task) endInternalStopLocked() { if t.stop == nil { panic("Attempting to leave non-existent internal stop") } t.Debugf("Leaving internal stop %#v", t.stop) t.stop = nil t.endStopLocked() } // BeginExternalStop indicates the start of an external stop that applies to t. // BeginExternalStop does not wait for t's task goroutine to stop. 
func (t *Task) BeginExternalStop() { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.beginStopLocked() t.interrupt() } // EndExternalStop indicates the end of an external stop started by a previous // call to Task.BeginExternalStop. EndExternalStop does not wait for t's task // goroutine to resume. func (t *Task) EndExternalStop() { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.endStopLocked() } // beginStopLocked increments t.stopCount to indicate that a new internal or // external stop applies to t. // // Preconditions: The signal mutex must be locked. func (t *Task) beginStopLocked() { if newval := atomic.AddInt32(&t.stopCount, 1); newval <= 0 { // Most likely overflow. panic(fmt.Sprintf("Invalid stopCount: %d", newval)) } } // endStopLocked decrements t.stopCount to indicate that an existing internal // or external stop no longer applies to t. // // Preconditions: The signal mutex must be locked. func (t *Task) endStopLocked() { if newval := atomic.AddInt32(&t.stopCount, -1); newval < 0 { panic(fmt.Sprintf("Invalid stopCount: %d", newval)) } else if newval == 0 { t.endStopCond.Signal() } } // BeginExternalStop indicates the start of an external stop that applies to // all current and future tasks in ts. BeginExternalStop does not wait for // task goroutines to stop. func (ts *TaskSet) BeginExternalStop() { ts.mu.Lock() defer ts.mu.Unlock() ts.stopCount++ if ts.stopCount <= 0 { panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) } if ts.Root == nil { return } for t := range ts.Root.tids { t.tg.signalHandlers.mu.Lock() t.beginStopLocked() t.tg.signalHandlers.mu.Unlock() t.interrupt() } } // PullFullState receives full states for all tasks. func (ts *TaskSet) PullFullState() { ts.mu.Lock() defer ts.mu.Unlock() if ts.Root == nil { return } for t := range ts.Root.tids { t.Activate() if mm := t.MemoryManager(); mm != nil { t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) } t.Deactivate() } } // EndExternalStop indicates the end of an external stop started by a previous // call to TaskSet.BeginExternalStop. EndExternalStop does not wait for task // goroutines to resume. func (ts *TaskSet) EndExternalStop() { ts.mu.Lock() defer ts.mu.Unlock() ts.stopCount-- if ts.stopCount < 0 { panic(fmt.Sprintf("Invalid stopCount: %d", ts.stopCount)) } if ts.Root == nil { return } for t := range ts.Root.tids { t.tg.signalHandlers.mu.Lock() t.endStopLocked() t.tg.signalHandlers.mu.Unlock() } }
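// Example (editor's sketch, not part of the original source): the stop-count
// mechanism above can be modeled in isolation as a counter plus a condition
// variable: a task is stopped while stopCount > 0, and the goroutine blocked
// in doStop is woken when the last stop ends. The names below are
// illustrative stand-ins, not gVisor's actual types.
//
//	package main
//
//	import (
//		"fmt"
//		"sync"
//	)
//
//	type toyTask struct {
//		mu          sync.Mutex
//		stopCount   int
//		endStopCond *sync.Cond
//	}
//
//	func newToyTask() *toyTask {
//		t := &toyTask{}
//		t.endStopCond = sync.NewCond(&t.mu)
//		return t
//	}
//
//	// beginStop mirrors beginStopLocked: one more stop applies to the task.
//	func (t *toyTask) beginStop() {
//		t.mu.Lock()
//		defer t.mu.Unlock()
//		t.stopCount++
//	}
//
//	// endStop mirrors endStopLocked: wake the task when no stops remain.
//	func (t *toyTask) endStop() {
//		t.mu.Lock()
//		defer t.mu.Unlock()
//		t.stopCount--
//		if t.stopCount == 0 {
//			t.endStopCond.Signal()
//		}
//	}
//
//	// doStop blocks the "task goroutine" until no stops apply.
//	func (t *toyTask) doStop() {
//		t.mu.Lock()
//		defer t.mu.Unlock()
//		for t.stopCount > 0 {
//			t.endStopCond.Wait()
//		}
//	}
//
//	func main() {
//		t := newToyTask()
//		t.beginStop()
//		done := make(chan struct{})
//		go func() {
//			t.doStop() // Blocks until the stop ends.
//			close(done)
//		}()
//		t.endStop()
//		<-done
//		fmt.Println("task resumed")
//	}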
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs2

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/loader"
	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
)

// Execve implements linux syscall execve(2).
func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathnameAddr := args[0].Pointer()
	argvAddr := args[1].Pointer()
	envvAddr := args[2].Pointer()
	return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */)
}

// Execveat implements linux syscall execveat(2).
func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirfd := args[0].Int()
	pathnameAddr := args[1].Pointer()
	argvAddr := args[2].Pointer()
	envvAddr := args[3].Pointer()
	flags := args[4].Int()
	return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags)
}

func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr hostarch.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
	if err != nil {
		return 0, nil, err
	}
	var argv, envv []string
	if argvAddr != 0 {
		var err error
		argv, err = t.CopyInVector(argvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
		if err != nil {
			return 0, nil, err
		}
	}
	if envvAddr != 0 {
		var err error
		envv, err = t.CopyInVector(envvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
		if err != nil {
			return 0, nil, err
		}
	}

	root := t.FSContext().RootDirectoryVFS2()
	defer root.DecRef(t)
	var executable fsbridge.File
	closeOnExec := false
	if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute {
		// We must open the executable ourselves since dirfd is used as the
		// starting point while resolving path, but the task working directory
		// is used as the starting point while resolving interpreters (Linux:
		// fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() =>
		// do_open_execat(fd=AT_FDCWD)), and the loader package is currently
		// incapable of handling this correctly.
if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { return 0, nil, syserror.ENOENT } dirfile, dirfileFlags := t.FDTable().GetVFS2(dirfd) if dirfile == nil { return 0, nil, linuxerr.EBADF } start := dirfile.VirtualDentry() start.IncRef() dirfile.DecRef(t) closeOnExec = dirfileFlags.CloseOnExec file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{ Root: root, Start: start, Path: path, FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, }, &vfs.OpenOptions{ Flags: linux.O_RDONLY, FileExec: true, }) start.DecRef(t) if err != nil { return 0, nil, err } defer file.DecRef(t) executable = fsbridge.NewVFSFile(file) } // Load the new TaskImage. mntns := t.MountNamespaceVFS2() wd := t.FSContext().WorkingDirectoryVFS2() defer wd.DecRef(t) remainingTraversals := uint(linux.MaxSymlinkTraversals) loadArgs := loader.LoadArgs{ Opener: fsbridge.NewVFSLookup(mntns, root, wd), RemainingTraversals: &remainingTraversals, ResolveFinal: flags&linux.AT_SYMLINK_NOFOLLOW == 0, Filename: pathname, File: executable, CloseOnExec: closeOnExec, Argv: argv, Envv: envv, Features: t.Arch().FeatureSet(), } image, se := t.Kernel().LoadTaskImage(t, loadArgs) if se != nil { return 0, nil, se.ToError() } ctrl, err := t.Execve(image) return 0, ctrl, err }
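// Example (editor's sketch, not part of the original source): the flag
// validation at the top of execveat uses Go's AND-NOT operator to reject any
// bit outside the supported set. A stand-alone illustration with made-up
// constant values (the real values come from the linux package):
//
//	package main
//
//	import "fmt"
//
//	const (
//		atSymlinkNofollow = 0x100  // Illustrative stand-in for linux.AT_SYMLINK_NOFOLLOW.
//		atEmptyPath       = 0x1000 // Illustrative stand-in for linux.AT_EMPTY_PATH.
//	)
//
//	func checkFlags(flags int32) error {
//		// flags&^mask is non-zero iff flags has a bit outside mask.
//		if flags&^(atEmptyPath|atSymlinkNofollow) != 0 {
//			return fmt.Errorf("unsupported flags: %#x", flags)
//		}
//		return nil
//	}
//
//	func main() {
//		fmt.Println(checkFlags(atEmptyPath))       // <nil>
//		fmt.Println(checkFlags(atEmptyPath | 0x4)) // error
//	}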
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package host

import (
	"unsafe"

	"golang.org/x/sys/unix"
)

// fdReadVec receives from fd to bufs.
//
// If the total length of bufs is > maxlen, fdReadVec will do a partial read
// and err will indicate why the message was truncated.
func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (readLen int64, msgLen int64, controlLen uint64, controlTrunc bool, err error) {
	flags := uintptr(unix.MSG_DONTWAIT | unix.MSG_TRUNC)
	if peek {
		flags |= unix.MSG_PEEK
	}

	// Always truncate the receive buffer. All socket types will truncate
	// received messages.
	length, iovecs, intermediate, err := buildIovec(bufs, maxlen, true)
	if err != nil && len(iovecs) == 0 {
		// No partial read to do, return error immediately.
		return 0, 0, 0, false, err
	}

	var msg unix.Msghdr
	if len(control) != 0 {
		msg.Control = &control[0]
		msg.Controllen = uint64(len(control))
	}

	if len(iovecs) != 0 {
		msg.Iov = &iovecs[0]
		msg.Iovlen = uint64(len(iovecs))
	}

	rawN, _, e := unix.RawSyscall(unix.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags)
	if e != 0 {
		// N.B. prioritize the syscall error over the buildIovec error.
		return 0, 0, 0, false, e
	}
	n := int64(rawN)

	// Copy data back to bufs.
	if intermediate != nil {
		copyToMulti(bufs, intermediate)
	}

	controlTrunc = msg.Flags&unix.MSG_CTRUNC == unix.MSG_CTRUNC

	if n > length {
		return length, n, msg.Controllen, controlTrunc, nil
	}

	return n, n, msg.Controllen, controlTrunc, nil
}

// fdWriteVec sends from bufs to fd.
//
// If the total length of bufs is > maxlen && truncate, fdWriteVec will do a
// partial write and err will indicate why the message was truncated.
func fdWriteVec(fd int, bufs [][]byte, maxlen int64, truncate bool) (int64, int64, error) {
	length, iovecs, intermediate, err := buildIovec(bufs, maxlen, truncate)
	if err != nil && len(iovecs) == 0 {
		// No partial write to do, return error immediately.
		return 0, length, err
	}

	// Copy data to intermediate buf.
	if intermediate != nil {
		copyFromMulti(intermediate, bufs)
	}

	var msg unix.Msghdr
	if len(iovecs) > 0 {
		msg.Iov = &iovecs[0]
		msg.Iovlen = uint64(len(iovecs))
	}

	n, _, e := unix.RawSyscall(unix.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), unix.MSG_DONTWAIT|unix.MSG_NOSIGNAL)
	if e != 0 {
		// N.B. prioritize the syscall error over the buildIovec error.
		return 0, length, e
	}

	return int64(n), length, err
}
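// Example (editor's sketch, not part of the original source): buildIovec is
// referenced above but not shown in this file. A simplified stand-in for its
// core transformation is sketched below: it turns a [][]byte into a
// unix.Iovec slice, capping the total length at maxlen. The real helper
// additionally reports truncation errors and may allocate an intermediate
// buffer; this is an assumption-labeled illustration, not the actual
// implementation.
//
//	package main
//
//	import (
//		"fmt"
//
//		"golang.org/x/sys/unix"
//	)
//
//	func simpleIovecs(bufs [][]byte, maxlen int64) (int64, []unix.Iovec) {
//		var (
//			total  int64
//			iovecs []unix.Iovec
//		)
//		for _, b := range bufs {
//			if total >= maxlen {
//				break
//			}
//			// Trim the last buffer so the total never exceeds maxlen.
//			if remaining := maxlen - total; int64(len(b)) > remaining {
//				b = b[:remaining]
//			}
//			if len(b) == 0 {
//				continue
//			}
//			iov := unix.Iovec{Base: &b[0]}
//			iov.SetLen(len(b))
//			iovecs = append(iovecs, iov)
//			total += int64(len(b))
//		}
//		return total, iovecs
//	}
//
//	func main() {
//		bufs := [][]byte{make([]byte, 4), make([]byte, 8)}
//		n, iovs := simpleIovecs(bufs, 10)
//		fmt.Println(n, len(iovs)) // Prints: 10 2
//	}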
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package kernfs provides the tools to implement inode-based filesystems.
// Kernfs has two main features:
//
// 1. The Inode interface, which maps VFS2's path-based filesystem operations
//    to specific filesystem nodes. Kernfs uses the Inode interface to provide
//    a blanket implementation for the vfs.FilesystemImpl. Kernfs also serves
//    as the synchronization mechanism for all filesystem operations by holding
//    a filesystem-wide lock across all operations.
//
// 2. Various utility types which provide generic implementations for various
//    parts of the Inode and vfs.FileDescription interfaces. Client filesystems
//    based on kernfs can embed the appropriate set of these to avoid having to
//    reimplement common filesystem operations. See inode_impl_util.go and
//    fd_impl_util.go.
//
// Reference Model:
//
// Kernfs dentries represent named pointers to inodes. Kernfs is solely
// responsible for maintaining and modifying its dentry tree; inode
// implementations cannot access the tree. Dentries and inodes have
// independent lifetimes and reference counts. A child dentry unconditionally
// holds a reference on its parent directory's dentry. A dentry also holds a
// reference on the inode it points to (although that might not be the only
// reference on the inode). Because of this, inodes can outlive the dentries
// that point to them. Multiple dentries can point to the same inode (for
// example, in the case of hardlinks). File descriptors hold a reference to the
// dentry they're opened on.
//
// Dentries are guaranteed to exist while holding Filesystem.mu for
// reading. Dropping dentries requires holding Filesystem.mu for writing. To
// queue dentries for destruction from a read critical section, see
// Filesystem.deferDecRef.
//
// Lock ordering:
//
//	kernfs.Filesystem.mu
//		kernfs.Dentry.dirMu
//			vfs.VirtualFilesystem.mountMu
//				vfs.Dentry.mu
//		(inode implementation locks, if any)
//	kernfs.Filesystem.deferredDecRefsMu
package kernfs

import (
	"fmt"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/refsvfs2"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
)

// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
// filesystem. Concrete implementations are expected to embed this in their own
// Filesystem type.
//
// +stateify savable
type Filesystem struct {
	vfsfs vfs.Filesystem

	deferredDecRefsMu sync.Mutex `state:"nosave"`

	// deferredDecRefs is a list of dentries waiting to be DecRef()ed. This is
	// used to defer dentry destruction until mu can be acquired for
	// writing. Protected by deferredDecRefsMu.
	deferredDecRefs []refsvfs2.RefCounter

	// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
	// for reading guarantees continued existence of any resolved dentries, but
	// the dentry tree may be modified.
	//
	// Kernfs dentries can only be DecRef()ed while holding mu for writing. For
	// example:
	//
	//	fs.mu.Lock()
	//	defer fs.mu.Unlock()
	//	...
	//	dentry1.DecRef()
	//	defer dentry2.DecRef() // Ok, will run before Unlock.
	//
	// If discarding dentries in a read context, use Filesystem.deferDecRef. For
	// example:
	//
	//	fs.mu.RLock()
	//	defer fs.processDeferredDecRefs()
	//	defer fs.mu.RUnlock()
	//	...
	//	fs.deferDecRef(dentry)
	mu sync.RWMutex `state:"nosave"`

	// nextInoMinusOne is used to allocate inode numbers on this
	// filesystem. Must be accessed by atomic operations.
	nextInoMinusOne uint64

	// cachedDentries contains all dentries with 0 references. (Due to race
	// conditions, it may also contain dentries with non-zero references.)
	// cachedDentriesLen is the number of dentries in cachedDentries. These
	// fields are protected by mu.
	cachedDentries    dentryList
	cachedDentriesLen uint64

	// MaxCachedDentries is the maximum size of cachedDentries. If not set,
	// defaults to 0 and kernfs does not cache any dentries. This is immutable.
	MaxCachedDentries uint64

	// root is the root dentry of this filesystem. Note that root may be nil for
	// filesystems on a disconnected mount without a root (e.g. pipefs, sockfs,
	// hostfs). Filesystem holds an extra reference on root to prevent it from
	// being destroyed prematurely. This is immutable.
	root *Dentry
}

// deferDecRef defers dropping a dentry ref until the next call to
// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
// This may be called while Filesystem.mu or Dentry.dirMu is locked.
func (fs *Filesystem) deferDecRef(d refsvfs2.RefCounter) {
	fs.deferredDecRefsMu.Lock()
	fs.deferredDecRefs = append(fs.deferredDecRefs, d)
	fs.deferredDecRefsMu.Unlock()
}

// SafeDecRefFD safely DecRefs the FileDescription, making sure the DecRef is
// deferred in case Filesystem.mu is held. See comment on Filesystem.mu.
func (fs *Filesystem) SafeDecRefFD(ctx context.Context, fd *vfs.FileDescription) {
	if d, ok := fd.Dentry().Impl().(*Dentry); ok && d.fs == fs {
		// Only defer if dentry belongs to this filesystem, since locks cannot
		// cross filesystems.
		fs.deferDecRef(fd)
		return
	}
	fd.DecRef(ctx)
}

// SafeDecRef safely DecRefs the virtual dentry, making sure the DecRef is
// deferred in case Filesystem.mu is held. See comment on Filesystem.mu.
func (fs *Filesystem) SafeDecRef(ctx context.Context, vd vfs.VirtualDentry) {
	if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs {
		// Only defer if dentry belongs to this filesystem, since locks cannot
		// cross filesystems.
		fs.deferDecRef(&vd)
		return
	}
	vd.DecRef(ctx)
}

// processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the
// deferredDecRefs list. See comment on Filesystem.mu.
//
// Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked.
func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) {
	fs.deferredDecRefsMu.Lock()
	for _, d := range fs.deferredDecRefs {
		// Defer the DecRef call so that we are not holding deferredDecRefsMu
		// when DecRef is called.
		defer d.DecRef(ctx)
	}
	fs.deferredDecRefs = fs.deferredDecRefs[:0] // Keep slice memory for reuse.
	fs.deferredDecRefsMu.Unlock()
}

// VFSFilesystem returns the generic vfs filesystem object.
func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem {
	return &fs.vfsfs
}

// NextIno allocates a new inode number on this filesystem.
func (fs *Filesystem) NextIno() uint64 {
	return atomic.AddUint64(&fs.nextInoMinusOne, 1)
}

// These consts are used in the Dentry.flags field.
const (
	// Dentry points to a directory inode.
	dflagsIsDir = 1 << iota

	// Dentry points to a symlink inode.
	dflagsIsSymlink
)

// Dentry implements vfs.DentryImpl.
//
// A kernfs dentry is similar to a dentry in a traditional filesystem: it's a
// named reference to an inode. A dentry generally lives as long as it's part
// of a mounted filesystem tree. Kernfs drops dentries once all references to
// them are dropped.
Dentries hold a single reference to the inode they point // to, and child dentries hold a reference on their parent. // // Must be initialized by Init prior to first use. // // +stateify savable type Dentry struct { vfsd vfs.Dentry // refs is the reference count. When refs reaches 0, the dentry may be // added to the cache or destroyed. If refs == -1, the dentry has already // been destroyed. refs are allowed to go to 0 and increase again. refs is // accessed using atomic memory operations. refs int64 // fs is the owning filesystem. fs is immutable. fs *Filesystem // flags caches useful information about the dentry from the inode. See the // dflags* consts above. Must be accessed by atomic ops. flags uint32 parent *Dentry name string // If cached is true, dentryEntry links dentry into // Filesystem.cachedDentries. cached and dentryEntry are protected by // Filesystem.mu. cached bool dentryEntry // dirMu protects children and the names of child Dentries. // // Note that holding fs.mu for writing is not sufficient; // revalidateChildLocked(), which is a very hot path, may modify children with // fs.mu acquired for reading only. dirMu sync.Mutex `state:"nosave"` children map[string]*Dentry inode Inode } // IncRef implements vfs.DentryImpl.IncRef. func (d *Dentry) IncRef() { // d.refs may be 0 if d.fs.mu is locked, which serializes against // d.cacheLocked(). r := atomic.AddInt64(&d.refs, 1) if d.LogRefs() { refsvfs2.LogIncRef(d, r) } } // TryIncRef implements vfs.DentryImpl.TryIncRef. func (d *Dentry) TryIncRef() bool { for { r := atomic.LoadInt64(&d.refs) if r <= 0 { return false } if atomic.CompareAndSwapInt64(&d.refs, r, r+1) { if d.LogRefs() { refsvfs2.LogTryIncRef(d, r+1) } return true } } } // DecRef implements vfs.DentryImpl.DecRef. func (d *Dentry) DecRef(ctx context.Context) { r := atomic.AddInt64(&d.refs, -1) if d.LogRefs() { refsvfs2.LogDecRef(d, r) } if r == 0 { d.fs.mu.Lock() d.cacheLocked(ctx) d.fs.mu.Unlock() } else if r < 0 { panic("kernfs.Dentry.DecRef() called without holding a reference") } } func (d *Dentry) decRefLocked(ctx context.Context) { r := atomic.AddInt64(&d.refs, -1) if d.LogRefs() { refsvfs2.LogDecRef(d, r) } if r == 0 { d.cacheLocked(ctx) } else if r < 0 { panic("kernfs.Dentry.DecRef() called without holding a reference") } } // cacheLocked should be called after d's reference count becomes 0. The ref // count check may happen before acquiring d.fs.mu so there might be a race // condition where the ref count is increased again by the time the caller // acquires d.fs.mu. This race is handled. // Only reachable dentries are added to the cache. However, a dentry might // become unreachable *while* it is in the cache due to invalidation. // // Preconditions: d.fs.mu must be locked for writing. func (d *Dentry) cacheLocked(ctx context.Context) { // Dentries with a non-zero reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires d.fs.mu, so if d.refs is zero then it will // remain zero while we hold d.fs.mu for writing.) refs := atomic.LoadInt64(&d.refs) if refs == -1 { // Dentry has already been destroyed. return } if refs > 0 { if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- d.cached = false } return } // If the dentry is deleted and invalidated or has no parent, then it is no // longer reachable by path resolution and should be dropped immediately // because it has zero references. 
// Note that a dentry may not always have a parent; for example magic links // as described in Inode.Getlink. if isDead := d.VFSDentry().IsDead(); isDead || d.parent == nil { if !isDead { d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry()) } if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentriesLen-- d.cached = false } d.destroyLocked(ctx) return } // If d is already cached, just move it to the front of the LRU. if d.cached { d.fs.cachedDentries.Remove(d) d.fs.cachedDentries.PushFront(d) return } // Cache the dentry, then evict the least recently used cached dentry if // the cache becomes over-full. d.fs.cachedDentries.PushFront(d) d.fs.cachedDentriesLen++ d.cached = true if d.fs.cachedDentriesLen <= d.fs.MaxCachedDentries { return } d.fs.evictCachedDentryLocked(ctx) // Whether or not victim was destroyed, we brought fs.cachedDentriesLen // back down to fs.opts.maxCachedDentries, so we don't loop. } // Preconditions: // * fs.mu must be locked for writing. // * fs.cachedDentriesLen != 0. func (fs *Filesystem) evictCachedDentryLocked(ctx context.Context) { // Evict the least recently used dentry because cache size is greater than // max cache size (configured on mount). victim := fs.cachedDentries.Back() fs.cachedDentries.Remove(victim) fs.cachedDentriesLen-- victim.cached = false // victim.refs may have become non-zero from an earlier path resolution // after it was inserted into fs.cachedDentries. if atomic.LoadInt64(&victim.refs) == 0 { if !victim.vfsd.IsDead() { victim.parent.dirMu.Lock() // Note that victim can't be a mount point (in any mount // namespace), since VFS holds references on mount points. fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, victim.VFSDentry()) delete(victim.parent.children, victim.name) victim.parent.dirMu.Unlock() } victim.destroyLocked(ctx) } // Whether or not victim was destroyed, we brought fs.cachedDentriesLen // back down to fs.MaxCachedDentries, so we don't loop. } // destroyLocked destroys the dentry. // // Preconditions: // * d.fs.mu must be locked for writing. // * d.refs == 0. // * d should have been removed from d.parent.children, i.e. d is not reachable // by path traversal. // * d.vfsd.IsDead() is true. func (d *Dentry) destroyLocked(ctx context.Context) { refs := atomic.LoadInt64(&d.refs) switch refs { case 0: // Mark the dentry destroyed. atomic.StoreInt64(&d.refs, -1) case -1: panic("dentry.destroyLocked() called on already destroyed dentry") default: panic("dentry.destroyLocked() called with references on the dentry") } d.inode.DecRef(ctx) // IncRef from Init. d.inode = nil if d.parent != nil { d.parent.decRefLocked(ctx) } refsvfs2.Unregister(d) } // RefType implements refsvfs2.CheckedObject.Type. func (d *Dentry) RefType() string { return "kernfs.Dentry" } // LeakMessage implements refsvfs2.CheckedObject.LeakMessage. func (d *Dentry) LeakMessage() string { return fmt.Sprintf("[kernfs.Dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs)) } // LogRefs implements refsvfs2.CheckedObject.LogRefs. // // This should only be set to true for debugging purposes, as it can generate an // extremely large amount of output and drastically degrade performance. func (d *Dentry) LogRefs() bool { return false } // InitRoot initializes this dentry as the root of the filesystem. // // Precondition: Caller must hold a reference on inode. // // Postcondition: Caller's reference on inode is transferred to the dentry. 
func (d *Dentry) InitRoot(fs *Filesystem, inode Inode) {
	d.Init(fs, inode)
	fs.root = d
	// Hold an extra reference on the root dentry. It is held by fs to
	// prevent the root from being "cached" and subsequently evicted.
	d.IncRef()
}

// Init initializes this dentry.
//
// Precondition: Caller must hold a reference on inode.
//
// Postcondition: Caller's reference on inode is transferred to the dentry.
func (d *Dentry) Init(fs *Filesystem, inode Inode) {
	d.vfsd.Init(d)
	d.fs = fs
	d.inode = inode
	atomic.StoreInt64(&d.refs, 1)
	ftype := inode.Mode().FileType()
	if ftype == linux.ModeDirectory {
		d.flags |= dflagsIsDir
	}
	if ftype == linux.ModeSymlink {
		d.flags |= dflagsIsSymlink
	}
	refsvfs2.Register(d)
}

// VFSDentry returns the generic vfs dentry for this kernfs dentry.
func (d *Dentry) VFSDentry() *vfs.Dentry {
	return &d.vfsd
}

// isDir checks whether the dentry points to a directory inode.
func (d *Dentry) isDir() bool {
	return atomic.LoadUint32(&d.flags)&dflagsIsDir != 0
}

// isSymlink checks whether the dentry points to a symlink inode.
func (d *Dentry) isSymlink() bool {
	return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0
}

// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
//
// Although Linux technically supports inotify on pseudo filesystems (inotify
// is implemented at the vfs layer), it is not particularly useful. It is left
// unimplemented until someone actually needs it.
func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {}

// Watches implements vfs.DentryImpl.Watches.
func (d *Dentry) Watches() *vfs.Watches {
	return nil
}

// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
func (d *Dentry) OnZeroWatches(context.Context) {}

// insertChild inserts child into the vfs dentry cache with the given name
// under this dentry. This does not update the directory inode, so calling
// this on its own isn't sufficient to insert a child into a directory.
//
// Preconditions:
// * d must represent a directory inode.
// * d.fs.mu must be locked for at least reading.
func (d *Dentry) insertChild(name string, child *Dentry) {
	d.dirMu.Lock()
	d.insertChildLocked(name, child)
	d.dirMu.Unlock()
}

// insertChildLocked is equivalent to insertChild, with additional
// preconditions.
//
// Preconditions:
// * d must represent a directory inode.
// * d.dirMu must be locked.
// * d.fs.mu must be locked for at least reading.
func (d *Dentry) insertChildLocked(name string, child *Dentry) {
	if !d.isDir() {
		panic(fmt.Sprintf("insertChildLocked called on non-directory Dentry: %+v.", d))
	}
	d.IncRef() // DecRef in child's Dentry.destroyLocked.
	child.parent = d
	child.name = name
	if d.children == nil {
		d.children = make(map[string]*Dentry)
	}
	d.children[name] = child
}

// Inode returns the dentry's inode.
func (d *Dentry) Inode() Inode {
	return d.inode
}

// FSLocalPath returns an absolute path to d, relative to the root of its
// filesystem.
func (d *Dentry) FSLocalPath() string {
	var b fspath.Builder
	_ = genericPrependPath(vfs.VirtualDentry{}, nil, d, &b)
	b.PrependByte('/')
	return b.String()
}

// The Inode interface maps filesystem-level operations that operate on paths
// to equivalent operations on specific filesystem nodes.
//
// The interface methods are grouped into logical categories as sub interfaces
// below. Generally, an implementation for each sub interface can be provided
// by embedding an appropriate type from inode_impl_utils.go. The sub
// interfaces are purely organizational.
// Methods declared directly in the main interface have no generic
// implementations, and should be explicitly provided by the client
// filesystem.
//
// Generally, implementations are not responsible for tasks that are common to
// all filesystems. These include:
//
// - Checking that dentries passed to methods are of the appropriate file type.
// - Checking permissions.
//
// Inode functions may be called holding filesystem wide locks and are not
// allowed to call vfs functions that may reenter, unless otherwise noted.
//
// Specific responsibilities of implementations are documented below.
type Inode interface {
	// Methods related to reference counting. A generic implementation is
	// provided by InodeNoopRefCount. These methods are generally called by
	// the equivalent Dentry methods.
	inodeRefs

	// Methods related to node metadata. A generic implementation is provided
	// by InodeAttrs. Note that a concrete filesystem using kernfs is
	// responsible for managing link counts.
	inodeMetadata

	// Method for inodes that represent symlinks. InodeNotSymlink provides a
	// blanket implementation for all non-symlink inodes.
	inodeSymlink

	// Method for inodes that represent directories. InodeNotDirectory
	// provides a blanket implementation for all non-directory inodes.
	inodeDirectory

	// Open creates a file description for the filesystem object represented
	// by this inode. The returned file description should hold a reference
	// on the dentry for its lifetime.
	//
	// Precondition: rp.Done(). d must be the kernfs Dentry containing the
	// inode on which Open() is being called.
	Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)

	// StatFS returns filesystem statistics for the client filesystem. This
	// corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem
	// doesn't support statfs(2), this should return ENOSYS.
	StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error)

	// Keep indicates whether the dentry created after Inode.Lookup should be
	// kept in the kernfs dentry tree.
	Keep() bool

	// Valid should return true if this inode is still valid; if it returns
	// false, the inode needs to be resolved again by a call to Lookup.
	Valid(ctx context.Context) bool
}

type inodeRefs interface {
	IncRef()
	DecRef(ctx context.Context)
	TryIncRef() bool
}

type inodeMetadata interface {
	// CheckPermissions checks that creds may access this inode for the
	// requested access type, per the rules of
	// fs/namei.c:generic_permission().
	CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error

	// Mode returns the (struct stat)::st_mode value for this inode. This is
	// separated from Stat for performance.
	Mode() linux.FileMode

	// Stat returns the metadata for this inode. This corresponds to
	// vfs.FilesystemImpl.StatAt.
	Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)

	// SetStat updates the metadata for this inode. This corresponds to
	// vfs.FilesystemImpl.SetStatAt. Implementations are responsible for
	// checking if the operation can be performed (see vfs.CheckSetStat() for
	// common checks).
	SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error
}

// Precondition: All methods in this interface may only be called on directory
// inodes.
type inodeDirectory interface {
	// The New{File,Dir,Node,Link,Symlink} methods below should return a new
	// inode that will be hashed into the dentry tree.
	//
	// These inode constructors are inode-level operations rather than
	// filesystem-level operations to allow client filesystems to mix
	// different implementations based on the new node's location in the
	// filesystem.

	// HasChildren returns true if the directory inode has any children.
	HasChildren() bool

	// NewFile creates a new regular file inode.
	NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error)

	// NewDir creates a new directory inode.
	NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error)

	// NewLink creates a new hardlink to a specified inode in this
	// directory. Implementations should create a new kernfs Dentry pointing
	// to target, and update target's link count.
	NewLink(ctx context.Context, name string, target Inode) (Inode, error)

	// NewSymlink creates a new symbolic link inode.
	NewSymlink(ctx context.Context, name, target string) (Inode, error)

	// NewNode creates a new filesystem node for a mknod syscall.
	NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error)

	// Unlink removes a child dentry from this directory inode.
	Unlink(ctx context.Context, name string, child Inode) error

	// RmDir removes an empty child directory from this directory
	// inode. Implementations must update the parent directory's link count,
	// if required. Implementations are not responsible for checking that
	// child is a directory, or for checking that the directory is empty.
	RmDir(ctx context.Context, name string, child Inode) error

	// Rename is called on the source directory containing an inode being
	// renamed. child should point to the resolved child in the source
	// directory.
	//
	// Precondition: Caller must serialize concurrent calls to Rename.
	Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error

	// Lookup should return an appropriate inode if name should resolve to a
	// child of this directory inode. This gives the directory an opportunity
	// on every lookup to resolve additional entries. This is only called
	// when the inode is a directory.
	//
	// The child returned by Lookup will be hashed into the VFS dentry tree,
	// at least for the duration of the current FS operation.
	//
	// Lookup must return the child with an extra reference whose ownership
	// is transferred to the dentry that is created to point to that inode.
	// If Inode.Keep returns false, that new dentry will be dropped at the
	// end of the current filesystem operation (before returning back to the
	// VFS layer) if no other reference is taken on that dentry. If
	// Inode.Keep returns true, then the dentry will be cached into the
	// dentry tree until it is Unlink'd or RmDir'd.
	Lookup(ctx context.Context, name string) (Inode, error)

	// IterDirents is used to iterate over dynamically created entries. It
	// invokes callback on each entry in the directory represented by the
	// Inode. 'offset' is the offset for the entire IterDirents call, which
	// may include results from the caller (e.g. "." and ".."). 'relOffset'
	// is the offset inside the entries returned by this IterDirents
	// invocation. In other words, 'offset' should be used to calculate each
	// vfs.Dirent.NextOff as well as the return value, while 'relOffset' is
	// the place to start iteration.
	IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
}

type inodeSymlink interface {
	// Readlink returns the target of a symbolic link. If an inode is not a
	// symlink, the implementation should return EINVAL.
// // Readlink is called with no kernfs locks held, so it may reenter if needed // to resolve symlink targets. Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) // Getlink returns the target of a symbolic link, as used by path // resolution: // // - If the inode is a "magic link" (a link whose target is most accurately // represented as a VirtualDentry), Getlink returns (ok VirtualDentry, "", // nil). A reference is taken on the returned VirtualDentry. // // - If the inode is an ordinary symlink, Getlink returns (zero-value // VirtualDentry, symlink target, nil). // // - If the inode is not a symlink, Getlink returns (zero-value // VirtualDentry, "", EINVAL). Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) }
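// The following standalone sketch is illustrative only (it is not part of
// kernfs; the refcounted type and main function below are hypothetical). It
// shows the reference-counting pattern used by Dentry above: a count of -1
// marks a destroyed object, and TryIncRef uses a CAS loop so a zero or
// destroyed count is never resurrected. It is simplified in that DecRef
// destroys at zero instead of inserting into an LRU cache.
package main

import (
	"fmt"
	"sync/atomic"
)

type refcounted struct {
	refs int64 // Accessed atomically; -1 means destroyed.
}

func (r *refcounted) IncRef() {
	if atomic.AddInt64(&r.refs, 1) <= 1 {
		panic("IncRef on released object")
	}
}

// TryIncRef mirrors Dentry.TryIncRef: it only takes a reference if the count
// is still positive, retrying on CAS contention.
func (r *refcounted) TryIncRef() bool {
	for {
		refs := atomic.LoadInt64(&r.refs)
		if refs <= 0 {
			return false // Zero or destroyed: do not resurrect.
		}
		if atomic.CompareAndSwapInt64(&r.refs, refs, refs+1) {
			return true
		}
	}
}

func (r *refcounted) DecRef() {
	switch refs := atomic.AddInt64(&r.refs, -1); {
	case refs == 0:
		atomic.StoreInt64(&r.refs, -1) // Destroy (kernfs would cache instead).
	case refs < 0:
		panic("DecRef without a held reference")
	}
}

func main() {
	r := &refcounted{refs: 1}
	fmt.Println(r.TryIncRef()) // true: 1 -> 2
	r.DecRef()                 // 2 -> 1
	r.DecRef()                 // 1 -> 0, then marked destroyed (-1)
	fmt.Println(r.TryIncRef()) // false: object is destroyed
}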
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package gofer

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/fd"
	"gvisor.dev/gvisor/pkg/p9"
	"gvisor.dev/gvisor/pkg/syserror"
)

// p9file is a wrapper around p9.File that provides methods that are
// Context-aware.
type p9file struct {
	file p9.File
}

func (f p9file) isNil() bool {
	return f.file == nil
}

func (f p9file) walk(ctx context.Context, names []string) ([]p9.QID, p9file, error) {
	ctx.UninterruptibleSleepStart(false)
	qids, newfile, err := f.file.Walk(names)
	ctx.UninterruptibleSleepFinish(false)
	return qids, p9file{newfile}, err
}

func (f p9file) walkGetAttr(ctx context.Context, names []string) ([]p9.QID, p9file, p9.AttrMask, p9.Attr, error) {
	ctx.UninterruptibleSleepStart(false)
	qids, newfile, attrMask, attr, err := f.file.WalkGetAttr(names)
	ctx.UninterruptibleSleepFinish(false)
	return qids, p9file{newfile}, attrMask, attr, err
}

// walkGetAttrOne is a wrapper around p9.File.WalkGetAttr that takes a single
// path component and returns a single qid.
func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file, p9.AttrMask, p9.Attr, error) { ctx.UninterruptibleSleepStart(false) qids, newfile, attrMask, attr, err := f.file.WalkGetAttr([]string{name}) ctx.UninterruptibleSleepFinish(false) if err != nil { return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, err } if len(qids) != 1 { ctx.Warningf("p9.File.WalkGetAttr returned %d qids (%v), wanted 1", len(qids), qids) if newfile != nil { p9file{newfile}.close(ctx) } return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO } return qids[0], p9file{newfile}, attrMask, attr, nil } func (f p9file) statFS(ctx context.Context) (p9.FSStat, error) { ctx.UninterruptibleSleepStart(false) fsstat, err := f.file.StatFS() ctx.UninterruptibleSleepFinish(false) return fsstat, err } func (f p9file) getAttr(ctx context.Context, req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) { ctx.UninterruptibleSleepStart(false) qid, attrMask, attr, err := f.file.GetAttr(req) ctx.UninterruptibleSleepFinish(false) return qid, attrMask, attr, err } func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error { ctx.UninterruptibleSleepStart(false) err := f.file.SetAttr(valid, attr) ctx.UninterruptibleSleepFinish(false) return err } func (f p9file) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) { ctx.UninterruptibleSleepStart(false) xattrs, err := f.file.ListXattr(size) ctx.UninterruptibleSleepFinish(false) return xattrs, err } func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) { ctx.UninterruptibleSleepStart(false) val, err := f.file.GetXattr(name, size) ctx.UninterruptibleSleepFinish(false) return val, err } func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) error { ctx.UninterruptibleSleepStart(false) err := f.file.SetXattr(name, value, flags) ctx.UninterruptibleSleepFinish(false) return err } func (f p9file) removeXattr(ctx context.Context, name string) error { ctx.UninterruptibleSleepStart(false) err := f.file.RemoveXattr(name) ctx.UninterruptibleSleepFinish(false) return err } func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error { ctx.UninterruptibleSleepStart(false) err := f.file.Allocate(mode, offset, length) ctx.UninterruptibleSleepFinish(false) return err } func (f p9file) close(ctx context.Context) error { ctx.UninterruptibleSleepStart(false) err := f.file.Close() ctx.UninterruptibleSleepFinish(false) return err } func (f p9file) setAttrClose(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error { ctx.UninterruptibleSleepStart(false) err := f.file.SetAttrClose(valid, attr) ctx.UninterruptibleSleepFinish(false) return err } func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) { ctx.UninterruptibleSleepStart(false) fdobj, qid, iounit, err := f.file.Open(flags) ctx.UninterruptibleSleepFinish(false) return fdobj, qid, iounit, err } func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (int, error) { ctx.UninterruptibleSleepStart(false) n, err := f.file.ReadAt(p, offset) ctx.UninterruptibleSleepFinish(false) return n, err } func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) { ctx.UninterruptibleSleepStart(false) n, err := f.file.WriteAt(p, offset) ctx.UninterruptibleSleepFinish(false) return n, err } func (f p9file) fsync(ctx context.Context) error { ctx.UninterruptibleSleepStart(false) err := f.file.FSync() 
ctx.UninterruptibleSleepFinish(false) return err } func (f p9file) create(ctx context.Context, name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9file, p9.QID, uint32, error) { ctx.UninterruptibleSleepStart(false) fdobj, newfile, qid, iounit, err := f.file.Create(name, flags, permissions, uid, gid) ctx.UninterruptibleSleepFinish(false) return fdobj, p9file{newfile}, qid, iounit, err } func (f p9file) mkdir(ctx context.Context, name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) { ctx.UninterruptibleSleepStart(false) qid, err := f.file.Mkdir(name, permissions, uid, gid) ctx.UninterruptibleSleepFinish(false) return qid, err } func (f p9file) symlink(ctx context.Context, oldName string, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) { ctx.UninterruptibleSleepStart(false) qid, err := f.file.Symlink(oldName, newName, uid, gid) ctx.UninterruptibleSleepFinish(false) return qid, err } func (f p9file) link(ctx context.Context, target p9file, newName string) error { ctx.UninterruptibleSleepStart(false) err := f.file.Link(target.file, newName) ctx.UninterruptibleSleepFinish(false) return err } func (f p9file) mknod(ctx context.Context, name string, mode p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) { ctx.UninterruptibleSleepStart(false) qid, err := f.file.Mknod(name, mode, major, minor, uid, gid) ctx.UninterruptibleSleepFinish(false) return qid, err } func (f p9file) rename(ctx context.Context, newDir p9file, newName string) error { ctx.UninterruptibleSleepStart(false) err := f.file.Rename(newDir.file, newName) ctx.UninterruptibleSleepFinish(false) return err } func (f p9file) unlinkAt(ctx context.Context, name string, flags uint32) error { ctx.UninterruptibleSleepStart(false) err := f.file.UnlinkAt(name, flags) ctx.UninterruptibleSleepFinish(false) return err } func (f p9file) readdir(ctx context.Context, offset uint64, count uint32) ([]p9.Dirent, error) { ctx.UninterruptibleSleepStart(false) dirents, err := f.file.Readdir(offset, count) ctx.UninterruptibleSleepFinish(false) return dirents, err } func (f p9file) readlink(ctx context.Context) (string, error) { ctx.UninterruptibleSleepStart(false) target, err := f.file.Readlink() ctx.UninterruptibleSleepFinish(false) return target, err } func (f p9file) flush(ctx context.Context) error { ctx.UninterruptibleSleepStart(false) err := f.file.Flush() ctx.UninterruptibleSleepFinish(false) return err } func (f p9file) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, error) { ctx.UninterruptibleSleepStart(false) fdobj, err := f.file.Connect(flags) ctx.UninterruptibleSleepFinish(false) return fdobj, err } func (f p9file) multiGetAttr(ctx context.Context, names []string) ([]p9.FullStat, error) { ctx.UninterruptibleSleepStart(false) stats, err := f.file.MultiGetAttr(names) ctx.UninterruptibleSleepFinish(false) return stats, err }
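// Standalone sketch, illustrative only (sleeper, call, and logSleeper are
// hypothetical stand-ins for context.Context and a blocking p9.File RPC).
// Every wrapper above follows the same mechanical shape shown here: bracket
// the potentially-blocking 9P round trip with
// UninterruptibleSleepStart/Finish so the sentry can account for task time
// spent waiting on the gofer.
package main

import "fmt"

type sleeper interface {
	UninterruptibleSleepStart(deactivate bool)
	UninterruptibleSleepFinish(activate bool)
}

// call brackets an arbitrary blocking operation, mirroring the shape of the
// p9file wrappers above.
func call(s sleeper, doRPC func() error) error {
	s.UninterruptibleSleepStart(false)
	err := doRPC()
	s.UninterruptibleSleepFinish(false)
	return err
}

type logSleeper struct{}

func (logSleeper) UninterruptibleSleepStart(bool)  { fmt.Println("sleep start") }
func (logSleeper) UninterruptibleSleepFinish(bool) { fmt.Println("sleep finish") }

func main() {
	_ = call(logSleeper{}, func() error {
		fmt.Println("blocking RPC would run here")
		return nil
	})
}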
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package memdev

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/usermem"
)

const zeroDevMinor = 5

// zeroDevice implements vfs.Device for /dev/zero.
//
// +stateify savable
type zeroDevice struct{}

// Open implements vfs.Device.Open.
func (zeroDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	fd := &zeroFD{}
	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
		UseDentryMetadata: true,
	}); err != nil {
		return nil, err
	}
	return &fd.vfsfd, nil
}

// zeroFD implements vfs.FileDescriptionImpl for /dev/zero.
//
// +stateify savable
type zeroFD struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.NoLockFD
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *zeroFD) Release(context.Context) {
	// noop
}

// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *zeroFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	return dst.ZeroOut(ctx, dst.NumBytes())
}

// Read implements vfs.FileDescriptionImpl.Read.
func (fd *zeroFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	return dst.ZeroOut(ctx, dst.NumBytes())
}

// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *zeroFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	return src.NumBytes(), nil
}

// Write implements vfs.FileDescriptionImpl.Write.
func (fd *zeroFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	return src.NumBytes(), nil
}

// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *zeroFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	return 0, nil
}

// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *zeroFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
	if opts.Private || !opts.MaxPerms.Write {
		// This mapping will never permit writing to the "underlying file" (in
		// Linux terms, it isn't VM_SHARED), so implement it as an anonymous
		// mapping, but back it with fd; this is what Linux does, and is
		// actually application-visible because the resulting VMA will show up
		// in /proc/[pid]/maps with fd.vfsfd.VirtualDentry()'s path rather
		// than "/dev/zero (deleted)".
opts.Offset = 0 opts.MappingIdentity = &fd.vfsfd opts.SentryOwnedContent = true opts.MappingIdentity.IncRef() return nil } tmpfsFD, err := tmpfs.NewZeroFile(ctx, auth.CredentialsFromContext(ctx), kernel.KernelFromContext(ctx).ShmMount(), opts.Length) if err != nil { return err } defer tmpfsFD.DecRef(ctx) return tmpfsFD.ConfigureMMap(ctx, opts) }
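// Standalone sketch, illustrative only (zeroFile is a hypothetical stand-in
// using the standard library's io conventions). Outside the sentry, the
// observable /dev/zero behavior implemented by zeroFD above reduces to:
// reads yield zero bytes, and writes are discarded but report full success.
package main

import "fmt"

type zeroFile struct{}

// Read fills p with zero bytes, analogous to zeroFD.Read's dst.ZeroOut.
func (zeroFile) Read(p []byte) (int, error) {
	for i := range p {
		p[i] = 0
	}
	return len(p), nil
}

// Write discards p but claims success, analogous to zeroFD.Write returning
// src.NumBytes().
func (zeroFile) Write(p []byte) (int, error) {
	return len(p), nil
}

func main() {
	var f zeroFile
	buf := []byte{1, 2, 3}
	n, _ := f.Read(buf)
	fmt.Println(n, buf) // 3 [0 0 0]
	n, _ = f.Write([]byte("ignored"))
	fmt.Println(n) // 7
}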
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build 386 amd64

package cpuid

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"strconv"
	"strings"

	"gvisor.dev/gvisor/pkg/log"
)

// Common references for CPUID leaves and bits:
//
// Intel:
//   * Intel SDM Volume 2, Chapter 3.2 "CPUID" (more up-to-date)
//   * Intel Application Note 485 (more detailed)
//
// AMD:
//   * AMD64 APM Volume 3, Appendix 3 "Obtaining Processor Information ..."

// block is a collection of 32 Feature bits.
type block int

const blockSize = 32

// Feature bits are numbered according to "blocks". Each block is 32 bits, and
// feature bits from the same source (cpuid leaf/level) are in the same block.
func featureID(b block, bit int) Feature {
	return Feature(32*int(b) + bit)
}

// Block 0 constants are all of the "basic" feature bits returned by a cpuid
// in ecx with eax=1.
const (
	X86FeatureSSE3 Feature = iota
	X86FeaturePCLMULDQ
	X86FeatureDTES64
	X86FeatureMONITOR
	X86FeatureDSCPL
	X86FeatureVMX
	X86FeatureSMX
	X86FeatureEST
	X86FeatureTM2
	X86FeatureSSSE3 // Not a typo, "supplemental" SSE3.
	X86FeatureCNXTID
	X86FeatureSDBG
	X86FeatureFMA
	X86FeatureCX16
	X86FeatureXTPR
	X86FeaturePDCM
	_ // ecx bit 16 is reserved.
	X86FeaturePCID
	X86FeatureDCA
	X86FeatureSSE4_1
	X86FeatureSSE4_2
	X86FeatureX2APIC
	X86FeatureMOVBE
	X86FeaturePOPCNT
	X86FeatureTSCD
	X86FeatureAES
	X86FeatureXSAVE
	X86FeatureOSXSAVE
	X86FeatureAVX
	X86FeatureF16C
	X86FeatureRDRAND
	_ // ecx bit 31 is reserved.
)

// Block 1 constants are all of the "basic" feature bits returned by a cpuid
// in edx with eax=1.
const (
	X86FeatureFPU Feature = 32 + iota
	X86FeatureVME
	X86FeatureDE
	X86FeaturePSE
	X86FeatureTSC
	X86FeatureMSR
	X86FeaturePAE
	X86FeatureMCE
	X86FeatureCX8
	X86FeatureAPIC
	_ // edx bit 10 is reserved.
	X86FeatureSEP
	X86FeatureMTRR
	X86FeaturePGE
	X86FeatureMCA
	X86FeatureCMOV
	X86FeaturePAT
	X86FeaturePSE36
	X86FeaturePSN
	X86FeatureCLFSH
	_ // edx bit 20 is reserved.
X86FeatureDS X86FeatureACPI X86FeatureMMX X86FeatureFXSR X86FeatureSSE X86FeatureSSE2 X86FeatureSS X86FeatureHTT X86FeatureTM X86FeatureIA64 X86FeaturePBE ) // Block 2 bits are the "structured extended" features returned in ebx for // eax=7, ecx=0. const ( X86FeatureFSGSBase Feature = 2*32 + iota X86FeatureTSC_ADJUST _ // ebx bit 2 is reserved. X86FeatureBMI1 X86FeatureHLE X86FeatureAVX2 X86FeatureFDP_EXCPTN_ONLY X86FeatureSMEP X86FeatureBMI2 X86FeatureERMS X86FeatureINVPCID X86FeatureRTM X86FeatureCQM X86FeatureFPCSDS X86FeatureMPX X86FeatureRDT X86FeatureAVX512F X86FeatureAVX512DQ X86FeatureRDSEED X86FeatureADX X86FeatureSMAP X86FeatureAVX512IFMA X86FeaturePCOMMIT X86FeatureCLFLUSHOPT X86FeatureCLWB X86FeatureIPT // Intel processor trace. X86FeatureAVX512PF X86FeatureAVX512ER X86FeatureAVX512CD X86FeatureSHA X86FeatureAVX512BW X86FeatureAVX512VL ) // Block 3 bits are the "extended" features returned in ecx for eax=7, ecx=0. const ( X86FeaturePREFETCHWT1 Feature = 3*32 + iota X86FeatureAVX512VBMI X86FeatureUMIP X86FeaturePKU X86FeatureOSPKE X86FeatureWAITPKG X86FeatureAVX512_VBMI2 _ // ecx bit 7 is reserved X86FeatureGFNI X86FeatureVAES X86FeatureVPCLMULQDQ X86FeatureAVX512_VNNI X86FeatureAVX512_BITALG X86FeatureTME X86FeatureAVX512_VPOPCNTDQ _ // ecx bit 15 is reserved X86FeatureLA57 // ecx bits 17-21 are reserved _ _ _ _ _ X86FeatureRDPID // ecx bits 23-24 are reserved _ _ X86FeatureCLDEMOTE _ // ecx bit 26 is reserved X86FeatureMOVDIRI X86FeatureMOVDIR64B ) // Block 4 constants are for xsave capabilities in CPUID.(EAX=0DH,ECX=01H):EAX. // The CPUID leaf is available only if 'X86FeatureXSAVE' is present. const ( X86FeatureXSAVEOPT Feature = 4*32 + iota X86FeatureXSAVEC X86FeatureXGETBV1 X86FeatureXSAVES // EAX[31:4] are reserved. ) // Block 5 constants are the extended feature bits in // CPUID.(EAX=0x80000001):ECX. const ( X86FeatureLAHF64 Feature = 5*32 + iota X86FeatureCMP_LEGACY X86FeatureSVM X86FeatureEXTAPIC X86FeatureCR8_LEGACY X86FeatureLZCNT X86FeatureSSE4A X86FeatureMISALIGNSSE X86FeaturePREFETCHW X86FeatureOSVW X86FeatureIBS X86FeatureXOP X86FeatureSKINIT X86FeatureWDT _ // ecx bit 14 is reserved. X86FeatureLWP X86FeatureFMA4 X86FeatureTCE _ // ecx bit 18 is reserved. _ // ecx bit 19 is reserved. _ // ecx bit 20 is reserved. X86FeatureTBM X86FeatureTOPOLOGY X86FeaturePERFCTR_CORE X86FeaturePERFCTR_NB _ // ecx bit 25 is reserved. X86FeatureBPEXT X86FeaturePERFCTR_TSC X86FeaturePERFCTR_LLC X86FeatureMWAITX // TODO(b/152776797): Some CPUs set this but it is not documented anywhere. X86FeatureBlock5Bit30 _ // ecx bit 31 is reserved. ) // Block 6 constants are the extended feature bits in // CPUID.(EAX=0x80000001):EDX. // // These are sparse, and so the bit positions are assigned manually. const ( // On AMD, EDX[24:23] | EDX[17:12] | EDX[9:0] are duplicate features // also defined in block 1 (in identical bit positions). Those features // are not listed here. block6DuplicateMask = 0x183f3ff X86FeatureSYSCALL Feature = 6*32 + 11 X86FeatureNX Feature = 6*32 + 20 X86FeatureMMXEXT Feature = 6*32 + 22 X86FeatureFXSR_OPT Feature = 6*32 + 25 X86FeatureGBPAGES Feature = 6*32 + 26 X86FeatureRDTSCP Feature = 6*32 + 27 X86FeatureLM Feature = 6*32 + 29 X86Feature3DNOWEXT Feature = 6*32 + 30 X86Feature3DNOW Feature = 6*32 + 31 ) // linuxBlockOrder defines the order in which linux organizes the feature // blocks. 
Linux also tracks feature bits in 32-bit blocks, but in an order // which doesn't match well here, so for the /proc/cpuinfo generation we simply // re-map the blocks to Linux's ordering and then go through the bits in each // block. var linuxBlockOrder = []block{1, 6, 0, 5, 2, 4, 3} // To make emulation of /proc/cpuinfo easy, these names match the names of the // basic features in Linux defined in arch/x86/kernel/cpu/capflags.c. var x86FeatureStrings = map[Feature]string{ // Block 0. X86FeatureSSE3: "pni", X86FeaturePCLMULDQ: "pclmulqdq", X86FeatureDTES64: "dtes64", X86FeatureMONITOR: "monitor", X86FeatureDSCPL: "ds_cpl", X86FeatureVMX: "vmx", X86FeatureSMX: "smx", X86FeatureEST: "est", X86FeatureTM2: "tm2", X86FeatureSSSE3: "ssse3", X86FeatureCNXTID: "cid", X86FeatureSDBG: "sdbg", X86FeatureFMA: "fma", X86FeatureCX16: "cx16", X86FeatureXTPR: "xtpr", X86FeaturePDCM: "pdcm", X86FeaturePCID: "pcid", X86FeatureDCA: "dca", X86FeatureSSE4_1: "sse4_1", X86FeatureSSE4_2: "sse4_2", X86FeatureX2APIC: "x2apic", X86FeatureMOVBE: "movbe", X86FeaturePOPCNT: "popcnt", X86FeatureTSCD: "tsc_deadline_timer", X86FeatureAES: "aes", X86FeatureXSAVE: "xsave", X86FeatureAVX: "avx", X86FeatureF16C: "f16c", X86FeatureRDRAND: "rdrand", // Block 1. X86FeatureFPU: "fpu", X86FeatureVME: "vme", X86FeatureDE: "de", X86FeaturePSE: "pse", X86FeatureTSC: "tsc", X86FeatureMSR: "msr", X86FeaturePAE: "pae", X86FeatureMCE: "mce", X86FeatureCX8: "cx8", X86FeatureAPIC: "apic", X86FeatureSEP: "sep", X86FeatureMTRR: "mtrr", X86FeaturePGE: "pge", X86FeatureMCA: "mca", X86FeatureCMOV: "cmov", X86FeaturePAT: "pat", X86FeaturePSE36: "pse36", X86FeaturePSN: "pn", X86FeatureCLFSH: "clflush", X86FeatureDS: "dts", X86FeatureACPI: "acpi", X86FeatureMMX: "mmx", X86FeatureFXSR: "fxsr", X86FeatureSSE: "sse", X86FeatureSSE2: "sse2", X86FeatureSS: "ss", X86FeatureHTT: "ht", X86FeatureTM: "tm", X86FeatureIA64: "ia64", X86FeaturePBE: "pbe", // Block 2. X86FeatureFSGSBase: "fsgsbase", X86FeatureTSC_ADJUST: "tsc_adjust", X86FeatureBMI1: "bmi1", X86FeatureHLE: "hle", X86FeatureAVX2: "avx2", X86FeatureSMEP: "smep", X86FeatureBMI2: "bmi2", X86FeatureERMS: "erms", X86FeatureINVPCID: "invpcid", X86FeatureRTM: "rtm", X86FeatureCQM: "cqm", X86FeatureMPX: "mpx", X86FeatureRDT: "rdt_a", X86FeatureAVX512F: "avx512f", X86FeatureAVX512DQ: "avx512dq", X86FeatureRDSEED: "rdseed", X86FeatureADX: "adx", X86FeatureSMAP: "smap", X86FeatureCLWB: "clwb", X86FeatureAVX512PF: "avx512pf", X86FeatureAVX512ER: "avx512er", X86FeatureAVX512CD: "avx512cd", X86FeatureSHA: "sha_ni", X86FeatureAVX512BW: "avx512bw", X86FeatureAVX512VL: "avx512vl", // Block 3. X86FeatureAVX512VBMI: "avx512vbmi", X86FeatureUMIP: "umip", X86FeaturePKU: "pku", X86FeatureOSPKE: "ospke", X86FeatureWAITPKG: "waitpkg", X86FeatureAVX512_VBMI2: "avx512_vbmi2", X86FeatureGFNI: "gfni", X86FeatureVAES: "vaes", X86FeatureVPCLMULQDQ: "vpclmulqdq", X86FeatureAVX512_VNNI: "avx512_vnni", X86FeatureAVX512_BITALG: "avx512_bitalg", X86FeatureTME: "tme", X86FeatureAVX512_VPOPCNTDQ: "avx512_vpopcntdq", X86FeatureLA57: "la57", X86FeatureRDPID: "rdpid", X86FeatureCLDEMOTE: "cldemote", X86FeatureMOVDIRI: "movdiri", X86FeatureMOVDIR64B: "movdir64b", // Block 4. X86FeatureXSAVEOPT: "xsaveopt", X86FeatureXSAVEC: "xsavec", X86FeatureXGETBV1: "xgetbv1", X86FeatureXSAVES: "xsaves", // Block 5. 
	X86FeatureLAHF64:       "lahf_lm", // LAHF/SAHF in long mode.
	X86FeatureCMP_LEGACY:   "cmp_legacy",
	X86FeatureSVM:          "svm",
	X86FeatureEXTAPIC:      "extapic",
	X86FeatureCR8_LEGACY:   "cr8_legacy",
	X86FeatureLZCNT:        "abm", // Advanced bit manipulation.
	X86FeatureSSE4A:        "sse4a",
	X86FeatureMISALIGNSSE:  "misalignsse",
	X86FeaturePREFETCHW:    "3dnowprefetch",
	X86FeatureOSVW:         "osvw",
	X86FeatureIBS:          "ibs",
	X86FeatureXOP:          "xop",
	X86FeatureSKINIT:       "skinit",
	X86FeatureWDT:          "wdt",
	X86FeatureLWP:          "lwp",
	X86FeatureFMA4:         "fma4",
	X86FeatureTCE:          "tce",
	X86FeatureTBM:          "tbm",
	X86FeatureTOPOLOGY:     "topoext",
	X86FeaturePERFCTR_CORE: "perfctr_core",
	X86FeaturePERFCTR_NB:   "perfctr_nb",
	X86FeatureBPEXT:        "bpext",
	X86FeaturePERFCTR_TSC:  "ptsc",
	X86FeaturePERFCTR_LLC:  "perfctr_llc",
	X86FeatureMWAITX:       "mwaitx",

	// Block 6.
	X86FeatureSYSCALL:  "syscall",
	X86FeatureNX:       "nx",
	X86FeatureMMXEXT:   "mmxext",
	X86FeatureFXSR_OPT: "fxsr_opt",
	X86FeatureGBPAGES:  "pdpe1gb",
	X86FeatureRDTSCP:   "rdtscp",
	X86FeatureLM:       "lm",
	X86Feature3DNOWEXT: "3dnowext",
	X86Feature3DNOW:    "3dnow",
}

// These flags are parse-only: they can be used for setting/unsetting the
// flags, but will not get printed out in /proc/cpuinfo.
var x86FeatureParseOnlyStrings = map[Feature]string{
	// Block 0.
	X86FeatureOSXSAVE: "osxsave",

	// Block 2.
	X86FeatureFDP_EXCPTN_ONLY: "fdp_excptn_only",
	X86FeatureFPCSDS:          "fpcsds",
	X86FeatureIPT:             "pt",
	X86FeatureCLFLUSHOPT:      "clflushopt",

	// Block 3.
	X86FeaturePREFETCHWT1: "prefetchwt1",

	// Block 5.
	X86FeatureBlock5Bit30: "block5_bit30",
}

// intelCacheDescriptor describes a cache or TLB on the system. Descriptors
// are returned in the registers for eax=2. Intel only.
type intelCacheDescriptor uint8

// Valid cache/TLB descriptors. All descriptors can be found in Intel SDM Vol.
// 2, Ch. 3.2, "CPUID", Table 3-12 "Encoding of CPUID Leaf 2 Descriptors".
const (
	intelNullDescriptor    intelCacheDescriptor = 0
	intelNoTLBDescriptor   intelCacheDescriptor = 0xfe
	intelNoCacheDescriptor intelCacheDescriptor = 0xff

	// Most descriptors omitted for brevity as they are currently unused.
)

// CacheType describes the type of a cache, as returned in eax[4:0] for eax=4.
type CacheType uint8

const (
	// cacheNull indicates that there are no more entries.
	cacheNull CacheType = iota

	// CacheData is a data cache.
	CacheData

	// CacheInstruction is an instruction cache.
	CacheInstruction

	// CacheUnified is a unified instruction and data cache.
	CacheUnified
)

// Cache describes the parameters of a single cache on the system.
//
// +stateify savable
type Cache struct {
	// Level is the hierarchical level of this cache (L1, L2, etc).
	Level uint32

	// Type is the type of cache.
	Type CacheType

	// FullyAssociative indicates that entries may be placed in any block.
	FullyAssociative bool

	// Partitions is the number of physical partitions in the cache.
	Partitions uint32

	// Ways is the number of ways of associativity in the cache.
	Ways uint32

	// Sets is the number of sets in the cache.
	Sets uint32

	// InvalidateHierarchical indicates that WBINVD/INVD from threads
	// sharing this cache acts upon lower level caches for threads sharing
	// this cache.
	InvalidateHierarchical bool

	// Inclusive indicates that this cache is inclusive of lower cache
	// levels.
	Inclusive bool

	// DirectMapped indicates that this cache is directly mapped from
	// address, rather than using a hash function.
	DirectMapped bool
}

// cpuidFunction wraps cpuid function numbers.
type cpuidFunction uint32

// The constants below are the lower or "standard" cpuid functions, ordered
// as defined by the hardware.
const ( vendorID cpuidFunction = iota // Returns vendor ID and largest standard function. featureInfo // Returns basic feature bits and processor signature. intelCacheDescriptors // Returns list of cache descriptors. Intel only. intelSerialNumber // Returns processor serial number (obsolete on new hardware). Intel only. intelDeterministicCacheParams // Returns deterministic cache information. Intel only. monitorMwaitParams // Returns information about monitor/mwait instructions. powerParams // Returns information about power management and thermal sensors. extendedFeatureInfo // Returns extended feature bits. _ // Function 0x8 is reserved. intelDCAParams // Returns direct cache access information. Intel only. intelPMCInfo // Returns information about performance monitoring features. Intel only. intelX2APICInfo // Returns core/logical processor topology. Intel only. _ // Function 0xc is reserved. xSaveInfo // Returns information about extended state management. ) // The "extended" functions start at 0x80000000. const ( extendedFunctionInfo cpuidFunction = 0x80000000 + iota // Returns highest available extended function in eax. extendedFeatures // Returns some extended feature bits in edx and ecx. ) // These are the extended floating point state features. They are used to // enumerate floating point features in XCR0, XSTATE_BV, etc. const ( XSAVEFeatureX87 = 1 << 0 XSAVEFeatureSSE = 1 << 1 XSAVEFeatureAVX = 1 << 2 XSAVEFeatureBNDREGS = 1 << 3 XSAVEFeatureBNDCSR = 1 << 4 XSAVEFeatureAVX512op = 1 << 5 XSAVEFeatureAVX512zmm0 = 1 << 6 XSAVEFeatureAVX512zmm16 = 1 << 7 XSAVEFeaturePKRU = 1 << 9 ) var cpuFreqMHz float64 // x86FeaturesFromString includes features from x86FeatureStrings and // x86FeatureParseOnlyStrings. var x86FeaturesFromString = make(map[string]Feature) // FeatureFromString returns the Feature associated with the given feature // string plus a bool to indicate if it could find the feature. func FeatureFromString(s string) (Feature, bool) { f, b := x86FeaturesFromString[s] return f, b } // String implements fmt.Stringer. func (f Feature) String() string { if s := f.flagString(false); s != "" { return s } block := int(f) / 32 bit := int(f) % 32 return fmt.Sprintf("<cpuflag %d; block %d bit %d>", f, block, bit) } func (f Feature) flagString(cpuinfoOnly bool) string { if s, ok := x86FeatureStrings[f]; ok { return s } if !cpuinfoOnly { return x86FeatureParseOnlyStrings[f] } return "" } // FeatureSet is a set of Features for a CPU. // // +stateify savable type FeatureSet struct { // Set is the set of features that are enabled in this FeatureSet. Set map[Feature]bool // VendorID is the 12-char string returned in ebx:edx:ecx for eax=0. VendorID string // ExtendedFamily is part of the processor signature. ExtendedFamily uint8 // ExtendedModel is part of the processor signature. ExtendedModel uint8 // ProcessorType is part of the processor signature. ProcessorType uint8 // Family is part of the processor signature. Family uint8 // Model is part of the processor signature. Model uint8 // SteppingID is part of the processor signature. SteppingID uint8 // Caches describes the caches on the CPU. Caches []Cache // CacheLine is the size of a cache line in bytes. // // All caches use the same line size. This is not enforced in the CPUID // encoding, but is true on all known x86 processors. CacheLine uint32 } // FlagsString prints out supported CPU flags. If cpuinfoOnly is true, it is // equivalent to the "flags" field in /proc/cpuinfo. 
func (fs *FeatureSet) FlagsString(cpuinfoOnly bool) string {
	var s []string
	for _, b := range linuxBlockOrder {
		for i := 0; i < blockSize; i++ {
			if f := featureID(b, i); fs.Set[f] {
				if fstr := f.flagString(cpuinfoOnly); fstr != "" {
					s = append(s, fstr)
				}
			}
		}
	}
	return strings.Join(s, " ")
}

// WriteCPUInfoTo generates a section for one CPU in /proc/cpuinfo. This is a
// minimal /proc/cpuinfo; it is missing some fields, like "microcode", that
// are not always printed in Linux. The bogomips field is simply made up.
func (fs FeatureSet) WriteCPUInfoTo(cpu uint, b *bytes.Buffer) {
	fmt.Fprintf(b, "processor\t: %d\n", cpu)
	fmt.Fprintf(b, "vendor_id\t: %s\n", fs.VendorID)
	fmt.Fprintf(b, "cpu family\t: %d\n", ((fs.ExtendedFamily<<4)&0xff)|fs.Family)
	fmt.Fprintf(b, "model\t\t: %d\n", ((fs.ExtendedModel<<4)&0xff)|fs.Model)
	fmt.Fprintf(b, "model name\t: %s\n", "unknown") // Unknown for now.
	fmt.Fprintf(b, "stepping\t: %s\n", "unknown")   // Unknown for now.
	fmt.Fprintf(b, "cpu MHz\t\t: %.3f\n", cpuFreqMHz)
	fmt.Fprintln(b, "fpu\t\t: yes")
	fmt.Fprintln(b, "fpu_exception\t: yes")
	fmt.Fprintf(b, "cpuid level\t: %d\n", uint32(xSaveInfo)) // Same as ax in vendorID.
	fmt.Fprintln(b, "wp\t\t: yes")
	fmt.Fprintf(b, "flags\t\t: %s\n", fs.FlagsString(true))
	fmt.Fprintf(b, "bogomips\t: %.02f\n", cpuFreqMHz) // It's bogus anyway.
	fmt.Fprintf(b, "clflush size\t: %d\n", fs.CacheLine)
	fmt.Fprintf(b, "cache_alignment\t: %d\n", fs.CacheLine)
	fmt.Fprintf(b, "address sizes\t: %d bits physical, %d bits virtual\n", 46, 48)
	fmt.Fprintln(b, "power management:") // This is always here, but can be blank.
	fmt.Fprintln(b, "")                  // The /proc/cpuinfo file ends with an extra newline.
}

const (
	amdVendorID   = "AuthenticAMD"
	intelVendorID = "GenuineIntel"
)

// AMD returns true if fs describes an AMD CPU.
func (fs *FeatureSet) AMD() bool {
	return fs.VendorID == amdVendorID
}

// Intel returns true if fs describes an Intel CPU.
func (fs *FeatureSet) Intel() bool {
	return fs.VendorID == intelVendorID
}

// CheckHostCompatible returns nil if fs is a subset of the host feature set.
func (fs *FeatureSet) CheckHostCompatible() error {
	hfs := HostFeatureSet()
	if diff := fs.Subtract(hfs); diff != nil {
		return ErrIncompatible{fmt.Sprintf("CPU feature set %v incompatible with host feature set %v (missing: %v)", fs.FlagsString(false), hfs.FlagsString(false), diff)}
	}
	// The size of a cache line must match, as it is critical to correctly
	// utilizing CLFLUSH. Other cache properties are allowed to change, as
	// they are not important to correctness.
	if fs.CacheLine != hfs.CacheLine {
		return ErrIncompatible{fmt.Sprintf("CPU cache line size %d incompatible with host cache line size %d", fs.CacheLine, hfs.CacheLine)}
	}
	return nil
}

// vendorIDFromRegs converts 3 registers into the 12-byte vendor ID string.
func vendorIDFromRegs(bx, cx, dx uint32) string {
	bytes := make([]byte, 0, 12)
	for i := uint(0); i < 4; i++ {
		b := byte(bx >> (i * 8))
		bytes = append(bytes, b)
	}
	for i := uint(0); i < 4; i++ {
		b := byte(dx >> (i * 8))
		bytes = append(bytes, b)
	}
	for i := uint(0); i < 4; i++ {
		b := byte(cx >> (i * 8))
		bytes = append(bytes, b)
	}
	return string(bytes)
}

var maxXsaveSize = func() uint32 {
	// Leaf 0 of xsaveinfo function returns the size for currently
	// enabled xsave features in ebx, the maximum size if all valid
	// features are saved with xsave in ecx, and valid XCR0 bits in
	// edx:eax.
	//
	// If xSaveInfo isn't supported, cpuid will not fault but will
	// return bogus values.
_, _, maxXsaveSize, _ := HostID(uint32(xSaveInfo), 0) return maxXsaveSize }() // ExtendedStateSize returns the number of bytes needed to save the "extended // state" for this processor and the boundary it must be aligned to. Extended // state includes floating point registers, and other cpu state that's not // associated with the normal task context. // // Note: We can save some space here with an optimization where we use a // smaller chunk of memory depending on features that are actually enabled. // Currently we just use the largest possible size for simplicity (which is // about 2.5K worst case, with avx512). func (fs *FeatureSet) ExtendedStateSize() (size, align uint) { if fs.UseXsave() { return uint(maxXsaveSize), 64 } // If we don't support xsave, we fall back to fxsave, which requires // 512 bytes aligned to 16 bytes. return 512, 16 } // ValidXCR0Mask returns the bits that may be set to 1 in control register // XCR0. func (fs *FeatureSet) ValidXCR0Mask() uint64 { if !fs.UseXsave() { return 0 } eax, _, _, edx := HostID(uint32(xSaveInfo), 0) return uint64(edx)<<32 | uint64(eax) } // vendorIDRegs returns the 3 register values used to construct the 12-byte // vendor ID string for eax=0. func (fs *FeatureSet) vendorIDRegs() (bx, dx, cx uint32) { for i := uint(0); i < 4; i++ { bx |= uint32(fs.VendorID[i]) << (i * 8) } for i := uint(0); i < 4; i++ { dx |= uint32(fs.VendorID[i+4]) << (i * 8) } for i := uint(0); i < 4; i++ { cx |= uint32(fs.VendorID[i+8]) << (i * 8) } return } // signature returns the signature dword that's returned in eax when eax=1. func (fs *FeatureSet) signature() uint32 { var s uint32 s |= uint32(fs.SteppingID & 0xf) s |= uint32(fs.Model&0xf) << 4 s |= uint32(fs.Family&0xf) << 8 s |= uint32(fs.ProcessorType&0x3) << 12 s |= uint32(fs.ExtendedModel&0xf) << 16 s |= uint32(fs.ExtendedFamily&0xff) << 20 return s } // Helper to deconstruct signature dword. func signatureSplit(v uint32) (ef, em, pt, f, m, sid uint8) { sid = uint8(v & 0xf) m = uint8(v>>4) & 0xf f = uint8(v>>8) & 0xf pt = uint8(v>>12) & 0x3 em = uint8(v>>16) & 0xf ef = uint8(v >> 20) return } // Helper to convert blockwise feature bit masks into a set of features. Masks // must be provided in order for each block, without skipping them. If a block // does not matter for this feature set, 0 is specified. func setFromBlockMasks(blocks ...uint32) map[Feature]bool { s := make(map[Feature]bool) for b, blockMask := range blocks { for i := 0; i < blockSize; i++ { if blockMask&1 != 0 { s[featureID(block(b), i)] = true } blockMask >>= 1 } } return s } // blockMask returns the 32-bit mask associated with a block of features. func (fs *FeatureSet) blockMask(b block) uint32 { var mask uint32 for i := 0; i < blockSize; i++ { if fs.Set[featureID(b, i)] { mask |= 1 << uint(i) } } return mask } // Remove removes a Feature from a FeatureSet. It ignores features // that are not in the FeatureSet. func (fs *FeatureSet) Remove(feature Feature) { delete(fs.Set, feature) } // Add adds a Feature to a FeatureSet. It ignores duplicate features. func (fs *FeatureSet) Add(feature Feature) { fs.Set[feature] = true } // HasFeature tests whether or not a feature is in the given feature set. func (fs *FeatureSet) HasFeature(feature Feature) bool { return fs.Set[feature] } // Subtract returns the features present in fs that are not present in other. // If all features in fs are present in other, Subtract returns nil. 
func (fs *FeatureSet) Subtract(other *FeatureSet) (diff map[Feature]bool) { for f := range fs.Set { if !other.Set[f] { if diff == nil { diff = make(map[Feature]bool) } diff[f] = true } } return } // EmulateID emulates a cpuid instruction based on the feature set. func (fs *FeatureSet) EmulateID(origAx, origCx uint32) (ax, bx, cx, dx uint32) { switch cpuidFunction(origAx) { case vendorID: ax = uint32(xSaveInfo) // 0xd (xSaveInfo) is the highest function we support. bx, dx, cx = fs.vendorIDRegs() case featureInfo: // CLFLUSH line size is encoded in quadwords. Other fields in bx unsupported. bx = (fs.CacheLine / 8) << 8 cx = fs.blockMask(block(0)) dx = fs.blockMask(block(1)) ax = fs.signature() case intelCacheDescriptors: if !fs.Intel() { // Reserved on non-Intel. return 0, 0, 0, 0 } // "The least-significant byte in register EAX (register AL) // will always return 01H. Software should ignore this value // and not interpret it as an informational descriptor." - SDM // // We only support reporting cache parameters via // intelDeterministicCacheParams; report as much here. // // We do not support exposing TLB information at all. ax = 1 | (uint32(intelNoCacheDescriptor) << 8) case intelDeterministicCacheParams: if !fs.Intel() { // Reserved on non-Intel. return 0, 0, 0, 0 } // cx is the index of the cache to describe. if int(origCx) >= len(fs.Caches) { return uint32(cacheNull), 0, 0, 0 } c := fs.Caches[origCx] ax = uint32(c.Type) ax |= c.Level << 5 ax |= 1 << 8 // Always claim the cache is "self-initializing". if c.FullyAssociative { ax |= 1 << 9 } // Processor topology not supported. bx = fs.CacheLine - 1 bx |= (c.Partitions - 1) << 12 bx |= (c.Ways - 1) << 22 cx = c.Sets - 1 if !c.InvalidateHierarchical { dx |= 1 } if c.Inclusive { dx |= 1 << 1 } if !c.DirectMapped { dx |= 1 << 2 } case xSaveInfo: if !fs.UseXsave() { return 0, 0, 0, 0 } return HostID(uint32(xSaveInfo), origCx) case extendedFeatureInfo: if origCx != 0 { break // Only leaf 0 is supported. } bx = fs.blockMask(block(2)) cx = fs.blockMask(block(3)) case extendedFunctionInfo: // We only support showing the extended features. ax = uint32(extendedFeatures) cx = 0 case extendedFeatures: cx = fs.blockMask(block(5)) dx = fs.blockMask(block(6)) if fs.AMD() { // AMD duplicates some block 1 features in block 6. dx |= fs.blockMask(block(1)) & block6DuplicateMask } } return } // UseXsave returns the choice of fp state saving instruction. func (fs *FeatureSet) UseXsave() bool { return fs.HasFeature(X86FeatureXSAVE) && fs.HasFeature(X86FeatureOSXSAVE) } // UseXsaveopt returns true if 'fs' supports the "xsaveopt" instruction. func (fs *FeatureSet) UseXsaveopt() bool { return fs.UseXsave() && fs.HasFeature(X86FeatureXSAVEOPT) } // HostID executes a native CPUID instruction. func HostID(axArg, cxArg uint32) (ax, bx, cx, dx uint32) // HostFeatureSet uses cpuid to get host values and construct a feature set // that matches that of the host machine. Note that there are several places // where there appear to be some unnecessary assignments between register names // (ax, bx, cx, or dx) and featureBlockN variables. This is to explicitly show // where the different feature blocks come from, to make the code easier to // inspect and read. func HostFeatureSet() *FeatureSet { // eax=0 gets max supported feature and vendor ID. _, bx, cx, dx := HostID(0, 0) vendorID := vendorIDFromRegs(bx, cx, dx) // eax=1 gets basic features in ecx:edx. 
ax, bx, cx, dx := HostID(1, 0) featureBlock0 := cx featureBlock1 := dx ef, em, pt, f, m, sid := signatureSplit(ax) cacheLine := 8 * (bx >> 8) & 0xff // eax=4, ecx=i gets details about cache index i. Only supported on Intel. var caches []Cache if vendorID == intelVendorID { // ecx selects the cache index until a null type is returned. for i := uint32(0); ; i++ { ax, bx, cx, dx := HostID(4, i) t := CacheType(ax & 0xf) if t == cacheNull { break } lineSize := (bx & 0xfff) + 1 if lineSize != cacheLine { panic(fmt.Sprintf("Mismatched cache line size: %d vs %d", lineSize, cacheLine)) } caches = append(caches, Cache{ Type: t, Level: (ax >> 5) & 0x7, FullyAssociative: ((ax >> 9) & 1) == 1, Partitions: ((bx >> 12) & 0x3ff) + 1, Ways: ((bx >> 22) & 0x3ff) + 1, Sets: cx + 1, InvalidateHierarchical: (dx & 1) == 0, Inclusive: ((dx >> 1) & 1) == 1, DirectMapped: ((dx >> 2) & 1) == 0, }) } } // eax=7, ecx=0 gets extended features in ecx:ebx. _, bx, cx, _ = HostID(7, 0) featureBlock2 := bx featureBlock3 := cx // Leaf 0xd is supported only if CPUID.1:ECX.XSAVE[bit 26] is set. var featureBlock4 uint32 if (featureBlock0 & (1 << 26)) != 0 { featureBlock4, _, _, _ = HostID(uint32(xSaveInfo), 1) } // eax=0x80000000 gets supported extended levels. We use this to // determine if there are any non-zero block 4 or block 6 bits to find. var featureBlock5, featureBlock6 uint32 if ax, _, _, _ := HostID(uint32(extendedFunctionInfo), 0); ax >= uint32(extendedFeatures) { // eax=0x80000001 gets AMD added feature bits. _, _, cx, dx = HostID(uint32(extendedFeatures), 0) featureBlock5 = cx // Ignore features duplicated from block 1 on AMD. These bits // are reserved on Intel. featureBlock6 = dx &^ block6DuplicateMask } set := setFromBlockMasks(featureBlock0, featureBlock1, featureBlock2, featureBlock3, featureBlock4, featureBlock5, featureBlock6) return &FeatureSet{ Set: set, VendorID: vendorID, ExtendedFamily: ef, ExtendedModel: em, ProcessorType: pt, Family: f, Model: m, SteppingID: sid, CacheLine: cacheLine, Caches: caches, } } // Reads max cpu frequency from host /proc/cpuinfo. Must run before syscall // filter installation. This value is used to create the fake /proc/cpuinfo // from a FeatureSet. func initCPUFreq() { cpuinfob, err := ioutil.ReadFile("/proc/cpuinfo") if err != nil { // Leave it as 0... The standalone VDSO bails out in the same // way. log.Warningf("Could not read /proc/cpuinfo: %v", err) return } cpuinfo := string(cpuinfob) // We get the value straight from host /proc/cpuinfo. On machines with // frequency scaling enabled, this will only get the current value // which will likely be inaccurate. This is fine on machines with // frequency scaling disabled. for _, line := range strings.Split(cpuinfo, "\n") { if strings.Contains(line, "cpu MHz") { splitMHz := strings.Split(line, ":") if len(splitMHz) < 2 { log.Warningf("Could not read /proc/cpuinfo: malformed cpu MHz line") return } // If there was a problem, leave cpuFreqMHz as 0. var err error cpuFreqMHz, err = strconv.ParseFloat(strings.TrimSpace(splitMHz[1]), 64) if err != nil { log.Warningf("Could not parse cpu MHz value %v: %v", splitMHz[1], err) cpuFreqMHz = 0 return } return } } log.Warningf("Could not parse /proc/cpuinfo, it is empty or does not contain cpu MHz") } func initFeaturesFromString() { for f, s := range x86FeatureStrings { x86FeaturesFromString[s] = f } for f, s := range x86FeatureParseOnlyStrings { x86FeaturesFromString[s] = f } } func init() { initCPUFreq() initFeaturesFromString() }
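// Standalone sketch, illustrative only: decoding an eax=1 signature dword by
// hand, using the same bit layout as signature() and signatureSplit() above.
// The sample value 0x306A9 is just an example (it decodes to family 6,
// model 58, stepping 9, an Ivy Bridge-era signature).
package main

import "fmt"

func main() {
	// Layout: [3:0] stepping, [7:4] model, [11:8] family, [13:12] processor
	// type, [19:16] extended model, [27:20] extended family.
	const sig = uint32(0x306A9)
	stepping := sig & 0xf
	model := (sig >> 4) & 0xf
	family := (sig >> 8) & 0xf
	ptype := (sig >> 12) & 0x3
	extModel := (sig >> 16) & 0xf
	extFamily := (sig >> 20) & 0xff
	fmt.Printf("family=%d model=%d stepping=%d type=%d extModel=%d extFamily=%d\n",
		family, model, stepping, ptype, extModel, extFamily)
	// The displayed model combines the fields: (extModel<<4)|model = 58.
}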
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netlink import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/unix" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // SocketVFS2 is the base VFS2 socket type for netlink sockets. // // This implementation only supports userspace sending and receiving messages // to/from the kernel. // // SocketVFS2 implements socket.SocketVFS2 and transport.Credentialer. // // +stateify savable type SocketVFS2 struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.LockFD socketOpsCommon } var _ socket.SocketVFS2 = (*SocketVFS2)(nil) var _ transport.Credentialer = (*SocketVFS2)(nil) // NewVFS2 creates a new SocketVFS2. func NewVFS2(t *kernel.Task, skType linux.SockType, protocol Protocol) (*SocketVFS2, *syserr.Error) { // Datagram endpoint used to buffer kernel -> user messages. ep := transport.NewConnectionless(t) // Bind the endpoint for good measure so we can connect to it. The // bound address will never be exposed. if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil { ep.Close(t) return nil, err } // Create a connection from which the kernel can write messages. connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t) if err != nil { ep.Close(t) return nil, err } fd := &SocketVFS2{ socketOpsCommon: socketOpsCommon{ ports: t.Kernel().NetlinkPorts(), protocol: protocol, skType: skType, ep: ep, connection: connection, sendBufferSize: defaultSendBufferSize, }, } fd.LockFD.Init(&vfs.FileLocks{}) return fd, nil } // Release implements vfs.FileDescriptionImpl.Release. func (s *SocketVFS2) Release(ctx context.Context) { t := kernel.TaskFromContext(ctx) t.Kernel().DeleteSocketVFS2(&s.vfsfd) s.socketOpsCommon.Release(ctx) } // Readiness implements waiter.Waitable.Readiness. func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask { return s.socketOpsCommon.Readiness(mask) } // EventRegister implements waiter.Waitable.EventRegister.
func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) { s.socketOpsCommon.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. func (s *SocketVFS2) EventUnregister(e *waiter.Entry) { s.socketOpsCommon.EventUnregister(e) } // Ioctl implements vfs.FileDescriptionImpl. func (*SocketVFS2) Ioctl(context.Context, usermem.IO, arch.SyscallArguments) (uintptr, error) { // TODO(b/68878065): no ioctls supported. return 0, linuxerr.ENOTTY } // PRead implements vfs.FileDescriptionImpl. func (s *SocketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return 0, linuxerr.ESPIPE } // Read implements vfs.FileDescriptionImpl. func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { return 0, linuxerr.EOPNOTSUPP } if dst.NumBytes() == 0 { return 0, nil } return dst.CopyOutFrom(ctx, &unix.EndpointReader{ Endpoint: s.ep, }) } // PWrite implements vfs.FileDescriptionImpl. func (s *SocketVFS2) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return 0, linuxerr.ESPIPE } // Write implements vfs.FileDescriptionImpl. func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // All flags other than RWF_NOWAIT should be ignored. // TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT. if opts.Flags != 0 { return 0, linuxerr.EOPNOTSUPP } n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) return int64(n), err.ToError() }
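// Hedged illustration (assumed host-side demo, not gVisor code): Linux
// rejects positional reads and writes on sockets with ESPIPE, which is the
// behaviour the PRead/PWrite stubs above mirror for netlink sockets.
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// Open a kernel-side netlink socket (NETLINK_ROUTE is the routing family).
	fd, err := unix.Socket(unix.AF_NETLINK, unix.SOCK_RAW, unix.NETLINK_ROUTE)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fd)

	// pread(2) at any offset on a socket fails with ESPIPE ("illegal seek").
	buf := make([]byte, 16)
	if _, err := unix.Pread(fd, buf, 0); err == unix.ESPIPE {
		fmt.Println("pread on a netlink socket:", err)
	}
}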
// Copyright 2020 The gVisor Authors. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // +build !checklocks package sync import ( "unsafe" ) func noteLock(l unsafe.Pointer) { } func noteUnlock(l unsafe.Pointer) { }
// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package linuxerr contains syscall error codes exported as error interface // pointers. This allows for fast comparison and return operations comparable // to unix.Errno constants.
package linuxerr import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/errors" ) const maxErrno uint32 = errno.EHWPOISON + 1 var ( NOERROR = errors.New(errno.NOERRNO, "not an error") EPERM = errors.New(errno.EPERM, "operation not permitted") ENOENT = errors.New(errno.ENOENT, "no such file or directory") ESRCH = errors.New(errno.ESRCH, "no such process") EINTR = errors.New(errno.EINTR, "interrupted system call") EIO = errors.New(errno.EIO, "I/O error") ENXIO = errors.New(errno.ENXIO, "no such device or address") E2BIG = errors.New(errno.E2BIG, "argument list too long") ENOEXEC = errors.New(errno.ENOEXEC, "exec format error") EBADF = errors.New(errno.EBADF, "bad file number") ECHILD = errors.New(errno.ECHILD, "no child processes") EAGAIN = errors.New(errno.EAGAIN, "try again") ENOMEM = errors.New(errno.ENOMEM, "out of memory") EACCES = errors.New(errno.EACCES, "permission denied") EFAULT = errors.New(errno.EFAULT, "bad address") ENOTBLK = errors.New(errno.ENOTBLK, "block device required") EBUSY = errors.New(errno.EBUSY, "device or resource busy") EEXIST = errors.New(errno.EEXIST, "file exists") EXDEV = errors.New(errno.EXDEV, "cross-device link") ENODEV = errors.New(errno.ENODEV, "no such device") ENOTDIR = errors.New(errno.ENOTDIR, "not a directory") EISDIR = errors.New(errno.EISDIR, "is a directory") EINVAL = errors.New(errno.EINVAL, "invalid argument") ENFILE = errors.New(errno.ENFILE, "file table overflow") EMFILE = errors.New(errno.EMFILE, "too many open files") ENOTTY = errors.New(errno.ENOTTY, "not a typewriter") ETXTBSY = errors.New(errno.ETXTBSY, "text file busy") EFBIG = errors.New(errno.EFBIG, "file too large") ENOSPC = errors.New(errno.ENOSPC, "no space left on device") ESPIPE = errors.New(errno.ESPIPE, "illegal seek") EROFS = errors.New(errno.EROFS, "read-only file system") EMLINK = errors.New(errno.EMLINK, "too many links") EPIPE = errors.New(errno.EPIPE, "broken pipe") EDOM = errors.New(errno.EDOM, "math argument out of domain of func") ERANGE = errors.New(errno.ERANGE, "math result not representable") // Errno values from include/uapi/asm-generic/errno.h. 
EDEADLK = errors.New(errno.EDEADLK, "resource deadlock would occur") ENAMETOOLONG = errors.New(errno.ENAMETOOLONG, "file name too long") ENOLCK = errors.New(errno.ENOLCK, "no record locks available") ENOSYS = errors.New(errno.ENOSYS, "invalid system call number") ENOTEMPTY = errors.New(errno.ENOTEMPTY, "directory not empty") ELOOP = errors.New(errno.ELOOP, "too many symbolic links encountered") ENOMSG = errors.New(errno.ENOMSG, "no message of desired type") EIDRM = errors.New(errno.EIDRM, "identifier removed") ECHRNG = errors.New(errno.ECHRNG, "channel number out of range") EL2NSYNC = errors.New(errno.EL2NSYNC, "level 2 not synchronized") EL3HLT = errors.New(errno.EL3HLT, "level 3 halted") EL3RST = errors.New(errno.EL3RST, "level 3 reset") ELNRNG = errors.New(errno.ELNRNG, "link number out of range") EUNATCH = errors.New(errno.EUNATCH, "protocol driver not attached") ENOCSI = errors.New(errno.ENOCSI, "no CSI structure available") EL2HLT = errors.New(errno.EL2HLT, "level 2 halted") EBADE = errors.New(errno.EBADE, "invalid exchange") EBADR = errors.New(errno.EBADR, "invalid request descriptor") EXFULL = errors.New(errno.EXFULL, "exchange full") ENOANO = errors.New(errno.ENOANO, "no anode") EBADRQC = errors.New(errno.EBADRQC, "invalid request code") EBADSLT = errors.New(errno.EBADSLT, "invalid slot") EBFONT = errors.New(errno.EBFONT, "bad font file format") ENOSTR = errors.New(errno.ENOSTR, "device not a stream") ENODATA = errors.New(errno.ENODATA, "no data available") ETIME = errors.New(errno.ETIME, "timer expired") ENOSR = errors.New(errno.ENOSR, "out of streams resources") ENOPKG = errors.New(errno.ENOPKG, "package not installed") EREMOTE = errors.New(errno.EREMOTE, "object is remote") ENOLINK = errors.New(errno.ENOLINK, "link has been severed") EADV = errors.New(errno.EADV, "advertise error") ESRMNT = errors.New(errno.ESRMNT, "srmount error") ECOMM = errors.New(errno.ECOMM, "communication error on send") EPROTO = errors.New(errno.EPROTO, "protocol error") EMULTIHOP = errors.New(errno.EMULTIHOP, "multihop attempted") EDOTDOT = errors.New(errno.EDOTDOT, "RFS specific error") EBADMSG = errors.New(errno.EBADMSG, "not a data message") EOVERFLOW = errors.New(errno.EOVERFLOW, "value too large for defined data type") ENOTUNIQ = errors.New(errno.ENOTUNIQ, "name not unique on network") EBADFD = errors.New(errno.EBADFD, "file descriptor in bad state") EREMCHG = errors.New(errno.EREMCHG, "remote address changed") ELIBACC = errors.New(errno.ELIBACC, "can not access a needed shared library") ELIBBAD = errors.New(errno.ELIBBAD, "accessing a corrupted shared library") ELIBSCN = errors.New(errno.ELIBSCN, ".lib section in a.out corrupted") ELIBMAX = errors.New(errno.ELIBMAX, "attempting to link in too many shared libraries") ELIBEXEC = errors.New(errno.ELIBEXEC, "cannot exec a shared library directly") EILSEQ = errors.New(errno.EILSEQ, "illegal byte sequence") ERESTART = errors.New(errno.ERESTART, "interrupted system call should be restarted") ESTRPIPE = errors.New(errno.ESTRPIPE, "streams pipe error") EUSERS = errors.New(errno.EUSERS, "too many users") ENOTSOCK = errors.New(errno.ENOTSOCK, "socket operation on non-socket") EDESTADDRREQ = errors.New(errno.EDESTADDRREQ, "destination address required") EMSGSIZE = errors.New(errno.EMSGSIZE, "message too long") EPROTOTYPE = errors.New(errno.EPROTOTYPE, "protocol wrong type for socket") ENOPROTOOPT = errors.New(errno.ENOPROTOOPT, "protocol not available") EPROTONOSUPPORT = errors.New(errno.EPROTONOSUPPORT, "protocol not supported") ESOCKTNOSUPPORT = 
errors.New(errno.ESOCKTNOSUPPORT, "socket type not supported") EOPNOTSUPP = errors.New(errno.EOPNOTSUPP, "operation not supported on transport endpoint") EPFNOSUPPORT = errors.New(errno.EPFNOSUPPORT, "protocol family not supported") EAFNOSUPPORT = errors.New(errno.EAFNOSUPPORT, "address family not supported by protocol") EADDRINUSE = errors.New(errno.EADDRINUSE, "address already in use") EADDRNOTAVAIL = errors.New(errno.EADDRNOTAVAIL, "cannot assign requested address") ENETDOWN = errors.New(errno.ENETDOWN, "network is down") ENETUNREACH = errors.New(errno.ENETUNREACH, "network is unreachable") ENETRESET = errors.New(errno.ENETRESET, "network dropped connection because of reset") ECONNABORTED = errors.New(errno.ECONNABORTED, "software caused connection abort") ECONNRESET = errors.New(errno.ECONNRESET, "connection reset by peer") ENOBUFS = errors.New(errno.ENOBUFS, "no buffer space available") EISCONN = errors.New(errno.EISCONN, "transport endpoint is already connected") ENOTCONN = errors.New(errno.ENOTCONN, "transport endpoint is not connected") ESHUTDOWN = errors.New(errno.ESHUTDOWN, "cannot send after transport endpoint shutdown") ETOOMANYREFS = errors.New(errno.ETOOMANYREFS, "too many references: cannot splice") ETIMEDOUT = errors.New(errno.ETIMEDOUT, "connection timed out") ECONNREFUSED = errors.New(errno.ECONNREFUSED, "connection refused") EHOSTDOWN = errors.New(errno.EHOSTDOWN, "host is down") EHOSTUNREACH = errors.New(errno.EHOSTUNREACH, "no route to host") EALREADY = errors.New(errno.EALREADY, "operation already in progress") EINPROGRESS = errors.New(errno.EINPROGRESS, "operation now in progress") ESTALE = errors.New(errno.ESTALE, "stale file handle") EUCLEAN = errors.New(errno.EUCLEAN, "structure needs cleaning") ENOTNAM = errors.New(errno.ENOTNAM, "not a XENIX named type file") ENAVAIL = errors.New(errno.ENAVAIL, "no XENIX semaphores available") EISNAM = errors.New(errno.EISNAM, "is a named type file") EREMOTEIO = errors.New(errno.EREMOTEIO, "remote I/O error") EDQUOT = errors.New(errno.EDQUOT, "quota exceeded") ENOMEDIUM = errors.New(errno.ENOMEDIUM, "no medium found") EMEDIUMTYPE = errors.New(errno.EMEDIUMTYPE, "wrong medium type") ECANCELED = errors.New(errno.ECANCELED, "operation canceled") ENOKEY = errors.New(errno.ENOKEY, "required key not available") EKEYEXPIRED = errors.New(errno.EKEYEXPIRED, "key has expired") EKEYREVOKED = errors.New(errno.EKEYREVOKED, "key has been revoked") EKEYREJECTED = errors.New(errno.EKEYREJECTED, "key was rejected by service") EOWNERDEAD = errors.New(errno.EOWNERDEAD, "owner died") ENOTRECOVERABLE = errors.New(errno.ENOTRECOVERABLE, "state not recoverable") ERFKILL = errors.New(errno.ERFKILL, "operation not possible due to RF-kill") EHWPOISON = errors.New(errno.EHWPOISON, "memory page has hardware error") // Errors equivalent to other errors. EWOULDBLOCK = EAGAIN EDEADLOCK = EDEADLK ENONET = ENOENT ENOATTR = ENODATA ENOTSUP = EOPNOTSUPP ) // A nil *errors.Error denotes no error and is placed at the 0 index of // errorSlice. Thus, any other empty index should not be nil or a valid error. // This marks that index as an invalid error so any comparison to nil or a // valid linuxerr fails. var errNotValidError = errors.New(errno.Errno(maxErrno), "not a valid error") // The following errorSlice holds errors by errno for fast translation between // errnos (especially uint32(syscall.Errno)) and *Error. var errorSlice = []*errors.Error{ // Errno values from include/uapi/asm-generic/errno-base.h.
errno.NOERRNO: NOERROR, errno.EPERM: EPERM, errno.ENOENT: ENOENT, errno.ESRCH: ESRCH, errno.EINTR: EINTR, errno.EIO: EIO, errno.ENXIO: ENXIO, errno.E2BIG: E2BIG, errno.ENOEXEC: ENOEXEC, errno.EBADF: EBADF, errno.ECHILD: ECHILD, errno.EAGAIN: EAGAIN, errno.ENOMEM: ENOMEM, errno.EACCES: EACCES, errno.EFAULT: EFAULT, errno.ENOTBLK: ENOTBLK, errno.EBUSY: EBUSY, errno.EEXIST: EEXIST, errno.EXDEV: EXDEV, errno.ENODEV: ENODEV, errno.ENOTDIR: ENOTDIR, errno.EISDIR: EISDIR, errno.EINVAL: EINVAL, errno.ENFILE: ENFILE, errno.EMFILE: EMFILE, errno.ENOTTY: ENOTTY, errno.ETXTBSY: ETXTBSY, errno.EFBIG: EFBIG, errno.ENOSPC: ENOSPC, errno.ESPIPE: ESPIPE, errno.EROFS: EROFS, errno.EMLINK: EMLINK, errno.EPIPE: EPIPE, errno.EDOM: EDOM, errno.ERANGE: ERANGE, // Errno values from include/uapi/asm-generic/errno.h. errno.EDEADLK: EDEADLK, errno.ENAMETOOLONG: ENAMETOOLONG, errno.ENOLCK: ENOLCK, errno.ENOSYS: ENOSYS, errno.ENOTEMPTY: ENOTEMPTY, errno.ELOOP: ELOOP, errno.ELOOP + 1: errNotValidError, // No valid errno between ELOOP and ENOMSG. errno.ENOMSG: ENOMSG, errno.EIDRM: EIDRM, errno.ECHRNG: ECHRNG, errno.EL2NSYNC: EL2NSYNC, errno.EL3HLT: EL3HLT, errno.EL3RST: EL3RST, errno.ELNRNG: ELNRNG, errno.EUNATCH: EUNATCH, errno.ENOCSI: ENOCSI, errno.EL2HLT: EL2HLT, errno.EBADE: EBADE, errno.EBADR: EBADR, errno.EXFULL: EXFULL, errno.ENOANO: ENOANO, errno.EBADRQC: EBADRQC, errno.EBADSLT: EBADSLT, errno.EBADSLT + 1: errNotValidError, // No valid errno between EBADSLT and EBFONT. errno.EBFONT: EBFONT, errno.ENOSTR: ENOSTR, errno.ENODATA: ENODATA, errno.ETIME: ETIME, errno.ENOSR: ENOSR, errno.ENOSR + 1: errNotValidError, // No valid errno between ENOSR and ENOPKG. errno.ENOPKG: ENOPKG, errno.EREMOTE: EREMOTE, errno.ENOLINK: ENOLINK, errno.EADV: EADV, errno.ESRMNT: ESRMNT, errno.ECOMM: ECOMM, errno.EPROTO: EPROTO, errno.EMULTIHOP: EMULTIHOP, errno.EDOTDOT: EDOTDOT, errno.EBADMSG: EBADMSG, errno.EOVERFLOW: EOVERFLOW, errno.ENOTUNIQ: ENOTUNIQ, errno.EBADFD: EBADFD, errno.EREMCHG: EREMCHG, errno.ELIBACC: ELIBACC, errno.ELIBBAD: ELIBBAD, errno.ELIBSCN: ELIBSCN, errno.ELIBMAX: ELIBMAX, errno.ELIBEXEC: ELIBEXEC, errno.EILSEQ: EILSEQ, errno.ERESTART: ERESTART, errno.ESTRPIPE: ESTRPIPE, errno.EUSERS: EUSERS, errno.ENOTSOCK: ENOTSOCK, errno.EDESTADDRREQ: EDESTADDRREQ, errno.EMSGSIZE: EMSGSIZE, errno.EPROTOTYPE: EPROTOTYPE, errno.ENOPROTOOPT: ENOPROTOOPT, errno.EPROTONOSUPPORT: EPROTONOSUPPORT, errno.ESOCKTNOSUPPORT: ESOCKTNOSUPPORT, errno.EOPNOTSUPP: EOPNOTSUPP, errno.EPFNOSUPPORT: EPFNOSUPPORT, errno.EAFNOSUPPORT: EAFNOSUPPORT, errno.EADDRINUSE: EADDRINUSE, errno.EADDRNOTAVAIL: EADDRNOTAVAIL, errno.ENETDOWN: ENETDOWN, errno.ENETUNREACH: ENETUNREACH, errno.ENETRESET: ENETRESET, errno.ECONNABORTED: ECONNABORTED, errno.ECONNRESET: ECONNRESET, errno.ENOBUFS: ENOBUFS, errno.EISCONN: EISCONN, errno.ENOTCONN: ENOTCONN, errno.ESHUTDOWN: ESHUTDOWN, errno.ETOOMANYREFS: ETOOMANYREFS, errno.ETIMEDOUT: ETIMEDOUT, errno.ECONNREFUSED: ECONNREFUSED, errno.EHOSTDOWN: EHOSTDOWN, errno.EHOSTUNREACH: EHOSTUNREACH, errno.EALREADY: EALREADY, errno.EINPROGRESS: EINPROGRESS, errno.ESTALE: ESTALE, errno.EUCLEAN: EUCLEAN, errno.ENOTNAM: ENOTNAM, errno.ENAVAIL: ENAVAIL, errno.EISNAM: EISNAM, errno.EREMOTEIO: EREMOTEIO, errno.EDQUOT: EDQUOT, errno.ENOMEDIUM: ENOMEDIUM, errno.EMEDIUMTYPE: EMEDIUMTYPE, errno.ECANCELED: ECANCELED, errno.ENOKEY: ENOKEY, errno.EKEYEXPIRED: EKEYEXPIRED, errno.EKEYREVOKED: EKEYREVOKED, errno.EKEYREJECTED: EKEYREJECTED, errno.EOWNERDEAD: EOWNERDEAD, errno.ENOTRECOVERABLE: ENOTRECOVERABLE, errno.ERFKILL: ERFKILL, errno.EHWPOISON:
EHWPOISON, } // ErrorFromErrno gets an error from the list and panics if an invalid entry is requested. func ErrorFromErrno(e errno.Errno) *errors.Error { err := errorSlice[e] // Done this way because a single comparison in benchmarks is 2-3 times faster // than something like (if err == nil && err > 0). if err != errNotValidError { return err } panic(fmt.Sprintf("invalid error requested with errno: %d", e)) } // Equals compares a linuxerr to a given error. // TODO(b/34162363): Remove when syserror is removed. func Equals(e *errors.Error, err error) bool { if err == nil { return e == NOERROR || e == nil } if e == nil { return err == NOERROR || err == unix.Errno(0) } switch err.(type) { case *errors.Error: return e == err case unix.Errno, error: return unix.Errno(e.Errno()) == err } return false }
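// Hedged usage sketch (assumed caller code, not from the listing above): an
// errno is translated to a *errors.Error once, after which comparisons are
// plain pointer equality, and Equals bridges to host unix.Errno values.
package main

import (
	"fmt"

	"golang.org/x/sys/unix"

	"gvisor.dev/gvisor/pkg/abi/linux/errno"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
)

func main() {
	e := linuxerr.ErrorFromErrno(errno.EAGAIN)
	fmt.Println(e == linuxerr.EAGAIN)            // true: single pointer comparison
	fmt.Println(linuxerr.Equals(e, unix.EAGAIN)) // true: matches the host errno
}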
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs2 import ( "math" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/waiter" ) var sizeofEpollEvent = (*linux.EpollEvent)(nil).SizeBytes() // EpollCreate1 implements Linux syscall epoll_create1(2). func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() if flags&^linux.EPOLL_CLOEXEC != 0 { return 0, nil, linuxerr.EINVAL } file, err := t.Kernel().VFS().NewEpollInstanceFD(t) if err != nil { return 0, nil, err } defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.EPOLL_CLOEXEC != 0, }) if err != nil { return 0, nil, err } return uintptr(fd), nil, nil } // EpollCreate implements Linux syscall epoll_create(2). func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { size := args[0].Int() // "Since Linux 2.6.8, the size argument is ignored, but must be greater // than zero" - epoll_create(2) if size <= 0 { return 0, nil, linuxerr.EINVAL } file, err := t.Kernel().VFS().NewEpollInstanceFD(t) if err != nil { return 0, nil, err } defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{}) if err != nil { return 0, nil, err } return uintptr(fd), nil, nil } // EpollCtl implements Linux syscall epoll_ctl(2).
func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { epfd := args[0].Int() op := args[1].Int() fd := args[2].Int() eventAddr := args[3].Pointer() epfile := t.GetFileVFS2(epfd) if epfile == nil { return 0, nil, linuxerr.EBADF } defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if epfile == file { return 0, nil, linuxerr.EINVAL } var event linux.EpollEvent switch op { case linux.EPOLL_CTL_ADD: if _, err := event.CopyIn(t, eventAddr); err != nil { return 0, nil, err } return 0, nil, ep.AddInterest(file, fd, event) case linux.EPOLL_CTL_DEL: return 0, nil, ep.DeleteInterest(file, fd) case linux.EPOLL_CTL_MOD: if _, err := event.CopyIn(t, eventAddr); err != nil { return 0, nil, err } return 0, nil, ep.ModifyInterest(file, fd, event) default: return 0, nil, linuxerr.EINVAL } } func waitEpoll(t *kernel.Task, epfd int32, eventsAddr hostarch.Addr, maxEvents int, timeoutInNanos int64) (uintptr, *kernel.SyscallControl, error) { var _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS { return 0, nil, linuxerr.EINVAL } epfile := t.GetFileVFS2(epfd) if epfile == nil { return 0, nil, linuxerr.EBADF } defer epfile.DecRef(t) ep, ok := epfile.Impl().(*vfs.EpollInstance) if !ok { return 0, nil, linuxerr.EINVAL } // Allocate space for a few events on the stack for the common case in // which we don't have too many events. var ( eventsArr [16]linux.EpollEvent ch chan struct{} haveDeadline bool deadline ktime.Time ) for { events := ep.ReadEvents(eventsArr[:0], maxEvents) if len(events) != 0 { copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events) copiedEvents := copiedBytes / sizeofEpollEvent // rounded down if copiedEvents != 0 { return uintptr(copiedEvents), nil, nil } return 0, nil, err } if timeoutInNanos == 0 { return 0, nil, nil } // In the first iteration of this loop, register with the epoll // instance for readability events, but then immediately continue the // loop since we need to retry ReadEvents() before blocking. In all // subsequent iterations, block until events are available, the timeout // expires, or an interrupt arrives. if ch == nil { var w waiter.Entry w, ch = waiter.NewChannelEntry(nil) epfile.EventRegister(&w, waiter.ReadableEvents) defer epfile.EventUnregister(&w) } else { // Set up the timer if a timeout was specified. if timeoutInNanos > 0 && !haveDeadline { timeoutDur := time.Duration(timeoutInNanos) * time.Nanosecond deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur) haveDeadline = true } if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = nil } return 0, nil, err } } } } // EpollWait implements Linux syscall epoll_wait(2). func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { epfd := args[0].Int() eventsAddr := args[1].Pointer() maxEvents := int(args[2].Int()) timeoutInNanos := int64(args[3].Int()) * 1000000 return waitEpoll(t, epfd, eventsAddr, maxEvents, timeoutInNanos) } // EpollPwait implements Linux syscall epoll_pwait(2). 
func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { maskAddr := args[4].Pointer() maskSize := uint(args[5].Uint()) if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { return 0, nil, err } return EpollWait(t, args) } // EpollPwait2 implements Linux syscall epoll_pwait2(2). func EpollPwait2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { epfd := args[0].Int() eventsAddr := args[1].Pointer() maxEvents := int(args[2].Int()) timeoutPtr := args[3].Pointer() maskAddr := args[4].Pointer() maskSize := uint(args[5].Uint()) haveTimeout := timeoutPtr != 0 var timeoutInNanos int64 = -1 if haveTimeout { var timeout linux.Timespec if _, err := timeout.CopyIn(t, timeoutPtr); err != nil { return 0, nil, err } timeoutInNanos = timeout.ToNsec() } if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { return 0, nil, err } return waitEpoll(t, epfd, eventsAddr, maxEvents, timeoutInNanos) }
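// Hedged sketch of the timeout normalization used above: epoll_wait and
// epoll_pwait carry milliseconds (scaled by 1000000 in EpollWait), while
// epoll_pwait2 carries a struct timespec (converted via Timespec.ToNsec).
// The helpers below are illustrative stand-ins, not the sentry's code.
package main

import "fmt"

// msToNanos mirrors the int64(ms) * 1000000 scaling in EpollWait.
func msToNanos(ms int64) int64 { return ms * 1000000 }

// tsToNanos mirrors Timespec.ToNsec for a seconds/nanoseconds pair.
func tsToNanos(sec, nsec int64) int64 { return sec*1000000000 + nsec }

func main() {
	fmt.Println(msToNanos(1500))         // 1500000000
	fmt.Println(tsToNanos(1, 500000000)) // 1500000000: the same 1.5s deadline
}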
// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/context" ) // contextID is the linux package's type for context.Context.Value keys. type contextID int const ( // CtxSignalNoInfoFunc is a Context.Value key for a function to send signals. CtxSignalNoInfoFunc contextID = iota ) // SignalNoInfoFuncFromContext returns the callback function stored in the // given context that can be used to send a signal. func SignalNoInfoFuncFromContext(ctx context.Context) func(Signal) error { if f := ctx.Value(CtxSignalNoInfoFunc); f != nil { return f.(func(Signal) error) } return nil }
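// Hedged, self-contained illustration of the typed-key Value pattern used by
// SignalNoInfoFuncFromContext above, written against the standard library
// context package (gVisor's context.Context exposes the same Value method).
// All names below are local to the sketch.
package main

import (
	"context"
	"fmt"
)

type ctxKey int

const ctxSignalFunc ctxKey = iota

type Signal int

func signalFuncFromContext(ctx context.Context) func(Signal) error {
	// The unexported key type prevents collisions with other packages' keys.
	if f := ctx.Value(ctxSignalFunc); f != nil {
		return f.(func(Signal) error)
	}
	return nil
}

func main() {
	ctx := context.WithValue(context.Background(), ctxSignalFunc,
		func(s Signal) error { fmt.Println("would deliver signal", s); return nil })
	if f := signalFuncFromContext(ctx); f != nil {
		_ = f(Signal(9))
	}
}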
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by "stringer -type ndpOptionIdentifier"; DO NOT EDIT. package header import "strconv" func _() { // An "invalid array index" compiler error signifies that the constant values have changed. // Re-run the stringer command to generate them again. var x [1]struct{} _ = x[ndpSourceLinkLayerAddressOptionType-1] _ = x[ndpTargetLinkLayerAddressOptionType-2] _ = x[ndpPrefixInformationType-3] _ = x[ndpNonceOptionType-14] _ = x[ndpRecursiveDNSServerOptionType-25] _ = x[ndpDNSSearchListOptionType-31] } const ( _ndpOptionIdentifier_name_0 = "ndpSourceLinkLayerAddressOptionTypendpTargetLinkLayerAddressOptionTypendpPrefixInformationType" _ndpOptionIdentifier_name_1 = "ndpNonceOptionType" _ndpOptionIdentifier_name_2 = "ndpRecursiveDNSServerOptionType" _ndpOptionIdentifier_name_3 = "ndpDNSSearchListOptionType" ) var ( _ndpOptionIdentifier_index_0 = [...]uint8{0, 35, 70, 94} ) func (i ndpOptionIdentifier) String() string { switch { case 1 <= i && i <= 3: i -= 1 return _ndpOptionIdentifier_name_0[_ndpOptionIdentifier_index_0[i]:_ndpOptionIdentifier_index_0[i+1]] case i == 14: return _ndpOptionIdentifier_name_1 case i == 25: return _ndpOptionIdentifier_name_2 case i == 31: return _ndpOptionIdentifier_name_3 default: return "ndpOptionIdentifier(" + strconv.FormatInt(int64(i), 10) + ")" } }
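// The generated String method above avoids a []string by concatenating all
// names into one string plus an index table. A hedged toy version of the same
// technique (toy names, not generated code):
package main

import "fmt"

const colorNames = "redgreenblue"

// colorIndex[i]:colorIndex[i+1] brackets the i-th name inside colorNames.
var colorIndex = [...]uint8{0, 3, 8, 12}

func colorString(i int) string {
	if i < 0 || i >= len(colorIndex)-1 {
		return fmt.Sprintf("color(%d)", i)
	}
	return colorNames[colorIndex[i]:colorIndex[i+1]]
}

func main() {
	fmt.Println(colorString(0), colorString(1), colorString(2)) // red green blue
}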
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux // A Capability represents the ability to perform a privileged operation. type Capability int // Capabilities defined by Linux. Taken from the kernel's // include/uapi/linux/capability.h. See capabilities(7) or that file for more // detailed capability descriptions. const ( CAP_CHOWN = Capability(0) CAP_DAC_OVERRIDE = Capability(1) CAP_DAC_READ_SEARCH = Capability(2) CAP_FOWNER = Capability(3) CAP_FSETID = Capability(4) CAP_KILL = Capability(5) CAP_SETGID = Capability(6) CAP_SETUID = Capability(7) CAP_SETPCAP = Capability(8) CAP_LINUX_IMMUTABLE = Capability(9) CAP_NET_BIND_SERVICE = Capability(10) CAP_NET_BROADCAST = Capability(11) CAP_NET_ADMIN = Capability(12) CAP_NET_RAW = Capability(13) CAP_IPC_LOCK = Capability(14) CAP_IPC_OWNER = Capability(15) CAP_SYS_MODULE = Capability(16) CAP_SYS_RAWIO = Capability(17) CAP_SYS_CHROOT = Capability(18) CAP_SYS_PTRACE = Capability(19) CAP_SYS_PACCT = Capability(20) CAP_SYS_ADMIN = Capability(21) CAP_SYS_BOOT = Capability(22) CAP_SYS_NICE = Capability(23) CAP_SYS_RESOURCE = Capability(24) CAP_SYS_TIME = Capability(25) CAP_SYS_TTY_CONFIG = Capability(26) CAP_MKNOD = Capability(27) CAP_LEASE = Capability(28) CAP_AUDIT_WRITE = Capability(29) CAP_AUDIT_CONTROL = Capability(30) CAP_SETFCAP = Capability(31) CAP_MAC_OVERRIDE = Capability(32) CAP_MAC_ADMIN = Capability(33) CAP_SYSLOG = Capability(34) CAP_WAKE_ALARM = Capability(35) CAP_BLOCK_SUSPEND = Capability(36) CAP_AUDIT_READ = Capability(37) // CAP_LAST_CAP is the highest-numbered capability. // Search for "CAP_LAST_CAP" to find other places that need to change. CAP_LAST_CAP = CAP_AUDIT_READ ) // Ok returns true if cp is a supported capability. func (cp Capability) Ok() bool { return cp >= 0 && cp <= CAP_LAST_CAP } // String returns the capability name.
func (cp Capability) String() string { switch cp { case CAP_CHOWN: return "CAP_CHOWN" case CAP_DAC_OVERRIDE: return "CAP_DAC_OVERRIDE" case CAP_DAC_READ_SEARCH: return "CAP_DAC_READ_SEARCH" case CAP_FOWNER: return "CAP_FOWNER" case CAP_FSETID: return "CAP_FSETID" case CAP_KILL: return "CAP_KILL" case CAP_SETGID: return "CAP_SETGID" case CAP_SETUID: return "CAP_SETUID" case CAP_SETPCAP: return "CAP_SETPCAP" case CAP_LINUX_IMMUTABLE: return "CAP_LINUX_IMMUTABLE" case CAP_NET_BIND_SERVICE: return "CAP_NET_BIND_SERVICE" case CAP_NET_BROADCAST: return "CAP_NET_BROADCAST" case CAP_NET_ADMIN: return "CAP_NET_ADMIN" case CAP_NET_RAW: return "CAP_NET_RAW" case CAP_IPC_LOCK: return "CAP_IPC_LOCK" case CAP_IPC_OWNER: return "CAP_IPC_OWNER" case CAP_SYS_MODULE: return "CAP_SYS_MODULE" case CAP_SYS_RAWIO: return "CAP_SYS_RAWIO" case CAP_SYS_CHROOT: return "CAP_SYS_CHROOT" case CAP_SYS_PTRACE: return "CAP_SYS_PTRACE" case CAP_SYS_PACCT: return "CAP_SYS_PACCT" case CAP_SYS_ADMIN: return "CAP_SYS_ADMIN" case CAP_SYS_BOOT: return "CAP_SYS_BOOT" case CAP_SYS_NICE: return "CAP_SYS_NICE" case CAP_SYS_RESOURCE: return "CAP_SYS_RESOURCE" case CAP_SYS_TIME: return "CAP_SYS_TIME" case CAP_SYS_TTY_CONFIG: return "CAP_SYS_TTY_CONFIG" case CAP_MKNOD: return "CAP_MKNOD" case CAP_LEASE: return "CAP_LEASE" case CAP_AUDIT_WRITE: return "CAP_AUDIT_WRITE" case CAP_AUDIT_CONTROL: return "CAP_AUDIT_CONTROL" case CAP_SETFCAP: return "CAP_SETFCAP" case CAP_MAC_OVERRIDE: return "CAP_MAC_OVERRIDE" case CAP_MAC_ADMIN: return "CAP_MAC_ADMIN" case CAP_SYSLOG: return "CAP_SYSLOG" case CAP_WAKE_ALARM: return "CAP_WAKE_ALARM" case CAP_BLOCK_SUSPEND: return "CAP_BLOCK_SUSPEND" case CAP_AUDIT_READ: return "CAP_AUDIT_READ" default: return "UNKNOWN" } } // Version numbers used by the capget/capset syscalls, defined in Linux's // include/uapi/linux/capability.h. const ( // LINUX_CAPABILITY_VERSION_1 causes the data pointer to be // interpreted as a pointer to a single cap_user_data_t. Since capability // sets are 64 bits and the "capability sets" in cap_user_data_t are 32 // bits only, this causes the upper 32 bits to be implicitly 0. LINUX_CAPABILITY_VERSION_1 = 0x19980330 // LINUX_CAPABILITY_VERSION_2 and LINUX_CAPABILITY_VERSION_3 cause the // data pointer to be interpreted as a pointer to an array of 2 // cap_user_data_t, using the second to store the 32 MSB of each capability // set. Versions 2 and 3 are identical, but Linux printk's a warning on use // of version 2 due to a userspace API defect. LINUX_CAPABILITY_VERSION_2 = 0x20071026 LINUX_CAPABILITY_VERSION_3 = 0x20080522 // HighestCapabilityVersion is the highest supported // LINUX_CAPABILITY_VERSION_* version. HighestCapabilityVersion = LINUX_CAPABILITY_VERSION_3 ) // CapUserHeader is equivalent to Linux's cap_user_header_t. // // +marshal type CapUserHeader struct { Version uint32 Pid int32 } // CapUserData is equivalent to Linux's cap_user_data_t. // // +marshal slice:CapUserDataSlice type CapUserData struct { Effective uint32 Permitted uint32 Inheritable uint32 }
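// Hedged sketch (assumed caller code): under LINUX_CAPABILITY_VERSION_3 the
// 64-bit capability sets are split across two CapUserData entries, with the
// second entry holding the upper 32 bits, so reassembly is a shift-and-or.
// The struct is redeclared locally to keep the sketch self-contained.
package main

import "fmt"

type CapUserData struct {
	Effective   uint32
	Permitted   uint32
	Inheritable uint32
}

// effectiveSet recombines the low and high halves of the effective set.
func effectiveSet(data [2]CapUserData) uint64 {
	return uint64(data[0].Effective) | uint64(data[1].Effective)<<32
}

func main() {
	d := [2]CapUserData{{Effective: 0x1}, {Effective: 0x2}}
	fmt.Printf("%#x\n", effectiveSet(d)) // 0x200000001
}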
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package packet provides the implementation of packet sockets (see // packet(7)). Packet sockets allow applications to: // // * manually write and inspect link, network, and transport headers // * receive all traffic of a given network protocol, or all protocols // // Packet sockets are similar to raw sockets, but provide even more power to // users, letting them effectively talk directly to the network device. // // Packet sockets skip the input and output iptables chains. package packet import ( "fmt" "io" "time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) // +stateify savable type packet struct { packetEntry // data holds the actual packet data, including any headers and // payload.
data buffer.VectorisedView `state:".(buffer.VectorisedView)"` receivedAt time.Time `state:".(int64)"` // senderAddr is the network address of the sender. senderAddr tcpip.FullAddress // packetInfo holds additional information like the protocol // of the packet etc. packetInfo tcpip.LinkPacketInfo } // endpoint is the packet socket implementation of tcpip.Endpoint. It is legal // to have goroutines make concurrent calls into the endpoint. // // Lock order: // endpoint.mu // endpoint.rcvMu // // +stateify savable type endpoint struct { stack.TransportEndpointInfo tcpip.DefaultSocketOptionsHandler // The following fields are initialized at creation time and are // immutable. stack *stack.Stack `state:"manual"` netProto tcpip.NetworkProtocolNumber waiterQueue *waiter.Queue cooked bool // The following fields are used to manage the receive queue and are // protected by rcvMu. rcvMu sync.Mutex `state:"nosave"` rcvList packetList rcvBufSize int rcvClosed bool // The following fields are protected by mu. mu sync.RWMutex `state:"nosave"` closed bool stats tcpip.TransportEndpointStats `state:"nosave"` bound bool boundNIC tcpip.NICID // lastErrorMu protects lastError. lastErrorMu sync.Mutex `state:"nosave"` lastError tcpip.Error // ops is used to get socket level options. ops tcpip.SocketOptions // frozen indicates if the endpoint is frozen: while frozen (across // save/restore), incoming packets are dropped rather than delivered. frozen bool } // NewEndpoint returns a new packet endpoint. func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { ep := &endpoint{ stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{ NetProto: netProto, }, cooked: cooked, netProto: netProto, waiterQueue: waiterQueue, } ep.ops.InitHandler(ep, ep.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) ep.ops.SetReceiveBufferSize(32*1024, false /* notify */) // Override with stack defaults. var ss tcpip.SendBufferSizeOption if err := s.Option(&ss); err == nil { ep.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) } var rs tcpip.ReceiveBufferSizeOption if err := s.Option(&rs); err == nil { ep.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) } if err := s.RegisterPacketEndpoint(0, netProto, ep); err != nil { return nil, err } return ep, nil } // Abort implements stack.TransportEndpoint.Abort. func (ep *endpoint) Abort() { ep.Close() } // Close implements tcpip.Endpoint.Close. func (ep *endpoint) Close() { ep.mu.Lock() defer ep.mu.Unlock() if ep.closed { return } ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep) ep.rcvMu.Lock() defer ep.rcvMu.Unlock() // Clear the receive list. ep.rcvClosed = true ep.rcvBufSize = 0 for !ep.rcvList.Empty() { ep.rcvList.Remove(ep.rcvList.Front()) } ep.closed = true ep.bound = false ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. func (*endpoint) ModerateRecvBuf(int) {} // Read implements tcpip.Endpoint.Read. func (ep *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { ep.rcvMu.Lock() // If there's no data to read, return that read would block or that the // endpoint is closed.
if ep.rcvList.Empty() { var err tcpip.Error = &tcpip.ErrWouldBlock{} if ep.rcvClosed { ep.stats.ReadErrors.ReadClosed.Increment() err = &tcpip.ErrClosedForReceive{} } ep.rcvMu.Unlock() return tcpip.ReadResult{}, err } packet := ep.rcvList.Front() if !opts.Peek { ep.rcvList.Remove(packet) ep.rcvBufSize -= packet.data.Size() } ep.rcvMu.Unlock() res := tcpip.ReadResult{ Total: packet.data.Size(), ControlMessages: tcpip.ControlMessages{ HasTimestamp: true, Timestamp: packet.receivedAt.UnixNano(), }, } if opts.NeedRemoteAddr { res.RemoteAddr = packet.senderAddr } if opts.NeedLinkPacketInfo { res.LinkPacketInfo = packet.packetInfo } n, err := packet.data.ReadTo(dst, opts.Peek) if n == 0 && err != nil { return res, &tcpip.ErrBadBuffer{} } res.Count = n return res, nil } func (*endpoint) Write(tcpip.Payloader, tcpip.WriteOptions) (int64, tcpip.Error) { return 0, &tcpip.ErrInvalidOptionValue{} } // Disconnect implements tcpip.Endpoint.Disconnect. Packet sockets cannot be // disconnected, and this function always returns *tcpip.ErrNotSupported. func (*endpoint) Disconnect() tcpip.Error { return &tcpip.ErrNotSupported{} } // Connect implements tcpip.Endpoint.Connect. Packet sockets cannot be // connected, and this function always returns *tcpip.ErrNotSupported. func (*endpoint) Connect(tcpip.FullAddress) tcpip.Error { return &tcpip.ErrNotSupported{} } // Shutdown implements tcpip.Endpoint.Shutdown. Packet sockets cannot be used // with Shutdown, and this function always returns *tcpip.ErrNotSupported. func (*endpoint) Shutdown(tcpip.ShutdownFlags) tcpip.Error { return &tcpip.ErrNotSupported{} } // Listen implements tcpip.Endpoint.Listen. Packet sockets cannot be used with // Listen, and this function always returns *tcpip.ErrNotSupported. func (*endpoint) Listen(int) tcpip.Error { return &tcpip.ErrNotSupported{} } // Accept implements tcpip.Endpoint.Accept. Packet sockets cannot be used with // Accept, and this function always returns *tcpip.ErrNotSupported. func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { return nil, nil, &tcpip.ErrNotSupported{} } // Bind implements tcpip.Endpoint.Bind. func (ep *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { // "By default, all packets of the specified protocol type are passed // to a packet socket. To get packets only from a specific interface // use bind(2) specifying an address in a struct sockaddr_ll to bind // the packet socket to an interface. Fields used for binding are // sll_family (should be AF_PACKET), sll_protocol, and sll_ifindex." // - packet(7). ep.mu.Lock() defer ep.mu.Unlock() if ep.bound && ep.boundNIC == addr.NIC { // If the NIC being bound is the same then just return success. return nil } // Unregister endpoint with all the nics. ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep) ep.bound = false // Bind endpoint to receive packets from specific interface. if err := ep.stack.RegisterPacketEndpoint(addr.NIC, ep.netProto, ep); err != nil { return err } ep.bound = true ep.boundNIC = addr.NIC return nil } // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. func (*endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { return tcpip.FullAddress{}, &tcpip.ErrNotSupported{} } // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. func (*endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { // Even a connected socket doesn't return a remote address. return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} } // Readiness implements tcpip.Endpoint.Readiness.
func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // The endpoint is always writable. result := waiter.WritableEvents & mask // Determine whether the endpoint is readable. if (mask & waiter.ReadableEvents) != 0 { ep.rcvMu.Lock() if !ep.rcvList.Empty() || ep.rcvClosed { result |= waiter.ReadableEvents } ep.rcvMu.Unlock() } return result } // SetSockOpt implements tcpip.Endpoint.SetSockOpt. Packet sockets only // support the SocketDetachFilterOption; all other options return // *tcpip.ErrUnknownProtocolOption. func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { switch opt.(type) { case *tcpip.SocketDetachFilterOption: return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. func (*endpoint) SetSockOptInt(tcpip.SockOptInt, int) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } func (ep *endpoint) LastError() tcpip.Error { ep.lastErrorMu.Lock() defer ep.lastErrorMu.Unlock() err := ep.lastError ep.lastError = nil return err } // UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError. func (ep *endpoint) UpdateLastError(err tcpip.Error) { ep.lastErrorMu.Lock() ep.lastError = err ep.lastErrorMu.Unlock() } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) tcpip.Error { return &tcpip.ErrNotSupported{} } // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.ReceiveQueueSizeOption: v := 0 ep.rcvMu.Lock() if !ep.rcvList.Empty() { p := ep.rcvList.Front() v = p.data.Size() } ep.rcvMu.Unlock() return v, nil default: return -1, &tcpip.ErrUnknownProtocolOption{} } } // HandlePacket implements stack.PacketEndpoint.HandlePacket. func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { ep.rcvMu.Lock() // Drop the packet if the receive side is closed. if ep.rcvClosed { ep.rcvMu.Unlock() ep.stack.Stats().DroppedPackets.Increment() ep.stats.ReceiveErrors.ClosedReceiver.Increment() return } // Drop the packet if our buffer is currently full or delivery is frozen. rcvBufSize := ep.ops.GetReceiveBufferSize() if ep.frozen || ep.rcvBufSize >= int(rcvBufSize) { ep.rcvMu.Unlock() ep.stack.Stats().DroppedPackets.Increment() ep.stats.ReceiveErrors.ReceiveBufferOverflow.Increment() return } wasEmpty := ep.rcvBufSize == 0 // Push new packet into receive list and increment the buffer size. var packet packet if !pkt.LinkHeader().View().IsEmpty() { // Get info directly from the ethernet header. hdr := header.Ethernet(pkt.LinkHeader().View()) packet.senderAddr = tcpip.FullAddress{ NIC: nicID, Addr: tcpip.Address(hdr.SourceAddress()), } packet.packetInfo.Protocol = netProto packet.packetInfo.PktType = pkt.PktType } else { // Guess the would-be ethernet header. packet.senderAddr = tcpip.FullAddress{ NIC: nicID, Addr: tcpip.Address(localAddr), } packet.packetInfo.Protocol = netProto packet.packetInfo.PktType = pkt.PktType } if ep.cooked { // Cooked packets can simply be queued. switch pkt.PktType { case tcpip.PacketHost: packet.data = pkt.Data().ExtractVV() case tcpip.PacketOutgoing: // Strip Link Header.
var combinedVV buffer.VectorisedView if v := pkt.NetworkHeader().View(); !v.IsEmpty() { combinedVV.AppendView(v) } if v := pkt.TransportHeader().View(); !v.IsEmpty() { combinedVV.AppendView(v) } combinedVV.Append(pkt.Data().ExtractVV()) packet.data = combinedVV default: panic(fmt.Sprintf("unexpected PktType in pkt: %+v", pkt)) } } else { // Raw packets need their ethernet headers prepended before // queueing. var linkHeader buffer.View if pkt.PktType != tcpip.PacketOutgoing { if pkt.LinkHeader().View().IsEmpty() { // We weren't provided with an actual ethernet header, // so fake one. ethFields := header.EthernetFields{ SrcAddr: tcpip.LinkAddress([]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), DstAddr: localAddr, Type: netProto, } fakeHeader := make(header.Ethernet, header.EthernetMinimumSize) fakeHeader.Encode(&ethFields) linkHeader = buffer.View(fakeHeader) } else { linkHeader = append(buffer.View(nil), pkt.LinkHeader().View()...) } combinedVV := linkHeader.ToVectorisedView() combinedVV.Append(pkt.Data().ExtractVV()) packet.data = combinedVV } else { packet.data = buffer.NewVectorisedView(pkt.Size(), pkt.Views()) } } packet.receivedAt = ep.stack.Clock().Now() ep.rcvList.PushBack(&packet) ep.rcvBufSize += packet.data.Size() ep.rcvMu.Unlock() ep.stats.PacketsReceived.Increment() // Notify waiters that there's data to be read. if wasEmpty { ep.waiterQueue.Notify(waiter.ReadableEvents) } } // State implements socket.Socket.State. func (*endpoint) State() uint32 { return 0 } // Info returns a copy of the endpoint info. func (ep *endpoint) Info() tcpip.EndpointInfo { ep.mu.RLock() // Make a copy of the endpoint info. ret := ep.TransportEndpointInfo ep.mu.RUnlock() return &ret } // Stats returns a pointer to the endpoint stats. func (ep *endpoint) Stats() tcpip.EndpointStats { return &ep.stats } // SetOwner implements tcpip.Endpoint.SetOwner. func (*endpoint) SetOwner(tcpip.PacketOwner) {} // SocketOptions implements tcpip.Endpoint.SocketOptions. func (ep *endpoint) SocketOptions() *tcpip.SocketOptions { return &ep.ops } // freeze prevents any more packets from being delivered to the endpoint. func (ep *endpoint) freeze() { ep.mu.Lock() ep.frozen = true ep.mu.Unlock() } // thaw unfreezes an endpoint previously frozen by endpoint.freeze(), allowing // new packets to be delivered again. func (ep *endpoint) thaw() { ep.mu.Lock() ep.frozen = false ep.mu.Unlock() }
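// Hedged toy of the drop-while-frozen pattern that freeze/thaw and the frozen
// check in HandlePacket implement above (toy types only; the real methods are
// unexported and operate on the endpoint's receive list):
package main

import (
	"fmt"
	"sync"
)

type toyEndpoint struct {
	mu     sync.Mutex
	frozen bool
	queue  []string
}

func (e *toyEndpoint) handlePacket(pkt string) {
	e.mu.Lock()
	defer e.mu.Unlock()
	if e.frozen {
		return // Dropped, like the frozen check in HandlePacket.
	}
	e.queue = append(e.queue, pkt)
}

func (e *toyEndpoint) freeze() { e.mu.Lock(); e.frozen = true; e.mu.Unlock() }
func (e *toyEndpoint) thaw()   { e.mu.Lock(); e.frozen = false; e.mu.Unlock() }

func main() {
	var e toyEndpoint
	e.handlePacket("a")
	e.freeze()
	e.handlePacket("b") // dropped while frozen
	e.thaw()
	e.handlePacket("c")
	fmt.Println(e.queue) // [a c]
}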
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // +build go1.13 // +build !go1.18 // Check type signatures when updating Go version. // Package gohacks contains utilities for subverting the Go compiler. package gohacks import ( "unsafe" ) // SliceHeader is equivalent to reflect.SliceHeader, but represents the pointer // to the underlying array as unsafe.Pointer rather than uintptr, allowing // SliceHeaders to be directly converted to slice objects. type SliceHeader struct { Data unsafe.Pointer Len int Cap int } // StringHeader is equivalent to reflect.StringHeader, but represents the // pointer to the underlying array as unsafe.Pointer rather than uintptr, // allowing StringHeaders to be directly converted to strings. type StringHeader struct { Data unsafe.Pointer Len int } // Noescape hides a pointer from escape analysis. Noescape is the identity // function but escape analysis doesn't think the output depends on the input. // Noescape is inlined and currently compiles down to zero instructions. // USE CAREFULLY! // // (Noescape is copy/pasted from Go's runtime/stubs.go:noescape().) // //go:nosplit func Noescape(p unsafe.Pointer) unsafe.Pointer { x := uintptr(p) return unsafe.Pointer(x ^ 0) } // ImmutableBytesFromString is equivalent to []byte(s), except that it uses the // same memory backing s instead of making a heap-allocated copy. This is only // valid if the returned slice is never mutated. func ImmutableBytesFromString(s string) (bs []byte) { shdr := (*StringHeader)(unsafe.Pointer(&s)) bshdr := (*SliceHeader)(unsafe.Pointer(&bs)) bshdr.Data = shdr.Data bshdr.Len = shdr.Len bshdr.Cap = shdr.Len return } // StringFromImmutableBytes is equivalent to string(bs), except that it uses // the same memory backing bs instead of making a heap-allocated copy. This is // only valid if bs is never mutated after StringFromImmutableBytes returns. func StringFromImmutableBytes(bs []byte) string { // This is cheaper than messing with StringHeader and SliceHeader, which as // of this writing produces many dead stores of zeroes. Compare // strings.Builder.String(). return *(*string)(unsafe.Pointer(&bs)) } // Note that go:linkname silently doesn't work if the local name is exported, // necessitating an indirection for exported functions. // Memmove is runtime.memmove, exported for SeqAtomicLoad/SeqAtomicTryLoad<T>. // //go:nosplit func Memmove(to, from unsafe.Pointer, n uintptr) { memmove(to, from, n) } //go:linkname memmove runtime.memmove //go:noescape func memmove(to, from unsafe.Pointer, n uintptr)
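// Hedged usage sketch for the zero-copy conversions above; safe only if the
// returned slice is never mutated, since it aliases the string's memory.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/gohacks"
)

func main() {
	s := "hello"
	bs := gohacks.ImmutableBytesFromString(s) // shares s's backing array; do not mutate
	fmt.Println(len(bs), cap(bs))             // 5 5
	fmt.Println(gohacks.StringFromImmutableBytes(bs)) // hello again, no copy
}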
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernfs

// This file implements vfs.FilesystemImpl for kernfs.

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
)

// stepExistingLocked resolves rp.Component() in parent directory d.
//
// stepExistingLocked is loosely analogous to fs/namei.c:walk_component().
//
// Preconditions:
// * Filesystem.mu must be locked for at least reading.
// * !rp.Done().
//
// Postcondition: Caller must call fs.processDeferredDecRefs*.
func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, mayFollowSymlinks bool) (*Dentry, error) {
	if !d.isDir() {
		return nil, linuxerr.ENOTDIR
	}
	// Directory searchable?
	if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
		return nil, err
	}
afterSymlink:
	name := rp.Component()
	// Revalidation must be skipped if name is "." or ".."; d or its parent
	// respectively can't be expected to transition from invalidated back to
	// valid, so detecting invalidation and retrying would loop forever. This
	// is consistent with Linux: fs/namei.c:walk_component() => lookup_fast()
	// calls d_revalidate(), but walk_component() => handle_dots() does not.
	if name == "." {
		rp.Advance()
		return d, nil
	}
	if name == ".." {
		if isRoot, err := rp.CheckRoot(ctx, d.VFSDentry()); err != nil {
			return nil, err
		} else if isRoot || d.parent == nil {
			rp.Advance()
			return d, nil
		}
		if err := rp.CheckMount(ctx, d.parent.VFSDentry()); err != nil {
			return nil, err
		}
		rp.Advance()
		return d.parent, nil
	}
	if len(name) > linux.NAME_MAX {
		return nil, linuxerr.ENAMETOOLONG
	}
	d.dirMu.Lock()
	next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, d.children[name])
	d.dirMu.Unlock()
	if err != nil {
		return nil, err
	}
	if err := rp.CheckMount(ctx, next.VFSDentry()); err != nil {
		return nil, err
	}
	// Resolve any symlink at current path component.
	if mayFollowSymlinks && rp.ShouldFollowSymlink() && next.isSymlink() {
		targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount())
		if err != nil {
			return nil, err
		}
		if targetVD.Ok() {
			err := rp.HandleJump(targetVD)
			fs.deferDecRefVD(ctx, targetVD)
			if err != nil {
				return nil, err
			}
		} else {
			if err := rp.HandleSymlink(targetPathname); err != nil {
				return nil, err
			}
		}
		goto afterSymlink
	}
	rp.Advance()
	return next, nil
}

// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
// nil) to verify that the returned child (or lack thereof) is correct.
//
// Preconditions:
// * Filesystem.mu must be locked for at least reading.
// * parent.dirMu must be locked.
// * parent.isDir().
// * name is not "." or "..".
//
// Postconditions: Caller must call fs.processDeferredDecRefs*.
func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) {
	if child != nil {
		// Cached dentry exists, revalidate.
		if !child.inode.Valid(ctx) {
			delete(parent.children, name)
			if child.inode.Keep() {
				// Drop the ref owned by kernfs.
				fs.deferDecRef(child)
			}
			vfsObj.InvalidateDentry(ctx, child.VFSDentry())
			child = nil
		}
	}
	if child == nil {
		// Dentry isn't cached; it either doesn't exist or failed revalidation.
		// Attempt to resolve it via Lookup.
		childInode, err := parent.inode.Lookup(ctx, name)
		if err != nil {
			return nil, err
		}
		var newChild Dentry
		newChild.Init(fs, childInode) // childInode's ref is transferred to newChild.
		parent.insertChildLocked(name, &newChild)
		child = &newChild

		// Drop the ref on newChild. This will cause the dentry to get pruned
		// from the dentry tree by the end of the current filesystem operation
		// (before returning to the VFS layer) if another ref is not picked up
		// on this dentry.
		if !childInode.Keep() {
			fs.deferDecRef(&newChild)
		}
	}
	return child, nil
}

// walkExistingLocked resolves rp to an existing file.
//
// walkExistingLocked is loosely analogous to Linux's
// fs/namei.c:path_lookupat().
//
// Preconditions: Filesystem.mu must be locked for at least reading.
//
// Postconditions: Caller must call fs.processDeferredDecRefs*.
func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) {
	d := rp.Start().Impl().(*Dentry)
	for !rp.Done() {
		var err error
		d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */)
		if err != nil {
			return nil, err
		}
	}
	if rp.MustBeDir() && !d.isDir() {
		return nil, linuxerr.ENOTDIR
	}
	return d, nil
}

// walkParentDirLocked resolves all but the last path component of rp to an
// existing directory. It does not check that the returned directory is
// searchable by the provider of rp.
//
// walkParentDirLocked is loosely analogous to Linux's
// fs/namei.c:path_parentat().
//
// Preconditions:
// * Filesystem.mu must be locked for at least reading.
// * !rp.Done().
//
// Postconditions: Caller must call fs.processDeferredDecRefs*.
func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) {
	d := rp.Start().Impl().(*Dentry)
	for !rp.Final() {
		var err error
		d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */)
		if err != nil {
			return nil, err
		}
	}
	if !d.isDir() {
		return nil, linuxerr.ENOTDIR
	}
	return d, nil
}

// checkCreateLocked checks that a file named name may be created in directory
// parent.
//
// Preconditions:
// * Filesystem.mu must be locked for at least reading.
// * isDir(parentInode) == true.
func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string, parent *Dentry) error {
	// Order of checks is important. First check if parent directory can be
	// executed, then check for existence, and lastly check if mount is writable.
	if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayExec); err != nil {
		return err
	}
	if name == "." || name == ".." {
		return linuxerr.EEXIST
	}
	if len(name) > linux.NAME_MAX {
		return linuxerr.ENAMETOOLONG
	}
	if _, ok := parent.children[name]; ok {
		return linuxerr.EEXIST
	}
	if parent.VFSDentry().IsDead() {
		return syserror.ENOENT
	}
	if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite); err != nil {
		return err
	}
	return nil
}

// checkDeleteLocked checks that the file represented by vfsd may be deleted.
//
// Preconditions: Filesystem.mu must be locked for at least reading.
func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error {
	parent := d.parent
	if parent == nil {
		return linuxerr.EBUSY
	}
	if parent.vfsd.IsDead() {
		return syserror.ENOENT
	}
	if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	return nil
}

// Release implements vfs.FilesystemImpl.Release.
func (fs *Filesystem) Release(ctx context.Context) {
	root := fs.root
	if root == nil {
		return
	}
	fs.mu.Lock()
	root.releaseKeptDentriesLocked(ctx)
	for fs.cachedDentriesLen != 0 {
		fs.evictCachedDentryLocked(ctx)
	}
	fs.mu.Unlock()
	// Drop ref acquired in Dentry.InitRoot().
	root.DecRef(ctx)
}

// releaseKeptDentriesLocked recursively drops all dentry references created by
// Lookup when Dentry.inode.Keep() is true.
//
// Precondition: Filesystem.mu is held.
func (d *Dentry) releaseKeptDentriesLocked(ctx context.Context) {
	if d.inode.Keep() && d != d.fs.root {
		d.decRefLocked(ctx)
	}
	if d.isDir() {
		var children []*Dentry
		d.dirMu.Lock()
		for _, child := range d.children {
			children = append(children, child)
		}
		d.dirMu.Unlock()
		for _, child := range children {
			child.releaseKeptDentriesLocked(ctx)
		}
	}
}

// Sync implements vfs.FilesystemImpl.Sync.
func (fs *Filesystem) Sync(ctx context.Context) error {
	// All filesystem state is in-memory.
	return nil
}

// AccessAt implements vfs.FilesystemImpl.AccessAt.
func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
	fs.mu.RLock()
	defer fs.processDeferredDecRefs(ctx)
	defer fs.mu.RUnlock()

	d, err := fs.walkExistingLocked(ctx, rp)
	if err != nil {
		return err
	}
	return d.inode.CheckPermissions(ctx, creds, ats)
}

// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
	fs.mu.RLock()
	defer fs.processDeferredDecRefs(ctx)
	defer fs.mu.RUnlock()
	d, err := fs.walkExistingLocked(ctx, rp)
	if err != nil {
		return nil, err
	}

	if opts.CheckSearchable {
		if !d.isDir() {
			return nil, linuxerr.ENOTDIR
		}
		if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
			return nil, err
		}
	}
	vfsd := d.VFSDentry()
	vfsd.IncRef() // Ownership transferred to caller.
	return vfsd, nil
}

// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() d, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return nil, err } d.IncRef() // Ownership transferred to caller. return d.VFSDentry(), nil } // LinkAt implements vfs.FilesystemImpl.LinkAt. func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { if rp.Done() { return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } parent.dirMu.Lock() defer parent.dirMu.Unlock() pc := rp.Component() if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil { return err } if rp.MustBeDir() { return syserror.ENOENT } if rp.Mount() != vd.Mount() { return linuxerr.EXDEV } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() d := vd.Dentry().Impl().(*Dentry) if d.isDir() { return linuxerr.EPERM } childI, err := parent.inode.NewLink(ctx, pc, d.inode) if err != nil { return err } var child Dentry child.Init(fs, childI) parent.insertChildLocked(pc, &child) return nil } // MkdirAt implements vfs.FilesystemImpl.MkdirAt. func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { if rp.Done() { return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } parent.dirMu.Lock() defer parent.dirMu.Unlock() pc := rp.Component() if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil { return err } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() childI, err := parent.inode.NewDir(ctx, pc, opts) if err != nil { if !opts.ForSyntheticMountpoint || linuxerr.Equals(linuxerr.EEXIST, err) { return err } childI = newSyntheticDirectory(ctx, rp.Credentials(), opts.Mode) } var child Dentry child.Init(fs, childI) parent.insertChildLocked(pc, &child) return nil } // MknodAt implements vfs.FilesystemImpl.MknodAt. func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { if rp.Done() { return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } parent.dirMu.Lock() defer parent.dirMu.Unlock() pc := rp.Component() if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil { return err } if rp.MustBeDir() { return syserror.ENOENT } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() newI, err := parent.inode.NewNode(ctx, pc, opts) if err != nil { return err } var newD Dentry newD.Init(fs, newI) parent.insertChildLocked(pc, &newD) return nil } // OpenAt implements vfs.FilesystemImpl.OpenAt. func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { // Filter out flags that are not supported by kernfs. O_DIRECTORY and // O_NOFOLLOW have no effect here (they're handled by VFS by setting // appropriate bits in rp), but are returned by // FileDescriptionImpl.StatusFlags(). 
opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY ats := vfs.AccessTypesForOpenFlags(&opts) // Do not create new file. if opts.Flags&linux.O_CREAT == 0 { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) d, err := fs.walkExistingLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return nil, err } if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { fs.mu.RUnlock() return nil, err } // Open may block so we need to unlock fs.mu. IncRef d to prevent // its destruction while fs.mu is unlocked. d.IncRef() fs.mu.RUnlock() fd, err := d.inode.Open(ctx, rp, d, opts) d.DecRef(ctx) return fd, err } // May create new file. mustCreate := opts.Flags&linux.O_EXCL != 0 d := rp.Start().Impl().(*Dentry) fs.mu.Lock() unlocked := false unlock := func() { if !unlocked { fs.mu.Unlock() unlocked = true } } // Process all to-be-decref'd dentries at the end at once. // Since we defer unlock() AFTER this, fs.mu is guaranteed to be unlocked // when this is executed. defer fs.processDeferredDecRefs(ctx) defer unlock() if rp.Done() { if rp.MustBeDir() { return nil, syserror.EISDIR } if mustCreate { return nil, linuxerr.EEXIST } if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } // Open may block so we need to unlock fs.mu. IncRef d to prevent // its destruction while fs.mu is unlocked. d.IncRef() unlock() fd, err := d.inode.Open(ctx, rp, d, opts) d.DecRef(ctx) return fd, err } afterTrailingSymlink: parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return nil, err } // Check for search permission in the parent directory. if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { return nil, err } // Reject attempts to open directories with O_CREAT. if rp.MustBeDir() { return nil, syserror.EISDIR } pc := rp.Component() if pc == "." || pc == ".." { return nil, syserror.EISDIR } if len(pc) > linux.NAME_MAX { return nil, linuxerr.ENAMETOOLONG } // Determine whether or not we need to create a file. child, err := fs.stepExistingLocked(ctx, rp, parent, false /* mayFollowSymlinks */) if linuxerr.Equals(linuxerr.ENOENT, err) { // Already checked for searchability above; now check for writability. if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } if err := rp.Mount().CheckBeginWrite(); err != nil { return nil, err } defer rp.Mount().EndWrite() // Create and open the child. childI, err := parent.inode.NewFile(ctx, pc, opts) if err != nil { return nil, err } var child Dentry child.Init(fs, childI) parent.insertChild(pc, &child) // Open may block so we need to unlock fs.mu. IncRef child to prevent // its destruction while fs.mu is unlocked. child.IncRef() unlock() fd, err := child.inode.Open(ctx, rp, &child, opts) child.DecRef(ctx) return fd, err } if err != nil { return nil, err } // Open existing file or follow symlink. if mustCreate { return nil, linuxerr.EEXIST } if rp.ShouldFollowSymlink() && child.isSymlink() { targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount()) if err != nil { return nil, err } if targetVD.Ok() { err := rp.HandleJump(targetVD) fs.deferDecRefVD(ctx, targetVD) if err != nil { return nil, err } } else { if err := rp.HandleSymlink(targetPathname); err != nil { return nil, err } } // rp.Final() may no longer be true since we now need to resolve the // symlink target. 
goto afterTrailingSymlink } if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { return nil, err } // Open may block so we need to unlock fs.mu. IncRef child to prevent // its destruction while fs.mu is unlocked. child.IncRef() unlock() fd, err := child.inode.Open(ctx, rp, child, opts) child.DecRef(ctx) return fd, err } // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { defer fs.processDeferredDecRefs(ctx) fs.mu.RLock() d, err := fs.walkExistingLocked(ctx, rp) if err != nil { fs.mu.RUnlock() return "", err } if !d.isSymlink() { fs.mu.RUnlock() return "", linuxerr.EINVAL } // Inode.Readlink() cannot be called holding fs locks. d.IncRef() defer d.DecRef(ctx) fs.mu.RUnlock() return d.inode.Readlink(ctx, rp.Mount()) } // RenameAt implements vfs.FilesystemImpl.RenameAt. func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() // Resolve the destination directory first to verify that it's on this // Mount. dstDir, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } // Only RENAME_NOREPLACE is supported. if opts.Flags&^linux.RENAME_NOREPLACE != 0 { return linuxerr.EINVAL } noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 mnt := rp.Mount() if mnt != oldParentVD.Mount() { return linuxerr.EXDEV } if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() srcDirVFSD := oldParentVD.Dentry() srcDir := srcDirVFSD.Impl().(*Dentry) srcDir.dirMu.Lock() src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDir.children[oldName]) srcDir.dirMu.Unlock() if err != nil { return err } // Can we remove the src dentry? if err := checkDeleteLocked(ctx, rp, src); err != nil { return err } // Can we create the dst dentry? var dst *Dentry newName := rp.Component() if newName == "." || newName == ".." { if noReplace { return linuxerr.EEXIST } return linuxerr.EBUSY } err = checkCreateLocked(ctx, rp.Credentials(), newName, dstDir) switch { case err == nil: // Ok, continue with rename as replacement. case linuxerr.Equals(linuxerr.EEXIST, err): if noReplace { // Won't overwrite existing node since RENAME_NOREPLACE was requested. return linuxerr.EEXIST } dst = dstDir.children[newName] if dst == nil { panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", newName, dstDir)) } default: return err } if srcDir == dstDir && oldName == newName { return nil } var dstVFSD *vfs.Dentry if dst != nil { dstVFSD = dst.VFSDentry() } mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) virtfs := rp.VirtualFilesystem() // We can't deadlock here due to lock ordering because we're protected from // concurrent renames by fs.mu held for writing. srcDir.dirMu.Lock() defer srcDir.dirMu.Unlock() if srcDir != dstDir { dstDir.dirMu.Lock() defer dstDir.dirMu.Unlock() } srcVFSD := src.VFSDentry() if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { return err } err = srcDir.inode.Rename(ctx, src.name, newName, src.inode, dstDir.inode) if err != nil { virtfs.AbortRenameDentry(srcVFSD, dstVFSD) return err } delete(srcDir.children, src.name) if srcDir != dstDir { fs.deferDecRef(srcDir) // child (src) drops ref on old parent. dstDir.IncRef() // child (src) takes a ref on the new parent. 
	}
	src.parent = dstDir
	src.name = newName
	if dstDir.children == nil {
		dstDir.children = make(map[string]*Dentry)
	}
	replaced := dstDir.children[newName]
	dstDir.children[newName] = src
	var replaceVFSD *vfs.Dentry
	if replaced != nil {
		// deferDecRef so that fs.mu and dstDir.mu are unlocked by then.
		fs.deferDecRef(replaced)
		replaceVFSD = replaced.VFSDentry()
	}
	virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) // +checklocksforce: replaceVFSD may be nil, that's okay.
	return nil
}

// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
	fs.mu.Lock()
	defer fs.processDeferredDecRefs(ctx)
	defer fs.mu.Unlock()
	d, err := fs.walkExistingLocked(ctx, rp)
	if err != nil {
		return err
	}
	if err := rp.Mount().CheckBeginWrite(); err != nil {
		return err
	}
	defer rp.Mount().EndWrite()
	if err := checkDeleteLocked(ctx, rp, d); err != nil {
		return err
	}
	if !d.isDir() {
		return linuxerr.ENOTDIR
	}
	if d.inode.HasChildren() {
		return linuxerr.ENOTEMPTY
	}
	virtfs := rp.VirtualFilesystem()
	parentDentry := d.parent
	parentDentry.dirMu.Lock()
	defer parentDentry.dirMu.Unlock()

	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	vfsd := d.VFSDentry()
	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
		return err // +checklocksforce: vfsd is not locked.
	}

	if err := parentDentry.inode.RmDir(ctx, d.name, d.inode); err != nil {
		virtfs.AbortDeleteDentry(vfsd)
		return err
	}
	delete(parentDentry.children, d.name)
	// Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then.
	fs.deferDecRef(d)
	virtfs.CommitDeleteDentry(ctx, vfsd)
	return nil
}

// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
	fs.mu.RLock()
	defer fs.processDeferredDecRefs(ctx)
	defer fs.mu.RUnlock()
	d, err := fs.walkExistingLocked(ctx, rp)
	if err != nil {
		return err
	}
	if opts.Stat.Mask == 0 {
		return nil
	}
	return d.inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
}

// StatAt implements vfs.FilesystemImpl.StatAt.
func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
	fs.mu.RLock()
	defer fs.processDeferredDecRefs(ctx)
	defer fs.mu.RUnlock()
	d, err := fs.walkExistingLocked(ctx, rp)
	if err != nil {
		return linux.Statx{}, err
	}
	return d.inode.Stat(ctx, fs.VFSFilesystem(), opts)
}

// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
	fs.mu.RLock()
	defer fs.processDeferredDecRefs(ctx)
	defer fs.mu.RUnlock()
	d, err := fs.walkExistingLocked(ctx, rp)
	if err != nil {
		return linux.Statfs{}, err
	}
	return d.inode.StatFS(ctx, fs.VFSFilesystem())
}

// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { if rp.Done() { return linuxerr.EEXIST } fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() parent, err := fs.walkParentDirLocked(ctx, rp) if err != nil { return err } parent.dirMu.Lock() defer parent.dirMu.Unlock() pc := rp.Component() if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil { return err } if rp.MustBeDir() { return syserror.ENOENT } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() childI, err := parent.inode.NewSymlink(ctx, pc, target) if err != nil { return err } var child Dentry child.Init(fs, childI) parent.insertChildLocked(pc, &child) return nil } // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { fs.mu.Lock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.Unlock() d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } if err := rp.Mount().CheckBeginWrite(); err != nil { return err } defer rp.Mount().EndWrite() if err := checkDeleteLocked(ctx, rp, d); err != nil { return err } if d.isDir() { return syserror.EISDIR } virtfs := rp.VirtualFilesystem() parentDentry := d.parent parentDentry.dirMu.Lock() defer parentDentry.dirMu.Unlock() mntns := vfs.MountNamespaceFromContext(ctx) defer mntns.DecRef(ctx) vfsd := d.VFSDentry() if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { return err } if err := parentDentry.inode.Unlink(ctx, d.name, d.inode); err != nil { virtfs.AbortDeleteDentry(vfsd) return err } delete(parentDentry.children, d.name) // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then. fs.deferDecRef(d) virtfs.CommitDeleteDentry(ctx, vfsd) return nil } // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() d, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { return nil, err } return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return nil, err } // kernfs currently does not support extended attributes. return nil, linuxerr.ENOTSUP } // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return "", err } // kernfs currently does not support extended attributes. return "", linuxerr.ENOTSUP } // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { fs.mu.RLock() defer fs.processDeferredDecRefs(ctx) defer fs.mu.RUnlock() _, err := fs.walkExistingLocked(ctx, rp) if err != nil { return err } // kernfs currently does not support extended attributes. 
	return linuxerr.ENOTSUP
}

// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
	fs.mu.RLock()
	defer fs.processDeferredDecRefs(ctx)
	defer fs.mu.RUnlock()
	_, err := fs.walkExistingLocked(ctx, rp)
	if err != nil {
		return err
	}
	// kernfs currently does not support extended attributes.
	return linuxerr.ENOTSUP
}

// PrependPath implements vfs.FilesystemImpl.PrependPath.
func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b)
}

func (fs *Filesystem) deferDecRefVD(ctx context.Context, vd vfs.VirtualDentry) {
	if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs {
		// The following is equivalent to vd.DecRef(ctx). This is needed
		// because if d belongs to this filesystem, we cannot DecRef it right
		// away as we may be holding fs.mu; d.DecRef may acquire fs.mu. So we
		// defer the DecRef until the locks are dropped.
		vd.Mount().DecRef(ctx)
		fs.deferDecRef(d)
	} else {
		vd.DecRef(ctx)
	}
}
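// Throughout the file above, methods defer fs.processDeferredDecRefs(ctx)
// before deferring the mutex unlock. This works because Go runs deferred
// calls last-in-first-out, so the lock is released before the decrefs are
// processed. A tiny standalone illustration (not gVisor code):

package main

import "fmt"

func main() {
	// Mirrors `defer fs.processDeferredDecRefs(ctx)` followed by
	// `defer fs.mu.Unlock()`: the defer registered later runs first.
	defer fmt.Println("2: process deferred DecRefs (lock already released)")
	defer fmt.Println("1: release lock")
	fmt.Println("0: do work under lock")
}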
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"math"
	"strings"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/syserror"
)

// AccessTypes is a bitmask of Unix file permissions.
//
// +stateify savable
type AccessTypes uint16

// Bits in AccessTypes.
const (
	MayExec  AccessTypes = 1
	MayWrite AccessTypes = 2
	MayRead  AccessTypes = 4
)

// OnlyRead returns true if access _only_ allows read.
func (a AccessTypes) OnlyRead() bool {
	return a == MayRead
}

// MayRead returns true if access allows read.
func (a AccessTypes) MayRead() bool {
	return a&MayRead != 0
}

// MayWrite returns true if access allows write.
func (a AccessTypes) MayWrite() bool {
	return a&MayWrite != 0
}

// MayExec returns true if access allows exec.
func (a AccessTypes) MayExec() bool {
	return a&MayExec != 0
}

// GenericCheckPermissions checks that creds has the given access rights on a
// file with the given permissions, UID, and GID, subject to the rules of
// fs/namei.c:generic_permission().
func GenericCheckPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error {
	// Check permission bits.
	perms := uint16(mode.Permissions())
	if creds.EffectiveKUID == kuid {
		perms >>= 6
	} else if creds.InGroup(kgid) {
		perms >>= 3
	}
	if uint16(ats)&perms == uint16(ats) {
		// All permission bits match, access granted.
return nil } // Caller capabilities require that the file's KUID and KGID are mapped in // the caller's user namespace; compare // kernel/capability.c:privileged_wrt_inode_uidgid(). if !kuid.In(creds.UserNamespace).Ok() || !kgid.In(creds.UserNamespace).Ok() { return linuxerr.EACCES } // CAP_DAC_READ_SEARCH allows the caller to read and search arbitrary // directories, and read arbitrary non-directory files. if (mode.IsDir() && !ats.MayWrite()) || ats.OnlyRead() { if creds.HasCapability(linux.CAP_DAC_READ_SEARCH) { return nil } } // CAP_DAC_OVERRIDE allows arbitrary access to directories, read/write // access to non-directory files, and execute access to non-directory files // for which at least one execute bit is set. if mode.IsDir() || !ats.MayExec() || (mode.Permissions()&0111 != 0) { if creds.HasCapability(linux.CAP_DAC_OVERRIDE) { return nil } } return linuxerr.EACCES } // MayLink determines whether creating a hard link to a file with the given // mode, kuid, and kgid is permitted. // // This corresponds to Linux's fs/namei.c:may_linkat. func MayLink(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { // Source inode owner can hardlink all they like; otherwise, it must be a // safe source. if CanActAsOwner(creds, kuid) { return nil } // Only regular files can be hard linked. if mode.FileType() != linux.S_IFREG { return linuxerr.EPERM } // Setuid files should not get pinned to the filesystem. if mode&linux.S_ISUID != 0 { return linuxerr.EPERM } // Executable setgid files should not get pinned to the filesystem, but we // don't support S_IXGRP anyway. // Hardlinking to unreadable or unwritable sources is dangerous. if err := GenericCheckPermissions(creds, MayRead|MayWrite, mode, kuid, kgid); err != nil { return linuxerr.EPERM } return nil } // AccessTypesForOpenFlags returns the access types required to open a file // with the given OpenOptions.Flags. Note that this is NOT the same thing as // the set of accesses permitted for the opened file: // // - O_TRUNC causes MayWrite to be set in the returned AccessTypes (since it // mutates the file), but does not permit writing to the open file description // thereafter. // // - "Linux reserves the special, nonstandard access mode 3 (binary 11) in // flags to mean: check for read and write permission on the file and return a // file descriptor that can't be used for reading or writing." - open(2). Thus // AccessTypesForOpenFlags returns MayRead|MayWrite in this case. // // Use May{Read,Write}FileWithOpenFlags() for these checks instead. func AccessTypesForOpenFlags(opts *OpenOptions) AccessTypes { ats := AccessTypes(0) if opts.FileExec { ats |= MayExec } switch opts.Flags & linux.O_ACCMODE { case linux.O_RDONLY: if opts.Flags&linux.O_TRUNC != 0 { return ats | MayRead | MayWrite } return ats | MayRead case linux.O_WRONLY: return ats | MayWrite default: return ats | MayRead | MayWrite } } // MayReadFileWithOpenFlags returns true if a file with the given open flags // should be readable. func MayReadFileWithOpenFlags(flags uint32) bool { switch flags & linux.O_ACCMODE { case linux.O_RDONLY, linux.O_RDWR: return true default: return false } } // MayWriteFileWithOpenFlags returns true if a file with the given open flags // should be writable. 
func MayWriteFileWithOpenFlags(flags uint32) bool { switch flags & linux.O_ACCMODE { case linux.O_WRONLY, linux.O_RDWR: return true default: return false } } // CheckSetStat checks that creds has permission to change the metadata of a // file with the given permissions, UID, and GID as specified by stat, subject // to the rules of Linux's fs/attr.c:setattr_prepare(). func CheckSetStat(ctx context.Context, creds *auth.Credentials, opts *SetStatOptions, mode linux.FileMode, kuid auth.KUID, kgid auth.KGID) error { stat := &opts.Stat if stat.Mask&linux.STATX_SIZE != 0 { limit, err := CheckLimit(ctx, 0, int64(stat.Size)) if err != nil { return err } if limit < int64(stat.Size) { return syserror.ErrExceedsFileSizeLimit } } if stat.Mask&linux.STATX_MODE != 0 { if !CanActAsOwner(creds, kuid) { return linuxerr.EPERM } // TODO(b/30815691): "If the calling process is not privileged (Linux: // does not have the CAP_FSETID capability), and the group of the file // does not match the effective group ID of the process or one of its // supplementary group IDs, the S_ISGID bit will be turned off, but // this will not cause an error to be returned." - chmod(2) } if stat.Mask&linux.STATX_UID != 0 { if !((creds.EffectiveKUID == kuid && auth.KUID(stat.UID) == kuid) || HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) { return linuxerr.EPERM } } if stat.Mask&linux.STATX_GID != 0 { if !((creds.EffectiveKUID == kuid && creds.InGroup(auth.KGID(stat.GID))) || HasCapabilityOnFile(creds, linux.CAP_CHOWN, kuid, kgid)) { return linuxerr.EPERM } } if opts.NeedWritePerm && !creds.HasCapability(linux.CAP_DAC_OVERRIDE) { if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil { return err } } if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME) != 0 { if !CanActAsOwner(creds, kuid) { if (stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW) || (stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW) || (stat.Mask&linux.STATX_CTIME != 0 && stat.Ctime.Nsec != linux.UTIME_NOW) { return linuxerr.EPERM } if err := GenericCheckPermissions(creds, MayWrite, mode, kuid, kgid); err != nil { return err } } } return nil } // CheckDeleteSticky checks whether the sticky bit is set on a directory with // the given file mode, and if so, checks whether creds has permission to // remove a file owned by childKUID from a directory with the given mode. // CheckDeleteSticky is consistent with fs/linux.h:check_sticky(). func CheckDeleteSticky(creds *auth.Credentials, parentMode linux.FileMode, parentKUID auth.KUID, childKUID auth.KUID, childKGID auth.KGID) error { if parentMode&linux.ModeSticky == 0 { return nil } if creds.EffectiveKUID == childKUID || creds.EffectiveKUID == parentKUID || HasCapabilityOnFile(creds, linux.CAP_FOWNER, childKUID, childKGID) { return nil } return linuxerr.EPERM } // CanActAsOwner returns true if creds can act as the owner of a file with the // given owning UID, consistent with Linux's // fs/inode.c:inode_owner_or_capable(). func CanActAsOwner(creds *auth.Credentials, kuid auth.KUID) bool { if creds.EffectiveKUID == kuid { return true } return creds.HasCapability(linux.CAP_FOWNER) && creds.UserNamespace.MapFromKUID(kuid).Ok() } // HasCapabilityOnFile returns true if creds has the given capability with // respect to a file with the given owning UID and GID, consistent with Linux's // kernel/capability.c:capable_wrt_inode_uidgid(). 
func HasCapabilityOnFile(creds *auth.Credentials, cp linux.Capability, kuid auth.KUID, kgid auth.KGID) bool {
	return creds.HasCapability(cp) && creds.UserNamespace.MapFromKUID(kuid).Ok() && creds.UserNamespace.MapFromKGID(kgid).Ok()
}

// CheckLimit enforces file size rlimits. It returns an error if the write
// operation must not proceed. Otherwise it returns the maximum length allowed
// without violating the limit.
func CheckLimit(ctx context.Context, offset, size int64) (int64, error) {
	fileSizeLimit := limits.FromContextOrDie(ctx).Get(limits.FileSize).Cur
	if fileSizeLimit > math.MaxInt64 {
		return size, nil
	}
	if offset >= int64(fileSizeLimit) {
		return 0, syserror.ErrExceedsFileSizeLimit
	}
	remaining := int64(fileSizeLimit) - offset
	if remaining < size {
		return remaining, nil
	}
	return size, nil
}

// CheckXattrPermissions checks permissions for extended attribute access.
// This is analogous to fs/xattr.c:xattr_permission(). Some key differences:
// * Does not check for the read-only filesystem property.
// * Does not check inode immutability or append-only mode. In both cases EPERM
//   must be returned by filesystem implementations.
// * Does not do inode permission checks. Filesystem implementations should
//   handle inode permission checks as they may differ across implementations.
func CheckXattrPermissions(creds *auth.Credentials, ats AccessTypes, mode linux.FileMode, kuid auth.KUID, name string) error {
	switch {
	case strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX):
		// The trusted.* namespace can only be accessed by privileged
		// users.
		if creds.HasCapability(linux.CAP_SYS_ADMIN) {
			return nil
		}
		if ats.MayWrite() {
			return linuxerr.EPERM
		}
		return linuxerr.ENODATA
	case strings.HasPrefix(name, linux.XATTR_USER_PREFIX):
		// In the user.* namespace, only regular files and directories can have
		// extended attributes. For sticky directories, only the owner and
		// privileged users can write attributes.
		filetype := mode.FileType()
		if filetype != linux.ModeRegular && filetype != linux.ModeDirectory {
			if ats.MayWrite() {
				return linuxerr.EPERM
			}
			return linuxerr.ENODATA
		}
		if filetype == linux.ModeDirectory && mode&linux.ModeSticky != 0 && ats.MayWrite() && !CanActAsOwner(creds, kuid) {
			return linuxerr.EPERM
		}
	}
	return nil
}

// ClearSUIDAndSGID clears the setuid and/or setgid bits after a chown or
// write. Depending on the mode, neither bit, only the setuid bit, or both are
// cleared.
func ClearSUIDAndSGID(mode uint32) uint32 {
	// Directories don't have their bits changed.
	if mode&linux.ModeDirectory == linux.ModeDirectory {
		return mode
	}

	// Changing owners always disables the setuid bit. It disables
	// the setgid bit when the file is executable.
	mode &= ^uint32(linux.ModeSetUID)
	if sgid := uint32(linux.ModeSetGID | linux.ModeGroupExec); mode&sgid == sgid {
		mode &= ^uint32(linux.ModeSetGID)
	}
	return mode
}
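// A standalone sketch (not the gVisor API; types simplified) of the bit
// arithmetic in GenericCheckPermissions above: select the owner, group, or
// other permission triplet by shifting, then test the requested access mask
// against it.

package main

import "fmt"

const (
	mayExec  uint16 = 1
	mayWrite uint16 = 2
	mayRead  uint16 = 4
)

// allowed reports whether the requested access mask ats is covered by the
// relevant rwx triplet of mode for this caller.
func allowed(ats, mode uint16, isOwner, inGroup bool) bool {
	perms := mode & 0777
	if isOwner {
		perms >>= 6 // owner triplet
	} else if inGroup {
		perms >>= 3 // group triplet
	}
	return ats&perms == ats
}

func main() {
	// 0644: owner rw-, group r--, other r--.
	fmt.Println(allowed(mayRead|mayWrite, 0644, true, false)) // true
	fmt.Println(allowed(mayWrite, 0644, false, true))         // false
	fmt.Println(allowed(mayRead, 0644, false, false))         // true
}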
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package memdev

import (
	"io"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/usermem"
)

const nullDevMinor = 3

// nullDevice implements vfs.Device for /dev/null.
//
// +stateify savable
type nullDevice struct{}

// Open implements vfs.Device.Open.
func (nullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	fd := &nullFD{}
	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
		UseDentryMetadata: true,
	}); err != nil {
		return nil, err
	}
	return &fd.vfsfd, nil
}

// nullFD implements vfs.FileDescriptionImpl for /dev/null.
//
// +stateify savable
type nullFD struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.NoLockFD
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *nullFD) Release(context.Context) {
	// noop
}

// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *nullFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	return 0, io.EOF
}

// Read implements vfs.FileDescriptionImpl.Read.
func (fd *nullFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	return 0, io.EOF
}

// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *nullFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	return src.NumBytes(), nil
}

// Write implements vfs.FileDescriptionImpl.Write.
func (fd *nullFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	return src.NumBytes(), nil
}

// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *nullFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	return 0, nil
}
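// The semantics implemented above match the real device node: reads from
// /dev/null yield EOF and writes are discarded but report full success. A
// small demonstration against the host's /dev/null (assumes a Linux host;
// not gVisor code):

package main

import (
	"fmt"
	"os"
)

func main() {
	f, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
	if err != nil {
		panic(err)
	}
	defer f.Close()

	n, err := f.Write([]byte("discarded"))
	fmt.Println(n, err) // 9 <nil>

	buf := make([]byte, 8)
	n, err = f.Read(buf)
	fmt.Println(n, err) // 0 EOF
}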
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"gvisor.dev/gvisor/pkg/sentry/inet"
)

// IsNetworkNamespaced returns true if t is in a non-root network namespace.
func (t *Task) IsNetworkNamespaced() bool {
	t.mu.Lock()
	defer t.mu.Unlock()
	return !t.netns.IsRoot()
}

// NetworkContext returns the network stack used by the task. NetworkContext
// may return nil if no network stack is available.
//
// TODO(gvisor.dev/issue/1833): Migrate callers of this method to
// NetworkNamespace().
func (t *Task) NetworkContext() inet.Stack {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.netns.Stack()
}

// NetworkNamespace returns the network namespace observed by the task.
func (t *Task) NetworkNamespace() *inet.Namespace {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.netns
}
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tmpfs

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)

// +stateify savable
type symlink struct {
	inode  inode
	target string // immutable
}

func (fs *filesystem) newSymlink(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, target string, parentDir *directory) *inode {
	link := &symlink{
		target: target,
	}
	link.inode.init(link, fs, kuid, kgid, linux.S_IFLNK|mode, parentDir)
	link.inode.nlink = 1 // from parent directory
	return &link.inode
}

// O_PATH is unimplemented, so there's no way to get a FileDescription
// representing a symlink yet.
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernfs

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// StaticSymlink provides an Inode implementation for symlinks that point to
// an immutable target.
//
// +stateify savable
type StaticSymlink struct {
	InodeAttrs
	InodeNoopRefCount
	InodeSymlink
	InodeNoStatFS

	target string
}

var _ Inode = (*StaticSymlink)(nil)

// NewStaticSymlink creates a new symlink file pointing to 'target'.
func NewStaticSymlink(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) Inode {
	inode := &StaticSymlink{}
	inode.Init(ctx, creds, devMajor, devMinor, ino, target)
	return inode
}

// Init initializes the instance.
func (s *StaticSymlink) Init(ctx context.Context, creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) {
	s.target = target
	s.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeSymlink|0777)
}

// Readlink implements Inode.Readlink.
func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) {
	return s.target, nil
}

// Getlink implements Inode.Getlink.
func (s *StaticSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) {
	return vfs.VirtualDentry{}, s.target, nil
}

// SetStat implements Inode.SetStat, not allowing inode attributes to be
// changed.
func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
	return linuxerr.EPERM
}
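// A hedged usage sketch of the API above: a hypothetical caller (for example,
// a proc-like filesystem) creating a fixed-target symlink inode. The function
// name and target string are illustrative only.

package example

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)

// newFixedLink returns an inode whose readlink target never changes for the
// life of the inode, per the StaticSymlink contract.
func newFixedLink(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64) kernfs.Inode {
	return kernfs.NewStaticSymlink(ctx, creds, devMajor, devMinor, ino, "targetdir/targetfile")
}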
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package devpts provides a filesystem implementation that behaves like
// devpts.
package devpts

import (
	"fmt"
	"math"
	"sort"
	"strconv"
	"sync"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
)

// Name is the filesystem name.
const Name = "devpts"

// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
type FilesystemType struct {
	initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
	initErr  error

	// fs backs all mounts of this FilesystemType. root is fs' root. fs and root
	// are immutable.
	fs   *vfs.Filesystem
	root *vfs.Dentry
}

// Name implements vfs.FilesystemType.Name.
func (*FilesystemType) Name() string {
	return Name
}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fstype *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
	// No data allowed.
	if opts.Data != "" {
		return nil, nil, linuxerr.EINVAL
	}

	fstype.initOnce.Do(func() {
		fs, root, err := fstype.newFilesystem(ctx, vfsObj, creds)
		if err != nil {
			fstype.initErr = err
			return
		}
		fstype.fs = fs.VFSFilesystem()
		fstype.root = root.VFSDentry()
	})
	if fstype.initErr != nil {
		return nil, nil, fstype.initErr
	}
	fstype.fs.IncRef()
	fstype.root.IncRef()
	return fstype.fs, fstype.root, nil
}

// Release implements vfs.FilesystemType.Release.
func (fstype *FilesystemType) Release(ctx context.Context) { if fstype.fs != nil { fstype.root.DecRef(ctx) fstype.fs.DecRef(ctx) } } // +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 } // newFilesystem creates a new devpts filesystem with root directory and ptmx // master inode. It returns the filesystem and root Dentry. func (fstype *FilesystemType) newFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } fs := &filesystem{ devMinor: devMinor, } fs.Filesystem.VFSFilesystem().Init(vfsObj, fstype, fs) // Construct the root directory. This is always inode id 1. root := &rootInode{ replicas: make(map[uint32]*replicaInode), } root.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555) root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) root.InitRefs() var rootD kernfs.Dentry rootD.InitRoot(&fs.Filesystem, root) // Construct the pts master inode and dentry. Linux always uses inode // id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx. master := &masterInode{ root: root, } master.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666) // Add the master as a child of the root. links := root.OrderedChildren.Populate(map[string]kernfs.Inode{ "ptmx": master, }) root.IncLinks(links) return fs, &rootD, nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { return "" } // rootInode is the root directory inode for the devpts mounts. // // +stateify savable type rootInode struct { implStatFS kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid. kernfs.OrderedChildren rootInodeRefs locks vfs.FileLocks // master is the master pty inode. Immutable. master *masterInode // mu protects the fields below. mu sync.Mutex `state:"nosave"` // replicas maps pty ids to replica inodes. replicas map[uint32]*replicaInode // nextIdx is the next pty index to use. Must be accessed atomically. // // TODO(b/29356795): reuse indices when ptys are closed. nextIdx uint32 } var _ kernfs.Inode = (*rootInode)(nil) // allocateTerminal creates a new Terminal and installs a pts node for it. func (i *rootInode) allocateTerminal(ctx context.Context, creds *auth.Credentials) (*Terminal, error) { i.mu.Lock() defer i.mu.Unlock() if i.nextIdx == math.MaxUint32 { return nil, syserror.ENOMEM } idx := i.nextIdx i.nextIdx++ // Sanity check that replica with idx does not exist. if _, ok := i.replicas[idx]; ok { panic(fmt.Sprintf("pty index collision; index %d already exists", idx)) } // Create the new terminal and replica. t := newTerminal(idx) replica := &replicaInode{ root: i, t: t, } // Linux always uses pty index + 3 as the inode id. See // fs/devpts/inode.c:devpts_pty_new(). replica.InodeAttrs.Init(ctx, creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600) i.replicas[idx] = replica return t, nil } // masterClose is called when the master end of t is closed. 
func (i *rootInode) masterClose(ctx context.Context, t *Terminal) { i.mu.Lock() defer i.mu.Unlock() // Sanity check that replica with idx exists. ri, ok := i.replicas[t.n] if !ok { panic(fmt.Sprintf("pty with index %d does not exist", t.n)) } // Drop the ref on replica inode taken during rootInode.allocateTerminal. ri.DecRef(ctx) delete(i.replicas, t.n) } // Open implements kernfs.Inode.Open. func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndStaticEntries, }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } // Lookup implements kernfs.Inode.Lookup. func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { // Check if a static entry was looked up. if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil { return d, nil } // Not a static entry. idx, err := strconv.ParseUint(name, 10, 32) if err != nil { return nil, syserror.ENOENT } i.mu.Lock() defer i.mu.Unlock() if ri, ok := i.replicas[uint32(idx)]; ok { ri.IncRef() // This ref is passed to the dentry upon creation via Init. return ri, nil } return nil, syserror.ENOENT } // IterDirents implements kernfs.Inode.IterDirents. func (i *rootInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { i.mu.Lock() defer i.mu.Unlock() i.InodeAttrs.TouchAtime(ctx, mnt) ids := make([]int, 0, len(i.replicas)) for id := range i.replicas { ids = append(ids, int(id)) } sort.Ints(ids) for _, id := range ids[relOffset:] { dirent := vfs.Dirent{ Name: strconv.FormatUint(uint64(id), 10), Type: linux.DT_CHR, Ino: i.replicas[uint32(id)].InodeAttrs.Ino(), NextOff: offset + 1, } if err := cb.Handle(dirent); err != nil { return offset, err } offset++ } return offset, nil } // DecRef implements kernfs.Inode.DecRef. func (i *rootInode) DecRef(ctx context.Context) { i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) }) } // +stateify savable type implStatFS struct{} // StatFS implements kernfs.Inode.StatFS. func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.DEVPTS_SUPER_MAGIC), nil }
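// Editor's note: the snippet below is an illustrative sketch appended by the
// editor, not part of the original file. It demonstrates the inode numbering
// convention implemented above: the root directory is inode 1, ptmx is inode
// 2, and replica pty index N uses inode N+3, mirroring Linux's
// fs/devpts/inode.c. The helper name replicaInodeID is hypothetical.
package main

import "fmt"

// replicaInodeID returns the inode number for pty index idx, following the
// same idx+3 convention as rootInode.allocateTerminal above.
func replicaInodeID(idx uint32) uint64 {
	return uint64(idx) + 3
}

func main() {
	fmt.Println("root inode:", 1) // always inode id 1
	fmt.Println("ptmx inode:", 2) // always inode id 2
	for idx := uint32(0); idx < 3; idx++ {
		// pty 0 -> inode 3, pty 1 -> inode 4, pty 2 -> inode 5, ...
		fmt.Printf("pty %d inode: %d\n", idx, replicaInodeID(idx))
	}
}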
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package memmap

import (
	"fmt"
	"math"

	"gvisor.dev/gvisor/pkg/hostarch"
)

// MappingSet maps offsets into a Mappable to mappings of those offsets. It is
// used to implement Mappable.AddMapping and RemoveMapping for Mappables that
// may need to call MappingSpace.Invalidate.
//
// type MappingSet <generated by go_generics>

// MappingsOfRange is the value type of MappingSet, and represents the set of
// all mappings of the corresponding MappableRange.
//
// Using a map offers O(1) lookups in RemoveMapping and
// mappingSetFunctions.Merge.
type MappingsOfRange map[MappingOfRange]struct{}

// MappingOfRange represents a mapping of a MappableRange.
//
// +stateify savable
type MappingOfRange struct {
	MappingSpace MappingSpace
	AddrRange    hostarch.AddrRange
	Writable     bool
}

func (r MappingOfRange) invalidate(opts InvalidateOpts) {
	r.MappingSpace.Invalidate(r.AddrRange, opts)
}

// String implements fmt.Stringer.String.
func (r MappingOfRange) String() string {
	return fmt.Sprintf("%#v", r.AddrRange)
}

// mappingSetFunctions implements segment.Functions for MappingSet.
type mappingSetFunctions struct{}

// MinKey implements segment.Functions.MinKey.
func (mappingSetFunctions) MinKey() uint64 {
	return 0
}

// MaxKey implements segment.Functions.MaxKey.
func (mappingSetFunctions) MaxKey() uint64 {
	return math.MaxUint64
}

// ClearValue implements segment.Functions.ClearValue.
func (mappingSetFunctions) ClearValue(v *MappingsOfRange) {
	*v = MappingsOfRange{}
}

// Merge implements segment.Functions.Merge.
//
// Since each value is a map of MappingOfRanges, values can only be merged if
// all MappingOfRanges in each map have an exact pair in the other map, forming
// one contiguous region.
func (mappingSetFunctions) Merge(r1 MappableRange, val1 MappingsOfRange, r2 MappableRange, val2 MappingsOfRange) (MappingsOfRange, bool) { if len(val1) != len(val2) { return nil, false } merged := make(MappingsOfRange, len(val1)) // Each MappingOfRange in val1 must have a matching region in val2, forming // one contiguous region. for k1 := range val1 { // We expect val2 to contain a key that forms a contiguous // region with k1. k2 := MappingOfRange{ MappingSpace: k1.MappingSpace, AddrRange: hostarch.AddrRange{ Start: k1.AddrRange.End, End: k1.AddrRange.End + hostarch.Addr(r2.Length()), }, Writable: k1.Writable, } if _, ok := val2[k2]; !ok { return nil, false } // OK. Add it to the merged map. merged[MappingOfRange{ MappingSpace: k1.MappingSpace, AddrRange: hostarch.AddrRange{ Start: k1.AddrRange.Start, End: k2.AddrRange.End, }, Writable: k1.Writable, }] = struct{}{} } return merged, true } // Split implements segment.Functions.Split. func (mappingSetFunctions) Split(r MappableRange, val MappingsOfRange, split uint64) (MappingsOfRange, MappingsOfRange) { if split <= r.Start || split >= r.End { panic(fmt.Sprintf("split is not within range %v", r)) } m1 := make(MappingsOfRange, len(val)) m2 := make(MappingsOfRange, len(val)) // split is a value in MappableRange, we need the offset into the // corresponding MappingsOfRange. offset := hostarch.Addr(split - r.Start) for k := range val { k1 := MappingOfRange{ MappingSpace: k.MappingSpace, AddrRange: hostarch.AddrRange{ Start: k.AddrRange.Start, End: k.AddrRange.Start + offset, }, Writable: k.Writable, } m1[k1] = struct{}{} k2 := MappingOfRange{ MappingSpace: k.MappingSpace, AddrRange: hostarch.AddrRange{ Start: k.AddrRange.Start + offset, End: k.AddrRange.End, }, Writable: k.Writable, } m2[k2] = struct{}{} } return m1, m2 } // subsetMapping returns the MappingOfRange that maps subsetRange, given that // ms maps wholeRange beginning at addr. // // For instance, suppose wholeRange = [0x0, 0x2000) and addr = 0x4000, // indicating that ms maps addresses [0x4000, 0x6000) to MappableRange [0x0, // 0x2000). Then for subsetRange = [0x1000, 0x2000), subsetMapping returns a // MappingOfRange for which AddrRange = [0x5000, 0x6000). func subsetMapping(wholeRange, subsetRange MappableRange, ms MappingSpace, addr hostarch.Addr, writable bool) MappingOfRange { if !wholeRange.IsSupersetOf(subsetRange) { panic(fmt.Sprintf("%v is not a superset of %v", wholeRange, subsetRange)) } offset := subsetRange.Start - wholeRange.Start start := addr + hostarch.Addr(offset) return MappingOfRange{ MappingSpace: ms, AddrRange: hostarch.AddrRange{ Start: start, End: start + hostarch.Addr(subsetRange.Length()), }, Writable: writable, } } // AddMapping adds the given mapping and returns the set of MappableRanges that // previously had no mappings. // // Preconditions: Same as Mappable.AddMapping. func (s *MappingSet) AddMapping(ms MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) []MappableRange { mr := MappableRange{offset, offset + uint64(ar.Length())} var mapped []MappableRange seg, gap := s.Find(mr.Start) for { switch { case seg.Ok() && seg.Start() < mr.End: seg = s.Isolate(seg, mr) seg.Value()[subsetMapping(mr, seg.Range(), ms, ar.Start, writable)] = struct{}{} seg, gap = seg.NextNonEmpty() case gap.Ok() && gap.Start() < mr.End: gapMR := gap.Range().Intersect(mr) mapped = append(mapped, gapMR) // Insert a set and continue from the above case. 
seg, gap = s.Insert(gap, gapMR, make(MappingsOfRange)), MappingGapIterator{} default: return mapped } } } // RemoveMapping removes the given mapping and returns the set of // MappableRanges that now have no mappings. // // Preconditions: Same as Mappable.RemoveMapping. func (s *MappingSet) RemoveMapping(ms MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) []MappableRange { mr := MappableRange{offset, offset + uint64(ar.Length())} var unmapped []MappableRange seg := s.FindSegment(mr.Start) if !seg.Ok() { panic(fmt.Sprintf("MappingSet.RemoveMapping(%v): no segment containing %#x: %v", mr, mr.Start, s)) } for seg.Ok() && seg.Start() < mr.End { // Ensure this segment is limited to our range. seg = s.Isolate(seg, mr) // Remove this part of the mapping. mappings := seg.Value() delete(mappings, subsetMapping(mr, seg.Range(), ms, ar.Start, writable)) if len(mappings) == 0 { unmapped = append(unmapped, seg.Range()) seg = s.Remove(seg).NextSegment() } else { seg = seg.NextSegment() } } s.MergeAdjacent(mr) return unmapped } // Invalidate calls MappingSpace.Invalidate for all mappings of offsets in mr. func (s *MappingSet) Invalidate(mr MappableRange, opts InvalidateOpts) { for seg := s.LowerBoundSegment(mr.Start); seg.Ok() && seg.Start() < mr.End; seg = seg.NextSegment() { segMR := seg.Range() for m := range seg.Value() { region := subsetMapping(segMR, segMR.Intersect(mr), m.MappingSpace, m.AddrRange.Start, m.Writable) region.invalidate(opts) } } } // InvalidateAll calls MappingSpace.Invalidate for all mappings of s. func (s *MappingSet) InvalidateAll(opts InvalidateOpts) { for seg := s.FirstSegment(); seg.Ok(); seg = seg.NextSegment() { for m := range seg.Value() { m.invalidate(opts) } } }
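// Editor's note: illustrative sketch appended by the editor, not part of the
// original file. It models the Merge invariant implemented above with plain
// integer ranges instead of the generated MappingSet types: two adjacent
// segments may merge only if every mapping of the first has a counterpart in
// the second that continues it exactly, so each merged value still describes
// one contiguous address range. The type and function names are hypothetical.
package main

import "fmt"

type addrRange struct{ start, end uint64 }

// canMerge reports whether the mappings of two adjacent mappable ranges (the
// second of length r2Len) form contiguous regions; this is the condition
// mappingSetFunctions.Merge checks before combining values.
func canMerge(val1, val2 map[addrRange]struct{}, r2Len uint64) bool {
	if len(val1) != len(val2) {
		return false
	}
	for k1 := range val1 {
		// The counterpart must begin exactly where k1 ends.
		k2 := addrRange{k1.end, k1.end + r2Len}
		if _, ok := val2[k2]; !ok {
			return false
		}
	}
	return true
}

func main() {
	val1 := map[addrRange]struct{}{{0x4000, 0x5000}: {}}
	val2 := map[addrRange]struct{}{{0x5000, 0x6000}: {}}
	fmt.Println(canMerge(val1, val2, 0x1000)) // true: [0x4000, 0x6000) is contiguous

	val3 := map[addrRange]struct{}{{0x8000, 0x9000}: {}}
	fmt.Println(canMerge(val1, val3, 0x1000)) // false: the mappings are not adjacent
}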
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fsutil

import (
	"fmt"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sync"
)

// HostFileMapper caches mappings of an arbitrary host file descriptor. It is
// used by implementations of memmap.Mappable that represent a host file
// descriptor.
//
// +stateify savable
type HostFileMapper struct {
	// HostFileMapper conceptually breaks the file into pieces called chunks,
	// of size and alignment chunkSize, and caches mappings of the file on a
	// chunk granularity.

	refsMu sync.Mutex `state:"nosave"`

	// refs maps chunk start offsets to the sum of reference counts for all
	// pages in that chunk. refs is protected by refsMu.
	refs map[uint64]int32

	mapsMu sync.Mutex `state:"nosave"`

	// mappings maps chunk start offsets to mappings of those chunks,
	// obtained by calling unix.Mmap. mappings is protected by
	// mapsMu.
	mappings map[uint64]mapping `state:"nosave"`
}

const (
	chunkShift = hostarch.HugePageShift
	chunkSize  = 1 << chunkShift
	chunkMask  = chunkSize - 1
)

func pagesInChunk(mr memmap.MappableRange, chunkStart uint64) int32 {
	return int32(mr.Intersect(memmap.MappableRange{chunkStart, chunkStart + chunkSize}).Length() / hostarch.PageSize)
}

type mapping struct {
	addr     uintptr
	writable bool
}

// Init must be called on zero-value HostFileMappers before first use.
func (f *HostFileMapper) Init() {
	f.refs = make(map[uint64]int32)
	f.mappings = make(map[uint64]mapping)
}

// IsInited returns true if f.Init() has been called. This is used when
// restoring a checkpoint that contains a HostFileMapper that may or may not
// have been initialized.
func (f *HostFileMapper) IsInited() bool {
	return f.refs != nil
}

// NewHostFileMapper returns an initialized HostFileMapper allocated on the
// heap with no references or cached mappings.
func NewHostFileMapper() *HostFileMapper { f := &HostFileMapper{} f.Init() return f } // IncRefOn increments the reference count on all offsets in mr. // // Preconditions: // * mr.Length() != 0. // * mr.Start and mr.End must be page-aligned. func (f *HostFileMapper) IncRefOn(mr memmap.MappableRange) { f.refsMu.Lock() defer f.refsMu.Unlock() for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize { refs := f.refs[chunkStart] pgs := pagesInChunk(mr, chunkStart) if refs+pgs < refs { // Would overflow. panic(fmt.Sprintf("HostFileMapper.IncRefOn(%v): adding %d page references to chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) } f.refs[chunkStart] = refs + pgs } } // DecRefOn decrements the reference count on all offsets in mr. // // Preconditions: // * mr.Length() != 0. // * mr.Start and mr.End must be page-aligned. func (f *HostFileMapper) DecRefOn(mr memmap.MappableRange) { f.refsMu.Lock() defer f.refsMu.Unlock() for chunkStart := mr.Start &^ chunkMask; chunkStart < mr.End; chunkStart += chunkSize { refs := f.refs[chunkStart] pgs := pagesInChunk(mr, chunkStart) switch { case refs > pgs: f.refs[chunkStart] = refs - pgs case refs == pgs: f.mapsMu.Lock() delete(f.refs, chunkStart) if m, ok := f.mappings[chunkStart]; ok { f.unmapAndRemoveLocked(chunkStart, m) } f.mapsMu.Unlock() case refs < pgs: panic(fmt.Sprintf("HostFileMapper.DecRefOn(%v): removing %d page references from chunk %#x, which has %d page references", mr, pgs, chunkStart, refs)) } } } // MapInternal returns a mapping of offsets in fr from fd. The returned // safemem.BlockSeq is valid as long as at least one reference is held on all // offsets in fr or until the next call to UnmapAll. // // Preconditions: The caller must hold a reference on all offsets in fr. func (f *HostFileMapper) MapInternal(fr memmap.FileRange, fd int, write bool) (safemem.BlockSeq, error) { chunks := ((fr.End + chunkMask) >> chunkShift) - (fr.Start >> chunkShift) f.mapsMu.Lock() defer f.mapsMu.Unlock() if chunks == 1 { // Avoid an unnecessary slice allocation. var seq safemem.BlockSeq err := f.forEachMappingBlockLocked(fr, fd, write, func(b safemem.Block) { seq = safemem.BlockSeqOf(b) }) return seq, err } blocks := make([]safemem.Block, 0, chunks) err := f.forEachMappingBlockLocked(fr, fd, write, func(b safemem.Block) { blocks = append(blocks, b) }) return safemem.BlockSeqFromSlice(blocks), err } // Preconditions: f.mapsMu must be locked. 
func (f *HostFileMapper) forEachMappingBlockLocked(fr memmap.FileRange, fd int, write bool, fn func(safemem.Block)) error { prot := unix.PROT_READ if write { prot |= unix.PROT_WRITE } for chunkStart := fr.Start &^ chunkMask; chunkStart < fr.End; chunkStart += chunkSize { m, ok := f.mappings[chunkStart] if !ok { addr, _, errno := unix.Syscall6( unix.SYS_MMAP, 0, chunkSize, uintptr(prot), unix.MAP_SHARED, uintptr(fd), uintptr(chunkStart)) if errno != 0 { return errno } m = mapping{addr, write} f.mappings[chunkStart] = m } else if write && !m.writable { addr, _, errno := unix.Syscall6( unix.SYS_MMAP, m.addr, chunkSize, uintptr(prot), unix.MAP_SHARED|unix.MAP_FIXED, uintptr(fd), uintptr(chunkStart)) if errno != 0 { return errno } m = mapping{addr, write} f.mappings[chunkStart] = m } var startOff uint64 if chunkStart < fr.Start { startOff = fr.Start - chunkStart } endOff := uint64(chunkSize) if chunkStart+chunkSize > fr.End { endOff = fr.End - chunkStart } fn(f.unsafeBlockFromChunkMapping(m.addr).TakeFirst64(endOff).DropFirst64(startOff)) } return nil } // UnmapAll unmaps all cached mappings. Callers are responsible for // synchronization with mappings returned by previous calls to MapInternal. func (f *HostFileMapper) UnmapAll() { f.mapsMu.Lock() defer f.mapsMu.Unlock() for chunkStart, m := range f.mappings { f.unmapAndRemoveLocked(chunkStart, m) } } // Preconditions: // * f.mapsMu must be locked. // * f.mappings[chunkStart] == m. func (f *HostFileMapper) unmapAndRemoveLocked(chunkStart uint64, m mapping) { if _, _, errno := unix.Syscall(unix.SYS_MUNMAP, m.addr, chunkSize, 0); errno != 0 { // This leaks address space and is unexpected, but is otherwise // harmless, so complain but don't panic. log.Warningf("HostFileMapper: failed to unmap mapping %#x for chunk %#x: %v", m.addr, chunkStart, errno) } delete(f.mappings, chunkStart) } // RegenerateMappings must be called when the file description mapped by f // changes, to replace existing mappings of the previous file description. func (f *HostFileMapper) RegenerateMappings(fd int) error { f.mapsMu.Lock() defer f.mapsMu.Unlock() for chunkStart, m := range f.mappings { prot := unix.PROT_READ if m.writable { prot |= unix.PROT_WRITE } _, _, errno := unix.Syscall6( unix.SYS_MMAP, m.addr, chunkSize, uintptr(prot), unix.MAP_SHARED|unix.MAP_FIXED, uintptr(fd), uintptr(chunkStart)) if errno != 0 { return errno } } return nil }
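// Editor's note: illustrative sketch appended by the editor, not part of the
// original file. It shows the chunk arithmetic HostFileMapper relies on:
// offsets are rounded down to a chunk boundary with `&^ chunkMask` (as in the
// IncRefOn/DecRefOn loops), and MapInternal counts spanned chunks with shift
// arithmetic. The 2MB chunk size is an assumption standing in for
// hostarch.HugePageSize.
package main

import "fmt"

const (
	chunkShift = 21 // assumed huge-page shift: 2MB chunks
	chunkSize  = 1 << chunkShift
	chunkMask  = chunkSize - 1
)

func main() {
	// A file range that straddles the first chunk boundary.
	start, end := uint64(0x1ff000), uint64(0x201000)

	// First chunk containing the range, as computed by the loops above.
	fmt.Printf("first chunk starts at %#x\n", start&^chunkMask) // 0x0

	// Number of chunks spanned, as computed in MapInternal.
	chunks := ((end + chunkMask) >> chunkShift) - (start >> chunkShift)
	fmt.Println("chunks spanned:", chunks) // 2
}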
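// Editor's note: illustrative sketch appended by the editor, not part of the
// original file. It demonstrates the mapping strategy of
// forEachMappingBlockLocked above: a chunk is mapped shared and read-only on
// first use, and upgraded in place when write access is later required so
// that cached addresses stay valid (the real code remaps with MAP_FIXED; this
// standalone version uses mprotect as the simpler equivalent). A one-page
// "chunk" on a temporary file stands in for the real 2MB chunks.
package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func main() {
	f, err := os.CreateTemp("", "chunk")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())
	defer f.Close()

	const chunkSize = 4096 // one page, standing in for the real chunk size
	if err := f.Truncate(chunkSize); err != nil {
		panic(err)
	}

	// First access: map the chunk read-only.
	m, err := unix.Mmap(int(f.Fd()), 0, chunkSize, unix.PROT_READ, unix.MAP_SHARED)
	if err != nil {
		panic(err)
	}
	defer unix.Munmap(m)

	// Later write access: upgrade the protection of the existing mapping in
	// place, keeping its address stable.
	if err := unix.Mprotect(m, unix.PROT_READ|unix.PROT_WRITE); err != nil {
		panic(err)
	}
	m[0] = 42
	fmt.Println("wrote through mapping:", m[0])
}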
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package udp

import (
	"io"
	"sync/atomic"
	"time"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/ports"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/waiter"
)

// +stateify savable
type udpPacket struct {
	udpPacketEntry
	senderAddress      tcpip.FullAddress
	destinationAddress tcpip.FullAddress
	packetInfo         tcpip.IPPacketInfo
	data               buffer.VectorisedView `state:".(buffer.VectorisedView)"`
	receivedAt         time.Time             `state:".(int64)"`
	// tos stores either the receiveTOS or receiveTClass value.
	tos uint8
}

// EndpointState represents the state of a UDP endpoint.
type EndpointState tcpip.EndpointState

// Endpoint states. Note that these are represented in a netstack-specific
// manner and may not be meaningful externally. Specifically, they need to be
// translated to Linux's representation of these states if presented to
// userspace.
const (
	_ EndpointState = iota
	StateInitial
	StateBound
	StateConnected
	StateClosed
)

// String implements fmt.Stringer.
func (s EndpointState) String() string {
	switch s {
	case StateInitial:
		return "INITIAL"
	case StateBound:
		return "BOUND"
	case StateConnected:
		return "CONNECTED"
	case StateClosed:
		return "CLOSED"
	default:
		return "UNKNOWN"
	}
}

// endpoint represents a UDP endpoint. This struct serves as the interface
// between users of the endpoint and the protocol implementation; it is legal
// for concurrent goroutines to make calls into the endpoint, as they are
// properly synchronized.
//
// It implements tcpip.Endpoint.
//
// +stateify savable
type endpoint struct {
	stack.TransportEndpointInfo
	tcpip.DefaultSocketOptionsHandler

	// The following fields are initialized at creation time and do not
	// change throughout the lifetime of the endpoint.
	stack       *stack.Stack `state:"manual"`
	waiterQueue *waiter.Queue
	uniqueID    uint64

	// The following fields are used to manage the receive queue, and are
	// protected by rcvMu.
	rcvMu      sync.Mutex `state:"nosave"`
	rcvReady   bool
	rcvList    udpPacketList
	rcvBufSize int
	rcvClosed  bool

	// The following fields are protected by the mu mutex.
	mu sync.RWMutex `state:"nosave"`
	// state must be read/set using the EndpointState()/setEndpointState()
	// methods.
	state          uint32
	route          *stack.Route `state:"manual"`
	dstPort        uint16
	ttl            uint8
	multicastTTL   uint8
	multicastAddr  tcpip.Address
	multicastNICID tcpip.NICID
	portFlags      ports.Flags

	lastErrorMu sync.Mutex `state:"nosave"`
	lastError   tcpip.Error

	// Values used to reserve a port or register a transport endpoint
	// (whichever happens first).
	boundBindToDevice tcpip.NICID
	boundPortFlags    ports.Flags

	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
	// applied while sending packets. Defaults to 0 as on Linux.
	sendTOS uint8

	// shutdownFlags represent the current shutdown state of the endpoint.
	shutdownFlags tcpip.ShutdownFlags

	// multicastMemberships that need to be removed when the endpoint is
	// closed. Protected by the mu mutex.
	multicastMemberships map[multicastMembership]struct{}

	// effectiveNetProtos contains the network protocols actually in use.
In // most cases it will only contain "netProto", but in cases like IPv6 // endpoints with v6only set to false, this could include multiple // protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g., // IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped // address). effectiveNetProtos []tcpip.NetworkProtocolNumber // TODO(b/142022063): Add ability to save and restore per endpoint stats. stats tcpip.TransportEndpointStats `state:"nosave"` // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner // ops is used to get socket level options. ops tcpip.SocketOptions // frozen indicates if the packets should be delivered to the endpoint // during restore. frozen bool } // +stateify savable type multicastMembership struct { nicID tcpip.NICID multicastAddr tcpip.Address } func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { e := &endpoint{ stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{ NetProto: netProto, TransProto: header.UDPProtocolNumber, }, waiterQueue: waiterQueue, // RFC 1075 section 5.4 recommends a TTL of 1 for membership // requests. // // RFC 5135 4.2.1 appears to assume that IGMP messages have a // TTL of 1. // // RFC 5135 Appendix A defines TTL=1: A multicast source that // wants its traffic to not traverse a router (e.g., leave a // home network) may find it useful to send traffic with IP // TTL=1. // // Linux defaults to TTL=1. multicastTTL: 1, multicastMemberships: make(map[multicastMembership]struct{}), state: uint32(StateInitial), uniqueID: s.UniqueID(), } e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) e.ops.SetMulticastLoop(true) e.ops.SetSendBufferSize(32*1024, false /* notify */) e.ops.SetReceiveBufferSize(32*1024, false /* notify */) // Override with stack defaults. var ss tcpip.SendBufferSizeOption if err := s.Option(&ss); err == nil { e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) } var rs tcpip.ReceiveBufferSizeOption if err := s.Option(&rs); err == nil { e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) } return e } // setEndpointState updates the state of the endpoint to state atomically. This // method is unexported as the only place we should update the state is in this // package but we allow the state to be read freely without holding e.mu. // // Precondition: e.mu must be held to call this method. func (e *endpoint) setEndpointState(state EndpointState) { atomic.StoreUint32(&e.state, uint32(state)) } // EndpointState() returns the current state of the endpoint. func (e *endpoint) EndpointState() EndpointState { return EndpointState(atomic.LoadUint32(&e.state)) } // UniqueID implements stack.TransportEndpoint. func (e *endpoint) UniqueID() uint64 { return e.uniqueID } func (e *endpoint) LastError() tcpip.Error { e.lastErrorMu.Lock() defer e.lastErrorMu.Unlock() err := e.lastError e.lastError = nil return err } // UpdateLastError implements tcpip.SocketOptionsHandler. func (e *endpoint) UpdateLastError(err tcpip.Error) { e.lastErrorMu.Lock() e.lastError = err e.lastErrorMu.Unlock() } // Abort implements stack.TransportEndpoint. func (e *endpoint) Abort() { e.Close() } // Close puts the endpoint in a closed state and frees all resources // associated with it. 
func (e *endpoint) Close() {
	e.mu.Lock()
	e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite

	switch e.EndpointState() {
	case StateBound, StateConnected:
		e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice)
		portRes := ports.Reservation{
			Networks:     e.effectiveNetProtos,
			Transport:    ProtocolNumber,
			Addr:         e.ID.LocalAddress,
			Port:         e.ID.LocalPort,
			Flags:        e.boundPortFlags,
			BindToDevice: e.boundBindToDevice,
			Dest:         tcpip.FullAddress{},
		}
		e.stack.ReleasePort(portRes)
		e.boundBindToDevice = 0
		e.boundPortFlags = ports.Flags{}
	}

	for mem := range e.multicastMemberships {
		e.stack.LeaveGroup(e.NetProto, mem.nicID, mem.multicastAddr)
	}
	e.multicastMemberships = make(map[multicastMembership]struct{})

	// Close the receive list and drain it.
	e.rcvMu.Lock()
	e.rcvClosed = true
	e.rcvBufSize = 0
	for !e.rcvList.Empty() {
		p := e.rcvList.Front()
		e.rcvList.Remove(p)
	}
	e.rcvMu.Unlock()

	if e.route != nil {
		e.route.Release()
		e.route = nil
	}

	// Update the state.
	e.setEndpointState(StateClosed)

	e.mu.Unlock()

	e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
}

// ModerateRecvBuf implements tcpip.Endpoint.
func (*endpoint) ModerateRecvBuf(int) {}

// Read implements tcpip.Endpoint.
func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) {
	if err := e.LastError(); err != nil {
		return tcpip.ReadResult{}, err
	}

	e.rcvMu.Lock()

	if e.rcvList.Empty() {
		var err tcpip.Error = &tcpip.ErrWouldBlock{}
		if e.rcvClosed {
			e.stats.ReadErrors.ReadClosed.Increment()
			err = &tcpip.ErrClosedForReceive{}
		}
		e.rcvMu.Unlock()
		return tcpip.ReadResult{}, err
	}

	p := e.rcvList.Front()
	if !opts.Peek {
		e.rcvList.Remove(p)
		e.rcvBufSize -= p.data.Size()
	}
	e.rcvMu.Unlock()

	// Control Messages
	cm := tcpip.ControlMessages{
		HasTimestamp: true,
		Timestamp:    p.receivedAt.UnixNano(),
	}
	if e.ops.GetReceiveTOS() {
		cm.HasTOS = true
		cm.TOS = p.tos
	}
	if e.ops.GetReceiveTClass() {
		cm.HasTClass = true
		// Although TClass is an 8-bit value it's read in the CMsg as a uint32.
		cm.TClass = uint32(p.tos)
	}
	if e.ops.GetReceivePacketInfo() {
		cm.HasIPPacketInfo = true
		cm.PacketInfo = p.packetInfo
	}
	if e.ops.GetReceiveOriginalDstAddress() {
		cm.HasOriginalDstAddress = true
		cm.OriginalDstAddress = p.destinationAddress
	}

	// Read Result
	res := tcpip.ReadResult{
		Total:           p.data.Size(),
		ControlMessages: cm,
	}
	if opts.NeedRemoteAddr {
		res.RemoteAddr = p.senderAddress
	}

	n, err := p.data.ReadTo(dst, opts.Peek)
	if n == 0 && err != nil {
		return res, &tcpip.ErrBadBuffer{}
	}
	res.Count = n
	return res, nil
}

// prepareForWrite prepares the endpoint for sending data. In particular, it
// binds it if it's still in the initial state. To do so, it must first
// reacquire the mutex in exclusive mode.
//
// Returns true for retry if preparation should be retried.
// +checklocks:e.mu
func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err tcpip.Error) {
	switch e.EndpointState() {
	case StateInitial:
	case StateConnected:
		return false, nil

	case StateBound:
		if to == nil {
			return false, &tcpip.ErrDestinationRequired{}
		}
		return false, nil
	default:
		return false, &tcpip.ErrInvalidEndpointState{}
	}

	e.mu.RUnlock()
	e.mu.Lock()
	defer e.mu.DowngradeLock()

	// The state changed when we released the shared lock and re-acquired
	// it in exclusive mode. Try again.
	if e.EndpointState() != StateInitial {
		return true, nil
	}

	// The state is still 'initial', so try to bind the endpoint.
if err := e.bindLocked(tcpip.FullAddress{}); err != nil { return false, err } return true, nil } // connectRoute establishes a route to the specified interface or the // configured multicast interface if no interface is specified and the // specified address is a multicast address. func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (*stack.Route, tcpip.NICID, tcpip.Error) { localAddr := e.ID.LocalAddress if e.isBroadcastOrMulticast(nicID, netProto, localAddr) { // A packet can only originate from a unicast address (i.e., an interface). localAddr = "" } if header.IsV4MulticastAddress(addr.Addr) || header.IsV6MulticastAddress(addr.Addr) { if nicID == 0 { nicID = e.multicastNICID } if localAddr == "" && nicID == 0 { localAddr = e.multicastAddr } } // Find a route to the desired destination. r, err := e.stack.FindRoute(nicID, localAddr, addr.Addr, netProto, e.ops.GetMulticastLoop()) if err != nil { return nil, 0, err } return r, nicID, nil } // Write writes data to the endpoint's peer. This method does not block // if the data cannot be written. func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { n, err := e.write(p, opts) switch err.(type) { case nil: e.stats.PacketsSent.Increment() case *tcpip.ErrMessageTooLong, *tcpip.ErrInvalidOptionValue: e.stats.WriteErrors.InvalidArgs.Increment() case *tcpip.ErrClosedForSend: e.stats.WriteErrors.WriteClosed.Increment() case *tcpip.ErrInvalidEndpointState: e.stats.WriteErrors.InvalidEndpointState.Increment() case *tcpip.ErrNoRoute, *tcpip.ErrBroadcastDisabled, *tcpip.ErrNetworkUnreachable: // Errors indicating any problem with IP routing of the packet. e.stats.SendErrors.NoRoute.Increment() default: // For all other errors when writing to the network layer. e.stats.SendErrors.SendToNetworkFailed.Increment() } return n, err } func (e *endpoint) buildUDPPacketInfo(p tcpip.Payloader, opts tcpip.WriteOptions) (udpPacketInfo, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() // If we've shutdown with SHUT_WR we are in an invalid state for sending. if e.shutdownFlags&tcpip.ShutdownWrite != 0 { return udpPacketInfo{}, &tcpip.ErrClosedForSend{} } // Prepare for write. for { retry, err := e.prepareForWrite(opts.To) if err != nil { return udpPacketInfo{}, err } if !retry { break } } route := e.route dstPort := e.dstPort if opts.To != nil { // Reject destination address if it goes through a different // NIC than the endpoint was bound to. nicID := opts.To.NIC if nicID == 0 { nicID = tcpip.NICID(e.ops.GetBindToDevice()) } if e.BindNICID != 0 { if nicID != 0 && nicID != e.BindNICID { return udpPacketInfo{}, &tcpip.ErrNoRoute{} } nicID = e.BindNICID } if opts.To.Port == 0 { // Port 0 is an invalid port to send to. return udpPacketInfo{}, &tcpip.ErrInvalidEndpointState{} } dst, netProto, err := e.checkV4MappedLocked(*opts.To) if err != nil { return udpPacketInfo{}, err } r, _, err := e.connectRoute(nicID, dst, netProto) if err != nil { return udpPacketInfo{}, err } defer r.Release() route = r dstPort = dst.Port } if !e.ops.GetBroadcast() && route.IsOutboundBroadcast() { return udpPacketInfo{}, &tcpip.ErrBroadcastDisabled{} } v := make([]byte, p.Len()) if _, err := io.ReadFull(p, v); err != nil { return udpPacketInfo{}, &tcpip.ErrBadBuffer{} } if len(v) > header.UDPMaximumPacketSize { // Payload can't possibly fit in a packet. 
		so := e.SocketOptions()
		if so.GetRecvError() {
			so.QueueLocalErr(
				&tcpip.ErrMessageTooLong{},
				route.NetProto(),
				header.UDPMaximumPacketSize,
				tcpip.FullAddress{
					NIC:  route.NICID(),
					Addr: route.RemoteAddress(),
					Port: dstPort,
				},
				v,
			)
		}
		return udpPacketInfo{}, &tcpip.ErrMessageTooLong{}
	}

	ttl := e.ttl
	useDefaultTTL := ttl == 0
	if header.IsV4MulticastAddress(route.RemoteAddress()) || header.IsV6MulticastAddress(route.RemoteAddress()) {
		ttl = e.multicastTTL
		// Multicast allows a 0 TTL.
		useDefaultTTL = false
	}

	return udpPacketInfo{
		route:         route,
		data:          buffer.View(v),
		localPort:     e.ID.LocalPort,
		remotePort:    dstPort,
		ttl:           ttl,
		useDefaultTTL: useDefaultTTL,
		tos:           e.sendTOS,
		owner:         e.owner,
		noChecksum:    e.SocketOptions().GetNoChecksum(),
	}, nil
}

func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
	if err := e.LastError(); err != nil {
		return 0, err
	}

	// MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.)
	if opts.More {
		return 0, &tcpip.ErrInvalidOptionValue{}
	}

	// Do not hold the lock when sending: loopback is synchronous, and if the
	// UDP datagram ends up generating an ICMP response then its handling may
	// acquire this endpoint's mutex via e.mu.RLock() in
	// endpoint.HandleControlPacket. That can deadlock if another caller is
	// concurrently trying to acquire e.mu in exclusive mode with e.mu.Lock(),
	// since e.mu.Lock() blocks any new read locks to ensure the write lock
	// can eventually be acquired.
	//
	// See: https://golang.org/pkg/sync/#RWMutex for details on why recursive
	// read locking is prohibited.
	u, err := e.buildUDPPacketInfo(p, opts)
	if err != nil {
		return 0, err
	}
	n, err := u.send()
	if err != nil {
		return 0, err
	}
	return int64(n), nil
}

// OnReuseAddressSet implements tcpip.SocketOptionsHandler.
func (e *endpoint) OnReuseAddressSet(v bool) {
	e.mu.Lock()
	e.portFlags.MostRecent = v
	e.mu.Unlock()
}

// OnReusePortSet implements tcpip.SocketOptionsHandler.
func (e *endpoint) OnReusePortSet(v bool) {
	e.mu.Lock()
	e.portFlags.LoadBalanced = v
	e.mu.Unlock()
}

// SetSockOptInt implements tcpip.Endpoint.
func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
	switch opt {
	case tcpip.MTUDiscoverOption:
		// Return not supported if the value is not disabling path
		// MTU discovery.
		if v != tcpip.PMTUDiscoveryDont {
			return &tcpip.ErrNotSupported{}
		}

	case tcpip.MulticastTTLOption:
		e.mu.Lock()
		e.multicastTTL = uint8(v)
		e.mu.Unlock()

	case tcpip.TTLOption:
		e.mu.Lock()
		e.ttl = uint8(v)
		e.mu.Unlock()

	case tcpip.IPv4TOSOption:
		e.mu.Lock()
		e.sendTOS = uint8(v)
		e.mu.Unlock()

	case tcpip.IPv6TrafficClassOption:
		e.mu.Lock()
		e.sendTOS = uint8(v)
		e.mu.Unlock()
	}

	return nil
}

var _ tcpip.SocketOptionsHandler = (*endpoint)(nil)

// HasNIC implements tcpip.SocketOptionsHandler.
func (e *endpoint) HasNIC(id int32) bool {
	return e.stack.HasNIC(tcpip.NICID(id))
}

// SetSockOpt implements tcpip.Endpoint.
func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { switch v := opt.(type) { case *tcpip.MulticastInterfaceOption: e.mu.Lock() defer e.mu.Unlock() fa := tcpip.FullAddress{Addr: v.InterfaceAddr} fa, netProto, err := e.checkV4MappedLocked(fa) if err != nil { return err } nic := v.NIC addr := fa.Addr if nic == 0 && addr == "" { e.multicastAddr = "" e.multicastNICID = 0 break } if nic != 0 { if !e.stack.CheckNIC(nic) { return &tcpip.ErrBadLocalAddress{} } } else { nic = e.stack.CheckLocalAddress(0, netProto, addr) if nic == 0 { return &tcpip.ErrBadLocalAddress{} } } if e.BindNICID != 0 && e.BindNICID != nic { return &tcpip.ErrInvalidEndpointState{} } e.multicastNICID = nic e.multicastAddr = addr case *tcpip.AddMembershipOption: if !header.IsV4MulticastAddress(v.MulticastAddr) && !header.IsV6MulticastAddress(v.MulticastAddr) { return &tcpip.ErrInvalidOptionValue{} } nicID := v.NIC if v.InterfaceAddr.Unspecified() { if nicID == 0 { if r, err := e.stack.FindRoute(0, "", v.MulticastAddr, e.NetProto, false /* multicastLoop */); err == nil { nicID = r.NICID() r.Release() } } } else { nicID = e.stack.CheckLocalAddress(nicID, e.NetProto, v.InterfaceAddr) } if nicID == 0 { return &tcpip.ErrUnknownDevice{} } memToInsert := multicastMembership{nicID: nicID, multicastAddr: v.MulticastAddr} e.mu.Lock() defer e.mu.Unlock() if _, ok := e.multicastMemberships[memToInsert]; ok { return &tcpip.ErrPortInUse{} } if err := e.stack.JoinGroup(e.NetProto, nicID, v.MulticastAddr); err != nil { return err } e.multicastMemberships[memToInsert] = struct{}{} case *tcpip.RemoveMembershipOption: if !header.IsV4MulticastAddress(v.MulticastAddr) && !header.IsV6MulticastAddress(v.MulticastAddr) { return &tcpip.ErrInvalidOptionValue{} } nicID := v.NIC if v.InterfaceAddr.Unspecified() { if nicID == 0 { if r, err := e.stack.FindRoute(0, "", v.MulticastAddr, e.NetProto, false /* multicastLoop */); err == nil { nicID = r.NICID() r.Release() } } } else { nicID = e.stack.CheckLocalAddress(nicID, e.NetProto, v.InterfaceAddr) } if nicID == 0 { return &tcpip.ErrUnknownDevice{} } memToRemove := multicastMembership{nicID: nicID, multicastAddr: v.MulticastAddr} e.mu.Lock() defer e.mu.Unlock() if _, ok := e.multicastMemberships[memToRemove]; !ok { return &tcpip.ErrBadLocalAddress{} } if err := e.stack.LeaveGroup(e.NetProto, nicID, v.MulticastAddr); err != nil { return err } delete(e.multicastMemberships, memToRemove) case *tcpip.SocketDetachFilterOption: return nil } return nil } // GetSockOptInt implements tcpip.Endpoint. func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.IPv4TOSOption: e.mu.RLock() v := int(e.sendTOS) e.mu.RUnlock() return v, nil case tcpip.IPv6TrafficClassOption: e.mu.RLock() v := int(e.sendTOS) e.mu.RUnlock() return v, nil case tcpip.MTUDiscoverOption: // The only supported setting is path MTU discovery disabled. return tcpip.PMTUDiscoveryDont, nil case tcpip.MulticastTTLOption: e.mu.Lock() v := int(e.multicastTTL) e.mu.Unlock() return v, nil case tcpip.ReceiveQueueSizeOption: v := 0 e.rcvMu.Lock() if !e.rcvList.Empty() { p := e.rcvList.Front() v = p.data.Size() } e.rcvMu.Unlock() return v, nil case tcpip.TTLOption: e.mu.Lock() v := int(e.ttl) e.mu.Unlock() return v, nil default: return -1, &tcpip.ErrUnknownProtocolOption{} } } // GetSockOpt implements tcpip.Endpoint. 
func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { switch o := opt.(type) { case *tcpip.MulticastInterfaceOption: e.mu.Lock() *o = tcpip.MulticastInterfaceOption{ NIC: e.multicastNICID, InterfaceAddr: e.multicastAddr, } e.mu.Unlock() default: return &tcpip.ErrUnknownProtocolOption{} } return nil } // udpPacketInfo contains all information required to send a UDP packet. // // This should be used as a value-only type, which exists in order to simplify // return value syntax. It should not be exported or extended. type udpPacketInfo struct { route *stack.Route data buffer.View localPort uint16 remotePort uint16 ttl uint8 useDefaultTTL bool tos uint8 owner tcpip.PacketOwner noChecksum bool } // send sends the given packet. func (u *udpPacketInfo) send() (int, tcpip.Error) { vv := u.data.ToVectorisedView() pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: header.UDPMinimumSize + int(u.route.MaxHeaderLength()), Data: vv, }) pkt.Owner = u.owner // Initialize the UDP header. udp := header.UDP(pkt.TransportHeader().Push(header.UDPMinimumSize)) pkt.TransportProtocolNumber = ProtocolNumber length := uint16(pkt.Size()) udp.Encode(&header.UDPFields{ SrcPort: u.localPort, DstPort: u.remotePort, Length: length, }) // Set the checksum field unless TX checksum offload is enabled. // On IPv4, UDP checksum is optional, and a zero value indicates the // transmitter skipped the checksum generation (RFC768). // On IPv6, UDP checksum is not optional (RFC2460 Section 8.1). if u.route.RequiresTXTransportChecksum() && (!u.noChecksum || u.route.NetProto() == header.IPv6ProtocolNumber) { xsum := u.route.PseudoHeaderChecksum(ProtocolNumber, length) for _, v := range vv.Views() { xsum = header.Checksum(v, xsum) } udp.SetChecksum(^udp.CalculateChecksum(xsum)) } if u.useDefaultTTL { u.ttl = u.route.DefaultTTL() } if err := u.route.WritePacket(stack.NetworkHeaderParams{ Protocol: ProtocolNumber, TTL: u.ttl, TOS: u.tos, }, pkt); err != nil { u.route.Stats().UDP.PacketSendErrors.Increment() return 0, err } // Track count of packets sent. u.route.Stats().UDP.PacketsSent.Increment() return len(u.data), nil } // checkV4MappedLocked determines the effective network protocol and converts // addr to its canonical form. func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only()) if err != nil { return tcpip.FullAddress{}, 0, err } return unwrapped, netProto, nil } // Disconnect implements tcpip.Endpoint. func (e *endpoint) Disconnect() tcpip.Error { e.mu.Lock() defer e.mu.Unlock() if e.EndpointState() != StateConnected { return nil } var ( id stack.TransportEndpointID btd tcpip.NICID ) // We change this value below and we need the old value to unregister // the endpoint. boundPortFlags := e.boundPortFlags // Exclude ephemerally bound endpoints. if e.BindNICID != 0 || e.ID.LocalAddress == "" { var err tcpip.Error id = stack.TransportEndpointID{ LocalPort: e.ID.LocalPort, LocalAddress: e.ID.LocalAddress, } id, btd, err = e.registerWithStack(e.effectiveNetProtos, id) if err != nil { return err } e.setEndpointState(StateBound) boundPortFlags = e.boundPortFlags } else { if e.ID.LocalPort != 0 { // Release the ephemeral port. 
portRes := ports.Reservation{ Networks: e.effectiveNetProtos, Transport: ProtocolNumber, Addr: e.ID.LocalAddress, Port: e.ID.LocalPort, Flags: boundPortFlags, BindToDevice: e.boundBindToDevice, Dest: tcpip.FullAddress{}, } e.stack.ReleasePort(portRes) e.boundPortFlags = ports.Flags{} } e.setEndpointState(StateInitial) } e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.ID, e, boundPortFlags, e.boundBindToDevice) e.ID = id e.boundBindToDevice = btd e.route.Release() e.route = nil e.dstPort = 0 return nil } // Connect connects the endpoint to its peer. Specifying a NIC is optional. func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() nicID := addr.NIC var localPort uint16 switch e.EndpointState() { case StateInitial: case StateBound, StateConnected: localPort = e.ID.LocalPort if e.BindNICID == 0 { break } if nicID != 0 && nicID != e.BindNICID { return &tcpip.ErrInvalidEndpointState{} } nicID = e.BindNICID default: return &tcpip.ErrInvalidEndpointState{} } addr, netProto, err := e.checkV4MappedLocked(addr) if err != nil { return err } r, nicID, err := e.connectRoute(nicID, addr, netProto) if err != nil { return err } id := stack.TransportEndpointID{ LocalAddress: e.ID.LocalAddress, LocalPort: localPort, RemotePort: addr.Port, RemoteAddress: r.RemoteAddress(), } if e.EndpointState() == StateInitial { id.LocalAddress = r.LocalAddress() } // Even if we're connected, this endpoint can still be used to send // packets on a different network protocol, so we register both even if // v6only is set to false and this is an ipv6 endpoint. netProtos := []tcpip.NetworkProtocolNumber{netProto} if netProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() { netProtos = []tcpip.NetworkProtocolNumber{ header.IPv4ProtocolNumber, header.IPv6ProtocolNumber, } } oldPortFlags := e.boundPortFlags id, btd, err := e.registerWithStack(netProtos, id) if err != nil { r.Release() return err } // Remove the old registration. if e.ID.LocalPort != 0 { e.stack.UnregisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.ID, e, oldPortFlags, e.boundBindToDevice) } e.ID = id e.boundBindToDevice = btd if e.route != nil { // If the endpoint was already connected then make sure we release the // previous route. e.route.Release() } e.route = r e.dstPort = addr.Port e.RegisterNICID = nicID e.effectiveNetProtos = netProtos e.setEndpointState(StateConnected) e.rcvMu.Lock() e.rcvReady = true e.rcvMu.Unlock() return nil } // ConnectEndpoint is not supported. func (*endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error { return &tcpip.ErrInvalidEndpointState{} } // Shutdown closes the read and/or write end of the endpoint connection // to its peer. func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() // A socket in the bound state can still receive multicast messages, // so we need to notify waiters on shutdown. if state := e.EndpointState(); state != StateBound && state != StateConnected { return &tcpip.ErrNotConnected{} } e.shutdownFlags |= flags if flags&tcpip.ShutdownRead != 0 { e.rcvMu.Lock() wasClosed := e.rcvClosed e.rcvClosed = true e.rcvMu.Unlock() if !wasClosed { e.waiterQueue.Notify(waiter.ReadableEvents) } } return nil } // Listen is not supported by UDP, it just fails. func (*endpoint) Listen(int) tcpip.Error { return &tcpip.ErrNotSupported{} } // Accept is not supported by UDP, it just fails. 
func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { return nil, nil, &tcpip.ErrNotSupported{} } func (e *endpoint) registerWithStack(netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, tcpip.Error) { bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) if e.ID.LocalPort == 0 { portRes := ports.Reservation{ Networks: netProtos, Transport: ProtocolNumber, Addr: id.LocalAddress, Port: id.LocalPort, Flags: e.portFlags, BindToDevice: bindToDevice, Dest: tcpip.FullAddress{}, } port, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */) if err != nil { return id, bindToDevice, err } id.LocalPort = port } e.boundPortFlags = e.portFlags err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.boundPortFlags, bindToDevice) if err != nil { portRes := ports.Reservation{ Networks: netProtos, Transport: ProtocolNumber, Addr: id.LocalAddress, Port: id.LocalPort, Flags: e.boundPortFlags, BindToDevice: bindToDevice, Dest: tcpip.FullAddress{}, } e.stack.ReleasePort(portRes) e.boundPortFlags = ports.Flags{} } return id, bindToDevice, err } func (e *endpoint) bindLocked(addr tcpip.FullAddress) tcpip.Error { // Don't allow binding once endpoint is not in the initial state // anymore. if e.EndpointState() != StateInitial { return &tcpip.ErrInvalidEndpointState{} } addr, netProto, err := e.checkV4MappedLocked(addr) if err != nil { return err } // Expand netProtos to include v4 and v6 if the caller is binding to a // wildcard (empty) address, and this is an IPv6 endpoint with v6only // set to false. netProtos := []tcpip.NetworkProtocolNumber{netProto} if netProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() && addr.Addr == "" { netProtos = []tcpip.NetworkProtocolNumber{ header.IPv6ProtocolNumber, header.IPv4ProtocolNumber, } } nicID := addr.NIC if len(addr.Addr) != 0 && !e.isBroadcastOrMulticast(addr.NIC, netProto, addr.Addr) { // A local unicast address was specified, verify that it's valid. nicID = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) if nicID == 0 { return &tcpip.ErrBadLocalAddress{} } } id := stack.TransportEndpointID{ LocalPort: addr.Port, LocalAddress: addr.Addr, } id, btd, err := e.registerWithStack(netProtos, id) if err != nil { return err } e.ID = id e.boundBindToDevice = btd e.RegisterNICID = nicID e.effectiveNetProtos = netProtos // Mark endpoint as bound. e.setEndpointState(StateBound) e.rcvMu.Lock() e.rcvReady = true e.rcvMu.Unlock() return nil } // Bind binds the endpoint to a specific local address and port. // Specifying a NIC is optional. func (e *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() err := e.bindLocked(addr) if err != nil { return err } // Save the effective NICID generated by bindLocked. e.BindNICID = e.RegisterNICID return nil } // GetLocalAddress returns the address to which the endpoint is bound. func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() addr := e.ID.LocalAddress if e.EndpointState() == StateConnected { addr = e.route.LocalAddress() } return tcpip.FullAddress{ NIC: e.RegisterNICID, Addr: addr, Port: e.ID.LocalPort, }, nil } // GetRemoteAddress returns the address to which the endpoint is connected. 
func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
	e.mu.RLock()
	defer e.mu.RUnlock()

	if e.EndpointState() != StateConnected || e.dstPort == 0 {
		return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
	}

	return tcpip.FullAddress{
		NIC:  e.RegisterNICID,
		Addr: e.ID.RemoteAddress,
		Port: e.ID.RemotePort,
	}, nil
}

// Readiness returns the current readiness of the endpoint. For example, if
// waiter.EventIn is set, the endpoint is immediately readable.
func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
	// The endpoint is always writable.
	result := waiter.WritableEvents & mask

	// Determine if the endpoint is readable if requested.
	if mask&waiter.ReadableEvents != 0 {
		e.rcvMu.Lock()
		if !e.rcvList.Empty() || e.rcvClosed {
			result |= waiter.ReadableEvents
		}
		e.rcvMu.Unlock()
	}

	e.lastErrorMu.Lock()
	hasError := e.lastError != nil
	e.lastErrorMu.Unlock()
	if hasError {
		result |= waiter.EventErr
	}
	return result
}

// verifyChecksum verifies the checksum unless RX checksum offload is enabled.
func verifyChecksum(hdr header.UDP, pkt *stack.PacketBuffer) bool {
	if pkt.RXTransportChecksumValidated {
		return true
	}

	// On IPv4, UDP checksum is optional, and a zero value means the
	// transmitter omitted the checksum generation, as per RFC 768:
	//
	//   An all zero transmitted checksum value means that the transmitter
	//   generated no checksum (for debugging or for higher level protocols
	//   that don't care).
	//
	// On IPv6, UDP checksum is not optional, as per RFC 2460 Section 8.1:
	//
	//   Unlike IPv4, when UDP packets are originated by an IPv6 node, the UDP
	//   checksum is not optional.
	if pkt.NetworkProtocolNumber == header.IPv4ProtocolNumber && hdr.Checksum() == 0 {
		return true
	}

	netHdr := pkt.Network()
	payloadChecksum := pkt.Data().AsRange().Checksum()
	return hdr.IsChecksumValid(netHdr.SourceAddress(), netHdr.DestinationAddress(), payloadChecksum)
}

// HandlePacket is called by the stack when new packets arrive at this
// transport endpoint.
func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
	// Get the UDP header; the payload is left in pkt.Data().
	hdr := header.UDP(pkt.TransportHeader().View())
	if int(hdr.Length()) > pkt.Data().Size()+header.UDPMinimumSize {
		// Malformed packet.
		e.stack.Stats().UDP.MalformedPacketsReceived.Increment()
		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
		return
	}

	if !verifyChecksum(hdr, pkt) {
		e.stack.Stats().UDP.ChecksumErrors.Increment()
		e.stats.ReceiveErrors.ChecksumErrors.Increment()
		return
	}

	e.stack.Stats().UDP.PacketsReceived.Increment()
	e.stats.PacketsReceived.Increment()

	e.rcvMu.Lock()
	// Drop the packet if our buffer is not ready to receive or is closed.
	if !e.rcvReady || e.rcvClosed {
		e.rcvMu.Unlock()
		e.stack.Stats().UDP.ReceiveBufferErrors.Increment()
		e.stats.ReceiveErrors.ClosedReceiver.Increment()
		return
	}

	// Drop the packet if our buffer is currently full.
	rcvBufSize := e.ops.GetReceiveBufferSize()
	if e.frozen || e.rcvBufSize >= int(rcvBufSize) {
		e.rcvMu.Unlock()
		e.stack.Stats().UDP.ReceiveBufferErrors.Increment()
		e.stats.ReceiveErrors.ReceiveBufferOverflow.Increment()
		return
	}

	wasEmpty := e.rcvBufSize == 0

	// Push new packet into receive list and increment the buffer size.
	packet := &udpPacket{
		senderAddress: tcpip.FullAddress{
			NIC:  pkt.NICID,
			Addr: id.RemoteAddress,
			Port: hdr.SourcePort(),
		},
		destinationAddress: tcpip.FullAddress{
			NIC:  pkt.NICID,
			Addr: id.LocalAddress,
			Port: hdr.DestinationPort(),
		},
		data: pkt.Data().ExtractVV(),
	}
	e.rcvList.PushBack(packet)
	e.rcvBufSize += packet.data.Size()

	// Save any useful information from the network header to the packet.
switch pkt.NetworkProtocolNumber { case header.IPv4ProtocolNumber: packet.tos, _ = header.IPv4(pkt.NetworkHeader().View()).TOS() case header.IPv6ProtocolNumber: packet.tos, _ = header.IPv6(pkt.NetworkHeader().View()).TOS() } // TODO(gvisor.dev/issue/3556): r.LocalAddress may be a multicast or broadcast // address. packetInfo.LocalAddr should hold a unicast address that can be // used to respond to the incoming packet. localAddr := pkt.Network().DestinationAddress() packet.packetInfo.LocalAddr = localAddr packet.packetInfo.DestinationAddr = localAddr packet.packetInfo.NIC = pkt.NICID packet.receivedAt = e.stack.Clock().Now() e.rcvMu.Unlock() // Notify any waiters that there's data to be read now. if wasEmpty { e.waiterQueue.Notify(waiter.ReadableEvents) } } func (e *endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) { // Update last error first. e.lastErrorMu.Lock() e.lastError = err e.lastErrorMu.Unlock() // Update the error queue if IP_RECVERR is enabled. if e.SocketOptions().GetRecvError() { // Linux passes the payload without the UDP header. var payload []byte udp := header.UDP(pkt.Data().AsRange().ToOwnedView()) if len(udp) >= header.UDPMinimumSize { payload = udp.Payload() } e.SocketOptions().QueueErr(&tcpip.SockError{ Err: err, Cause: transErr, Payload: payload, Dst: tcpip.FullAddress{ NIC: pkt.NICID, Addr: e.ID.RemoteAddress, Port: e.ID.RemotePort, }, Offender: tcpip.FullAddress{ NIC: pkt.NICID, Addr: e.ID.LocalAddress, Port: e.ID.LocalPort, }, NetProto: pkt.NetworkProtocolNumber, }) } // Notify of the error. e.waiterQueue.Notify(waiter.EventErr) } // HandleError implements stack.TransportEndpoint. func (e *endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) { // TODO(gvisor.dev/issues/5270): Handle all transport errors. switch transErr.Kind() { case stack.DestinationPortUnreachableTransportError: if e.EndpointState() == StateConnected { e.onICMPError(&tcpip.ErrConnectionRefused{}, transErr, pkt) } } } // State implements tcpip.Endpoint. func (e *endpoint) State() uint32 { return uint32(e.EndpointState()) } // Info returns a copy of the endpoint info. func (e *endpoint) Info() tcpip.EndpointInfo { e.mu.RLock() // Make a copy of the endpoint info. ret := e.TransportEndpointInfo e.mu.RUnlock() return &ret } // Stats returns a pointer to the endpoint stats. func (e *endpoint) Stats() tcpip.EndpointStats { return &e.stats } // Wait implements tcpip.Endpoint. func (*endpoint) Wait() {} func (e *endpoint) isBroadcastOrMulticast(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { return addr == header.IPv4Broadcast || header.IsV4MulticastAddress(addr) || header.IsV6MulticastAddress(addr) || e.stack.IsSubnetBroadcast(nicID, netProto, addr) } // SetOwner implements tcpip.Endpoint. func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } // SocketOptions implements tcpip.Endpoint. func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } // freeze prevents any more packets from being delivered to the endpoint. func (e *endpoint) freeze() { e.mu.Lock() e.frozen = true e.mu.Unlock() } // thaw unfreezes a previously frozen endpoint using endpoint.freeze() allows // new packets to be delivered again. func (e *endpoint) thaw() { e.mu.Lock() e.frozen = false e.mu.Unlock() }
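// A standalone sketch (not from the gVisor tree) of the checksum rule that
// verifyChecksum enforces: the UDP checksum is the RFC 1071 ones'-complement
// sum over an IPv4 pseudo-header plus the UDP header and payload. Helper
// names here are illustrative only.
package main

import (
    "encoding/binary"
    "fmt"
)

// onesComplementSum folds data into a 16-bit ones'-complement running sum.
func onesComplementSum(sum uint32, data []byte) uint32 {
    for len(data) >= 2 {
        sum += uint32(binary.BigEndian.Uint16(data))
        data = data[2:]
    }
    if len(data) == 1 {
        sum += uint32(data[0]) << 8 // an odd trailing byte is zero-padded
    }
    for sum > 0xffff {
        sum = (sum & 0xffff) + (sum >> 16)
    }
    return sum
}

// udpChecksum checksums the IPv4 pseudo-header (src, dst, zero, protocol 17,
// UDP length) followed by the UDP header (checksum field zeroed) and payload.
func udpChecksum(src, dst [4]byte, udp []byte) uint16 {
    var pseudo [12]byte
    copy(pseudo[0:4], src[:])
    copy(pseudo[4:8], dst[:])
    pseudo[9] = 17 // IPPROTO_UDP
    binary.BigEndian.PutUint16(pseudo[10:12], uint16(len(udp)))
    sum := onesComplementSum(0, pseudo[:])
    sum = onesComplementSum(sum, udp)
    if c := ^uint16(sum); c != 0 {
        return c
    }
    return 0xffff // RFC 768: a computed zero is transmitted as all ones
}

func main() {
    // 8-byte UDP header (ports 12345 -> 53, length 12, checksum 0) + payload.
    udp := []byte{0x30, 0x39, 0x00, 0x35, 0x00, 0x0c, 0x00, 0x00, 'p', 'i', 'n', 'g'}
    fmt.Printf("checksum: %#04x\n", udpChecksum([4]byte{10, 0, 0, 1}, [4]byte{10, 0, 0, 2}, udp))
}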
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64

package linux

import (
    "gvisor.dev/gvisor/pkg/abi/linux"
    "gvisor.dev/gvisor/pkg/errors/linuxerr"
    "gvisor.dev/gvisor/pkg/marshal/primitive"
    "gvisor.dev/gvisor/pkg/sentry/arch"
    "gvisor.dev/gvisor/pkg/sentry/kernel"
    "gvisor.dev/gvisor/pkg/syserror"
)

// ArchPrctl implements linux syscall arch_prctl(2).
// It sets architecture-specific process or thread state for t.
func ArchPrctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
    switch args[0].Int() {
    case linux.ARCH_GET_FS:
        addr := args[1].Pointer()
        fsbase := t.Arch().TLS()
        switch t.Arch().Width() {
        case 8:
            if _, err := primitive.CopyUint64Out(t, addr, uint64(fsbase)); err != nil {
                return 0, nil, err
            }
        default:
            return 0, nil, syserror.ENOSYS
        }
    case linux.ARCH_SET_FS:
        fsbase := args[1].Uint64()
        if !t.Arch().SetTLS(uintptr(fsbase)) {
            return 0, nil, linuxerr.EPERM
        }
    case linux.ARCH_GET_GS, linux.ARCH_SET_GS:
        t.Kernel().EmitUnimplementedEvent(t)
        fallthrough
    default:
        return 0, nil, linuxerr.EINVAL
    }

    return 0, nil, nil
}
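// What the guest sees: a hedged userspace sketch that reads the FS base via
// arch_prctl(ARCH_GET_FS) on linux/amd64. The constant is copied from Linux's
// asm/prctl.h; ARCH_SET_FS is deliberately not demonstrated, since the Go
// runtime owns the FS-based TLS.
package main

import (
    "fmt"
    "syscall"
    "unsafe"
)

const _ARCH_GET_FS = 0x1003 // from Linux's asm/prctl.h

func main() {
    var fsbase uint64
    _, _, errno := syscall.Syscall(syscall.SYS_ARCH_PRCTL, _ARCH_GET_FS,
        uintptr(unsafe.Pointer(&fsbase)), 0)
    if errno != 0 {
        fmt.Println("arch_prctl:", errno)
        return
    }
    // On linux/amd64 this is the TLS base the runtime set up for us.
    fmt.Printf("FS base: %#x\n", fsbase)
}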
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fsutil

import (
    "unsafe"

    "gvisor.dev/gvisor/pkg/safemem"
)

func (*HostFileMapper) unsafeBlockFromChunkMapping(addr uintptr) safemem.Block {
    // We don't control the host file's length, so touching its mappings may
    // raise SIGBUS. Thus accesses to it must use safecopy.
    return safemem.BlockFromUnsafePointer((unsafe.Pointer)(addr), chunkSize)
}
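// Why the comment above insists on safecopy: with a shared file mapping, the
// file's owner can shrink it at any time, turning mapped pages into SIGBUS
// traps. A standalone demonstration of that setup (using golang.org/x/sys/unix
// rather than gVisor's safemem/safecopy); it stops short of the faulting
// access.
package main

import (
    "fmt"
    "os"

    "golang.org/x/sys/unix"
)

func main() {
    f, err := os.CreateTemp("", "sigbus-demo")
    if err != nil {
        panic(err)
    }
    defer os.Remove(f.Name())
    defer f.Close()

    if err := f.Truncate(4096); err != nil {
        panic(err)
    }
    m, err := unix.Mmap(int(f.Fd()), 0, 4096, unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED)
    if err != nil {
        panic(err)
    }
    defer unix.Munmap(m)

    m[0] = 1 // fine: offset 0 is currently backed by the file

    // Anyone with the file open can shrink it underneath the mapping:
    if err := f.Truncate(0); err != nil {
        panic(err)
    }
    fmt.Println("m[0] is no longer file-backed; touching it now would raise SIGBUS")
}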
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fs

import (
    "math"

    "gvisor.dev/gvisor/pkg/hostarch"
)

// OffsetPageEnd returns the file offset rounded up to the nearest
// page boundary. OffsetPageEnd panics if rounding up causes overflow,
// which shouldn't be possible given that offset is an int64.
func OffsetPageEnd(offset int64) uint64 {
    end, ok := hostarch.Addr(offset).RoundUp()
    if !ok {
        panic("impossible overflow")
    }
    return uint64(end)
}

// ReadEndOffset returns an exclusive end offset for a read operation
// so that the read neither overflows an int64 nor extends past size.
//
// Parameters:
// - offset: the starting offset of the read.
// - length: the number of bytes to read.
// - size: the size of the file.
//
// Postconditions: The returned offset is >= offset.
func ReadEndOffset(offset int64, length int64, size int64) int64 {
    if offset >= size {
        return offset
    }
    end := offset + length
    // Don't overflow.
    if end < offset || end > size {
        end = size
    }
    return end
}

// WriteEndOffset returns an exclusive end offset for a write operation
// so that the write does not overflow an int64.
//
// Parameters:
// - offset: the starting offset of the write.
// - length: the number of bytes to write.
//
// Postconditions: The returned offset is >= offset.
func WriteEndOffset(offset int64, length int64) int64 {
    return ReadEndOffset(offset, length, math.MaxInt64)
}
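// A small usage sketch of ReadEndOffset's clamping (the helper body is
// duplicated so the demo runs standalone; the numbers are arbitrary).
package main

import (
    "fmt"
    "math"
)

// ReadEndOffset is copied from the helper above.
func ReadEndOffset(offset, length, size int64) int64 {
    if offset >= size {
        return offset
    }
    end := offset + length
    if end < offset || end > size { // overflow, or read past EOF
        end = size
    }
    return end
}

func main() {
    fmt.Println(ReadEndOffset(10, 30, 50))            // 40: read fits entirely
    fmt.Println(ReadEndOffset(10, 100, 50))           // 50: clamped to the file size
    fmt.Println(ReadEndOffset(60, 10, 50))            // 60: offset already past EOF
    fmt.Println(ReadEndOffset(10, math.MaxInt64, 50)) // 50: offset+length wraps negative
}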
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package netstack

import (
    "gvisor.dev/gvisor/pkg/abi/linux"
    "gvisor.dev/gvisor/pkg/sentry/kernel"
    "gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    "gvisor.dev/gvisor/pkg/sentry/socket"
    "gvisor.dev/gvisor/pkg/sentry/vfs"
    "gvisor.dev/gvisor/pkg/syserr"
    "gvisor.dev/gvisor/pkg/tcpip"
    "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
    "gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
    "gvisor.dev/gvisor/pkg/waiter"
)

// providerVFS2 is an inet socket provider.
type providerVFS2 struct {
    family   int
    netProto tcpip.NetworkProtocolNumber
}

// Socket creates a new socket object for the AF_INET, AF_INET6, or AF_PACKET
// family.
func (p *providerVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
    // Fail right away if we don't have a stack.
    stack := t.NetworkContext()
    if stack == nil {
        // Don't propagate an error here. Instead, allow the socket
        // code to continue searching for another provider.
        return nil, nil
    }
    eps, ok := stack.(*Stack)
    if !ok {
        return nil, nil
    }

    // Packet sockets are handled separately, since they are neither INET
    // nor INET6 specific.
    if p.family == linux.AF_PACKET {
        return packetSocketVFS2(t, eps, stype, protocol)
    }

    // Figure out the transport protocol.
    transProto, associated, err := getTransportProtocol(t, stype, protocol)
    if err != nil {
        return nil, err
    }

    // Create the endpoint.
    var ep tcpip.Endpoint
    var e tcpip.Error
    wq := &waiter.Queue{}
    if stype == linux.SOCK_RAW {
        ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated)
    } else {
        ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq)

        // Assign task to PacketOwner interface to get the UID and GID for
        // iptables owner matching.
        if e == nil {
            ep.SetOwner(t)
        }
    }
    if e != nil {
        return nil, syserr.TranslateNetstackError(e)
    }

    return NewVFS2(t, p.family, stype, int(transProto), wq, ep)
}

func packetSocketVFS2(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
    // Packet sockets require CAP_NET_RAW.
    creds := auth.CredentialsFromContext(t)
    if !creds.HasCapability(linux.CAP_NET_RAW) {
        return nil, syserr.ErrNotPermitted
    }

    // "cooked" packets don't contain link layer information.
    var cooked bool
    switch stype {
    case linux.SOCK_DGRAM:
        cooked = true
    case linux.SOCK_RAW:
        cooked = false
    default:
        return nil, syserr.ErrProtocolNotSupported
    }

    // protocol is passed in network byte order, but netstack wants it in
    // host order.
    netProto := tcpip.NetworkProtocolNumber(socket.Ntohs(uint16(protocol)))

    wq := &waiter.Queue{}
    ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq)
    if err != nil {
        return nil, syserr.TranslateNetstackError(err)
    }
    return NewVFS2(t, linux.AF_PACKET, stype, protocol, wq, ep)
}

// Pair just returns nil sockets (not supported).
func (*providerVFS2) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) {
    return nil, nil, nil
}

// init registers socket providers for AF_INET, AF_INET6, and AF_PACKET.
func init() {
    // Providers backed by netstack.
    p := []providerVFS2{
        {
            family:   linux.AF_INET,
            netProto: ipv4.ProtocolNumber,
        },
        {
            family:   linux.AF_INET6,
            netProto: ipv6.ProtocolNumber,
        },
        {
            family: linux.AF_PACKET,
        },
    }

    for i := range p {
        socket.RegisterProviderVFS2(p[i].family, &p[i])
    }
}
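// A sketch of the byte-order dance around socket.Ntohs above: packet(7)
// callers pass the protocol htons'd, and it must be swapped back before
// netstack sees it. htons/ntohs here are local helpers assuming a
// little-endian host, not netstack APIs.
package main

import "fmt"

const _ETH_P_IP = 0x0800 // from Linux's if_ether.h

// On a little-endian host, host<->network conversion of a uint16 is a byte
// swap in both directions (the same involution).
func htons(v uint16) uint16 { return v<<8 | v>>8 }

func ntohs(v uint16) uint16 { return v<<8 | v>>8 }

func main() {
    wire := htons(_ETH_P_IP) // what userspace passes to socket(AF_PACKET, ..., proto)
    fmt.Printf("network order: %#04x\n", wire)        // 0x0008
    fmt.Printf("host order:    %#04x\n", ntohs(wire)) // 0x0800
}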
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package time

import (
    "fmt"
    "time"

    "gvisor.dev/gvisor/pkg/log"
)

const (
    // ApproxUpdateInterval is the approximate interval that parameters
    // should be updated at.
    //
    // Error correction assumes that the next update will occur after this
    // much time.
    //
    // If an update occurs before ApproxUpdateInterval passes, it has no
    // adverse effect on error correction behavior.
    //
    // If an update occurs after ApproxUpdateInterval passes, the clock
    // will overshoot its error correction target and begin accumulating
    // error in the other direction.
    //
    // If updates occur after more than 2*ApproxUpdateInterval passes, the
    // clock becomes unstable, accumulating more error than it had
    // originally. Repeated updates after more than 2*ApproxUpdateInterval
    // will cause unbounded increases in error.
    //
    // These statements assume that the host clock does not change. Actual
    // error will depend upon host clock changes.
    //
    // TODO(b/68779214): make error correction more robust to delayed
    // updates.
    ApproxUpdateInterval = 1 * time.Second

    // MaxClockError is the maximum amount of error that the clocks will
    // try to correct.
    //
    // This limit:
    //
    // * Puts a limit on cases of otherwise unbounded increases in error.
    //
    // * Avoids unreasonably large frequency adjustments required to
    //   correct large errors over a single update interval.
    MaxClockError = ReferenceNS(ApproxUpdateInterval) / 4
)

// Parameters are the timekeeping parameters needed to compute the current
// time.
type Parameters struct {
    // BaseCycles was the TSC counter value when the time was BaseRef.
    BaseCycles TSCValue

    // BaseRef is the reference clock time in nanoseconds corresponding to
    // BaseCycles.
    BaseRef ReferenceNS

    // Frequency is the frequency of the cycle clock in Hertz.
    Frequency uint64
}

// muldiv64 multiplies two 64-bit numbers, then divides the result by another
// 64-bit number.
//
// It requires that the result fit in 64 bits, but doesn't require that
// intermediate values do; in particular, the result of the multiplication may
// require 128 bits.
//
// It returns !ok if divisor is zero or the result does not fit in 64 bits.
func muldiv64(value, multiplier, divisor uint64) (uint64, bool)

// ComputeTime calculates the current time from a "now" TSC value.
//
// time = ref + (now - base) / f
func (p Parameters) ComputeTime(nowCycles TSCValue) (int64, bool) {
    diffCycles := nowCycles - p.BaseCycles
    if diffCycles < 0 {
        log.Warningf("now cycles %v < base cycles %v", nowCycles, p.BaseCycles)
        diffCycles = 0
    }

    // Overflow "won't ever happen". If diffCycles is the max value
    // (2^63 - 1), then to overflow,
    //
    // frequency <= ((2^63 - 1) * 10^9) / 2^64 = 500MHz
    //
    // A TSC running at 2GHz takes 201 years to reach 2^63-1. 805 years at
    // 500MHz.
    diffNS, ok := muldiv64(uint64(diffCycles), uint64(time.Second.Nanoseconds()), p.Frequency)
    return int64(uint64(p.BaseRef) + diffNS), ok
}

// errorAdjust returns a new Parameters struct "adjusted" that satisfies:
//
// 1. adjusted.ComputeTime(now) = prevParams.ComputeTime(now)
//    * i.e., the current time does not jump.
//
// 2. adjusted.ComputeTime(TSC at next update) = newParams.ComputeTime(TSC at next update)
//    * i.e., any error between prevParams and newParams will be corrected over
//      the course of the next update period.
//
// errorAdjust also returns the current clock error.
//
// Preconditions:
// * newParams.BaseCycles >= prevParams.BaseCycles; i.e., TSC must not go
//   backwards.
// * newParams.BaseCycles <= now; i.e., the new parameters must be computed at
//   or before now.
func errorAdjust(prevParams Parameters, newParams Parameters, now TSCValue) (Parameters, ReferenceNS, error) {
    if newParams.BaseCycles < prevParams.BaseCycles {
        // Oh dear! Something is very wrong.
        return Parameters{}, 0, fmt.Errorf("TSC went backwards in updated clock params: %v < %v", newParams.BaseCycles, prevParams.BaseCycles)
    }
    if newParams.BaseCycles > now {
        return Parameters{}, 0, fmt.Errorf("parameters contain base cycles later than now: %v > %v", newParams.BaseCycles, now)
    }

    intervalNS := int64(ApproxUpdateInterval.Nanoseconds())
    nsPerSec := uint64(time.Second.Nanoseconds())

    // Current time as computed by prevParams.
    oldNowNS, ok := prevParams.ComputeTime(now)
    if !ok {
        return Parameters{}, 0, fmt.Errorf("old now time computation overflowed. params = %+v, now = %v", prevParams, now)
    }

    // We expect the update ticker to run based on this clock (i.e., it has
    // been using prevParams and will use the returned adjusted
    // parameters). Hence it will decide to fire intervalNS from the
    // current (oldNowNS) "now".
    nextNS := oldNowNS + intervalNS

    if nextNS <= int64(newParams.BaseRef) {
        // The next update time already passed before the new
        // parameters were created! We definitely can't correct the
        // error by then.
        return Parameters{}, 0, fmt.Errorf("unable to correct error in single period. oldNowNS = %v, nextNS = %v, p = %v", oldNowNS, nextNS, newParams)
    }

    // For what TSC value next will newParams.ComputeTime(next) = nextNS?
    //
    // Solve ComputeTime for next:
    //
    // next = newParams.Frequency * (nextNS - newParams.BaseRef) / nsPerSec + newParams.BaseCycles
    c, ok := muldiv64(newParams.Frequency, uint64(nextNS-int64(newParams.BaseRef)), nsPerSec)
    if !ok {
        return Parameters{}, 0, fmt.Errorf("%v * (%v - %v) / %v overflows", newParams.Frequency, nextNS, newParams.BaseRef, nsPerSec)
    }

    cycles := TSCValue(c)
    next := cycles + newParams.BaseCycles

    if next <= now {
        // The next update time already passed now with the new
        // parameters! We can't correct the error in a single period.
        return Parameters{}, 0, fmt.Errorf("unable to correct error in single period. oldNowNS = %v, nextNS = %v, now = %v, next = %v", oldNowNS, nextNS, now, next)
    }

    // We want to solve for parameters that satisfy:
    //
    // adjusted.ComputeTime(now) = oldNowNS
    //
    // adjusted.ComputeTime(next) = nextNS
    //
    // i.e., the current time does not change, but by the time we reach
    // next we reach the same time as newParams.

    // We choose to keep BaseCycles fixed.
    adjusted := Parameters{
        BaseCycles: newParams.BaseCycles,
    }

    // We want a slope such that time goes from oldNowNS to nextNS when
    // we reach next.
    //
    // In other words, cycles should increase by next - now in the next
    // interval.

    cycles = next - now
    ns := intervalNS

    // adjusted.Frequency = cycles * nsPerSec / ns
    adjusted.Frequency, ok = muldiv64(uint64(cycles), nsPerSec, uint64(ns))
    if !ok {
        return Parameters{}, 0, fmt.Errorf("(%v - %v) * %v / %v overflows", next, now, nsPerSec, ns)
    }

    // Now choose a base reference such that the current time remains the
    // same. Note that this is just ComputeTime, solving for BaseRef:
    //
    // oldNowNS = BaseRef + (now - BaseCycles) / Frequency
    // BaseRef = oldNowNS - (now - BaseCycles) / Frequency
    diffNS, ok := muldiv64(uint64(now-adjusted.BaseCycles), nsPerSec, adjusted.Frequency)
    if !ok {
        return Parameters{}, 0, fmt.Errorf("(%v - %v) * %v / %v overflows", now, adjusted.BaseCycles, nsPerSec, adjusted.Frequency)
    }

    adjusted.BaseRef = ReferenceNS(oldNowNS - int64(diffNS))

    // The error is the difference between the current time and what the
    // new parameters say the current time should be.
    newNowNS, ok := newParams.ComputeTime(now)
    if !ok {
        return Parameters{}, 0, fmt.Errorf("new now time computation overflowed. params = %+v, now = %v", newParams, now)
    }

    errorNS := ReferenceNS(oldNowNS - newNowNS)

    return adjusted, errorNS, nil
}

// logErrorAdjustment logs the clock error and associated error correction
// frequency adjustment.
//
// The log level is determined by the error severity.
func logErrorAdjustment(clock ClockID, errorNS ReferenceNS, orig, adjusted Parameters) {
    magNS := int64(errorNS.Magnitude())
    if magNS <= 10*time.Microsecond.Nanoseconds() {
        // Don't log small errors.
        return
    }

    fn := log.Infof
    if magNS > time.Millisecond.Nanoseconds() {
        // Upgrade large errors to warning.
        fn = log.Warningf
    }

    fn("Clock(%v): error: %v ns, adjusted frequency from %v Hz to %v Hz", clock, errorNS, orig.Frequency, adjusted.Frequency)
}
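// muldiv64 is declared above without a Go body (it is implemented in
// assembly to keep the 128-bit intermediate product). A portable sketch of
// the same contract using math/bits, as an assumed-equivalent illustration:
package main

import (
    "fmt"
    "math/bits"
)

// muldiv64 computes value * multiplier / divisor with a 128-bit intermediate
// product, returning !ok on divide-by-zero or a quotient over 64 bits.
func muldiv64(value, multiplier, divisor uint64) (uint64, bool) {
    hi, lo := bits.Mul64(value, multiplier)
    if divisor == 0 || hi >= divisor {
        // bits.Div64 panics in exactly these cases.
        return 0, false
    }
    quo, _ := bits.Div64(hi, lo, divisor)
    return quo, true
}

func main() {
    // 4e10 cycles at 3 GHz is ~13.3 seconds; the naive 4e19 product would
    // have overflowed a plain uint64 multiply.
    ns, ok := muldiv64(40_000_000_000, 1_000_000_000, 3_000_000_000)
    fmt.Println(ns, ok) // 13333333333 true
}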
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package gofer

import (
    "sync/atomic"

    "gvisor.dev/gvisor/pkg/sentry/vfs"
)

func dentryTimestampFromP9(s, ns uint64) int64 {
    return int64(s*1e9 + ns)
}

// Preconditions: d.cachedMetadataAuthoritative() == true.
func (d *dentry) touchAtime(mnt *vfs.Mount) {
    if mnt.Flags.NoATime || mnt.ReadOnly() {
        return
    }
    if err := mnt.CheckBeginWrite(); err != nil {
        return
    }
    now := d.fs.clock.Now().Nanoseconds()
    d.metadataMu.Lock()
    atomic.StoreInt64(&d.atime, now)
    atomic.StoreUint32(&d.atimeDirty, 1)
    d.metadataMu.Unlock()
    mnt.EndWrite()
}

// Preconditions: d.metadataMu is locked. d.cachedMetadataAuthoritative() == true.
func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) {
    if mnt.Flags.NoATime || mnt.ReadOnly() {
        return
    }
    if err := mnt.CheckBeginWrite(); err != nil {
        return
    }
    now := d.fs.clock.Now().Nanoseconds()
    atomic.StoreInt64(&d.atime, now)
    atomic.StoreUint32(&d.atimeDirty, 1)
    mnt.EndWrite()
}

// Preconditions:
// * d.cachedMetadataAuthoritative() == true.
// * The caller has successfully called vfs.Mount.CheckBeginWrite().
func (d *dentry) touchCtime() {
    now := d.fs.clock.Now().Nanoseconds()
    d.metadataMu.Lock()
    atomic.StoreInt64(&d.ctime, now)
    d.metadataMu.Unlock()
}

// Preconditions:
// * d.cachedMetadataAuthoritative() == true.
// * The caller has successfully called vfs.Mount.CheckBeginWrite().
func (d *dentry) touchCMtime() {
    now := d.fs.clock.Now().Nanoseconds()
    d.metadataMu.Lock()
    atomic.StoreInt64(&d.mtime, now)
    atomic.StoreInt64(&d.ctime, now)
    atomic.StoreUint32(&d.mtimeDirty, 1)
    d.metadataMu.Unlock()
}

// Preconditions:
// * d.cachedMetadataAuthoritative() == true.
// * The caller has locked d.metadataMu.
func (d *dentry) touchCMtimeLocked() {
    now := d.fs.clock.Now().Nanoseconds()
    atomic.StoreInt64(&d.mtime, now)
    atomic.StoreInt64(&d.ctime, now)
    atomic.StoreUint32(&d.mtimeDirty, 1)
}
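// A standalone round-trip of dentryTimestampFromP9's conversion (the helper
// is duplicated here; the arithmetic is valid for dates before the year
// 2262, where the uint64 multiply would wrap).
package main

import (
    "fmt"
    "time"
)

// dentryTimestampFromP9 is copied from the helper above.
func dentryTimestampFromP9(s, ns uint64) int64 {
    return int64(s*1e9 + ns)
}

func main() {
    now := time.Now()
    nanos := dentryTimestampFromP9(uint64(now.Unix()), uint64(now.Nanosecond()))
    fmt.Println(time.Unix(0, nanos).Equal(now)) // true: the (s, ns) pair round-trips
}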
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package memdev

import (
    "sync/atomic"

    "gvisor.dev/gvisor/pkg/context"
    "gvisor.dev/gvisor/pkg/rand"
    "gvisor.dev/gvisor/pkg/safemem"
    "gvisor.dev/gvisor/pkg/sentry/vfs"
    "gvisor.dev/gvisor/pkg/usermem"
)

const (
    randomDevMinor  = 8
    urandomDevMinor = 9
)

// randomDevice implements vfs.Device for /dev/random and /dev/urandom.
//
// +stateify savable
type randomDevice struct{}

// Open implements vfs.Device.Open.
func (randomDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
    fd := &randomFD{}
    if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
        UseDentryMetadata: true,
    }); err != nil {
        return nil, err
    }
    return &fd.vfsfd, nil
}

// randomFD implements vfs.FileDescriptionImpl for /dev/random.
//
// +stateify savable
type randomFD struct {
    vfsfd vfs.FileDescription
    vfs.FileDescriptionDefaultImpl
    vfs.DentryMetadataFileDescriptionImpl
    vfs.NoLockFD

    // off is the "file offset". off is accessed using atomic memory
    // operations.
    off int64
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *randomFD) Release(context.Context) {
    // noop
}

// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *randomFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
    return dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader})
}

// Read implements vfs.FileDescriptionImpl.Read.
func (fd *randomFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
    n, err := dst.CopyOutFrom(ctx, safemem.FromIOReader{rand.Reader})
    atomic.AddInt64(&fd.off, n)
    return n, err
}

// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *randomFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
    // In Linux, this mixes the written bytes into the entropy pool; we just
    // throw them away.
    return src.NumBytes(), nil
}

// Write implements vfs.FileDescriptionImpl.Write.
func (fd *randomFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
    atomic.AddInt64(&fd.off, src.NumBytes())
    return src.NumBytes(), nil
}

// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *randomFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
    // Linux: drivers/char/random.c:random_fops.llseek == urandom_fops.llseek
    // == noop_llseek
    return atomic.LoadInt64(&fd.off), nil
}
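// The offset on this device is pure bookkeeping: reads return fresh entropy
// wherever the file position is. A standalone analogue using crypto/rand in
// place of the sentry's rand package (type and method names are invented):
package main

import (
    "crypto/rand"
    "fmt"
    "sync/atomic"
)

// randomFile mimics randomFD's offset bookkeeping: Read advances the offset,
// ReadAt ignores it, and both always return fresh bytes.
type randomFile struct {
    off int64 // accessed atomically, like randomFD.off
}

func (f *randomFile) Read(p []byte) (int, error) {
    n, err := rand.Read(p)
    atomic.AddInt64(&f.off, int64(n))
    return n, err
}

// ReadAt disregards the offset entirely, as randomFD.PRead does.
func (f *randomFile) ReadAt(p []byte, _ int64) (int, error) {
    return rand.Read(p)
}

func main() {
    var f randomFile
    buf := make([]byte, 8)
    f.Read(buf)
    fmt.Printf("%x off=%d\n", buf, atomic.LoadInt64(&f.off)) // off=8
    f.ReadAt(buf, 1<<30)                                     // position is irrelevant
    fmt.Printf("%x off=%d\n", buf, atomic.LoadInt64(&f.off)) // off unchanged
}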
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
    "gvisor.dev/gvisor/pkg/abi/linux"
    "gvisor.dev/gvisor/pkg/context"
    "gvisor.dev/gvisor/pkg/errors/linuxerr"
    "gvisor.dev/gvisor/pkg/sync"
    "gvisor.dev/gvisor/pkg/syserror"
    "gvisor.dev/gvisor/pkg/waiter"
)

// epollCycleMu serializes attempts to register EpollInstances with other
// EpollInstances in order to check for cycles.
var epollCycleMu sync.Mutex

// EpollInstance represents an epoll instance, as described by epoll(7).
//
// +stateify savable
type EpollInstance struct {
    vfsfd FileDescription
    FileDescriptionDefaultImpl
    DentryMetadataFileDescriptionImpl
    NoLockFD

    // q holds waiters on this EpollInstance.
    q waiter.Queue

    // interest is the set of file descriptors that are registered with the
    // EpollInstance for monitoring. interest is protected by interestMu.
    interestMu sync.Mutex `state:"nosave"`
    interest   map[epollInterestKey]*epollInterest

    // mu protects fields in registered epollInterests.
    mu sync.Mutex `state:"nosave"`

    // ready is the set of file descriptors that may be "ready" for I/O. Note
    // that this must be an ordered list, not a map: "If more than maxevents
    // file descriptors are ready when epoll_wait() is called, then successive
    // epoll_wait() calls will round robin through the set of ready file
    // descriptors.
    // This behavior helps avoid starvation scenarios, where a
    // process fails to notice that additional file descriptors are ready
    // because it focuses on a set of file descriptors that are already known
    // to be ready." - epoll_wait(2)
    ready epollInterestList
}

// +stateify savable
type epollInterestKey struct {
    // file is the registered FileDescription. No reference is held on file;
    // instead, when the last reference is dropped, FileDescription.DecRef()
    // removes the FileDescription from all EpollInstances. file is immutable.
    file *FileDescription

    // num is the file descriptor number with which this entry was registered.
    // num is immutable.
    num int32
}

// epollInterest represents an EpollInstance's interest in a file descriptor.
//
// +stateify savable
type epollInterest struct {
    // epoll is the owning EpollInstance. epoll is immutable.
    epoll *EpollInstance `state:"wait"`

    // key is the file to which this epollInterest applies. key is immutable.
    key epollInterestKey

    // waiter is registered with key.file. waiter is protected by epoll.mu.
    waiter waiter.Entry

    // mask is the event mask associated with this registration, including
    // flags EPOLLET and EPOLLONESHOT. mask is protected by epoll.mu.
    mask uint32

    // ready is true if epollInterestEntry is linked into epoll.ready. ready
    // and epollInterestEntry are protected by epoll.mu.
    ready bool
    epollInterestEntry

    // userData is the struct epoll_event::data associated with this
    // epollInterest. userData is protected by epoll.mu.
    userData [2]int32
}

// NewEpollInstanceFD returns a FileDescription representing a new epoll
// instance. A reference is taken on the returned FileDescription.
func (vfs *VirtualFilesystem) NewEpollInstanceFD(ctx context.Context) (*FileDescription, error) {
    vd := vfs.NewAnonVirtualDentry("[eventpoll]")
    defer vd.DecRef(ctx)
    ep := &EpollInstance{
        interest: make(map[epollInterestKey]*epollInterest),
    }
    if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
        DenyPRead:         true,
        DenyPWrite:        true,
        UseDentryMetadata: true,
    }); err != nil {
        return nil, err
    }
    return &ep.vfsfd, nil
}

// Release implements FileDescriptionImpl.Release.
func (ep *EpollInstance) Release(ctx context.Context) {
    // Unregister all polled fds.
    ep.interestMu.Lock()
    defer ep.interestMu.Unlock()
    for key, epi := range ep.interest {
        file := key.file
        file.epollMu.Lock()
        delete(file.epolls, epi)
        file.epollMu.Unlock()
        file.EventUnregister(&epi.waiter)
    }
    ep.interest = nil
}

// Readiness implements waiter.Waitable.Readiness.
func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask {
    if mask&waiter.ReadableEvents == 0 {
        return 0
    }
    ep.mu.Lock()
    for epi := ep.ready.Front(); epi != nil; epi = epi.Next() {
        wmask := waiter.EventMaskFromLinux(epi.mask)
        if epi.key.file.Readiness(wmask)&wmask != 0 {
            ep.mu.Unlock()
            return waiter.ReadableEvents
        }
    }
    ep.mu.Unlock()
    return 0
}

// EventRegister implements waiter.Waitable.EventRegister.
func (ep *EpollInstance) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
    ep.q.EventRegister(e, mask)
}

// EventUnregister implements waiter.Waitable.EventUnregister.
func (ep *EpollInstance) EventUnregister(e *waiter.Entry) {
    ep.q.EventUnregister(e)
}

// Seek implements FileDescriptionImpl.Seek.
func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
    // Linux: fs/eventpoll.c:eventpoll_fops.llseek == noop_llseek
    return 0, nil
}

// AddInterest implements the semantics of EPOLL_CTL_ADD.
//
// Preconditions: A reference must be held on file.
func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
    // Check for cyclic polling if necessary.
    subep, _ := file.impl.(*EpollInstance)
    if subep != nil {
        epollCycleMu.Lock()
        // epollCycleMu must be locked for the rest of AddInterest to ensure
        // that cyclic polling is not introduced after the check.
        defer epollCycleMu.Unlock()
        if subep.mightPoll(ep) {
            return linuxerr.ELOOP
        }
    }

    ep.interestMu.Lock()
    defer ep.interestMu.Unlock()

    // Fail if the key is already registered.
    key := epollInterestKey{
        file: file,
        num:  num,
    }
    if _, ok := ep.interest[key]; ok {
        return linuxerr.EEXIST
    }

    // Register interest in file.
    mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP
    epi := &epollInterest{
        epoll:    ep,
        key:      key,
        mask:     mask,
        userData: event.Data,
    }
    epi.waiter.Callback = epi
    ep.interest[key] = epi
    wmask := waiter.EventMaskFromLinux(mask)
    file.EventRegister(&epi.waiter, wmask)

    // Check if the file is already ready.
    if m := file.Readiness(wmask) & wmask; m != 0 {
        epi.Callback(nil, m)
    }

    // Add epi to file.epolls so that it is removed when the last
    // FileDescription reference is dropped.
    file.epollMu.Lock()
    if file.epolls == nil {
        file.epolls = make(map[*epollInterest]struct{})
    }
    file.epolls[epi] = struct{}{}
    file.epollMu.Unlock()

    return nil
}

func (ep *EpollInstance) mightPoll(ep2 *EpollInstance) bool {
    return ep.mightPollRecursive(ep2, 4) // Linux: fs/eventpoll.c:EP_MAX_NESTS
}

func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursion int) bool {
    ep.interestMu.Lock()
    defer ep.interestMu.Unlock()
    for key := range ep.interest {
        nextep, ok := key.file.impl.(*EpollInstance)
        if !ok {
            continue
        }
        if nextep == ep2 {
            return true
        }
        if remainingRecursion == 0 {
            return true
        }
        if nextep.mightPollRecursive(ep2, remainingRecursion-1) {
            return true
        }
    }
    return false
}

// ModifyInterest implements the semantics of EPOLL_CTL_MOD.
//
// Preconditions: A reference must be held on file.
func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error {
    ep.interestMu.Lock()
    defer ep.interestMu.Unlock()

    // Fail if the key is not already registered.
    epi, ok := ep.interest[epollInterestKey{
        file: file,
        num:  num,
    }]
    if !ok {
        return syserror.ENOENT
    }

    // Update epi for the next call to ep.ReadEvents().
    mask := event.Events | linux.EPOLLERR | linux.EPOLLHUP
    ep.mu.Lock()
    epi.mask = mask
    epi.userData = event.Data
    ep.mu.Unlock()

    // Re-register with the new mask.
    file.EventUnregister(&epi.waiter)
    wmask := waiter.EventMaskFromLinux(mask)
    file.EventRegister(&epi.waiter, wmask)

    // Check if the file is already ready with the new mask.
    if m := file.Readiness(wmask) & wmask; m != 0 {
        epi.Callback(nil, m)
    }

    return nil
}

// DeleteInterest implements the semantics of EPOLL_CTL_DEL.
//
// Preconditions: A reference must be held on file.
func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error {
    ep.interestMu.Lock()
    defer ep.interestMu.Unlock()

    // Fail if the key is not already registered.
    epi, ok := ep.interest[epollInterestKey{
        file: file,
        num:  num,
    }]
    if !ok {
        return syserror.ENOENT
    }

    // Unregister from the file so that epi will no longer be readied.
    file.EventUnregister(&epi.waiter)

    // Forget about epi.
    ep.removeLocked(epi)
    file.epollMu.Lock()
    delete(file.epolls, epi)
    file.epollMu.Unlock()

    return nil
}

// Callback implements waiter.EntryCallback.Callback.
func (epi *epollInterest) Callback(*waiter.Entry, waiter.EventMask) {
    newReady := false
    epi.epoll.mu.Lock()
    if !epi.ready {
        newReady = true
        epi.ready = true
        epi.epoll.ready.PushBack(epi)
    }
    epi.epoll.mu.Unlock()
    if newReady {
        epi.epoll.q.Notify(waiter.ReadableEvents)
    }
}

// Preconditions: ep.interestMu must be locked.
func (ep *EpollInstance) removeLocked(epi *epollInterest) {
    delete(ep.interest, epi.key)
    ep.mu.Lock()
    if epi.ready {
        epi.ready = false
        ep.ready.Remove(epi)
    }
    ep.mu.Unlock()
}

// ReadEvents appends up to maxEvents events to events and returns the updated
// slice of events.
func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent, maxEvents int) []linux.EpollEvent {
    i := 0
    // Hot path: avoid defer.
    ep.mu.Lock()
    var next *epollInterest
    var requeue epollInterestList
    for epi := ep.ready.Front(); epi != nil; epi = next {
        next = epi.Next()
        // Regardless of what else happens, epi is initially removed from the
        // ready list.
        ep.ready.Remove(epi)
        wmask := waiter.EventMaskFromLinux(epi.mask)
        ievents := epi.key.file.Readiness(wmask) & wmask
        if ievents == 0 {
            // Leave epi off the ready list.
            epi.ready = false
            continue
        }
        // Determine what we should do with epi.
        switch {
        case epi.mask&linux.EPOLLONESHOT != 0:
            // Clear all events from the mask; they must be re-added by
            // EPOLL_CTL_MOD.
            epi.mask &= linux.EP_PRIVATE_BITS
            fallthrough
        case epi.mask&linux.EPOLLET != 0:
            // Leave epi off the ready list.
            epi.ready = false
        default:
            // Queue epi to be moved to the end of the ready list.
            requeue.PushBack(epi)
        }
        // Report ievents.
        events = append(events, linux.EpollEvent{
            Events: ievents.ToLinux(),
            Data:   epi.userData,
        })
        i++
        if i == maxEvents {
            break
        }
    }
    ep.ready.PushBackList(&requeue)
    ep.mu.Unlock()
    return events
}
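// A toy model (invented names, slices instead of the intrusive list) of the
// requeue policy in ReadEvents above: reported level-triggered entries move
// to the back so successive epoll_wait(2) calls round robin, while
// EPOLLET/EPOLLONESHOT entries stay off the ready list until re-armed.
package main

import "fmt"

// interest is a toy stand-in for epollInterest.
type interest struct {
    name string
    et   bool // edge-triggered: dropped from the ready list once reported
}

// readEvents mirrors ReadEvents' policy: every reported level-triggered
// entry is requeued at the tail, behind entries that were not yet visited.
func readEvents(ready []*interest, max int) (events []string, newReady []*interest) {
    var requeue []*interest
    for i, epi := range ready {
        events = append(events, epi.name)
        if !epi.et {
            requeue = append(requeue, epi)
        }
        if len(events) == max {
            // Unvisited entries keep their place at the front.
            return events, append(append([]*interest{}, ready[i+1:]...), requeue...)
        }
    }
    return events, requeue
}

func main() {
    ready := []*interest{{"a", false}, {"b", true}, {"c", false}}
    ev, rest := readEvents(ready, 2)
    fmt.Println(ev) // [a b]
    for _, epi := range rest {
        fmt.Print(epi.name, " ") // c a: the next wait starts with c
    }
    fmt.Println()
}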
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2 import ( "fmt" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" "gvisor.dev/gvisor/pkg/hostarch" ) // fileCap is the maximum allowable files for poll & select. This has no // equivalent in Linux; it exists in gVisor since allocation failure in Go is // unrecoverable. const fileCap = 1024 * 1024 // Masks for "readable", "writable", and "exceptional" events as defined by // select(2). const ( // selectReadEvents is analogous to the Linux kernel's // fs/select.c:POLLIN_SET. selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR // selectWriteEvents is analogous to the Linux kernel's // fs/select.c:POLLOUT_SET. selectWriteEvents = linux.POLLOUT | linux.POLLERR // selectExceptEvents is analogous to the Linux kernel's // fs/select.c:POLLEX_SET. selectExceptEvents = linux.POLLPRI ) // pollState tracks the associated file description and waiter of a PollFD. type pollState struct { file *vfs.FileDescription waiter waiter.Entry } // initReadiness gets the current ready mask for the file represented by the FD // stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is // used to register with the file for event notifications, and a reference to // the file is stored in "state". func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) { if pfd.FD < 0 { pfd.REvents = 0 return } file := t.GetFileVFS2(pfd.FD) if file == nil { pfd.REvents = linux.POLLNVAL return } if ch == nil { defer file.DecRef(t) } else { state.file = file state.waiter, _ = waiter.NewChannelEntry(ch) file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events))) } r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events))) pfd.REvents = int16(r.ToLinux()) & pfd.Events } // releaseState releases all the pollState in "state". func releaseState(t *kernel.Task, state []pollState) { for i := range state { if state[i].file != nil { state[i].file.EventUnregister(&state[i].waiter) state[i].file.DecRef(t) } } } // pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout" // when "timeout" is greater than zero. // // pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or // positive if interrupted by a signal. func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) { var ch chan struct{} if timeout != 0 { ch = make(chan struct{}, 1) } // Register for event notification in the files involved if we may // block (timeout not zero). Once we find a file that has a non-zero // result, we stop registering for events but still go through all files // to get their ready masks. state := make([]pollState, len(pfd)) defer releaseState(t, state) n := uintptr(0) for i := range pfd { initReadiness(t, &pfd[i], &state[i], ch) if pfd[i].REvents != 0 { n++ ch = nil } } if timeout == 0 { return timeout, n, nil } haveTimeout := timeout >= 0 for n == 0 { var err error // Wait for a notification. timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout) if err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = nil } return timeout, 0, err } // We got notified, count how many files are ready. 
If none, // then this was a spurious notification, and we just go back // to sleep with the remaining timeout. for i := range state { if state[i].file == nil { continue } r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events))) rl := int16(r.ToLinux()) & pfd[i].Events if rl != 0 { pfd[i].REvents = rl n++ } } } return timeout, n, nil } // copyInPollFDs copies an array of struct pollfd unless nfds exceeds the max. func copyInPollFDs(t *kernel.Task, addr hostarch.Addr, nfds uint) ([]linux.PollFD, error) { if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { return nil, linuxerr.EINVAL } pfd := make([]linux.PollFD, nfds) if nfds > 0 { if _, err := linux.CopyPollFDSliceIn(t, addr, pfd); err != nil { return nil, err } } return pfd, nil } func doPoll(t *kernel.Task, addr hostarch.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) { pfd, err := copyInPollFDs(t, addr, nfds) if err != nil { return timeout, 0, err } // Compatibility warning: Linux adds POLLHUP and POLLERR just before // polling, in fs/select.c:do_pollfd(). Since pfd is copied out after // polling, changing event masks here is an application-visible difference. // (Linux also doesn't copy out event masks at all, only revents.) for i := range pfd { pfd[i].Events |= linux.POLLHUP | linux.POLLERR } remainingTimeout, n, err := pollBlock(t, pfd, timeout) err = syserror.ConvertIntr(err, syserror.EINTR) // The poll entries are copied out regardless of whether // any are set or not. This aligns with the Linux behavior. if nfds > 0 && err == nil { if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil { return remainingTimeout, 0, err } } return remainingTimeout, n, err } // CopyInFDSet copies an fd set from select(2)/pselect(2). func CopyInFDSet(t *kernel.Task, addr hostarch.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) { set := make([]byte, nBytes) if addr != 0 { if _, err := t.CopyInBytes(addr, set); err != nil { return nil, err } // If we only use part of the last byte, mask out the extraneous bits. // // N.B. This only works on little-endian architectures. if nBitsInLastPartialByte != 0 { set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte } } return set, nil } func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Addr, timeout time.Duration) (uintptr, error) { if nfds < 0 || nfds > fileCap { return 0, linuxerr.EINVAL } // Calculate the size of the fd sets (one bit per fd). nBytes := (nfds + 7) / 8 nBitsInLastPartialByte := nfds % 8 // Capture all the provided input vectors. r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte) if err != nil { return 0, err } w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte) if err != nil { return 0, err } e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte) if err != nil { return 0, err } // Count how many FDs are actually being requested so that we can build // a PollFD array. fdCount := 0 for i := 0; i < nBytes; i++ { v := r[i] | w[i] | e[i] for v != 0 { v &= (v - 1) fdCount++ } } // Build the PollFD array. pfd := make([]linux.PollFD, 0, fdCount) var fd int32 for i := 0; i < nBytes; i++ { rV, wV, eV := r[i], w[i], e[i] v := rV | wV | eV m := byte(1) for j := 0; j < 8; j++ { if (v & m) != 0 { // Make sure the fd is valid and decrement the reference // immediately to ensure we don't leak. Note, another thread // might be about to close fd. This is racy, but that's // OK. Linux is racy in the same way. 
file := t.GetFileVFS2(fd) if file == nil { return 0, linuxerr.EBADF } file.DecRef(t) var mask int16 if (rV & m) != 0 { mask |= selectReadEvents } if (wV & m) != 0 { mask |= selectWriteEvents } if (eV & m) != 0 { mask |= selectExceptEvents } pfd = append(pfd, linux.PollFD{ FD: fd, Events: mask, }) } fd++ m <<= 1 } } // Do the syscall, then count the number of bits set. if _, _, err = pollBlock(t, pfd, timeout); err != nil { return 0, syserror.ConvertIntr(err, syserror.EINTR) } // r, w, and e are currently event mask bitsets; unset bits corresponding // to events that *didn't* occur. bitSetCount := uintptr(0) for idx := range pfd { events := pfd[idx].REvents i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8) m := byte(1) << j if r[i]&m != 0 { if (events & selectReadEvents) != 0 { bitSetCount++ } else { r[i] &^= m } } if w[i]&m != 0 { if (events & selectWriteEvents) != 0 { bitSetCount++ } else { w[i] &^= m } } if e[i]&m != 0 { if (events & selectExceptEvents) != 0 { bitSetCount++ } else { e[i] &^= m } } } // Copy updated vectors back. if readFDs != 0 { if _, err := t.CopyOutBytes(readFDs, r); err != nil { return 0, err } } if writeFDs != 0 { if _, err := t.CopyOutBytes(writeFDs, w); err != nil { return 0, err } } if exceptFDs != 0 { if _, err := t.CopyOutBytes(exceptFDs, e); err != nil { return 0, err } } return bitSetCount, nil } // timeoutRemaining returns the amount of time remaining for the specified // timeout or 0 if it has elapsed. // // startNs must be from CLOCK_MONOTONIC. func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration { now := t.Kernel().MonotonicClock().Now() remaining := timeout - now.Sub(startNs) if remaining < 0 { remaining = 0 } return remaining } // copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr. // // startNs must be from CLOCK_MONOTONIC. func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr hostarch.Addr) error { if timeout <= 0 { return nil } remaining := timeoutRemaining(t, startNs, timeout) tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds()) _, err := tsRemaining.CopyOut(t, timespecAddr) return err } // copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr. // // startNs must be from CLOCK_MONOTONIC. func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr hostarch.Addr) error { if timeout <= 0 { return nil } remaining := timeoutRemaining(t, startNs, timeout) tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds()) _, err := tvRemaining.CopyOut(t, timevalAddr) return err } // pollRestartBlock encapsulates the state required to restart poll(2) via // restart_syscall(2). // // +stateify savable type pollRestartBlock struct { pfdAddr hostarch.Addr nfds uint timeout time.Duration } // Restart implements kernel.SyscallRestartBlock.Restart. func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) { return poll(t, p.pfdAddr, p.nfds, p.timeout) } func poll(t *kernel.Task, pfdAddr hostarch.Addr, nfds uint, timeout time.Duration) (uintptr, error) { remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout) // On an interrupt poll(2) is restarted with the remaining timeout. if linuxerr.Equals(linuxerr.EINTR, err) { t.SetSyscallRestartBlock(&pollRestartBlock{ pfdAddr: pfdAddr, nfds: nfds, timeout: remainingTimeout, }) return 0, syserror.ERESTART_RESTARTBLOCK } return n, err } // Poll implements linux syscall poll(2). 
func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pfdAddr := args[0].Pointer() nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. timeout := time.Duration(args[2].Int()) * time.Millisecond n, err := poll(t, pfdAddr, nfds, timeout) return n, nil, err } // Ppoll implements linux syscall ppoll(2). func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pfdAddr := args[0].Pointer() nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. timespecAddr := args[2].Pointer() maskAddr := args[3].Pointer() maskSize := uint(args[4].Uint()) timeout, err := copyTimespecInToDuration(t, timespecAddr) if err != nil { return 0, nil, err } var startNs ktime.Time if timeout > 0 { startNs = t.Kernel().MonotonicClock().Now() } if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { return 0, nil, err } _, n, err := doPoll(t, pfdAddr, nfds, timeout) copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) // doPoll returns EINTR if interrupted, but ppoll is normally restartable // if interrupted by something other than a signal handled by the // application (i.e. returns ERESTARTNOHAND). However, if // copyOutTimespecRemaining failed, then the restarted ppoll would use the // wrong timeout, so the error should be left as EINTR. // // Note that this means that if err is nil but copyErr is not, copyErr is // ignored. This is consistent with Linux. if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { err = syserror.ERESTARTNOHAND } return n, nil, err } // Select implements linux syscall select(2). func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nfds := int(args[0].Int()) // select(2) uses an int. readFDs := args[1].Pointer() writeFDs := args[2].Pointer() exceptFDs := args[3].Pointer() timevalAddr := args[4].Pointer() // Use a negative Duration to indicate "no timeout". timeout := time.Duration(-1) if timevalAddr != 0 { var timeval linux.Timeval if _, err := timeval.CopyIn(t, timevalAddr); err != nil { return 0, nil, err } if timeval.Sec < 0 || timeval.Usec < 0 { return 0, nil, linuxerr.EINVAL } timeout = time.Duration(timeval.ToNsecCapped()) } startNs := t.Kernel().MonotonicClock().Now() n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) // See comment in Ppoll. if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { err = syserror.ERESTARTNOHAND } return n, nil, err } // +marshal type sigSetWithSize struct { sigsetAddr uint64 sizeofSigset uint64 } // Pselect implements linux syscall pselect(2). func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { nfds := int(args[0].Int()) // select(2) uses an int. 
	readFDs := args[1].Pointer()
	writeFDs := args[2].Pointer()
	exceptFDs := args[3].Pointer()
	timespecAddr := args[4].Pointer()
	maskWithSizeAddr := args[5].Pointer()

	timeout, err := copyTimespecInToDuration(t, timespecAddr)
	if err != nil {
		return 0, nil, err
	}

	var startNs ktime.Time
	if timeout > 0 {
		startNs = t.Kernel().MonotonicClock().Now()
	}

	if maskWithSizeAddr != 0 {
		if t.Arch().Width() != 8 {
			panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width()))
		}
		var maskStruct sigSetWithSize
		if _, err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil {
			return 0, nil, err
		}
		if err := setTempSignalSet(t, hostarch.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil {
			return 0, nil, err
		}
	}

	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
	// See comment in Ppoll.
	if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil {
		err = syserror.ERESTARTNOHAND
	}
	return n, nil, err
}

// copyTimespecInToDuration copies a Timespec from the untrusted app range,
// validates it and converts it to a Duration.
//
// If the Timespec is larger than what can be represented in a Duration, the
// returned value is the maximum that Duration will allow.
//
// If timespecAddr is NULL, the returned value is negative.
func copyTimespecInToDuration(t *kernel.Task, timespecAddr hostarch.Addr) (time.Duration, error) {
	// Use a negative Duration to indicate "no timeout".
	timeout := time.Duration(-1)
	if timespecAddr != 0 {
		var timespec linux.Timespec
		if _, err := timespec.CopyIn(t, timespecAddr); err != nil {
			return 0, err
		}
		if !timespec.Valid() {
			return 0, linuxerr.EINVAL
		}
		timeout = time.Duration(timespec.ToNsecCapped())
	}
	return timeout, nil
}

func setTempSignalSet(t *kernel.Task, maskAddr hostarch.Addr, maskSize uint) error {
	if maskAddr == 0 {
		return nil
	}
	if maskSize != linux.SignalSetSize {
		return linuxerr.EINVAL
	}
	var mask linux.SignalSet
	if _, err := mask.CopyIn(t, maskAddr); err != nil {
		return err
	}
	mask &^= kernel.UnblockableSignals
	oldmask := t.SignalMask()
	t.SetSignalMask(mask)
	t.SetSavedSignalMask(oldmask)
	return nil
}
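// The fd_set handling in doSelect above depends on select(2)'s bitmap
// layout: fd n lives at byte n/8, bit n%8 of each vector. The standalone
// sketch below (illustrative only; fdIsSet and fdClear are hypothetical
// helpers, not gVisor APIs) demonstrates that mapping.

package main

import "fmt"

// fdIsSet reports whether fd is set in an fd_set-style byte vector.
func fdIsSet(b []byte, fd int) bool {
	return b[fd/8]&(byte(1)<<uint(fd%8)) != 0
}

// fdClear clears fd in the vector, mirroring how doSelect unsets bits for
// events that did not occur.
func fdClear(b []byte, fd int) {
	b[fd/8] &^= byte(1) << uint(fd%8)
}

func main() {
	b := make([]byte, 2) // room for fds 0-15
	b[1] = 0x01          // marks fd 8

	fmt.Println(fdIsSet(b, 8)) // true
	fdClear(b, 8)
	fmt.Println(fdIsSet(b, 8)) // false
}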
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package header

import (
	"encoding/binary"
	"fmt"
	"time"

	"gvisor.dev/gvisor/pkg/tcpip"
)

// IGMP represents an IGMP header stored in a byte array.
type IGMP []byte

// IGMP implements `Transport`.
var _ Transport = (*IGMP)(nil)

const (
	// IGMPMinimumSize is the minimum size of a valid IGMP packet in bytes,
	// as per RFC 2236, Section 2, Page 2.
	IGMPMinimumSize = 8

	// IGMPQueryMinimumSize is the minimum size of a valid Membership Query
	// Message in bytes, as per RFC 2236, Section 2, Page 2.
	IGMPQueryMinimumSize = 8

	// IGMPReportMinimumSize is the minimum size of a valid Report Message in
	// bytes, as per RFC 2236, Section 2, Page 2.
	IGMPReportMinimumSize = 8

	// IGMPLeaveMessageMinimumSize is the minimum size of a valid Leave Message
	// in bytes, as per RFC 2236, Section 2, Page 2.
	IGMPLeaveMessageMinimumSize = 8

	// IGMPTTL is the TTL for all IGMP messages, as per RFC 2236, Section 3,
	// Page 3.
	IGMPTTL = 1

	// igmpTypeOffset defines the offset of the type field in an IGMP message.
	igmpTypeOffset = 0

	// igmpMaxRespTimeOffset defines the offset of the MaxRespTime field in an
	// IGMP message.
	igmpMaxRespTimeOffset = 1

	// igmpChecksumOffset defines the offset of the checksum field in an IGMP
	// message.
	igmpChecksumOffset = 2

	// igmpGroupAddressOffset defines the offset of the Group Address field in an
	// IGMP message.
	igmpGroupAddressOffset = 4

	// IGMPProtocolNumber is IGMP's transport protocol number.
	IGMPProtocolNumber tcpip.TransportProtocolNumber = 2
)

// IGMPType is the IGMP type field as per RFC 2236.
type IGMPType byte

// Values for the IGMP Type described in RFC 2236 Section 2.1, Page 2.
// Descriptions below come from there.
const (
	// IGMPMembershipQuery indicates that the message type is Membership Query.
	// "There are two sub-types of Membership Query messages:
	//   - General Query, used to learn which groups have members on an
	//     attached network.
	//   - Group-Specific Query, used to learn if a particular group
	//     has any members on an attached network.
	// These two messages are differentiated by the Group Address, as
	// described in section 1.4 ."
	IGMPMembershipQuery IGMPType = 0x11

	// IGMPv1MembershipReport indicates that the message is a Membership Report
	// generated by a host using the IGMPv1 protocol: "an additional type of
	// message, for backwards-compatibility with IGMPv1"
	IGMPv1MembershipReport IGMPType = 0x12

	// IGMPv2MembershipReport indicates that the message type is a Membership
	// Report generated by a host using the IGMPv2 protocol.
	IGMPv2MembershipReport IGMPType = 0x16

	// IGMPLeaveGroup indicates that the message type is a Leave Group
	// notification message.
	IGMPLeaveGroup IGMPType = 0x17
)

// Type is the IGMP type field.
func (b IGMP) Type() IGMPType { return IGMPType(b[igmpTypeOffset]) }

// SetType sets the IGMP type field.
func (b IGMP) SetType(t IGMPType) { b[igmpTypeOffset] = byte(t) }

// MaxRespTime gets the Max Resp Time field. This is meaningful only in
// Membership Query messages; in other cases it is set to 0 by the sender and
// ignored by the receiver.
func (b IGMP) MaxRespTime() time.Duration {
	// As per RFC 2236 section 2.2,
	//
	//   The Max Response Time field is meaningful only in Membership Query
	//   messages, and specifies the maximum allowed time before sending a
	//   responding report in units of 1/10 second. In all other messages, it
	//   is set to zero by the sender and ignored by receivers.
	return DecisecondToDuration(b[igmpMaxRespTimeOffset])
}

// SetMaxRespTime sets the Max Resp Time field.
func (b IGMP) SetMaxRespTime(m byte) { b[igmpMaxRespTimeOffset] = m }

// Checksum is the IGMP checksum field.
func (b IGMP) Checksum() uint16 {
	return binary.BigEndian.Uint16(b[igmpChecksumOffset:])
}

// SetChecksum sets the IGMP checksum field.
func (b IGMP) SetChecksum(checksum uint16) {
	binary.BigEndian.PutUint16(b[igmpChecksumOffset:], checksum)
}

// GroupAddress gets the Group Address field.
func (b IGMP) GroupAddress() tcpip.Address {
	return tcpip.Address(b[igmpGroupAddressOffset:][:IPv4AddressSize])
}

// SetGroupAddress sets the Group Address field.
func (b IGMP) SetGroupAddress(address tcpip.Address) {
	if n := copy(b[igmpGroupAddressOffset:], address); n != IPv4AddressSize {
		panic(fmt.Sprintf("copied %d bytes, expected %d", n, IPv4AddressSize))
	}
}

// SourcePort implements Transport.SourcePort.
func (IGMP) SourcePort() uint16 {
	return 0
}

// DestinationPort implements Transport.DestinationPort.
func (IGMP) DestinationPort() uint16 {
	return 0
}

// SetSourcePort implements Transport.SetSourcePort.
func (IGMP) SetSourcePort(uint16) {
}

// SetDestinationPort implements Transport.SetDestinationPort.
func (IGMP) SetDestinationPort(uint16) {
}

// Payload implements Transport.Payload.
func (IGMP) Payload() []byte {
	return nil
}

// IGMPCalculateChecksum calculates the IGMP checksum over the provided IGMP
// header.
func IGMPCalculateChecksum(h IGMP) uint16 {
	// The header contains a checksum itself, set it aside to avoid checksumming
	// the checksum and replace it afterwards.
	existingXsum := h.Checksum()
	h.SetChecksum(0)
	xsum := ^Checksum(h, 0)
	h.SetChecksum(existingXsum)
	return xsum
}

// DecisecondToDuration converts a value representing deci-seconds to a
// time.Duration.
func DecisecondToDuration(ds uint8) time.Duration {
	return time.Duration(ds) * time.Second / 10
}
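// As a standalone illustration of the accessors above (not part of gVisor),
// the sketch below builds an IGMPv2 Membership Report by hand and fills in
// its checksum; internetChecksum is a local RFC 1071 stand-in for the header
// package's Checksum helper.

package main

import (
	"encoding/binary"
	"fmt"
)

// internetChecksum is a minimal ones-complement checksum over an even-length
// buffer.
func internetChecksum(b []byte) uint16 {
	var sum uint32
	for i := 0; i+1 < len(b); i += 2 {
		sum += uint32(binary.BigEndian.Uint16(b[i:]))
	}
	for sum>>16 != 0 {
		sum = sum&0xffff + sum>>16
	}
	return ^uint16(sum)
}

func main() {
	igmp := make([]byte, 8)               // IGMPMinimumSize
	igmp[0] = 0x16                        // IGMPv2MembershipReport
	igmp[1] = 0                           // Max Resp Time is 0 in reports.
	copy(igmp[4:], []byte{224, 0, 0, 22}) // Group Address field.

	// The checksum bytes are still zero here, so summing the whole header is
	// equivalent to IGMPCalculateChecksum's set-aside dance.
	binary.BigEndian.PutUint16(igmp[2:], internetChecksum(igmp))
	fmt.Printf("%x\n", igmp)
}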
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package sockfs provides a filesystem implementation for anonymous sockets.
package sockfs

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// filesystemType implements vfs.FilesystemType.
//
// +stateify savable
type filesystemType struct{}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
	panic("sockfs.filesystemType.GetFilesystem should never be called")
}

// Name implements vfs.FilesystemType.Name.
//
// Note that sockfs does not need to be registered as a filesystem type; the
// only consequence of not registering it is that it will not show up under
// /proc/filesystems, a very minor discrepancy from Linux.
func (filesystemType) Name() string {
	return "sockfs"
}

// Release implements vfs.FilesystemType.Release.
func (filesystemType) Release(ctx context.Context) {}

// +stateify savable
type filesystem struct {
	kernfs.Filesystem

	devMinor uint32
}

// NewFilesystem sets up and returns a new sockfs filesystem.
//
// Note that there should only ever be one instance of sockfs.Filesystem,
// backing a global socket mount.
func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
	devMinor, err := vfsObj.GetAnonBlockDevMinor()
	if err != nil {
		return nil, err
	}
	fs := &filesystem{
		devMinor: devMinor,
	}
	fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs)
	return fs.Filesystem.VFSFilesystem(), nil
}

// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
	fs.Filesystem.Release(ctx)
}

// PrependPath implements vfs.FilesystemImpl.PrependPath.
func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
	inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode)
	b.PrependComponent(fmt.Sprintf("socket:[%d]", inode.InodeAttrs.Ino()))
	return vfs.PrependPathSyntheticError{}
}

// MountOptions implements vfs.FilesystemImpl.MountOptions.
func (fs *filesystem) MountOptions() string {
	return ""
}

// inode implements kernfs.Inode.
//
// +stateify savable
type inode struct {
	kernfs.InodeAttrs
	kernfs.InodeNoopRefCount
	kernfs.InodeNotDirectory
	kernfs.InodeNotSymlink
}

// Open implements kernfs.Inode.Open.
func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	return nil, linuxerr.ENXIO
}

// StatFS implements kernfs.Inode.StatFS.
func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
	return vfs.GenericStatFS(linux.SOCKFS_MAGIC), nil
}

// NewDentry constructs and returns a sockfs dentry.
//
// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem().
func NewDentry(ctx context.Context, mnt *vfs.Mount) *vfs.Dentry {
	fs := mnt.Filesystem().Impl().(*filesystem)

	// File mode matches net/socket.c:sock_alloc.
	filemode := linux.FileMode(linux.S_IFSOCK | 0600)
	i := &inode{}
	i.InodeAttrs.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode)

	d := &kernfs.Dentry{}
	d.Init(&fs.Filesystem, i)
	return d.VFSDentry()
}
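// Because sockfs paths are synthetic, a socket fd's entry in /proc/<pid>/fd
// resolves to the name produced by PrependPath above rather than to a real
// path. A trivial standalone sketch of that name format (illustrative only):

package main

import "fmt"

func main() {
	var ino uint64 = 42              // hypothetical inode number
	fmt.Printf("socket:[%d]\n", ino) // socket:[42]
}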
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hostinet

import (
	"golang.org/x/sys/unix"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fdnotifier"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
	"gvisor.dev/gvisor/pkg/sentry/hostfd"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/socket"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// +stateify savable
type socketVFS2 struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD

	// We store metadata for hostinet sockets internally. Technically, we should
	// access metadata (e.g. through stat, chmod) on the host for correctness,
	// but this is not very useful for inet socket fds, which do not belong to a
	// concrete file anyway.
	vfs.DentryMetadataFileDescriptionImpl

	socketOpsCommon
}

var _ = socket.SocketVFS2(&socketVFS2{})

func newVFS2Socket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) {
	mnt := t.Kernel().SocketMount()
	d := sockfs.NewDentry(t, mnt)
	defer d.DecRef(t)

	s := &socketVFS2{
		socketOpsCommon: socketOpsCommon{
			family:   family,
			stype:    stype,
			protocol: protocol,
			fd:       fd,
		},
	}
	s.LockFD.Init(&vfs.FileLocks{})
	if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil {
		return nil, syserr.FromError(err)
	}
	vfsfd := &s.vfsfd
	if err := vfsfd.Init(s, linux.O_RDWR|(flags&linux.O_NONBLOCK), mnt, d, &vfs.FileDescriptionOptions{
		DenyPRead:         true,
		DenyPWrite:        true,
		UseDentryMetadata: true,
	}); err != nil {
		fdnotifier.RemoveFD(int32(s.fd))
		return nil, syserr.FromError(err)
	}
	return vfsfd, nil
}

// Release implements vfs.FileDescriptionImpl.Release.
func (s *socketVFS2) Release(ctx context.Context) {
	kernel.KernelFromContext(ctx).DeleteSocketVFS2(&s.vfsfd)
	s.socketOpsCommon.Release(ctx)
}

// Readiness implements waiter.Waitable.Readiness.
func (s *socketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
	return s.socketOpsCommon.Readiness(mask)
}

// EventRegister implements waiter.Waitable.EventRegister.
func (s *socketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
	s.socketOpsCommon.EventRegister(e, mask)
}

// EventUnregister implements waiter.Waitable.EventUnregister.
func (s *socketVFS2) EventUnregister(e *waiter.Entry) {
	s.socketOpsCommon.EventUnregister(e)
}

// Ioctl implements vfs.FileDescriptionImpl.
func (s *socketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	return ioctl(ctx, s.fd, uio, args)
}

// PRead implements vfs.FileDescriptionImpl.PRead.
func (s *socketVFS2) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	return 0, linuxerr.ESPIPE
}

// Read implements vfs.FileDescriptionImpl.
func (s *socketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	// All flags other than RWF_NOWAIT should be ignored.
	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
	if opts.Flags != 0 {
		return 0, linuxerr.EOPNOTSUPP
	}

	reader := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags)
	n, err := dst.CopyOutFrom(ctx, reader)
	hostfd.PutReadWriterAt(reader)
	return int64(n), err
}

// PWrite implements vfs.FileDescriptionImpl.
func (s *socketVFS2) PWrite(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	return 0, linuxerr.ESPIPE
}

// Write implements vfs.FileDescriptionImpl.
func (s *socketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	// All flags other than RWF_NOWAIT should be ignored.
	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
	if opts.Flags != 0 {
		return 0, linuxerr.EOPNOTSUPP
	}

	writer := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags)
	n, err := src.CopyInTo(ctx, writer)
	hostfd.PutReadWriterAt(writer)
	return int64(n), err
}

type socketProviderVFS2 struct {
	family int
}

// Socket implements socket.ProviderVFS2.Socket.
func (p *socketProviderVFS2) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
	// Check that we are using the host network stack.
	stack := t.NetworkContext()
	if stack == nil {
		return nil, nil
	}
	if _, ok := stack.(*Stack); !ok {
		return nil, nil
	}

	// Only accept TCP and UDP.
	stype := stypeflags & linux.SOCK_TYPE_MASK
	switch stype {
	case unix.SOCK_STREAM:
		switch protocol {
		case 0, unix.IPPROTO_TCP:
			// ok
		default:
			return nil, nil
		}
	case unix.SOCK_DGRAM:
		switch protocol {
		case 0, unix.IPPROTO_UDP:
			// ok
		default:
			return nil, nil
		}
	default:
		return nil, nil
	}

	// Conservatively ignore all flags specified by the application and add
	// SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0
	// to simplify the syscall filters, since 0 and IPPROTO_* are equivalent.
	fd, err := unix.Socket(p.family, int(stype)|unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC, 0)
	if err != nil {
		return nil, syserr.FromError(err)
	}
	return newVFS2Socket(t, p.family, stype, protocol, fd, uint32(stypeflags&unix.SOCK_NONBLOCK))
}

// Pair implements socket.Provider.Pair.
func (p *socketProviderVFS2) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) {
	// Not supported by AF_INET/AF_INET6.
	return nil, nil, nil
}
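// The Socket provider above extracts the socket type from the combined
// type-and-flags argument before matching on TCP/UDP. A standalone sketch of
// that masking (illustrative only; sockTypeMask stands in for
// linux.SOCK_TYPE_MASK, which is 0xf on Linux):

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// An application typically passes the type combined with flags.
	stypeflags := unix.SOCK_STREAM | unix.SOCK_NONBLOCK | unix.SOCK_CLOEXEC

	const sockTypeMask = 0xf
	stype := stypeflags & sockTypeMask

	fmt.Println(stype == unix.SOCK_STREAM)          // true
	fmt.Println(stypeflags&unix.SOCK_NONBLOCK != 0) // true
}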
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package header

import (
	"encoding/binary"

	"gvisor.dev/gvisor/pkg/tcpip"
)

// ICMPv4 represents an ICMPv4 header stored in a byte array.
type ICMPv4 []byte

const (
	// ICMPv4PayloadOffset defines the start of ICMP payload.
	ICMPv4PayloadOffset = 8

	// ICMPv4MinimumSize is the minimum size of a valid ICMP packet.
	ICMPv4MinimumSize = 8

	// ICMPv4MinimumErrorPayloadSize is the smallest number of bytes of an
	// errant packet's transport layer that an ICMP error type packet should
	// attempt to send as per RFC 792 (see each type) and RFC 1122
	// section 3.2.2 which states:
	//   Every ICMP error message includes the Internet header and at
	//   least the first 8 data octets of the datagram that triggered
	//   the error; more than 8 octets MAY be sent; this header and data
	//   MUST be unchanged from the received datagram.
	//
	// RFC 792 shows:
	//    0                   1                   2                   3
	//    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
	//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
	//   |     Type      |     Code      |           Checksum            |
	//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
	//   |                             unused                            |
	//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
	//   |      Internet Header + 64 bits of Original Data Datagram      |
	//   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
	ICMPv4MinimumErrorPayloadSize = 8

	// ICMPv4ProtocolNumber is the ICMP transport protocol number.
	ICMPv4ProtocolNumber tcpip.TransportProtocolNumber = 1

	// icmpv4ChecksumOffset is the offset of the checksum field
	// in an ICMPv4 message.
	icmpv4ChecksumOffset = 2

	// icmpv4MTUOffset is the offset of the MTU field
	// in an ICMPv4FragmentationNeeded message.
	icmpv4MTUOffset = 6

	// icmpv4IdentOffset is the offset of the ident field
	// in an ICMPv4EchoRequest/Reply message.
	icmpv4IdentOffset = 4

	// icmpv4PointerOffset is the offset of the pointer field
	// in an ICMPv4ParamProblem message.
	icmpv4PointerOffset = 4

	// icmpv4SequenceOffset is the offset of the sequence field
	// in an ICMPv4EchoRequest/Reply message.
	icmpv4SequenceOffset = 6
)

// ICMPv4Type is the ICMP type field described in RFC 792.
type ICMPv4Type byte

// ICMPv4Code is the ICMP code field described in RFC 792.
type ICMPv4Code byte

// Typical values of ICMPv4Type defined in RFC 792.
const (
	ICMPv4EchoReply      ICMPv4Type = 0
	ICMPv4DstUnreachable ICMPv4Type = 3
	ICMPv4SrcQuench      ICMPv4Type = 4
	ICMPv4Redirect       ICMPv4Type = 5
	ICMPv4Echo           ICMPv4Type = 8
	ICMPv4TimeExceeded   ICMPv4Type = 11
	ICMPv4ParamProblem   ICMPv4Type = 12
	ICMPv4Timestamp      ICMPv4Type = 13
	ICMPv4TimestampReply ICMPv4Type = 14
	ICMPv4InfoRequest    ICMPv4Type = 15
	ICMPv4InfoReply      ICMPv4Type = 16
)

// ICMP codes for ICMPv4 Time Exceeded messages as defined in RFC 792.
const (
	ICMPv4TTLExceeded       ICMPv4Code = 0
	ICMPv4ReassemblyTimeout ICMPv4Code = 1
)

// ICMP codes for ICMPv4 Destination Unreachable messages as defined in RFC 792.
const (
	ICMPv4NetUnreachable      ICMPv4Code = 0
	ICMPv4HostUnreachable     ICMPv4Code = 1
	ICMPv4ProtoUnreachable    ICMPv4Code = 2
	ICMPv4PortUnreachable     ICMPv4Code = 3
	ICMPv4FragmentationNeeded ICMPv4Code = 4
)

// ICMPv4UnusedCode is a code to use in ICMP messages where no code is needed.
const ICMPv4UnusedCode ICMPv4Code = 0

// Type is the ICMP type field.
func (b ICMPv4) Type() ICMPv4Type { return ICMPv4Type(b[0]) }

// SetType sets the ICMP type field.
func (b ICMPv4) SetType(t ICMPv4Type) { b[0] = byte(t) }

// Code is the ICMP code field. Its meaning depends on the value of Type.
func (b ICMPv4) Code() ICMPv4Code { return ICMPv4Code(b[1]) }

// SetCode sets the ICMP code field.
func (b ICMPv4) SetCode(c ICMPv4Code) { b[1] = byte(c) }

// Pointer returns the pointer field in a Parameter Problem packet.
func (b ICMPv4) Pointer() byte { return b[icmpv4PointerOffset] }

// SetPointer sets the pointer field in a Parameter Problem packet.
func (b ICMPv4) SetPointer(c byte) { b[icmpv4PointerOffset] = c }

// Checksum is the ICMP checksum field.
func (b ICMPv4) Checksum() uint16 {
	return binary.BigEndian.Uint16(b[icmpv4ChecksumOffset:])
}

// SetChecksum sets the ICMP checksum field.
func (b ICMPv4) SetChecksum(checksum uint16) {
	binary.BigEndian.PutUint16(b[icmpv4ChecksumOffset:], checksum)
}

// SourcePort implements Transport.SourcePort.
func (ICMPv4) SourcePort() uint16 {
	return 0
}

// DestinationPort implements Transport.DestinationPort.
func (ICMPv4) DestinationPort() uint16 {
	return 0
}

// SetSourcePort implements Transport.SetSourcePort.
func (ICMPv4) SetSourcePort(uint16) {
}

// SetDestinationPort implements Transport.SetDestinationPort.
func (ICMPv4) SetDestinationPort(uint16) {
}

// Payload implements Transport.Payload.
func (b ICMPv4) Payload() []byte {
	return b[ICMPv4PayloadOffset:]
}

// MTU retrieves the MTU field from an ICMPv4 message.
func (b ICMPv4) MTU() uint16 {
	return binary.BigEndian.Uint16(b[icmpv4MTUOffset:])
}

// SetMTU sets the MTU field in an ICMPv4 message.
func (b ICMPv4) SetMTU(mtu uint16) {
	binary.BigEndian.PutUint16(b[icmpv4MTUOffset:], mtu)
}

// Ident retrieves the Ident field from an ICMPv4 message.
func (b ICMPv4) Ident() uint16 {
	return binary.BigEndian.Uint16(b[icmpv4IdentOffset:])
}

// SetIdent sets the Ident field in an ICMPv4 message.
func (b ICMPv4) SetIdent(ident uint16) {
	binary.BigEndian.PutUint16(b[icmpv4IdentOffset:], ident)
}

// Sequence retrieves the Sequence field from an ICMPv4 message.
func (b ICMPv4) Sequence() uint16 {
	return binary.BigEndian.Uint16(b[icmpv4SequenceOffset:])
}

// SetSequence sets the Sequence field in an ICMPv4 message.
func (b ICMPv4) SetSequence(sequence uint16) {
	binary.BigEndian.PutUint16(b[icmpv4SequenceOffset:], sequence)
}

// ICMPv4Checksum calculates the ICMP checksum over the provided ICMP header,
// and payload.
func ICMPv4Checksum(h ICMPv4, payloadCsum uint16) uint16 {
	xsum := payloadCsum

	// h[2:4] is the checksum itself, skip it to avoid checksumming the checksum.
	xsum = Checksum(h[:2], xsum)
	xsum = Checksum(h[4:], xsum)

	return ^xsum
}
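// A standalone sketch (not part of gVisor) of consuming the offsets above:
// given an ICMPv4 "fragmentation needed" error, the type is byte 0, the code
// is byte 1, and the next-hop MTU sits at icmpv4MTUOffset (byte 6).

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// ICMPv4DstUnreachable (3) with ICMPv4FragmentationNeeded (4), MTU 1500.
	msg := []byte{3, 4, 0, 0, 0, 0, 0x05, 0xdc}
	if msg[0] == 3 && msg[1] == 4 {
		fmt.Println("next-hop MTU:", binary.BigEndian.Uint16(msg[6:])) // 1500
	}
}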
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package netstack

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/socket"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// SocketVFS2 encapsulates all the state needed to represent a network stack
// endpoint in the kernel context.
//
// +stateify savable
type SocketVFS2 struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.LockFD

	socketOpsCommon
}

var _ = socket.SocketVFS2(&SocketVFS2{})

// NewVFS2 creates a new endpoint socket.
func NewVFS2(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*vfs.FileDescription, *syserr.Error) {
	if skType == linux.SOCK_STREAM {
		endpoint.SocketOptions().SetDelayOption(true)
	}

	mnt := t.Kernel().SocketMount()
	d := sockfs.NewDentry(t, mnt)
	defer d.DecRef(t)

	s := &SocketVFS2{
		socketOpsCommon: socketOpsCommon{
			Queue:    queue,
			family:   family,
			Endpoint: endpoint,
			skType:   skType,
			protocol: protocol,
		},
	}
	s.LockFD.Init(&vfs.FileLocks{})
	vfsfd := &s.vfsfd
	if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
		DenyPRead:         true,
		DenyPWrite:        true,
		UseDentryMetadata: true,
	}); err != nil {
		return nil, syserr.FromError(err)
	}
	return vfsfd, nil
}

// Release implements vfs.FileDescriptionImpl.Release.
func (s *SocketVFS2) Release(ctx context.Context) {
	kernel.KernelFromContext(ctx).DeleteSocketVFS2(&s.vfsfd)
	s.socketOpsCommon.Release(ctx)
}

// Readiness implements waiter.Waitable.Readiness.
func (s *SocketVFS2) Readiness(mask waiter.EventMask) waiter.EventMask {
	return s.socketOpsCommon.Readiness(mask)
}

// EventRegister implements waiter.Waitable.EventRegister.
func (s *SocketVFS2) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
	s.socketOpsCommon.EventRegister(e, mask)
}

// EventUnregister implements waiter.Waitable.EventUnregister.
func (s *SocketVFS2) EventUnregister(e *waiter.Entry) {
	s.socketOpsCommon.EventUnregister(e)
}

// Read implements vfs.FileDescriptionImpl.
func (s *SocketVFS2) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	// All flags other than RWF_NOWAIT should be ignored.
	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
	if opts.Flags != 0 {
		return 0, linuxerr.EOPNOTSUPP
	}

	if dst.NumBytes() == 0 {
		return 0, nil
	}
	n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false)
	if err == syserr.ErrWouldBlock {
		return int64(n), syserror.ErrWouldBlock
	}
	if err != nil {
		return 0, err.ToError()
	}
	return int64(n), nil
}

// Write implements vfs.FileDescriptionImpl.
func (s *SocketVFS2) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	// All flags other than RWF_NOWAIT should be ignored.
	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
	if opts.Flags != 0 {
		return 0, linuxerr.EOPNOTSUPP
	}

	r := src.Reader(ctx)
	n, err := s.Endpoint.Write(r, tcpip.WriteOptions{})
	if _, ok := err.(*tcpip.ErrWouldBlock); ok {
		return 0, syserror.ErrWouldBlock
	}
	if err != nil {
		return 0, syserr.TranslateNetstackError(err).ToError()
	}

	if n < src.NumBytes() {
		return n, syserror.ErrWouldBlock
	}

	return n, nil
}

// Accept implements the linux syscall accept(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketVFS2) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
	// Issue the accept request to get the new endpoint.
	var peerAddr *tcpip.FullAddress
	if peerRequested {
		peerAddr = &tcpip.FullAddress{}
	}
	ep, wq, terr := s.Endpoint.Accept(peerAddr)
	if terr != nil {
		if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking {
			return 0, nil, 0, syserr.TranslateNetstackError(terr)
		}

		var err *syserr.Error
		ep, wq, err = s.blockingAccept(t, peerAddr)
		if err != nil {
			return 0, nil, 0, err
		}
	}

	ns, err := NewVFS2(t, s.family, s.skType, s.protocol, wq, ep)
	if err != nil {
		return 0, nil, 0, err
	}
	defer ns.DecRef(t)

	if err := ns.SetStatusFlags(t, t.Credentials(), uint32(flags&linux.SOCK_NONBLOCK)); err != nil {
		return 0, nil, 0, syserr.FromError(err)
	}

	var addr linux.SockAddr
	var addrLen uint32
	if peerAddr != nil {
		// Get address of the peer and write it to peer slice.
		addr, addrLen = socket.ConvertAddress(s.family, *peerAddr)
	}

	fd, e := t.NewFDFromVFS2(0, ns, kernel.FDFlags{
		CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
	})

	t.Kernel().RecordSocketVFS2(ns)

	return fd, addr, addrLen, syserr.FromError(e)
}

// Ioctl implements vfs.FileDescriptionImpl.
func (s *SocketVFS2) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	return s.socketOpsCommon.ioctl(ctx, uio, args)
}

// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketVFS2) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
	// implemented specifically for netstack.SocketVFS2 rather than
	// commonEndpoint. commonEndpoint should be extended to support socket
	// options where the implementation is not shared, as unix sockets need
	// their own support for SO_TIMESTAMP.
	if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		val := primitive.Int32(0)
		s.readMu.Lock()
		defer s.readMu.Unlock()
		if s.sockOptTimestamp {
			val = 1
		}
		return &val, nil
	}
	if level == linux.SOL_TCP && name == linux.TCP_INQ {
		if outLen < sizeOfInt32 {
			return nil, syserr.ErrInvalidArgument
		}

		val := primitive.Int32(0)
		s.readMu.Lock()
		defer s.readMu.Unlock()
		if s.sockOptInq {
			val = 1
		}
		return &val, nil
	}

	return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen)
}

// SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
// tcpip.Endpoint.
func (s *SocketVFS2) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error {
	// TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is
	// implemented specifically for netstack.SocketVFS2 rather than
	// commonEndpoint. commonEndpoint should be extended to support socket
	// options where the implementation is not shared, as unix sockets need
	// their own support for SO_TIMESTAMP.
	if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP {
		if len(optVal) < sizeOfInt32 {
			return syserr.ErrInvalidArgument
		}
		s.readMu.Lock()
		defer s.readMu.Unlock()
		s.sockOptTimestamp = hostarch.ByteOrder.Uint32(optVal) != 0
		return nil
	}
	if level == linux.SOL_TCP && name == linux.TCP_INQ {
		if len(optVal) < sizeOfInt32 {
			return syserr.ErrInvalidArgument
		}
		s.readMu.Lock()
		defer s.readMu.Unlock()
		s.sockOptInq = hostarch.ByteOrder.Uint32(optVal) != 0
		return nil
	}

	return SetSockOpt(t, s, s.Endpoint, level, name, optVal)
}
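// Both option paths above decode a boolean int32 from its raw bytes in
// native byte order, as hostarch.ByteOrder does. A standalone sketch of the
// wire format a setsockopt caller and this code agree on (illustrative only;
// binary.LittleEndian assumes a little-endian host):

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	optVal := make([]byte, 4)                // sizeOfInt32 bytes
	binary.LittleEndian.PutUint32(optVal, 1) // nonzero enables the option

	enabled := binary.LittleEndian.Uint32(optVal) != 0
	fmt.Println(enabled) // true
}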
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"gvisor.dev/gvisor/pkg/context"
)

// contextID is the kernel package's type for context.Context.Value keys.
type contextID int

const (
	// CtxCanTrace is a Context.Value key for a function with the same
	// signature and semantics as kernel.Task.CanTrace.
	CtxCanTrace contextID = iota

	// CtxKernel is a Context.Value key for a Kernel.
	CtxKernel

	// CtxPIDNamespace is a Context.Value key for a PIDNamespace.
	CtxPIDNamespace

	// CtxTask is a Context.Value key for a Task.
	CtxTask

	// CtxUTSNamespace is a Context.Value key for a UTSNamespace.
	CtxUTSNamespace

	// CtxIPCNamespace is a Context.Value key for an IPCNamespace.
	CtxIPCNamespace
)

// ContextCanTrace returns true if ctx is permitted to trace t, in the same
// sense as kernel.Task.CanTrace.
func ContextCanTrace(ctx context.Context, t *Task, attach bool) bool {
	if v := ctx.Value(CtxCanTrace); v != nil {
		return v.(func(*Task, bool) bool)(t, attach)
	}
	return false
}

// KernelFromContext returns the Kernel in which ctx is executing, or nil if
// there is no such Kernel.
func KernelFromContext(ctx context.Context) *Kernel {
	if v := ctx.Value(CtxKernel); v != nil {
		return v.(*Kernel)
	}
	return nil
}

// PIDNamespaceFromContext returns the PID namespace in which ctx is executing,
// or nil if there is no such PID namespace.
func PIDNamespaceFromContext(ctx context.Context) *PIDNamespace {
	if v := ctx.Value(CtxPIDNamespace); v != nil {
		return v.(*PIDNamespace)
	}
	return nil
}

// UTSNamespaceFromContext returns the UTS namespace in which ctx is executing,
// or nil if there is no such UTS namespace.
func UTSNamespaceFromContext(ctx context.Context) *UTSNamespace {
	if v := ctx.Value(CtxUTSNamespace); v != nil {
		return v.(*UTSNamespace)
	}
	return nil
}

// IPCNamespaceFromContext returns the IPC namespace in which ctx is executing,
// or nil if there is no such IPC namespace. It takes a reference on the
// namespace.
func IPCNamespaceFromContext(ctx context.Context) *IPCNamespace {
	if v := ctx.Value(CtxIPCNamespace); v != nil {
		return v.(*IPCNamespace)
	}
	return nil
}

// TaskFromContext returns the Task associated with ctx, or nil if there is no
// such Task.
func TaskFromContext(ctx context.Context) *Task {
	if v := ctx.Value(CtxTask); v != nil {
		return v.(*Task)
	}
	return nil
}
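// The unexported contextID type above is the standard Go defense against
// context key collisions: no other package can construct an identical key. A
// standalone sketch of the pattern (illustrative only; all names here are
// hypothetical):

package main

import (
	"context"
	"fmt"
)

type ctxKey int

const ctxName ctxKey = iota

// nameFromContext mirrors the *FromContext helpers above: return the value
// when present, a zero value otherwise.
func nameFromContext(ctx context.Context) string {
	if v := ctx.Value(ctxName); v != nil {
		return v.(string)
	}
	return ""
}

func main() {
	ctx := context.WithValue(context.Background(), ctxName, "gvisor")
	fmt.Println(nameFromContext(ctx))                        // gvisor
	fmt.Println(nameFromContext(context.Background()) == "") // true
}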
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

// renoRecovery stores the variables related to the TCP Reno loss recovery
// algorithm.
//
// +stateify savable
type renoRecovery struct {
	s *sender
}

func newRenoRecovery(s *sender) *renoRecovery {
	return &renoRecovery{s: s}
}

func (rr *renoRecovery) DoRecovery(rcvdSeg *segment, fastRetransmit bool) {
	ack := rcvdSeg.ackNumber
	snd := rr.s

	// We are in fast recovery mode. Ignore the ack if it's out of range.
	if !ack.InRange(snd.SndUna, snd.SndNxt+1) {
		return
	}

	// Don't count this as a duplicate if it is carrying data or
	// updating the window.
	if rcvdSeg.logicalLen() != 0 || snd.SndWnd != rcvdSeg.window {
		return
	}

	// Inflate the congestion window if we're getting duplicate acks
	// for the packet we retransmitted.
	if !fastRetransmit && ack == snd.FastRecovery.First {
		// We received a dup, inflate the congestion window by 1 packet
		// if we're not at the max yet. Only inflate the window if
		// regular FastRecovery is in use, RFC6675 does not require
		// inflating cwnd on duplicate ACKs.
		if snd.SndCwnd < snd.FastRecovery.MaxCwnd {
			snd.SndCwnd++
		}
		return
	}

	// A partial ack was received. Retransmit this packet and remember it
	// so that we don't retransmit it again.
	//
	// We don't inflate the window because we're putting the same packet
	// back onto the wire.
	//
	// N.B. The retransmit timer will be reset by the caller.
	snd.FastRecovery.First = ack
	snd.DupAckCount = 0
	snd.resendSegment()
}
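// A standalone sketch (illustrative only, with hypothetical values) of the
// window inflation above: each duplicate ACK for the retransmitted segment
// grows cwnd by one packet until FastRecovery.MaxCwnd caps it.

package main

import "fmt"

func main() {
	cwnd, maxCwnd := 10, 12
	for i := 0; i < 5; i++ { // five duplicate ACKs arrive
		if cwnd < maxCwnd {
			cwnd++
		}
	}
	fmt.Println(cwnd) // 12: capped at maxCwnd rather than 15
}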
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package netfilter

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// emptyIPv4Filter is for comparison with a rule's filters to determine whether
// it is also empty. It is immutable.
var emptyIPv4Filter = stack.IPHeaderFilter{
	Dst:     "\x00\x00\x00\x00",
	DstMask: "\x00\x00\x00\x00",
	Src:     "\x00\x00\x00\x00",
	SrcMask: "\x00\x00\x00\x00",
}

// convertNetstackToBinary4 converts the iptables as stored in netstack to the
// format expected by the iptables tool. Linux stores each table as a binary
// blob that can only be traversed by parsing a little data, reading some
// offsets, jumping to those offsets, parsing again, etc.
func convertNetstackToBinary4(stk *stack.Stack, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo, error) {
	// The table name has to fit in the struct.
	if linux.XT_TABLE_MAXNAMELEN < len(tablename) {
		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename)
	}

	id, ok := nameToID[tablename.String()]
	if !ok {
		return linux.KernelIPTGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename)
	}

	// Set up the info struct.
	entries, info := getEntries4(stk.IPTables().GetTable(id, false), tablename)
	return entries, info, nil
}

func getEntries4(table stack.Table, tablename linux.TableName) (linux.KernelIPTGetEntries, linux.IPTGetinfo) {
	var info linux.IPTGetinfo
	var entries linux.KernelIPTGetEntries
	copy(info.Name[:], tablename[:])
	copy(entries.Name[:], info.Name[:])
	info.ValidHooks = table.ValidHooks()

	for ruleIdx, rule := range table.Rules {
		nflog("convert to binary: current offset: %d", entries.Size)

		setHooksAndUnderflow(&info, table, entries.Size, ruleIdx)
		// Each rule corresponds to an entry.
		entry := linux.KernelIPTEntry{
			Entry: linux.IPTEntry{
				IP: linux.IPTIP{
					Protocol: uint16(rule.Filter.Protocol),
				},
				NextOffset:   linux.SizeOfIPTEntry,
				TargetOffset: linux.SizeOfIPTEntry,
			},
		}
		copy(entry.Entry.IP.Dst[:], rule.Filter.Dst)
		copy(entry.Entry.IP.DstMask[:], rule.Filter.DstMask)
		copy(entry.Entry.IP.Src[:], rule.Filter.Src)
		copy(entry.Entry.IP.SrcMask[:], rule.Filter.SrcMask)
		copy(entry.Entry.IP.OutputInterface[:], rule.Filter.OutputInterface)
		copy(entry.Entry.IP.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask)
		copy(entry.Entry.IP.InputInterface[:], rule.Filter.InputInterface)
		copy(entry.Entry.IP.InputInterfaceMask[:], rule.Filter.InputInterfaceMask)
		if rule.Filter.DstInvert {
			entry.Entry.IP.InverseFlags |= linux.IPT_INV_DSTIP
		}
		if rule.Filter.SrcInvert {
			entry.Entry.IP.InverseFlags |= linux.IPT_INV_SRCIP
		}
		if rule.Filter.OutputInterfaceInvert {
			entry.Entry.IP.InverseFlags |= linux.IPT_INV_VIA_OUT
		}

		for _, matcher := range rule.Matchers {
			// Serialize the matcher and add it to the entry.
			serialized := marshalMatcher(matcher)
			nflog("convert to binary: matcher serialized as: %v", serialized)
			if len(serialized)%8 != 0 {
				panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher))
			}
			entry.Elems = append(entry.Elems, serialized...)
			entry.Entry.NextOffset += uint16(len(serialized))
			entry.Entry.TargetOffset += uint16(len(serialized))
		}

		// Serialize and append the target.
		serialized := marshalTarget(rule.Target)
		if len(serialized)%8 != 0 {
			panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target))
		}
		entry.Elems = append(entry.Elems, serialized...)
		entry.Entry.NextOffset += uint16(len(serialized))

		nflog("convert to binary: adding entry: %+v", entry)

		entries.Size += uint32(entry.Entry.NextOffset)
		entries.Entrytable = append(entries.Entrytable, entry)
		info.NumEntries++
	}

	info.Size = entries.Size
	nflog("convert to binary: finished with a marshalled size of %d", info.Size)
	return entries, info
}

func modifyEntries4(task *kernel.Task, stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) {
	nflog("set entries: setting entries in table %q", replace.Name.String())

	// Convert input into a list of rules and their offsets.
	var offset uint32
	// offsets maps rule byte offsets to their position in table.Rules.
	offsets := map[uint32]int{}
	for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ {
		nflog("set entries: processing entry at offset %d", offset)

		// Get the struct ipt_entry.
		if len(optVal) < linux.SizeOfIPTEntry {
			nflog("optVal has insufficient size for entry %d", len(optVal))
			return nil, syserr.ErrInvalidArgument
		}
		var entry linux.IPTEntry
		entry.UnmarshalUnsafe(optVal[:entry.SizeBytes()])
		initialOptValLen := len(optVal)
		optVal = optVal[entry.SizeBytes():]

		if entry.TargetOffset < linux.SizeOfIPTEntry {
			nflog("entry has too-small target offset %d", entry.TargetOffset)
			return nil, syserr.ErrInvalidArgument
		}

		filter, err := filterFromIPTIP(entry.IP)
		if err != nil {
			nflog("bad iptip: %v", err)
			return nil, syserr.ErrInvalidArgument
		}

		// Get matchers.
		matchersSize := entry.TargetOffset - linux.SizeOfIPTEntry
		if len(optVal) < int(matchersSize) {
			nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal))
			return nil, syserr.ErrInvalidArgument
		}
		matchers, err := parseMatchers(task, filter, optVal[:matchersSize])
		if err != nil {
			nflog("failed to parse matchers: %v", err)
			return nil, syserr.ErrInvalidArgument
		}
		optVal = optVal[matchersSize:]

		// Get the target of the rule.
		targetSize := entry.NextOffset - entry.TargetOffset
		if len(optVal) < int(targetSize) {
			nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal))
			return nil, syserr.ErrInvalidArgument
		}

		rule := stack.Rule{
			Filter:   filter,
			Matchers: matchers,
		}

		{
			target, err := parseTarget(filter, optVal[:targetSize], false /* ipv6 */)
			if err != nil {
				nflog("failed to parse target: %v", err)
				return nil, err
			}
			rule.Target = target
		}
		optVal = optVal[targetSize:]

		table.Rules = append(table.Rules, rule)
		offsets[offset] = int(entryIdx)
		offset += uint32(entry.NextOffset)

		if initialOptValLen-len(optVal) != int(entry.NextOffset) {
			nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal))
			return nil, syserr.ErrInvalidArgument
		}
	}
	return offsets, nil
}

func filterFromIPTIP(iptip linux.IPTIP) (stack.IPHeaderFilter, error) {
	if containsUnsupportedFields4(iptip) {
		return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip)
	}
	if len(iptip.Dst) != header.IPv4AddressSize || len(iptip.DstMask) != header.IPv4AddressSize {
		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask))
	}
	if len(iptip.Src) != header.IPv4AddressSize || len(iptip.SrcMask) != header.IPv4AddressSize {
		return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask))
	}

	return stack.IPHeaderFilter{
		Protocol: tcpip.TransportProtocolNumber(iptip.Protocol),
		// A Protocol value of 0 indicates all protocols match.
		CheckProtocol:         iptip.Protocol != 0,
		Dst:                   tcpip.Address(iptip.Dst[:]),
		DstMask:               tcpip.Address(iptip.DstMask[:]),
		DstInvert:             iptip.InverseFlags&linux.IPT_INV_DSTIP != 0,
		Src:                   tcpip.Address(iptip.Src[:]),
		SrcMask:               tcpip.Address(iptip.SrcMask[:]),
		SrcInvert:             iptip.InverseFlags&linux.IPT_INV_SRCIP != 0,
		InputInterface:        string(trimNullBytes(iptip.InputInterface[:])),
		InputInterfaceMask:    string(trimNullBytes(iptip.InputInterfaceMask[:])),
		InputInterfaceInvert:  iptip.InverseFlags&linux.IPT_INV_VIA_IN != 0,
		OutputInterface:       string(trimNullBytes(iptip.OutputInterface[:])),
		OutputInterfaceMask:   string(trimNullBytes(iptip.OutputInterfaceMask[:])),
		OutputInterfaceInvert: iptip.InverseFlags&linux.IPT_INV_VIA_OUT != 0,
	}, nil
}

func containsUnsupportedFields4(iptip linux.IPTIP) bool {
	// The following features are supported:
	// - Protocol
	// - Dst and DstMask
	// - Src and SrcMask
	// - The inverse destination IP check flag
	// - InputInterface, InputInterfaceMask and its inverse.
	// - OutputInterface, OutputInterfaceMask and its inverse.
	const flagMask = 0
	// Disable any supported inverse flags.
	const inverseMask = linux.IPT_INV_DSTIP | linux.IPT_INV_SRCIP |
		linux.IPT_INV_VIA_IN | linux.IPT_INV_VIA_OUT
	return iptip.Flags&^flagMask != 0 || iptip.InverseFlags&^inverseMask != 0
}
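// modifyEntries4 above walks a binary blob in which every entry declares its
// own size via NextOffset. A standalone sketch of that traversal
// (illustrative only; the layout is simplified to a 2-byte big-endian length
// prefix, and the size validation done by the real code is elided):

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// Two fake entries of 6 and 4 bytes, each beginning with its NextOffset.
	blob := []byte{
		0x00, 0x06, 'a', 'b', 'c', 'd',
		0x00, 0x04, 'e', 'f',
	}

	// offsets maps entry byte offsets to their index, as in modifyEntries4.
	offsets := map[uint32]int{}
	var offset uint32
	for idx := 0; len(blob) > 0; idx++ {
		next := binary.BigEndian.Uint16(blob)
		offsets[offset] = idx
		offset += uint32(next)
		blob = blob[next:]
	}
	fmt.Println(offsets) // map[0:0 6:1]
}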
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package proc

import (
	"bytes"
	"fmt"
	"io"
	"reflect"
	"time"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/socket"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/tcpip/header"
)

func (fs *filesystem) newTaskNetDir(ctx context.Context, task *kernel.Task) kernfs.Inode {
	k := task.Kernel()
	pidns := task.PIDNamespace()
	root := auth.NewRootCredentials(pidns.UserNamespace())

	var contents map[string]kernfs.Inode
	if stack := task.NetworkNamespace().Stack(); stack != nil {
		const (
			arp       = "IP address HW type Flags HW address Mask Device\n"
			netlink   = "sk Eth Pid Groups Rmem Wmem Dump Locks Drops Inode\n"
			packet    = "sk RefCnt Type Proto Iface R Rmem User Inode\n"
			protocols = "protocol size sockets memory press maxhdr slab module cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"
			ptype     = "Type Device Function\n"
			udp6      = " sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n"
		)
		psched := fmt.Sprintf("%08x %08x %08x %08x\n",
			uint64(time.Microsecond/time.Nanosecond),
			64,
			1000000,
			uint64(time.Second/time.Nanosecond))

		// TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task
		// network namespace.
		contents = map[string]kernfs.Inode{
			"dev":  fs.newInode(ctx, root, 0444, &netDevData{stack: stack}),
			"snmp": fs.newInode(ctx, root, 0444, &netSnmpData{stack: stack}),

			// The following files are simple stubs until they are implemented
			// in netstack. If the file contains a header, the stub is just the
			// header; otherwise it is an empty file.
			"arp":       fs.newInode(ctx, root, 0444, newStaticFile(arp)),
			"netlink":   fs.newInode(ctx, root, 0444, newStaticFile(netlink)),
			"netstat":   fs.newInode(ctx, root, 0444, &netStatData{}),
			"packet":    fs.newInode(ctx, root, 0444, newStaticFile(packet)),
			"protocols": fs.newInode(ctx, root, 0444, newStaticFile(protocols)),

			// Linux sets psched values to: nsec per usec, psched tick in ns, 1000000,
			// high res timer ticks per sec (ClockGetres returns 1ns resolution).
"psched": fs.newInode(ctx, root, 0444, newStaticFile(psched)), "ptype": fs.newInode(ctx, root, 0444, newStaticFile(ptype)), "route": fs.newInode(ctx, root, 0444, &netRouteData{stack: stack}), "tcp": fs.newInode(ctx, root, 0444, &netTCPData{kernel: k}), "udp": fs.newInode(ctx, root, 0444, &netUDPData{kernel: k}), "unix": fs.newInode(ctx, root, 0444, &netUnixData{kernel: k}), } if stack.SupportsIPv6() { contents["if_inet6"] = fs.newInode(ctx, root, 0444, &ifinet6{stack: stack}) contents["ipv6_route"] = fs.newInode(ctx, root, 0444, newStaticFile("")) contents["tcp6"] = fs.newInode(ctx, root, 0444, &netTCP6Data{kernel: k}) contents["udp6"] = fs.newInode(ctx, root, 0444, newStaticFile(upd6)) } } return fs.newTaskOwnedDir(ctx, task, fs.NextIno(), 0555, contents) } // ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6. // // +stateify savable type ifinet6 struct { kernfs.DynamicBytesFile stack inet.Stack } var _ dynamicInode = (*ifinet6)(nil) func (n *ifinet6) contents() []string { var lines []string nics := n.stack.Interfaces() for id, naddrs := range n.stack.InterfaceAddrs() { nic, ok := nics[id] if !ok { // NIC was added after NICNames was called. We'll just ignore it. continue } for _, a := range naddrs { // IPv6 only. if a.Family != linux.AF_INET6 { continue } // Fields: // IPv6 address displayed in 32 hexadecimal chars without colons // Netlink device number (interface index) in hexadecimal (use nic id) // Prefix length in hexadecimal // Scope value (use 0) // Interface flags // Device name lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name)) } } return lines } // Generate implements vfs.DynamicBytesSource.Generate. func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error { for _, l := range n.contents() { buf.WriteString(l) } return nil } // netDevData implements vfs.DynamicBytesSource for /proc/net/dev. // // +stateify savable type netDevData struct { kernfs.DynamicBytesFile stack inet.Stack } var _ dynamicInode = (*netDevData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (n *netDevData) Generate(ctx context.Context, buf *bytes.Buffer) error { interfaces := n.stack.Interfaces() buf.WriteString("Inter-| Receive | Transmit\n") buf.WriteString(" face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed\n") for _, i := range interfaces { // Implements the same format as // net/core/net-procfs.c:dev_seq_printf_stats. var stats inet.StatDev if err := n.stack.Statistics(&stats, i.Name); err != nil { log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err) continue } fmt.Fprintf( buf, "%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n", i.Name, // Received stats[0], // bytes stats[1], // packets stats[2], // errors stats[3], // dropped stats[4], // fifo stats[5], // frame stats[6], // compressed stats[7], // multicast // Transmitted stats[8], // bytes stats[9], // packets stats[10], // errors stats[11], // dropped stats[12], // fifo stats[13], // frame stats[14], // compressed stats[15], // multicast ) } return nil } // netUnixData implements vfs.DynamicBytesSource for /proc/net/unix. // // +stateify savable type netUnixData struct { kernfs.DynamicBytesFile kernel *kernel.Kernel } var _ dynamicInode = (*netUnixData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. 
func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString("Num RefCount Protocol Flags Type St Inode Path\n") for _, se := range n.kernel.ListSockets() { s := se.SockVFS2 if !s.TryIncRef() { // Racing with socket destruction; this is ok. continue } if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX { s.DecRef(ctx) // Not a unix socket. continue } sops := s.Impl().(*unix.SocketVFS2) addr, err := sops.Endpoint().GetLocalAddress() if err != nil { log.Warningf("Failed to retrieve socket name from %+v: %v", s, err) addr.Addr = "<unknown>" } sockFlags := 0 if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok { if ce.Listening() { // For unix domain sockets, Linux reports a single flag // value, __SO_ACCEPTCON, if the socket is listening. sockFlags = linux.SO_ACCEPTCON } } // Get inode number. var ino uint64 stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_INO}) if statErr != nil || stat.Mask&linux.STATX_INO == 0 { log.Warningf("Failed to retrieve ino for socket file: %v", statErr) } else { ino = stat.Ino } // In the socket entry below, the value for the 'Num' field requires // some consideration. Linux prints the address of the struct // unix_sock representing a socket in the kernel, but may redact the // value for unprivileged users depending on the kptr_restrict // sysctl. // // One use for this field is to allow a privileged user to // introspect into the kernel memory to determine information about // a socket not available through procfs, such as the socket's peer. // // In gVisor, returning a pointer to our internal structures would // be pointless, as it wouldn't match the memory layout for struct // unix_sock, making introspection difficult. We could populate a // struct unix_sock with the appropriate data, but even that // requires consideration for which kernel version to emulate, as // the definition of this struct changes over time. // // For now, we always redact this pointer. fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d", (*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct. s.ReadRefs()-1, // RefCount, don't count our own ref. 0, // Protocol, always 0 for UDS. sockFlags, // Flags. sops.Endpoint().Type(), // Type. sops.State(), // State. ino, // Inode. ) // Path if len(addr.Addr) != 0 { if addr.Addr[0] == 0 { // Abstract path. fmt.Fprintf(buf, " @%s", string(addr.Addr[1:])) } else { fmt.Fprintf(buf, " %s", string(addr.Addr)) } } fmt.Fprintf(buf, "\n") s.DecRef(ctx) } return nil } func networkToHost16(n uint16) uint16 { // n is in network byte order, so is big-endian. The most-significant byte // should be stored in the lower address. // // We manually inline binary.BigEndian.Uint16() because Go does not support // non-primitive consts, so binary.BigEndian is a (mutable) var, so calls to // binary.BigEndian.Uint16() require a read of binary.BigEndian and an // interface method call, defeating inlining. buf := [2]byte{byte(n >> 8 & 0xff), byte(n & 0xff)} return hostarch.ByteOrder.Uint16(buf[:]) } func writeInetAddr(w io.Writer, family int, i linux.SockAddr) { switch family { case linux.AF_INET: var a linux.SockAddrInet if i != nil { a = *i.(*linux.SockAddrInet) } // linux.SockAddrInet.Port is stored in the network byte order and is // printed like a number in host byte order. Note that all numbers in host // byte order are printed with the most-significant byte first when // formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux.
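// For example, port 8080 arrives as the network-order bytes {0x1F, 0x90}; on a little-endian host a.Port then holds 0x901F, networkToHost16 recovers 0x1F90, and %04X prints it as "1F90".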
port := networkToHost16(a.Port) // linux.SockAddrInet.Addr is stored as a byte slice in big-endian order // (i.e. most-significant byte in index 0). Linux represents this as a // __be32 which is a typedef for an unsigned int, and is printed with // %X. This means that for a little-endian machine, Linux prints the // least-significant byte of the address first. To emulate this, we first // invert the byte order for the address using hostarch.ByteOrder.Uint32, // which makes it have the equivalent encoding to a __be32 on a little // endian machine. Note that this operation is a no-op on a big endian // machine. Then similar to Linux, we format it with %X, which will print // the most-significant byte of the __be32 address first, which is now // actually the least-significant byte of the original address in // linux.SockAddrInet.Addr on little endian machines, due to the conversion. addr := hostarch.ByteOrder.Uint32(a.Addr[:]) fmt.Fprintf(w, "%08X:%04X ", addr, port) case linux.AF_INET6: var a linux.SockAddrInet6 if i != nil { a = *i.(*linux.SockAddrInet6) } port := networkToHost16(a.Port) addr0 := hostarch.ByteOrder.Uint32(a.Addr[0:4]) addr1 := hostarch.ByteOrder.Uint32(a.Addr[4:8]) addr2 := hostarch.ByteOrder.Uint32(a.Addr[8:12]) addr3 := hostarch.ByteOrder.Uint32(a.Addr[12:16]) fmt.Fprintf(w, "%08X%08X%08X%08X:%04X ", addr0, addr1, addr2, addr3, port) } } func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, family int) error { // t may be nil here if our caller is not part of a task goroutine. This can // happen for example if we're here for "sentryctl cat". When t is nil, // degrade gracefully and retrieve what we can. t := kernel.TaskFromContext(ctx) for _, se := range k.ListSockets() { s := se.SockVFS2 if !s.TryIncRef() { // Racing with socket destruction; this is ok. continue } sops, ok := s.Impl().(socket.SocketVFS2) if !ok { panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) { s.DecRef(ctx) // Not a TCP socket of the requested family. continue } // Linux's documentation for the fields below can be found at // https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt. // For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock(). // Note that the header doesn't contain labels for all the fields. // Field: sl; entry number. fmt.Fprintf(buf, "%4d: ", se.ID) // Field: local_address. var localAddr linux.SockAddr if t != nil { if local, _, err := sops.GetSockName(t); err == nil { localAddr = local } } writeInetAddr(buf, family, localAddr) // Field: rem_address. var remoteAddr linux.SockAddr if t != nil { if remote, _, err := sops.GetPeerName(t); err == nil { remoteAddr = remote } } writeInetAddr(buf, family, remoteAddr) // Field: state; socket state. fmt.Fprintf(buf, "%02X ", sops.State()) // Field: tx_queue, rx_queue; number of packets in the transmit and // receive queue. Unimplemented. fmt.Fprintf(buf, "%08X:%08X ", 0, 0) // Field: tr, tm->when; timer active state and number of jiffies // until timer expires. Unimplemented. fmt.Fprintf(buf, "%02X:%08X ", 0, 0) // Field: retrnsmt; number of unrecovered RTO timeouts. // Unimplemented. fmt.Fprintf(buf, "%08X ", 0) stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO}) // Field: uid.
if statErr != nil || stat.Mask&linux.STATX_UID == 0 { log.Warningf("Failed to retrieve uid for socket file: %v", statErr) fmt.Fprintf(buf, "%5d ", 0) } else { creds := auth.CredentialsFromContext(ctx) fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow())) } // Field: timeout; number of unanswered 0-window probes. // Unimplemented. fmt.Fprintf(buf, "%8d ", 0) // Field: inode. if statErr != nil || stat.Mask&linux.STATX_INO == 0 { log.Warningf("Failed to retrieve inode for socket file: %v", statErr) fmt.Fprintf(buf, "%8d ", 0) } else { fmt.Fprintf(buf, "%8d ", stat.Ino) } // Field: refcount. Don't count the ref we obtain while dereferencing // the weakref to this socket. fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) // Field: retransmit timeout. Unimplemented. fmt.Fprintf(buf, "%d ", 0) // Field: predicted tick of soft clock (delayed ACK control data). // Unimplemented. fmt.Fprintf(buf, "%d ", 0) // Field: (ack.quick<<1)|ack.pingpong. Unimplemented. fmt.Fprintf(buf, "%d ", 0) // Field: sending congestion window. Unimplemented. fmt.Fprintf(buf, "%d ", 0) // Field: Slow start size threshold, -1 if threshold >= 0xFFFF. // Unimplemented, report as large threshold. fmt.Fprintf(buf, "%d", -1) fmt.Fprintf(buf, "\n") s.DecRef(ctx) } return nil } // netTCPData implements vfs.DynamicBytesSource for /proc/net/tcp. // // +stateify savable type netTCPData struct { kernfs.DynamicBytesFile kernel *kernel.Kernel } var _ dynamicInode = (*netTCPData)(nil) func (d *netTCPData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode \n") return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET) } // netTCP6Data implements vfs.DynamicBytesSource for /proc/net/tcp6. // // +stateify savable type netTCP6Data struct { kernfs.DynamicBytesFile kernel *kernel.Kernel } var _ dynamicInode = (*netTCP6Data)(nil) func (d *netTCP6Data) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString(" sl local_address remote_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode\n") return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET6) } // netUDPData implements vfs.DynamicBytesSource for /proc/net/udp. // // +stateify savable type netUDPData struct { kernfs.DynamicBytesFile kernel *kernel.Kernel } var _ dynamicInode = (*netUDPData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error { // t may be nil here if our caller is not part of a task goroutine. This can // happen for example if we're here for "sentryctl cat". When t is nil, // degrade gracefully and retrieve what we can. t := kernel.TaskFromContext(ctx) buf.WriteString(" sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode ref pointer drops \n") for _, se := range d.kernel.ListSockets() { s := se.SockVFS2 if !s.TryIncRef() { // Racing with socket destruction; this is ok. continue } sops, ok := s.Impl().(socket.SocketVFS2) if !ok { panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s)) } if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM { s.DecRef(ctx) // Not a udp4 socket. continue } // For Linux's implementation, see net/ipv4/udp.c:udp4_format_sock().
// Field: sl; entry number. fmt.Fprintf(buf, "%5d: ", se.ID) // Field: local_address. var localAddr linux.SockAddrInet if t != nil { if local, _, err := sops.GetSockName(t); err == nil { localAddr = *local.(*linux.SockAddrInet) } } writeInetAddr(buf, linux.AF_INET, &localAddr) // Field: rem_address. var remoteAddr linux.SockAddrInet if t != nil { if remote, _, err := sops.GetPeerName(t); err == nil { remoteAddr = *remote.(*linux.SockAddrInet) } } writeInetAddr(buf, linux.AF_INET, &remoteAddr) // Field: state; socket state. fmt.Fprintf(buf, "%02X ", sops.State()) // Field: tx_queue, rx_queue; number of packets in the transmit and // receive queue. Unimplemented. fmt.Fprintf(buf, "%08X:%08X ", 0, 0) // Field: tr, tm->when. Always 0 for UDP. fmt.Fprintf(buf, "%02X:%08X ", 0, 0) // Field: retrnsmt. Always 0 for UDP. fmt.Fprintf(buf, "%08X ", 0) stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO}) // Field: uid. if statErr != nil || stat.Mask&linux.STATX_UID == 0 { log.Warningf("Failed to retrieve uid for socket file: %v", statErr) fmt.Fprintf(buf, "%5d ", 0) } else { creds := auth.CredentialsFromContext(ctx) fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow())) } // Field: timeout. Always 0 for UDP. fmt.Fprintf(buf, "%8d ", 0) // Field: inode. if statErr != nil || stat.Mask&linux.STATX_INO == 0 { log.Warningf("Failed to retrieve inode for socket file: %v", statErr) fmt.Fprintf(buf, "%8d ", 0) } else { fmt.Fprintf(buf, "%8d ", stat.Ino) } // Field: ref; reference count on the socket inode. Don't count the ref // we obtain while dereferencing the weakref to this socket. fmt.Fprintf(buf, "%d ", s.ReadRefs()-1) // Field: Socket struct address. Redacted due to the same reason as // the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData. fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil)) // Field: drops; number of dropped packets. Unimplemented. fmt.Fprintf(buf, "%d", 0) fmt.Fprintf(buf, "\n") s.DecRef(ctx) } return nil } // netSnmpData implements vfs.DynamicBytesSource for /proc/net/snmp.
// // +stateify savable type netSnmpData struct { kernfs.DynamicBytesFile stack inet.Stack } var _ dynamicInode = (*netSnmpData)(nil) // +stateify savable type snmpLine struct { prefix string header string } var snmp = []snmpLine{ { prefix: "Ip", header: "Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates", }, { prefix: "Icmp", header: "InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps", }, { prefix: "IcmpMsg", }, { prefix: "Tcp", header: "RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors", }, { prefix: "Udp", header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti", }, { prefix: "UdpLite", header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti", }, } func toSlice(a interface{}) []uint64 { v := reflect.Indirect(reflect.ValueOf(a)) return v.Slice(0, v.Len()).Interface().([]uint64) } func sprintSlice(s []uint64) string { if len(s) == 0 { return "" } r := fmt.Sprint(s) return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice. } // Generate implements vfs.DynamicBytesSource.Generate. func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error { types := []interface{}{ &inet.StatSNMPIP{}, &inet.StatSNMPICMP{}, nil, // TODO(gvisor.dev/issue/628): Support IcmpMsg stats. &inet.StatSNMPTCP{}, &inet.StatSNMPUDP{}, &inet.StatSNMPUDPLite{}, } for i, stat := range types { line := snmp[i] if stat == nil { // Print the prefix line twice, once for the header row and once for // the value row, mirroring the two lines Linux emits for a section // with no counters to report. fmt.Fprintf(buf, "%s:\n", line.prefix) fmt.Fprintf(buf, "%s:\n", line.prefix) continue } if err := d.stack.Statistics(stat, line.prefix); err != nil { if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) } else { log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err) } } fmt.Fprintf(buf, "%s: %s\n", line.prefix, line.header) if line.prefix == "Tcp" { tcp := stat.(*inet.StatSNMPTCP) // "Tcp" needs special processing because MaxConn is signed. RFC 2012. fmt.Fprintf(buf, "%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:])) } else { fmt.Fprintf(buf, "%s: %s\n", line.prefix, sprintSlice(toSlice(stat))) } } return nil } // netRouteData implements vfs.DynamicBytesSource for /proc/net/route. // // +stateify savable type netRouteData struct { kernfs.DynamicBytesFile stack inet.Stack } var _ dynamicInode = (*netRouteData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. // See Linux's net/ipv4/fib_trie.c:fib_route_seq_show. func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT") interfaces := d.stack.Interfaces() for _, rt := range d.stack.RouteTable() { // /proc/net/route only includes ipv4 routes. if rt.Family != linux.AF_INET { continue } // /proc/net/route does not include broadcast or multicast routes.
if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST { continue } iface, ok := interfaces[rt.OutputInterface] if !ok || iface.Name == "lo" { continue } var ( gw uint32 prefix uint32 flags = linux.RTF_UP ) if len(rt.GatewayAddr) == header.IPv4AddressSize { flags |= linux.RTF_GATEWAY gw = hostarch.ByteOrder.Uint32(rt.GatewayAddr) } if len(rt.DstAddr) == header.IPv4AddressSize { prefix = hostarch.ByteOrder.Uint32(rt.DstAddr) } l := fmt.Sprintf( "%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d", iface.Name, prefix, gw, flags, 0, // RefCnt. 0, // Use. 0, // Metric. (uint32(1)<<rt.DstLen)-1, 0, // MTU. 0, // Window. 0, // IRTT. ) fmt.Fprintf(buf, "%-127s\n", l) } return nil } // netStatData implements vfs.DynamicBytesSource for /proc/net/netstat. // // +stateify savable type netStatData struct { kernfs.DynamicBytesFile stack inet.Stack } var _ dynamicInode = (*netStatData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. // See Linux's net/ipv4/proc.c:netstat_seq_show. func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error { buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " + "EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps " + "LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive " + "PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost " + "ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog " + "TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser " + "TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging " + "TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo " + "TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit " + "TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans " + "TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes " + "TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail " + "TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent " + "TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose " + "TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed " + "TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld " + "TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected " + "TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback " + "TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter " + "TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail " + "TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK " + "TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail " + "TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow " + "TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets " + "TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv " + "TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect " + "TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd " + "TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq " + "TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge " + "TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n") return nil }
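// The standalone sketch below is ours, not part of the sentry; it assumes a
// little-endian host, where hostarch.ByteOrder is binary.LittleEndian. It
// illustrates the encoding conventions used by netRouteData above: addresses
// and masks are printed with %08X from the native-endian uint32 read out of
// the big-endian (network order) bytes, and a /len mask is (1<<len)-1 in that
// representation.
package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// 10.0.0.0 in network (big-endian) byte order.
	dst := [4]byte{10, 0, 0, 0}
	prefix := binary.LittleEndian.Uint32(dst[:])
	var dstLen uint = 24
	// For a /24 this is 0x00FFFFFF, i.e. 255.255.255.0 in this encoding. Go
	// defines uint32(1)<<32 == 0, so a /32 correctly yields 0xFFFFFFFF.
	mask := (uint32(1) << dstLen) - 1
	fmt.Printf("%08X %08X\n", prefix, mask) // Prints "0000000A 00FFFFFF".
}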
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) // A Filesystem is a tree of nodes represented by Dentries, which forms part of // a VirtualFilesystem. // // Filesystems are reference-counted. Unless otherwise specified, all // Filesystem methods require that a reference is held. // // Filesystem is analogous to Linux's struct super_block. // // +stateify savable type Filesystem struct { FilesystemRefs // vfs is the VirtualFilesystem that uses this Filesystem. vfs is // immutable. vfs *VirtualFilesystem // fsType is the FilesystemType of this Filesystem. fsType FilesystemType // impl is the FilesystemImpl associated with this Filesystem. impl is // immutable.
This should be the last field in Filesystem. impl FilesystemImpl } // Init must be called before first use of fs. func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, fsType FilesystemType, impl FilesystemImpl) { fs.InitRefs() fs.vfs = vfsObj fs.fsType = fsType fs.impl = impl vfsObj.filesystemsMu.Lock() vfsObj.filesystems[fs] = struct{}{} vfsObj.filesystemsMu.Unlock() } // FilesystemType returns the FilesystemType for this Filesystem. func (fs *Filesystem) FilesystemType() FilesystemType { return fs.fsType } // VirtualFilesystem returns the containing VirtualFilesystem. func (fs *Filesystem) VirtualFilesystem() *VirtualFilesystem { return fs.vfs } // Impl returns the FilesystemImpl associated with fs. func (fs *Filesystem) Impl() FilesystemImpl { return fs.impl } // DecRef decrements fs' reference count. func (fs *Filesystem) DecRef(ctx context.Context) { fs.FilesystemRefs.DecRef(func() { fs.vfs.filesystemsMu.Lock() delete(fs.vfs.filesystems, fs) fs.vfs.filesystemsMu.Unlock() fs.impl.Release(ctx) }) } // FilesystemImpl contains implementation details for a Filesystem. // Implementations of FilesystemImpl should contain their associated Filesystem // by value as their first field. // // All methods that take a ResolvingPath must resolve the path before // performing any other checks, including rejection of the operation if not // supported by the FilesystemImpl. This is because the final FilesystemImpl // (responsible for actually implementing the operation) isn't known until path // resolution is complete. // // Unless otherwise specified, FilesystemImpl methods are responsible for // performing permission checks. In many cases, vfs package functions in // permissions.go may be used to help perform these checks. // // When multiple specified error conditions apply to a given method call, the // implementation may return any applicable errno unless otherwise specified, // but returning the earliest error specified is preferable to maximize // compatibility with Linux. // // All methods may return errors not specified, notably including: // // - ENOENT if a required path component does not exist. // // - ENOTDIR if an intermediate path component is not a directory. // // - Errors from vfs-package functions (ResolvingPath.Resolve*(), // Mount.CheckBeginWrite(), permission-checking functions, etc.) // // For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid // should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID // and auth.KGID respectively). // // FilesystemImpl combines elements of Linux's struct super_operations and // struct inode_operations, for reasons described in the documentation for // Dentry. type FilesystemImpl interface { // Release is called when the associated Filesystem reaches zero // references. Release(ctx context.Context) // Sync "causes all pending modifications to filesystem metadata and cached // file data to be written to the underlying [filesystem]", as by syncfs(2). Sync(ctx context.Context) error // AccessAt checks whether a user with creds can access the file at rp. AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error // GetDentryAt returns a Dentry representing the file at rp. A reference is // taken on the returned Dentry. // // GetDentryAt does not correspond directly to a Linux syscall; it is used // in the implementation of: // // - Syscalls that need to resolve two paths: link(), linkat().
// // - Syscalls that need to refer to a filesystem position outside the // context of a file description: chdir(), fchdir(), chroot(), mount(), // umount(). GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) // GetParentDentryAt returns a Dentry representing the directory at the // second-to-last path component in rp. (Note that, despite the name, this // is not necessarily the parent directory of the file at rp, since the // last path component in rp may be "." or "..".) A reference is taken on // the returned Dentry. // // GetParentDentryAt does not correspond directly to a Linux syscall; it is // used in the implementation of the rename() family of syscalls, which // must resolve the parent directories of two paths. // // Preconditions: !rp.Done(). // // Postconditions: If GetParentDentryAt returns a nil error, then // rp.Final(). If GetParentDentryAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) // LinkAt creates a hard link at rp representing the same file as vd. It // does not take ownership of references on vd. // // Errors: // // - If the last path component in rp is "." or "..", LinkAt returns // EEXIST. // // - If a file already exists at rp, LinkAt returns EEXIST. // // - If rp.MustBeDir(), LinkAt returns ENOENT. // // - If the directory in which the link would be created has been removed // by RmdirAt or RenameAt, LinkAt returns ENOENT. // // - If rp.Mount != vd.Mount(), LinkAt returns EXDEV. // // - If vd represents a directory, LinkAt returns EPERM. // // - If vd represents a file for which all existing links have been // removed, or a file created by open(O_TMPFILE|O_EXCL), LinkAt returns // ENOENT; that is, a file with a link count of 0 can be linked only if it // was created by open(O_TMPFILE) without O_EXCL. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If LinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error // MkdirAt creates a directory at rp. // // Errors: // // - If the last path component in rp is "." or "..", MkdirAt returns // EEXIST. // // - If a file already exists at rp, MkdirAt returns EEXIST. // // - If the directory in which the new directory would be created has been // removed by RmdirAt or RenameAt, MkdirAt returns ENOENT. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If MkdirAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error // MknodAt creates a regular file, device special file, or named pipe at // rp. // // Errors: // // - If the last path component in rp is "." or "..", MknodAt returns // EEXIST. // // - If a file already exists at rp, MknodAt returns EEXIST. // // - If rp.MustBeDir(), MknodAt returns ENOENT. // // - If the directory in which the file would be created has been removed // by RmdirAt or RenameAt, MknodAt returns ENOENT. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If MknodAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done().
MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error // OpenAt returns a FileDescription providing access to the file at rp. A // reference is taken on the returned FileDescription. // // Errors: // // - If opts.Flags specifies O_TMPFILE and this feature is unsupported by // the implementation, OpenAt returns EOPNOTSUPP. (All other unsupported // features are silently ignored, consistent with Linux's open*(2).) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) // ReadlinkAt returns the target of the symbolic link at rp. // // Errors: // // - If the file at rp is not a symbolic link, ReadlinkAt returns EINVAL. ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) // RenameAt renames the file named oldName in directory oldParentVD to rp. // It does not take ownership of references on oldParentVD. // // Errors [1]: // // - If opts.Flags specifies unsupported options, RenameAt returns EINVAL. // // - If the last path component in rp is "." or "..", and opts.Flags // contains RENAME_NOREPLACE, RenameAt returns EEXIST. // // - If the last path component in rp is "." or "..", and opts.Flags does // not contain RENAME_NOREPLACE, RenameAt returns EBUSY. // // - If rp.Mount != oldParentVD.Mount(), RenameAt returns EXDEV. // // - If the renamed file is not a directory, and opts.MustBeDir is true, // RenameAt returns ENOTDIR. // // - If renaming would replace an existing file and opts.Flags contains // RENAME_NOREPLACE, RenameAt returns EEXIST. // // - If there is no existing file at rp and opts.Flags contains // RENAME_EXCHANGE, RenameAt returns ENOENT. // // - If there is an existing non-directory file at rp, and rp.MustBeDir() // is true, RenameAt returns ENOTDIR. // // - If the renamed file is not a directory, opts.Flags does not contain // RENAME_EXCHANGE, and rp.MustBeDir() is true, RenameAt returns ENOTDIR. // (This check is not subsumed by the check for directory replacement below // since it applies even if there is no file to replace.) // // - If the renamed file is a directory, and the new parent directory of // the renamed file is either the renamed directory or a descendant // subdirectory of the renamed directory, RenameAt returns EINVAL. // // - If renaming would exchange the renamed file with an ancestor directory // of the renamed file, RenameAt returns EINVAL. // // - If renaming would replace an ancestor directory of the renamed file, // RenameAt returns ENOTEMPTY. (This check would be subsumed by the // non-empty directory check below; however, this check takes place before // the self-rename check.) // // - If the renamed file would replace or exchange with itself (i.e. the // source and destination paths resolve to the same file), RenameAt returns // nil, skipping the checks described below. // // - If the source or destination directory is not writable by the provider // of rp.Credentials(), RenameAt returns EACCES. // // - If the renamed file is a directory, and renaming would replace a // non-directory file, RenameAt returns ENOTDIR. // // - If the renamed file is not a directory, and renaming would replace a // directory, RenameAt returns EISDIR. // // - If the new parent directory of the renamed file has been removed by // RmdirAt or a preceding call to RenameAt, RenameAt returns ENOENT. // // - If the renamed file is a directory, it is not writable by the // provider of rp.Credentials(), and the source and destination parent // directories are different, RenameAt returns EACCES.
(This is nominally // required to change the ".." entry in the renamed directory.) // // - If renaming would replace a non-empty directory, RenameAt returns // ENOTEMPTY. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). // * oldParentVD.Dentry() was obtained from a previous call to // oldParentVD.Mount().Filesystem().Impl().GetParentDentryAt(). // * oldName is not "." or "..". // // Postconditions: If RenameAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). // // [1] "The worst of all namespace operations - renaming directory. // "Perverted" doesn't even start to describe it. Somebody in UCB had a // heck of a trip..." - fs/namei.c:vfs_rename() RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error // RmdirAt removes the directory at rp. // // Errors: // // - If the last path component in rp is ".", RmdirAt returns EINVAL. // // - If the last path component in rp is "..", RmdirAt returns ENOTEMPTY. // // - If no file exists at rp, RmdirAt returns ENOENT. // // - If the file at rp exists but is not a directory, RmdirAt returns // ENOTDIR. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If RmdirAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). RmdirAt(ctx context.Context, rp *ResolvingPath) error // SetStatAt updates metadata for the file at the given path. Implementations // are responsible for checking if the operation can be performed // (see vfs.CheckSetStat() for common checks). // // Errors: // // - If opts specifies unsupported options, SetStatAt returns EINVAL. SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error // StatAt returns metadata for the file at rp. StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) // StatFSAt returns metadata for the filesystem containing the file at rp. // (This method takes a path because a FilesystemImpl may consist of any // number of constituent filesystems.) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) // SymlinkAt creates a symbolic link at rp referring to the given target. // // Errors: // // - If the last path component in rp is "." or "..", SymlinkAt returns // EEXIST. // // - If a file already exists at rp, SymlinkAt returns EEXIST. // // - If rp.MustBeDir(), SymlinkAt returns ENOENT. // // - If the directory in which the symbolic link would be created has been // removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If SymlinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error // UnlinkAt removes the file at rp. // // Errors: // // - If the last path component in rp is "." or "..", UnlinkAt returns // EISDIR. // // - If no file exists at rp, UnlinkAt returns ENOENT. // // - If rp.MustBeDir(), and the file at rp exists and is not a directory, // UnlinkAt returns ENOTDIR. // // - If the file at rp exists but is a directory, UnlinkAt returns EISDIR. // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). // // Postconditions: If UnlinkAt returns an error returned by // ResolvingPath.Resolve*(), then !rp.Done(). 
UnlinkAt(ctx context.Context, rp *ResolvingPath) error // ListXattrAt returns all extended attribute names for the file at rp. // // Errors: // // - If extended attributes are not supported by the filesystem, // ListXattrAt returns ENOTSUP. // // - If the size of the list (including a NUL terminating byte after every // entry) would exceed size, ERANGE may be returned (note that // implementations are free to ignore size entirely and return without // error). In all cases, if size is 0, the list should be returned without // error, regardless of size. ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) // GetXattrAt returns the value associated with the given extended // attribute for the file at rp. // // Errors: // // - If extended attributes are not supported by the filesystem, GetXattrAt // returns ENOTSUP. // // - If an extended attribute named opts.Name does not exist, ENODATA is // returned. // // - If the size of the return value exceeds opts.Size, ERANGE may be // returned (note that implementations are free to ignore opts.Size entirely // and return without error). In all cases, if opts.Size is 0, the value // should be returned without error, regardless of size. GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) // SetXattrAt changes the value associated with the given extended // attribute for the file at rp. // // Errors: // // - If extended attributes are not supported by the filesystem, SetXattrAt // returns ENOTSUP. // // - If XATTR_CREATE is set in opts.Flag and opts.Name already exists, // EEXIST is returned. If XATTR_REPLACE is set and opts.Name does not exist, // ENODATA is returned. SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error // RemoveXattrAt removes the given extended attribute from the file at rp. // // Errors: // // - If extended attributes are not supported by the filesystem, // RemoveXattrAt returns ENOTSUP. // // - If name does not exist, ENODATA is returned. RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error // BoundEndpointAt returns the Unix socket endpoint bound at the path rp. // // Errors: // // - If the file does not have write permissions, then BoundEndpointAt // returns EACCES. // // - If a non-socket file exists at rp, then BoundEndpointAt returns // ECONNREFUSED. BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) // PrependPath prepends a path from vd to vd.Mount().Root() to b. // // If vfsroot.Ok(), it is the contextual VFS root; if it is encountered // before vd.Mount().Root(), PrependPath should stop prepending path // components and return a PrependPathAtVFSRootError. // // If traversal of vd.Dentry()'s ancestors encounters an independent // ("root") Dentry that is not vd.Mount().Root() (i.e. vd.Dentry() is not a // descendant of vd.Mount().Root()), PrependPath should stop prepending // path components and return a PrependPathAtNonMountRootError. // // Filesystems for which Dentries do not have meaningful paths may prepend // an arbitrary descriptive string to b and then return a // PrependPathSyntheticError. // // Most implementations can acquire the appropriate locks to ensure that // Dentry.Name() and Dentry.Parent() are fixed for vd.Dentry() and all of // its ancestors, then call GenericPrependPath. // // Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error // MountOptions returns mount options for the current filesystem. This // should only return options specific to the filesystem (i.e. don't return // "ro", "rw", etc). Options should be returned as a comma-separated string, // similar to the input to the 5th argument to mount. // // If the implementation has no filesystem-specific options, it should // return the empty string. MountOptions() string } // PrependPathAtVFSRootError is returned by implementations of // FilesystemImpl.PrependPath() when they encounter the contextual VFS root. // // +stateify savable type PrependPathAtVFSRootError struct{} // Error implements error.Error. func (PrependPathAtVFSRootError) Error() string { return "vfs.FilesystemImpl.PrependPath() reached VFS root" } // PrependPathAtNonMountRootError is returned by implementations of // FilesystemImpl.PrependPath() when they encounter an independent ancestor // Dentry that is not the Mount root. // // +stateify savable type PrependPathAtNonMountRootError struct{} // Error implements error.Error. func (PrependPathAtNonMountRootError) Error() string { return "vfs.FilesystemImpl.PrependPath() reached root other than Mount root" } // PrependPathSyntheticError is returned by implementations of // FilesystemImpl.PrependPath() for which prepended names do not represent real // paths. // // +stateify savable type PrependPathSyntheticError struct{} // Error implements error.Error. func (PrependPathSyntheticError) Error() string { return "vfs.FilesystemImpl.PrependPath() prepended synthetic name" }
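// The sketch below is illustrative only (handlePrependPathErr is not in the
// real source); it shows one plausible way a caller of
// FilesystemImpl.PrependPath might dispatch on the sentinel error types
// defined above.
func handlePrependPathErr(err error) (done bool) {
	switch err.(type) {
	case nil:
		// Prepending reached vd.Mount().Root(); the caller continues with
		// the next mount, if any.
		return false
	case PrependPathAtVFSRootError:
		// The contextual VFS root was reached; the path is complete.
		return true
	case PrependPathAtNonMountRootError, PrependPathSyntheticError:
		// No meaningful path components remain past this point; real callers
		// mark the result unreachable or synthetic as appropriate.
		return true
	default:
		// Any other error would be propagated in real code.
		return true
	}
}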
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package proc import ( "bytes" "fmt" "strconv" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) // +stateify savable type selfSymlink struct { implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeSymlink pidns *kernel.PIDNamespace } var _ kernfs.Inode = (*selfSymlink)(nil) func (i *tasksInode) newSelfSymlink(ctx context.Context, creds *auth.Credentials) kernfs.Inode { inode := &selfSymlink{pidns: i.pidns} inode.Init(ctx, creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777) return inode } func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link?
return "", linuxerr.EINVAL } tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) if tgid == 0 { return "", syserror.ENOENT } return strconv.FormatUint(uint64(tgid), 10), nil } func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { target, err := s.Readlink(ctx, mnt) return vfs.VirtualDentry{}, target, err } // SetStat implements kernfs.Inode.SetStat, not allowing inode attributes to be changed. func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // +stateify savable type threadSelfSymlink struct { implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeSymlink pidns *kernel.PIDNamespace } var _ kernfs.Inode = (*threadSelfSymlink)(nil) func (i *tasksInode) newThreadSelfSymlink(ctx context.Context, creds *auth.Credentials) kernfs.Inode { inode := &threadSelfSymlink{pidns: i.pidns} inode.Init(ctx, creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777) return inode } func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { t := kernel.TaskFromContext(ctx) if t == nil { // Who is reading this link? return "", linuxerr.EINVAL } tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup()) tid := s.pidns.IDOfTask(t) if tid == 0 || tgid == 0 { return "", syserror.ENOENT } return fmt.Sprintf("%d/task/%d", tgid, tid), nil } func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { target, err := s.Readlink(ctx, mnt) return vfs.VirtualDentry{}, target, err } // SetStat implements kernfs.Inode.SetStat, not allowing inode attributes to be changed. func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // dynamicBytesFileSetAttr implements a special file that allows inode // attributes to be set. This is to support /proc files that are read-only, but // allow attributes to be set. // // +stateify savable type dynamicBytesFileSetAttr struct { kernfs.DynamicBytesFile } // SetStat implements kernfs.Inode.SetStat. func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts) } // cpuStats contains the breakdown of CPU time for /proc/stat. // // +stateify savable type cpuStats struct { // user is time spent in userspace tasks with non-positive niceness. user uint64 // nice is time spent in userspace tasks with positive niceness. nice uint64 // system is time spent in non-interrupt kernel context. system uint64 // idle is time spent idle. idle uint64 // ioWait is time spent waiting for IO. ioWait uint64 // irq is time spent in interrupt context. irq uint64 // softirq is time spent in software interrupt context. softirq uint64 // steal is involuntary wait time. steal uint64 // guest is time spent in guests with non-positive niceness. guest uint64 // guestNice is time spent in guests with positive niceness. guestNice uint64 } // String implements fmt.Stringer. func (c cpuStats) String() string { return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice) } // statData implements vfs.DynamicBytesSource for /proc/stat.
// // +stateify savable type statData struct { dynamicBytesFileSetAttr } var _ dynamicInode = (*statData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (*statData) Generate(ctx context.Context, buf *bytes.Buffer) error { // TODO(b/37226836): We currently export only zero CPU stats. We could // at least provide some aggregate stats. var cpu cpuStats fmt.Fprintf(buf, "cpu %s\n", cpu) k := kernel.KernelFromContext(ctx) for c, max := uint(0), k.ApplicationCores(); c < max; c++ { fmt.Fprintf(buf, "cpu%d %s\n", c, cpu) } // The total number of interrupts is dependent on the CPUs and PCI // devices on the system. See arch_probe_nr_irqs. // // Since we don't report real interrupt stats, just choose an arbitrary // value from a representative VM. const numInterrupts = 256 // The Kernel doesn't handle real interrupts, so report all zeroes. // TODO(b/37226836): We could count page faults as #PF. fmt.Fprintf(buf, "intr 0") // total for i := 0; i < numInterrupts; i++ { fmt.Fprintf(buf, " 0") } fmt.Fprintf(buf, "\n") // Total number of context switches. // TODO(b/37226836): Count this. fmt.Fprintf(buf, "ctxt 0\n") // CLOCK_REALTIME timestamp from boot, in seconds. fmt.Fprintf(buf, "btime %d\n", k.Timekeeper().BootTime().Seconds()) // Total number of clones. // TODO(b/37226836): Count this. fmt.Fprintf(buf, "processes 0\n") // Number of runnable tasks. // TODO(b/37226836): Count this. fmt.Fprintf(buf, "procs_running 0\n") // Number of tasks waiting on IO. // TODO(b/37226836): Count this. fmt.Fprintf(buf, "procs_blocked 0\n") // Number of each softirq handled. fmt.Fprintf(buf, "softirq 0") // total for i := 0; i < linux.NumSoftIRQ; i++ { fmt.Fprintf(buf, " 0") } fmt.Fprintf(buf, "\n") return nil } // loadavgData backs /proc/loadavg. // // +stateify savable type loadavgData struct { dynamicBytesFileSetAttr } var _ dynamicInode = (*loadavgData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (*loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error { // TODO(b/62345059): Include real data in fields. // Column 1-3: CPU and IO utilization of the last 1, 5, and 15 minute periods. // Column 4-5: currently running processes and the total number of processes. // Column 6: the last process ID used. fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0) return nil } // meminfoData implements vfs.DynamicBytesSource for /proc/meminfo. // // +stateify savable type meminfoData struct { dynamicBytesFileSetAttr } var _ dynamicInode = (*meminfoData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (*meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { k := kernel.KernelFromContext(ctx) mf := k.MemoryFile() mf.UpdateUsage() snapshot, totalUsage := usage.MemoryAccounting.Copy() totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage) anon := snapshot.Anonymous + snapshot.Tmpfs file := snapshot.PageCache + snapshot.Mapped // We don't actually have active/inactive LRUs, so just make up numbers. activeFile := (file / 2) &^ (hostarch.PageSize - 1) inactiveFile := file - activeFile fmt.Fprintf(buf, "MemTotal: %8d kB\n", totalSize/1024) memFree := totalSize - totalUsage if memFree > totalSize { // Underflow. memFree = 0 } // We use MemFree as MemAvailable because we don't swap. // TODO(rahat): When reclaim is implemented the value of MemAvailable // should change.
fmt.Fprintf(buf, "MemFree: %8d kB\n", memFree/1024) fmt.Fprintf(buf, "MemAvailable: %8d kB\n", memFree/1024) fmt.Fprintf(buf, "Buffers: 0 kB\n") // memory usage by block devices fmt.Fprintf(buf, "Cached: %8d kB\n", (file+snapshot.Tmpfs)/1024) // Emulate a system with no swap, which disables inactivation of anon pages. fmt.Fprintf(buf, "SwapCache: 0 kB\n") fmt.Fprintf(buf, "Active: %8d kB\n", (anon+activeFile)/1024) fmt.Fprintf(buf, "Inactive: %8d kB\n", inactiveFile/1024) fmt.Fprintf(buf, "Active(anon): %8d kB\n", anon/1024) fmt.Fprintf(buf, "Inactive(anon): 0 kB\n") fmt.Fprintf(buf, "Active(file): %8d kB\n", activeFile/1024) fmt.Fprintf(buf, "Inactive(file): %8d kB\n", inactiveFile/1024) fmt.Fprintf(buf, "Unevictable: 0 kB\n") // TODO(b/31823263) fmt.Fprintf(buf, "Mlocked: 0 kB\n") // TODO(b/31823263) fmt.Fprintf(buf, "SwapTotal: 0 kB\n") fmt.Fprintf(buf, "SwapFree: 0 kB\n") fmt.Fprintf(buf, "Dirty: 0 kB\n") fmt.Fprintf(buf, "Writeback: 0 kB\n") fmt.Fprintf(buf, "AnonPages: %8d kB\n", anon/1024) fmt.Fprintf(buf, "Mapped: %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know fmt.Fprintf(buf, "Shmem: %8d kB\n", snapshot.Tmpfs/1024) return nil } // uptimeData implements vfs.DynamicBytesSource for /proc/uptime. // // +stateify savable type uptimeData struct { dynamicBytesFileSetAttr } var _ dynamicInode = (*uptimeData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (*uptimeData) Generate(ctx context.Context, buf *bytes.Buffer) error { k := kernel.KernelFromContext(ctx) now := time.NowFromContext(ctx) // Pretend that we've spent zero time sleeping (second number). fmt.Fprintf(buf, "%.2f 0.00\n", now.Sub(k.Timekeeper().BootTime()).Seconds()) return nil } // versionData implements vfs.DynamicBytesSource for /proc/version. // // +stateify savable type versionData struct { dynamicBytesFileSetAttr } var _ dynamicInode = (*versionData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (*versionData) Generate(ctx context.Context, buf *bytes.Buffer) error { // /proc/version takes the form: // // "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST) // (COMPILER_VERSION) VERSION" // // where: // - SYSNAME, RELEASE, and VERSION are the same as returned by // sys_utsname // - COMPILE_USER is the user that built the kernel // - COMPILE_HOST is the hostname of the machine on which the kernel // was built // - COMPILER_VERSION is the version reported by the building compiler // // Since we don't really want to expose build information to // applications, those fields are omitted. // // FIXME(mpratt): Using Version from the init task SyscallTable // disregards the different version a task may have (e.g., in a uts // namespace). ver := kernelVersion(ctx) fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version) return nil } // filesystemsData backs /proc/filesystems. // // +stateify savable type filesystemsData struct { kernfs.DynamicBytesFile } var _ dynamicInode = (*filesystemsData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *filesystemsData) Generate(ctx context.Context, buf *bytes.Buffer) error { k := kernel.KernelFromContext(ctx) k.VFS().GenerateProcFilesystems(buf) return nil } // cgroupsData backs /proc/cgroups. // // +stateify savable type cgroupsData struct { dynamicBytesFileSetAttr } var _ dynamicInode = (*cgroupsData)(nil) // Generate implements vfs.DynamicBytesSource.Generate.
func (*cgroupsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	r := kernel.KernelFromContext(ctx).CgroupRegistry()
	r.GenerateProcCgroups(buf)
	return nil
}

// cmdLineData backs /proc/cmdline.
//
// +stateify savable
type cmdLineData struct {
	dynamicBytesFileSetAttr
}

var _ dynamicInode = (*cmdLineData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
func (*cmdLineData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	fmt.Fprintf(buf, "BOOT_IMAGE=/vmlinuz-%s-gvisor quiet\n", kernelVersion(ctx).Release)
	return nil
}

// kernelVersion returns the kernel version.
func kernelVersion(ctx context.Context) kernel.Version {
	k := kernel.KernelFromContext(ctx)
	init := k.GlobalInit()
	if init == nil {
		// Attempted to read before the init Task is created. This can
		// only occur during startup, which should never need to read
		// this file.
		panic("Attempted to read version before initial Task is available")
	}
	return init.Leader().SyscallTable().Version
}
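// The files above share one shape: a stateless type whose Generate method
// renders the file's current contents into a bytes.Buffer on demand. Below is
// a minimal standalone sketch of that pattern; the "generator" interface and
// the types here are illustrative assumptions, not gVisor's actual API.

package main

import (
	"bytes"
	"fmt"
)

// generator plays the role of vfs.DynamicBytesSource.Generate.
type generator interface {
	Generate(buf *bytes.Buffer) error
}

// loadavg emits fixed zeroes, like loadavgData above.
type loadavg struct{}

func (loadavg) Generate(buf *bytes.Buffer) error {
	fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0)
	return nil
}

func main() {
	var buf bytes.Buffer
	var g generator = loadavg{}
	if err := g.Generate(&buf); err != nil {
		panic(err)
	}
	fmt.Print(buf.String()) // "0.00 0.00 0.00 0/0 0"
}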
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/waiter"
)

// TasksLimit is the maximum number of threads for an untrusted application.
// Linux doesn't really limit this directly, rather it is limited by total
// memory size, stacks allocated and a global maximum. There's no real reason
// for us to limit it either (esp. since threads are backed by goroutines),
// and we would expect to hit resource limits long before hitting this number.
// However, for correctness, we still check that the user doesn't exceed this
// number.
//
// Note that because of the way futexes are implemented, there *are* in fact
// serious restrictions on valid thread IDs. They are limited to 2^30 - 1
// (kernel/fork.c:MAX_THREADS).
const TasksLimit = (1 << 16)

// ThreadID is a generic thread identifier.
// // +marshal type ThreadID int32 // String returns a decimal representation of the ThreadID. func (tid ThreadID) String() string { return fmt.Sprintf("%d", tid) } // InitTID is the TID given to the first task added to each PID namespace. The // thread group led by InitTID is called the namespace's init process. The // death of a PID namespace's init process causes all tasks visible in that // namespace to be killed. const InitTID ThreadID = 1 // A TaskSet comprises all tasks in a system. // // +stateify savable type TaskSet struct { // mu protects all relationships between tasks and thread groups in the // TaskSet. (mu is approximately equivalent to Linux's tasklist_lock.) mu sync.RWMutex `state:"nosave"` // Root is the root PID namespace, in which all tasks in the TaskSet are // visible. The Root pointer is immutable. Root *PIDNamespace // sessions is the set of all sessions. sessions sessionList // stopCount is the number of active external stops applicable to all tasks // in the TaskSet (calls to TaskSet.BeginExternalStop that have not been // paired with a call to TaskSet.EndExternalStop). stopCount is protected // by mu. // // stopCount is not saved for the same reason as Task.stopCount; it is // always reset to zero after restore. stopCount int32 `state:"nosave"` // liveGoroutines is the number of non-exited task goroutines in the // TaskSet. // // liveGoroutines is not saved; it is reset as task goroutines are // restarted by Task.Start. liveGoroutines sync.WaitGroup `state:"nosave"` // runningGoroutines is the number of running task goroutines in the // TaskSet. // // runningGoroutines is not saved; its counter value is required to be zero // at time of save (but note that this is not necessarily the same thing as // sync.WaitGroup's zero value). runningGoroutines sync.WaitGroup `state:"nosave"` // aioGoroutines is the number of goroutines running async I/O // callbacks. // // aioGoroutines is not saved but is required to be zero at the time of // save. aioGoroutines sync.WaitGroup `state:"nosave"` } // newTaskSet returns a new, empty TaskSet. func newTaskSet(pidns *PIDNamespace) *TaskSet { ts := &TaskSet{Root: pidns} pidns.owner = ts return ts } // forEachThreadGroupLocked applies f to each thread group in ts. // // Preconditions: ts.mu must be locked (for reading or writing). func (ts *TaskSet) forEachThreadGroupLocked(f func(tg *ThreadGroup)) { for tg := range ts.Root.tgids { f(tg) } } // forEachTaskLocked applies f to each Task in ts. // // Preconditions: ts.mu must be locked (for reading or writing). func (ts *TaskSet) forEachTaskLocked(f func(t *Task)) { for t := range ts.Root.tids { f(t) } } // A PIDNamespace represents a PID namespace, a bimap between thread IDs and // tasks. See the pid_namespaces(7) man page for further details. // // N.B. A task is said to be visible in a PID namespace if the PID namespace // contains a thread ID that maps to that task. // // +stateify savable type PIDNamespace struct { // owner is the TaskSet that this PID namespace belongs to. The owner // pointer is immutable. owner *TaskSet // parent is the PID namespace of the process that created this one. If // this is the root PID namespace, parent is nil. The parent pointer is // immutable. // // Invariant: All tasks that are visible in this namespace are also visible // in all ancestor namespaces. parent *PIDNamespace // userns is the user namespace with which this PID namespace is // associated. Privileged operations on this PID namespace must have // appropriate capabilities in userns. 
The userns pointer is immutable. userns *auth.UserNamespace // The following fields are protected by owner.mu. // last is the last ThreadID to be allocated in this namespace. last ThreadID // tasks is a mapping from ThreadIDs in this namespace to tasks visible in // the namespace. tasks map[ThreadID]*Task // tids is a mapping from tasks visible in this namespace to their // identifiers in this namespace. tids map[*Task]ThreadID // tgids is a mapping from thread groups visible in this namespace to // their identifiers in this namespace. // // The content of tgids is equivalent to tids[tg.leader]. This exists // primarily as an optimization to quickly find all thread groups. tgids map[*ThreadGroup]ThreadID // sessions is a mapping from SessionIDs in this namespace to sessions // visible in the namespace. sessions map[SessionID]*Session // sids is a mapping from sessions visible in this namespace to their // identifiers in this namespace. sids map[*Session]SessionID // processGroups is a mapping from ProcessGroupIDs in this namespace to // process groups visible in the namespace. processGroups map[ProcessGroupID]*ProcessGroup // pgids is a mapping from process groups visible in this namespace to // their identifiers in this namespace. pgids map[*ProcessGroup]ProcessGroupID // exiting indicates that the namespace's init process is exiting or has // exited. exiting bool } func newPIDNamespace(ts *TaskSet, parent *PIDNamespace, userns *auth.UserNamespace) *PIDNamespace { return &PIDNamespace{ owner: ts, parent: parent, userns: userns, tasks: make(map[ThreadID]*Task), tids: make(map[*Task]ThreadID), tgids: make(map[*ThreadGroup]ThreadID), sessions: make(map[SessionID]*Session), sids: make(map[*Session]SessionID), processGroups: make(map[ProcessGroupID]*ProcessGroup), pgids: make(map[*ProcessGroup]ProcessGroupID), } } // NewRootPIDNamespace creates the root PID namespace. 'owner' is not available // yet when root namespace is created and must be set by caller. func NewRootPIDNamespace(userns *auth.UserNamespace) *PIDNamespace { return newPIDNamespace(nil, nil, userns) } // NewChild returns a new, empty PID namespace that is a child of ns. Authority // over the new PID namespace is controlled by userns. func (ns *PIDNamespace) NewChild(userns *auth.UserNamespace) *PIDNamespace { return newPIDNamespace(ns.owner, ns, userns) } // TaskWithID returns the task with thread ID tid in PID namespace ns. If no // task has that TID, TaskWithID returns nil. func (ns *PIDNamespace) TaskWithID(tid ThreadID) *Task { ns.owner.mu.RLock() t := ns.tasks[tid] ns.owner.mu.RUnlock() return t } // ThreadGroupWithID returns the thread group led by the task with thread ID // tid in PID namespace ns. If no task has that TID, or if the task with that // TID is not a thread group leader, ThreadGroupWithID returns nil. func (ns *PIDNamespace) ThreadGroupWithID(tid ThreadID) *ThreadGroup { ns.owner.mu.RLock() defer ns.owner.mu.RUnlock() t := ns.tasks[tid] if t == nil { return nil } if t != t.tg.leader { return nil } return t.tg } // IDOfTask returns the TID assigned to the given task in PID namespace ns. If // the task is not visible in that namespace, IDOfTask returns 0. (This return // value is significant in some cases, e.g. getppid() is documented as // returning 0 if the caller's parent is in an ancestor namespace and // consequently not visible to the caller.) If the task is nil, IDOfTask returns // 0. 
func (ns *PIDNamespace) IDOfTask(t *Task) ThreadID {
	ns.owner.mu.RLock()
	id := ns.tids[t]
	ns.owner.mu.RUnlock()
	return id
}

// IDOfThreadGroup returns the TID assigned to tg's leader in PID namespace ns.
// If the task is not visible in that namespace, IDOfThreadGroup returns 0.
func (ns *PIDNamespace) IDOfThreadGroup(tg *ThreadGroup) ThreadID {
	ns.owner.mu.RLock()
	id := ns.tgids[tg]
	ns.owner.mu.RUnlock()
	return id
}

// Tasks returns a snapshot of the tasks in ns.
func (ns *PIDNamespace) Tasks() []*Task {
	ns.owner.mu.RLock()
	defer ns.owner.mu.RUnlock()
	tasks := make([]*Task, 0, len(ns.tasks))
	for t := range ns.tids {
		tasks = append(tasks, t)
	}
	return tasks
}

// NumTasks returns the number of tasks in ns.
func (ns *PIDNamespace) NumTasks() int {
	ns.owner.mu.RLock()
	defer ns.owner.mu.RUnlock()
	return len(ns.tids)
}

// ThreadGroups returns a snapshot of the thread groups in ns.
func (ns *PIDNamespace) ThreadGroups() []*ThreadGroup {
	return ns.ThreadGroupsAppend(nil)
}

// ThreadGroupsAppend appends a snapshot of the thread groups in ns to tgs.
func (ns *PIDNamespace) ThreadGroupsAppend(tgs []*ThreadGroup) []*ThreadGroup {
	ns.owner.mu.RLock()
	defer ns.owner.mu.RUnlock()
	for tg := range ns.tgids {
		tgs = append(tgs, tg)
	}
	return tgs
}

// UserNamespace returns the user namespace associated with PID namespace ns.
func (ns *PIDNamespace) UserNamespace() *auth.UserNamespace {
	return ns.userns
}

// Root returns the root PID namespace of ns.
func (ns *PIDNamespace) Root() *PIDNamespace {
	return ns.owner.Root
}

// A threadGroupNode defines the relationship between a thread group and the
// rest of the system. Conceptually, threadGroupNode is data belonging to the
// owning TaskSet, as if TaskSet contained a field `nodes
// map[*ThreadGroup]*threadGroupNode`. However, for practical reasons,
// threadGroupNode is embedded in the ThreadGroup it represents.
// (threadGroupNode is an anonymous field in ThreadGroup; this is to expose
// threadGroupEntry's methods on ThreadGroup to make it implement
// threadGroupLinker.)
//
// +stateify savable
type threadGroupNode struct {
	// pidns is the PID namespace containing the thread group and all of its
	// member tasks. The pidns pointer is immutable.
	pidns *PIDNamespace

	// eventQueue is notified whenever an event of interest to Task.Wait occurs
	// in a child of this thread group, or a ptrace tracee of a task in this
	// thread group. Events are defined in task_exit.go.
	//
	// Note that we cannot check and save this wait queue similarly to other
	// wait queues, as the queue will not be empty by the time of saving, due
	// to the wait sourced from Exec().
	eventQueue waiter.Queue `state:"nosave"`

	// leader is the thread group's leader, which is the oldest task in the
	// thread group; usually the last task in the thread group to call
	// execve(), or if no such task exists then the first task in the thread
	// group, which was created by a call to fork() or clone() without
	// CLONE_THREAD. Once a thread group has been made visible to the rest of
	// the system by TaskSet.newTask, leader is never nil.
	//
	// Note that it's possible for the leader to exit without causing the rest
	// of the thread group to exit; in such a case, leader will still be valid
	// and non-nil, but leader will not be in tasks.
	//
	// leader is protected by the TaskSet mutex.
	leader *Task

	// If execing is not nil, it is a task in the thread group that has killed
	// all other tasks so that it can become the thread group leader and
	// perform an execve. (execing may already be the thread group leader.)
// // execing is analogous to Linux's signal_struct::group_exit_task. // // execing is protected by the TaskSet mutex. execing *Task // tasks is all tasks in the thread group that have not yet been reaped. // // tasks is protected by both the TaskSet mutex and the signal mutex: // Mutating tasks requires locking the TaskSet mutex for writing *and* // locking the signal mutex. Reading tasks requires locking the TaskSet // mutex *or* locking the signal mutex. tasks taskList // tasksCount is the number of tasks in the thread group that have not yet // been reaped; equivalently, tasksCount is the number of tasks in tasks. // // tasksCount is protected by both the TaskSet mutex and the signal mutex, // as with tasks. tasksCount int // liveTasks is the number of tasks in the thread group that have not yet // reached TaskExitZombie. // // liveTasks is protected by the TaskSet mutex (NOT the signal mutex). liveTasks int // activeTasks is the number of tasks in the thread group that have not yet // reached TaskExitInitiated. // // activeTasks is protected by both the TaskSet mutex and the signal mutex, // as with tasks. activeTasks int } // PIDNamespace returns the PID namespace containing tg. func (tg *ThreadGroup) PIDNamespace() *PIDNamespace { return tg.pidns } // TaskSet returns the TaskSet containing tg. func (tg *ThreadGroup) TaskSet() *TaskSet { return tg.pidns.owner } // Leader returns tg's leader. func (tg *ThreadGroup) Leader() *Task { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() return tg.leader } // Count returns the number of non-exited threads in the group. func (tg *ThreadGroup) Count() int { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() var count int for t := tg.tasks.Front(); t != nil; t = t.Next() { count++ } return count } // MemberIDs returns a snapshot of the ThreadIDs (in PID namespace pidns) for // all tasks in tg. func (tg *ThreadGroup) MemberIDs(pidns *PIDNamespace) []ThreadID { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() var tasks []ThreadID for t := tg.tasks.Front(); t != nil; t = t.Next() { if id, ok := pidns.tids[t]; ok { tasks = append(tasks, id) } } return tasks } // ID returns tg's leader's thread ID in its own PID namespace. If tg's leader // is dead, ID returns 0. func (tg *ThreadGroup) ID() ThreadID { tg.pidns.owner.mu.RLock() id := tg.pidns.tgids[tg] tg.pidns.owner.mu.RUnlock() return id } // A taskNode defines the relationship between a task and the rest of the // system. The comments on threadGroupNode also apply to taskNode. // // +stateify savable type taskNode struct { // tg is the thread group that this task belongs to. The tg pointer is // immutable. tg *ThreadGroup `state:"wait"` // taskEntry links into tg.tasks. Note that this means that // Task.Next/Prev/SetNext/SetPrev refer to sibling tasks in the same thread // group. See threadGroupNode.tasks for synchronization info. taskEntry // parent is the task's parent. parent may be nil. // // parent is protected by the TaskSet mutex. parent *Task // children is this task's children. // // children is protected by the TaskSet mutex. children map[*Task]struct{} // If childPIDNamespace is not nil, all new tasks created by this task will // be members of childPIDNamespace rather than this one. (As a corollary, // this task becomes unable to create sibling tasks in the same thread // group.) // // childPIDNamespace is exclusive to the task goroutine. childPIDNamespace *PIDNamespace } // ThreadGroup returns the thread group containing t. 
func (t *Task) ThreadGroup() *ThreadGroup { return t.tg } // PIDNamespace returns the PID namespace containing t. func (t *Task) PIDNamespace() *PIDNamespace { return t.tg.pidns } // TaskSet returns the TaskSet containing t. func (t *Task) TaskSet() *TaskSet { return t.tg.pidns.owner } // Timekeeper returns the system Timekeeper. func (t *Task) Timekeeper() *Timekeeper { return t.k.timekeeper } // Parent returns t's parent. func (t *Task) Parent() *Task { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() return t.parent } // ThreadID returns t's thread ID in its own PID namespace. If the task is // dead, ThreadID returns 0. func (t *Task) ThreadID() ThreadID { return t.tg.pidns.IDOfTask(t) } // TGIDInRoot returns t's TGID in the root PID namespace. func (t *Task) TGIDInRoot() ThreadID { return t.tg.pidns.owner.Root.IDOfThreadGroup(t.tg) }
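// PIDNamespace above is a bimap: tasks map to their TIDs and TIDs map back to
// tasks, both directions kept in lockstep under one lock. The following is a
// minimal standalone sketch of that shape (illustrative names, a plain
// RWMutex standing in for TaskSet.mu; not gVisor code):

package main

import (
	"fmt"
	"sync"
)

type task struct{ name string }

type pidBimap struct {
	mu    sync.RWMutex
	last  int32
	tasks map[int32]*task // TID -> task
	tids  map[*task]int32 // task -> TID
}

func newPIDBimap() *pidBimap {
	return &pidBimap{
		tasks: make(map[int32]*task),
		tids:  make(map[*task]int32),
	}
}

// add allocates the next TID and installs t in both directions.
func (ns *pidBimap) add(t *task) int32 {
	ns.mu.Lock()
	defer ns.mu.Unlock()
	ns.last++
	ns.tasks[ns.last] = t
	ns.tids[t] = ns.last
	return ns.last
}

// idOfTask mirrors IDOfTask: a task not in the bimap yields the zero TID.
func (ns *pidBimap) idOfTask(t *task) int32 {
	ns.mu.RLock()
	defer ns.mu.RUnlock()
	return ns.tids[t]
}

func main() {
	ns := newPIDBimap()
	t := &task{name: "init"}
	fmt.Println(ns.add(t), ns.idOfTask(t), ns.idOfTask(&task{})) // 1 1 0
}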
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/semaphore"
	"gvisor.dev/gvisor/pkg/sentry/kernel/shm"
)

// IPCNamespace represents an IPC namespace.
//
// +stateify savable
type IPCNamespace struct {
	IPCNamespaceRefs

	// User namespace which owns this IPC namespace. Immutable.
	userNS *auth.UserNamespace

	semaphores *semaphore.Registry
	shms       *shm.Registry
}

// NewIPCNamespace creates a new IPC namespace.
func NewIPCNamespace(userNS *auth.UserNamespace) *IPCNamespace {
	ns := &IPCNamespace{
		userNS:     userNS,
		semaphores: semaphore.NewRegistry(userNS),
		shms:       shm.NewRegistry(userNS),
	}
	ns.InitRefs()
	return ns
}

// SemaphoreRegistry returns the semaphore set registry for this namespace.
func (i *IPCNamespace) SemaphoreRegistry() *semaphore.Registry {
	return i.semaphores
}

// ShmRegistry returns the shm segment registry for this namespace.
func (i *IPCNamespace) ShmRegistry() *shm.Registry {
	return i.shms
}

// DecRef implements refsvfs2.RefCounter.DecRef.
func (i *IPCNamespace) DecRef(ctx context.Context) {
	i.IPCNamespaceRefs.DecRef(func() {
		i.shms.Release(ctx)
	})
}

// IPCNamespace returns the task's IPC namespace.
func (t *Task) IPCNamespace() *IPCNamespace {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.ipcns
}
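// IPCNamespace.DecRef follows a common refcounting shape: the final DecRef
// runs a destructor callback (here, releasing the shm registry). A minimal
// standalone sketch of that pattern with a bare atomic counter; gVisor's
// generated IPCNamespaceRefs is more elaborate, so this is illustrative only:

package main

import (
	"fmt"
	"sync/atomic"
)

type refs struct{ count int64 }

func (r *refs) incRef() { atomic.AddInt64(&r.count, 1) }

// decRef runs destroy exactly once, when the count reaches zero.
func (r *refs) decRef(destroy func()) {
	if atomic.AddInt64(&r.count, -1) == 0 {
		destroy()
	}
}

func main() {
	r := &refs{count: 1} // one reference held at creation, as with InitRefs
	r.incRef()
	r.decRef(func() { fmt.Println("released") }) // count 1, no output
	r.decRef(func() { fmt.Println("released") }) // count 0, prints "released"
}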
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs2

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
)

// Eventfd2 implements linux syscall eventfd2(2).
func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	initVal := uint64(args[0].Uint())
	flags := uint(args[1].Uint())
	allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC)

	if flags & ^allOps != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	vfsObj := t.Kernel().VFS()
	fileFlags := uint32(linux.O_RDWR)
	if flags&linux.EFD_NONBLOCK != 0 {
		fileFlags |= linux.O_NONBLOCK
	}
	semMode := flags&linux.EFD_SEMAPHORE != 0
	eventfd, err := eventfd.New(t, vfsObj, initVal, semMode, fileFlags)
	if err != nil {
		return 0, nil, err
	}
	defer eventfd.DecRef(t)

	fd, err := t.NewFDFromVFS2(0, eventfd, kernel.FDFlags{
		CloseOnExec: flags&linux.EFD_CLOEXEC != 0,
	})
	if err != nil {
		return 0, nil, err
	}

	return uintptr(fd), nil, nil
}

// Eventfd implements linux syscall eventfd(2).
func Eventfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	args[1].Value = 0
	return Eventfd2(t, args)
}
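// Eventfd2 rejects any flag bit outside the supported set with EINVAL. A
// standalone sketch of that mask idiom; the EFD_* values below are the
// x86/ARM Linux values, stated here as an assumption for illustration:

package main

import "fmt"

const (
	efdSemaphore uint = 0x1     // EFD_SEMAPHORE
	efdNonblock  uint = 0x800   // EFD_NONBLOCK (O_NONBLOCK)
	efdCloexec   uint = 0x80000 // EFD_CLOEXEC (O_CLOEXEC)
)

// checkEventfdFlags mirrors the `flags & ^allOps != 0` check above.
func checkEventfdFlags(flags uint) error {
	allOps := efdSemaphore | efdNonblock | efdCloexec
	if flags & ^allOps != 0 {
		return fmt.Errorf("EINVAL: unknown flag bits %#x", flags&^allOps)
	}
	return nil
}

func main() {
	fmt.Println(checkEventfdFlags(efdNonblock | efdCloexec)) // <nil>
	fmt.Println(checkEventfdFlags(0x2))                      // EINVAL: ...
}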
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cgroupfs

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)

// +stateify savable
type cpuController struct {
	controllerCommon

	// CFS bandwidth control parameters, values in microseconds.
	cfsPeriod int64
	cfsQuota  int64

	// CPU shares, values should be (num cores * 1024).
	shares int64
}

var _ controller = (*cpuController)(nil)

func newCPUController(fs *filesystem, defaults map[string]int64) *cpuController {
	// Default values for controller parameters from Linux.
	c := &cpuController{
		cfsPeriod: 100000,
		cfsQuota:  -1,
		shares:    1024,
	}

	if val, ok := defaults["cpu.cfs_period_us"]; ok {
		c.cfsPeriod = val
		delete(defaults, "cpu.cfs_period_us")
	}
	if val, ok := defaults["cpu.cfs_quota_us"]; ok {
		c.cfsQuota = val
		delete(defaults, "cpu.cfs_quota_us")
	}
	if val, ok := defaults["cpu.shares"]; ok {
		c.shares = val
		delete(defaults, "cpu.shares")
	}

	c.controllerCommon.init(controllerCPU, fs)
	return c
}

// AddControlFiles implements controller.AddControlFiles.
func (c *cpuController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) {
	contents["cpu.cfs_period_us"] = c.fs.newStaticControllerFile(ctx, creds, linux.FileMode(0644), fmt.Sprintf("%d\n", c.cfsPeriod))
	contents["cpu.cfs_quota_us"] = c.fs.newStaticControllerFile(ctx, creds, linux.FileMode(0644), fmt.Sprintf("%d\n", c.cfsQuota))
	contents["cpu.shares"] = c.fs.newStaticControllerFile(ctx, creds, linux.FileMode(0644), fmt.Sprintf("%d\n", c.shares))
}
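// newCPUController consumes each recognized mount option and deletes it from
// the defaults map, so a caller can later treat any leftover keys as unknown
// options. A minimal standalone sketch of that consume-and-delete idiom
// (names are illustrative, not gVisor's):

package main

import "fmt"

// applyDefault overrides *dst with defaults[key] if present, consuming the key.
func applyDefault(defaults map[string]int64, key string, dst *int64) {
	if val, ok := defaults[key]; ok {
		*dst = val
		delete(defaults, key)
	}
}

func main() {
	defaults := map[string]int64{"cpu.shares": 2048, "bogus.key": 1}
	shares := int64(1024) // controller default
	applyDefault(defaults, "cpu.shares", &shares)
	fmt.Println(shares)         // 2048
	fmt.Println(len(defaults))  // 1: "bogus.key" is left over, detectably unknown
}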
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux/errno"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
	"gvisor.dev/gvisor/pkg/sentry/loader"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/syserr"
)

var errNoSyscalls = syserr.New("no syscall table found", errno.ENOEXEC)

// Auxmap contains miscellaneous data for the task.
type Auxmap map[string]interface{}

// TaskImage is the subset of a task's data that is provided by the loader.
//
// +stateify savable
type TaskImage struct {
	// Name is the thread name set by the prctl(PR_SET_NAME) system call.
	Name string

	// Arch is the architecture-specific context (registers, etc.)
	Arch arch.Context

	// MemoryManager is the task's address space.
	MemoryManager *mm.MemoryManager

	// fu implements futexes in the address space.
	fu *futex.Manager

	// st is the task's syscall table.
	st *SyscallTable `state:".(syscallTableInfo)"`
}

// release releases all resources held by the TaskImage. release is called by
// the task when it execs into a new TaskImage or exits.
func (image *TaskImage) release() {
	// Nil out pointers so that if the task is saved after release, it doesn't
	// follow the pointers to possibly now-invalid objects.
	if image.MemoryManager != nil {
		image.MemoryManager.DecUsers(context.Background())
		image.MemoryManager = nil
	}
	image.fu = nil
}

// Fork returns a duplicate of image. The copied TaskImage always has an
// independent arch.Context. If shareAddressSpace is true, the copied
// TaskImage shares an address space with the original; otherwise, the copied
// TaskImage has an independent address space that is initially a duplicate
// of the original's.
func (image *TaskImage) Fork(ctx context.Context, k *Kernel, shareAddressSpace bool) (*TaskImage, error) {
	newImage := &TaskImage{
		Name: image.Name,
		Arch: image.Arch.Fork(),
		st:   image.st,
	}
	if shareAddressSpace {
		newImage.MemoryManager = image.MemoryManager
		if newImage.MemoryManager != nil {
			if !newImage.MemoryManager.IncUsers() {
				// Shouldn't be possible since image.MemoryManager should be a
				// counted user.
panic(fmt.Sprintf("TaskImage.Fork called with userless TaskImage.MemoryManager")) } } newImage.fu = image.fu } else { newMM, err := image.MemoryManager.Fork(ctx) if err != nil { return nil, err } newImage.MemoryManager = newMM newImage.fu = k.futexes.Fork() } return newImage, nil } // Arch returns t's arch.Context. // // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) Arch() arch.Context { return t.image.Arch } // MemoryManager returns t's MemoryManager. MemoryManager does not take an // additional reference on the returned MM. // // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) MemoryManager() *mm.MemoryManager { return t.image.MemoryManager } // SyscallTable returns t's syscall table. // // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) SyscallTable() *SyscallTable { return t.image.st } // Stack returns the userspace stack. // // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) Stack() *arch.Stack { return &arch.Stack{ Arch: t.Arch(), IO: t.MemoryManager(), Bottom: hostarch.Addr(t.Arch().Stack()), } } // LoadTaskImage loads a specified file into a new TaskImage. // // args.MemoryManager does not need to be set by the caller. func (k *Kernel) LoadTaskImage(ctx context.Context, args loader.LoadArgs) (*TaskImage, *syserr.Error) { // If File is not nil, we should load that instead of resolving Filename. if args.File != nil { args.Filename = args.File.PathnameWithDeleted(ctx) } // Prepare a new user address space to load into. m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation) defer m.DecUsers(ctx) args.MemoryManager = m os, ac, name, err := loader.Load(ctx, args, k.extraAuxv, k.vdso) if err != nil { return nil, err } // Lookup our new syscall table. st, ok := LookupSyscallTable(os, ac.Arch()) if !ok { // No syscall table found. This means that the ELF binary does not match // the architecture. return nil, errNoSyscalls } if !m.IncUsers() { panic("Failed to increment users count on new MM") } return &TaskImage{ Name: name, Arch: ac, MemoryManager: m, fu: k.futexes.Fork(), st: st, }, nil }
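// TaskImage.Fork either shares the MemoryManager (bumping its user count) or
// forks a private copy. A standalone sketch of that share-vs-copy decision;
// incUsers refusing to revive a zero-count manager mirrors mm.IncUsers, but
// the types here are illustrative and omit locking:

package main

import "fmt"

type memoryManager struct{ users int }

// incUsers refuses to resurrect a dead (userless) manager.
func (mm *memoryManager) incUsers() bool {
	if mm.users == 0 {
		return false
	}
	mm.users++
	return true
}

// fork produces an independent copy with its own user count.
func (mm *memoryManager) fork() *memoryManager { return &memoryManager{users: 1} }

func forkImage(mm *memoryManager, shareAddressSpace bool) *memoryManager {
	if shareAddressSpace {
		if !mm.incUsers() {
			panic("fork of userless memory manager")
		}
		return mm
	}
	return mm.fork()
}

func main() {
	mm := &memoryManager{users: 1}
	shared := forkImage(mm, true)   // same address space, count now 2
	private := forkImage(mm, false) // independent duplicate
	fmt.Println(shared == mm, private == mm, mm.users) // true false 2
}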
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package gofer

import (
	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/p9"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/waiter"
)

func (d *dentry) isSocket() bool {
	return d.fileType() == linux.S_IFSOCK
}

// endpoint is a Gofer-backed transport.BoundEndpoint.
//
// An endpoint's lifetime is the time between when filesystem.BoundEndpointAt()
// is called and either BoundEndpoint.BidirectionalConnect or
// BoundEndpoint.UnidirectionalConnect is called.
//
// +stateify savable
type endpoint struct {
	// dentry is the filesystem dentry which produced this endpoint.
	dentry *dentry

	// path is the sentry path where this endpoint is bound.
	path string
}

func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) {
	switch t {
	case linux.SOCK_STREAM:
		return p9.StreamSocket, true
	case linux.SOCK_SEQPACKET:
		return p9.SeqpacketSocket, true
	case linux.SOCK_DGRAM:
		return p9.DgramSocket, true
	}
	return 0, false
}

// BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect.
func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error {
	cf, ok := sockTypeToP9(ce.Type())
	if !ok {
		return syserr.ErrConnectionRefused
	}

	// No lock ordering required as only the ConnectingEndpoint has a mutex.
	ce.Lock()

	// Check connecting state.
	if ce.Connected() {
		ce.Unlock()
		return syserr.ErrAlreadyConnected
	}
	if ce.Listening() {
		ce.Unlock()
		return syserr.ErrInvalidEndpointState
	}

	c, err := e.newConnectedEndpoint(ctx, cf, ce.WaiterQueue())
	if err != nil {
		ce.Unlock()
		return err
	}

	returnConnect(c, c)
	ce.Unlock()
	if err := c.Init(); err != nil {
		return syserr.FromError(err)
	}

	return nil
}

// UnidirectionalConnect implements
// transport.BoundEndpoint.UnidirectionalConnect.
func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.ConnectedEndpoint, *syserr.Error) {
	c, err := e.newConnectedEndpoint(ctx, p9.DgramSocket, &waiter.Queue{})
	if err != nil {
		return nil, err
	}

	if err := c.Init(); err != nil {
		return nil, syserr.FromError(err)
	}

	// We don't need the receiver.
c.CloseRecv() c.Release(ctx) return c, nil } func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) { hostFile, err := e.dentry.file.connect(ctx, flags) if err != nil { return nil, syserr.ErrConnectionRefused } // Dup the fd so that the new endpoint can manage its lifetime. hostFD, err := unix.Dup(hostFile.FD()) if err != nil { log.Warningf("Could not dup host socket fd %d: %v", hostFile.FD(), err) return nil, syserr.FromError(err) } // After duplicating, we no longer need hostFile. hostFile.Close() c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path) if serr != nil { log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.dentry.file, flags, serr) return nil, serr } return c, nil } // Release implements transport.BoundEndpoint.Release. func (e *endpoint) Release(ctx context.Context) { e.dentry.DecRef(ctx) } // Passcred implements transport.BoundEndpoint.Passcred. func (e *endpoint) Passcred() bool { return false }
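// newConnectedEndpoint dups the gofer-provided fd so the endpoint owns an
// independent handle it can close on its own schedule. The same idiom is
// shown below against a plain file; this is a standalone sketch (assumes a
// Unix system and the golang.org/x/sys/unix module), not gVisor code:

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func main() {
	f, err := os.Open("/dev/null")
	if err != nil {
		panic(err)
	}
	// Duplicate before handing the descriptor to a new owner, so the new
	// owner controls the copy's lifetime independently.
	dupFD, err := unix.Dup(int(f.Fd()))
	if err != nil {
		panic(err)
	}
	f.Close() // the original handle is no longer needed

	// dupFD remains valid after f is closed.
	fmt.Println("duplicated fd:", dupFD)
	unix.Close(dupFD)
}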
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"gvisor.dev/gvisor/pkg/bits"
	"gvisor.dev/gvisor/pkg/hostarch"
)

const (
	// SignalMaximum is the highest valid signal number.
	SignalMaximum = 64

	// FirstStdSignal is the lowest standard signal number.
	FirstStdSignal = 1

	// LastStdSignal is the highest standard signal number.
	LastStdSignal = 31

	// FirstRTSignal is the lowest real-time signal number.
	//
	// 32 (SIGCANCEL) and 33 (SIGSETXID) are used internally by glibc.
	FirstRTSignal = 32

	// LastRTSignal is the highest real-time signal number.
	LastRTSignal = 64

	// NumStdSignals is the number of standard signals.
	NumStdSignals = LastStdSignal - FirstStdSignal + 1

	// NumRTSignals is the number of realtime signals.
	NumRTSignals = LastRTSignal - FirstRTSignal + 1
)

// Signal is a signal number.
type Signal int

// IsValid returns true if s is a valid standard or realtime signal. (0 is not
// considered valid; interfaces special-casing signal number 0 should check for
// 0 first before asserting validity.)
func (s Signal) IsValid() bool {
	return s > 0 && s <= SignalMaximum
}

// IsStandard returns true if s is a standard signal.
//
// Preconditions: s.IsValid().
func (s Signal) IsStandard() bool {
	return s <= LastStdSignal
}

// IsRealtime returns true if s is a realtime signal.
//
// Preconditions: s.IsValid().
func (s Signal) IsRealtime() bool {
	return s >= FirstRTSignal
}

// Index returns the index for signal s into arrays of both standard and
// realtime signals (e.g. signal masks).
//
// Preconditions: s.IsValid().
func (s Signal) Index() int {
	return int(s - 1)
}

// Signals.
const (
	SIGABRT   = Signal(6)
	SIGALRM   = Signal(14)
	SIGBUS    = Signal(7)
	SIGCHLD   = Signal(17)
	SIGCLD    = Signal(17)
	SIGCONT   = Signal(18)
	SIGFPE    = Signal(8)
	SIGHUP    = Signal(1)
	SIGILL    = Signal(4)
	SIGINT    = Signal(2)
	SIGIO     = Signal(29)
	SIGIOT    = Signal(6)
	SIGKILL   = Signal(9)
	SIGPIPE   = Signal(13)
	SIGPOLL   = Signal(29)
	SIGPROF   = Signal(27)
	SIGPWR    = Signal(30)
	SIGQUIT   = Signal(3)
	SIGSEGV   = Signal(11)
	SIGSTKFLT = Signal(16)
	SIGSTOP   = Signal(19)
	SIGSYS    = Signal(31)
	SIGTERM   = Signal(15)
	SIGTRAP   = Signal(5)
	SIGTSTP   = Signal(20)
	SIGTTIN   = Signal(21)
	SIGTTOU   = Signal(22)
	SIGUNUSED = Signal(31)
	SIGURG    = Signal(23)
	SIGUSR1   = Signal(10)
	SIGUSR2   = Signal(12)
	SIGVTALRM = Signal(26)
	SIGWINCH  = Signal(28)
	SIGXCPU   = Signal(24)
	SIGXFSZ   = Signal(25)
)

// SignalSet is a signal mask with a bit corresponding to each signal.
//
// +marshal
type SignalSet uint64

// SignalSetSize is the size in bytes of a SignalSet.
const SignalSetSize = 8

// MakeSignalSet returns SignalSet with the bit corresponding to each of the
// given signals set.
func MakeSignalSet(sigs ...Signal) SignalSet {
	indices := make([]int, len(sigs))
	for i, sig := range sigs {
		indices[i] = sig.Index()
	}
	return SignalSet(bits.Mask64(indices...))
}

// SignalSetOf returns a SignalSet with a single signal set.
func SignalSetOf(sig Signal) SignalSet {
	return SignalSet(bits.MaskOf64(sig.Index()))
}

// ForEachSignal invokes f for each signal set in the given mask.
func ForEachSignal(mask SignalSet, f func(sig Signal)) {
	bits.ForEachSetBit64(uint64(mask), func(i int) {
		f(Signal(i + 1))
	})
}

// 'how' values for rt_sigprocmask(2).
const (
	// SIG_BLOCK blocks the signals in the set.
	SIG_BLOCK = 0

	// SIG_UNBLOCK unblocks the signals in the set (removes them from the
	// blocked mask).
	SIG_UNBLOCK = 1

	// SIG_SETMASK sets the signal mask to set.
	SIG_SETMASK = 2
)

// Signal actions for rt_sigaction(2), from uapi/asm-generic/signal-defs.h.
const (
	// SIG_DFL performs the default action.
	SIG_DFL = 0

	// SIG_IGN ignores the signal.
	SIG_IGN = 1
)

// Signal action flags for rt_sigaction(2), from uapi/asm-generic/signal.h.
const (
	SA_NOCLDSTOP = 0x00000001
	SA_NOCLDWAIT = 0x00000002
	SA_SIGINFO   = 0x00000004
	SA_RESTORER  = 0x04000000
	SA_ONSTACK   = 0x08000000
	SA_RESTART   = 0x10000000
	SA_NODEFER   = 0x40000000
	SA_RESETHAND = 0x80000000
	SA_NOMASK    = SA_NODEFER
	SA_ONESHOT   = SA_RESETHAND
)

// Signal stack flags for sigaltstack(2), from include/uapi/linux/signal.h.
const (
	SS_ONSTACK = 1
	SS_DISABLE = 2
)

// SIGPOLL si_codes.
const (
	// SI_POLL is defined as __SI_POLL in Linux 2.6.
	SI_POLL = 2 << 16

	// POLL_IN indicates that data input is available.
	POLL_IN = SI_POLL | 1

	// POLL_OUT indicates that output buffers are available.
	POLL_OUT = SI_POLL | 2

	// POLL_MSG indicates that an input message is available.
	POLL_MSG = SI_POLL | 3

	// POLL_ERR indicates that there was an i/o error.
	POLL_ERR = SI_POLL | 4

	// POLL_PRI indicates that high priority input is available.
	POLL_PRI = SI_POLL | 5

	// POLL_HUP indicates that a device was disconnected.
	POLL_HUP = SI_POLL | 6
)

// Possible values for si_code.
const (
	// SI_USER is sent by kill, sigsend, raise.
	SI_USER = 0

	// SI_KERNEL is sent by the kernel from somewhere.
	SI_KERNEL = 0x80

	// SI_QUEUE is sent by sigqueue.
	SI_QUEUE = -1

	// SI_TIMER is sent by timer expiration.
	SI_TIMER = -2

	// SI_MESGQ is sent by real time mesq state change.
	SI_MESGQ = -3

	// SI_ASYNCIO is sent by AIO completion.
	SI_ASYNCIO = -4

	// SI_SIGIO is sent by queued SIGIO.
	SI_SIGIO = -5

	// SI_TKILL is sent by tkill system call.
	SI_TKILL = -6

	// SI_DETHREAD is sent by execve() killing subsidiary threads.
	SI_DETHREAD = -7

	// SI_ASYNCNL is sent by glibc async name lookup completion.
	SI_ASYNCNL = -60
)

// CLD_* codes are only meaningful for SIGCHLD.
const (
	// CLD_EXITED indicates that a task exited.
	CLD_EXITED = 1

	// CLD_KILLED indicates that a task was killed by a signal.
	CLD_KILLED = 2

	// CLD_DUMPED indicates that a task was killed by a signal and then dumped
	// core.
	CLD_DUMPED = 3

	// CLD_TRAPPED indicates that a task was stopped by ptrace.
	CLD_TRAPPED = 4

	// CLD_STOPPED indicates that a thread group completed a group stop.
	CLD_STOPPED = 5

	// CLD_CONTINUED indicates that a group-stopped thread group was continued.
	CLD_CONTINUED = 6
)

// SYS_* codes are only meaningful for SIGSYS.
const (
	// SYS_SECCOMP indicates that a signal originates from seccomp.
	SYS_SECCOMP = 1
)

// Possible values for Sigevent.Notify, aka struct sigevent::sigev_notify.
const (
	SIGEV_SIGNAL    = 0
	SIGEV_NONE      = 1
	SIGEV_THREAD    = 2
	SIGEV_THREAD_ID = 4
)

// Sigevent represents struct sigevent.
//
// +marshal
type Sigevent struct {
	Value  uint64 // union sigval {int, void*}
	Signo  int32
	Notify int32

	// struct sigevent here contains a 48-byte union _sigev_un. However, only
	// member _tid is significant to the kernel.
	Tid         int32
	UnRemainder [44]byte
}

// SigAction represents struct sigaction.
//
// +marshal
// +stateify savable
type SigAction struct {
	Handler  uint64
	Flags    uint64
	Restorer uint64
	Mask     SignalSet
}

// SignalStack represents information about a user stack, and is equivalent to
// stack_t.
//
// +marshal
// +stateify savable
type SignalStack struct {
	Addr  uint64
	Flags uint32
	_     uint32
	Size  uint64
}

// Contains checks if the stack pointer is within this stack.
func (s *SignalStack) Contains(sp hostarch.Addr) bool {
	return hostarch.Addr(s.Addr) < sp && sp <= hostarch.Addr(s.Addr+s.Size)
}

// Top returns the stack's top address.
func (s *SignalStack) Top() hostarch.Addr {
	return hostarch.Addr(s.Addr + s.Size)
}

// IsEnabled returns true iff this signal stack is marked as enabled.
func (s *SignalStack) IsEnabled() bool {
	return s.Flags&SS_DISABLE == 0
}

// SignalInfo represents information about a signal being delivered, and is
// equivalent to struct siginfo in the Linux kernel
// (linux/include/uapi/asm-generic/siginfo.h).
//
// +marshal
// +stateify savable
type SignalInfo struct {
	Signo int32 // Signal number
	Errno int32 // Errno value
	Code  int32 // Signal code
	_     uint32

	// struct siginfo::_sifields is a union. In SignalInfo, fields in the union
	// are accessed through methods.
// // For reference, here is the definition of _sifields: (_sigfault._trapno, // which does not exist on x86, omitted for clarity) // // union { // int _pad[SI_PAD_SIZE]; // // /* kill() */ // struct { // __kernel_pid_t _pid; /* sender's pid */ // __ARCH_SI_UID_T _uid; /* sender's uid */ // } _kill; // // /* POSIX.1b timers */ // struct { // __kernel_timer_t _tid; /* timer id */ // int _overrun; /* overrun count */ // char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)]; // sigval_t _sigval; /* same as below */ // int _sys_private; /* not to be passed to user */ // } _timer; // // /* POSIX.1b signals */ // struct { // __kernel_pid_t _pid; /* sender's pid */ // __ARCH_SI_UID_T _uid; /* sender's uid */ // sigval_t _sigval; // } _rt; // // /* SIGCHLD */ // struct { // __kernel_pid_t _pid; /* which child */ // __ARCH_SI_UID_T _uid; /* sender's uid */ // int _status; /* exit code */ // __ARCH_SI_CLOCK_T _utime; // __ARCH_SI_CLOCK_T _stime; // } _sigchld; // // /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ // struct { // void *_addr; /* faulting insn/memory ref. */ // short _addr_lsb; /* LSB of the reported address */ // } _sigfault; // // /* SIGPOLL */ // struct { // __ARCH_SI_BAND_T _band; /* POLL_IN, POLL_OUT, POLL_MSG */ // int _fd; // } _sigpoll; // // /* SIGSYS */ // struct { // void *_call_addr; /* calling user insn */ // int _syscall; /* triggering system call number */ // unsigned int _arch; /* AUDIT_ARCH_* of syscall */ // } _sigsys; // } _sifields; // // _sifields is padded so that the size of siginfo is SI_MAX_SIZE = 128 // bytes. Fields [128 - 16]byte } // FixSignalCodeForUser fixes up si_code. // // The si_code we get from Linux may contain the kernel-specific code in the // top 16 bits if it's positive (e.g., from ptrace). Linux's // copy_siginfo_to_user does // err |= __put_user((short)from->si_code, &to->si_code); // to mask out those bits and we need to do the same. func (s *SignalInfo) FixSignalCodeForUser() { if s.Code > 0 { s.Code &= 0x0000ffff } } // PID returns the si_pid field. func (s *SignalInfo) PID() int32 { return int32(hostarch.ByteOrder.Uint32(s.Fields[0:4])) } // SetPID mutates the si_pid field. func (s *SignalInfo) SetPID(val int32) { hostarch.ByteOrder.PutUint32(s.Fields[0:4], uint32(val)) } // UID returns the si_uid field. func (s *SignalInfo) UID() int32 { return int32(hostarch.ByteOrder.Uint32(s.Fields[4:8])) } // SetUID mutates the si_uid field. func (s *SignalInfo) SetUID(val int32) { hostarch.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) } // Sigval returns the sigval field, which is aliased to both si_int and si_ptr. func (s *SignalInfo) Sigval() uint64 { return hostarch.ByteOrder.Uint64(s.Fields[8:16]) } // SetSigval mutates the sigval field. func (s *SignalInfo) SetSigval(val uint64) { hostarch.ByteOrder.PutUint64(s.Fields[8:16], val) } // TimerID returns the si_timerid field. func (s *SignalInfo) TimerID() TimerID { return TimerID(hostarch.ByteOrder.Uint32(s.Fields[0:4])) } // SetTimerID sets the si_timerid field. func (s *SignalInfo) SetTimerID(val TimerID) { hostarch.ByteOrder.PutUint32(s.Fields[0:4], uint32(val)) } // Overrun returns the si_overrun field. func (s *SignalInfo) Overrun() int32 { return int32(hostarch.ByteOrder.Uint32(s.Fields[4:8])) } // SetOverrun sets the si_overrun field. func (s *SignalInfo) SetOverrun(val int32) { hostarch.ByteOrder.PutUint32(s.Fields[4:8], uint32(val)) } // Addr returns the si_addr field. func (s *SignalInfo) Addr() uint64 { return hostarch.ByteOrder.Uint64(s.Fields[0:8]) } // SetAddr sets the si_addr field. 
func (s *SignalInfo) SetAddr(val uint64) { hostarch.ByteOrder.PutUint64(s.Fields[0:8], val) } // Status returns the si_status field. func (s *SignalInfo) Status() int32 { return int32(hostarch.ByteOrder.Uint32(s.Fields[8:12])) } // SetStatus mutates the si_status field. func (s *SignalInfo) SetStatus(val int32) { hostarch.ByteOrder.PutUint32(s.Fields[8:12], uint32(val)) } // CallAddr returns the si_call_addr field. func (s *SignalInfo) CallAddr() uint64 { return hostarch.ByteOrder.Uint64(s.Fields[0:8]) } // SetCallAddr mutates the si_call_addr field. func (s *SignalInfo) SetCallAddr(val uint64) { hostarch.ByteOrder.PutUint64(s.Fields[0:8], val) } // Syscall returns the si_syscall field. func (s *SignalInfo) Syscall() int32 { return int32(hostarch.ByteOrder.Uint32(s.Fields[8:12])) } // SetSyscall mutates the si_syscall field. func (s *SignalInfo) SetSyscall(val int32) { hostarch.ByteOrder.PutUint32(s.Fields[8:12], uint32(val)) } // Arch returns the si_arch field. func (s *SignalInfo) Arch() uint32 { return hostarch.ByteOrder.Uint32(s.Fields[12:16]) } // SetArch mutates the si_arch field. func (s *SignalInfo) SetArch(val uint32) { hostarch.ByteOrder.PutUint32(s.Fields[12:16], val) } // Band returns the si_band field. func (s *SignalInfo) Band() int64 { return int64(hostarch.ByteOrder.Uint64(s.Fields[0:8])) } // SetBand mutates the si_band field. func (s *SignalInfo) SetBand(val int64) { // Note: this assumes the platform uses `long` as `__ARCH_SI_BAND_T`. // On some platforms, which gVisor doesn't support, `__ARCH_SI_BAND_T` is // `int`. See siginfo.h. hostarch.ByteOrder.PutUint64(s.Fields[0:8], uint64(val)) } // FD returns the si_fd field. func (s *SignalInfo) FD() uint32 { return hostarch.ByteOrder.Uint32(s.Fields[8:12]) } // SetFD mutates the si_fd field. func (s *SignalInfo) SetFD(val uint32) { hostarch.ByteOrder.PutUint32(s.Fields[8:12], val) }
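// MakeSignalSet and Signal.Index above define the mask layout: signal s
// occupies bit s-1. A standalone re-derivation of that layout without the
// bits helpers, as a sanity check (plain ints stand in for the Signal type):

package main

import "fmt"

// makeSignalSet sets bit s-1 for each signal s, matching Index() == s-1.
func makeSignalSet(sigs ...int) uint64 {
	var set uint64
	for _, s := range sigs {
		set |= 1 << uint(s-1)
	}
	return set
}

func main() {
	const sigINT, sigTERM = 2, 15
	set := makeSignalSet(sigINT, sigTERM)
	fmt.Printf("%#x\n", set)                  // 0x4002: bits 1 and 14
	fmt.Println(set&(1<<uint(sigINT-1)) != 0) // true: SIGINT is in the set
}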
// Copyright 2020 The gVisor Authors.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build go1.13
// +build !go1.18

// Check go:linkname function signatures, type definitions, and constants when
// updating Go version.

package sync

import (
	"fmt"
	"reflect"
	"unsafe"
)

// Gopark is runtime.gopark. Gopark calls unlockf(pointer to runtime.g, lock);
// if unlockf returns true, Gopark blocks until Goready(pointer to runtime.g)
// is called. unlockf and its callees must be nosplit and norace, since stack
// splitting and race context are not available where it is called.
//
//go:nosplit
func Gopark(unlockf func(uintptr, unsafe.Pointer) bool, lock unsafe.Pointer, reason uint8, traceEv byte, traceskip int) {
	gopark(unlockf, lock, reason, traceEv, traceskip)
}

//go:linkname gopark runtime.gopark
func gopark(unlockf func(uintptr, unsafe.Pointer) bool, lock unsafe.Pointer, reason uint8, traceEv byte, traceskip int)

// Goready is runtime.goready.
//
//go:nosplit
func Goready(gp uintptr, traceskip int) {
	goready(gp, traceskip)
}

//go:linkname goready runtime.goready
func goready(gp uintptr, traceskip int)

// Values for the reason argument to gopark, from Go's src/runtime/runtime2.go.
const (
	WaitReasonSelect      uint8 = 9
	WaitReasonChanReceive uint8 = 14
	WaitReasonSemacquire  uint8 = 18
)

// Values for the traceEv argument to gopark, from Go's src/runtime/trace.go.
const (
	TraceEvGoBlockRecv   byte = 23
	TraceEvGoBlockSelect byte = 24
	TraceEvGoBlockSync   byte = 25
)

// Rand32 returns a non-cryptographically-secure random uint32.
func Rand32() uint32 {
	return fastrand()
}

// Rand64 returns a non-cryptographically-secure random uint64.
func Rand64() uint64 {
	return uint64(fastrand())<<32 | uint64(fastrand())
}

//go:linkname fastrand runtime.fastrand
func fastrand() uint32

// RandUintptr returns a non-cryptographically-secure random uintptr.
func RandUintptr() uintptr {
	if unsafe.Sizeof(uintptr(0)) == 4 {
		return uintptr(Rand32())
	}
	return uintptr(Rand64())
}

// MapKeyHasher returns a hash function for pointers of m's key type.
//
// Preconditions: m must be a map.
func MapKeyHasher(m interface{}) func(unsafe.Pointer, uintptr) uintptr {
	if rtyp := reflect.TypeOf(m); rtyp.Kind() != reflect.Map {
		panic(fmt.Sprintf("sync.MapKeyHasher: m is %v, not map", rtyp))
	}
	mtyp := *(**maptype)(unsafe.Pointer(&m))
	return mtyp.hasher
}

// maptype is equivalent to the beginning of runtime.maptype.
type maptype struct {
	size       uintptr
	ptrdata    uintptr
	hash       uint32
	tflag      uint8
	align      uint8
	fieldAlign uint8
	kind       uint8
	equal      func(unsafe.Pointer, unsafe.Pointer) bool
	gcdata     *byte
	str        int32
	ptrToThis  int32
	key        unsafe.Pointer
	elem       unsafe.Pointer
	bucket     unsafe.Pointer
	hasher     func(unsafe.Pointer, uintptr) uintptr
	// more fields
}

// These functions are only used within the sync package.

//go:linkname semacquire sync.runtime_Semacquire
func semacquire(s *uint32)

//go:linkname semrelease sync.runtime_Semrelease
func semrelease(s *uint32, handoff bool, skipframes int)

//go:linkname canSpin sync.runtime_canSpin
func canSpin(i int) bool

//go:linkname doSpin sync.runtime_doSpin
func doSpin()
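// Rand64 above stitches two 32-bit fastrand samples into one 64-bit value. A
// standalone sketch of that composition, with math/rand standing in for the
// runtime's fastrand (illustrative only; the real code links against the
// runtime precisely to avoid math/rand's locking):

package main

import (
	"fmt"
	"math/rand"
)

// rand64 packs two independent 32-bit samples: high word first, then low.
func rand64(r32 func() uint32) uint64 {
	return uint64(r32())<<32 | uint64(r32())
}

func main() {
	src := rand.New(rand.NewSource(1))
	fmt.Printf("%#016x\n", rand64(src.Uint32))
}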
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package fasync provides FIOASYNC related functionality.
package fasync

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/waiter"
)

// Table to convert waiter event masks into si_band siginfo codes.
// Taken from fs/fcntl.c:band_table.
var bandTable = map[waiter.EventMask]int64{
	// POLL_IN
	waiter.EventIn: linux.EPOLLIN | linux.EPOLLRDNORM,
	// POLL_OUT
	waiter.EventOut: linux.EPOLLOUT | linux.EPOLLWRNORM | linux.EPOLLWRBAND,
	// POLL_ERR
	waiter.EventErr: linux.EPOLLERR,
	// POLL_PRI
	waiter.EventPri: linux.EPOLLPRI | linux.EPOLLRDBAND,
	// POLL_HUP
	waiter.EventHUp: linux.EPOLLHUP | linux.EPOLLERR,
}

// New returns a function that creates a new fs.FileAsync with the given file
// descriptor.
func New(fd int) func() fs.FileAsync {
	return func() fs.FileAsync {
		return &FileAsync{fd: fd}
	}
}

// NewVFS2 returns a function that creates a new vfs.FileAsync with the given
// file descriptor.
func NewVFS2(fd int) func() vfs.FileAsync {
	return func() vfs.FileAsync {
		return &FileAsync{fd: fd}
	}
}

// FileAsync sends signals when the registered file is ready for IO.
//
// +stateify savable
type FileAsync struct {
	// e is immutable after first use (which is protected by mu below).
	e waiter.Entry

	// fd is the file descriptor to notify about.
	// It is immutable, set at allocation time. This matches Linux semantics in
	// fs/fcntl.c:fasync_helper.
	// The fd value is passed to the signal recipient in siginfo.si_fd.
	fd int

	// regMu protects registration and unregistration actions on e.
	//
	// regMu must be held while registration decisions are being made
	// through the registration action itself.
	//
	// Lock ordering: regMu, mu.
	regMu sync.Mutex `state:"nosave"`

	// mu protects all following fields.
	//
	// Lock ordering: e.mu, mu.
	mu         sync.Mutex `state:"nosave"`
	requester  *auth.Credentials
	registered bool
	// signal is the signal to deliver upon I/O being available.
	// The default value ("zero signal") means the default SIGIO signal will be
	// delivered.
	signal linux.Signal

	// Only one of the following is allowed to be non-nil.
	recipientPG *kernel.ProcessGroup
	recipientTG *kernel.ThreadGroup
	recipientT  *kernel.Task
}

// Callback sends a signal.
func (a *FileAsync) Callback(e *waiter.Entry, mask waiter.EventMask) {
	a.mu.Lock()
	defer a.mu.Unlock()
	if !a.registered {
		return
	}
	t := a.recipientT
	tg := a.recipientTG
	if a.recipientPG != nil {
		tg = a.recipientPG.Originator()
	}
	if tg != nil {
		t = tg.Leader()
	}
	if t == nil {
		// No recipient has been registered.
		return
	}
	c := t.Credentials()
	// Logic from sigio_perm in fs/fcntl.c.
	permCheck := (a.requester.EffectiveKUID == 0 ||
		a.requester.EffectiveKUID == c.SavedKUID ||
		a.requester.EffectiveKUID == c.RealKUID ||
		a.requester.RealKUID == c.SavedKUID ||
		a.requester.RealKUID == c.RealKUID)
	if !permCheck {
		return
	}
	signalInfo := &linux.SignalInfo{
		Signo: int32(linux.SIGIO),
		Code:  linux.SI_KERNEL,
	}
	if a.signal != 0 {
		signalInfo.Signo = int32(a.signal)
		signalInfo.SetFD(uint32(a.fd))
		var band int64
		for m, bandCode := range bandTable {
			if m&mask != 0 {
				band |= bandCode
			}
		}
		signalInfo.SetBand(band)
	}
	t.SendSignal(signalInfo)
}

// Register sets the file which will be monitored for IO events.
//
// The file must not be currently registered.
func (a *FileAsync) Register(w waiter.Waitable) {
	a.regMu.Lock()
	defer a.regMu.Unlock()
	a.mu.Lock()

	if a.registered {
		a.mu.Unlock()
		panic("registering already registered file")
	}

	if a.e.Callback == nil {
		a.e.Callback = a
	}
	a.registered = true

	a.mu.Unlock()
	w.EventRegister(&a.e, waiter.ReadableEvents|waiter.WritableEvents|waiter.EventErr|waiter.EventHUp)
}

// Unregister stops monitoring a file.
//
// The file must be currently registered.
func (a *FileAsync) Unregister(w waiter.Waitable) {
	a.regMu.Lock()
	defer a.regMu.Unlock()
	a.mu.Lock()

	if !a.registered {
		a.mu.Unlock()
		panic("unregistering unregistered file")
	}

	a.registered = false
	a.mu.Unlock()
	w.EventUnregister(&a.e)
}

// Owner returns who is currently getting signals. All return values will be
// nil if no one is set to receive signals.
func (a *FileAsync) Owner() (*kernel.Task, *kernel.ThreadGroup, *kernel.ProcessGroup) {
	a.mu.Lock()
	defer a.mu.Unlock()
	return a.recipientT, a.recipientTG, a.recipientPG
}

// SetOwnerTask sets the owner (who will receive signals) to a specified task.
// Only this owner will receive signals.
func (a *FileAsync) SetOwnerTask(requester *kernel.Task, recipient *kernel.Task) {
	a.mu.Lock()
	defer a.mu.Unlock()
	a.requester = requester.Credentials()
	a.recipientT = recipient
	a.recipientTG = nil
	a.recipientPG = nil
}

// SetOwnerThreadGroup sets the owner (who will receive signals) to a specified
// thread group. Only this owner will receive signals.
func (a *FileAsync) SetOwnerThreadGroup(requester *kernel.Task, recipient *kernel.ThreadGroup) {
	a.mu.Lock()
	defer a.mu.Unlock()
	a.requester = requester.Credentials()
	a.recipientT = nil
	a.recipientTG = recipient
	a.recipientPG = nil
}

// SetOwnerProcessGroup sets the owner (who will receive signals) to a
// specified process group. Only this owner will receive signals.
func (a *FileAsync) SetOwnerProcessGroup(requester *kernel.Task, recipient *kernel.ProcessGroup) {
	a.mu.Lock()
	defer a.mu.Unlock()
	a.requester = requester.Credentials()
	a.recipientT = nil
	a.recipientTG = nil
	a.recipientPG = recipient
}

// ClearOwner unsets the current signal recipient.
func (a *FileAsync) ClearOwner() {
	a.mu.Lock()
	defer a.mu.Unlock()
	a.requester = nil
	a.recipientT = nil
	a.recipientTG = nil
	a.recipientPG = nil
}

// Signal returns which signal will be sent to the signal recipient.
// A value of zero means the signal to deliver wasn't customized, which means
// the default signal (SIGIO) will be delivered.
func (a *FileAsync) Signal() linux.Signal {
	a.mu.Lock()
	defer a.mu.Unlock()
	return a.signal
}

// SetSignal overrides which signal to send when I/O is available.
// The default behavior can be reset by specifying signal zero, which means
// to send SIGIO.
func (a *FileAsync) SetSignal(signal linux.Signal) error {
	if signal != 0 && !signal.IsValid() {
		return linuxerr.EINVAL
	}
	a.mu.Lock()
	defer a.mu.Unlock()
	a.signal = signal
	return nil
}
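// Added illustration (not part of the original sources): a sketch of the
// typical FileAsync life cycle as driven by fcntl(F_SETFL, O_ASYNC) and
// fcntl(F_SETOWN). The import path and the surrounding task/file plumbing are
// assumptions made for the example; only FileAsync methods shown above are
// exercised.

package fasyncexample // hypothetical

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/fasync" // assumed package path
	"gvisor.dev/gvisor/pkg/waiter"
)

// enableAsync mirrors what the sentry does when O_ASYNC is enabled on a file:
// create the FileAsync, point signals at the calling task, and register for
// readiness events on the file's wait queue.
func enableAsync(t *kernel.Task, fd int, w waiter.Waitable) *fasync.FileAsync {
	a := fasync.New(fd)().(*fasync.FileAsync)
	a.SetOwnerTask(t, t)
	// Optionally replace the default SIGIO with another signal (F_SETSIG).
	_ = a.SetSignal(linux.SIGUSR1)
	a.Register(w)
	return a
}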
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pgalloc

import (
	"reflect"
	"unsafe"

	"golang.org/x/sys/unix"
)

func unsafeSlice(addr uintptr, length int) (slice []byte) {
	sh := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
	sh.Data = addr
	sh.Len = length
	sh.Cap = length
	return
}

func mincore(s []byte, buf []byte) error {
	if _, _, errno := unix.RawSyscall(
		unix.SYS_MINCORE,
		uintptr(unsafe.Pointer(&s[0])),
		uintptr(len(s)),
		uintptr(unsafe.Pointer(&buf[0]))); errno != 0 {
		return errno
	}
	return nil
}
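// Added illustration (not part of the original sources): a sketch of how the
// mincore wrapper above can be used to probe page residency. It assumes
// compilation inside this pgalloc package; the 4 KiB page size is hard-coded
// here for brevity and is an assumption of the example.

package pgalloc

import (
	"golang.org/x/sys/unix"
)

func exampleMincore() ([]byte, error) {
	const pageSize = 4096
	// Map four anonymous pages.
	mem, err := unix.Mmap(-1, 0, 4*pageSize, unix.PROT_READ|unix.PROT_WRITE, unix.MAP_ANONYMOUS|unix.MAP_PRIVATE)
	if err != nil {
		return nil, err
	}
	defer unix.Munmap(mem)
	// Touch the first page so at least one entry reports resident.
	mem[0] = 1
	// mincore fills one byte per page; bit 0 is set iff the page is resident.
	vec := make([]byte, 4)
	if err := mincore(mem, vec); err != nil {
		return nil, err
	}
	return vec, nil
}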
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package flipcall

import (
	"encoding/json"
	"fmt"
	"math"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/log"
)

type endpointControlImpl struct {
	state int32
}

// Bits in endpointControlImpl.state.
const (
	epsBlocked = 1 << iota
	epsShutdown
)

func (ep *Endpoint) ctrlInit(opts ...EndpointOption) error {
	if len(opts) != 0 {
		return fmt.Errorf("unknown EndpointOption: %T", opts[0])
	}
	return nil
}

func (ep *Endpoint) ctrlConnect() error {
	if err := ep.enterFutexWait(); err != nil {
		return err
	}
	defer ep.exitFutexWait()

	// Write the connection request.
	w := ep.NewWriter()
	if err := json.NewEncoder(w).Encode(struct{}{}); err != nil {
		return fmt.Errorf("error writing connection request: %v", err)
	}
	*ep.dataLen() = w.Len()

	// Exchange control with the server.
	if err := ep.futexSetPeerActive(); err != nil {
		return err
	}
	if err := ep.futexWakePeer(); err != nil {
		return err
	}
	if err := ep.futexWaitUntilActive(); err != nil {
		return err
	}

	// Read the connection response.
	var resp struct{}
	respLen := atomic.LoadUint32(ep.dataLen())
	if respLen > ep.dataCap {
		return fmt.Errorf("invalid connection response length %d (maximum %d)", respLen, ep.dataCap)
	}
	if err := json.NewDecoder(ep.NewReader(respLen)).Decode(&resp); err != nil {
		return fmt.Errorf("error reading connection response: %v", err)
	}

	return nil
}

func (ep *Endpoint) ctrlWaitFirst() error {
	if err := ep.enterFutexWait(); err != nil {
		return err
	}
	defer ep.exitFutexWait()

	// Wait for the connection request.
	if err := ep.futexWaitUntilActive(); err != nil {
		return err
	}

	// Read the connection request.
	reqLen := atomic.LoadUint32(ep.dataLen())
	if reqLen > ep.dataCap {
		return fmt.Errorf("invalid connection request length %d (maximum %d)", reqLen, ep.dataCap)
	}
	var req struct{}
	if err := json.NewDecoder(ep.NewReader(reqLen)).Decode(&req); err != nil {
		return fmt.Errorf("error reading connection request: %v", err)
	}

	// Write the connection response.
	w := ep.NewWriter()
	if err := json.NewEncoder(w).Encode(struct{}{}); err != nil {
		return fmt.Errorf("error writing connection response: %v", err)
	}
	*ep.dataLen() = w.Len()

	// Return control to the client.
	raceBecomeInactive()
	if err := ep.futexSetPeerActive(); err != nil {
		return err
	}
	if err := ep.futexWakePeer(); err != nil {
		return err
	}

	// Wait for the first non-connection message.
	return ep.futexWaitUntilActive()
}

func (ep *Endpoint) ctrlRoundTrip() error {
	if err := ep.enterFutexWait(); err != nil {
		return err
	}
	defer ep.exitFutexWait()

	if err := ep.futexSetPeerActive(); err != nil {
		return err
	}
	if err := ep.futexWakePeer(); err != nil {
		return err
	}
	return ep.futexWaitUntilActive()
}

func (ep *Endpoint) ctrlWakeLast() error {
	if err := ep.futexSetPeerActive(); err != nil {
		return err
	}
	return ep.futexWakePeer()
}

func (ep *Endpoint) enterFutexWait() error {
	switch eps := atomic.AddInt32(&ep.ctrl.state, epsBlocked); eps {
	case epsBlocked:
		return nil
	case epsBlocked | epsShutdown:
		atomic.AddInt32(&ep.ctrl.state, -epsBlocked)
		return ShutdownError{}
	default:
		// Most likely due to ep.enterFutexWait() being called concurrently
		// from multiple goroutines.
		panic(fmt.Sprintf("invalid flipcall.Endpoint.ctrl.state before flipcall.Endpoint.enterFutexWait(): %v", eps-epsBlocked))
	}
}

func (ep *Endpoint) exitFutexWait() {
	switch eps := atomic.AddInt32(&ep.ctrl.state, -epsBlocked); eps {
	case 0:
		return
	case epsShutdown:
		// ep.ctrlShutdown() was called while we were blocked, so we are
		// responsible for indicating connection shutdown.
		ep.shutdownConn()
	default:
		panic(fmt.Sprintf("invalid flipcall.Endpoint.ctrl.state after flipcall.Endpoint.exitFutexWait(): %v", eps+epsBlocked))
	}
}

func (ep *Endpoint) ctrlShutdown() {
	// Set epsShutdown to ensure that future calls to ep.enterFutexWait() fail.
	if atomic.AddInt32(&ep.ctrl.state, epsShutdown)&epsBlocked != 0 {
		// Wake the blocked thread. This must loop because it's possible that
		// FUTEX_WAKE occurs after the waiter sets epsBlocked, but before it
		// blocks in FUTEX_WAIT.
		for {
			// Wake MaxInt32 threads to prevent a broken or malicious peer from
			// swallowing our wakeup by FUTEX_WAITing from multiple threads.
			if err := ep.futexWakeConnState(math.MaxInt32); err != nil {
				log.Warningf("failed to FUTEX_WAKE Endpoints: %v", err)
				break
			}
			yieldThread()
			if atomic.LoadInt32(&ep.ctrl.state)&epsBlocked == 0 {
				break
			}
		}
	} else {
		// There is no blocked thread, so we are responsible for indicating
		// connection shutdown.
		ep.shutdownConn()
	}
}

func (ep *Endpoint) shutdownConn() {
	switch cs := atomic.SwapUint32(ep.connState(), csShutdown); cs {
	case ep.activeState:
		if err := ep.futexWakeConnState(1); err != nil {
			log.Warningf("failed to FUTEX_WAKE peer Endpoint for shutdown: %v", err)
		}
	case ep.inactiveState:
		// The peer is currently active and will detect shutdown when it tries
		// to update the connection state.
	case csShutdown:
		// The peer also called Endpoint.Shutdown().
	default:
		log.Warningf("unexpected connection state before Endpoint.shutdownConn(): %v", cs)
	}
}
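// Added commentary (summary of the code above, not original source): the
// handshake pairs ctrlConnect (client) with ctrlWaitFirst (server) over a
// shared futex word:
//
//	client: write request -> futexSetPeerActive -> futexWakePeer
//	        -> futexWaitUntilActive -> read response
//	server: futexWaitUntilActive -> read request -> write response
//	        -> futexSetPeerActive -> futexWakePeer -> futexWaitUntilActive
//
// After the handshake, each ctrlRoundTrip hands control to the peer and blocks
// until it is handed back, so exactly one endpoint is ever active on the
// shared data buffer at a time.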
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package parse provides utilities to parse packets.
package parse

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// ARP populates pkt's network header with an ARP header found in
// pkt.Data.
//
// Returns true if the header was successfully parsed.
func ARP(pkt *stack.PacketBuffer) bool {
	_, ok := pkt.NetworkHeader().Consume(header.ARPSize)
	if ok {
		pkt.NetworkProtocolNumber = header.ARPProtocolNumber
	}
	return ok
}

// IPv4 parses an IPv4 packet found in pkt.Data and populates pkt's network
// header with the IPv4 header.
//
// Returns true if the header was successfully parsed.
func IPv4(pkt *stack.PacketBuffer) bool {
	hdr, ok := pkt.Data().PullUp(header.IPv4MinimumSize)
	if !ok {
		return false
	}
	ipHdr := header.IPv4(hdr)

	// Header may have options, determine the true header length.
	headerLen := int(ipHdr.HeaderLength())
	if headerLen < header.IPv4MinimumSize {
		// TODO(gvisor.dev/issue/2404): Per RFC 791, IHL needs to be at least 5 in
		// order for the packet to be valid. Figure out if we want to reject this
		// case.
		headerLen = header.IPv4MinimumSize
	}
	hdr, ok = pkt.NetworkHeader().Consume(headerLen)
	if !ok {
		return false
	}
	ipHdr = header.IPv4(hdr)

	length := int(ipHdr.TotalLength()) - len(hdr)
	if length < 0 {
		return false
	}

	pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber
	pkt.Data().CapLength(length)
	return true
}

// IPv6 parses an IPv6 packet found in pkt.Data and populates pkt's network
// header with the IPv6 header.
func IPv6(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, fragID uint32, fragOffset uint16, fragMore bool, ok bool) {
	hdr, ok := pkt.Data().PullUp(header.IPv6MinimumSize)
	if !ok {
		return 0, 0, 0, false, false
	}
	ipHdr := header.IPv6(hdr)

	// Create a VV to parse the packet. We don't plan to modify anything here.
	// dataVV consists of:
	// - Any IPv6 header bytes after the first 40 (i.e. extensions).
	// - The transport header, if present.
	// - Any other payload data.
	views := [8]buffer.View{}
	dataVV := buffer.NewVectorisedView(0, views[:0])
	dataVV.AppendViews(pkt.Data().Views())
	dataVV.TrimFront(header.IPv6MinimumSize)
	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataVV)

	// Iterate over the IPv6 extensions to find their length.
	var nextHdr tcpip.TransportProtocolNumber
	var extensionsSize int

traverseExtensions:
	for {
		extHdr, done, err := it.Next()
		if err != nil {
			break
		}

		// If we exhaust the extension list, the entire packet is the IPv6 header
		// and (possibly) extensions.
		if done {
			extensionsSize = dataVV.Size()
			break
		}

		switch extHdr := extHdr.(type) {
		case header.IPv6FragmentExtHdr:
			if fragID == 0 && fragOffset == 0 && !fragMore {
				fragID = extHdr.ID()
				fragOffset = extHdr.FragmentOffset()
				fragMore = extHdr.More()
			}
			rawPayload := it.AsRawHeader(true /* consume */)
			extensionsSize = dataVV.Size() - rawPayload.Buf.Size()
			break traverseExtensions

		case header.IPv6RawPayloadHeader:
			// We've found the payload after any extensions.
			extensionsSize = dataVV.Size() - extHdr.Buf.Size()
			nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier)
			break traverseExtensions

		default:
			// Any other extension is a no-op, keep looping until we find the payload.
		}
	}

	// Put the IPv6 header with extensions in pkt.NetworkHeader().
	hdr, ok = pkt.NetworkHeader().Consume(header.IPv6MinimumSize + extensionsSize)
	if !ok {
		panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data().Size()))
	}
	ipHdr = header.IPv6(hdr)
	pkt.Data().CapLength(int(ipHdr.PayloadLength()))
	pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber
	return nextHdr, fragID, fragOffset, fragMore, true
}

// UDP parses a UDP packet found in pkt.Data and populates pkt's transport
// header with the UDP header.
//
// Returns true if the header was successfully parsed.
func UDP(pkt *stack.PacketBuffer) bool {
	_, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize)
	pkt.TransportProtocolNumber = header.UDPProtocolNumber
	return ok
}

// TCP parses a TCP packet found in pkt.Data and populates pkt's transport
// header with the TCP header.
//
// Returns true if the header was successfully parsed.
func TCP(pkt *stack.PacketBuffer) bool {
	// TCP header is variable length, peek at it first.
	hdrLen := header.TCPMinimumSize
	hdr, ok := pkt.Data().PullUp(hdrLen)
	if !ok {
		return false
	}

	// If the header has options, pull those up as well.
	if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data().Size() {
		// TODO(gvisor.dev/issue/2404): Figure out whether to reject this kind of
		// packet.
		hdrLen = offset
	}

	_, ok = pkt.TransportHeader().Consume(hdrLen)
	pkt.TransportProtocolNumber = header.TCPProtocolNumber
	return ok
}
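// Added illustration (not part of the original sources): a minimal sketch of
// feeding a raw buffer through one of the parsers above. The packet here is
// just zeroed bytes of the right length, which is enough for ARP since the
// parser only consumes the fixed-size header.

package parse

import (
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

func exampleParseARP() bool {
	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
		Data: buffer.View(make([]byte, header.ARPSize)).ToVectorisedView(),
	})
	// On success the bytes move from pkt.Data into pkt.NetworkHeader and
	// pkt.NetworkProtocolNumber is set to header.ARPProtocolNumber.
	return ARP(pkt)
}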
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"time"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/syserror"
)

// futexWaitRestartBlock encapsulates the state required to restart futex(2)
// via restart_syscall(2).
//
// +stateify savable
type futexWaitRestartBlock struct {
	duration time.Duration

	// addr stored as uint64 since uintptr is not save-able.
	addr    uint64
	private bool
	val     uint32
	mask    uint32
}

// Restart implements kernel.SyscallRestartBlock.Restart.
func (f *futexWaitRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
	return futexWaitDuration(t, f.duration, false, hostarch.Addr(f.addr), f.private, f.val, f.mask)
}

// futexWaitAbsolute performs a FUTEX_WAIT_BITSET, blocking until the wait is
// complete.
//
// The wait blocks forever if forever is true, otherwise it blocks until ts.
//
// If blocking is interrupted, the syscall is restarted with the original
// arguments.
func futexWaitAbsolute(t *kernel.Task, clockRealtime bool, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) {
	w := t.FutexWaiter()
	err := t.Futex().WaitPrepare(w, t, addr, private, val, mask)
	if err != nil {
		return 0, err
	}

	if forever {
		err = t.Block(w.C)
	} else if clockRealtime {
		notifier, tchan := ktime.NewChannelNotifier()
		timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier)
		timer.Swap(ktime.Setting{
			Enabled: true,
			Next:    ktime.FromTimespec(ts),
		})
		err = t.BlockWithTimer(w.C, tchan)
		timer.Destroy()
	} else {
		err = t.BlockWithDeadline(w.C, true, ktime.FromTimespec(ts))
	}

	t.Futex().WaitComplete(w, t)
	return 0, syserror.ConvertIntr(err, syserror.ERESTARTSYS)
}

// futexWaitDuration performs a FUTEX_WAIT, blocking until the wait is
// complete.
//
// The wait blocks forever if forever is true, otherwise it blocks for
// duration.
//
// If blocking is interrupted, forever determines how to restart the
// syscall. If forever is true, the syscall is restarted with the original
// arguments. If forever is false, duration is a relative timeout and the
// syscall is restarted with the remaining timeout.
func futexWaitDuration(t *kernel.Task, duration time.Duration, forever bool, addr hostarch.Addr, private bool, val, mask uint32) (uintptr, error) {
	w := t.FutexWaiter()
	err := t.Futex().WaitPrepare(w, t, addr, private, val, mask)
	if err != nil {
		return 0, err
	}

	remaining, err := t.BlockWithTimeout(w.C, !forever, duration)
	t.Futex().WaitComplete(w, t)
	if err == nil {
		return 0, nil
	}

	// The wait was unsuccessful for some reason other than interruption. Simply
	// forward the error.
	if err != syserror.ErrInterrupted {
		return 0, err
	}

	// The wait was interrupted and we need to restart. Decide how.

	// The wait duration was absolute, restart with the original arguments.
	if forever {
		return 0, syserror.ERESTARTSYS
	}

	// The wait duration was relative, restart with the remaining duration.
	t.SetSyscallRestartBlock(&futexWaitRestartBlock{
		duration: remaining,
		addr:     uint64(addr),
		private:  private,
		val:      val,
		mask:     mask,
	})
	return 0, syserror.ERESTART_RESTARTBLOCK
}

func futexLockPI(t *kernel.Task, ts linux.Timespec, forever bool, addr hostarch.Addr, private bool) error {
	w := t.FutexWaiter()
	locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, false)
	if err != nil {
		return err
	}
	if locked {
		// Futex acquired, we're done!
		return nil
	}

	if forever {
		err = t.Block(w.C)
	} else {
		notifier, tchan := ktime.NewChannelNotifier()
		timer := ktime.NewTimer(t.Kernel().RealtimeClock(), notifier)
		timer.Swap(ktime.Setting{
			Enabled: true,
			Next:    ktime.FromTimespec(ts),
		})
		err = t.BlockWithTimer(w.C, tchan)
		timer.Destroy()
	}

	t.Futex().WaitComplete(w, t)
	return syserror.ConvertIntr(err, syserror.ERESTARTSYS)
}

func tryLockPI(t *kernel.Task, addr hostarch.Addr, private bool) error {
	w := t.FutexWaiter()
	locked, err := t.Futex().LockPI(w, t, addr, uint32(t.ThreadID()), private, true)
	if err != nil {
		return err
	}
	if !locked {
		return linuxerr.EWOULDBLOCK
	}
	return nil
}

// Futex implements linux syscall futex(2).
// It provides a method for a program to wait for a value at a given address to
// change, and a method to wake up anyone waiting on a particular address.
func Futex(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	futexOp := args[1].Int()
	val := int(args[2].Int())
	nreq := int(args[3].Int())
	timeout := args[3].Pointer()
	naddr := args[4].Pointer()
	val3 := args[5].Int()

	cmd := futexOp &^ (linux.FUTEX_PRIVATE_FLAG | linux.FUTEX_CLOCK_REALTIME)
	private := (futexOp & linux.FUTEX_PRIVATE_FLAG) != 0
	clockRealtime := (futexOp & linux.FUTEX_CLOCK_REALTIME) == linux.FUTEX_CLOCK_REALTIME
	mask := uint32(val3)

	switch cmd {
	case linux.FUTEX_WAIT, linux.FUTEX_WAIT_BITSET:
		// WAIT{_BITSET} waits forever if the timeout isn't passed.
		forever := (timeout == 0)

		var timespec linux.Timespec
		if !forever {
			var err error
			timespec, err = copyTimespecIn(t, timeout)
			if err != nil {
				return 0, nil, err
			}
		}

		switch cmd {
		case linux.FUTEX_WAIT:
			// WAIT uses a relative timeout.
			mask = linux.FUTEX_BITSET_MATCH_ANY
			var timeoutDur time.Duration
			if !forever {
				timeoutDur = time.Duration(timespec.ToNsecCapped()) * time.Nanosecond
			}
			n, err := futexWaitDuration(t, timeoutDur, forever, addr, private, uint32(val), mask)
			return n, nil, err

		case linux.FUTEX_WAIT_BITSET:
			// WAIT_BITSET uses an absolute timeout which is either
			// CLOCK_MONOTONIC or CLOCK_REALTIME.
			if mask == 0 {
				return 0, nil, linuxerr.EINVAL
			}
			n, err := futexWaitAbsolute(t, clockRealtime, timespec, forever, addr, private, uint32(val), mask)
			return n, nil, err
		default:
			panic("unreachable")
		}

	case linux.FUTEX_WAKE:
		mask = ^uint32(0)
		fallthrough

	case linux.FUTEX_WAKE_BITSET:
		if mask == 0 {
			return 0, nil, linuxerr.EINVAL
		}
		if val <= 0 {
			// The Linux kernel wakes one waiter even if val is
			// non-positive.
			val = 1
		}
		n, err := t.Futex().Wake(t, addr, private, mask, val)
		return uintptr(n), nil, err

	case linux.FUTEX_REQUEUE:
		n, err := t.Futex().Requeue(t, addr, naddr, private, val, nreq)
		return uintptr(n), nil, err

	case linux.FUTEX_CMP_REQUEUE:
		// 'val3' contains the value to be checked at 'addr' and
		// 'val' is the number of waiters that should be woken up.
		nval := uint32(val3)
		n, err := t.Futex().RequeueCmp(t, addr, naddr, private, nval, val, nreq)
		return uintptr(n), nil, err

	case linux.FUTEX_WAKE_OP:
		op := uint32(val3)
		if val <= 0 {
			// The Linux kernel wakes one waiter even if val is
			// non-positive.
			val = 1
		}
		n, err := t.Futex().WakeOp(t, addr, naddr, private, val, nreq, op)
		return uintptr(n), nil, err

	case linux.FUTEX_LOCK_PI:
		forever := (timeout == 0)

		var timespec linux.Timespec
		if !forever {
			var err error
			timespec, err = copyTimespecIn(t, timeout)
			if err != nil {
				return 0, nil, err
			}
		}
		err := futexLockPI(t, timespec, forever, addr, private)
		return 0, nil, err

	case linux.FUTEX_TRYLOCK_PI:
		err := tryLockPI(t, addr, private)
		return 0, nil, err

	case linux.FUTEX_UNLOCK_PI:
		err := t.Futex().UnlockPI(t, addr, uint32(t.ThreadID()), private)
		return 0, nil, err

	case linux.FUTEX_WAIT_REQUEUE_PI, linux.FUTEX_CMP_REQUEUE_PI:
		t.Kernel().EmitUnimplementedEvent(t)
		return 0, nil, syserror.ENOSYS

	default:
		// We don't even know about this command.
		return 0, nil, syserror.ENOSYS
	}
}

// SetRobustList implements linux syscall set_robust_list(2).
func SetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// Despite the syscall using the name 'pid' for this variable, it is
	// very much a tid.
	head := args[0].Pointer()
	length := args[1].SizeT()

	if length != uint(linux.SizeOfRobustListHead) {
		return 0, nil, linuxerr.EINVAL
	}
	t.SetRobustList(head)
	return 0, nil, nil
}

// GetRobustList implements linux syscall get_robust_list(2).
func GetRobustList(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// Despite the syscall using the name 'pid' for this variable, it is
	// very much a tid.
	tid := args[0].Int()
	headAddr := args[1].Pointer()
	sizeAddr := args[2].Pointer()

	if tid < 0 {
		return 0, nil, linuxerr.EINVAL
	}

	ot := t
	if tid != 0 {
		if ot = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)); ot == nil {
			return 0, nil, linuxerr.ESRCH
		}
	}

	// Copy out head pointer.
	head := t.Arch().Native(uintptr(ot.GetRobustList()))
	if _, err := head.CopyOut(t, headAddr); err != nil {
		return 0, nil, err
	}

	// Copy out size, which is a constant. Note that while size isn't
	// an address, it is defined as the arch-dependent size_t, so it
	// needs to be converted to a native-sized int.
	size := t.Arch().Native(uintptr(linux.SizeOfRobustListHead))
	if _, err := size.CopyOut(t, sizeAddr); err != nil {
		return 0, nil, err
	}

	return 0, nil, nil
}
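// Added illustration (not part of the original sources): the guest-side view
// of the FUTEX_WAKE path handled above, expressed as a raw syscall. This is a
// sketch of what a sandboxed program would issue; the constants come from
// golang.org/x/sys/unix.

package futexexample // hypothetical

import (
	"unsafe"

	"golang.org/x/sys/unix"
)

// wakeOne wakes at most one waiter blocked in FUTEX_WAIT on word.
func wakeOne(word *uint32) (int, error) {
	n, _, errno := unix.Syscall6(unix.SYS_FUTEX,
		uintptr(unsafe.Pointer(word)),
		uintptr(unix.FUTEX_WAKE),
		1, // val: the number of waiters to wake
		0, 0, 0)
	if errno != 0 {
		return 0, errno
	}
	return int(n), nil
}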
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package waiter provides the implementation of a wait queue, where waiters can
// be enqueued to be notified when an event of interest happens.
//
// Becoming readable and/or writable are examples of events. Waiters are
// expected to use a pattern similar to this to make a blocking function out of
// a non-blocking one:
//
//	func (o *object) blockingRead(...) error {
//		err := o.nonBlockingRead(...)
//		if err != ErrAgain {
//			// Completed with no need to wait!
//			return err
//		}
//
//		e := createOrGetWaiterEntry(...)
//		o.EventRegister(&e, waiter.EventIn)
//		defer o.EventUnregister(&e)
//
//		// We need to try to read again after registration because the
//		// object may have become readable between the last attempt to
//		// read and read registration.
//		err = o.nonBlockingRead(...)
//		for err == ErrAgain {
//			wait()
//			err = o.nonBlockingRead(...)
//		}
//
//		return err
//	}
//
// Another goroutine needs to notify waiters when events happen. For example:
//
//	func (o *object) Write(...) ... {
//		// Do write work.
//		[...]
//
//		if oldDataAvailableSize == 0 && dataAvailableSize > 0 {
//			// If no data was available and now some data is
//			// available, the object became readable, so notify
//			// potential waiters about this.
//			o.Notify(waiter.EventIn)
//		}
//	}
package waiter

import (
	"gvisor.dev/gvisor/pkg/sync"
)

// EventMask represents io events as used in the poll() syscall.
type EventMask uint64

// Events that waiters can wait on. The meaning is the same as those in the
// poll() syscall.
const (
	EventIn     EventMask = 0x01   // POLLIN
	EventPri    EventMask = 0x02   // POLLPRI
	EventOut    EventMask = 0x04   // POLLOUT
	EventErr    EventMask = 0x08   // POLLERR
	EventHUp    EventMask = 0x10   // POLLHUP
	EventRdNorm EventMask = 0x0040 // POLLRDNORM
	EventWrNorm EventMask = 0x0100 // POLLWRNORM

	allEvents      EventMask = 0x1f | EventRdNorm | EventWrNorm
	ReadableEvents EventMask = EventIn | EventRdNorm
	WritableEvents EventMask = EventOut | EventWrNorm
)

// EventMaskFromLinux returns an EventMask representing the supported events
// from the Linux events e, which is in the format used by poll(2).
func EventMaskFromLinux(e uint32) EventMask {
	// Our flag definitions are currently identical to Linux.
	return EventMask(e) & allEvents
}

// ToLinux returns e in the format used by Linux poll(2).
func (e EventMask) ToLinux() uint32 {
	// Our flag definitions are currently identical to Linux.
	return uint32(e)
}

// Waitable contains the methods that need to be implemented by waitable
// objects.
type Waitable interface {
	// Readiness returns what the object is currently ready for. If it's
	// not ready for a desired purpose, the caller may use EventRegister and
	// EventUnregister to get notifications once the object becomes ready.
	//
	// Implementations should allow for events like EventHUp and EventErr
	// to be returned regardless of whether they are in the input EventMask.
	Readiness(mask EventMask) EventMask

	// EventRegister registers the given waiter entry to receive
	// notifications when an event occurs that makes the object ready for
	// at least one of the events in mask.
	EventRegister(e *Entry, mask EventMask)

	// EventUnregister unregisters a waiter entry previously registered with
	// EventRegister().
	EventUnregister(e *Entry)
}

// EntryCallback provides a notify callback.
type EntryCallback interface {
	// Callback is the function to be called when the waiter entry is
	// notified. It is responsible for doing whatever is needed to wake up
	// the waiter.
	//
	// The callback is supposed to perform minimal work, and cannot call
	// any method on the queue itself because it will be locked while the
	// callback is running.
	//
	// The mask indicates the events that occurred and that the entry is
	// interested in.
	Callback(e *Entry, mask EventMask)
}

// Entry represents a waiter that can be added to a wait queue. It can
// only be in one queue at a time, and is added "intrusively" to the queue with
// no extra memory allocations.
//
// +stateify savable
type Entry struct {
	Callback EntryCallback

	// The following fields are protected by the queue lock.
	mask EventMask
	waiterEntry
}

type channelCallback struct {
	ch chan struct{}
}

// Callback implements EntryCallback.Callback.
func (c *channelCallback) Callback(*Entry, EventMask) {
	select {
	case c.ch <- struct{}{}:
	default:
	}
}

// NewChannelEntry initializes a new Entry that does a non-blocking write to a
// struct{} channel when the callback is called. It returns the new Entry
// instance and the channel being used.
//
// If a channel isn't specified (i.e., if "c" is nil), then NewChannelEntry
// allocates a new channel.
func NewChannelEntry(c chan struct{}) (Entry, chan struct{}) {
	if c == nil {
		c = make(chan struct{}, 1)
	}

	return Entry{Callback: &channelCallback{ch: c}}, c
}

// Queue represents the wait queue where waiters can be added and
// notifiers can notify them when events happen.
//
// The zero value for waiter.Queue is an empty queue ready for use.
//
// +stateify savable
type Queue struct {
	list waiterList
	mu   sync.RWMutex `state:"nosave"`
}

// EventRegister adds a waiter to the wait queue; the waiter will be notified
// when at least one of the events specified in mask happens.
func (q *Queue) EventRegister(e *Entry, mask EventMask) {
	q.mu.Lock()
	e.mask = mask
	q.list.PushBack(e)
	q.mu.Unlock()
}

// EventUnregister removes the given waiter entry from the wait queue.
func (q *Queue) EventUnregister(e *Entry) {
	q.mu.Lock()
	q.list.Remove(e)
	q.mu.Unlock()
}

// Notify notifies all waiters in the queue whose masks have at least one bit
// in common with the notification mask.
func (q *Queue) Notify(mask EventMask) {
	q.mu.RLock()
	for e := q.list.Front(); e != nil; e = e.Next() {
		if m := mask & e.mask; m != 0 {
			e.Callback.Callback(e, m)
		}
	}
	q.mu.RUnlock()
}

// Events returns the set of events being waited on. It is the union of the
// masks of all registered entries.
func (q *Queue) Events() EventMask {
	ret := EventMask(0)

	q.mu.RLock()
	for e := q.list.Front(); e != nil; e = e.Next() {
		ret |= e.mask
	}
	q.mu.RUnlock()

	return ret
}

// IsEmpty returns whether the wait queue is empty.
func (q *Queue) IsEmpty() bool {
	q.mu.Lock()
	defer q.mu.Unlock()
	return q.list.Front() == nil
}

// AlwaysReady implements the Waitable interface but is always ready. Embedding
// this struct into another struct makes it implement the boilerplate empty
// functions automatically.
type AlwaysReady struct {
}

// Readiness always returns the input mask because this object is always ready.
func (*AlwaysReady) Readiness(mask EventMask) EventMask {
	return mask
}

// EventRegister doesn't do anything because this object doesn't need to issue
// notifications; its readiness never changes.
func (*AlwaysReady) EventRegister(*Entry, EventMask) {
}

// EventUnregister doesn't do anything because this object doesn't need to issue
// notifications; its readiness never changes.
func (*AlwaysReady) EventUnregister(e *Entry) {
}
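// Added illustration (not part of the original sources): a minimal sketch of
// wiring a Queue to a channel with NewChannelEntry. Everything used here is
// defined in this file; the goroutine simulates an event source.

package waiter

func exampleChannelEntry() {
	var q Queue
	e, ch := NewChannelEntry(nil)
	// Wake us for readable events only.
	q.EventRegister(&e, ReadableEvents)
	defer q.EventUnregister(&e)

	go q.Notify(EventIn) // EventIn is in ReadableEvents, so the entry fires.
	<-ch                 // channelCallback performed a non-blocking send.
}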
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
)

const (
	onlyScheduler = linux.SCHED_NORMAL
	onlyPriority  = 0
)

// SchedParam replicates struct sched_param in sched.h.
//
// +marshal
type SchedParam struct {
	schedPriority int32
}

// SchedGetparam implements linux syscall sched_getparam(2).
func SchedGetparam(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pid := args[0].Int()
	param := args[1].Pointer()
	if param == 0 {
		return 0, nil, linuxerr.EINVAL
	}
	if pid < 0 {
		return 0, nil, linuxerr.EINVAL
	}
	if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil {
		return 0, nil, linuxerr.ESRCH
	}
	r := SchedParam{schedPriority: onlyPriority}
	if _, err := r.CopyOut(t, param); err != nil {
		return 0, nil, err
	}

	return 0, nil, nil
}

// SchedGetscheduler implements linux syscall sched_getscheduler(2).
func SchedGetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pid := args[0].Int()
	if pid < 0 {
		return 0, nil, linuxerr.EINVAL
	}
	if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil {
		return 0, nil, linuxerr.ESRCH
	}
	return onlyScheduler, nil, nil
}

// SchedSetscheduler implements linux syscall sched_setscheduler(2).
func SchedSetscheduler(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pid := args[0].Int()
	policy := args[1].Int()
	param := args[2].Pointer()
	if pid < 0 {
		return 0, nil, linuxerr.EINVAL
	}
	if policy != onlyScheduler {
		return 0, nil, linuxerr.EINVAL
	}
	if pid != 0 && t.PIDNamespace().TaskWithID(kernel.ThreadID(pid)) == nil {
		return 0, nil, linuxerr.ESRCH
	}
	var r SchedParam
	if _, err := r.CopyIn(t, param); err != nil {
		return 0, nil, linuxerr.EINVAL
	}
	if r.schedPriority != onlyPriority {
		return 0, nil, linuxerr.EINVAL
	}
	return 0, nil, nil
}

// SchedGetPriorityMax implements linux syscall sched_get_priority_max(2).
func SchedGetPriorityMax(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return onlyPriority, nil, nil
}

// SchedGetPriorityMin implements linux syscall sched_get_priority_min(2).
func SchedGetPriorityMin(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return onlyPriority, nil, nil
}
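// Added commentary (summary of the code above, not original source): the
// sentry implements only a stub scheduling policy, so from the guest's
// perspective:
//
//	sched_getscheduler(pid)             == SCHED_NORMAL
//	sched_getparam(pid, &p)             => p.sched_priority == 0
//	sched_get_priority_{min,max}(...)   == 0
//	sched_setscheduler(pid, policy, &p) fails with EINVAL unless
//	    policy == SCHED_NORMAL and p.sched_priority == 0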
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package watchdog is responsible for monitoring the sentry for tasks that may
// potentially be stuck or looping indefinitely, causing hard-to-debug hangs in
// the untrusted app.
//
// It works by periodically querying all tasks to check whether they are in user
// mode (RunUser), kernel mode (RunSys), or blocked in the kernel (OffCPU). Tasks
// that have been running in kernel mode for a long time in the same syscall
// without blocking are considered stuck and are reported.
//
// When a stuck task is detected, the watchdog can take one of the following actions:
//  1. LogWarning: Logs a warning message followed by a stack dump of all goroutines.
//     If a task continues to be stuck, the message will repeat every minute, unless
//     a new stuck task is detected.
//  2. Panic: same as above, followed by panic().
package watchdog

import (
	"bytes"
	"fmt"
	"time"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/metric"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sync"
)

// Opts configures the watchdog.
type Opts struct {
	// TaskTimeout is the amount of time to allow a task to execute the
	// same syscall without blocking before it's declared stuck.
	TaskTimeout time.Duration

	// TaskTimeoutAction indicates what action to take when a stuck task
	// is detected.
	TaskTimeoutAction Action

	// StartupTimeout is the amount of time to allow between watchdog
	// creation and calling watchdog.Start.
	StartupTimeout time.Duration

	// StartupTimeoutAction indicates what action to take when
	// watchdog.Start is not called within the timeout.
	StartupTimeoutAction Action
}

// DefaultOpts is a default set of options for the watchdog.
var DefaultOpts = Opts{
	// Task timeout.
	TaskTimeout:       3 * time.Minute,
	TaskTimeoutAction: LogWarning,

	// Startup timeout.
	StartupTimeout:       30 * time.Second,
	StartupTimeoutAction: LogWarning,
}

// descheduleThreshold is the amount of time scheduling needs to be off before the entire wait period
// is discounted from task's last update time. It's set high enough that small scheduling delays won't
// trigger it.
const descheduleThreshold = 1 * time.Second

// Amount of time to wait before dumping the stack to the log again when the same task(s) remains stuck.
var stackDumpSameTaskPeriod = time.Minute

// Action defines what action to take when a stuck task is detected.
type Action int

const (
	// LogWarning logs warning message followed by stack trace.
	LogWarning Action = iota

	// Panic will do the same logging as LogWarning and panic().
	Panic
)

// Set implements flag.Value.
func (a *Action) Set(v string) error {
	switch v {
	case "log", "logwarning":
		*a = LogWarning
	case "panic":
		*a = Panic
	default:
		return fmt.Errorf("invalid watchdog action %q", v)
	}
	return nil
}

// Get implements flag.Value.
func (a *Action) Get() interface{} {
	return *a
}

// String returns Action's string representation.
func (a Action) String() string {
	switch a {
	case LogWarning:
		return "logWarning"
	case Panic:
		return "panic"
	default:
		panic(fmt.Sprintf("Invalid watchdog action: %d", a))
	}
}

// Watchdog is the main watchdog class. It controls a goroutine that periodically
// analyses all tasks and reports if any of them appear to be stuck.
type Watchdog struct {
	// Configuration options are embedded.
	Opts

	// period indicates how often to check all tasks. It's calculated based on
	// opts.TaskTimeout.
	period time.Duration

	// k is where the tasks come from.
	k *kernel.Kernel

	// stop is used to notify the watchdog that it should stop.
	stop chan struct{}

	// done is used to notify when the watchdog has stopped.
	done chan struct{}

	// offenders map contains all tasks that are currently stuck.
	offenders map[*kernel.Task]*offender

	// lastStackDump tracks the last time a stack dump was generated to prevent
	// spamming the log.
	lastStackDump time.Time

	// lastRun is set to the last time the watchdog executed a monitoring loop.
	lastRun ktime.Time

	// mu protects the fields below.
	mu sync.Mutex

	// running is true if the watchdog is running.
	running bool

	// startCalled is true if Start has ever been called. It remains true
	// even if Stop is called.
	startCalled bool
}

type offender struct {
	lastUpdateTime ktime.Time
}

// New creates a new watchdog.
func New(k *kernel.Kernel, opts Opts) *Watchdog {
	// 4 is arbitrary, just don't want to prolong 'TaskTimeout' too much.
	period := opts.TaskTimeout / 4
	w := &Watchdog{
		Opts:      opts,
		k:         k,
		period:    period,
		offenders: make(map[*kernel.Task]*offender),
		stop:      make(chan struct{}),
		done:      make(chan struct{}),
	}

	// Handle StartupTimeout if it exists.
	if w.StartupTimeout > 0 {
		log.Infof("Watchdog waiting %v for startup", w.StartupTimeout)
		go w.waitForStart() // S/R-SAFE: watchdog is stopped during save and restarted after restore.
	}

	return w
}

// Start starts the watchdog.
func (w *Watchdog) Start() {
	w.mu.Lock()
	defer w.mu.Unlock()
	w.startCalled = true

	if w.running {
		return
	}

	if w.TaskTimeout == 0 {
		log.Infof("Watchdog task timeout disabled")
		return
	}
	w.lastRun = w.k.MonotonicClock().Now()

	log.Infof("Starting watchdog, period: %v, timeout: %v, action: %v", w.period, w.TaskTimeout, w.TaskTimeoutAction)
	go w.loop() // S/R-SAFE: watchdog is stopped during save and restarted after restore.
	w.running = true
}

// Stop requests the watchdog to stop and wait for it.
func (w *Watchdog) Stop() {
	if w.TaskTimeout == 0 {
		return
	}

	w.mu.Lock()
	defer w.mu.Unlock()
	if !w.running {
		return
	}
	log.Infof("Stopping watchdog")
	w.stop <- struct{}{}
	<-w.done
	w.running = false
	log.Infof("Watchdog stopped")
}

// waitForStart waits for Start to be called and takes action if it does not
// happen within the startup timeout.
func (w *Watchdog) waitForStart() {
	<-time.After(w.StartupTimeout)
	w.mu.Lock()
	defer w.mu.Unlock()
	if w.startCalled {
		// We are fine.
		return
	}

	metric.WeirdnessMetric.Increment("watchdog_stuck_startup")

	var buf bytes.Buffer
	buf.WriteString(fmt.Sprintf("Watchdog.Start() not called within %s", w.StartupTimeout))
	w.doAction(w.StartupTimeoutAction, false, &buf)
}

// loop is the main watchdog routine. It only returns when 'Stop()' is called.
func (w *Watchdog) loop() {
	// Loop until someone stops it.
	for {
		select {
		case <-w.stop:
			w.done <- struct{}{}
			return
		case <-time.After(w.period):
			w.runTurn()
		}
	}
}

// runTurn runs a single pass over all tasks and reports anything it finds.
func (w *Watchdog) runTurn() {
	// Someone needs to watch the watchdog. The call below can get stuck if there
	// is a deadlock affecting root's PID namespace mutex. Run it in a goroutine
	// and report if it takes too long to return.
	var tasks []*kernel.Task
	done := make(chan struct{})
	go func() { // S/R-SAFE: watchdog is stopped and restarted during S/R.
		tasks = w.k.TaskSet().Root.Tasks()
		close(done)
	}()

	select {
	case <-done:
	case <-time.After(w.TaskTimeout):
		// Report if the watchdog is not making progress.
		// No one is watching the watchdog watcher though.
		w.reportStuckWatchdog()
		<-done
	}

	newOffenders := make(map[*kernel.Task]*offender)
	newTaskFound := false
	now := ktime.FromNanoseconds(int64(w.k.CPUClockNow() * uint64(linux.ClockTick)))

	// The process may be running with a low CPU limit, making tasks appear
	// stuck because they are starved of CPU cycles. An estimate is that tasks
	// could have been starved since the last time the watchdog ran. If the
	// watchdog detects that scheduling is off, it will discount the entire
	// duration since the last run from 'lastUpdateTime'.
	discount := time.Duration(0)
	if now.Sub(w.lastRun.Add(w.period)) > descheduleThreshold {
		discount = now.Sub(w.lastRun)
	}
	w.lastRun = now

	log.Infof("Watchdog starting loop, tasks: %d, discount: %v", len(tasks), discount)
	for _, t := range tasks {
		tsched := t.TaskGoroutineSchedInfo()

		// An offender is a task running inside the kernel for longer than the specified timeout.
		if tsched.State == kernel.TaskGoroutineRunningSys {
			lastUpdateTime := ktime.FromNanoseconds(int64(tsched.Timestamp * uint64(linux.ClockTick)))
			elapsed := now.Sub(lastUpdateTime) - discount
			if elapsed > w.TaskTimeout {
				tc, ok := w.offenders[t]
				if !ok {
					// New stuck task detected.
					//
					// Note that tasks blocked doing IO may be considered stuck in kernel,
					// unless they are surrounded by
					// Task.UninterruptibleSleepStart/Finish.
					tc = &offender{lastUpdateTime: lastUpdateTime}
					metric.WeirdnessMetric.Increment("watchdog_stuck_tasks")
					newTaskFound = true
				}
				newOffenders[t] = tc
			}
		}
	}
	if len(newOffenders) > 0 {
		w.report(newOffenders, newTaskFound, now)
	}

	// Remember which tasks have been reported.
	w.offenders = newOffenders
}

// report takes appropriate action when a stuck task is detected.
func (w *Watchdog) report(offenders map[*kernel.Task]*offender, newTaskFound bool, now ktime.Time) {
	var buf bytes.Buffer
	buf.WriteString(fmt.Sprintf("Sentry detected %d stuck task(s):\n", len(offenders)))
	for t, o := range offenders {
		tid := w.k.TaskSet().Root.IDOfTask(t)
		buf.WriteString(fmt.Sprintf("\tTask tid: %v (goroutine %d), entered RunSys state %v ago.\n", tid, t.GoroutineID(), now.Sub(o.lastUpdateTime)))
	}
	buf.WriteString("Search for 'goroutine <id>' in the stack dump to find the offending goroutine(s)")

	// Force stack dump only if a new task is detected.
	w.doAction(w.TaskTimeoutAction, newTaskFound, &buf)
}

func (w *Watchdog) reportStuckWatchdog() {
	var buf bytes.Buffer
	buf.WriteString("Watchdog goroutine is stuck")
	w.doAction(w.TaskTimeoutAction, false, &buf)
}

// doAction will take the given action. If the action is LogWarning, the stack
// is not always dumped to the log to prevent log flooding. "forceStack"
// guarantees that the stack will be dumped regardless.
func (w *Watchdog) doAction(action Action, forceStack bool, msg *bytes.Buffer) {
	switch action {
	case LogWarning:
		// Dump stack only if forced or some time has passed since the last time a
		// stack dump was generated.
		if !forceStack && time.Since(w.lastStackDump) < stackDumpSameTaskPeriod {
			msg.WriteString("\n...[stack dump skipped]...")
			log.Warningf(msg.String())
			return
		}
		log.TracebackAll(msg.String())
		w.lastStackDump = time.Now()

	case Panic:
		// Panic will skip over running tasks, which is likely the culprit here. So manually
		// dump all stacks before panic'ing.
		log.TracebackAll(msg.String())

		// Attempt to flush metrics, timeout and move on in case metrics are stuck as well.
		metricsEmitted := make(chan struct{}, 1)
		go func() { // S/R-SAFE: watchdog is stopped during save and restarted after restore.
			// Flush metrics before killing process.
			metric.EmitMetricUpdate()
			metricsEmitted <- struct{}{}
		}()
		select {
		case <-metricsEmitted:
		case <-time.After(1 * time.Second):
		}
		panic(fmt.Sprintf("%s\nStack for running G's are skipped while panicking.", msg.String()))

	default:
		panic(fmt.Sprintf("Unknown watchdog action %v", action))
	}
}
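// Added illustration (not part of the original sources): a sketch of the
// typical watchdog life cycle around sentry startup. The *kernel.Kernel value
// and the run callback are assumed to come from the embedding sentry.

package watchdogexample // hypothetical

import (
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/watchdog"
)

func runWithWatchdog(k *kernel.Kernel, run func() error) error {
	// New arms the startup timer; Start must be called within
	// DefaultOpts.StartupTimeout or StartupTimeoutAction fires.
	w := watchdog.New(k, watchdog.DefaultOpts)
	w.Start()
	defer w.Stop()
	return run()
}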
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package stack

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/buffer"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	tcpipbuffer "gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/header"
)

type headerType int

const (
	linkHeader headerType = iota
	networkHeader
	transportHeader
	numHeaderType
)

// PacketBufferOptions specifies options for PacketBuffer creation.
type PacketBufferOptions struct {
	// ReserveHeaderBytes is the number of bytes to reserve for headers. Total
	// The total number of bytes pushed onto the headers must not exceed this
	// value.
	ReserveHeaderBytes int

	// Data is the initial unparsed data for the new packet. If set, it will be
	// owned by the new packet.
	Data tcpipbuffer.VectorisedView

	// IsForwardedPacket identifies that the PacketBuffer being created is for a
	// forwarded packet.
	IsForwardedPacket bool
}

// A PacketBuffer contains all the data of a network packet.
//
// As a PacketBuffer traverses up the stack, it may be necessary to pass it to
// multiple endpoints.
//
// The whole packet is expected to be a series of bytes in the following order:
// LinkHeader, NetworkHeader, TransportHeader, and Data. Any of them can be
// empty. Use of PacketBuffer in any other order is unsupported.
//
// PacketBuffer must be created with NewPacketBuffer.
//
// Internal structure: A PacketBuffer holds a pointer to buffer.Buffer, which
// exposes a logically-contiguous byte storage. The underlying storage
// structure is abstracted out and should not usually be a concern here.
//
// |- reserved ->|
//               |--->| consumed (incoming)
// 0             V    V
// +--------+----+----+--------------------+
// |        |    |    | current data ...   | (buf)
// +--------+----+----+--------------------+
//          ^    |
//          |<---| pushed (outgoing)
//
// When a PacketBuffer is created, a `reserved` header region can be specified,
// into which the stack pushes headers for an outgoing packet. There may be no
// such region for an incoming packet, in which case `reserved` is 0. The value
// of `reserved` never changes in the entire lifetime of the packet.
//
// Outgoing Packet: When a header is pushed, `pushed` gets incremented by the
// pushed length, and the current value is stored for each header. PacketBuffer
// subtracts this value from `reserved` to compute the starting offset of each
// header in `buf`.
//
// Incoming Packet: When a header is consumed (a.k.a. parsed), the current
// `consumed` value is stored for each header, and it gets incremented by the
// consumed length. PacketBuffer adds this value to `reserved` to compute the
// starting offset of each header in `buf`.
type PacketBuffer struct {
	_ sync.NoCopy

	// PacketBufferEntry is used to build an intrusive list of
	// PacketBuffers.
	PacketBufferEntry

	// buf is the underlying buffer for the packet. See struct level docs for
	// details.
	buf      *buffer.Buffer
	reserved int
	pushed   int
	consumed int

	// headers stores metadata about each header.
	headers [numHeaderType]headerInfo

	// NetworkProtocolNumber is only valid when NetworkHeader().View().IsEmpty()
	// returns false.
	// TODO(gvisor.dev/issue/3574): Remove the separately passed protocol
	// numbers in registration APIs that take a PacketBuffer.
	NetworkProtocolNumber tcpip.NetworkProtocolNumber

	// TransportProtocolNumber is only valid if it is non-zero.
	// TODO(gvisor.dev/issue/3810): This and the network protocol number should
	// be moved into the headerInfo. This should resolve the validity issue.
	TransportProtocolNumber tcpip.TransportProtocolNumber

	// Hash is the transport layer hash of this packet. A value of zero
	// indicates no valid hash has been set.
	Hash uint32

	// Owner is implemented by task to get the uid and gid.
	// Only set for locally generated packets.
	Owner tcpip.PacketOwner

	// The following fields are only set by the qdisc layer when the packet
	// is added to a queue.
	EgressRoute RouteInfo
	GSOOptions  GSO

	// NatDone indicates if the packet has been manipulated as per a NAT
	// iptables rule.
	NatDone bool

	// PktType indicates the SockAddrLink.PacketType of the packet as defined in
	// https://www.man7.org/linux/man-pages/man7/packet.7.html.
	PktType tcpip.PacketType

	// NICID is the ID of the last interface the network packet was handled at.
	NICID tcpip.NICID

	// RXTransportChecksumValidated indicates that transport checksum verification
	// may be safely skipped.
	RXTransportChecksumValidated bool

	// NetworkPacketInfo holds an incoming packet's network-layer information.
	NetworkPacketInfo NetworkPacketInfo
}

// NewPacketBuffer creates a new PacketBuffer with opts.
func NewPacketBuffer(opts PacketBufferOptions) *PacketBuffer {
	pk := &PacketBuffer{
		buf: &buffer.Buffer{},
	}
	if opts.ReserveHeaderBytes != 0 {
		pk.buf.AppendOwned(make([]byte, opts.ReserveHeaderBytes))
		pk.reserved = opts.ReserveHeaderBytes
	}
	for _, v := range opts.Data.Views() {
		pk.buf.AppendOwned(v)
	}
	if opts.IsForwardedPacket {
		pk.NetworkPacketInfo.IsForwardedPacket = opts.IsForwardedPacket
	}
	return pk
}

// ReservedHeaderBytes returns the number of bytes initially reserved for
// headers.
func (pk *PacketBuffer) ReservedHeaderBytes() int {
	return pk.reserved
}

// AvailableHeaderBytes returns the number of bytes currently available for
// headers. This is relevant only to the PacketHeader.Push method.
func (pk *PacketBuffer) AvailableHeaderBytes() int {
	return pk.reserved - pk.pushed
}

// LinkHeader returns the handle to the link-layer header.
func (pk *PacketBuffer) LinkHeader() PacketHeader {
	return PacketHeader{
		pk:  pk,
		typ: linkHeader,
	}
}

// NetworkHeader returns the handle to the network-layer header.
func (pk *PacketBuffer) NetworkHeader() PacketHeader {
	return PacketHeader{
		pk:  pk,
		typ: networkHeader,
	}
}

// TransportHeader returns the handle to the transport-layer header.
func (pk *PacketBuffer) TransportHeader() PacketHeader {
	return PacketHeader{
		pk:  pk,
		typ: transportHeader,
	}
}

// HeaderSize returns the total size of all headers in bytes.
func (pk *PacketBuffer) HeaderSize() int {
	return pk.pushed + pk.consumed
}

// Size returns the size of the packet in bytes.
func (pk *PacketBuffer) Size() int {
	return int(pk.buf.Size()) - pk.headerOffset()
}

// MemSize returns the estimated size of pk in memory, including the backing
// buffer data.
func (pk *PacketBuffer) MemSize() int {
	return int(pk.buf.Size()) + packetBufferStructSize
}

// Data returns the handle to the data portion of pk.
func (pk *PacketBuffer) Data() PacketData {
	return PacketData{pk: pk}
}

// Views returns the underlying storage of the whole packet.
func (pk *PacketBuffer) Views() []tcpipbuffer.View {
	var views []tcpipbuffer.View
	offset := pk.headerOffset()
	pk.buf.SubApply(offset, int(pk.buf.Size())-offset, func(v []byte) {
		views = append(views, v)
	})
	return views
}

func (pk *PacketBuffer) headerOffset() int {
	return pk.reserved - pk.pushed
}

func (pk *PacketBuffer) headerOffsetOf(typ headerType) int {
	return pk.reserved + pk.headers[typ].offset
}

func (pk *PacketBuffer) dataOffset() int {
	return pk.reserved + pk.consumed
}

func (pk *PacketBuffer) push(typ headerType, size int) tcpipbuffer.View {
	h := &pk.headers[typ]
	if h.length > 0 {
		panic(fmt.Sprintf("push(%s, %d) called after previous push", typ, size))
	}
	if pk.pushed+size > pk.reserved {
		panic(fmt.Sprintf("push(%s, %d) overflows; pushed=%d reserved=%d", typ, size, pk.pushed, pk.reserved))
	}
	pk.pushed += size
	h.offset = -pk.pushed
	h.length = size
	return pk.headerView(typ)
}

func (pk *PacketBuffer) consume(typ headerType, size int) (v tcpipbuffer.View, consumed bool) {
	h := &pk.headers[typ]
	if h.length > 0 {
		panic(fmt.Sprintf("consume must not be called twice: type %s", typ))
	}
	if pk.reserved+pk.consumed+size > int(pk.buf.Size()) {
		return nil, false
	}
	h.offset = pk.consumed
	h.length = size
	pk.consumed += size
	return pk.headerView(typ), true
}

func (pk *PacketBuffer) headerView(typ headerType) tcpipbuffer.View {
	h := &pk.headers[typ]
	if h.length == 0 {
		return nil
	}
	v, ok := pk.buf.PullUp(pk.headerOffsetOf(typ), h.length)
	if !ok {
		panic("PullUp failed")
	}
	return v
}

// Clone makes a shallow copy of pk.
//
// Clone should be called when a packet is passed elsewhere, so that no
// modification is made to the underlying packet payload.
func (pk *PacketBuffer) Clone() *PacketBuffer {
	return &PacketBuffer{
		PacketBufferEntry:            pk.PacketBufferEntry,
		buf:                          pk.buf,
		reserved:                     pk.reserved,
		pushed:                       pk.pushed,
		consumed:                     pk.consumed,
		headers:                      pk.headers,
		Hash:                         pk.Hash,
		Owner:                        pk.Owner,
		GSOOptions:                   pk.GSOOptions,
		NetworkProtocolNumber:        pk.NetworkProtocolNumber,
		NatDone:                      pk.NatDone,
		TransportProtocolNumber:      pk.TransportProtocolNumber,
		PktType:                      pk.PktType,
		NICID:                        pk.NICID,
		RXTransportChecksumValidated: pk.RXTransportChecksumValidated,
		NetworkPacketInfo:            pk.NetworkPacketInfo,
	}
}

// Network returns the network header as a header.Network.
//
// Network should only be called when NetworkHeader has been set.
func (pk *PacketBuffer) Network() header.Network {
	switch netProto := pk.NetworkProtocolNumber; netProto {
	case header.IPv4ProtocolNumber:
		return header.IPv4(pk.NetworkHeader().View())
	case header.IPv6ProtocolNumber:
		return header.IPv6(pk.NetworkHeader().View())
	default:
		panic(fmt.Sprintf("unknown network protocol number %d", netProto))
	}
}

// CloneToInbound makes a shallow copy of the packet buffer to be used as an
// inbound packet.
//
// See PacketBuffer.Data for details about how a packet buffer holds an inbound
// packet.
func (pk *PacketBuffer) CloneToInbound() *PacketBuffer {
	newPk := &PacketBuffer{
		buf: pk.buf,
		// Treat the unfilled header portion as reserved.
		reserved: pk.AvailableHeaderBytes(),
	}
	// TODO(gvisor.dev/issue/5696): reimplement conntrack so that we don't need
	// to maintain this flag in the packet. Currently conntrack needs this flag
	// to tell if a noop connection should be inserted at the Input hook. Once
	// conntrack redefines the manipulation field as mutable, we won't need the
	// special noop connection.
	if pk.NatDone {
		newPk.NatDone = true
	}
	return newPk
}

// headerInfo stores metadata about a header in a packet.
type headerInfo struct {
	// offset is the offset of the header in pk.buf relative to
	// pk.buf[pk.reserved]. See the PacketBuffer struct for details.
	offset int

	// length is the length of this header.
	length int
}

// PacketHeader is a handle object to a header in the underlying packet.
type PacketHeader struct {
	pk  *PacketBuffer
	typ headerType
}

// View returns the underlying storage of h.
func (h PacketHeader) View() tcpipbuffer.View {
	return h.pk.headerView(h.typ)
}

// Push pushes size bytes onto the front of its residing packet, and returns
// the backing storage. Callers may only call one of Push or Consume once on
// each header in the lifetime of the underlying packet.
func (h PacketHeader) Push(size int) tcpipbuffer.View {
	return h.pk.push(h.typ, size)
}

// Consume moves the first size bytes of the unparsed data portion in the
// packet to h, and returns the backing storage. If the data is shorter than
// size, consumed will be false, and the state of h will not be affected.
// Callers may only call one of Push or Consume once on each header in the
// lifetime of the underlying packet.
func (h PacketHeader) Consume(size int) (v tcpipbuffer.View, consumed bool) {
	return h.pk.consume(h.typ, size)
}

// PacketData represents the data portion of a PacketBuffer.
type PacketData struct {
	pk *PacketBuffer
}

// PullUp returns a contiguous view of size bytes from the beginning of d.
// Callers should not write to or keep the view for later use.
func (d PacketData) PullUp(size int) (tcpipbuffer.View, bool) {
	return d.pk.buf.PullUp(d.pk.dataOffset(), size)
}

// DeleteFront removes count bytes from the beginning of d. It panics if
// count > d.Size(). All backing storage references after the front of d are
// invalidated.
func (d PacketData) DeleteFront(count int) {
	if !d.pk.buf.Remove(d.pk.dataOffset(), count) {
		panic("count > d.Size()")
	}
}

// CapLength reduces d to at most length bytes.
func (d PacketData) CapLength(length int) {
	if length < 0 {
		panic("length < 0")
	}
	if currLength := d.Size(); currLength > length {
		trim := currLength - length
		d.pk.buf.Remove(int(d.pk.buf.Size())-trim, trim)
	}
}

// Views returns the underlying storage of d in a slice of Views. The caller
// should not modify the returned slice.
func (d PacketData) Views() []tcpipbuffer.View {
	var views []tcpipbuffer.View
	offset := d.pk.dataOffset()
	d.pk.buf.SubApply(offset, int(d.pk.buf.Size())-offset, func(v []byte) {
		views = append(views, v)
	})
	return views
}

// AppendView appends v into d, taking ownership of v.
func (d PacketData) AppendView(v tcpipbuffer.View) {
	d.pk.buf.AppendOwned(v)
}

// MergeFragment appends the data portion of frag to dst. It takes ownership of
// frag, and frag should not be used again.
func MergeFragment(dst, frag *PacketBuffer) {
	frag.buf.TrimFront(int64(frag.dataOffset()))
	dst.buf.Merge(frag.buf)
}

// ReadFromVV moves at most count bytes from the beginning of srcVV to the end
// of d and returns the number of bytes moved.
func (d PacketData) ReadFromVV(srcVV *tcpipbuffer.VectorisedView, count int) int {
	done := 0
	for _, v := range srcVV.Views() {
		if len(v) < count {
			count -= len(v)
			done += len(v)
			d.pk.buf.AppendOwned(v)
		} else {
			v = v[:count]
			count -= len(v)
			done += len(v)
			d.pk.buf.Append(v)
			break
		}
	}
	srcVV.TrimFront(done)
	return done
}

// Size returns the number of bytes in the data payload of the packet.
func (d PacketData) Size() int {
	return int(d.pk.buf.Size()) - d.pk.dataOffset()
}

// AsRange returns a Range representing the current data payload of the packet.
func (d PacketData) AsRange() Range {
	return Range{
		pk:     d.pk,
		offset: d.pk.dataOffset(),
		length: d.Size(),
	}
}

// ExtractVV returns a VectorisedView of d. This method has destructive
// semantics with respect to the underlying packet: after the call, the packet
// cannot be used again.
//
// This method exists for compatibility between PacketBuffer and
// VectorisedView. It may be removed later and should be used with care.
func (d PacketData) ExtractVV() tcpipbuffer.VectorisedView {
	var vv tcpipbuffer.VectorisedView
	d.pk.buf.SubApply(d.pk.dataOffset(), d.pk.Size(), func(v []byte) {
		vv.AppendView(v)
	})
	return vv
}

// Range represents a contiguous subportion of a PacketBuffer.
type Range struct {
	pk     *PacketBuffer
	offset int
	length int
}

// Size returns the number of bytes in r.
func (r Range) Size() int {
	return r.length
}

// SubRange returns a new Range starting at off bytes of r. It returns an empty
// range if off is out of bounds.
func (r Range) SubRange(off int) Range {
	if off > r.length {
		return Range{pk: r.pk}
	}
	return Range{
		pk:     r.pk,
		offset: r.offset + off,
		length: r.length - off,
	}
}

// Capped returns a new Range with the same starting point as r and length
// capped at max.
func (r Range) Capped(max int) Range {
	if r.length <= max {
		return r
	}
	return Range{
		pk:     r.pk,
		offset: r.offset,
		length: max,
	}
}

// AsView returns the backing storage of r if possible. It will allocate a new
// View if r spans multiple pieces internally. The caller should not write to
// the returned View in any way.
func (r Range) AsView() tcpipbuffer.View {
	var allocated bool
	var v tcpipbuffer.View
	r.iterate(func(b []byte) {
		if v == nil {
			// v has not been assigned, allowing the first view to be returned.
			v = b
		} else {
			// v has been assigned. This range spans more than a view; a new
			// view needs to be allocated.
			if !allocated {
				allocated = true
				all := make([]byte, 0, r.length)
				all = append(all, v...)
				v = all
			}
			v = append(v, b...)
		}
	})
	return v
}

// ToOwnedView returns an owned copy of the data in r.
func (r Range) ToOwnedView() tcpipbuffer.View {
	if r.length == 0 {
		return nil
	}
	all := make([]byte, 0, r.length)
	r.iterate(func(b []byte) {
		all = append(all, b...)
	})
	return all
}

// Checksum calculates the RFC 1071 checksum for the underlying bytes of r.
func (r Range) Checksum() uint16 {
	var c header.Checksumer
	r.iterate(c.Add)
	return c.Checksum()
}

// iterate calls fn for each piece in r. fn is always called with a non-empty
// slice.
func (r Range) iterate(fn func([]byte)) {
	r.pk.buf.SubApply(r.offset, r.length, fn)
}

// PayloadSince returns the packet payload starting from and including a
// particular header.
//
// The returned View is owned by the caller - its backing buffer is separate
// from the packet header's underlying packet buffer.
func PayloadSince(h PacketHeader) tcpipbuffer.View {
	offset := h.pk.headerOffset()
	for i := headerType(0); i < h.typ; i++ {
		offset += h.pk.headers[i].length
	}
	return Range{
		pk:     h.pk,
		offset: offset,
		length: int(h.pk.buf.Size()) - offset,
	}.ToOwnedView()
}
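To make the reserve-then-push model documented above concrete, here is a hedged usage sketch. examplePushHeaders is illustrative only (not part of the original file); it assumes it compiles inside this package and uses the size constants and UDP encoding helpers from pkg/tcpip/header as documented:

func examplePushHeaders() *PacketBuffer {
	payload := tcpipbuffer.NewViewFromBytes([]byte("hello"))
	pkt := NewPacketBuffer(PacketBufferOptions{
		// Reserve room up front for every header that will be pushed.
		ReserveHeaderBytes: header.IPv4MinimumSize + header.UDPMinimumSize,
		Data:               payload.ToVectorisedView(),
	})
	// Headers are pushed innermost-first; each Push carves its bytes out of
	// the reserved region, shrinking AvailableHeaderBytes().
	udp := header.UDP(pkt.TransportHeader().Push(header.UDPMinimumSize))
	udp.Encode(&header.UDPFields{
		SrcPort: 30000,
		DstPort: 53,
		Length:  uint16(header.UDPMinimumSize + len("hello")),
	})
	// The network header would be encoded similarly; after both pushes,
	// AvailableHeaderBytes() is 0 and HeaderSize() equals the sum of the
	// pushed lengths.
	_ = header.IPv4(pkt.NetworkHeader().Push(header.IPv4MinimumSize))
	return pkt
}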
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package loader loads an executable file into a MemoryManager.
package loader

import (
	"bytes"
	"fmt"
	"io"
	"path"

	"gvisor.dev/gvisor/pkg/abi"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/abi/linux/errno"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/cpuid"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/rand"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// LoadArgs holds specifications for an executable file to be loaded.
type LoadArgs struct {
	// MemoryManager is the memory manager to load the executable into.
	MemoryManager *mm.MemoryManager

	// RemainingTraversals is the maximum number of symlinks to follow to
	// resolve Filename. This counter is passed by reference to keep it
	// updated throughout the call stack.
	RemainingTraversals *uint

	// ResolveFinal indicates whether the final link of Filename should be
	// resolved, if it is a symlink.
	ResolveFinal bool

	// Filename is the path for the executable.
	Filename string

	// File is an open fs.File object of the executable. If File is not
	// nil, then File will be loaded and Filename will be ignored.
	//
	// The caller is responsible for checking that the user can execute this
	// file.
	File fsbridge.File

	// Opener is used to open the executable file when 'File' is nil.
	Opener fsbridge.Lookup

	// CloseOnExec indicates that the executable (or one of its parent
	// directories) was opened with O_CLOEXEC.
	// If the executable is an interpreter script, this causes an ENOENT
	// error, since the script would otherwise be inaccessible to the
	// interpreter.
	CloseOnExec bool

	// Argv is the vector of arguments to pass to the executable.
	Argv []string

	// Envv is the vector of environment variables to pass to the
	// executable.
	Envv []string

	// Features specifies the CPU feature set for the executable.
	Features *cpuid.FeatureSet
}

// openPath opens args.Filename and checks that it is valid for loading.
//
// openPath returns a fsbridge.File for args.Filename, which is not installed
// in the Task FDTable. The caller takes ownership of it.
//
// args.Filename must be a readable, executable, regular file.
func openPath(ctx context.Context, args LoadArgs) (fsbridge.File, error) {
	if args.Filename == "" {
		ctx.Infof("cannot open empty name")
		return nil, syserror.ENOENT
	}

	// TODO(gvisor.dev/issue/160): Linux requires only execute permission,
	// not read. However, our backing filesystems may prevent us from reading
	// the file without read permission. Additionally, a task with a
	// non-readable executable has additional constraints on access via
	// ptrace and procfs.
	opts := vfs.OpenOptions{
		Flags:    linux.O_RDONLY,
		FileExec: true,
	}
	return args.Opener.OpenPath(ctx, args.Filename, opts, args.RemainingTraversals, args.ResolveFinal)
}

// checkIsRegularFile prevents us from trying to execute a directory, pipe,
// etc.
func checkIsRegularFile(ctx context.Context, file fsbridge.File, filename string) error {
	t, err := file.Type(ctx)
	if err != nil {
		return err
	}
	if t != linux.ModeRegular {
		ctx.Infof("%q is not a regular file: %v", filename, t)
		return linuxerr.EACCES
	}
	return nil
}

// allocStack allocates and maps a stack into any available part of the
// address space.
func allocStack(ctx context.Context, m *mm.MemoryManager, a arch.Context) (*arch.Stack, error) {
	ar, err := m.MapStack(ctx)
	if err != nil {
		return nil, err
	}
	return &arch.Stack{Arch: a, IO: m, Bottom: ar.End}, nil
}

const (
	// maxLoaderAttempts is the maximum number of attempts to try to load
	// an interpreter script, to prevent loops. 6 (initial + 5 changes) is
	// what the Linux kernel allows (fs/exec.c:search_binary_handler).
	maxLoaderAttempts = 6
)

// loadExecutable loads the executable pointed to by args.File. The caller is
// responsible for checking that the user can execute this file. If args.File
// is nil, the path args.Filename is resolved and loaded instead (in that case
// the check that the user can execute the file is done here). If the
// executable is an interpreter script rather than an ELF, the binary of the
// corresponding interpreter will be loaded.
//
// It returns:
// * loadedELF, description of the loaded binary
// * arch.Context matching the binary arch
// * fs.Dirent of the binary file
// * Possibly updated args.Argv
func loadExecutable(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, fsbridge.File, []string, error) {
	for i := 0; i < maxLoaderAttempts; i++ {
		if args.File == nil {
			var err error
			args.File, err = openPath(ctx, args)
			if err != nil {
				ctx.Infof("Error opening %s: %v", args.Filename, err)
				return loadedELF{}, nil, nil, nil, err
			}
			// Ensure the file is released in case the code loops or errors
			// out.
			defer args.File.DecRef(ctx)
		} else {
			if err := checkIsRegularFile(ctx, args.File, args.Filename); err != nil {
				return loadedELF{}, nil, nil, nil, err
			}
		}

		// Check the header. Is this an ELF or interpreter script?
		var hdr [4]uint8
		// N.B. We assume that reading from a regular file cannot block.
		_, err := args.File.ReadFull(ctx, usermem.BytesIOSequence(hdr[:]), 0)
		// Allow unexpected EOF, as a valid executable could be only three
		// bytes (e.g., #!a).
		if err != nil && err != io.ErrUnexpectedEOF {
			if err == io.EOF {
				err = syserror.ENOEXEC
			}
			return loadedELF{}, nil, nil, nil, err
		}

		switch {
		case bytes.Equal(hdr[:], []byte(elfMagic)):
			loaded, ac, err := loadELF(ctx, args)
			if err != nil {
				ctx.Infof("Error loading ELF: %v", err)
				return loadedELF{}, nil, nil, nil, err
			}
			// An ELF is always terminal. Hold on to the file.
			args.File.IncRef()
			return loaded, ac, args.File, args.Argv, err

		case bytes.Equal(hdr[:2], []byte(interpreterScriptMagic)):
			if args.CloseOnExec {
				return loadedELF{}, nil, nil, nil, syserror.ENOENT
			}
			args.Filename, args.Argv, err = parseInterpreterScript(ctx, args.Filename, args.File, args.Argv)
			if err != nil {
				ctx.Infof("Error loading interpreter script: %v", err)
				return loadedELF{}, nil, nil, nil, err
			}
			// Refresh the traversal limit for the interpreter.
			*args.RemainingTraversals = linux.MaxSymlinkTraversals

		default:
			ctx.Infof("Unknown magic: %v", hdr)
			return loadedELF{}, nil, nil, nil, syserror.ENOEXEC
		}
		// Set to nil in case we loop on an interpreter script.
		args.File = nil
	}

	return loadedELF{}, nil, nil, nil, linuxerr.ELOOP
}

// Load loads args.File into a MemoryManager. If args.File is nil, the path
// args.Filename is resolved and loaded instead.
//
// If Load returns ErrSwitchFile it should be called again with the returned
// path and argv.
//
// Preconditions:
// * The Task MemoryManager is empty.
// * Load is called on the Task goroutine.
func Load(ctx context.Context, args LoadArgs, extraAuxv []arch.AuxEntry, vdso *VDSO) (abi.OS, arch.Context, string, *syserr.Error) {
	// Load the executable itself.
	loaded, ac, file, newArgv, err := loadExecutable(ctx, args)
	if err != nil {
		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("failed to load %s: %v", args.Filename, err), syserr.FromError(err).ToLinux())
	}
	defer file.DecRef(ctx)

	// Load the VDSO.
	vdsoAddr, err := loadVDSO(ctx, args.MemoryManager, vdso, loaded)
	if err != nil {
		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("error loading VDSO: %v", err), syserr.FromError(err).ToLinux())
	}

	// Set up the heap. brk starts at the next page after the end of the
	// executable. Userspace can assume that the remainder of the page after
	// loaded.end is available for its use.
	e, ok := loaded.end.RoundUp()
	if !ok {
		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("brk overflows: %#x", loaded.end), errno.ENOEXEC)
	}
	args.MemoryManager.BrkSetup(ctx, e)

	// Allocate our stack.
	stack, err := allocStack(ctx, args.MemoryManager, ac)
	if err != nil {
		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to allocate stack: %v", err), syserr.FromError(err).ToLinux())
	}

	// Push the original filename to the stack, for AT_EXECFN.
	if _, err := stack.PushNullTerminatedByteSlice([]byte(args.Filename)); err != nil {
		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push exec filename: %v", err), syserr.FromError(err).ToLinux())
	}
	execfn := stack.Bottom

	// Push 16 random bytes on the stack which AT_RANDOM will point to.
	var b [16]byte
	if _, err := rand.Read(b[:]); err != nil {
		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to read random bytes: %v", err), syserr.FromError(err).ToLinux())
	}
	if _, err = stack.PushNullTerminatedByteSlice(b[:]); err != nil {
		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to push random bytes: %v", err), syserr.FromError(err).ToLinux())
	}
	random := stack.Bottom

	c := auth.CredentialsFromContext(ctx)

	// Add generic auxv entries.
	auxv := append(loaded.auxv, arch.Auxv{
		arch.AuxEntry{linux.AT_UID, hostarch.Addr(c.RealKUID.In(c.UserNamespace).OrOverflow())},
		arch.AuxEntry{linux.AT_EUID, hostarch.Addr(c.EffectiveKUID.In(c.UserNamespace).OrOverflow())},
		arch.AuxEntry{linux.AT_GID, hostarch.Addr(c.RealKGID.In(c.UserNamespace).OrOverflow())},
		arch.AuxEntry{linux.AT_EGID, hostarch.Addr(c.EffectiveKGID.In(c.UserNamespace).OrOverflow())},
		// The conditions that require AT_SECURE = 1 never arise. See
		// kernel.Task.updateCredsForExecLocked.
		arch.AuxEntry{linux.AT_SECURE, 0},
		arch.AuxEntry{linux.AT_CLKTCK, linux.CLOCKS_PER_SEC},
		arch.AuxEntry{linux.AT_EXECFN, execfn},
		arch.AuxEntry{linux.AT_RANDOM, random},
		arch.AuxEntry{linux.AT_PAGESZ, hostarch.PageSize},
		arch.AuxEntry{linux.AT_SYSINFO_EHDR, vdsoAddr},
	}...)
	auxv = append(auxv, extraAuxv...)

	sl, err := stack.Load(newArgv, args.Envv, auxv)
	if err != nil {
		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to load stack: %v", err), syserr.FromError(err).ToLinux())
	}

	m := args.MemoryManager
	m.SetArgvStart(sl.ArgvStart)
	m.SetArgvEnd(sl.ArgvEnd)
	m.SetEnvvStart(sl.EnvvStart)
	m.SetEnvvEnd(sl.EnvvEnd)
	m.SetAuxv(auxv)
	m.SetExecutable(ctx, file)

	symbolValue, err := getSymbolValueFromVDSO("rt_sigreturn")
	if err != nil {
		return 0, nil, "", syserr.NewDynamic(fmt.Sprintf("Failed to find rt_sigreturn in vdso: %v", err), syserr.FromError(err).ToLinux())
	}

	// Found rt_sigreturn.
	addr := uint64(vdsoAddr) + symbolValue - vdsoPrelink
	m.SetVDSOSigReturn(addr)

	ac.SetIP(uintptr(loaded.entry))
	ac.SetStack(uintptr(stack.Bottom))

	name := path.Base(args.Filename)
	if len(name) > linux.TASK_COMM_LEN-1 {
		name = name[:linux.TASK_COMM_LEN-1]
	}

	return loaded.os, ac, name, nil
}
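The header sniffing in loadExecutable reduces to a four-byte comparison. A hedged standalone sketch of that dispatch (sniffExecutableFormat is hypothetical and not part of the package; "\x7fELF" and "#!" are the standard ELF and shebang magics matched by the comparisons above):

func sniffExecutableFormat(hdr [4]uint8) string {
	switch {
	case bytes.Equal(hdr[:], []byte("\x7fELF")):
		return "elf" // Terminal: loadELF handles it directly.
	case bytes.Equal(hdr[:2], []byte("#!")):
		return "script" // parseInterpreterScript rewrites Filename/Argv and the loader loops.
	default:
		return "unknown" // loadExecutable returns ENOEXEC here.
	}
}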
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64 386

package arch

import (
	"fmt"
	"io"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/cpuid"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/arch/fpu"
	rpb "gvisor.dev/gvisor/pkg/sentry/arch/registers_go_proto"
)

// Registers represents the CPU registers for this architecture.
//
// +stateify savable
type Registers struct {
	linux.PtraceRegs
}

// System-related constants for x86.
const (
	// SyscallWidth is the width of syscall, sysenter, and int 80 instructions.
	SyscallWidth = 2
)

// EFLAGS register bits.
const (
	// eflagsCF is the mask for the carry flag.
	eflagsCF = uint64(1) << 0
	// eflagsPF is the mask for the parity flag.
	eflagsPF = uint64(1) << 2
	// eflagsAF is the mask for the auxiliary carry flag.
	eflagsAF = uint64(1) << 4
	// eflagsZF is the mask for the zero flag.
	eflagsZF = uint64(1) << 6
	// eflagsSF is the mask for the sign flag.
	eflagsSF = uint64(1) << 7
	// eflagsTF is the mask for the trap flag.
	eflagsTF = uint64(1) << 8
	// eflagsIF is the mask for the interrupt flag.
	eflagsIF = uint64(1) << 9
	// eflagsDF is the mask for the direction flag.
	eflagsDF = uint64(1) << 10
	// eflagsOF is the mask for the overflow flag.
	eflagsOF = uint64(1) << 11
	// eflagsIOPL is the mask for the I/O privilege level.
	eflagsIOPL = uint64(3) << 12
	// eflagsNT is the mask for the nested task bit.
	eflagsNT = uint64(1) << 14
	// eflagsRF is the mask for the resume flag.
	eflagsRF = uint64(1) << 16
	// eflagsVM is the mask for the virtual mode bit.
	eflagsVM = uint64(1) << 17
	// eflagsAC is the mask for the alignment check / access control bit.
	eflagsAC = uint64(1) << 18
	// eflagsVIF is the mask for the virtual interrupt flag.
	eflagsVIF = uint64(1) << 19
	// eflagsVIP is the mask for the virtual interrupt pending bit.
	eflagsVIP = uint64(1) << 20
	// eflagsID is the mask for the CPUID detection bit.
	eflagsID = uint64(1) << 21

	// eflagsPtraceMutable is the mask for the set of EFLAGS that may be
	// changed by ptrace(PTRACE_SETREGS). eflagsPtraceMutable is analogous to
	// Linux's FLAG_MASK.
	eflagsPtraceMutable = eflagsCF | eflagsPF | eflagsAF | eflagsZF | eflagsSF | eflagsTF | eflagsDF | eflagsOF | eflagsRF | eflagsAC | eflagsNT

	// eflagsRestorable is the mask for the set of EFLAGS that may be changed by
	// SignalReturn. eflagsRestorable is analogous to Linux's FIX_EFLAGS.
	eflagsRestorable = eflagsAC | eflagsOF | eflagsDF | eflagsTF | eflagsSF | eflagsZF | eflagsAF | eflagsPF | eflagsCF | eflagsRF
)

// Segment selectors. See arch/x86/include/asm/segment.h.
const (
	userCS      = 0x33 // guest ring 3 code selector
	user32CS    = 0x23 // guest ring 3 32 bit code selector
	userDS      = 0x2b // guest ring 3 data selector
	_FS_TLS_SEL = 0x63 // Linux FS thread-local storage selector
	_GS_TLS_SEL = 0x6b // Linux GS thread-local storage selector
)

var (
	// TrapInstruction is the x86 trap instruction.
	TrapInstruction = [1]byte{0xcc}

	// CPUIDInstruction is the x86 CPUID instruction.
	CPUIDInstruction = [2]byte{0xf, 0xa2}

	// X86TrapFlag is an exported const for use by other packages.
	X86TrapFlag uint64 = (1 << 8)
)

// Proto returns a protobuf representation of the system registers in State.
func (s State) Proto() *rpb.Registers {
	regs := &rpb.AMD64Registers{
		Rax:     s.Regs.Rax,
		Rbx:     s.Regs.Rbx,
		Rcx:     s.Regs.Rcx,
		Rdx:     s.Regs.Rdx,
		Rsi:     s.Regs.Rsi,
		Rdi:     s.Regs.Rdi,
		Rsp:     s.Regs.Rsp,
		Rbp:     s.Regs.Rbp,
		R8:      s.Regs.R8,
		R9:      s.Regs.R9,
		R10:     s.Regs.R10,
		R11:     s.Regs.R11,
		R12:     s.Regs.R12,
		R13:     s.Regs.R13,
		R14:     s.Regs.R14,
		R15:     s.Regs.R15,
		Rip:     s.Regs.Rip,
		Rflags:  s.Regs.Eflags,
		OrigRax: s.Regs.Orig_rax,
		Cs:      s.Regs.Cs,
		Ds:      s.Regs.Ds,
		Es:      s.Regs.Es,
		Fs:      s.Regs.Fs,
		Gs:      s.Regs.Gs,
		Ss:      s.Regs.Ss,
		FsBase:  s.Regs.Fs_base,
		GsBase:  s.Regs.Gs_base,
	}
	return &rpb.Registers{Arch: &rpb.Registers_Amd64{Amd64: regs}}
}

// Fork creates and returns an identical copy of the state.
func (s *State) Fork() State {
	return State{
		Regs:       s.Regs,
		fpState:    s.fpState.Fork(),
		FeatureSet: s.FeatureSet,
	}
}

// StateData implements Context.StateData.
func (s *State) StateData() *State {
	return s
}

// CPUIDEmulate emulates a cpuid instruction.
func (s *State) CPUIDEmulate(l log.Logger) {
	argax := uint32(s.Regs.Rax)
	argcx := uint32(s.Regs.Rcx)
	ax, bx, cx, dx := s.FeatureSet.EmulateID(argax, argcx)
	s.Regs.Rax = uint64(ax)
	s.Regs.Rbx = uint64(bx)
	s.Regs.Rcx = uint64(cx)
	s.Regs.Rdx = uint64(dx)
	l.Debugf("CPUID(%x,%x): %x %x %x %x", argax, argcx, ax, bx, cx, dx)
}

// SingleStep implements Context.SingleStep.
func (s *State) SingleStep() bool {
	return s.Regs.Eflags&X86TrapFlag != 0
}

// SetSingleStep enables single stepping.
func (s *State) SetSingleStep() {
	// Set the trap flag.
	s.Regs.Eflags |= X86TrapFlag
}

// ClearSingleStep disables single stepping.
func (s *State) ClearSingleStep() {
	// Clear the trap flag.
s.Regs.Eflags &= ^X86TrapFlag } // RegisterMap returns a map of all registers. func (s *State) RegisterMap() (map[string]uintptr, error) { return map[string]uintptr{ "R15": uintptr(s.Regs.R15), "R14": uintptr(s.Regs.R14), "R13": uintptr(s.Regs.R13), "R12": uintptr(s.Regs.R12), "Rbp": uintptr(s.Regs.Rbp), "Rbx": uintptr(s.Regs.Rbx), "R11": uintptr(s.Regs.R11), "R10": uintptr(s.Regs.R10), "R9": uintptr(s.Regs.R9), "R8": uintptr(s.Regs.R8), "Rax": uintptr(s.Regs.Rax), "Rcx": uintptr(s.Regs.Rcx), "Rdx": uintptr(s.Regs.Rdx), "Rsi": uintptr(s.Regs.Rsi), "Rdi": uintptr(s.Regs.Rdi), "Orig_rax": uintptr(s.Regs.Orig_rax), "Rip": uintptr(s.Regs.Rip), "Cs": uintptr(s.Regs.Cs), "Eflags": uintptr(s.Regs.Eflags), "Rsp": uintptr(s.Regs.Rsp), "Ss": uintptr(s.Regs.Ss), "Fs_base": uintptr(s.Regs.Fs_base), "Gs_base": uintptr(s.Regs.Gs_base), "Ds": uintptr(s.Regs.Ds), "Es": uintptr(s.Regs.Es), "Fs": uintptr(s.Regs.Fs), "Gs": uintptr(s.Regs.Gs), }, nil } // PtraceGetRegs implements Context.PtraceGetRegs. func (s *State) PtraceGetRegs(dst io.Writer) (int, error) { regs := s.ptraceGetRegs() n, err := regs.WriteTo(dst) return int(n), err } func (s *State) ptraceGetRegs() Registers { regs := s.Regs // These may not be initialized. if regs.Cs == 0 || regs.Ss == 0 || regs.Eflags == 0 { regs.Eflags = eflagsIF regs.Cs = userCS regs.Ss = userDS } // As an optimization, Linux <4.7 implements 32-bit fs_base/gs_base // addresses using reserved descriptors in the GDT instead of the MSRs, // with selector values FS_TLS_SEL and GS_TLS_SEL respectively. These // values are actually visible in struct user_regs_struct::fs/gs; // arch/x86/kernel/ptrace.c:getreg() doesn't attempt to sanitize struct // thread_struct::fsindex/gsindex. // // We always use fs == gs == 0 when fs_base/gs_base is in use, for // simplicity. // // Luckily, Linux <4.7 silently ignores setting fs/gs to 0 via // arch/x86/kernel/ptrace.c:set_segment_reg() when fs_base/gs_base is a // 32-bit value and fsindex/gsindex indicates that this optimization is // in use, as well as the reverse case of setting fs/gs to // FS/GS_TLS_SEL when fs_base/gs_base is a 64-bit value. (We do the // same in PtraceSetRegs.) // // TODO(gvisor.dev/issue/168): Remove this fixup since newer Linux // doesn't have this behavior anymore. if regs.Fs == 0 && regs.Fs_base <= 0xffffffff { regs.Fs = _FS_TLS_SEL } if regs.Gs == 0 && regs.Gs_base <= 0xffffffff { regs.Gs = _GS_TLS_SEL } return regs } var ptraceRegistersSize = (*linux.PtraceRegs)(nil).SizeBytes() // PtraceSetRegs implements Context.PtraceSetRegs. func (s *State) PtraceSetRegs(src io.Reader) (int, error) { var regs Registers buf := make([]byte, ptraceRegistersSize) if _, err := io.ReadFull(src, buf); err != nil { return 0, err } regs.UnmarshalUnsafe(buf) // Truncate segment registers to 16 bits. regs.Cs = uint64(uint16(regs.Cs)) regs.Ds = uint64(uint16(regs.Ds)) regs.Es = uint64(uint16(regs.Es)) regs.Fs = uint64(uint16(regs.Fs)) regs.Gs = uint64(uint16(regs.Gs)) regs.Ss = uint64(uint16(regs.Ss)) // In Linux this validation is via arch/x86/kernel/ptrace.c:putreg(). 
if !isUserSegmentSelector(regs.Cs) { return 0, unix.EIO } if regs.Ds != 0 && !isUserSegmentSelector(regs.Ds) { return 0, unix.EIO } if regs.Es != 0 && !isUserSegmentSelector(regs.Es) { return 0, unix.EIO } if regs.Fs != 0 && !isUserSegmentSelector(regs.Fs) { return 0, unix.EIO } if regs.Gs != 0 && !isUserSegmentSelector(regs.Gs) { return 0, unix.EIO } if !isUserSegmentSelector(regs.Ss) { return 0, unix.EIO } if !isValidSegmentBase(regs.Fs_base) { return 0, unix.EIO } if !isValidSegmentBase(regs.Gs_base) { return 0, unix.EIO } // CS and SS are validated, but changes to them are otherwise silently // ignored on amd64. regs.Cs = s.Regs.Cs regs.Ss = s.Regs.Ss // fs_base/gs_base changes reset fs/gs via do_arch_prctl() on Linux. if regs.Fs_base != s.Regs.Fs_base { regs.Fs = 0 } if regs.Gs_base != s.Regs.Gs_base { regs.Gs = 0 } // Ignore "stale" TLS segment selectors for FS and GS. See comment in // ptraceGetRegs. if regs.Fs == _FS_TLS_SEL && regs.Fs_base != 0 { regs.Fs = 0 } if regs.Gs == _GS_TLS_SEL && regs.Gs_base != 0 { regs.Gs = 0 } regs.Eflags = (s.Regs.Eflags &^ eflagsPtraceMutable) | (regs.Eflags & eflagsPtraceMutable) s.Regs = regs return ptraceRegistersSize, nil } // isUserSegmentSelector returns true if the given segment selector specifies a // privilege level of 3 (USER_RPL). func isUserSegmentSelector(reg uint64) bool { return reg&3 == 3 } // isValidSegmentBase returns true if the given segment base specifies a // canonical user address. func isValidSegmentBase(reg uint64) bool { return reg < uint64(maxAddr64) } // Register sets defined in include/uapi/linux/elf.h. const ( _NT_PRSTATUS = 1 _NT_PRFPREG = 2 _NT_X86_XSTATE = 0x202 ) // PtraceGetRegSet implements Context.PtraceGetRegSet. func (s *State) PtraceGetRegSet(regset uintptr, dst io.Writer, maxlen int) (int, error) { switch regset { case _NT_PRSTATUS: if maxlen < ptraceRegistersSize { return 0, linuxerr.EFAULT } return s.PtraceGetRegs(dst) case _NT_PRFPREG: return s.fpState.PtraceGetFPRegs(dst, maxlen) case _NT_X86_XSTATE: return s.fpState.PtraceGetXstateRegs(dst, maxlen, s.FeatureSet) default: return 0, linuxerr.EINVAL } } // PtraceSetRegSet implements Context.PtraceSetRegSet. func (s *State) PtraceSetRegSet(regset uintptr, src io.Reader, maxlen int) (int, error) { switch regset { case _NT_PRSTATUS: if maxlen < ptraceRegistersSize { return 0, linuxerr.EFAULT } return s.PtraceSetRegs(src) case _NT_PRFPREG: return s.fpState.PtraceSetFPRegs(src, maxlen) case _NT_X86_XSTATE: return s.fpState.PtraceSetXstateRegs(src, maxlen, s.FeatureSet) default: return 0, linuxerr.EINVAL } } // FullRestore indicates whether a full restore is required. func (s *State) FullRestore() bool { // A fast system call return is possible only if // // * RCX matches the instruction pointer. // * R11 matches our flags value. // * Usermode does not expect to set either the resume flag or the // virtual mode flags (unlikely.) // * CS and SS are set to the standard selectors. // // That is, SYSRET results in the correct final state. fastRestore := s.Regs.Rcx == s.Regs.Rip && s.Regs.Eflags == s.Regs.R11 && (s.Regs.Eflags&eflagsRF == 0) && (s.Regs.Eflags&eflagsVM == 0) && s.Regs.Cs == userCS && s.Regs.Ss == userDS return !fastRestore } // New returns a new architecture context. func New(arch Arch, fs *cpuid.FeatureSet) Context { switch arch { case AMD64: return &context64{ State{ fpState: fpu.NewState(), FeatureSet: fs, }, []fpu.State(nil), } } panic(fmt.Sprintf("unknown architecture %v", arch)) }
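As a quick illustration of isUserSegmentSelector above: the low two bits of a selector are its requested privilege level (RPL), and every user-mode selector defined earlier in this file carries RPL 3. A hedged sketch (exampleSelectorRPL is not part of the package):

func exampleSelectorRPL() {
	// userCS = 0x33, user32CS = 0x23, userDS = 0x2b all end in binary ...11,
	// so sel&3 == 3 and isUserSegmentSelector returns true for each of them.
	for _, sel := range []uint64{userCS, user32CS, userDS} {
		fmt.Printf("%#x: RPL=%d user=%t\n", sel, sel&3, isUserSegmentSelector(sel))
	}
}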
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"io"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/metric"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
)

var (
	partialResultOnce sync.Once
)

// incrementPartialResultMetric increments PartialResultMetric by calling
// Increment(). It exists because sync.Once.Do (called below) requires a
// function that takes no arguments, whereas Increment() takes a variadic
// number of arguments.
func incrementPartialResultMetric() {
	metric.WeirdnessMetric.Increment("partial_result")
}

// HandleIOErrorVFS2 handles special error cases for partial results. For some
// errors, we may consume the error and return only the partial read/write.
//
// op and f are used only for panics.
func HandleIOErrorVFS2(ctx context.Context, partialResult bool, ioerr, intr error, op string, f *vfs.FileDescription) error {
	known, err := handleIOErrorImpl(ctx, partialResult, ioerr, intr, op)
	if err != nil {
		return err
	}
	if !known {
		// An unknown error is encountered with a partial read/write.
		fs := f.Mount().Filesystem().VirtualFilesystem()
		root := vfs.RootFromContext(ctx)
		name, _ := fs.PathnameWithDeleted(ctx, root, f.VirtualDentry())
		log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q", partialResult, ioerr, ioerr, op, name)
		partialResultOnce.Do(incrementPartialResultMetric)
	}
	return nil
}

// handleIOError handles special error cases for partial results. For some
// errors, we may consume the error and return only the partial read/write.
//
// op and f are used only for panics.
func handleIOError(ctx context.Context, partialResult bool, ioerr, intr error, op string, f *fs.File) error {
	known, err := handleIOErrorImpl(ctx, partialResult, ioerr, intr, op)
	if err != nil {
		return err
	}
	if !known {
		// An unknown error is encountered with a partial read/write.
		name, _ := f.Dirent.FullName(nil /* ignore chroot */)
		log.Traceback("Invalid request partialResult %v and err (type %T) %v for %s operation on %q, %T", partialResult, ioerr, ioerr, op, name, f.FileOperations)
		partialResultOnce.Do(incrementPartialResultMetric)
	}
	return nil
}

// handleIOErrorImpl handles special error cases for partial results. For some
// errors, we may consume the error and return only the partial read/write.
//
// Returns false if the error is unknown.
func handleIOErrorImpl(ctx context.Context, partialResult bool, errOrig, intr error, op string) (bool, error) {
	if errOrig == nil {
		// Typical successful syscall.
		return true, nil
	}

	// Translate the error, if possible, to consolidate errors from other
	// packages into a smaller set of errors from the syserror package.
	translatedErr := errOrig
	if errno, ok := syserror.TranslateError(errOrig); ok {
		translatedErr = errno
	}
	switch {
	case translatedErr == io.EOF:
		// EOF is always consumed. If this is a partial read/write
		// (result != 0), the application will see that, otherwise
		// they will see 0.
		return true, nil
	case linuxerr.Equals(linuxerr.EFBIG, translatedErr):
		t := kernel.TaskFromContext(ctx)
		if t == nil {
			panic("I/O error should only occur from a context associated with a Task")
		}
		// Ignore partialResult because this error only applies to
		// normal files, and for those files we cannot accumulate
		// write results.
		//
		// Do not consume the error and return it as EFBIG.
		// Simultaneously send a SIGXFSZ per setrlimit(2).
		t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t))
		return true, linuxerr.EFBIG
	case linuxerr.Equals(linuxerr.EINTR, translatedErr):
		// The syscall was interrupted. Return nil if it completed
		// partially, otherwise return the error code that the syscall
		// needs (to indicate to the kernel what it should do).
		if partialResult {
			return true, nil
		}
		return true, intr
	}

	if !partialResult {
		// Typical syscall error.
		return true, errOrig
	}

	switch {
	case linuxerr.Equals(linuxerr.EINTR, translatedErr):
		// Syscall interrupted, but completed a partial
		// read/write. Like ErrWouldBlock, since we have a
		// partial read/write, we consume the error and return
		// the partial result.
		return true, nil
	case linuxerr.Equals(linuxerr.EFAULT, translatedErr):
		// EFAULT is only shown to the user if nothing was
		// read/written. If we read something (this case), they see
		// a partial read/write. They will then presumably try again
		// with an incremented buffer, which will EFAULT with
		// result == 0.
		return true, nil
	case linuxerr.Equals(linuxerr.EPIPE, translatedErr):
		// Writes to a pipe or socket will return EPIPE if the other
		// side is gone. The partial write is returned. EPIPE will be
		// returned on the next call.
		//
		// TODO(gvisor.dev/issue/161): In some cases SIGPIPE should
		// also be sent to the application.
		return true, nil
	case linuxerr.Equals(linuxerr.ENOSPC, translatedErr):
		// Similar to EPIPE. Return what we wrote this time, and let
		// ENOSPC be returned on the next call.
		return true, nil
	case linuxerr.Equals(linuxerr.ECONNRESET, translatedErr):
		fallthrough
	case linuxerr.Equals(linuxerr.ETIMEDOUT, translatedErr):
		// For TCP sendfile connections, we may have a reset or timeout. But
		// we should just return n as the result.
		return true, nil
	case linuxerr.Equals(linuxerr.EWOULDBLOCK, translatedErr):
		// Syscall would block, but completed a partial read/write.
		// This case should only be returned by IssueIO for nonblocking
		// files. Since we have a partial read/write, we consume
		// ErrWouldBlock, returning the partial result.
return true, nil } switch errOrig.(type) { case syserror.SyscallRestartErrno: // Identical to the EINTR case. return true, nil } // Error is unknown and cannot be properly handled. return false, nil }
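The decision table above can be summarized: before any bytes transfer, errors pass through (with special cases for EOF, EFBIG, and EINTR); after a partial transfer, retryable errors are consumed so the caller sees the short count now and the error, if still relevant, on the next call. A hedged, deliberately simplified sketch of that second half (examplePartialResult is hypothetical, not part of the package):

func examplePartialResult(n int, err error) (int, error) {
	if err == nil || n == 0 {
		return n, err // No partial result to protect.
	}
	switch {
	case linuxerr.Equals(linuxerr.EINTR, err),
		linuxerr.Equals(linuxerr.EWOULDBLOCK, err),
		linuxerr.Equals(linuxerr.EPIPE, err),
		linuxerr.Equals(linuxerr.ENOSPC, err):
		// Swallow the error: the caller sees the short count, and the error
		// resurfaces on the next call if the condition persists.
		return n, nil
	}
	return n, err
}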
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package flipcall implements a protocol providing Fast Local Interprocess
// Procedure Calls between mutually-distrusting processes.
package flipcall

import (
	"fmt"
	"math"
	"sync/atomic"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/memutil"
)

// An Endpoint provides the ability to synchronously transfer data and control
// to a connected peer Endpoint, which may be in another process.
//
// Since the Endpoint control transfer model is synchronous, at any given time
// one Endpoint "has control" (designated the active Endpoint), and the other
// is "waiting for control" (designated the inactive Endpoint). Users of the
// flipcall package designate one Endpoint as the client, which is initially
// active, and the other as the server, which is initially inactive. See
// flipcall_example_test.go for usage.
type Endpoint struct {
	// packet is a pointer to the beginning of the packet window. (Since this
	// is a raw OS memory mapping and not a Go object, it does not need to be
	// represented as an unsafe.Pointer.) packet is immutable.
	packet uintptr

	// dataCap is the size of the datagram part of the packet window in bytes.
	// dataCap is immutable.
	dataCap uint32

	// activeState is csClientActive if this is a client Endpoint and
	// csServerActive if this is a server Endpoint.
	activeState uint32

	// inactiveState is csServerActive if this is a client Endpoint and
	// csClientActive if this is a server Endpoint.
	inactiveState uint32

	// shutdown is non-zero if Endpoint.Shutdown() has been called, or if the
	// Endpoint has acknowledged shutdown initiated by the peer. shutdown is
	// accessed using atomic memory operations.
	shutdown uint32

	ctrl endpointControlImpl
}

// EndpointSide indicates which side of a connection an Endpoint belongs to.
type EndpointSide int

const (
	// ClientSide indicates that an Endpoint is a client (initially-active;
	// first method call should be Connect).
ClientSide EndpointSide = iota // ServerSide indicates that an Endpoint is a server (initially-inactive; // first method call should be RecvFirst.) ServerSide ) // Init must be called on zero-value Endpoints before first use. If it // succeeds, ep.Destroy() must be called once the Endpoint is no longer in use. // // pwd represents the packet window used to exchange data with the peer // Endpoint. FD may differ between Endpoints if they are in different // processes, but must represent the same file. The packet window must // initially be filled with zero bytes. func (ep *Endpoint) Init(side EndpointSide, pwd PacketWindowDescriptor, opts ...EndpointOption) error { switch side { case ClientSide: ep.activeState = csClientActive ep.inactiveState = csServerActive case ServerSide: ep.activeState = csServerActive ep.inactiveState = csClientActive default: return fmt.Errorf("invalid EndpointSide: %v", side) } if pwd.Length < pageSize { return fmt.Errorf("packet window size (%d) less than minimum (%d)", pwd.Length, pageSize) } if pwd.Length > math.MaxUint32 { return fmt.Errorf("packet window size (%d) exceeds maximum (%d)", pwd.Length, math.MaxUint32) } m, err := memutil.MapFile(0, uintptr(pwd.Length), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED, uintptr(pwd.FD), uintptr(pwd.Offset)) if err != nil { return fmt.Errorf("failed to mmap packet window: %v", err) } ep.packet = m ep.dataCap = uint32(pwd.Length) - uint32(PacketHeaderBytes) if err := ep.ctrlInit(opts...); err != nil { ep.unmapPacket() return err } return nil } // NewEndpoint is a convenience function that returns an initialized Endpoint // allocated on the heap. func NewEndpoint(side EndpointSide, pwd PacketWindowDescriptor, opts ...EndpointOption) (*Endpoint, error) { var ep Endpoint if err := ep.Init(side, pwd, opts...); err != nil { return nil, err } return &ep, nil } // An EndpointOption configures an Endpoint. type EndpointOption interface { isEndpointOption() } // Destroy releases resources owned by ep. No other Endpoint methods may be // called after Destroy. func (ep *Endpoint) Destroy() { ep.unmapPacket() } func (ep *Endpoint) unmapPacket() { unix.RawSyscall(unix.SYS_MUNMAP, ep.packet, uintptr(ep.dataCap)+PacketHeaderBytes, 0) ep.packet = 0 } // Shutdown causes concurrent and future calls to ep.Connect(), ep.SendRecv(), // ep.RecvFirst(), and ep.SendLast(), as well as the same calls in the peer // Endpoint, to unblock and return ShutdownErrors. It does not wait for // concurrent calls to return. Successive calls to Shutdown have no effect. // // Shutdown is the only Endpoint method that may be called concurrently with // other methods on the same Endpoint. func (ep *Endpoint) Shutdown() { if atomic.SwapUint32(&ep.shutdown, 1) != 0 { // ep.Shutdown() has previously been called. return } ep.ctrlShutdown() } // isShutdownLocally returns true if ep.Shutdown() has been called. func (ep *Endpoint) isShutdownLocally() bool { return atomic.LoadUint32(&ep.shutdown) != 0 } // ShutdownError is returned by most Endpoint methods after Endpoint.Shutdown() // has been called. type ShutdownError struct{} // Error implements error.Error. func (ShutdownError) Error() string { return "flipcall connection shutdown" } // DataCap returns the maximum datagram size supported by ep. Equivalently, // DataCap returns len(ep.Data()). func (ep *Endpoint) DataCap() uint32 { return ep.dataCap } // Connection state. const ( // The client is, by definition, initially active, so this must be 0. 
csClientActive = 0 csServerActive = 1 csShutdown = 2 ) // Connect blocks until the peer Endpoint has called Endpoint.RecvFirst(). // // Preconditions: // * ep is a client Endpoint. // * ep.Connect(), ep.RecvFirst(), ep.SendRecv(), and ep.SendLast() have never // been called. func (ep *Endpoint) Connect() error { err := ep.ctrlConnect() if err == nil { raceBecomeActive() } return err } // RecvFirst blocks until the peer Endpoint calls Endpoint.SendRecv(), then // returns the datagram length specified by that call. // // Preconditions: // * ep is a server Endpoint. // * ep.SendRecv(), ep.RecvFirst(), and ep.SendLast() have never been called. func (ep *Endpoint) RecvFirst() (uint32, error) { if err := ep.ctrlWaitFirst(); err != nil { return 0, err } raceBecomeActive() recvDataLen := atomic.LoadUint32(ep.dataLen()) if recvDataLen > ep.dataCap { return 0, fmt.Errorf("received packet with invalid datagram length %d (maximum %d)", recvDataLen, ep.dataCap) } return recvDataLen, nil } // SendRecv transfers control to the peer Endpoint, causing its call to // Endpoint.SendRecv() or Endpoint.RecvFirst() to return with the given // datagram length, then blocks until the peer Endpoint calls // Endpoint.SendRecv() or Endpoint.SendLast(). // // Preconditions: // * dataLen <= ep.DataCap(). // * No previous call to ep.SendRecv() or ep.RecvFirst() has returned an error. // * ep.SendLast() has never been called. // * If ep is a client Endpoint, ep.Connect() has previously been called and // returned nil. func (ep *Endpoint) SendRecv(dataLen uint32) (uint32, error) { if dataLen > ep.dataCap { panic(fmt.Sprintf("attempting to send packet with datagram length %d (maximum %d)", dataLen, ep.dataCap)) } // This store can safely be non-atomic: Under correct operation we should // be the only thread writing ep.dataLen(), and ep.ctrlRoundTrip() will // synchronize with the receiver. We will not read from ep.dataLen() until // after ep.ctrlRoundTrip(), so if the peer is mutating it concurrently then // they can only shoot themselves in the foot. *ep.dataLen() = dataLen raceBecomeInactive() if err := ep.ctrlRoundTrip(); err != nil { return 0, err } raceBecomeActive() recvDataLen := atomic.LoadUint32(ep.dataLen()) if recvDataLen > ep.dataCap { return 0, fmt.Errorf("received packet with invalid datagram length %d (maximum %d)", recvDataLen, ep.dataCap) } return recvDataLen, nil } // SendLast causes the peer Endpoint's call to Endpoint.SendRecv() or // Endpoint.RecvFirst() to return with the given datagram length. // // Preconditions: // * dataLen <= ep.DataCap(). // * No previous call to ep.SendRecv() or ep.RecvFirst() has returned an error. // * ep.SendLast() has never been called. // * If ep is a client Endpoint, ep.Connect() has previously been called and // returned nil. func (ep *Endpoint) SendLast(dataLen uint32) error { if dataLen > ep.dataCap { panic(fmt.Sprintf("attempting to send packet with datagram length %d (maximum %d)", dataLen, ep.dataCap)) } *ep.dataLen() = dataLen raceBecomeInactive() if err := ep.ctrlWakeLast(); err != nil { return err } return nil }
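A hedged sketch of the client/server round-trip protocol documented above (endpoint construction and the hypothetical handle function are elided; the package's flipcall_example_test.go has the authoritative version). Data() is assumed, per the DataCap doc above, to return the datagram part of the packet window:

// exampleClientCall: Connect once, then one SendRecv per request.
func exampleClientCall(client *Endpoint, req []byte) ([]byte, error) {
	if err := client.Connect(); err != nil {
		return nil, err
	}
	n := copy(client.Data(), req)
	respLen, err := client.SendRecv(uint32(n))
	if err != nil {
		return nil, err
	}
	return client.Data()[:respLen], nil
}

// exampleServerLoop: RecvFirst for the first request, SendRecv thereafter.
func exampleServerLoop(server *Endpoint, handle func([]byte) []byte) error {
	reqLen, err := server.RecvFirst()
	for err == nil {
		resp := handle(server.Data()[:reqLen])
		n := copy(server.Data(), resp)
		reqLen, err = server.SendRecv(uint32(n))
	}
	return err
}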
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/sync"
)

// A ThreadGroup is a logical grouping of tasks that has widespread
// significance to other kernel features (e.g. signal handling). ("Thread
// groups" are usually called "processes" in userspace documentation.)
//
// ThreadGroup is a superset of Linux's struct signal_struct.
//
// +stateify savable
type ThreadGroup struct {
	threadGroupNode

	// signalHandlers is the set of signal handlers used by every task in
	// this thread group. (signalHandlers may also be shared with other
	// thread groups.)
	//
	// signalHandlers.mu (hereafter "the signal mutex") protects state
	// related to signal handling, as well as state that usually needs to be
	// atomic with signal handling, for all ThreadGroups and Tasks using
	// signalHandlers. (This is analogous to Linux's use of struct
	// sighand_struct::siglock.)
	//
	// The signalHandlers pointer can only be mutated during an execve
	// (Task.finishExec). Consequently, when it's possible for a task in the
	// thread group to be completing an execve, signalHandlers is protected
	// by the owning TaskSet.mu. Otherwise, it is possible to read the
	// signalHandlers pointer without synchronization. In particular,
	// completing an execve requires that all other tasks in the thread
	// group have exited, so task goroutines do not need the owning
	// TaskSet.mu to read the signalHandlers pointer of their thread groups.
	signalHandlers *SignalHandlers

	// pendingSignals is the set of pending signals that may be handled by
	// any task in this thread group.
	//
	// pendingSignals is protected by the signal mutex.
	pendingSignals pendingSignals

	// If groupStopDequeued is true, a task in the thread group has dequeued
	// a stop signal, but has not yet initiated the group stop.
	//
	// groupStopDequeued is analogous to Linux's JOBCTL_STOP_DEQUEUED.
	//
	// groupStopDequeued is protected by the signal mutex.
	groupStopDequeued bool

	// groupStopSignal is the signal that caused a group stop to be
	// initiated.
	//
	// groupStopSignal is protected by the signal mutex.
	groupStopSignal linux.Signal

	// groupStopPendingCount is the number of active tasks in the thread
	// group for which Task.groupStopPending is set.
	//
	// groupStopPendingCount is analogous to Linux's
	// signal_struct::group_stop_count.
	//
	// groupStopPendingCount is protected by the signal mutex.
	groupStopPendingCount int

	// If groupStopComplete is true, groupStopPendingCount transitioned from
	// non-zero to zero without an intervening SIGCONT.
	//
	// groupStopComplete is analogous to Linux's SIGNAL_STOP_STOPPED.
	//
	// groupStopComplete is protected by the signal mutex.
	groupStopComplete bool

	// If groupStopWaitable is true, the thread group is indicating a
	// waitable group stop event (as defined by EventChildGroupStop).
	//
	// Linux represents the analogous state as SIGNAL_STOP_STOPPED being set
	// and group_exit_code being non-zero.
	//
	// groupStopWaitable is protected by the signal mutex.
	groupStopWaitable bool

	// If groupContNotify is true, then a SIGCONT has recently ended a group
	// stop on this thread group, and the first task to observe it should
	// notify its parent. groupContInterrupted is true iff SIGCONT ended an
	// incomplete group stop. If groupContNotify is false,
	// groupContInterrupted is meaningless.
	//
	// Analogues in Linux:
	//
	// - groupContNotify && groupContInterrupted is represented by
	//   SIGNAL_CLD_STOPPED.
	//
	// - groupContNotify && !groupContInterrupted is represented by
	//   SIGNAL_CLD_CONTINUED.
	//
	// - !groupContNotify is represented by neither flag being set.
	//
	// groupContNotify and groupContInterrupted are protected by the signal
	// mutex.
	groupContNotify      bool
	groupContInterrupted bool

	// If groupContWaitable is true, the thread group is indicating a
	// waitable continue event (as defined by EventGroupContinue).
	//
	// groupContWaitable is analogous to Linux's SIGNAL_STOP_CONTINUED.
	//
	// groupContWaitable is protected by the signal mutex.
	groupContWaitable bool

	// exiting is true if all tasks in the ThreadGroup should exit. exiting
	// is analogous to Linux's SIGNAL_GROUP_EXIT.
	//
	// exiting is protected by the signal mutex. exiting can only transition
	// from false to true.
	exiting bool

	// exitStatus is the thread group's exit status.
	//
	// While exiting is false, exitStatus is protected by the signal mutex.
	// When exiting becomes true, exitStatus becomes immutable.
	exitStatus linux.WaitStatus

	// terminationSignal is the signal that this thread group's leader will
	// send to its parent when it exits.
	//
	// terminationSignal is protected by the TaskSet mutex.
	terminationSignal linux.Signal

	// liveGoroutines is the number of non-exited task goroutines in the
	// thread group.
	//
	// liveGoroutines is not saved; it is reset as task goroutines are
	// restarted by Task.Start.
	liveGoroutines sync.WaitGroup `state:"nosave"`

	timerMu sync.Mutex `state:"nosave"`

	// itimerRealTimer implements ITIMER_REAL for the thread group.
	itimerRealTimer *ktime.Timer

	// itimerVirtSetting is the ITIMER_VIRTUAL setting for the thread group.
	//
	// itimerVirtSetting is protected by the signal mutex.
	itimerVirtSetting ktime.Setting

	// itimerProfSetting is the ITIMER_PROF setting for the thread group.
	//
	// itimerProfSetting is protected by the signal mutex.
	itimerProfSetting ktime.Setting

	// rlimitCPUSoftSetting is the setting for RLIMIT_CPU soft limit
	// notifications for the thread group.
	//
	// rlimitCPUSoftSetting is protected by the signal mutex.
	rlimitCPUSoftSetting ktime.Setting

	// cpuTimersEnabled is non-zero if itimerVirtSetting.Enabled is true,
	// itimerProfSetting.Enabled is true, rlimitCPUSoftSetting.Enabled is
	// true, or limits.Get(CPU) is finite.
	//
	// cpuTimersEnabled is protected by the signal mutex. cpuTimersEnabled
	// is accessed using atomic memory operations.
	cpuTimersEnabled uint32

	// timers is the thread group's POSIX interval timers. nextTimerID is
	// the TimerID at which allocation should begin searching for an unused
	// ID.
	//
	// timers and nextTimerID are protected by timerMu.
	timers      map[linux.TimerID]*IntervalTimer
	nextTimerID linux.TimerID

	// exitedCPUStats is the CPU usage for all exited tasks in the thread
	// group. exitedCPUStats is protected by the TaskSet mutex.
	exitedCPUStats usage.CPUStats

	// childCPUStats is the CPU usage of all joined descendants of this
	// thread group. childCPUStats is protected by the TaskSet mutex.
	childCPUStats usage.CPUStats

	// ioUsage is the I/O usage for all exited tasks in the thread group.
	// The ioUsage pointer is immutable.
	ioUsage *usage.IO

	// maxRSS is the historical maximum resident set size of the thread
	// group, updated when:
	//
	// - A task in the thread group exits, since after all tasks have exited
	//   the MemoryManager is no longer reachable.
	//
	// - The thread group completes an execve, since this changes
	//   MemoryManagers.
	//
	// maxRSS is protected by the TaskSet mutex.
	maxRSS uint64

	// childMaxRSS is the maximum resident set size in bytes of all joined
	// descendants of this thread group.
	//
	// childMaxRSS is protected by the TaskSet mutex.
	childMaxRSS uint64

	// Resource limits for this ThreadGroup. The limits pointer is immutable.
	limits *limits.LimitSet

	// processGroup is the processGroup for this thread group.
	//
	// processGroup is protected by the TaskSet mutex.
	processGroup *ProcessGroup

	// execed indicates an exec has occurred since creation. This will be
	// set by finishExec, and new ThreadGroups will have this field cleared.
	// When execed is set, the processGroup may no longer be changed.
	//
	// execed is protected by the TaskSet mutex.
	execed bool

	// oldRSeqCritical is the thread group's old rseq critical region.
	oldRSeqCritical atomic.Value `state:".(*OldRSeqCriticalRegion)"`

	// mounts is the thread group's mount namespace. This does not really
	// correspond to a "mount namespace" in Linux, but is more like a
	// complete VFS that need not be shared between processes. See the
	// comment in mounts.go for more information.
	//
	// mounts is immutable.
	mounts *fs.MountNamespace

	// tty is the thread group's controlling terminal. If nil, there is no
	// controlling terminal.
	//
	// tty is protected by the signal mutex.
	tty *TTY

	// oomScoreAdj is the thread group's OOM score adjustment. This is
	// currently not used but is maintained for consistency.
	// TODO(gvisor.dev/issue/1967)
	//
	// oomScoreAdj is accessed using atomic memory operations.
	oomScoreAdj int32
}

// NewThreadGroup returns a new, empty thread group in PID namespace pidns.
// The thread group leader will send its parent terminationSignal when it
// exits. The new thread group isn't visible to the system until a task has
// been created inside of it by a successful call to TaskSet.NewTask.
func (k *Kernel) NewThreadGroup(mntns *fs.MountNamespace, pidns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet) *ThreadGroup {
	tg := &ThreadGroup{
		threadGroupNode: threadGroupNode{
			pidns: pidns,
		},
		signalHandlers:    sh,
		terminationSignal: terminationSignal,
		ioUsage:           &usage.IO{},
		limits:            limits,
		mounts:            mntns,
	}
	tg.itimerRealTimer = ktime.NewTimer(k.timekeeper.monotonicClock, &itimerRealListener{tg: tg})
	tg.timers = make(map[linux.TimerID]*IntervalTimer)
	tg.oldRSeqCritical.Store(&OldRSeqCriticalRegion{})
	return tg
}

// saveOldRSeqCritical is invoked by stateify.
func (tg *ThreadGroup) saveOldRSeqCritical() *OldRSeqCriticalRegion {
	return tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
}

// loadOldRSeqCritical is invoked by stateify.
func (tg *ThreadGroup) loadOldRSeqCritical(r *OldRSeqCriticalRegion) {
	tg.oldRSeqCritical.Store(r)
}

// SignalHandlers returns the signal handlers used by tg.
//
// Preconditions: The caller must provide the synchronization required to
// read tg.signalHandlers, as described in the field's comment.
func (tg *ThreadGroup) SignalHandlers() *SignalHandlers {
	return tg.signalHandlers
}

// Limits returns tg's limits.
func (tg *ThreadGroup) Limits() *limits.LimitSet {
	return tg.limits
}

// Release releases the thread group's resources.
func (tg *ThreadGroup) Release(ctx context.Context) {
	// Timers must be destroyed without holding the TaskSet or signal
	// mutexes since timers send signals with Timer.mu locked.
	tg.itimerRealTimer.Destroy()
	var its []*IntervalTimer
	tg.pidns.owner.mu.Lock()
	tg.signalHandlers.mu.Lock()
	for _, it := range tg.timers {
		its = append(its, it)
	}
	tg.timers = make(map[linux.TimerID]*IntervalTimer) // nil maps can't be saved
	tg.signalHandlers.mu.Unlock()
	tg.pidns.owner.mu.Unlock()
	for _, it := range its {
		it.DestroyTimer()
	}
	if tg.mounts != nil {
		tg.mounts.DecRef(ctx)
	}
}

// forEachChildThreadGroupLocked iterates over all child ThreadGroups.
//
// Precondition: TaskSet.mu must be held.
func (tg *ThreadGroup) forEachChildThreadGroupLocked(fn func(*ThreadGroup)) {
	for t := tg.tasks.Front(); t != nil; t = t.Next() {
		for child := range t.children {
			if child == child.tg.leader {
				fn(child.tg)
			}
		}
	}
}

// SetControllingTTY sets tty as the controlling terminal of tg.
func (tg *ThreadGroup) SetControllingTTY(tty *TTY, steal bool, isReadable bool) error {
	tty.mu.Lock()
	defer tty.mu.Unlock()

	// We might be asked to set the controlling terminal of multiple
	// processes, so we lock both the TaskSet and SignalHandlers.
	tg.pidns.owner.mu.Lock()
	defer tg.pidns.owner.mu.Unlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()

	// "The calling process must be a session leader and not have a
	// controlling terminal already." - tty_ioctl(4)
	if tg.processGroup.session.leader != tg || tg.tty != nil {
		return linuxerr.EINVAL
	}

	creds := auth.CredentialsFromContext(tg.leader)
	hasAdmin := creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root())

	// "If this terminal is already the controlling terminal of a different
	// session group, then the ioctl fails with EPERM, unless the caller has
	// the CAP_SYS_ADMIN capability and arg equals 1, in which case the
	// terminal is stolen, and all processes that had it as controlling
	// terminal lose it." - tty_ioctl(4)
	if tty.tg != nil && tg.processGroup.session != tty.tg.processGroup.session {
		// Stealing requires CAP_SYS_ADMIN in the root user namespace.
		if !hasAdmin || !steal {
			return linuxerr.EPERM
		}
		// Steal the TTY away. Unlike TIOCNOTTY, don't send signals.
		for othertg := range tg.pidns.owner.Root.tgids {
			// This won't deadlock by locking tg.signalHandlers because at
			// this point:
			// - We only lock signalHandlers if it's in the same session as
			//   the tty's controlling thread group.
			// - We know that the calling thread group is not in the same
			//   session as the tty's controlling thread group.
			if othertg.processGroup.session == tty.tg.processGroup.session {
				othertg.signalHandlers.mu.Lock()
				othertg.tty = nil
				othertg.signalHandlers.mu.Unlock()
			}
		}
	}

	if !isReadable && !hasAdmin {
		return linuxerr.EPERM
	}

	// Set the controlling terminal and foreground process group.
	tg.tty = tty
	tg.processGroup.session.foreground = tg.processGroup
	// Set this as the controlling process of the terminal.
	tty.tg = tg

	return nil
}

// ReleaseControllingTTY gives up tty as the controlling tty of tg.
func (tg *ThreadGroup) ReleaseControllingTTY(tty *TTY) error {
	tty.mu.Lock()
	defer tty.mu.Unlock()

	// We might be asked to set the controlling terminal of multiple
	// processes, so we lock both the TaskSet and SignalHandlers.
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()

	// Just below, we may re-lock signalHandlers in order to send signals.
	// Thus we can't defer Unlock here.
	tg.signalHandlers.mu.Lock()

	if tg.tty == nil || tg.tty != tty {
		tg.signalHandlers.mu.Unlock()
		return linuxerr.ENOTTY
	}

	// "If the process was session leader, then send SIGHUP and SIGCONT to
	// the foreground process group and all processes in the current session
	// lose their controlling terminal." - tty_ioctl(4)

	// Remove tty as the controlling tty for each process in the session,
	// then send them SIGHUP and SIGCONT.

	// If we're not the session leader, we don't have to do much.
	if tty.tg != tg {
		tg.tty = nil
		tg.signalHandlers.mu.Unlock()
		return nil
	}

	tg.signalHandlers.mu.Unlock()

	// We're the session leader. SIGHUP and SIGCONT the foreground process
	// group and remove all controlling terminals in the session.
	var lastErr error
	for othertg := range tg.pidns.owner.Root.tgids {
		if othertg.processGroup.session == tg.processGroup.session {
			othertg.signalHandlers.mu.Lock()
			othertg.tty = nil
			if othertg.processGroup == tg.processGroup.session.foreground {
				if err := othertg.leader.sendSignalLocked(&linux.SignalInfo{Signo: int32(linux.SIGHUP)}, true /* group */); err != nil {
					lastErr = err
				}
				if err := othertg.leader.sendSignalLocked(&linux.SignalInfo{Signo: int32(linux.SIGCONT)}, true /* group */); err != nil {
					lastErr = err
				}
			}
			othertg.signalHandlers.mu.Unlock()
		}
	}
	return lastErr
}

// ForegroundProcessGroup returns the process group ID of the foreground
// process group.
func (tg *ThreadGroup) ForegroundProcessGroup(tty *TTY) (int32, error) {
	tty.mu.Lock()
	defer tty.mu.Unlock()

	tg.pidns.owner.mu.Lock()
	defer tg.pidns.owner.mu.Unlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()

	// "When fd does not refer to the controlling terminal of the calling
	// process, -1 is returned" - tcgetpgrp(3)
	if tg.tty != tty {
		return -1, linuxerr.ENOTTY
	}

	return int32(tg.processGroup.session.foreground.id), nil
}

// SetForegroundProcessGroup sets the foreground process group of tty to
// pgid.
func (tg *ThreadGroup) SetForegroundProcessGroup(tty *TTY, pgid ProcessGroupID) (int32, error) {
	tty.mu.Lock()
	defer tty.mu.Unlock()

	tg.pidns.owner.mu.Lock()
	defer tg.pidns.owner.mu.Unlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()

	// TODO(gvisor.dev/issue/6148): "If tcsetpgrp() is called by a member of
	// a background process group in its session, and the calling process is
	// not blocking or ignoring SIGTTOU, a SIGTTOU signal is sent to all
	// members of this background process group."

	// tty must be the controlling terminal.
	if tg.tty != tty {
		return -1, linuxerr.ENOTTY
	}

	// pgid must be positive.
	if pgid < 0 {
		return -1, linuxerr.EINVAL
	}

	// pg must not be empty. Empty process groups are removed from their pid
	// namespaces.
	pg, ok := tg.pidns.processGroups[pgid]
	if !ok {
		return -1, linuxerr.ESRCH
	}

	// pg must be part of this process's session.
	if tg.processGroup.session != pg.session {
		return -1, linuxerr.EPERM
	}

	tg.processGroup.session.foreground.id = pgid
	return 0, nil
}

// itimerRealListener implements ktime.Listener for ITIMER_REAL expirations.
//
// +stateify savable
type itimerRealListener struct {
	tg *ThreadGroup
}

// Notify implements ktime.TimerListener.Notify.
func (l *itimerRealListener) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
	l.tg.SendSignal(SignalInfoPriv(linux.SIGALRM))
	return ktime.Setting{}, false
}

// Destroy implements ktime.TimerListener.Destroy.
func (l *itimerRealListener) Destroy() {
}
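// Illustrative note (not part of the original file): the TTY-related methods
// above all follow the same lock ordering, which any new caller would need
// to preserve to avoid deadlock:
//
//	tty.mu -> TaskSet.mu -> SignalHandlers.mu
//
// e.g. SetControllingTTY acquires tty.mu, then tg.pidns.owner.mu, then
// tg.signalHandlers.mu, and only ever locks another thread group's
// signalHandlers.mu while the TaskSet mutex is held.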
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package eventfd implements event fds.
package eventfd

import (
	"math"
	"sync"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/fdnotifier"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// EventFileDescription implements vfs.FileDescriptionImpl for file-based
// event notification (eventfd). Eventfds are usually internal to the Sentry
// but in certain situations they may be converted into a host-backed
// eventfd.
//
// +stateify savable
type EventFileDescription struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.NoLockFD

	// queue is used to notify interested parties when the event object
	// becomes readable or writable.
	queue waiter.Queue

	// mu protects the fields below.
	mu sync.Mutex `state:"nosave"`

	// val is the current value of the event counter.
	val uint64

	// semMode specifies whether the event is in "semaphore" mode.
	semMode bool

	// hostfd indicates whether this eventfd is passed through to the host.
	hostfd int
}

var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil)

// New creates a new event fd.
func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) {
	vd := vfsObj.NewAnonVirtualDentry("[eventfd]")
	defer vd.DecRef(ctx)
	efd := &EventFileDescription{
		val:     initVal,
		semMode: semMode,
		hostfd:  -1,
	}
	if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{
		UseDentryMetadata: true,
		DenyPRead:         true,
		DenyPWrite:        true,
	}); err != nil {
		return nil, err
	}
	return &efd.vfsfd, nil
}

// HostFD returns the host eventfd associated with this event.
func (efd *EventFileDescription) HostFD() (int, error) {
	efd.mu.Lock()
	defer efd.mu.Unlock()
	if efd.hostfd >= 0 {
		return efd.hostfd, nil
	}

	flags := linux.EFD_NONBLOCK
	if efd.semMode {
		flags |= linux.EFD_SEMAPHORE
	}

	fd, _, errno := unix.Syscall(unix.SYS_EVENTFD2, uintptr(efd.val), uintptr(flags), 0)
	if errno != 0 {
		return -1, errno
	}

	if err := fdnotifier.AddFD(int32(fd), &efd.queue); err != nil {
		if closeErr := unix.Close(int(fd)); closeErr != nil {
			log.Warningf("close(%d) eventfd failed: %v", fd, closeErr)
		}
		return -1, err
	}

	efd.hostfd = int(fd)
	return efd.hostfd, nil
}

// Release implements vfs.FileDescriptionImpl.Release.
func (efd *EventFileDescription) Release(context.Context) {
	efd.mu.Lock()
	defer efd.mu.Unlock()
	if efd.hostfd >= 0 {
		fdnotifier.RemoveFD(int32(efd.hostfd))
		if closeErr := unix.Close(int(efd.hostfd)); closeErr != nil {
			log.Warningf("close(%d) eventfd failed: %v", efd.hostfd, closeErr)
		}
		efd.hostfd = -1
	}
}

// Read implements vfs.FileDescriptionImpl.Read.
func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
	if dst.NumBytes() < 8 {
		return 0, unix.EINVAL
	}
	if err := efd.read(ctx, dst); err != nil {
		return 0, err
	}
	return 8, nil
}

// Write implements vfs.FileDescriptionImpl.Write.
func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
	if src.NumBytes() < 8 {
		return 0, unix.EINVAL
	}
	if err := efd.write(ctx, src); err != nil {
		return 0, err
	}
	return 8, nil
}

// Preconditions: Must be called with efd.mu locked.
func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem.IOSequence) error {
	var buf [8]byte
	if _, err := unix.Read(efd.hostfd, buf[:]); err != nil {
		if err == unix.EWOULDBLOCK {
			return syserror.ErrWouldBlock
		}
		return err
	}
	_, err := dst.CopyOut(ctx, buf[:])
	return err
}

func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequence) error {
	efd.mu.Lock()
	if efd.hostfd >= 0 {
		defer efd.mu.Unlock()
		return efd.hostReadLocked(ctx, dst)
	}

	// We can't complete the read if the value is currently zero.
	if efd.val == 0 {
		efd.mu.Unlock()
		return syserror.ErrWouldBlock
	}

	// Update the value based on the mode the event is operating in.
	var val uint64
	if efd.semMode {
		val = 1
		// Consistent with Linux, this is done even if writing to memory
		// fails.
		efd.val--
	} else {
		val = efd.val
		efd.val = 0
	}

	efd.mu.Unlock()

	// Notify writers. We do this even if we were already writable because
	// it is possible that a writer is waiting to write the maximum value to
	// the event.
	efd.queue.Notify(waiter.WritableEvents)

	var buf [8]byte
	hostarch.ByteOrder.PutUint64(buf[:], val)
	_, err := dst.CopyOut(ctx, buf[:])
	return err
}

// Preconditions: Must be called with efd.mu locked.
func (efd *EventFileDescription) hostWriteLocked(val uint64) error {
	var buf [8]byte
	hostarch.ByteOrder.PutUint64(buf[:], val)
	_, err := unix.Write(efd.hostfd, buf[:])
	if err == unix.EWOULDBLOCK {
		return syserror.ErrWouldBlock
	}
	return err
}

func (efd *EventFileDescription) write(ctx context.Context, src usermem.IOSequence) error {
	var buf [8]byte
	if _, err := src.CopyIn(ctx, buf[:]); err != nil {
		return err
	}
	val := hostarch.ByteOrder.Uint64(buf[:])

	return efd.Signal(val)
}

// Signal is an internal function to signal the event fd.
func (efd *EventFileDescription) Signal(val uint64) error {
	if val == math.MaxUint64 {
		return unix.EINVAL
	}

	efd.mu.Lock()

	if efd.hostfd >= 0 {
		defer efd.mu.Unlock()
		return efd.hostWriteLocked(val)
	}

	// We only allow writes that won't cause the value to go over the max
	// uint64 minus 1.
	if val > math.MaxUint64-1-efd.val {
		efd.mu.Unlock()
		return syserror.ErrWouldBlock
	}

	efd.val += val
	efd.mu.Unlock()

	// Always trigger a notification.
	efd.queue.Notify(waiter.ReadableEvents)

	return nil
}

// Readiness implements waiter.Waitable.Readiness.
func (efd *EventFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
	efd.mu.Lock()
	defer efd.mu.Unlock()

	if efd.hostfd >= 0 {
		return fdnotifier.NonBlockingPoll(int32(efd.hostfd), mask)
	}

	ready := waiter.EventMask(0)
	if efd.val > 0 {
		ready |= waiter.ReadableEvents
	}
	if efd.val < math.MaxUint64-1 {
		ready |= waiter.WritableEvents
	}

	return mask & ready
}

// EventRegister implements waiter.Waitable.EventRegister.
func (efd *EventFileDescription) EventRegister(entry *waiter.Entry, mask waiter.EventMask) {
	efd.queue.EventRegister(entry, mask)

	efd.mu.Lock()
	defer efd.mu.Unlock()
	if efd.hostfd >= 0 {
		fdnotifier.UpdateFD(int32(efd.hostfd))
	}
}

// EventUnregister implements waiter.Waitable.EventUnregister.
func (efd *EventFileDescription) EventUnregister(entry *waiter.Entry) {
	efd.queue.EventUnregister(entry)

	efd.mu.Lock()
	defer efd.mu.Unlock()
	if efd.hostfd >= 0 {
		fdnotifier.UpdateFD(int32(efd.hostfd))
	}
}
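// Illustrative sketch (not part of the original file): typical in-sentry use
// of an eventfd, assuming a vfs.VirtualFilesystem vfsObj and a
// context.Context ctx are available; error handling is elided.
//
//	fd, _ := eventfd.New(ctx, vfsObj, 0 /* initVal */, false /* semMode */, linux.EFD_NONBLOCK)
//	defer fd.DecRef(ctx)
//	efd := fd.Impl().(*eventfd.EventFileDescription)
//	efd.Signal(1) // increment the counter; waiters see ReadableEvents
//
// In semaphore mode (semMode == true) each read decrements the counter by
// one; otherwise a read returns the whole counter value and resets it to
// zero, as implemented by read() above.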
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package nested provides helpers to implement the pattern of nested
// stack.LinkEndpoints.
package nested

import (
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// Endpoint is a wrapper around stack.LinkEndpoint and stack.NetworkDispatcher
// that can be used to implement nesting safely by providing lifecycle
// concurrency guards.
//
// See the tests in this package for example usage.
type Endpoint struct {
	child    stack.LinkEndpoint
	embedder stack.NetworkDispatcher

	// mu protects dispatcher.
	mu         sync.RWMutex
	dispatcher stack.NetworkDispatcher
}

var _ stack.GSOEndpoint = (*Endpoint)(nil)
var _ stack.LinkEndpoint = (*Endpoint)(nil)
var _ stack.NetworkDispatcher = (*Endpoint)(nil)

// Init initializes a nested.Endpoint that uses embedder as the dispatcher
// for child on Attach.
//
// See the tests in this package for example usage.
func (e *Endpoint) Init(child stack.LinkEndpoint, embedder stack.NetworkDispatcher) {
	e.child = child
	e.embedder = embedder
}

// DeliverNetworkPacket implements stack.NetworkDispatcher.
func (e *Endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
	e.mu.RLock()
	d := e.dispatcher
	e.mu.RUnlock()
	if d != nil {
		d.DeliverNetworkPacket(remote, local, protocol, pkt)
	}
}

// DeliverOutboundPacket implements stack.NetworkDispatcher.DeliverOutboundPacket.
func (e *Endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
	e.mu.RLock()
	d := e.dispatcher
	e.mu.RUnlock()
	if d != nil {
		d.DeliverOutboundPacket(remote, local, protocol, pkt)
	}
}

// Attach implements stack.LinkEndpoint.
func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) {
	e.mu.Lock()
	e.dispatcher = dispatcher
	e.mu.Unlock()
	// If we're attaching to a valid dispatcher, pass embedder as the
	// dispatcher to our child, otherwise detach the child by giving it a
	// nil dispatcher.
	var pass stack.NetworkDispatcher
	if dispatcher != nil {
		pass = e.embedder
	}
	e.child.Attach(pass)
}

// IsAttached implements stack.LinkEndpoint.
func (e *Endpoint) IsAttached() bool {
	e.mu.RLock()
	isAttached := e.dispatcher != nil
	e.mu.RUnlock()
	return isAttached
}

// MTU implements stack.LinkEndpoint.
func (e *Endpoint) MTU() uint32 {
	return e.child.MTU()
}

// Capabilities implements stack.LinkEndpoint.
func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
	return e.child.Capabilities()
}

// MaxHeaderLength implements stack.LinkEndpoint.
func (e *Endpoint) MaxHeaderLength() uint16 {
	return e.child.MaxHeaderLength()
}

// LinkAddress implements stack.LinkEndpoint.
func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
	return e.child.LinkAddress()
}

// WritePacket implements stack.LinkEndpoint.
func (e *Endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error {
	return e.child.WritePacket(r, protocol, pkt)
}

// WritePackets implements stack.LinkEndpoint.
func (e *Endpoint) WritePackets(r stack.RouteInfo, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, tcpip.Error) {
	return e.child.WritePackets(r, pkts, protocol)
}

// Wait implements stack.LinkEndpoint.
func (e *Endpoint) Wait() {
	e.child.Wait()
}

// GSOMaxSize implements stack.GSOEndpoint.
func (e *Endpoint) GSOMaxSize() uint32 {
	if e, ok := e.child.(stack.GSOEndpoint); ok {
		return e.GSOMaxSize()
	}
	return 0
}

// SupportedGSO implements stack.GSOEndpoint.
func (e *Endpoint) SupportedGSO() stack.SupportedGSO {
	if e, ok := e.child.(stack.GSOEndpoint); ok {
		return e.SupportedGSO()
	}
	return stack.GSONotSupported
}

// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType
func (e *Endpoint) ARPHardwareType() header.ARPHardwareType {
	return e.child.ARPHardwareType()
}

// AddHeader implements stack.LinkEndpoint.AddHeader.
func (e *Endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
	e.child.AddHeader(local, remote, protocol, pkt)
}
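// Illustrative sketch (not part of the original file): a link endpoint that
// counts inbound packets by nesting around a child endpoint. The wrapper
// passes itself as the embedder so it sits on the inbound path; the names
// here are hypothetical.
//
//	type countingEndpoint struct {
//		nested.Endpoint
//		inbound uint64
//	}
//
//	func newCountingEndpoint(child stack.LinkEndpoint) *countingEndpoint {
//		e := &countingEndpoint{}
//		e.Endpoint.Init(child, e) // e is the embedder/dispatcher for child
//		return e
//	}
//
//	// DeliverNetworkPacket counts the packet, then lets the nested
//	// Endpoint forward it to the dispatcher installed by Attach.
//	func (e *countingEndpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
//		atomic.AddUint64(&e.inbound, 1)
//		e.Endpoint.DeliverNetworkPacket(remote, local, protocol, pkt)
//	}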
// Copyright 2020 The gVisor Authors.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package sync

import (
	"sync"
)

// Aliases of standard library types.
type (
	// Cond is an alias of sync.Cond.
	Cond = sync.Cond

	// Locker is an alias of sync.Locker.
	Locker = sync.Locker

	// Once is an alias of sync.Once.
	Once = sync.Once

	// Pool is an alias of sync.Pool.
	Pool = sync.Pool

	// WaitGroup is an alias of sync.WaitGroup.
	WaitGroup = sync.WaitGroup

	// Map is an alias of sync.Map.
	Map = sync.Map
)

// NewCond is a wrapper around sync.NewCond.
func NewCond(l Locker) *Cond {
	return sync.NewCond(l)
}
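// Illustrative note (not part of the original file): because these are type
// aliases rather than new types, values are fully interchangeable with the
// standard library's; e.g. a gVisor sync.WaitGroup is exactly a standard
// sync.WaitGroup:
//
//	var wg sync.WaitGroup // gvisor.dev/gvisor/pkg/sync
//	wg.Add(1)
//	go func() { defer wg.Done() /* ... */ }()
//	wg.Wait()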
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package netstack

import (
	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/socket"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
	"gvisor.dev/gvisor/pkg/waiter"
)

// LINT.IfChange

// provider is an inet socket provider.
type provider struct {
	family   int
	netProto tcpip.NetworkProtocolNumber
}

// getTransportProtocol figures out transport protocol. Currently only TCP,
// UDP, and ICMP are supported. The bool return value is true when this
// socket is associated with a transport protocol. This is only false for
// SOCK_RAW, IPPROTO_IP sockets.
func getTransportProtocol(ctx context.Context, stype linux.SockType, protocol int) (tcpip.TransportProtocolNumber, bool, *syserr.Error) {
	switch stype {
	case linux.SOCK_STREAM:
		if protocol != 0 && protocol != unix.IPPROTO_TCP {
			return 0, true, syserr.ErrInvalidArgument
		}
		return tcp.ProtocolNumber, true, nil

	case linux.SOCK_DGRAM:
		switch protocol {
		case 0, unix.IPPROTO_UDP:
			return udp.ProtocolNumber, true, nil
		case unix.IPPROTO_ICMP:
			return header.ICMPv4ProtocolNumber, true, nil
		case unix.IPPROTO_ICMPV6:
			return header.ICMPv6ProtocolNumber, true, nil
		}

	case linux.SOCK_RAW:
		// Raw sockets require CAP_NET_RAW.
		creds := auth.CredentialsFromContext(ctx)
		if !creds.HasCapability(linux.CAP_NET_RAW) {
			return 0, true, syserr.ErrNotPermitted
		}

		switch protocol {
		case unix.IPPROTO_ICMP:
			return header.ICMPv4ProtocolNumber, true, nil
		case unix.IPPROTO_ICMPV6:
			return header.ICMPv6ProtocolNumber, true, nil
		case unix.IPPROTO_UDP:
			return header.UDPProtocolNumber, true, nil
		case unix.IPPROTO_TCP:
			return header.TCPProtocolNumber, true, nil
		// IPPROTO_RAW signifies that the raw socket isn't assigned to a
		// transport protocol. Users will be able to write packets' IP
		// headers and won't receive anything.
		case unix.IPPROTO_RAW:
			return tcpip.TransportProtocolNumber(0), false, nil
		}
	}
	return 0, true, syserr.ErrProtocolNotSupported
}

// Socket creates a new socket object for the AF_INET, AF_INET6, or AF_PACKET
// family.
func (p *provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
	// Fail right away if we don't have a stack.
	stack := t.NetworkContext()
	if stack == nil {
		// Don't propagate an error here. Instead, allow the socket code to
		// continue searching for another provider.
		return nil, nil
	}
	eps, ok := stack.(*Stack)
	if !ok {
		return nil, nil
	}

	// Packet sockets are handled separately, since they are neither INET
	// nor INET6 specific.
	if p.family == linux.AF_PACKET {
		return packetSocket(t, eps, stype, protocol)
	}

	// Figure out the transport protocol.
	transProto, associated, err := getTransportProtocol(t, stype, protocol)
	if err != nil {
		return nil, err
	}

	// Create the endpoint.
	var ep tcpip.Endpoint
	var e tcpip.Error
	wq := &waiter.Queue{}
	if stype == linux.SOCK_RAW {
		ep, e = eps.Stack.NewRawEndpoint(transProto, p.netProto, wq, associated)
	} else {
		ep, e = eps.Stack.NewEndpoint(transProto, p.netProto, wq)

		// Assign task to PacketOwner interface to get the UID and GID for
		// iptables owner matching.
		if e == nil {
			ep.SetOwner(t)
		}
	}
	if e != nil {
		return nil, syserr.TranslateNetstackError(e)
	}

	return New(t, p.family, stype, int(transProto), wq, ep)
}

func packetSocket(t *kernel.Task, epStack *Stack, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
	// Packet sockets require CAP_NET_RAW.
	creds := auth.CredentialsFromContext(t)
	if !creds.HasCapability(linux.CAP_NET_RAW) {
		return nil, syserr.ErrNotPermitted
	}

	// "cooked" packets don't contain link layer information.
	var cooked bool
	switch stype {
	case linux.SOCK_DGRAM:
		cooked = true
	case linux.SOCK_RAW:
		cooked = false
	default:
		return nil, syserr.ErrProtocolNotSupported
	}

	// protocol is passed in network byte order, but netstack wants it in
	// host order.
	netProto := tcpip.NetworkProtocolNumber(socket.Ntohs(uint16(protocol)))

	wq := &waiter.Queue{}
	ep, err := epStack.Stack.NewPacketEndpoint(cooked, netProto, wq)
	if err != nil {
		return nil, syserr.TranslateNetstackError(err)
	}

	return New(t, linux.AF_PACKET, stype, protocol, wq, ep)
}

// LINT.ThenChange(./provider_vfs2.go)

// Pair just returns nil sockets (not supported).
func (*provider) Pair(*kernel.Task, linux.SockType, int) (*fs.File, *fs.File, *syserr.Error) {
	return nil, nil, nil
}

// init registers socket providers for AF_INET, AF_INET6, and AF_PACKET.
func init() {
	// Providers backed by netstack.
	p := []provider{
		{
			family:   linux.AF_INET,
			netProto: ipv4.ProtocolNumber,
		},
		{
			family:   linux.AF_INET6,
			netProto: ipv6.ProtocolNumber,
		},
		{
			family: linux.AF_PACKET,
		},
	}

	for i := range p {
		socket.RegisterProvider(p[i].family, &p[i])
	}
}
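// Illustrative note (not part of the original file): getTransportProtocol
// gives socket(2)-style triples the same meaning they have on Linux, e.g.:
//
//	socket(AF_INET, SOCK_STREAM, 0)           -> TCP
//	socket(AF_INET, SOCK_DGRAM, IPPROTO_ICMP) -> ICMPv4 ("ping" socket)
//	socket(AF_INET, SOCK_RAW, IPPROTO_RAW)    -> raw, no transport protocol
//
// with SOCK_RAW and AF_PACKET additionally gated on CAP_NET_RAW, as checked
// above.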
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cgroupfs

import (
	"bytes"
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/usage"
)

// +stateify savable
type cpuacctController struct {
	controllerCommon
}

var _ controller = (*cpuacctController)(nil)

func newCPUAcctController(fs *filesystem) *cpuacctController {
	c := &cpuacctController{}
	c.controllerCommon.init(controllerCPUAcct, fs)
	return c
}

// AddControlFiles implements controller.AddControlFiles.
func (c *cpuacctController) AddControlFiles(ctx context.Context, creds *auth.Credentials, cg *cgroupInode, contents map[string]kernfs.Inode) {
	cpuacctCG := &cpuacctCgroup{cg}
	contents["cpuacct.stat"] = c.fs.newControllerFile(ctx, creds, &cpuacctStatData{cpuacctCG})
	contents["cpuacct.usage"] = c.fs.newControllerFile(ctx, creds, &cpuacctUsageData{cpuacctCG})
	contents["cpuacct.usage_user"] = c.fs.newControllerFile(ctx, creds, &cpuacctUsageUserData{cpuacctCG})
	contents["cpuacct.usage_sys"] = c.fs.newControllerFile(ctx, creds, &cpuacctUsageSysData{cpuacctCG})
}

// +stateify savable
type cpuacctCgroup struct {
	*cgroupInode
}

func (c *cpuacctCgroup) collectCPUStats() usage.CPUStats {
	var cs usage.CPUStats
	c.fs.tasksMu.RLock()
	// Note: This isn't very accurate, since the tasks are potentially still
	// running as we accumulate their stats.
	for t := range c.ts {
		cs.Accumulate(t.CPUStats())
	}
	c.fs.tasksMu.RUnlock()
	return cs
}

// +stateify savable
type cpuacctStatData struct {
	*cpuacctCgroup
}

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *cpuacctStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	cs := d.collectCPUStats()
	fmt.Fprintf(buf, "user %d\n", linux.ClockTFromDuration(cs.UserTime))
	fmt.Fprintf(buf, "system %d\n", linux.ClockTFromDuration(cs.SysTime))
	return nil
}

// +stateify savable
type cpuacctUsageData struct {
	*cpuacctCgroup
}

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *cpuacctUsageData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	cs := d.collectCPUStats()
	fmt.Fprintf(buf, "%d\n", cs.UserTime.Nanoseconds()+cs.SysTime.Nanoseconds())
	return nil
}

// +stateify savable
type cpuacctUsageUserData struct {
	*cpuacctCgroup
}

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *cpuacctUsageUserData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	cs := d.collectCPUStats()
	fmt.Fprintf(buf, "%d\n", cs.UserTime.Nanoseconds())
	return nil
}

// +stateify savable
type cpuacctUsageSysData struct {
	*cpuacctCgroup
}

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *cpuacctUsageSysData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	cs := d.collectCPUStats()
	fmt.Fprintf(buf, "%d\n", cs.SysTime.Nanoseconds())
	return nil
}
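// Illustrative note (not part of the original file): these control files
// mirror Linux's cpuacct controller formats; cpuacct.stat reports totals in
// clock ticks while the cpuacct.usage* files report nanoseconds, so reading
// them inside a sandboxed container produces contents like:
//
//	cpuacct.stat:   "user 123\nsystem 45\n"   (values are sample data)
//	cpuacct.usage:  "1234567890\n"            (user + system, in ns)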
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package safemem

import (
	"fmt"
	"unsafe"

	"gvisor.dev/gvisor/pkg/gohacks"
	"gvisor.dev/gvisor/pkg/safecopy"
	"gvisor.dev/gvisor/pkg/sync"
)

// A Block is a range of contiguous bytes, similar to []byte but with the
// following differences:
//
// - The memory represented by a Block may require the use of safecopy to
//   access.
//
// - Block does not carry a capacity and cannot be expanded.
//
// Blocks are immutable and may be copied by value. The zero value of Block
// represents an empty range, analogous to a nil []byte.
type Block struct {
	// [start, start+length) is the represented memory.
	//
	// start is an unsafe.Pointer to ensure that Block prevents the
	// represented memory from being garbage-collected.
	start  unsafe.Pointer
	length int

	// needSafecopy is true if accessing the represented memory requires the
	// use of safecopy.
	needSafecopy bool
}

// BlockFromSafeSlice returns a Block equivalent to slice, which is safe to
// access without safecopy.
func BlockFromSafeSlice(slice []byte) Block {
	return blockFromSlice(slice, false)
}

// BlockFromUnsafeSlice returns a Block equivalent to bs, which is not safe
// to access without safecopy.
func BlockFromUnsafeSlice(slice []byte) Block {
	return blockFromSlice(slice, true)
}

func blockFromSlice(slice []byte, needSafecopy bool) Block {
	if len(slice) == 0 {
		return Block{}
	}
	return Block{
		start:        unsafe.Pointer(&slice[0]),
		length:       len(slice),
		needSafecopy: needSafecopy,
	}
}

// BlockFromSafePointer returns a Block equivalent to [ptr, ptr+length),
// which is safe to access without safecopy.
//
// Preconditions: ptr+length does not overflow.
func BlockFromSafePointer(ptr unsafe.Pointer, length int) Block {
	return blockFromPointer(ptr, length, false)
}

// BlockFromUnsafePointer returns a Block equivalent to [ptr, ptr+len), which
// is not safe to access without safecopy.
//
// Preconditions: ptr+len does not overflow.
func BlockFromUnsafePointer(ptr unsafe.Pointer, length int) Block {
	return blockFromPointer(ptr, length, true)
}

func blockFromPointer(ptr unsafe.Pointer, length int, needSafecopy bool) Block {
	if uptr := uintptr(ptr); uptr+uintptr(length) < uptr {
		panic(fmt.Sprintf("ptr %#x + len %#x overflows", uptr, length))
	}
	return Block{
		start:        ptr,
		length:       length,
		needSafecopy: needSafecopy,
	}
}

// DropFirst returns a Block equivalent to b, but with the first n bytes
// omitted. It is analogous to the [n:] operation on a slice, except that if
// n > b.Len(), DropFirst returns an empty Block instead of panicking.
//
// Preconditions: n >= 0.
func (b Block) DropFirst(n int) Block {
	if n < 0 {
		panic(fmt.Sprintf("invalid n: %d", n))
	}
	return b.DropFirst64(uint64(n))
}

// DropFirst64 is equivalent to DropFirst but takes a uint64.
func (b Block) DropFirst64(n uint64) Block {
	if n >= uint64(b.length) {
		return Block{}
	}
	return Block{
		start:        unsafe.Pointer(uintptr(b.start) + uintptr(n)),
		length:       b.length - int(n),
		needSafecopy: b.needSafecopy,
	}
}

// TakeFirst returns a Block equivalent to the first n bytes of b. It is
// analogous to the [:n] operation on a slice, except that if n > b.Len(),
// TakeFirst returns a copy of b instead of panicking.
//
// Preconditions: n >= 0.
func (b Block) TakeFirst(n int) Block {
	if n < 0 {
		panic(fmt.Sprintf("invalid n: %d", n))
	}
	return b.TakeFirst64(uint64(n))
}

// TakeFirst64 is equivalent to TakeFirst but takes a uint64.
func (b Block) TakeFirst64(n uint64) Block {
	if n == 0 {
		return Block{}
	}
	if n >= uint64(b.length) {
		return b
	}
	return Block{
		start:        b.start,
		length:       int(n),
		needSafecopy: b.needSafecopy,
	}
}

// ToSlice returns a []byte equivalent to b.
func (b Block) ToSlice() []byte {
	return *(*[]byte)(unsafe.Pointer(&gohacks.SliceHeader{
		Data: b.start,
		Len:  b.length,
		Cap:  b.length,
	}))
}

// Addr returns b's start address as a uintptr. It returns uintptr instead of
// unsafe.Pointer so that code using safemem cannot obtain unsafe.Pointers
// without importing the unsafe package explicitly.
//
// Note that a uintptr is not recognized as a pointer by the garbage
// collector, such that if there are no uses of b after a call to b.Addr()
// and the address is to Go-managed memory, the returned uintptr does not
// prevent garbage collection of the pointee.
func (b Block) Addr() uintptr {
	return uintptr(b.start)
}

// Len returns b's length in bytes.
func (b Block) Len() int {
	return b.length
}

// NeedSafecopy returns true if accessing b.ToSlice() requires the use of
// safecopy.
func (b Block) NeedSafecopy() bool {
	return b.needSafecopy
}

// String implements fmt.Stringer.String.
func (b Block) String() string {
	if uintptr(b.start) == 0 && b.length == 0 {
		return "<nil>"
	}
	var suffix string
	if b.needSafecopy {
		suffix = "*"
	}
	return fmt.Sprintf("[%#x-%#x)%s", uintptr(b.start), uintptr(b.start)+uintptr(b.length), suffix)
}

// Copy copies src.Len() or dst.Len() bytes, whichever is less, from src to
// dst and returns the number of bytes copied.
//
// If src and dst overlap, the data stored in dst is unspecified.
func Copy(dst, src Block) (int, error) {
	if !dst.needSafecopy && !src.needSafecopy {
		return copy(dst.ToSlice(), src.ToSlice()), nil
	}

	n := dst.length
	if n > src.length {
		n = src.length
	}
	if n == 0 {
		return 0, nil
	}

	switch {
	case dst.needSafecopy && !src.needSafecopy:
		return safecopy.CopyOut(dst.start, src.TakeFirst(n).ToSlice())
	case !dst.needSafecopy && src.needSafecopy:
		return safecopy.CopyIn(dst.TakeFirst(n).ToSlice(), src.start)
	case dst.needSafecopy && src.needSafecopy:
		n64, err := safecopy.Copy(dst.start, src.start, uintptr(n))
		return int(n64), err
	default:
		panic("unreachable")
	}
}

// Zero sets all bytes in dst to 0 and returns the number of bytes zeroed.
func Zero(dst Block) (int, error) {
	if !dst.needSafecopy {
		bs := dst.ToSlice()
		if !sync.RaceEnabled {
			// If the race detector isn't enabled, the golang compiler
			// replaces the next loop with memclr
			// (https://github.com/golang/go/issues/5373).
			for i := range bs {
				bs[i] = 0
			}
		} else {
			bsLen := len(bs)
			if bsLen == 0 {
				return 0, nil
			}
			bs[0] = 0
			for i := 1; i < bsLen; i *= 2 {
				copy(bs[i:], bs[:i])
			}
		}
		return len(bs), nil
	}

	n64, err := safecopy.ZeroOut(dst.start, uintptr(dst.length))
	return int(n64), err
}

// Safecopy atomics are no slower than non-safecopy atomics, so use the
// former even when !b.needSafecopy to get consistent alignment checking.

// SwapUint32 invokes safecopy.SwapUint32 on the first 4 bytes of b.
//
// Preconditions: b.Len() >= 4.
func SwapUint32(b Block, new uint32) (uint32, error) {
	if b.length < 4 {
		panic(fmt.Sprintf("insufficient length: %d", b.length))
	}
	return safecopy.SwapUint32(b.start, new)
}

// SwapUint64 invokes safecopy.SwapUint64 on the first 8 bytes of b.
//
// Preconditions: b.Len() >= 8.
func SwapUint64(b Block, new uint64) (uint64, error) {
	if b.length < 8 {
		panic(fmt.Sprintf("insufficient length: %d", b.length))
	}
	return safecopy.SwapUint64(b.start, new)
}

// CompareAndSwapUint32 invokes safecopy.CompareAndSwapUint32 on the first 4
// bytes of b.
//
// Preconditions: b.Len() >= 4.
func CompareAndSwapUint32(b Block, old, new uint32) (uint32, error) {
	if b.length < 4 {
		panic(fmt.Sprintf("insufficient length: %d", b.length))
	}
	return safecopy.CompareAndSwapUint32(b.start, old, new)
}

// LoadUint32 invokes safecopy.LoadUint32 on the first 4 bytes of b.
//
// Preconditions: b.Len() >= 4.
func LoadUint32(b Block) (uint32, error) {
	if b.length < 4 {
		panic(fmt.Sprintf("insufficient length: %d", b.length))
	}
	return safecopy.LoadUint32(b.start)
}
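// Illustrative sketch (not part of the original file): copying between an
// ordinary Go slice and a block of memory that requires safecopy (e.g. a
// mapping that may fault); ptr is a hypothetical unsafe.Pointer to such
// memory, and error handling is elided.
//
//	src := safemem.BlockFromSafeSlice([]byte("hello"))
//	dst := safemem.BlockFromUnsafePointer(ptr, 5) // faults handled by safecopy
//	n, _ := safemem.Copy(dst, src)                // n == min(dst.Len(), src.Len())
//	_, _ = safemem.Zero(dst.DropFirst(n))         // zero whatever remains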
4 8 12 12 12 1 11 11 1 10 1 9 42 9 50 2 48 2 45 10 6 16 1 15 1 14 11 14 26 210 23 251 2 250 2 248 71 209 19 4 8 31 1 30 1 29 1 27 28 7 12 12 12 11 14 1 13 1 12 8 8 3 7 13 20 1 19 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 // Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) // Link implements Linux syscall link(2). func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldpathAddr := args[0].Pointer() newpathAddr := args[1].Pointer() return 0, nil, linkat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) } // Linkat implements Linux syscall linkat(2). 
func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { olddirfd := args[0].Int() oldpathAddr := args[1].Pointer() newdirfd := args[2].Int() newpathAddr := args[3].Pointer() flags := args[4].Int() return 0, nil, linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) } func linkat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags int32) error { if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_FOLLOW) != 0 { return linuxerr.EINVAL } if flags&linux.AT_EMPTY_PATH != 0 && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) { return syserror.ENOENT } oldpath, err := copyInPath(t, oldpathAddr) if err != nil { return err } oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0)) if err != nil { return err } defer oldtpop.Release(t) newpath, err := copyInPath(t, newpathAddr) if err != nil { return err } newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer newtpop.Release(t) return t.Kernel().VFS().LinkAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop) } // Mkdir implements Linux syscall mkdir(2). func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mode := args[1].ModeT() return 0, nil, mkdirat(t, linux.AT_FDCWD, addr, mode) } // Mkdirat implements Linux syscall mkdirat(2). func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() addr := args[1].Pointer() mode := args[2].ModeT() return 0, nil, mkdirat(t, dirfd, addr, mode) } func mkdirat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode uint) error { path, err := copyInPath(t, addr) if err != nil { return err } tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer tpop.Release(t) return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &tpop.pop, &vfs.MkdirOptions{ Mode: linux.FileMode(mode & (0777 | linux.S_ISVTX) &^ t.FSContext().Umask()), }) } // Mknod implements Linux syscall mknod(2). func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mode := args[1].ModeT() dev := args[2].Uint() return 0, nil, mknodat(t, linux.AT_FDCWD, addr, linux.FileMode(mode), dev) } // Mknodat implements Linux syscall mknodat(2). func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() addr := args[1].Pointer() mode := args[2].ModeT() dev := args[3].Uint() return 0, nil, mknodat(t, dirfd, addr, linux.FileMode(mode), dev) } func mknodat(t *kernel.Task, dirfd int32, addr hostarch.Addr, mode linux.FileMode, dev uint32) error { path, err := copyInPath(t, addr) if err != nil { return err } tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer tpop.Release(t) // "Zero file type is equivalent to type S_IFREG." - mknod(2) if mode.FileType() == 0 { mode |= linux.ModeRegular } major, minor := linux.DecodeDeviceID(dev) return t.Kernel().VFS().MknodAt(t, t.Credentials(), &tpop.pop, &vfs.MknodOptions{ Mode: mode &^ linux.FileMode(t.FSContext().Umask()), DevMajor: uint32(major), DevMinor: minor, }) } // Open implements Linux syscall open(2). 
func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() flags := args[1].Uint() mode := args[2].ModeT() return openat(t, linux.AT_FDCWD, addr, flags, mode) } // Openat implements Linux syscall openat(2). func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() addr := args[1].Pointer() flags := args[2].Uint() mode := args[3].ModeT() return openat(t, dirfd, addr, flags, mode) } // Creat implements Linux syscall creat(2). func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mode := args[1].ModeT() return openat(t, linux.AT_FDCWD, addr, linux.O_WRONLY|linux.O_CREAT|linux.O_TRUNC, mode) } func openat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) { path, err := copyInPath(t, pathAddr) if err != nil { return 0, nil, err } tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, shouldFollowFinalSymlink(flags&linux.O_NOFOLLOW == 0)) if err != nil { return 0, nil, err } defer tpop.Release(t) file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &tpop.pop, &vfs.OpenOptions{ Flags: flags | linux.O_LARGEFILE, Mode: linux.FileMode(mode & (0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX) &^ t.FSContext().Umask()), }) if err != nil { return 0, nil, err } defer file.DecRef(t) fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.O_CLOEXEC != 0, }) return uintptr(fd), nil, err } // Rename implements Linux syscall rename(2). func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { oldpathAddr := args[0].Pointer() newpathAddr := args[1].Pointer() return 0, nil, renameat(t, linux.AT_FDCWD, oldpathAddr, linux.AT_FDCWD, newpathAddr, 0 /* flags */) } // Renameat implements Linux syscall renameat(2). func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { olddirfd := args[0].Int() oldpathAddr := args[1].Pointer() newdirfd := args[2].Int() newpathAddr := args[3].Pointer() return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, 0 /* flags */) } // Renameat2 implements Linux syscall renameat2(2). func Renameat2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { olddirfd := args[0].Int() oldpathAddr := args[1].Pointer() newdirfd := args[2].Int() newpathAddr := args[3].Pointer() flags := args[4].Uint() return 0, nil, renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags) } func renameat(t *kernel.Task, olddirfd int32, oldpathAddr hostarch.Addr, newdirfd int32, newpathAddr hostarch.Addr, flags uint32) error { oldpath, err := copyInPath(t, oldpathAddr) if err != nil { return err } // "If oldpath refers to a symbolic link, the link is renamed" - rename(2) oldtpop, err := getTaskPathOperation(t, olddirfd, oldpath, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer oldtpop.Release(t) newpath, err := copyInPath(t, newpathAddr) if err != nil { return err } newtpop, err := getTaskPathOperation(t, newdirfd, newpath, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer newtpop.Release(t) return t.Kernel().VFS().RenameAt(t, t.Credentials(), &oldtpop.pop, &newtpop.pop, &vfs.RenameOptions{ Flags: flags, }) } // Rmdir implements Linux syscall rmdir(2). 
func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() return 0, nil, rmdirat(t, linux.AT_FDCWD, pathAddr) } func rmdirat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr) error { path, err := copyInPath(t, pathAddr) if err != nil { return err } tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer tpop.Release(t) return t.Kernel().VFS().RmdirAt(t, t.Credentials(), &tpop.pop) } // Unlink implements Linux syscall unlink(2). func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() return 0, nil, unlinkat(t, linux.AT_FDCWD, pathAddr) } func unlinkat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr) error { path, err := copyInPath(t, pathAddr) if err != nil { return err } tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer tpop.Release(t) return t.Kernel().VFS().UnlinkAt(t, t.Credentials(), &tpop.pop) } // Unlinkat implements Linux syscall unlinkat(2). func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() flags := args[2].Int() if flags&^linux.AT_REMOVEDIR != 0 { return 0, nil, linuxerr.EINVAL } if flags&linux.AT_REMOVEDIR != 0 { return 0, nil, rmdirat(t, dirfd, pathAddr) } return 0, nil, unlinkat(t, dirfd, pathAddr) } // Symlink implements Linux syscall symlink(2). func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { targetAddr := args[0].Pointer() linkpathAddr := args[1].Pointer() return 0, nil, symlinkat(t, targetAddr, linux.AT_FDCWD, linkpathAddr) } // Symlinkat implements Linux syscall symlinkat(2). func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { targetAddr := args[0].Pointer() newdirfd := args[1].Int() linkpathAddr := args[2].Pointer() return 0, nil, symlinkat(t, targetAddr, newdirfd, linkpathAddr) } func symlinkat(t *kernel.Task, targetAddr hostarch.Addr, newdirfd int32, linkpathAddr hostarch.Addr) error { target, err := t.CopyInString(targetAddr, linux.PATH_MAX) if err != nil { return err } if len(target) == 0 { return syserror.ENOENT } linkpath, err := copyInPath(t, linkpathAddr) if err != nil { return err } tpop, err := getTaskPathOperation(t, newdirfd, linkpath, disallowEmptyPath, nofollowFinalSymlink) if err != nil { return err } defer tpop.Release(t) return t.Kernel().VFS().SymlinkAt(t, t.Credentials(), &tpop.pop, target) }
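
// Illustrative sketch (standalone, not part of the sentry): how the mode
// masking used by mkdirat and openat above composes with the task umask.
// effectiveMkdirMode and the sample values are hypothetical; the real code
// reads the umask from t.FSContext().Umask().
package main

import "fmt"

const sISVTX = 0o1000 // sticky bit, mirrors linux.S_ISVTX

// effectiveMkdirMode mirrors the expression in mkdirat: keep the permission
// bits and the sticky bit, then clear any bits set in the umask.
func effectiveMkdirMode(mode, umask uint32) uint32 {
	return mode & (0o777 | sISVTX) &^ umask
}

func main() {
	// mkdir("d", 01777) under the common umask 022 yields 01755.
	fmt.Printf("%#o\n", effectiveMkdirMode(0o1777, 0o022))
}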
// Copyright 2018 The gVisor Authors.
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "crypto/sha1" "encoding/binary" "fmt" "hash" "io" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) const ( // tsLen is the length, in bits, of the timestamp in the SYN cookie. tsLen = 8 // tsMask is a mask for timestamp values (i.e., tsLen bits). tsMask = (1 << tsLen) - 1 // tsOffset is the offset, in bits, of the timestamp in the SYN cookie. tsOffset = 24 // hashMask is the mask for hash values (i.e., tsOffset bits). hashMask = (1 << tsOffset) - 1 // maxTSDiff is the maximum allowed difference between a received cookie // timestamp and the current timestamp. If the difference is greater // than maxTSDiff, the cookie is expired. maxTSDiff = 2 ) var ( // mssTable is a slice containing the possible MSS values that we // encode in the SYN cookie with two bits. mssTable = []uint16{536, 1300, 1440, 1460} ) func encodeMSS(mss uint16) uint32 { for i := len(mssTable) - 1; i > 0; i-- { if mss >= mssTable[i] { return uint32(i) } } return 0 } // listenContext is used by a listening endpoint to store state used while // listening for connections. This struct is allocated by the listen goroutine // and must not be accessed or have its methods called concurrently as they // may mutate the stored objects. type listenContext struct { stack *stack.Stack // rcvWnd is the receive window that is sent by this listening context // in the initial SYN-ACK. rcvWnd seqnum.Size // nonce are random bytes that are initialized once when the context // is created and used to seed the hash function when generating // the SYN cookie. nonce [2][sha1.BlockSize]byte // listenEP is a reference to the listening endpoint associated with // this context. Can be nil if the context is created by the forwarder. listenEP *endpoint // hasherMu protects hasher. hasherMu sync.Mutex // hasher is the hash function used to generate a SYN cookie. hasher hash.Hash // v6Only is true if listenEP is a dual stack socket and has the // IPV6_V6ONLY option set. v6Only bool // netProto indicates the network protocol(IPv4/v6) for the listening // endpoint. netProto tcpip.NetworkProtocolNumber // pendingMu protects pendingEndpoints. This should only be accessed // by the listening endpoint's worker goroutine. // // Lock Ordering: listenEP.workerMu -> pendingMu pendingMu sync.Mutex // pending is used to wait for all pendingEndpoints to finish when // a socket is closed. pending sync.WaitGroup // pendingEndpoints is a map of all endpoints for which a handshake is // in progress. pendingEndpoints map[stack.TransportEndpointID]*endpoint } // timeStamp returns an 8-bit timestamp with a granularity of 64 seconds. 
func timeStamp(clock tcpip.Clock) uint32 {
	return uint32(clock.NowMonotonic().Sub(tcpip.MonotonicTime{}).Seconds()) >> 6 & tsMask
}

// newListenContext creates a new listen context.
func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6Only bool, netProto tcpip.NetworkProtocolNumber) *listenContext {
	l := &listenContext{
		stack:            stk,
		rcvWnd:           rcvWnd,
		hasher:           sha1.New(),
		v6Only:           v6Only,
		netProto:         netProto,
		listenEP:         listenEP,
		pendingEndpoints: make(map[stack.TransportEndpointID]*endpoint),
	}

	for i := range l.nonce {
		if _, err := io.ReadFull(stk.SecureRNG(), l.nonce[i][:]); err != nil {
			panic(err)
		}
	}

	return l
}

// cookieHash calculates the cookieHash for the given id, timestamp and nonce
// index. The hash is used to create and validate cookies.
func (l *listenContext) cookieHash(id stack.TransportEndpointID, ts uint32, nonceIndex int) uint32 {
	// Initialize block with fixed-size data: the local and remote ports and
	// the timestamp.
	var payload [8]byte
	binary.BigEndian.PutUint16(payload[0:], id.LocalPort)
	binary.BigEndian.PutUint16(payload[2:], id.RemotePort)
	binary.BigEndian.PutUint32(payload[4:], ts)

	// Feed everything to the hasher.
	l.hasherMu.Lock()
	l.hasher.Reset()

	// Per hash.Hash's embedded io.Writer:
	//
	//   It never returns an error.
	l.hasher.Write(payload[:])
	l.hasher.Write(l.nonce[nonceIndex][:])
	l.hasher.Write([]byte(id.LocalAddress))
	l.hasher.Write([]byte(id.RemoteAddress))

	// Finalize the calculation of the hash and return the first 4 bytes.
	h := l.hasher.Sum(nil)
	l.hasherMu.Unlock()

	return binary.BigEndian.Uint32(h[:])
}

// createCookie creates a SYN cookie for the given id and incoming sequence
// number.
func (l *listenContext) createCookie(id stack.TransportEndpointID, seq seqnum.Value, data uint32) seqnum.Value {
	ts := timeStamp(l.stack.Clock())
	v := l.cookieHash(id, 0, 0) + uint32(seq) + (ts << tsOffset)
	v += (l.cookieHash(id, ts, 1) + data) & hashMask
	return seqnum.Value(v)
}

// isCookieValid checks if the supplied cookie is valid for the given id and
// sequence number. If it is, it also returns the data originally encoded in
// the cookie when createCookie was called.
func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnum.Value, seq seqnum.Value) (uint32, bool) {
	ts := timeStamp(l.stack.Clock())
	v := uint32(cookie) - l.cookieHash(id, 0, 0) - uint32(seq)
	cookieTS := v >> tsOffset
	if ((ts - cookieTS) & tsMask) > maxTSDiff {
		return 0, false
	}
	return (v - l.cookieHash(id, cookieTS, 1)) & hashMask, true
}

func (l *listenContext) useSynCookies() bool {
	var alwaysUseSynCookies tcpip.TCPAlwaysUseSynCookies
	if err := l.stack.TransportProtocolOption(header.TCPProtocolNumber, &alwaysUseSynCookies); err != nil {
		panic(fmt.Sprintf("TransportProtocolOption(%d, %T) = %s", header.TCPProtocolNumber, alwaysUseSynCookies, err))
	}
	return bool(alwaysUseSynCookies) || (l.listenEP != nil && l.listenEP.synRcvdBacklogFull())
}

// createConnectingEndpoint creates a new endpoint in a connecting state, with
// the connection parameters given by the arguments.
func (l *listenContext) createConnectingEndpoint(s *segment, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, tcpip.Error) {
	// Create a new endpoint.
netProto := l.netProto if netProto == 0 { netProto = s.netProto } route, err := l.stack.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */) if err != nil { return nil, err } n := newEndpoint(l.stack, netProto, queue) n.ops.SetV6Only(l.v6Only) n.TransportEndpointInfo.ID = s.id n.boundNICID = s.nicID n.route = route n.effectiveNetProtos = []tcpip.NetworkProtocolNumber{s.netProto} n.ops.SetReceiveBufferSize(int64(l.rcvWnd), false /* notify */) n.amss = calculateAdvertisedMSS(n.userMSS, n.route) n.setEndpointState(StateConnecting) n.maybeEnableTimestamp(rcvdSynOpts) n.maybeEnableSACKPermitted(rcvdSynOpts) n.initGSO() // Bootstrap the auto tuning algorithm. Starting at zero will result in // a large step function on the first window adjustment causing the // window to grow to a really large value. n.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = n.initialReceiveWindow() return n, nil } // startHandshake creates a new endpoint in connecting state and then sends // the SYN-ACK for the TCP 3-way handshake. It returns the state of the // handshake in progress, which includes the new endpoint in the SYN-RCVD // state. // // On success, a handshake h is returned with h.ep.mu held. // // Precondition: if l.listenEP != nil, l.listenEP.mu must be locked. func (l *listenContext) startHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*handshake, tcpip.Error) { // Create new endpoint. irs := s.sequenceNumber isn := generateSecureISN(s.id, l.stack.Clock(), l.stack.Seed()) ep, err := l.createConnectingEndpoint(s, opts, queue) if err != nil { return nil, err } // Lock the endpoint before registering to ensure that no out of // band changes are possible due to incoming packets etc till // the endpoint is done initializing. ep.mu.Lock() ep.owner = owner // listenEP is nil when listenContext is used by tcp.Forwarder. deferAccept := time.Duration(0) if l.listenEP != nil { if l.listenEP.EndpointState() != StateListen { // Ensure we release any registrations done by the newly // created endpoint. ep.mu.Unlock() ep.Close() return nil, &tcpip.ErrConnectionAborted{} } l.addPendingEndpoint(ep) // Propagate any inheritable options from the listening endpoint // to the newly created endpoint. l.listenEP.propagateInheritableOptionsLocked(ep) if !ep.reserveTupleLocked() { ep.mu.Unlock() ep.Close() l.removePendingEndpoint(ep) return nil, &tcpip.ErrConnectionAborted{} } deferAccept = l.listenEP.deferAccept } // Register new endpoint so that packets are routed to it. if err := ep.stack.RegisterTransportEndpoint( ep.effectiveNetProtos, ProtocolNumber, ep.TransportEndpointInfo.ID, ep, ep.boundPortFlags, ep.boundBindToDevice, ); err != nil { ep.mu.Unlock() ep.Close() if l.listenEP != nil { l.removePendingEndpoint(ep) } ep.drainClosingSegmentQueue() return nil, err } ep.isRegistered = true // Initialize and start the handshake. h := ep.newPassiveHandshake(isn, irs, opts, deferAccept) h.listenEP = l.listenEP h.start() return h, nil } // performHandshake performs a TCP 3-way handshake. On success, the new // established endpoint is returned with e.mu held. // // Precondition: if l.listenEP != nil, l.listenEP.mu must be locked. func (l *listenContext) performHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*endpoint, tcpip.Error) { h, err := l.startHandshake(s, opts, queue, owner) if err != nil { return nil, err } ep := h.ep // N.B. the endpoint is generated above by startHandshake, and will be // returned locked. 
This first call is forced. if err := h.complete(); err != nil { // +checklocksforce ep.stack.Stats().TCP.FailedConnectionAttempts.Increment() ep.stats.FailedConnectionAttempts.Increment() l.cleanupFailedHandshake(h) return nil, err } l.cleanupCompletedHandshake(h) return ep, nil } func (l *listenContext) addPendingEndpoint(n *endpoint) { l.pendingMu.Lock() l.pendingEndpoints[n.TransportEndpointInfo.ID] = n l.pending.Add(1) l.pendingMu.Unlock() } func (l *listenContext) removePendingEndpoint(n *endpoint) { l.pendingMu.Lock() delete(l.pendingEndpoints, n.TransportEndpointInfo.ID) l.pending.Done() l.pendingMu.Unlock() } func (l *listenContext) closeAllPendingEndpoints() { l.pendingMu.Lock() for _, n := range l.pendingEndpoints { n.notifyProtocolGoroutine(notifyClose) } l.pendingMu.Unlock() l.pending.Wait() } // Precondition: h.ep.mu must be held. // +checklocks:h.ep.mu func (l *listenContext) cleanupFailedHandshake(h *handshake) { e := h.ep e.mu.Unlock() e.Close() e.notifyAborted() if l.listenEP != nil { l.removePendingEndpoint(e) } e.drainClosingSegmentQueue() e.h = nil } // cleanupCompletedHandshake transfers any state from the completed handshake to // the new endpoint. // // Precondition: h.ep.mu must be held. func (l *listenContext) cleanupCompletedHandshake(h *handshake) { e := h.ep if l.listenEP != nil { l.removePendingEndpoint(e) } e.isConnectNotified = true // Update the receive window scaling. We can't do it before the // handshake because it's possible that the peer doesn't support window // scaling. e.rcv.RcvWndScale = e.h.effectiveRcvWndScale() // Clean up handshake state stored in the endpoint so that it can be GCed. e.h = nil } // deliverAccepted delivers the newly-accepted endpoint to the listener. If the // listener has transitioned out of the listen state (accepted is the zero // value), the new endpoint is reset instead. func (e *endpoint) deliverAccepted(n *endpoint, withSynCookie bool) { e.mu.Lock() e.pendingAccepted.Add(1) e.mu.Unlock() defer e.pendingAccepted.Done() // Drop the lock before notifying to avoid deadlock in user-specified // callbacks. delivered := func() bool { e.acceptMu.Lock() defer e.acceptMu.Unlock() for { if e.accepted == (accepted{}) { return false } if e.accepted.endpoints.Len() == e.accepted.cap { e.acceptCond.Wait() continue } e.accepted.endpoints.PushBack(n) if !withSynCookie { atomic.AddInt32(&e.synRcvdCount, -1) } return true } }() if delivered { e.waiterQueue.Notify(waiter.ReadableEvents) } else { n.notifyProtocolGoroutine(notifyReset) } } // propagateInheritableOptionsLocked propagates any options set on the listening // endpoint to the newly created endpoint. // // Precondition: e.mu and n.mu must be held. func (e *endpoint) propagateInheritableOptionsLocked(n *endpoint) { n.userTimeout = e.userTimeout n.portFlags = e.portFlags n.boundBindToDevice = e.boundBindToDevice n.boundPortFlags = e.boundPortFlags n.userMSS = e.userMSS } // reserveTupleLocked reserves an accepted endpoint's tuple. // // Preconditions: // * propagateInheritableOptionsLocked has been called. // * e.mu is held. 
func (e *endpoint) reserveTupleLocked() bool {
	dest := tcpip.FullAddress{
		Addr: e.TransportEndpointInfo.ID.RemoteAddress,
		Port: e.TransportEndpointInfo.ID.RemotePort,
	}
	portRes := ports.Reservation{
		Networks:     e.effectiveNetProtos,
		Transport:    ProtocolNumber,
		Addr:         e.TransportEndpointInfo.ID.LocalAddress,
		Port:         e.TransportEndpointInfo.ID.LocalPort,
		Flags:        e.boundPortFlags,
		BindToDevice: e.boundBindToDevice,
		Dest:         dest,
	}
	if !e.stack.ReserveTuple(portRes) {
		e.stack.Stats().TCP.FailedPortReservations.Increment()
		return false
	}

	e.isPortReserved = true
	e.boundDest = dest
	return true
}

// notifyAborted wakes up any waiters on registered, but not accepted,
// endpoints.
//
// This is strictly not required normally as a socket that was never accepted
// can't really have any registered waiters, except when stack.Wait() is
// called, which waits for all registered endpoints to stop and expects an
// EventHUp.
func (e *endpoint) notifyAborted() {
	e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
}

// handleSynSegment is called in its own goroutine once the listening endpoint
// receives a SYN segment. It is responsible for completing the handshake and
// queueing the new endpoint for acceptance.
//
// A limited number of these goroutines are allowed before TCP starts using SYN
// cookies to accept connections.
//
// Precondition: if ctx.listenEP != nil, ctx.listenEP.mu must be locked.
func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header.TCPSynOptions) tcpip.Error {
	defer s.decRef()

	h, err := ctx.startHandshake(s, opts, &waiter.Queue{}, e.owner)
	if err != nil {
		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
		e.stats.FailedConnectionAttempts.Increment()
		atomic.AddInt32(&e.synRcvdCount, -1)
		return err
	}

	go func() {
		// Note that startHandshake returns a locked endpoint. The
		// force call here just makes it so.
		if err := h.complete(); err != nil { // +checklocksforce
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
			ctx.cleanupFailedHandshake(h)
			atomic.AddInt32(&e.synRcvdCount, -1)
			return
		}
		ctx.cleanupCompletedHandshake(h)
		h.ep.startAcceptedLoop()
		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
		e.deliverAccepted(h.ep, false /*withSynCookie*/)
	}()

	return nil
}

func (e *endpoint) synRcvdBacklogFull() bool {
	e.acceptMu.Lock()
	acceptedCap := e.accepted.cap
	e.acceptMu.Unlock()
	// The capacity of the accepted queue would always be one greater than the
	// listen backlog. But the SYN-RCVD connections count is always checked
	// against the listen backlog value for Linux parity reasons.
	// https://github.com/torvalds/linux/blob/7acac4b3196/include/net/inet_connection_sock.h#L280
	//
	// We maintain an equality check here as the synRcvdCount is incremented
	// and compared only from a single listener context and the capacity of
	// the accepted queue can only increase by a new listen call.
	return int(atomic.LoadInt32(&e.synRcvdCount)) == acceptedCap-1
}

func (e *endpoint) acceptQueueIsFull() bool {
	e.acceptMu.Lock()
	full := e.accepted != (accepted{}) && e.accepted.endpoints.Len() == e.accepted.cap
	e.acceptMu.Unlock()
	return full
}

// handleListenSegment is called when a listening endpoint receives a segment
// and needs to handle it.
//
// Precondition: if ctx.listenEP != nil, ctx.listenEP.mu must be locked.
func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) tcpip.Error { e.rcvQueueInfo.rcvQueueMu.Lock() rcvClosed := e.rcvQueueInfo.RcvClosed e.rcvQueueInfo.rcvQueueMu.Unlock() if rcvClosed || s.flags.Contains(header.TCPFlagSyn|header.TCPFlagAck) { // If the endpoint is shutdown, reply with reset. // // RFC 793 section 3.4 page 35 (figure 12) outlines that a RST // must be sent in response to a SYN-ACK while in the listen // state to prevent completing a handshake from an old SYN. return replyWithReset(e.stack, s, e.sendTOS, e.ttl) } switch { case s.flags.Contains(header.TCPFlagRst): e.stack.Stats().DroppedPackets.Increment() return nil case s.flags == header.TCPFlagSyn: if e.acceptQueueIsFull() { e.stack.Stats().TCP.ListenOverflowSynDrop.Increment() e.stats.ReceiveErrors.ListenOverflowSynDrop.Increment() e.stack.Stats().DroppedPackets.Increment() return nil } opts := parseSynSegmentOptions(s) if !ctx.useSynCookies() { s.incRef() atomic.AddInt32(&e.synRcvdCount, 1) return e.handleSynSegment(ctx, s, &opts) } route, err := e.stack.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */) if err != nil { return err } defer route.Release() // Send SYN without window scaling because we currently // don't encode this information in the cookie. // // Enable Timestamp option if the original syn did have // the timestamp option specified. // // Use the user supplied MSS on the listening socket for // new connections, if available. synOpts := header.TCPSynOptions{ WS: -1, TS: opts.TS, TSVal: tcpTimeStamp(e.stack.Clock().NowMonotonic(), timeStampOffset(e.stack.Rand())), TSEcr: opts.TSVal, MSS: calculateAdvertisedMSS(e.userMSS, route), } cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS)) fields := tcpFields{ id: s.id, ttl: e.ttl, tos: e.sendTOS, flags: header.TCPFlagSyn | header.TCPFlagAck, seq: cookie, ack: s.sequenceNumber + 1, rcvWnd: ctx.rcvWnd, } if err := e.sendSynTCP(route, fields, synOpts); err != nil { return err } e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment() return nil case s.flags.Contains(header.TCPFlagAck): if e.acceptQueueIsFull() { // Silently drop the ack as the application can't accept // the connection at this point. The ack will be // retransmitted by the sender anyway and we can // complete the connection at the time of retransmit if // the backlog has space. e.stack.Stats().TCP.ListenOverflowAckDrop.Increment() e.stats.ReceiveErrors.ListenOverflowAckDrop.Increment() e.stack.Stats().DroppedPackets.Increment() return nil } iss := s.ackNumber - 1 irs := s.sequenceNumber - 1 // Since SYN cookies are in use this is potentially an ACK to a // SYN-ACK we sent but don't have a half open connection state // as cookies are being used to protect against a potential SYN // flood. In such cases validate the cookie and if valid create // a fully connected endpoint and deliver to the accept queue. // // If not, silently drop the ACK to avoid leaking information // when under a potential syn flood attack. // // Validate the cookie. data, ok := ctx.isCookieValid(s.id, iss, irs) if !ok || int(data) >= len(mssTable) { e.stack.Stats().TCP.ListenOverflowInvalidSynCookieRcvd.Increment() e.stack.Stats().DroppedPackets.Increment() // When not using SYN cookies, as per RFC 793, section 3.9, page 64: // Any acknowledgment is bad if it arrives on a connection still in // the LISTEN state. An acceptable reset segment should be formed // for any arriving ACK-bearing segment. 
// The RST should be formatted as follows:
			//
			//  <SEQ=SEG.ACK><CTL=RST>
			//
			// Send a reset as this is an ACK for which there is no
			// half open connections and we are not using cookies
			// yet.
			//
			// The only time we should reach here is when a connection
			// was opened and closed really quickly and a delayed
			// ACK was received from the sender.
			return replyWithReset(e.stack, s, e.sendTOS, e.ttl)
		}
		e.stack.Stats().TCP.ListenOverflowSynCookieRcvd.Increment()
		// Create newly accepted endpoint and deliver it.
		rcvdSynOptions := &header.TCPSynOptions{
			MSS: mssTable[data],
			// Disable Window scaling as original SYN is
			// lost.
			WS: -1,
		}

		// When syn cookies are in use we enable timestamp only
		// if the ack specifies the timestamp option assuming
		// that the other end did in fact negotiate the
		// timestamp option in the original SYN.
		if s.parsedOptions.TS {
			rcvdSynOptions.TS = true
			rcvdSynOptions.TSVal = s.parsedOptions.TSVal
			rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr
		}

		n, err := ctx.createConnectingEndpoint(s, rcvdSynOptions, &waiter.Queue{})
		if err != nil {
			return err
		}

		n.mu.Lock()

		// Propagate any inheritable options from the listening endpoint
		// to the newly created endpoint.
		e.propagateInheritableOptionsLocked(n)

		if !n.reserveTupleLocked() {
			n.mu.Unlock()
			n.Close()

			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
			return nil
		}

		// Register new endpoint so that packets are routed to it.
		if err := n.stack.RegisterTransportEndpoint(
			n.effectiveNetProtos,
			ProtocolNumber,
			n.TransportEndpointInfo.ID,
			n,
			n.boundPortFlags,
			n.boundBindToDevice,
		); err != nil {
			n.mu.Unlock()
			n.Close()

			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
			return err
		}

		n.isRegistered = true

		// Clear the tsOffset for the newly created
		// endpoint as the Timestamp was already
		// randomly offset when the original SYN-ACK was
		// sent above.
		n.TSOffset = 0

		// Switch state to connected.
		n.isConnectNotified = true
		n.transitionToStateEstablishedLocked(&handshake{
			ep:          n,
			iss:         iss,
			ackNum:      irs + 1,
			rcvWnd:      seqnum.Size(n.initialReceiveWindow()),
			sndWnd:      s.window,
			rcvWndScale: e.rcvWndScaleForHandshake(),
			sndWndScale: rcvdSynOptions.WS,
			mss:         rcvdSynOptions.MSS,
		})

		// Requeue the segment if the ACK completing the handshake has more info
		// to be processed by the newly established endpoint.
		if (s.flags.Contains(header.TCPFlagFin) || s.data.Size() > 0) && n.enqueueSegment(s) {
			s.incRef()
			n.newSegmentWaker.Assert()
		}

		// Do the delivery in a separate goroutine so
		// that we don't block the listen loop in case
		// the application is slow to accept or stops
		// accepting.
		//
		// NOTE: This won't result in an unbounded
		// number of goroutines as we do check before
		// entering here that there was at least some
		// space available in the backlog.

		// Start the protocol goroutine.
		n.startAcceptedLoop()
		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
		go e.deliverAccepted(n, true /*withSynCookie*/)
		return nil

	default:
		e.stack.Stats().DroppedPackets.Increment()
		return nil
	}
}

// protocolListenLoop is the main loop of a listening TCP endpoint. It runs in
// its own goroutine and is responsible for handling connection requests.
func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) {
	e.mu.Lock()
	v6Only := e.ops.GetV6Only()
	ctx := newListenContext(e.stack, e, rcvWnd, v6Only, e.NetProto)

	defer func() {
		// Mark endpoint as closed. This will prevent goroutines running
		// handleSynSegment() from attempting to queue new connections
		// to the endpoint.
e.setEndpointState(StateClose) // Close any endpoints in SYN-RCVD state. ctx.closeAllPendingEndpoints() // Do cleanup if needed. e.completeWorkerLocked() if e.drainDone != nil { close(e.drainDone) } e.mu.Unlock() e.drainClosingSegmentQueue() // Notify waiters that the endpoint is shutdown. e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr) }() var s sleep.Sleeper s.AddWaker(&e.notificationWaker, wakerForNotification) s.AddWaker(&e.newSegmentWaker, wakerForNewSegment) for { e.mu.Unlock() index, _ := s.Fetch(true) e.mu.Lock() switch index { case wakerForNotification: n := e.fetchNotifications() if n&notifyClose != 0 { return } if n&notifyDrain != 0 { for !e.segmentQueue.empty() { s := e.segmentQueue.dequeue() // TODO(gvisor.dev/issue/4690): Better handle errors instead of // silently dropping. _ = e.handleListenSegment(ctx, s) s.decRef() } close(e.drainDone) e.mu.Unlock() <-e.undrain e.mu.Lock() } case wakerForNewSegment: // Process at most maxSegmentsPerWake segments. mayRequeue := true for i := 0; i < maxSegmentsPerWake; i++ { s := e.segmentQueue.dequeue() if s == nil { mayRequeue = false break } // TODO(gvisor.dev/issue/4690): Better handle errors instead of // silently dropping. _ = e.handleListenSegment(ctx, s) s.decRef() } // If the queue is not empty, make sure we'll wake up // in the next iteration. if mayRequeue && !e.segmentQueue.empty() { e.newSegmentWaker.Assert() } } } }
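
// Illustrative sketch (standalone, not part of the tcp package): the SYN
// cookie layout used by createCookie/isCookieValid above. The top tsLen bits
// of the cookie carry the 64-second timestamp; the low tsOffset bits carry
// hash+data. hashToy is a hypothetical stand-in for the SHA-1-based
// cookieHash; the rest mirrors the arithmetic in the listener code.
package main

import "fmt"

const (
	tsLen     = 8
	tsMask    = (1 << tsLen) - 1
	tsOffset  = 24
	hashMask  = (1 << tsOffset) - 1
	maxTSDiff = 2
)

// hashToy stands in for cookieHash(id, ts, nonceIndex); it only needs to be
// deterministic for the round trip below.
func hashToy(ts uint32, nonceIndex int) uint32 {
	return (ts^0x9e3779b9)*2654435761 + uint32(nonceIndex)*0x85ebca6b
}

func createCookie(seq, ts, data uint32) uint32 {
	v := hashToy(0, 0) + seq + (ts << tsOffset)
	v += (hashToy(ts, 1) + data) & hashMask
	return v
}

func decodeCookie(cookie, seq, nowTS uint32) (uint32, bool) {
	v := cookie - hashToy(0, 0) - seq
	cookieTS := v >> tsOffset
	if ((nowTS - cookieTS) & tsMask) > maxTSDiff {
		return 0, false // expired, as in isCookieValid
	}
	return (v - hashToy(cookieTS, 1)) & hashMask, true
}

func main() {
	ts := uint32(42) & tsMask // timeStamp() also masks with tsMask
	cookie := createCookie(1000, ts, 3)
	data, ok := decodeCookie(cookie, 1000, ts+1) // one tick later
	fmt.Println(data, ok)                        // 3 true
}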
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"gvisor.dev/gvisor/pkg/marshal"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
)

// This file contains structures required to support netfilter, specifically
// the iptables tool.

// Hooks into the network stack. These correspond to values in
// include/uapi/linux/netfilter.h.
const ( NF_INET_PRE_ROUTING = 0 NF_INET_LOCAL_IN = 1 NF_INET_FORWARD = 2 NF_INET_LOCAL_OUT = 3 NF_INET_POST_ROUTING = 4 NF_INET_NUMHOOKS = 5 ) // Verdicts that can be returned by targets. These correspond to values in // include/uapi/linux/netfilter.h const ( NF_DROP = 0 NF_ACCEPT = 1 NF_STOLEN = 2 NF_QUEUE = 3 NF_REPEAT = 4 NF_STOP = 5 NF_MAX_VERDICT = NF_STOP // NF_RETURN is defined in include/uapi/linux/netfilter/x_tables.h. NF_RETURN = -NF_REPEAT - 1 ) // VerdictStrings maps int verdicts to the strings they represent. It is used // for debugging. var VerdictStrings = map[int32]string{ -NF_DROP - 1: "DROP", -NF_ACCEPT - 1: "ACCEPT", -NF_QUEUE - 1: "QUEUE", NF_RETURN: "RETURN", } // Socket options for SOL_SOCKET. These correspond to values in // include/uapi/linux/netfilter_ipv4/ip_tables.h. const ( IPT_BASE_CTL = 64 IPT_SO_SET_REPLACE = IPT_BASE_CTL IPT_SO_SET_ADD_COUNTERS = IPT_BASE_CTL + 1 IPT_SO_SET_MAX = IPT_SO_SET_ADD_COUNTERS IPT_SO_GET_INFO = IPT_BASE_CTL IPT_SO_GET_ENTRIES = IPT_BASE_CTL + 1 IPT_SO_GET_REVISION_MATCH = IPT_BASE_CTL + 2 IPT_SO_GET_REVISION_TARGET = IPT_BASE_CTL + 3 IPT_SO_GET_MAX = IPT_SO_GET_REVISION_TARGET ) // Socket option for SOL_IP. This corresponds to the value in // include/uapi/linux/netfilter_ipv4.h. const ( SO_ORIGINAL_DST = 80 ) // Name lengths. These correspond to values in // include/uapi/linux/netfilter/x_tables.h. const ( XT_FUNCTION_MAXNAMELEN = 30 XT_EXTENSION_MAXNAMELEN = 29 XT_TABLE_MAXNAMELEN = 32 ) // IPTEntry is an iptable rule. It corresponds to struct ipt_entry in // include/uapi/linux/netfilter_ipv4/ip_tables.h. // // +marshal type IPTEntry struct { // IP is used to filter packets based on the IP header. IP IPTIP // NFCache relates to kernel-internal caching and isn't used by // userspace. NFCache uint32 // TargetOffset is the byte offset from the beginning of this IPTEntry // to the start of the entry's target. TargetOffset uint16 // NextOffset is the byte offset from the beginning of this IPTEntry to // the start of the next entry. It is thus also the size of the entry. NextOffset uint16 // Comeback is a return pointer. It is not used by userspace. Comeback uint32 // Counters holds the packet and byte counts for this rule. Counters XTCounters // Elems holds the data for all this rule's matches followed by the // target. It is variable length -- users have to iterate over any // matches and use TargetOffset and NextOffset to make sense of the // data. // // Elems is omitted here because it would cause IPTEntry to be an extra // byte larger (see http://www.catb.org/esr/structure-packing/). // // Elems [0]byte } // SizeOfIPTEntry is the size of an IPTEntry. const SizeOfIPTEntry = 112 // KernelIPTEntry is identical to IPTEntry, but includes the Elems field. // // +marshal dynamic type KernelIPTEntry struct { Entry IPTEntry // Elems holds the data for all this rule's matches followed by the // target. It is variable length -- users have to iterate over any // matches and use TargetOffset and NextOffset to make sense of the // data. Elems primitive.ByteSlice } // SizeBytes implements marshal.Marshallable.SizeBytes. func (ke *KernelIPTEntry) SizeBytes() int { return ke.Entry.SizeBytes() + ke.Elems.SizeBytes() } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (ke *KernelIPTEntry) MarshalBytes(dst []byte) { ke.Entry.MarshalUnsafe(dst) ke.Elems.MarshalBytes(dst[ke.Entry.SizeBytes():]) } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. 
func (ke *KernelIPTEntry) UnmarshalBytes(src []byte) {
	ke.Entry.UnmarshalUnsafe(src)
	ke.Elems.UnmarshalBytes(src[ke.Entry.SizeBytes():])
}

var _ marshal.Marshallable = (*KernelIPTEntry)(nil)

// IPTIP contains information for matching a packet's IP header.
// It corresponds to struct ipt_ip in
// include/uapi/linux/netfilter_ipv4/ip_tables.h.
//
// +marshal
type IPTIP struct {
	// Src is the source IP address.
	Src InetAddr

	// Dst is the destination IP address.
	Dst InetAddr

	// SrcMask is the source IP mask.
	SrcMask InetAddr

	// DstMask is the destination IP mask.
	DstMask InetAddr

	// InputInterface is the input network interface.
	InputInterface [IFNAMSIZ]byte

	// OutputInterface is the output network interface.
	OutputInterface [IFNAMSIZ]byte

	// InputInterfaceMask is the input interface mask.
	InputInterfaceMask [IFNAMSIZ]byte

	// OutputInterfaceMask is the output interface mask.
	OutputInterfaceMask [IFNAMSIZ]byte

	// Protocol is the transport protocol.
	Protocol uint16

	// Flags define matching behavior for the IP header.
	Flags uint8

	// InverseFlags invert the meaning of fields in struct IPTIP. See the
	// IPT_INV_* flags.
	InverseFlags uint8
}

// Flags in IPTIP.InverseFlags. Corresponding constants are in
// include/uapi/linux/netfilter_ipv4/ip_tables.h.
const (
	// Invert the meaning of InputInterface.
	IPT_INV_VIA_IN = 0x01
	// Invert the meaning of OutputInterface.
	IPT_INV_VIA_OUT = 0x02
	// Unclear what this is, as no references to it exist in the kernel.
	IPT_INV_TOS = 0x04
	// Invert the meaning of Src.
	IPT_INV_SRCIP = 0x08
	// Invert the meaning of Dst.
	IPT_INV_DSTIP = 0x10
	// Invert the meaning of the IPT_F_FRAG flag.
	IPT_INV_FRAG = 0x20
	// Invert the meaning of the Protocol field.
	IPT_INV_PROTO = 0x40
	// Enable all flags.
	IPT_INV_MASK = 0x7F
)

// SizeOfIPTIP is the size of an IPTIP.
const SizeOfIPTIP = 84

// XTCounters holds packet and byte counts for a rule. It corresponds to struct
// xt_counters in include/uapi/linux/netfilter/x_tables.h.
//
// +marshal
type XTCounters struct {
	// Pcnt is the packet count.
	Pcnt uint64

	// Bcnt is the byte count.
	Bcnt uint64
}

// SizeOfXTCounters is the size of an XTCounters.
const SizeOfXTCounters = 16

// XTEntryMatch holds a match for a rule. For example, a user using the
// addrtype iptables match extension would put the data for that match into an
// XTEntryMatch. iptables-extensions(8) has a list of possible matches.
//
// XTEntryMatch corresponds to struct xt_entry_match in
// include/uapi/linux/netfilter/x_tables.h. That struct contains a union
// exposing different data to the user and kernel, but this struct holds only
// the user data.
//
// +marshal
type XTEntryMatch struct {
	MatchSize uint16
	Name      ExtensionName
	Revision  uint8
	// Data is omitted here because it would cause XTEntryMatch to be an
	// extra byte larger (see http://www.catb.org/esr/structure-packing/).
	// Data [0]byte
}

// SizeOfXTEntryMatch is the size of an XTEntryMatch.
const SizeOfXTEntryMatch = 32

// KernelXTEntryMatch is identical to XTEntryMatch, but contains a
// variable-length Data field.
type KernelXTEntryMatch struct {
	XTEntryMatch
	Data []byte
}

// XTGetRevision corresponds to xt_get_revision in
// include/uapi/linux/netfilter/x_tables.h.
//
// +marshal
type XTGetRevision struct {
	Name     ExtensionName
	Revision uint8
}

// SizeOfXTGetRevision is the size of an XTGetRevision.
const SizeOfXTGetRevision = 30

// XTEntryTarget holds a target for a rule. For example, it can specify that
// packets matching the rule should DROP, ACCEPT, or use an extension target.
// iptables-extensions(8) has a list of possible targets.
//
// XTEntryTarget corresponds to struct xt_entry_target in
// include/uapi/linux/netfilter/x_tables.h. That struct contains a union
// exposing different data to the user and kernel, but this struct holds only
// the user data.
//
// +marshal
type XTEntryTarget struct {
	TargetSize uint16
	Name       ExtensionName
	Revision   uint8
	// Data is omitted here because it would cause XTEntryTarget to be an
	// extra byte larger (see http://www.catb.org/esr/structure-packing/).
	// Data [0]byte
}

// SizeOfXTEntryTarget is the size of an XTEntryTarget.
const SizeOfXTEntryTarget = 32

// KernelXTEntryTarget is identical to XTEntryTarget, but contains a
// variable-length Data field.
type KernelXTEntryTarget struct {
	XTEntryTarget
	Data []byte
}

// XTStandardTarget is a built-in target: one of the ACCEPT, DROP, QUEUE, or
// RETURN verdicts, or a jump to another rule. It corresponds to struct
// xt_standard_target in include/uapi/linux/netfilter/x_tables.h.
//
// +marshal
type XTStandardTarget struct {
	Target XTEntryTarget
	// A positive verdict indicates a jump, and is the offset from the
	// start of the table to jump to. A negative value means one of the
	// other built-in targets.
	Verdict int32
	_       [4]byte
}

// SizeOfXTStandardTarget is the size of an XTStandardTarget.
const SizeOfXTStandardTarget = 40

// XTErrorTarget triggers an error when reached. It is also used to mark the
// beginning of user-defined chains by putting the name of the chain in
// ErrorName. It corresponds to struct xt_error_target in
// include/uapi/linux/netfilter/x_tables.h.
//
// +marshal
type XTErrorTarget struct {
	Target XTEntryTarget
	Name   ErrorName
	_      [2]byte
}

// SizeOfXTErrorTarget is the size of an XTErrorTarget.
const SizeOfXTErrorTarget = 64

// Flag values for NfNATIPV4Range. The values indicate whether to map the
// protocol-specific part (ports) or the IPs. They correspond to values in
// include/uapi/linux/netfilter/nf_nat.h.
const (
	NF_NAT_RANGE_MAP_IPS            = 1 << 0
	NF_NAT_RANGE_PROTO_SPECIFIED    = 1 << 1
	NF_NAT_RANGE_PROTO_RANDOM       = 1 << 2
	NF_NAT_RANGE_PERSISTENT         = 1 << 3
	NF_NAT_RANGE_PROTO_RANDOM_FULLY = 1 << 4
	NF_NAT_RANGE_PROTO_RANDOM_ALL   = (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
	NF_NAT_RANGE_MASK               = (NF_NAT_RANGE_MAP_IPS |
		NF_NAT_RANGE_PROTO_SPECIFIED | NF_NAT_RANGE_PROTO_RANDOM |
		NF_NAT_RANGE_PERSISTENT | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
)

// NfNATIPV4Range corresponds to struct nf_nat_ipv4_range
// in include/uapi/linux/netfilter/nf_nat.h. The fields are in
// network byte order.
//
// +marshal
type NfNATIPV4Range struct {
	Flags   uint32
	MinIP   [4]byte
	MaxIP   [4]byte
	MinPort uint16
	MaxPort uint16
}

// NfNATIPV4MultiRangeCompat corresponds to struct
// nf_nat_ipv4_multi_range_compat in include/uapi/linux/netfilter/nf_nat.h.
//
// +marshal
type NfNATIPV4MultiRangeCompat struct {
	RangeSize uint32
	RangeIPV4 NfNATIPV4Range
}

// XTRedirectTarget triggers a redirect when reached.
// Adding 4 bytes of padding to make the struct 8 byte aligned.
//
// +marshal
type XTRedirectTarget struct {
	Target  XTEntryTarget
	NfRange NfNATIPV4MultiRangeCompat
	_       [4]byte
}

// SizeOfXTRedirectTarget is the size of an XTRedirectTarget.
const SizeOfXTRedirectTarget = 56

// XTSNATTarget triggers Source NAT when reached.
// Adding 4 bytes of padding to make the struct 8 byte aligned.
//
// +marshal
type XTSNATTarget struct {
	Target  XTEntryTarget
	NfRange NfNATIPV4MultiRangeCompat
	_       [4]byte
}

// SizeOfXTSNATTarget is the size of an XTSNATTarget.
const SizeOfXTSNATTarget = 56 // IPTGetinfo is the argument for the IPT_SO_GET_INFO sockopt. It corresponds // to struct ipt_getinfo in include/uapi/linux/netfilter_ipv4/ip_tables.h. // // +marshal type IPTGetinfo struct { Name TableName ValidHooks uint32 HookEntry [NF_INET_NUMHOOKS]uint32 Underflow [NF_INET_NUMHOOKS]uint32 NumEntries uint32 Size uint32 } // SizeOfIPTGetinfo is the size of an IPTGetinfo. const SizeOfIPTGetinfo = 84 // IPTGetEntries is the argument for the IPT_SO_GET_ENTRIES sockopt. It // corresponds to struct ipt_get_entries in // include/uapi/linux/netfilter_ipv4/ip_tables.h. // // +marshal type IPTGetEntries struct { Name TableName Size uint32 _ [4]byte // Entrytable is omitted here because it would cause IPTGetEntries to // be an extra byte longer (see // http://www.catb.org/esr/structure-packing/). // Entrytable [0]IPTEntry } // SizeOfIPTGetEntries is the size of an IPTGetEntries. const SizeOfIPTGetEntries = 40 // KernelIPTGetEntries is identical to IPTGetEntries, but includes the // Entrytable field. // // +marshal dynamic type KernelIPTGetEntries struct { IPTGetEntries Entrytable []KernelIPTEntry } // SizeBytes implements marshal.Marshallable.SizeBytes. func (ke *KernelIPTGetEntries) SizeBytes() int { res := ke.IPTGetEntries.SizeBytes() for _, entry := range ke.Entrytable { res += entry.SizeBytes() } return res } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (ke *KernelIPTGetEntries) MarshalBytes(dst []byte) { ke.IPTGetEntries.MarshalUnsafe(dst) marshalledUntil := ke.IPTGetEntries.SizeBytes() for i := range ke.Entrytable { ke.Entrytable[i].MarshalBytes(dst[marshalledUntil:]) marshalledUntil += ke.Entrytable[i].SizeBytes() } } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (ke *KernelIPTGetEntries) UnmarshalBytes(src []byte) { ke.IPTGetEntries.UnmarshalUnsafe(src) unmarshalledUntil := ke.IPTGetEntries.SizeBytes() for i := range ke.Entrytable { ke.Entrytable[i].UnmarshalBytes(src[unmarshalledUntil:]) unmarshalledUntil += ke.Entrytable[i].SizeBytes() } } var _ marshal.Marshallable = (*KernelIPTGetEntries)(nil) // IPTReplace is the argument for the IPT_SO_SET_REPLACE sockopt. It // corresponds to struct ipt_replace in // include/uapi/linux/netfilter_ipv4/ip_tables.h. // // +marshal type IPTReplace struct { Name TableName ValidHooks uint32 NumEntries uint32 Size uint32 HookEntry [NF_INET_NUMHOOKS]uint32 Underflow [NF_INET_NUMHOOKS]uint32 NumCounters uint32 Counters uint64 // This is really a *XTCounters. // Entries is omitted here because it would cause IPTReplace to be an // extra byte longer (see http://www.catb.org/esr/structure-packing/). // Entries [0]IPTEntry } // SizeOfIPTReplace is the size of an IPTReplace. const SizeOfIPTReplace = 96 // ExtensionName holds the name of a netfilter extension. // // +marshal type ExtensionName [XT_EXTENSION_MAXNAMELEN]byte // String implements fmt.Stringer. func (en ExtensionName) String() string { return goString(en[:]) } // TableName holds the name of a netfilter table. // // +marshal type TableName [XT_TABLE_MAXNAMELEN]byte // String implements fmt.Stringer. func (tn TableName) String() string { return goString(tn[:]) } // ErrorName holds the name of a netfilter error. These can also hold // user-defined chains. // // +marshal type ErrorName [XT_FUNCTION_MAXNAMELEN]byte // String implements fmt.Stringer. 
func (en ErrorName) String() string {
	return goString(en[:])
}

func goString(cstring []byte) string {
	for i, c := range cstring {
		if c == 0 {
			return string(cstring[:i])
		}
	}
	return string(cstring)
}

// XTTCP holds data for matching TCP packets. It corresponds to struct xt_tcp
// in include/uapi/linux/netfilter/xt_tcpudp.h.
//
// +marshal
type XTTCP struct {
	// SourcePortStart specifies the inclusive start of the range of source
	// ports to which the matcher applies.
	SourcePortStart uint16

	// SourcePortEnd specifies the inclusive end of the range of source ports
	// to which the matcher applies.
	SourcePortEnd uint16

	// DestinationPortStart specifies the start of the destination port
	// range to which the matcher applies.
	DestinationPortStart uint16

	// DestinationPortEnd specifies the end of the destination port
	// range to which the matcher applies.
	DestinationPortEnd uint16

	// Option specifies that a particular TCP option must be set.
	Option uint8

	// FlagMask masks TCP flags when comparing to the FlagCompare byte. It
	// allows for specification of which flags are important to the matcher.
	FlagMask uint8

	// FlagCompare, in combination with FlagMask, is used to match only
	// packets that have certain flags set.
	FlagCompare uint8

	// InverseFlags flips the meaning of certain fields. See the
	// XT_TCP_INV_* flags.
	InverseFlags uint8
}

// SizeOfXTTCP is the size of an XTTCP.
const SizeOfXTTCP = 12

// Flags in XTTCP.InverseFlags. Corresponding constants are in
// include/uapi/linux/netfilter/xt_tcpudp.h.
const (
	// Invert the meaning of SourcePortStart/End.
	XT_TCP_INV_SRCPT = 0x01
	// Invert the meaning of DestinationPortStart/End.
	XT_TCP_INV_DSTPT = 0x02
	// Invert the meaning of FlagCompare.
	XT_TCP_INV_FLAGS = 0x04
	// Invert the meaning of Option.
	XT_TCP_INV_OPTION = 0x08
	// Enable all flags.
	XT_TCP_INV_MASK = 0x0F
)

// XTUDP holds data for matching UDP packets. It corresponds to struct xt_udp
// in include/uapi/linux/netfilter/xt_tcpudp.h.
//
// +marshal
type XTUDP struct {
	// SourcePortStart is the inclusive start of the range of source ports
	// to which the matcher applies.
	SourcePortStart uint16

	// SourcePortEnd is the inclusive end of the range of source ports to
	// which the matcher applies.
	SourcePortEnd uint16

	// DestinationPortStart is the inclusive start of the destination port
	// range to which the matcher applies.
	DestinationPortStart uint16

	// DestinationPortEnd is the inclusive end of the destination port
	// range to which the matcher applies.
	DestinationPortEnd uint16

	// InverseFlags flips the meaning of certain fields. See the
	// XT_UDP_INV_* flags.
	InverseFlags uint8

	_ uint8
}

// SizeOfXTUDP is the size of an XTUDP.
const SizeOfXTUDP = 10

// Flags in XTUDP.InverseFlags. Corresponding constants are in
// include/uapi/linux/netfilter/xt_tcpudp.h.
const (
	// Invert the meaning of SourcePortStart/End.
	XT_UDP_INV_SRCPT = 0x01
	// Invert the meaning of DestinationPortStart/End.
	XT_UDP_INV_DSTPT = 0x02
	// Enable all flags.
	XT_UDP_INV_MASK = 0x03
)

// IPTOwnerInfo holds data for matching packets by owner. It corresponds
// to struct ipt_owner_info in libxt_owner.c of the iptables binary.
//
// +marshal
type IPTOwnerInfo struct {
	// UID is the user ID of the user who created the packet.
	UID uint32

	// GID is the group ID of the user who created the packet.
	GID uint32

	// PID is the ID of the process which created the packet.
	PID uint32

	// SID is the ID of the session which created the packet.
	SID uint32

	// Comm is the command name of the process which created the packet.
	Comm [16]byte

	// Match is used to match the UID/GID of the socket.
	// See the XT_OWNER_* flags below.
	Match uint8

	// Invert flips the meaning of the Match field.
	Invert uint8 `marshal:"unaligned"`
}

// SizeOfIPTOwnerInfo is the size of an IPTOwnerInfo.
const SizeOfIPTOwnerInfo = 34

// Flags in IPTOwnerInfo.Match. Corresponding constants are in
// include/uapi/linux/netfilter/xt_owner.h.
const (
	// Match the UID of the packet.
	XT_OWNER_UID = 1 << 0
	// Match the GID of the packet.
	XT_OWNER_GID = 1 << 1
	// Match if the socket exists for the packet. Forwarded
	// packets do not have an associated socket.
	XT_OWNER_SOCKET = 1 << 2
)
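
// Illustrative sketch (standalone, hypothetical buffer): walking a serialized
// entry table using the TargetOffset/NextOffset scheme described in the
// IPTEntry comments above. The field offsets are derived from the struct
// layout (IPTIP is 84 bytes, NFCache 4 bytes); a little-endian host is
// assumed, and real buffers would come from IPT_SO_GET_ENTRIES.
package main

import (
	"encoding/binary"
	"fmt"
)

const (
	sizeOfIPTEntry  = 112     // SizeOfIPTEntry above
	offTargetOffset = 84 + 4  // follows IP (IPTIP) and NFCache
	offNextOffset   = offTargetOffset + 2
)

// walkEntries visits each serialized entry, advancing by NextOffset exactly
// as userspace iterators of struct ipt_entry do.
func walkEntries(buf []byte) {
	for off := 0; off+sizeOfIPTEntry <= len(buf); {
		targetOff := int(binary.LittleEndian.Uint16(buf[off+offTargetOffset:]))
		nextOff := int(binary.LittleEndian.Uint16(buf[off+offNextOffset:]))
		fmt.Printf("entry@%d: target at +%d, entry size %d\n", off, targetOff, nextOff)
		if nextOff < sizeOfIPTEntry {
			break // malformed entry; refuse to loop forever
		}
		off += nextOff
	}
}

func main() {
	// One hypothetical rule: a 112-byte entry with no matches followed by a
	// 40-byte standard target, so TargetOffset=112 and NextOffset=152.
	buf := make([]byte, 152)
	binary.LittleEndian.PutUint16(buf[offTargetOffset:], 112)
	binary.LittleEndian.PutUint16(buf[offNextOffset:], 152)
	walkEntries(buf)
}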
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build linux

package flipcall

import (
	"fmt"
	"runtime"
	"sync/atomic"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
)

func (ep *Endpoint) futexSetPeerActive() error {
	if atomic.CompareAndSwapUint32(ep.connState(), ep.activeState, ep.inactiveState) {
		return nil
	}
	switch cs := atomic.LoadUint32(ep.connState()); cs {
	case csShutdown:
		return ShutdownError{}
	default:
		return fmt.Errorf("unexpected connection state before FUTEX_WAKE: %v", cs)
	}
}

func (ep *Endpoint) futexWakePeer() error {
	if err := ep.futexWakeConnState(1); err != nil {
		return fmt.Errorf("failed to FUTEX_WAKE peer Endpoint: %v", err)
	}
	return nil
}

func (ep *Endpoint) futexWaitUntilActive() error {
	for {
		switch cs := atomic.LoadUint32(ep.connState()); cs {
		case ep.activeState:
			return nil
		case ep.inactiveState:
			if ep.isShutdownLocally() {
				return ShutdownError{}
			}
			if err := ep.futexWaitConnState(ep.inactiveState); err != nil {
				return fmt.Errorf("failed to FUTEX_WAIT for peer Endpoint: %v", err)
			}
			continue
		case csShutdown:
			return ShutdownError{}
		default:
			return fmt.Errorf("unexpected connection state before FUTEX_WAIT: %v", cs)
		}
	}
}

func (ep *Endpoint) futexWakeConnState(numThreads int32) error {
	if _, _, e := unix.RawSyscall(unix.SYS_FUTEX, ep.packet, linux.FUTEX_WAKE, uintptr(numThreads)); e != 0 {
		return e
	}
	return nil
}

func (ep *Endpoint) futexWaitConnState(curState uint32) error {
	_, _, e := unix.Syscall6(unix.SYS_FUTEX, ep.packet, linux.FUTEX_WAIT, uintptr(curState), 0, 0, 0)
	if e != 0 && e != unix.EAGAIN && e != unix.EINTR {
		return e
	}
	return nil
}

func yieldThread() {
	unix.Syscall(unix.SYS_SCHED_YIELD, 0, 0, 0)
	// The thread we're trying to yield to may be waiting for a Go runtime P.
	// runtime.Gosched() will hand off ours if necessary.
	runtime.Gosched()
}
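
// Illustrative sketch (standalone, Linux-only): the FUTEX_WAIT/FUTEX_WAKE
// handshake used by futexWaitConnState/futexWakeConnState above, applied to a
// plain uint32 instead of the shared packet window. The state values and the
// 10ms delay are hypothetical.
package main

import (
	"fmt"
	"sync/atomic"
	"time"
	"unsafe"

	"golang.org/x/sys/unix"
)

var word uint32 // 0 = inactive, 1 = active

func futexWait(addr *uint32, val uint32) {
	// EAGAIN (word already changed) and EINTR (signal) are benign; the
	// caller's loop re-checks the word, mirroring futexWaitUntilActive.
	unix.Syscall6(unix.SYS_FUTEX, uintptr(unsafe.Pointer(addr)), unix.FUTEX_WAIT, uintptr(val), 0, 0, 0)
}

func futexWake(addr *uint32, n int) {
	unix.Syscall(unix.SYS_FUTEX, uintptr(unsafe.Pointer(addr)), unix.FUTEX_WAKE, uintptr(n))
}

func main() {
	go func() {
		time.Sleep(10 * time.Millisecond)
		atomic.StoreUint32(&word, 1) // publish the new state first...
		futexWake(&word, 1)          // ...then wake at most one waiter
	}()
	for atomic.LoadUint32(&word) != 1 {
		futexWait(&word, 0) // sleep only while the word still reads 0
	}
	fmt.Println("peer active")
}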
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package signalfd provides basic signalfd file implementations.
package signalfd

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// SignalFileDescription implements vfs.FileDescriptionImpl for signal fds.
//
// +stateify savable
type SignalFileDescription struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.NoLockFD

	// target is the original signal target task.
	//
	// The semantics here are a bit broken. Linux will always use current
	// for all reads, regardless of where the signalfd originated. We can't
	// do exactly that because we need to plumb the context through
	// EventRegister in order to support proper blocking behavior. This
	// will undoubtedly become very complicated quickly.
	target *kernel.Task

	// mu protects mask.
	mu sync.Mutex `state:"nosave"`

	// mask is the signal mask. Protected by mu.
	mask linux.SignalSet
}

var _ vfs.FileDescriptionImpl = (*SignalFileDescription)(nil)

// New creates a new signal fd.
func New(vfsObj *vfs.VirtualFilesystem, target *kernel.Task, mask linux.SignalSet, flags uint32) (*vfs.FileDescription, error) {
	vd := vfsObj.NewAnonVirtualDentry("[signalfd]")
	defer vd.DecRef(target)
	sfd := &SignalFileDescription{
		target: target,
		mask:   mask,
	}
	if err := sfd.vfsfd.Init(sfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{
		UseDentryMetadata: true,
		DenyPRead:         true,
		DenyPWrite:        true,
	}); err != nil {
		return nil, err
	}
	return &sfd.vfsfd, nil
}

// Mask returns the signal mask.
func (sfd *SignalFileDescription) Mask() linux.SignalSet {
	sfd.mu.Lock()
	defer sfd.mu.Unlock()
	return sfd.mask
}

// SetMask sets the signal mask.
func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) {
	sfd.mu.Lock()
	defer sfd.mu.Unlock()
	sfd.mask = mask
}

// Read implements vfs.FileDescriptionImpl.Read.
func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
	// Attempt to dequeue relevant signals.
	info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0)
	if err != nil {
		// There must be no signal available.
		return 0, syserror.ErrWouldBlock
	}

	// Copy out the signal info using the specified format.
	infoNative := linux.SignalfdSiginfo{
		Signo:   uint32(info.Signo),
		Errno:   info.Errno,
		Code:    info.Code,
		PID:     uint32(info.PID()),
		UID:     uint32(info.UID()),
		Status:  info.Status(),
		Overrun: uint32(info.Overrun()),
		Addr:    info.Addr(),
	}
	n, err := infoNative.WriteTo(dst.Writer(ctx))
	if err == usermem.ErrEndOfIOSequence {
		// Partial copy-out ok.
		err = nil
	}
	return n, err
}

// Readiness implements waiter.Waitable.Readiness.
func (sfd *SignalFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
	sfd.mu.Lock()
	defer sfd.mu.Unlock()
	if mask&waiter.ReadableEvents != 0 && sfd.target.PendingSignals()&sfd.mask != 0 {
		return waiter.ReadableEvents // Pending signals.
	}
	return 0
}

// EventRegister implements waiter.Waitable.EventRegister.
func (sfd *SignalFileDescription) EventRegister(entry *waiter.Entry, _ waiter.EventMask) {
	sfd.mu.Lock()
	defer sfd.mu.Unlock()
	// Register for the signal set; ignore the passed events.
	sfd.target.SignalRegister(entry, waiter.EventMask(sfd.mask))
}

// EventUnregister implements waiter.Waitable.EventUnregister.
func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) {
	// Unregister the original entry.
	sfd.target.SignalUnregister(entry)
}

// Release implements vfs.FileDescriptionImpl.Release.
func (sfd *SignalFileDescription) Release(context.Context) {}
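// Editor's note: an added toy, not part of the original source. A signalfd
// read copies out a fixed-layout siginfo record, which is why the Read above
// tolerates a partial copy once a prefix has been written. The abbreviated
// struct below (field set is an assumption for illustration) marshals such a
// record with encoding/binary to show the fixed-layout copy-out idea.
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

// miniSiginfo mirrors only the first few fields of Linux's signalfd_siginfo.
type miniSiginfo struct {
	Signo uint32
	Errno int32
	Code  int32
	PID   uint32
	UID   uint32
}

func main() {
	var buf bytes.Buffer
	info := miniSiginfo{Signo: 17 /* SIGCHLD */, PID: 1234, UID: 1000}
	// The real kernel uses native byte order; little-endian here for the demo.
	if err := binary.Write(&buf, binary.LittleEndian, info); err != nil {
		panic(err)
	}
	fmt.Printf("copied %d bytes: % x\n", buf.Len(), buf.Bytes())
}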
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

// CPU scheduling, real and fake.
import ( "fmt" "math/rand" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" ) // TaskGoroutineState is a coarse representation of the current execution // status of a kernel.Task goroutine. type TaskGoroutineState int const ( // TaskGoroutineNonexistent indicates that the task goroutine has either // not yet been created by Task.Start() or has returned from Task.run(). // This must be the zero value for TaskGoroutineState. TaskGoroutineNonexistent TaskGoroutineState = iota // TaskGoroutineRunningSys indicates that the task goroutine is executing // sentry code. TaskGoroutineRunningSys // TaskGoroutineRunningApp indicates that the task goroutine is executing // application code. TaskGoroutineRunningApp // TaskGoroutineBlockedInterruptible indicates that the task goroutine is // blocked in Task.block(), and hence may be woken by Task.interrupt() // (e.g. due to signal delivery). TaskGoroutineBlockedInterruptible // TaskGoroutineBlockedUninterruptible indicates that the task goroutine is // stopped outside of Task.block() and Task.doStop(), and hence cannot be // woken by Task.interrupt(). TaskGoroutineBlockedUninterruptible // TaskGoroutineStopped indicates that the task goroutine is blocked in // Task.doStop(). TaskGoroutineStopped is similar to // TaskGoroutineBlockedUninterruptible, but is a separate state to make it // possible to determine when Task.stop is meaningful. TaskGoroutineStopped ) // TaskGoroutineSchedInfo contains task goroutine scheduling state which must // be read and updated atomically. // // +stateify savable type TaskGoroutineSchedInfo struct { // Timestamp was the value of Kernel.cpuClock when this // TaskGoroutineSchedInfo was last updated. Timestamp uint64 // State is the current state of the task goroutine. State TaskGoroutineState // UserTicks is the amount of time the task goroutine has spent executing // its associated Task's application code, in units of linux.ClockTick. UserTicks uint64 // SysTicks is the amount of time the task goroutine has spent executing in // the sentry, in units of linux.ClockTick. SysTicks uint64 } // userTicksAt returns the extrapolated value of ts.UserTicks after // Kernel.CPUClockNow() indicates a time of now. // // Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is // monotonic, this is satisfied if now is the result of a previous call to // Kernel.CPUClockNow().) This requirement exists because otherwise a racing // change to t.gosched can cause userTicksAt to adjust stats by too much, // making the observed stats non-monotonic. func (ts *TaskGoroutineSchedInfo) userTicksAt(now uint64) uint64 { if ts.Timestamp < now && ts.State == TaskGoroutineRunningApp { // Update stats to reflect execution since the last update. return ts.UserTicks + (now - ts.Timestamp) } return ts.UserTicks } // sysTicksAt returns the extrapolated value of ts.SysTicks after // Kernel.CPUClockNow() indicates a time of now. // // Preconditions: As for userTicksAt. func (ts *TaskGoroutineSchedInfo) sysTicksAt(now uint64) uint64 { if ts.Timestamp < now && ts.State == TaskGoroutineRunningSys { return ts.SysTicks + (now - ts.Timestamp) } return ts.SysTicks } // Preconditions: The caller must be running on the task goroutine. 
func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) { now := t.k.CPUClockNow() if t.gosched.State != TaskGoroutineRunningSys { panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state)) } t.goschedSeq.BeginWrite() // This function is very hot; avoid defer. t.gosched.SysTicks += now - t.gosched.Timestamp t.gosched.Timestamp = now t.gosched.State = state t.goschedSeq.EndWrite() if state != TaskGoroutineRunningApp { // Task is blocking/stopping. t.k.decRunningTasks() } } // Preconditions: // * The caller must be running on the task goroutine // * The caller must be leaving a state indicated by a previous call to // t.accountTaskGoroutineEnter(state). func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) { if state != TaskGoroutineRunningApp { // Task is unblocking/continuing. t.k.incRunningTasks() } now := t.k.CPUClockNow() if t.gosched.State != state { panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys)) } t.goschedSeq.BeginWrite() // This function is very hot; avoid defer. if state == TaskGoroutineRunningApp { t.gosched.UserTicks += now - t.gosched.Timestamp } t.gosched.Timestamp = now t.gosched.State = TaskGoroutineRunningSys t.goschedSeq.EndWrite() } // Preconditions: The caller must be running on the task goroutine. func (t *Task) accountTaskGoroutineRunning() { now := t.k.CPUClockNow() if t.gosched.State != TaskGoroutineRunningSys { panic(fmt.Sprintf("Task goroutine in state %v (expected %v)", t.gosched.State, TaskGoroutineRunningSys)) } t.goschedSeq.BeginWrite() t.gosched.SysTicks += now - t.gosched.Timestamp t.gosched.Timestamp = now t.goschedSeq.EndWrite() } // TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info. // Most clients should use t.CPUStats() instead. func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo { return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched) } // CPUStats returns the CPU usage statistics of t. func (t *Task) CPUStats() usage.CPUStats { return t.cpuStatsAt(t.k.CPUClockNow()) } // Preconditions: As for TaskGoroutineSchedInfo.userTicksAt. func (t *Task) cpuStatsAt(now uint64) usage.CPUStats { tsched := t.TaskGoroutineSchedInfo() return usage.CPUStats{ UserTime: time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)), SysTime: time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)), VoluntarySwitches: atomic.LoadUint64(&t.yieldCount), } } // CPUStats returns the combined CPU usage statistics of all past and present // threads in tg. func (tg *ThreadGroup) CPUStats() usage.CPUStats { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() // Hack to get a pointer to the Kernel. if tg.leader == nil { // Per comment on tg.leader, this is only possible if nothing in the // ThreadGroup has ever executed anyway. return usage.CPUStats{} } return tg.cpuStatsAtLocked(tg.leader.k.CPUClockNow()) } // Preconditions: Same as TaskGoroutineSchedInfo.userTicksAt, plus: // * The TaskSet mutex must be locked. func (tg *ThreadGroup) cpuStatsAtLocked(now uint64) usage.CPUStats { stats := tg.exitedCPUStats // Account for live tasks. for t := tg.tasks.Front(); t != nil; t = t.Next() { stats.Accumulate(t.cpuStatsAt(now)) } return stats } // JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return // resource usage statistics for all children of [tg] that have terminated and // been waited for. 
These statistics will include the resources used by // grandchildren, and further removed descendants, if all of the intervening // descendants waited on their terminated children." func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats { tg.pidns.owner.mu.RLock() defer tg.pidns.owner.mu.RUnlock() return tg.childCPUStats } // taskClock is a ktime.Clock that measures the time that a task has spent // executing. taskClock is primarily used to implement CLOCK_THREAD_CPUTIME_ID. // // +stateify savable type taskClock struct { t *Task // If includeSys is true, the taskClock includes both time spent executing // application code as well as time spent in the sentry. Otherwise, the // taskClock includes only time spent executing application code. includeSys bool // Implements waiter.Waitable. TimeUntil wouldn't change its estimation // based on either of the clock events, so there's no event to be // notified for. ktime.NoClockEvents `state:"nosave"` // Implements ktime.Clock.WallTimeUntil. // // As an upper bound, a task's clock cannot advance faster than CPU // time. It would have to execute at a rate of more than 1 task-second // per 1 CPU-second, which isn't possible. ktime.WallRateClock `state:"nosave"` } // UserCPUClock returns a clock measuring the CPU time the task has spent // executing application code. func (t *Task) UserCPUClock() ktime.Clock { return &taskClock{t: t, includeSys: false} } // CPUClock returns a clock measuring the CPU time the task has spent executing // application and "kernel" code. func (t *Task) CPUClock() ktime.Clock { return &taskClock{t: t, includeSys: true} } // Now implements ktime.Clock.Now. func (tc *taskClock) Now() ktime.Time { stats := tc.t.CPUStats() if tc.includeSys { return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) } return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) } // tgClock is a ktime.Clock that measures the time a thread group has spent // executing. tgClock is primarily used to implement CLOCK_PROCESS_CPUTIME_ID. // // +stateify savable type tgClock struct { tg *ThreadGroup // If includeSys is true, the tgClock includes both time spent executing // application code as well as time spent in the sentry. Otherwise, the // tgClock includes only time spent executing application code. includeSys bool // Implements waiter.Waitable. ktime.ClockEventsQueue `state:"nosave"` } // Now implements ktime.Clock.Now. func (tgc *tgClock) Now() ktime.Time { stats := tgc.tg.CPUStats() if tgc.includeSys { return ktime.FromNanoseconds((stats.UserTime + stats.SysTime).Nanoseconds()) } return ktime.FromNanoseconds(stats.UserTime.Nanoseconds()) } // WallTimeUntil implements ktime.Clock.WallTimeUntil. func (tgc *tgClock) WallTimeUntil(t, now ktime.Time) time.Duration { // Thread group CPU time should not exceed wall time * live tasks, since // task goroutines exit after the transition to TaskExitZombie in // runExitNotify. tgc.tg.pidns.owner.mu.RLock() n := tgc.tg.liveTasks tgc.tg.pidns.owner.mu.RUnlock() if n == 0 { if t.Before(now) { return 0 } // The timer tick raced with thread group exit, after which no more // tasks can enter the thread group. So tgc.Now() will never advance // again. Return a large delay; the timer should be stopped long before // it comes again anyway. return time.Hour } // This is a lower bound on the amount of time that can elapse before an // associated timer expires, so returning this value tends to result in a // sequence of closely-spaced ticks just before timer expiry. 
To avoid // this, round up to the nearest ClockTick; CPU usage measurements are // limited to this resolution anyway. remaining := time.Duration(t.Sub(now).Nanoseconds()/int64(n)) * time.Nanosecond return ((remaining + (linux.ClockTick - time.Nanosecond)) / linux.ClockTick) * linux.ClockTick } // UserCPUClock returns a ktime.Clock that measures the time that a thread // group has spent executing. func (tg *ThreadGroup) UserCPUClock() ktime.Clock { return &tgClock{tg: tg, includeSys: false} } // CPUClock returns a ktime.Clock that measures the time that a thread group // has spent executing, including sentry time. func (tg *ThreadGroup) CPUClock() ktime.Clock { return &tgClock{tg: tg, includeSys: true} } type kernelCPUClockTicker struct { k *Kernel // These are essentially kernelCPUClockTicker.Notify local variables that // are cached between calls to reduce allocations. rng *rand.Rand tgs []*ThreadGroup } func newKernelCPUClockTicker(k *Kernel) *kernelCPUClockTicker { return &kernelCPUClockTicker{ k: k, rng: rand.New(rand.NewSource(rand.Int63())), } } // Notify implements ktime.TimerListener.Notify. func (ticker *kernelCPUClockTicker) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { // Only increment cpuClock by 1 regardless of the number of expirations. // This approximately compensates for cases where thread throttling or bad // Go runtime scheduling prevents the kernelCPUClockTicker goroutine, and // presumably task goroutines as well, from executing for a long period of // time. It's also necessary to prevent CPU clocks from seeing large // discontinuous jumps. now := atomic.AddUint64(&ticker.k.cpuClock, 1) // Check thread group CPU timers. tgs := ticker.k.tasks.Root.ThreadGroupsAppend(ticker.tgs) for _, tg := range tgs { if atomic.LoadUint32(&tg.cpuTimersEnabled) == 0 { continue } ticker.k.tasks.mu.RLock() if tg.leader == nil { // No tasks have ever run in this thread group. ticker.k.tasks.mu.RUnlock() continue } // Accumulate thread group CPU stats, and randomly select running tasks // using reservoir sampling to receive CPU timer signals. var virtReceiver *Task nrVirtCandidates := 0 var profReceiver *Task nrProfCandidates := 0 tgUserTime := tg.exitedCPUStats.UserTime tgSysTime := tg.exitedCPUStats.SysTime for t := tg.tasks.Front(); t != nil; t = t.Next() { tsched := t.TaskGoroutineSchedInfo() tgUserTime += time.Duration(tsched.userTicksAt(now) * uint64(linux.ClockTick)) tgSysTime += time.Duration(tsched.sysTicksAt(now) * uint64(linux.ClockTick)) switch tsched.State { case TaskGoroutineRunningApp: // Considered by ITIMER_VIRT, ITIMER_PROF, and RLIMIT_CPU // timers. nrVirtCandidates++ if int(randInt31n(ticker.rng, int32(nrVirtCandidates))) == 0 { virtReceiver = t } fallthrough case TaskGoroutineRunningSys: // Considered by ITIMER_PROF and RLIMIT_CPU timers. nrProfCandidates++ if int(randInt31n(ticker.rng, int32(nrProfCandidates))) == 0 { profReceiver = t } } } tgVirtNow := ktime.FromNanoseconds(tgUserTime.Nanoseconds()) tgProfNow := ktime.FromNanoseconds((tgUserTime + tgSysTime).Nanoseconds()) // All of the following are standard (not real-time) signals, which are // automatically deduplicated, so we ignore the number of expirations. tg.signalHandlers.mu.Lock() // It should only be possible for these timers to advance if we found // at least one running task. 
if virtReceiver != nil { // ITIMER_VIRTUAL newItimerVirtSetting, exp := tg.itimerVirtSetting.At(tgVirtNow) tg.itimerVirtSetting = newItimerVirtSetting if exp != 0 { virtReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGVTALRM), true) } } if profReceiver != nil { // ITIMER_PROF newItimerProfSetting, exp := tg.itimerProfSetting.At(tgProfNow) tg.itimerProfSetting = newItimerProfSetting if exp != 0 { profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGPROF), true) } // RLIMIT_CPU soft limit newRlimitCPUSoftSetting, exp := tg.rlimitCPUSoftSetting.At(tgProfNow) tg.rlimitCPUSoftSetting = newRlimitCPUSoftSetting if exp != 0 { profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGXCPU), true) } // RLIMIT_CPU hard limit rlimitCPUMax := tg.limits.Get(limits.CPU).Max if rlimitCPUMax != limits.Infinity && !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPUMax))) { profReceiver.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true) } } tg.signalHandlers.mu.Unlock() ticker.k.tasks.mu.RUnlock() } // Retain tgs between calls to Notify to reduce allocations. for i := range tgs { tgs[i] = nil } ticker.tgs = tgs[:0] // If nothing is running, we can disable the timer. tasks := atomic.LoadInt64(&ticker.k.runningTasks) if tasks == 0 { ticker.k.runningTasksMu.Lock() defer ticker.k.runningTasksMu.Unlock() tasks := atomic.LoadInt64(&ticker.k.runningTasks) if tasks != 0 { // Raced with a 0 -> 1 transition. return setting, false } // Stop the timer. We must cache the current setting so the // kernel can access it without violating the lock order. ticker.k.cpuClockTickerSetting = setting ticker.k.cpuClockTickerDisabled = true setting.Enabled = false return setting, true } return setting, false } // Destroy implements ktime.TimerListener.Destroy. func (ticker *kernelCPUClockTicker) Destroy() { } // randInt31n returns a random integer in [0, n). // // randInt31n is equivalent to math/rand.Rand.int31n(), which is unexported. // See that function for details. func randInt31n(rng *rand.Rand, n int32) int32 { v := rng.Uint32() prod := uint64(v) * uint64(n) low := uint32(prod) if low < uint32(n) { thresh := uint32(-n) % uint32(n) for low < thresh { v = rng.Uint32() prod = uint64(v) * uint64(n) low = uint32(prod) } } return int32(prod >> 32) } // NotifyRlimitCPUUpdated is called by setrlimit. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) NotifyRlimitCPUUpdated() { t.k.cpuClockTicker.Atomically(func() { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() rlimitCPU := t.tg.limits.Get(limits.CPU) t.tg.rlimitCPUSoftSetting = ktime.Setting{ Enabled: rlimitCPU.Cur != limits.Infinity, Next: ktime.FromNanoseconds((time.Duration(rlimitCPU.Cur) * time.Second).Nanoseconds()), Period: time.Second, } if rlimitCPU.Max != limits.Infinity { // Check if tg is already over the hard limit. tgcpu := t.tg.cpuStatsAtLocked(t.k.CPUClockNow()) tgProfNow := ktime.FromNanoseconds((tgcpu.UserTime + tgcpu.SysTime).Nanoseconds()) if !tgProfNow.Before(ktime.FromSeconds(int64(rlimitCPU.Max))) { t.sendSignalLocked(SignalInfoPriv(linux.SIGKILL), true) } } t.tg.updateCPUTimersEnabledLocked() }) } // Preconditions: The signal mutex must be locked. 
func (tg *ThreadGroup) updateCPUTimersEnabledLocked() { rlimitCPU := tg.limits.Get(limits.CPU) if tg.itimerVirtSetting.Enabled || tg.itimerProfSetting.Enabled || tg.rlimitCPUSoftSetting.Enabled || rlimitCPU.Max != limits.Infinity { atomic.StoreUint32(&tg.cpuTimersEnabled, 1) } else { atomic.StoreUint32(&tg.cpuTimersEnabled, 0) } } // StateStatus returns a string representation of the task's current state, // appropriate for /proc/[pid]/status. func (t *Task) StateStatus() string { switch s := t.TaskGoroutineSchedInfo().State; s { case TaskGoroutineNonexistent, TaskGoroutineRunningSys: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() switch t.exitState { case TaskExitZombie: return "Z (zombie)" case TaskExitDead: return "X (dead)" default: // The task goroutine can't exit before passing through // runExitNotify, so if s == TaskGoroutineNonexistent, the task has // been created but the task goroutine hasn't yet started. The // Linux equivalent is struct task_struct::state == TASK_NEW // (kernel/fork.c:copy_process() => // kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is // masked out by TASK_REPORT for /proc/[pid]/status, leaving only // TASK_RUNNING. return "R (running)" } case TaskGoroutineRunningApp: return "R (running)" case TaskGoroutineBlockedInterruptible: return "S (sleeping)" case TaskGoroutineStopped: t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() switch t.stop.(type) { case *groupStop: return "T (stopped)" case *ptraceStop: return "t (tracing stop)" } fallthrough case TaskGoroutineBlockedUninterruptible: // This is the name Linux uses for TASK_UNINTERRUPTIBLE and // TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL): // fs/proc/array.c:task_state_array. return "D (disk sleep)" default: panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s)) } } // CPUMask returns a copy of t's allowed CPU mask. func (t *Task) CPUMask() sched.CPUSet { t.mu.Lock() defer t.mu.Unlock() return t.allowedCPUMask.Copy() } // SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of // mask. // // Preconditions: mask.Size() == // sched.CPUSetSize(t.Kernel().ApplicationCores()). func (t *Task) SetCPUMask(mask sched.CPUSet) error { if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want { panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want)) } // Remove CPUs in mask above Kernel.applicationCores. mask.ClearAbove(t.k.applicationCores) // Ensure that at least 1 CPU is still allowed. if mask.NumCPUs() == 0 { return linuxerr.EINVAL } if t.k.useHostCores { // No-op; pretend the mask was immediately changed back. return nil } t.tg.pidns.owner.mu.RLock() rootTID := t.tg.pidns.owner.Root.tids[t] t.tg.pidns.owner.mu.RUnlock() t.mu.Lock() defer t.mu.Unlock() t.allowedCPUMask = mask atomic.StoreInt32(&t.cpu, assignCPU(mask, rootTID)) return nil } // CPU returns the cpu id for a given task. func (t *Task) CPU() int32 { if t.k.useHostCores { return int32(hostcpu.GetCPU()) } return atomic.LoadInt32(&t.cpu) } // assignCPU returns the virtualized CPU number for the task with global TID // tid and allowedCPUMask allowed. func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) { // To pretend that threads are evenly distributed to allowed CPUs, choose n // to be less than the number of CPUs in allowed ... n := int(tid) % int(allowed.NumCPUs()) // ... then pick the nth CPU in allowed. allowed.ForEachCPU(func(c uint) { if n--; n == 0 { cpu = int32(c) } }) return cpu } // Niceness returns t's niceness. 
func (t *Task) Niceness() int { t.mu.Lock() defer t.mu.Unlock() return t.niceness } // Priority returns t's priority. func (t *Task) Priority() int { t.mu.Lock() defer t.mu.Unlock() return t.niceness + 20 } // SetNiceness sets t's niceness to n. func (t *Task) SetNiceness(n int) { t.mu.Lock() defer t.mu.Unlock() t.niceness = n } // NumaPolicy returns t's current numa policy. func (t *Task) NumaPolicy() (policy linux.NumaPolicy, nodeMask uint64) { t.mu.Lock() defer t.mu.Unlock() return t.numaPolicy, t.numaNodeMask } // SetNumaPolicy sets t's numa policy. func (t *Task) SetNumaPolicy(policy linux.NumaPolicy, nodeMask uint64) { t.mu.Lock() defer t.mu.Unlock() t.numaPolicy = policy t.numaNodeMask = nodeMask }
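// Editor's note: a standalone sketch (added, not from the original source) of
// the reservoir-sampling trick kernelCPUClockTicker.Notify uses above to pick
// CPU timer signal receivers: visit candidates in one pass and replace the
// current pick with probability 1/n, which leaves every candidate equally
// likely without a second pass or extra allocation.
package main

import (
	"fmt"
	"math/rand"
)

// pickOne returns a uniformly random element of candidates in a single pass.
func pickOne(candidates []string, rng *rand.Rand) string {
	var picked string
	n := 0
	for _, c := range candidates {
		n++
		if rng.Int31n(int32(n)) == 0 { // replace with probability 1/n
			picked = c
		}
	}
	return picked
}

func main() {
	rng := rand.New(rand.NewSource(42))
	counts := map[string]int{}
	for i := 0; i < 30000; i++ {
		counts[pickOne([]string{"t1", "t2", "t3"}, rng)]++
	}
	fmt.Println(counts) // roughly uniform across t1, t2, t3
}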
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64

package linux

import (
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
)

// Clone implements linux syscall clone(2).
// sys_clone has so many flavors. We implement the default one in linux 3.11
// x86_64:
//
//	sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, tls_val)
func Clone(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	flags := int(args[0].Int())
	stack := args[1].Pointer()
	parentTID := args[2].Pointer()
	childTID := args[3].Pointer()
	tls := args[4].Pointer()
	return clone(t, flags, stack, parentTID, childTID, tls)
}
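// Editor's note: an added illustration, not part of the original source. The
// first clone(2) argument is a bit-flags word; the toy decoder below names a
// few well-known CLONE_* bits to show how such a flags word is typically
// inspected. The constant values match include/uapi/linux/sched.h; the
// decoder itself is hypothetical.
package main

import "fmt"

const (
	cloneVM     = 0x00000100 // CLONE_VM
	cloneFS     = 0x00000200 // CLONE_FS
	cloneFiles  = 0x00000400 // CLONE_FILES
	cloneThread = 0x00010000 // CLONE_THREAD
)

// decodeCloneFlags returns the names of the known bits set in flags.
func decodeCloneFlags(flags int) []string {
	var names []string
	for _, f := range []struct {
		bit  int
		name string
	}{
		{cloneVM, "CLONE_VM"},
		{cloneFS, "CLONE_FS"},
		{cloneFiles, "CLONE_FILES"},
		{cloneThread, "CLONE_THREAD"},
	} {
		if flags&f.bit != 0 {
			names = append(names, f.name)
		}
	}
	return names
}

func main() {
	// A typical pthread_create-style clone flags word (abridged).
	fmt.Println(decodeCloneFlags(cloneVM | cloneFS | cloneFiles | cloneThread))
}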
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/platform"
)

// NewMemoryManager returns a new MemoryManager with no mappings and 1 user.
func NewMemoryManager(p platform.Platform, mfp pgalloc.MemoryFileProvider, sleepForActivation bool) *MemoryManager {
	return &MemoryManager{
		p:                  p,
		mfp:                mfp,
		haveASIO:           p.SupportsAddressSpaceIO(),
		privateRefs:        &privateRefs{},
		users:              1,
		auxv:               arch.Auxv{},
		dumpability:        UserDumpable,
		aioManager:         aioManager{contexts: make(map[uint64]*AIOContext)},
		sleepForActivation: sleepForActivation,
	}
}

// SetMmapLayout initializes mm's layout from the given arch.Context.
//
// Preconditions: mm contains no mappings and is not used concurrently.
func (mm *MemoryManager) SetMmapLayout(ac arch.Context, r *limits.LimitSet) (arch.MmapLayout, error) {
	layout, err := ac.NewMmapLayout(mm.p.MinUserAddress(), mm.p.MaxUserAddress(), r)
	if err != nil {
		return arch.MmapLayout{}, err
	}
	mm.layout = layout
	return layout, nil
}

// Fork creates a copy of mm with 1 user, as for Linux syscalls fork() or
// clone() (without CLONE_VM).
func (mm *MemoryManager) Fork(ctx context.Context) (*MemoryManager, error) {
	mm.AddressSpace().PreFork()
	defer mm.AddressSpace().PostFork()
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()
	mm.mappingMu.RLock()
	defer mm.mappingMu.RUnlock()
	mm2 := &MemoryManager{
		p:           mm.p,
		mfp:         mm.mfp,
		haveASIO:    mm.haveASIO,
		layout:      mm.layout,
		privateRefs: mm.privateRefs,
		users:       1,
		brk:         mm.brk,
		usageAS:     mm.usageAS,
		dataAS:      mm.dataAS,
		// "The child does not inherit its parent's memory locks (mlock(2),
		// mlockall(2))." - fork(2).
So lockedAS is 0 and defMLockMode is // MLockNone, both of which are zero values. vma.mlockMode is reset // when copied below. captureInvalidations: true, argv: mm.argv, envv: mm.envv, auxv: append(arch.Auxv(nil), mm.auxv...), // IncRef'd below, once we know that there isn't an error. executable: mm.executable, dumpability: mm.dumpability, aioManager: aioManager{contexts: make(map[uint64]*AIOContext)}, sleepForActivation: mm.sleepForActivation, vdsoSigReturnAddr: mm.vdsoSigReturnAddr, } // Copy vmas. dontforks := false dstvgap := mm2.vmas.FirstGap() for srcvseg := mm.vmas.FirstSegment(); srcvseg.Ok(); srcvseg = srcvseg.NextSegment() { vma := srcvseg.Value() // makes a copy of the vma vmaAR := srcvseg.Range() if vma.dontfork { length := uint64(vmaAR.Length()) mm2.usageAS -= length if vma.isPrivateDataLocked() { mm2.dataAS -= length } dontforks = true continue } // Inform the Mappable, if any, of the new mapping. if vma.mappable != nil { if err := vma.mappable.AddMapping(ctx, mm2, vmaAR, vma.off, vma.canWriteMappableLocked()); err != nil { mm2.removeVMAsLocked(ctx, mm2.applicationAddrRange()) return nil, err } } if vma.id != nil { vma.id.IncRef() } vma.mlockMode = memmap.MLockNone dstvgap = mm2.vmas.Insert(dstvgap, vmaAR, vma).NextGap() // We don't need to update mm2.usageAS since we copied it from mm // above. } // Copy pmas. We have to lock mm.activeMu for writing to make existing // private pmas copy-on-write. We also have to lock mm2.activeMu since // after copying vmas above, memmap.Mappables may call mm2.Invalidate. We // only copy private pmas, since in the common case where fork(2) is // immediately followed by execve(2), copying non-private pmas that can be // regenerated by calling memmap.Mappable.Translate is a waste of time. // (Linux does the same; compare kernel/fork.c:dup_mmap() => // mm/memory.c:copy_page_range().) mm2.activeMu.Lock() defer mm2.activeMu.Unlock() mm.activeMu.Lock() defer mm.activeMu.Unlock() if dontforks { defer mm.pmas.MergeRange(mm.applicationAddrRange()) } srcvseg := mm.vmas.FirstSegment() dstpgap := mm2.pmas.FirstGap() var unmapAR hostarch.AddrRange for srcpseg := mm.pmas.FirstSegment(); srcpseg.Ok(); srcpseg = srcpseg.NextSegment() { pma := srcpseg.ValuePtr() if !pma.private { continue } if dontforks { // Find the 'vma' that contains the starting address // associated with the 'pma' (there must be one). srcvseg = srcvseg.seekNextLowerBound(srcpseg.Start()) if checkInvariants { if !srcvseg.Ok() { panic(fmt.Sprintf("no vma covers pma range %v", srcpseg.Range())) } if srcpseg.Start() < srcvseg.Start() { panic(fmt.Sprintf("vma %v ran ahead of pma %v", srcvseg.Range(), srcpseg.Range())) } } srcpseg = mm.pmas.Isolate(srcpseg, srcvseg.Range()) if srcvseg.ValuePtr().dontfork { continue } pma = srcpseg.ValuePtr() } if !pma.needCOW { pma.needCOW = true if pma.effectivePerms.Write { // We don't want to unmap the whole address space, even though // doing so would reduce calls to unmapASLocked(), because mm // will most likely continue to be used after the fork, so // unmapping pmas unnecessarily will result in extra page // faults. But we do want to merge consecutive AddrRanges // across pma boundaries. 
if unmapAR.End == srcpseg.Start() { unmapAR.End = srcpseg.End() } else { if unmapAR.Length() != 0 { mm.unmapASLocked(unmapAR) } unmapAR = srcpseg.Range() } pma.effectivePerms.Write = false } pma.maxPerms.Write = false } fr := srcpseg.fileRange() mm2.incPrivateRef(fr) srcpseg.ValuePtr().file.IncRef(fr) addrRange := srcpseg.Range() mm2.addRSSLocked(addrRange) dstpgap = mm2.pmas.Insert(dstpgap, addrRange, *pma).NextGap() } if unmapAR.Length() != 0 { mm.unmapASLocked(unmapAR) } // Between when we call memmap.Mappable.AddMapping while copying vmas and // when we lock mm2.activeMu to copy pmas, calls to mm2.Invalidate() are // ineffective because the pmas they invalidate haven't yet been copied, // possibly allowing mm2 to get invalidated translations: // // Invalidating Mappable mm.Fork // --------------------- ------- // // mm2.Invalidate() // mm.activeMu.Lock() // mm.Invalidate() /* blocks */ // mm2.activeMu.Lock() // (mm copies invalidated pma to mm2) // // This would technically be both safe (since we only copy private pmas, // which will still hold a reference on their memory) and consistent with // Linux, but we avoid it anyway by setting mm2.captureInvalidations during // construction, causing calls to mm2.Invalidate() to be captured in // mm2.capturedInvalidations, to be replayed after pmas are copied - i.e. // here. mm2.captureInvalidations = false for _, invArgs := range mm2.capturedInvalidations { mm2.invalidateLocked(invArgs.ar, invArgs.opts.InvalidatePrivate, true) } mm2.capturedInvalidations = nil if mm2.executable != nil { mm2.executable.IncRef() } return mm2, nil } // IncUsers increments mm's user count and returns true. If the user count is // already 0, IncUsers does nothing and returns false. func (mm *MemoryManager) IncUsers() bool { for { users := atomic.LoadInt32(&mm.users) if users == 0 { return false } if atomic.CompareAndSwapInt32(&mm.users, users, users+1) { return true } } } // DecUsers decrements mm's user count. If the user count reaches 0, all // mappings in mm are unmapped. func (mm *MemoryManager) DecUsers(ctx context.Context) { if users := atomic.AddInt32(&mm.users, -1); users > 0 { return } else if users < 0 { panic(fmt.Sprintf("Invalid MemoryManager.users: %d", users)) } mm.destroyAIOManager(ctx) mm.metadataMu.Lock() exe := mm.executable mm.executable = nil mm.metadataMu.Unlock() if exe != nil { exe.DecRef(ctx) } mm.activeMu.Lock() // Sanity check. if atomic.LoadInt32(&mm.active) != 0 { panic("active address space lost?") } // Make sure the AddressSpace is returned. if mm.as != nil { mm.as.Release() mm.as = nil } mm.activeMu.Unlock() mm.mappingMu.Lock() defer mm.mappingMu.Unlock() // If mm is being dropped before mm.SetMmapLayout was called, // mm.applicationAddrRange() will be empty. if ar := mm.applicationAddrRange(); ar.Length() != 0 { mm.unmapLocked(ctx, ar) } }
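// Editor's note: an added toy model, not part of the original source and much
// simpler than the pma machinery above. It illustrates the copy-on-write step
// Fork performs: shared private pages are downgraded to read-only in both
// address spaces, and the first write fault afterwards copies the page. All
// names here (page, fork, write) are hypothetical.
package main

import "fmt"

type page struct {
	data     []byte
	refs     int
	writable bool
}

// fork shares the page and downgrades it to read-only (the needCOW analog).
func fork(p *page) *page {
	p.refs++
	p.writable = false
	return p
}

// write copies the page first if it is still shared (the COW fault analog).
func write(pp **page, i int, b byte) {
	p := *pp
	if !p.writable {
		if p.refs > 1 {
			// Break the share: copy the data, drop one reference.
			p.refs--
			c := &page{data: append([]byte(nil), p.data...), refs: 1}
			*pp = c
			p = c
		}
		p.writable = true
	}
	p.data[i] = b
}

func main() {
	parent := &page{data: []byte("hello"), refs: 1, writable: true}
	child := fork(parent)
	write(&child, 0, 'H') // "faults": copies the page for the child
	fmt.Println(string(parent.data), string(child.data)) // hello Hello
}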
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package auth

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
)

// MapFromKUID translates kuid, a UID in the root namespace, to a UID in ns.
func (ns *UserNamespace) MapFromKUID(kuid KUID) UID {
	if ns.parent == nil {
		return UID(kuid)
	}
	return UID(ns.mapID(&ns.uidMapFromParent, uint32(ns.parent.MapFromKUID(kuid))))
}

// MapFromKGID translates kgid, a GID in the root namespace, to a GID in ns.
func (ns *UserNamespace) MapFromKGID(kgid KGID) GID {
	if ns.parent == nil {
		return GID(kgid)
	}
	return GID(ns.mapID(&ns.gidMapFromParent, uint32(ns.parent.MapFromKGID(kgid))))
}

// MapToKUID translates uid, a UID in ns, to a UID in the root namespace.
func (ns *UserNamespace) MapToKUID(uid UID) KUID {
	if ns.parent == nil {
		return KUID(uid)
	}
	return ns.parent.MapToKUID(UID(ns.mapID(&ns.uidMapToParent, uint32(uid))))
}

// MapToKGID translates gid, a GID in ns, to a GID in the root namespace.
func (ns *UserNamespace) MapToKGID(gid GID) KGID {
	if ns.parent == nil {
		return KGID(gid)
	}
	return ns.parent.MapToKGID(GID(ns.mapID(&ns.gidMapToParent, uint32(gid))))
}

func (ns *UserNamespace) mapID(m *idMapSet, id uint32) uint32 {
	if id == NoID {
		return NoID
	}
	ns.mu.Lock()
	defer ns.mu.Unlock()
	if it := m.FindSegment(id); it.Ok() {
		return it.Value() + (id - it.Start())
	}
	return NoID
}

// allIDsMapped returns true if all IDs in the range [start, end) are mapped in
// m.
//
// Preconditions: end >= start.
func (ns *UserNamespace) allIDsMapped(m *idMapSet, start, end uint32) bool {
	ns.mu.Lock()
	defer ns.mu.Unlock()
	return m.SpanRange(idMapRange{start, end}) == end-start
}

// An IDMapEntry represents a mapping from a range of contiguous IDs in a user
// namespace to an equally-sized range of contiguous IDs in the namespace's
// parent.
// // +stateify savable type IDMapEntry struct { // FirstID is the first ID in the range in the namespace. FirstID uint32 // FirstParentID is the first ID in the range in the parent namespace. FirstParentID uint32 // Length is the number of IDs in the range. Length uint32 } // SetUIDMap instructs ns to translate UIDs as specified by entries. // // Note: SetUIDMap does not place an upper bound on the number of entries, but // Linux does. This restriction is implemented in SetUIDMap's caller, the // implementation of /proc/[pid]/uid_map. func (ns *UserNamespace) SetUIDMap(ctx context.Context, entries []IDMapEntry) error { c := CredentialsFromContext(ctx) ns.mu.Lock() defer ns.mu.Unlock() // "After the creation of a new user namespace, the uid_map file of *one* // of the processes in the namespace may be written to *once* to define the // mapping of user IDs in the new user namespace. An attempt to write more // than once to a uid_map file in a user namespace fails with the error // EPERM. Similar rules apply for gid_map files." - user_namespaces(7) if !ns.uidMapFromParent.IsEmpty() { return linuxerr.EPERM } // "At least one line must be written to the file." if len(entries) == 0 { return linuxerr.EINVAL } // """ // In order for a process to write to the /proc/[pid]/uid_map // (/proc/[pid]/gid_map) file, all of the following requirements must be // met: // // 1. The writing process must have the CAP_SETUID (CAP_SETGID) capability // in the user namespace of the process pid. // """ if !c.HasCapabilityIn(linux.CAP_SETUID, ns) { return linuxerr.EPERM } // "2. The writing process must either be in the user namespace of the process // pid or be in the parent user namespace of the process pid." if c.UserNamespace != ns && c.UserNamespace != ns.parent { return linuxerr.EPERM } // """ // 3. (see trySetUIDMap) // // 4. One of the following two cases applies: // // * Either the writing process has the CAP_SETUID (CAP_SETGID) capability // in the parent user namespace. // """ if !c.HasCapabilityIn(linux.CAP_SETUID, ns.parent) { // """ // * Or otherwise all of the following restrictions apply: // // + The data written to uid_map (gid_map) must consist of a single line // that maps the writing process' effective user ID (group ID) in the // parent user namespace to a user ID (group ID) in the user namespace. // """ if len(entries) != 1 || ns.parent.MapToKUID(UID(entries[0].FirstParentID)) != c.EffectiveKUID || entries[0].Length != 1 { return linuxerr.EPERM } // """ // + The writing process must have the same effective user ID as the // process that created the user namespace. // """ if c.EffectiveKUID != ns.owner { return linuxerr.EPERM } } // trySetUIDMap leaves data in maps if it fails. if err := ns.trySetUIDMap(entries); err != nil { ns.uidMapFromParent.RemoveAll() ns.uidMapToParent.RemoveAll() return err } return nil } func (ns *UserNamespace) trySetUIDMap(entries []IDMapEntry) error { for _, e := range entries { // Determine upper bounds and check for overflow. This implicitly // checks for NoID. lastID := e.FirstID + e.Length if lastID <= e.FirstID { return linuxerr.EINVAL } lastParentID := e.FirstParentID + e.Length if lastParentID <= e.FirstParentID { return linuxerr.EINVAL } // "3. The mapped user IDs (group IDs) must in turn have a mapping in // the parent user namespace." // Only the root namespace has a nil parent, and root is assigned // mappings when it's created, so SetUIDMap would have returned EPERM // without reaching this point if ns is root. 
if !ns.parent.allIDsMapped(&ns.parent.uidMapToParent, e.FirstParentID, lastParentID) { return linuxerr.EPERM } // If either of these Adds fail, we have an overlapping range. if !ns.uidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { return linuxerr.EINVAL } if !ns.uidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { return linuxerr.EINVAL } } return nil } // SetGIDMap instructs ns to translate GIDs as specified by entries. func (ns *UserNamespace) SetGIDMap(ctx context.Context, entries []IDMapEntry) error { c := CredentialsFromContext(ctx) ns.mu.Lock() defer ns.mu.Unlock() if !ns.gidMapFromParent.IsEmpty() { return linuxerr.EPERM } if len(entries) == 0 { return linuxerr.EINVAL } if !c.HasCapabilityIn(linux.CAP_SETGID, ns) { return linuxerr.EPERM } if c.UserNamespace != ns && c.UserNamespace != ns.parent { return linuxerr.EPERM } if !c.HasCapabilityIn(linux.CAP_SETGID, ns.parent) { if len(entries) != 1 || ns.parent.MapToKGID(GID(entries[0].FirstParentID)) != c.EffectiveKGID || entries[0].Length != 1 { return linuxerr.EPERM } // It's correct for this to still be UID. if c.EffectiveKUID != ns.owner { return linuxerr.EPERM } // "In the case of gid_map, use of the setgroups(2) system call must // first be denied by writing "deny" to the /proc/[pid]/setgroups file // (see below) before writing to gid_map." (This file isn't implemented // in the version of Linux we're emulating; see comment in // UserNamespace.) } if err := ns.trySetGIDMap(entries); err != nil { ns.gidMapFromParent.RemoveAll() ns.gidMapToParent.RemoveAll() return err } return nil } func (ns *UserNamespace) trySetGIDMap(entries []IDMapEntry) error { for _, e := range entries { lastID := e.FirstID + e.Length if lastID <= e.FirstID { return linuxerr.EINVAL } lastParentID := e.FirstParentID + e.Length if lastParentID <= e.FirstParentID { return linuxerr.EINVAL } if !ns.parent.allIDsMapped(&ns.parent.gidMapToParent, e.FirstParentID, lastParentID) { return linuxerr.EPERM } if !ns.gidMapFromParent.Add(idMapRange{e.FirstParentID, lastParentID}, e.FirstID) { return linuxerr.EINVAL } if !ns.gidMapToParent.Add(idMapRange{e.FirstID, lastID}, e.FirstParentID) { return linuxerr.EINVAL } } return nil } // UIDMap returns the user ID mappings configured for ns. If no mappings // have been configured, UIDMap returns nil. func (ns *UserNamespace) UIDMap() []IDMapEntry { return ns.getIDMap(&ns.uidMapToParent) } // GIDMap returns the group ID mappings configured for ns. If no mappings // have been configured, GIDMap returns nil. func (ns *UserNamespace) GIDMap() []IDMapEntry { return ns.getIDMap(&ns.gidMapToParent) } func (ns *UserNamespace) getIDMap(m *idMapSet) []IDMapEntry { ns.mu.Lock() defer ns.mu.Unlock() var entries []IDMapEntry for it := m.FirstSegment(); it.Ok(); it = it.NextSegment() { entries = append(entries, IDMapEntry{ FirstID: it.Start(), FirstParentID: it.Value(), Length: it.Range().Length(), }) } return entries }
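// Editor's note: a standalone sketch (added, not from the original source) of
// the range arithmetic behind mapID above: an IDMapEntry maps a contiguous
// run of IDs by a constant offset, so translation is FirstID + (id -
// FirstParentID) once the entry covering id is found. Types and names below
// are hypothetical stand-ins for the idMapSet machinery.
package main

import "fmt"

type idMapEntry struct {
	firstID       uint32 // first ID inside the namespace
	firstParentID uint32 // first ID in the parent namespace
	length        uint32
}

// mapFromParent translates a parent-namespace ID, returning ok=false when no
// entry covers it (the analog of returning NoID).
func mapFromParent(entries []idMapEntry, id uint32) (uint32, bool) {
	for _, e := range entries {
		if id >= e.firstParentID && id-e.firstParentID < e.length {
			return e.firstID + (id - e.firstParentID), true
		}
	}
	return 0, false
}

func main() {
	// A typical single-entry map: uid 1000 in the parent becomes root.
	m := []idMapEntry{{firstID: 0, firstParentID: 1000, length: 1}}
	fmt.Println(mapFromParent(m, 1000)) // 0 true
	fmt.Println(mapFromParent(m, 1001)) // 0 false
}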
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package usermem governs access to user memory.
package usermem

import (
	"bytes"
	"errors"
	"io"
	"strconv"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/gohacks"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/safemem"
)

// IO provides access to the contents of a virtual memory space.
type IO interface {
	// CopyOut copies len(src) bytes from src to the memory mapped at addr. It
	// returns the number of bytes copied. If the number of bytes copied is <
	// len(src), it returns a non-nil error explaining why.
// // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or // any following locks in the lock order. // // Postconditions: CopyOut does not retain src. CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts IOOpts) (int, error) // CopyIn copies len(dst) bytes from the memory mapped at addr to dst. // It returns the number of bytes copied. If the number of bytes copied is // < len(dst), it returns a non-nil error explaining why. // // Preconditions: The caller must not hold mm.MemoryManager.mappingMu or // any following locks in the lock order. // // Postconditions: CopyIn does not retain dst. CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts IOOpts) (int, error) // ZeroOut sets toZero bytes to 0, starting at addr. It returns the number // of bytes zeroed. If the number of bytes zeroed is < toZero, it returns a // non-nil error explaining why. // // Preconditions: // * The caller must not hold mm.MemoryManager.mappingMu or any // following locks in the lock order. // * toZero >= 0. ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts IOOpts) (int64, error) // CopyOutFrom copies ars.NumBytes() bytes from src to the memory mapped at // ars. It returns the number of bytes copied, which may be less than the // number of bytes read from src if copying fails. CopyOutFrom may return a // partial copy without an error iff src.ReadToBlocks returns a partial // read without an error. // // CopyOutFrom calls src.ReadToBlocks at most once. // // Preconditions: // * The caller must not hold mm.MemoryManager.mappingMu or any // following locks in the lock order. // * src.ReadToBlocks must not block on mm.MemoryManager.activeMu or // any preceding locks in the lock order. CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) // CopyInTo copies ars.NumBytes() bytes from the memory mapped at ars to // dst. It returns the number of bytes copied. CopyInTo may return a // partial copy without an error iff dst.WriteFromBlocks returns a partial // write without an error. // // CopyInTo calls dst.WriteFromBlocks at most once. // // Preconditions: // * The caller must not hold mm.MemoryManager.mappingMu or any // following locks in the lock order. // * dst.WriteFromBlocks must not block on mm.MemoryManager.activeMu or // any preceding locks in the lock order. CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) // TODO(jamieliu): The requirement that CopyOutFrom/CopyInTo call src/dst // at most once, which is unnecessary in most cases, forces implementations // to gather safemem.Blocks into a single slice to pass to src/dst. Add // CopyOutFromIter/CopyInToIter, which relaxes this restriction, to avoid // this allocation. // SwapUint32 atomically sets the uint32 value at addr to new and // returns the previous value. // // Preconditions: // * The caller must not hold mm.MemoryManager.mappingMu or any // following locks in the lock order. // * addr must be aligned to a 4-byte boundary. SwapUint32(ctx context.Context, addr hostarch.Addr, new uint32, opts IOOpts) (uint32, error) // CompareAndSwapUint32 atomically compares the uint32 value at addr to // old; if they are equal, the value in memory is replaced by new. In // either case, the previous value stored in memory is returned. // // Preconditions: // * The caller must not hold mm.MemoryManager.mappingMu or any // following locks in the lock order. // * addr must be aligned to a 4-byte boundary. 
	CompareAndSwapUint32(ctx context.Context, addr hostarch.Addr, old, new uint32, opts IOOpts) (uint32, error)

	// LoadUint32 atomically loads the uint32 value at addr and returns it.
	//
	// Preconditions:
	// * The caller must not hold mm.MemoryManager.mappingMu or any
	//   following locks in the lock order.
	// * addr must be aligned to a 4-byte boundary.
	LoadUint32(ctx context.Context, addr hostarch.Addr, opts IOOpts) (uint32, error)
}

// IOOpts contains options applicable to all IO methods.
type IOOpts struct {
	// If IgnorePermissions is true, application-defined memory protections set
	// by mmap(2) or mprotect(2) will be ignored. (Memory protections required
	// by the target of the mapping are never ignored.)
	IgnorePermissions bool

	// If AddressSpaceActive is true, the IO implementation may assume that it
	// has an active AddressSpace and can therefore use AddressSpace copying
	// without performing activation. See mm/io.go for details.
	AddressSpaceActive bool
}

// IOReadWriter is an io.ReadWriter that reads from / writes to addresses
// starting at addr in IO. The preconditions that apply to IO.CopyIn and
// IO.CopyOut also apply to IOReadWriter.Read and IOReadWriter.Write
// respectively.
type IOReadWriter struct {
	Ctx  context.Context
	IO   IO
	Addr hostarch.Addr
	Opts IOOpts
}

// Read implements io.Reader.Read.
//
// Note that an address space does not have an "end of file", so Read can only
// return io.EOF if IO.CopyIn returns io.EOF. Attempts to read unmapped or
// unreadable memory, or beyond the end of the address space, should return
// EFAULT.
func (rw *IOReadWriter) Read(dst []byte) (int, error) {
	n, err := rw.IO.CopyIn(rw.Ctx, rw.Addr, dst, rw.Opts)
	end, ok := rw.Addr.AddLength(uint64(n))
	if ok {
		rw.Addr = end
	} else {
		// Disallow wraparound.
		rw.Addr = ^hostarch.Addr(0)
		if err != nil {
			err = linuxerr.EFAULT
		}
	}
	return n, err
}

// Write implements io.Writer.Write.
func (rw *IOReadWriter) Write(src []byte) (int, error) {
	n, err := rw.IO.CopyOut(rw.Ctx, rw.Addr, src, rw.Opts)
	end, ok := rw.Addr.AddLength(uint64(n))
	if ok {
		rw.Addr = end
	} else {
		// Disallow wraparound.
		rw.Addr = ^hostarch.Addr(0)
		if err != nil {
			err = linuxerr.EFAULT
		}
	}
	return n, err
}

// CopyStringIn tuning parameters, defined outside that function for tests.
const (
	copyStringIncrement     = 64
	copyStringMaxInitBufLen = 256
)

// CopyStringIn copies a NUL-terminated string of unknown length from the
// memory mapped at addr in uio and returns it as a string (not including the
// trailing NUL). If the length of the string, including the terminating NUL,
// would exceed maxlen, CopyStringIn returns the string truncated to maxlen and
// ENAMETOOLONG.
//
// Preconditions: Same as IO.CopyIn, plus:
// * maxlen >= 0.
func CopyStringIn(ctx context.Context, uio IO, addr hostarch.Addr, maxlen int, opts IOOpts) (string, error) {
	initLen := maxlen
	if initLen > copyStringMaxInitBufLen {
		initLen = copyStringMaxInitBufLen
	}
	buf := make([]byte, initLen)
	var done int
	for done < maxlen {
		// Read up to copyStringIncrement bytes at a time.
		readlen := copyStringIncrement
		if readlen > maxlen-done {
			readlen = maxlen - done
		}
		end, ok := addr.AddLength(uint64(readlen))
		if !ok {
			return gohacks.StringFromImmutableBytes(buf[:done]), linuxerr.EFAULT
		}
		// Shorten the read to avoid crossing page boundaries, since faulting
		// in a page unnecessarily is expensive. This also ensures that partial
		// copies up to the end of application-mappable memory succeed.
		if addr.RoundDown() != end.RoundDown() {
			end = end.RoundDown()
			readlen = int(end - addr)
		}
		// Ensure that our buffer is large enough to accommodate the read.
		if done+readlen > len(buf) {
			newBufLen := len(buf) * 2
			if newBufLen > maxlen {
				newBufLen = maxlen
			}
			buf = append(buf, make([]byte, newBufLen-len(buf))...)
		}
		n, err := uio.CopyIn(ctx, addr, buf[done:done+readlen], opts)
		// Look for the terminating zero byte, which may have occurred before
		// hitting err.
		if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 {
			return gohacks.StringFromImmutableBytes(buf[:done+i]), nil
		}
		done += n
		if err != nil {
			return gohacks.StringFromImmutableBytes(buf[:done]), err
		}
		addr = end
	}
	return gohacks.StringFromImmutableBytes(buf), linuxerr.ENAMETOOLONG
}

// CopyOutVec copies bytes from src to the memory mapped at ars in uio. The
// maximum number of bytes copied is ars.NumBytes() or len(src), whichever is
// less. CopyOutVec returns the number of bytes copied; if this is less than
// the maximum, it returns a non-nil error explaining why.
//
// Preconditions: Same as IO.CopyOut.
func CopyOutVec(ctx context.Context, uio IO, ars hostarch.AddrRangeSeq, src []byte, opts IOOpts) (int, error) {
	var done int
	for !ars.IsEmpty() && done < len(src) {
		ar := ars.Head()
		cplen := len(src) - done
		if hostarch.Addr(cplen) >= ar.Length() {
			cplen = int(ar.Length())
		}
		n, err := uio.CopyOut(ctx, ar.Start, src[done:done+cplen], opts)
		done += n
		if err != nil {
			return done, err
		}
		ars = ars.DropFirst(n)
	}
	return done, nil
}

// CopyInVec copies bytes from the memory mapped at ars in uio to dst. The
// maximum number of bytes copied is ars.NumBytes() or len(dst), whichever is
// less. CopyInVec returns the number of bytes copied; if this is less than the
// maximum, it returns a non-nil error explaining why.
//
// Preconditions: Same as IO.CopyIn.
func CopyInVec(ctx context.Context, uio IO, ars hostarch.AddrRangeSeq, dst []byte, opts IOOpts) (int, error) {
	var done int
	for !ars.IsEmpty() && done < len(dst) {
		ar := ars.Head()
		cplen := len(dst) - done
		if hostarch.Addr(cplen) >= ar.Length() {
			cplen = int(ar.Length())
		}
		n, err := uio.CopyIn(ctx, ar.Start, dst[done:done+cplen], opts)
		done += n
		if err != nil {
			return done, err
		}
		ars = ars.DropFirst(n)
	}
	return done, nil
}

// ZeroOutVec writes zeroes to the memory mapped at ars in uio. The maximum
// number of bytes written is ars.NumBytes() or toZero, whichever is less.
// ZeroOutVec returns the number of bytes written; if this is less than the
// maximum, it returns a non-nil error explaining why.
//
// Preconditions: Same as IO.ZeroOut.
func ZeroOutVec(ctx context.Context, uio IO, ars hostarch.AddrRangeSeq, toZero int64, opts IOOpts) (int64, error) {
	var done int64
	for !ars.IsEmpty() && done < toZero {
		ar := ars.Head()
		cplen := toZero - done
		if hostarch.Addr(cplen) >= ar.Length() {
			cplen = int64(ar.Length())
		}
		n, err := uio.ZeroOut(ctx, ar.Start, cplen, opts)
		done += n
		if err != nil {
			return done, err
		}
		ars = ars.DropFirst64(n)
	}
	return done, nil
}

func isASCIIWhitespace(b byte) bool {
	// Compare Linux include/linux/ctype.h, lib/ctype.c.
	//  9 => horizontal tab '\t'
	// 10 => line feed '\n'
	// 11 => vertical tab '\v'
	// 12 => form feed '\f'
	// 13 => carriage return '\r'
	return b == ' ' || (b >= 9 && b <= 13)
}

// CopyInt32StringsInVec copies up to len(dsts) whitespace-separated decimal
// strings from the memory mapped at ars in uio and converts them to int32
// values in dsts. It returns the number of bytes read.
// // CopyInt32StringsInVec shares the following properties with Linux's // kernel/sysctl.c:proc_dointvec(write=1): // // - If any read value overflows the range of int32, or any invalid characters // are encountered during the read, CopyInt32StringsInVec returns EINVAL. // // - If, upon reaching the end of ars, fewer than len(dsts) values have been // read, CopyInt32StringsInVec returns no error if at least 1 value was read // and EINVAL otherwise. // // - Trailing whitespace after the last successfully read value is counted in // the number of bytes read. // // Unlike proc_dointvec(): // // - CopyInt32StringsInVec does not implicitly limit ars.NumBytes() to // PageSize-1; callers that require this must do so explicitly. // // - CopyInt32StringsInVec returns EINVAL if ars.NumBytes() == 0. // // Preconditions: Same as CopyInVec. func CopyInt32StringsInVec(ctx context.Context, uio IO, ars hostarch.AddrRangeSeq, dsts []int32, opts IOOpts) (int64, error) { if len(dsts) == 0 { return 0, nil } buf := make([]byte, ars.NumBytes()) n, cperr := CopyInVec(ctx, uio, ars, buf, opts) buf = buf[:n] var i, j int for ; j < len(dsts); j++ { // Skip leading whitespace. for i < len(buf) && isASCIIWhitespace(buf[i]) { i++ } if i == len(buf) { break } // Find the end of the value to be parsed (next whitespace or end of string). nextI := i + 1 for nextI < len(buf) && !isASCIIWhitespace(buf[nextI]) { nextI++ } // Parse a single value. val, err := strconv.ParseInt(string(buf[i:nextI]), 10, 32) if err != nil { return int64(i), linuxerr.EINVAL } dsts[j] = int32(val) i = nextI } // Skip trailing whitespace. for i < len(buf) && isASCIIWhitespace(buf[i]) { i++ } if cperr != nil { return int64(i), cperr } if j == 0 { return int64(i), linuxerr.EINVAL } return int64(i), nil } // CopyInt32StringInVec is equivalent to CopyInt32StringsInVec, but copies at // most one int32. func CopyInt32StringInVec(ctx context.Context, uio IO, ars hostarch.AddrRangeSeq, dst *int32, opts IOOpts) (int64, error) { dsts := [1]int32{*dst} n, err := CopyInt32StringsInVec(ctx, uio, ars, dsts[:], opts) *dst = dsts[0] return n, err } // IOSequence holds arguments to IO methods. type IOSequence struct { IO IO Addrs hostarch.AddrRangeSeq Opts IOOpts } // NumBytes returns s.Addrs.NumBytes(). // // Note that NumBytes() may return 0 even if !s.Addrs.IsEmpty(), since // s.Addrs may contain a non-zero number of zero-length AddrRanges. // Many clients of // IOSequence currently do something like: // // if ioseq.NumBytes() == 0 { // return 0, nil // } // if f.availableBytes == 0 { // return 0, syserror.ErrWouldBlock // } // return ioseq.CopyOutFrom(..., reader) // // In such cases, using s.Addrs.IsEmpty() will cause them to have the wrong // behavior for zero-length I/O. However, using s.NumBytes() == 0 instead means // that we will return success for zero-length I/O in cases where Linux would // return EFAULT due to a failed access_ok() check, so in the long term we // should move checks for ErrWouldBlock etc. into the body of // reader.ReadToBlocks and use s.Addrs.IsEmpty() instead. func (s IOSequence) NumBytes() int64 { return s.Addrs.NumBytes() } // DropFirst returns a copy of s with s.Addrs.DropFirst(n). // // Preconditions: Same as hostarch.AddrRangeSeq.DropFirst. func (s IOSequence) DropFirst(n int) IOSequence { return IOSequence{s.IO, s.Addrs.DropFirst(n), s.Opts} } // DropFirst64 returns a copy of s with s.Addrs.DropFirst64(n). // // Preconditions: Same as hostarch.AddrRangeSeq.DropFirst64. 
func (s IOSequence) DropFirst64(n int64) IOSequence { return IOSequence{s.IO, s.Addrs.DropFirst64(n), s.Opts} } // TakeFirst returns a copy of s with s.Addrs.TakeFirst(n). // // Preconditions: Same as hostarch.AddrRangeSeq.TakeFirst. func (s IOSequence) TakeFirst(n int) IOSequence { return IOSequence{s.IO, s.Addrs.TakeFirst(n), s.Opts} } // TakeFirst64 returns a copy of s with s.Addrs.TakeFirst64(n). // // Preconditions: Same as hostarch.AddrRangeSeq.TakeFirst64. func (s IOSequence) TakeFirst64(n int64) IOSequence { return IOSequence{s.IO, s.Addrs.TakeFirst64(n), s.Opts} } // CopyOut invokes CopyOutVec over s.Addrs. // // As with CopyOutVec, if s.NumBytes() < len(src), the copy will be truncated // to s.NumBytes(), and a nil error will be returned. // // Preconditions: Same as CopyOutVec. func (s IOSequence) CopyOut(ctx context.Context, src []byte) (int, error) { return CopyOutVec(ctx, s.IO, s.Addrs, src, s.Opts) } // CopyIn invokes CopyInVec over s.Addrs. // // As with CopyInVec, if s.NumBytes() < len(dst), the copy will be truncated to // s.NumBytes(), and a nil error will be returned. // // Preconditions: Same as CopyInVec. func (s IOSequence) CopyIn(ctx context.Context, dst []byte) (int, error) { return CopyInVec(ctx, s.IO, s.Addrs, dst, s.Opts) } // ZeroOut invokes ZeroOutVec over s.Addrs. // // As with ZeroOutVec, if s.NumBytes() < toZero, the write will be truncated // to s.NumBytes(), and a nil error will be returned. // // Preconditions: Same as ZeroOutVec. func (s IOSequence) ZeroOut(ctx context.Context, toZero int64) (int64, error) { return ZeroOutVec(ctx, s.IO, s.Addrs, toZero, s.Opts) } // CopyOutFrom invokes s.CopyOutFrom over s.Addrs. // // Preconditions: Same as IO.CopyOutFrom. func (s IOSequence) CopyOutFrom(ctx context.Context, src safemem.Reader) (int64, error) { return s.IO.CopyOutFrom(ctx, s.Addrs, src, s.Opts) } // CopyInTo invokes s.CopyInTo over s.Addrs. // // Preconditions: Same as IO.CopyInTo. func (s IOSequence) CopyInTo(ctx context.Context, dst safemem.Writer) (int64, error) { return s.IO.CopyInTo(ctx, s.Addrs, dst, s.Opts) } // Reader returns an io.Reader that reads from s. Reads beyond the end of s // return io.EOF. The preconditions that apply to s.CopyIn also apply to the // returned io.Reader.Read. func (s IOSequence) Reader(ctx context.Context) *IOSequenceReadWriter { return &IOSequenceReadWriter{ctx, s} } // Writer returns an io.Writer that writes to s. Writes beyond the end of s // return ErrEndOfIOSequence. The preconditions that apply to s.CopyOut also // apply to the returned io.Writer.Write. func (s IOSequence) Writer(ctx context.Context) *IOSequenceReadWriter { return &IOSequenceReadWriter{ctx, s} } // ErrEndOfIOSequence is returned by IOSequence.Writer().Write() when // attempting to write beyond the end of the IOSequence. var ErrEndOfIOSequence = errors.New("write beyond end of IOSequence") // IOSequenceReadWriter implements io.Reader and io.Writer for an IOSequence. type IOSequenceReadWriter struct { ctx context.Context s IOSequence } // Read implements io.Reader.Read. func (rw *IOSequenceReadWriter) Read(dst []byte) (int, error) { n, err := rw.s.CopyIn(rw.ctx, dst) rw.s = rw.s.DropFirst(n) if err == nil && rw.s.NumBytes() == 0 { err = io.EOF } return n, err } // Len implements tcpip.Payloader. func (rw *IOSequenceReadWriter) Len() int { return int(rw.s.NumBytes()) } // Write implements io.Writer.Write. 
func (rw *IOSequenceReadWriter) Write(src []byte) (int, error) { n, err := rw.s.CopyOut(rw.ctx, src) rw.s = rw.s.DropFirst(n) if err == nil && n < len(src) { err = ErrEndOfIOSequence } return n, err }
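// Editorial example (not part of the original gVisor source): a minimal,
// self-contained model of the CopyStringIn loop above, reading a
// NUL-terminated string out of a plain byte slice instead of a usermem.IO.
// The chunked reads and the ENAMETOOLONG truncation contract mirror the real
// function; page-boundary shortening and the gohacks/linuxerr plumbing are
// omitted. All names here (copyStringInModel, errNameTooLong, errFault) are
// illustrative, not part of the package.
package main

import (
	"bytes"
	"errors"
	"fmt"
)

var (
	errNameTooLong = errors.New("ENAMETOOLONG") // stand-in for linuxerr.ENAMETOOLONG
	errFault       = errors.New("EFAULT")       // stand-in for linuxerr.EFAULT
)

func copyStringInModel(mem []byte, addr, maxlen int) (string, error) {
	const increment = 64 // analogous to copyStringIncrement
	var buf []byte
	for done := 0; done < maxlen; {
		readlen := increment
		if readlen > maxlen-done {
			readlen = maxlen - done
		}
		if addr+readlen > len(mem) {
			readlen = len(mem) - addr
			if readlen <= 0 {
				return string(buf), errFault // ran off the "address space"
			}
		}
		chunk := mem[addr : addr+readlen]
		// Look for the terminating zero byte, as the real loop does.
		if i := bytes.IndexByte(chunk, 0); i >= 0 {
			return string(append(buf, chunk[:i]...)), nil
		}
		buf = append(buf, chunk...)
		done += readlen
		addr += readlen
	}
	return string(buf), errNameTooLong
}

func main() {
	mem := append([]byte("hello"), 0)
	fmt.Println(copyStringInModel(mem, 0, 256)) // "hello" <nil>
	fmt.Println(copyStringInModel(mem, 0, 3))   // "hel" ENAMETOOLONG
}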
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package hash contains utility functions for hashing.
package hash

import (
	"encoding/binary"

	"gvisor.dev/gvisor/pkg/rand"
	"gvisor.dev/gvisor/pkg/tcpip/header"
)

var hashIV = RandN32(1)[0]

// RandN32 generates a slice of n cryptographically random 32-bit numbers.
func RandN32(n int) []uint32 {
	b := make([]byte, 4*n)
	if _, err := rand.Read(b); err != nil {
		panic("unable to get random numbers: " + err.Error())
	}
	r := make([]uint32, n)
	for i := range r {
		r[i] = binary.LittleEndian.Uint32(b[4*i : (4*i + 4)])
	}
	return r
}

// Hash3Words calculates the Jenkins hash of 3 32-bit words. This is adapted
// from Linux.
func Hash3Words(a, b, c, initval uint32) uint32 {
	const iv = 0xdeadbeef + (3 << 2)
	initval += iv

	a += initval
	b += initval
	c += initval

	c ^= b
	c -= rol32(b, 14)
	a ^= c
	a -= rol32(c, 11)
	b ^= a
	b -= rol32(a, 25)
	c ^= b
	c -= rol32(b, 16)
	a ^= c
	a -= rol32(c, 4)
	b ^= a
	b -= rol32(a, 14)
	c ^= b
	c -= rol32(b, 24)

	return c
}

// IPv4FragmentHash computes the hash of the IPv4 fragment as suggested in RFC 791.
func IPv4FragmentHash(h header.IPv4) uint32 {
	x := uint32(h.ID())<<16 | uint32(h.Protocol())
	t := h.SourceAddress()
	y := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24
	t = h.DestinationAddress()
	z := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24
	return Hash3Words(x, y, z, hashIV)
}

// IPv6FragmentHash computes the hash of the IPv6 fragment.
// Unlike IPv4, the protocol is not used to compute the hash.
// RFC 2460 (section 4.5) is not very sharp on this aspect.
// As a reference, Linux also ignores the protocol when computing
// the hash (inet6_hash_frag).
func IPv6FragmentHash(h header.IPv6, id uint32) uint32 {
	t := h.SourceAddress()
	y := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24
	t = h.DestinationAddress()
	z := uint32(t[0]) | uint32(t[1])<<8 | uint32(t[2])<<16 | uint32(t[3])<<24
	return Hash3Words(id, y, z, hashIV)
}

func rol32(v, shift uint32) uint32 {
	return (v << shift) | (v >> ((-shift) & 31))
}
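// Editorial example (not part of the original gVisor source): driving the
// exported helpers above directly, assuming this package's in-tree import
// path. The word layout (ID<<16|protocol, then the two addresses) is the one
// IPv4FragmentHash uses; the concrete values here are arbitrary stand-ins.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip/hash"
)

func main() {
	// Seed an IV the same way the package-level hashIV is produced.
	iv := hash.RandN32(1)[0]

	x := uint32(0x1234)<<16 | 17 // fragment ID 0x1234, protocol 17 (UDP)
	y := uint32(0x0100000a)      // source 10.0.0.1, packed little-endian
	z := uint32(0x0200000a)      // destination 10.0.0.2, packed little-endian
	fmt.Printf("reassembly bucket: %#08x\n", hash.Hash3Words(x, y, z, iv))
}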
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"unsafe"

	"gvisor.dev/gvisor/pkg/safemem"
)

// countBlock provides a safemem.BlockSeq for kcov.count.
//
// Like kcov.count, the block returned is protected by kcov.mu.
func (kcov *Kcov) countBlock() safemem.BlockSeq {
	return safemem.BlockSeqOf(safemem.BlockFromSafePointer(unsafe.Pointer(&kcov.count), int(unsafe.Sizeof(kcov.count))))
}
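// Editorial example (not part of the original gVisor source): the same
// pattern countBlock uses, wrapping an ordinary integer so safemem copies can
// target it. BlockFromSafeSlice and CopySeq are existing safemem helpers; the
// expected output assumes a little-endian host.
package main

import (
	"fmt"
	"unsafe"

	"gvisor.dev/gvisor/pkg/safemem"
)

func main() {
	var counter uint64
	// Wrap counter's storage as a single-block sequence, as countBlock does.
	dst := safemem.BlockSeqOf(safemem.BlockFromSafePointer(unsafe.Pointer(&counter), int(unsafe.Sizeof(counter))))
	src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice([]byte{42, 0, 0, 0, 0, 0, 0, 0}))
	if _, err := safemem.CopySeq(dst, src); err != nil {
		panic(err)
	}
	fmt.Println(counter) // 42 on a little-endian host
}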
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/platform"
)

// AddressSpace returns the platform.AddressSpace bound to mm.
//
// Preconditions: The caller must have called mm.Activate().
func (mm *MemoryManager) AddressSpace() platform.AddressSpace {
	if atomic.LoadInt32(&mm.active) == 0 {
		panic("trying to use inactive address space?")
	}
	return mm.as
}

// Activate ensures this MemoryManager has a platform.AddressSpace.
//
// The caller must not hold any locks when calling Activate.
//
// When this MemoryManager is no longer needed by a task, it should call
// Deactivate to release the reference.
func (mm *MemoryManager) Activate(ctx context.Context) error {
	// Fast path: the MemoryManager already has an active
	// platform.AddressSpace, and we just need to indicate that we need it too.
	for {
		active := atomic.LoadInt32(&mm.active)
		if active == 0 {
			// Fall back to the slow path.
			break
		}
		if atomic.CompareAndSwapInt32(&mm.active, active, active+1) {
			return nil
		}
	}

	for {
		// Slow path: may need to synchronize with other goroutines changing
		// mm.active to or from zero.
		mm.activeMu.Lock()
		// Inline Unlock instead of using a defer for performance, since this
		// method is commonly in the hot path.

		// Check if we raced with another goroutine performing activation.
		if atomic.LoadInt32(&mm.active) > 0 {
			// This can't race; Deactivate can't decrease mm.active from 1 to 0
			// without holding activeMu.
			atomic.AddInt32(&mm.active, 1)
			mm.activeMu.Unlock()
			return nil
		}

		// Do we have a context? If so, then we never unmapped it. This can
		// only be the case if !mm.p.CooperativelySchedulesAddressSpace().
		if mm.as != nil {
			atomic.StoreInt32(&mm.active, 1)
			mm.activeMu.Unlock()
			return nil
		}

		// Get a new address space. We must force unmapping by passing nil to
		// NewAddressSpace if requested. (As in the nil interface object, not a
		// typed nil.)
mappingsID := (interface{})(mm) if mm.unmapAllOnActivate { mappingsID = nil } as, c, err := mm.p.NewAddressSpace(mappingsID) if err != nil { mm.activeMu.Unlock() return err } if as == nil { // AddressSpace is unavailable, we must wait. // // activeMu must not be held while waiting, as the user of the address // space we are waiting on may attempt to take activeMu. mm.activeMu.Unlock() sleep := mm.p.CooperativelySchedulesAddressSpace() && mm.sleepForActivation if sleep { // Mark this task sleeping while waiting for the address space to // prevent the watchdog from reporting it as a stuck task. ctx.UninterruptibleSleepStart(false) } <-c if sleep { ctx.UninterruptibleSleepFinish(false) } continue } // Okay, we could restore all mappings at this point. // But forget that. Let's just let them fault in. mm.as = as // Unmapping is done, if necessary. mm.unmapAllOnActivate = false // Now that m.as has been assigned, we can set m.active to a non-zero value // to enable the fast path. atomic.StoreInt32(&mm.active, 1) mm.activeMu.Unlock() return nil } } // Deactivate releases a reference to the MemoryManager. func (mm *MemoryManager) Deactivate() { // Fast path: this is not the last goroutine to deactivate the // MemoryManager. for { active := atomic.LoadInt32(&mm.active) if active == 1 { // Fall back to the slow path. break } if atomic.CompareAndSwapInt32(&mm.active, active, active-1) { return } } mm.activeMu.Lock() // Same as Activate. // Still active? if atomic.AddInt32(&mm.active, -1) > 0 { mm.activeMu.Unlock() return } // Can we hold on to the address space? if !mm.p.CooperativelySchedulesAddressSpace() { mm.activeMu.Unlock() return } // Release the address space. mm.as.Release() // Lost it. mm.as = nil mm.activeMu.Unlock() } // mapASLocked maps addresses in ar into mm.as. If precommit is true, mappings // for all addresses in ar should be precommitted. // // Preconditions: // * mm.activeMu must be locked. // * mm.as != nil. // * ar.Length() != 0. // * ar must be page-aligned. // * pseg == mm.pmas.LowerBoundSegment(ar.Start). func (mm *MemoryManager) mapASLocked(pseg pmaIterator, ar hostarch.AddrRange, precommit bool) error { // By default, map entire pmas at a time, under the assumption that there // is no cost to mapping more of a pma than necessary. mapAR := hostarch.AddrRange{0, ^hostarch.Addr(hostarch.PageSize - 1)} if precommit { // When explicitly precommitting, only map ar, since overmapping may // incur unexpected resource usage. mapAR = ar } else if mapUnit := mm.p.MapUnit(); mapUnit != 0 { // Limit the range we map to ar, aligned to mapUnit. mapMask := hostarch.Addr(mapUnit - 1) mapAR.Start = ar.Start &^ mapMask // If rounding ar.End up overflows, just keep the existing mapAR.End. if end := (ar.End + mapMask) &^ mapMask; end >= ar.End { mapAR.End = end } } if checkInvariants { if !mapAR.IsSupersetOf(ar) { panic(fmt.Sprintf("mapAR %#v is not a superset of ar %#v", mapAR, ar)) } } // Since this checks ar.End and not mapAR.End, we will never map a pma that // is not required. for pseg.Ok() && pseg.Start() < ar.End { pma := pseg.ValuePtr() pmaAR := pseg.Range() pmaMapAR := pmaAR.Intersect(mapAR) perms := pma.effectivePerms if pma.needCOW { perms.Write = false } if perms.Any() { // MapFile precondition if err := mm.as.MapFile(pmaMapAR.Start, pma.file, pseg.fileRangeOf(pmaMapAR), perms, precommit); err != nil { return err } } pseg = pseg.NextSegment() } return nil } // unmapASLocked removes all AddressSpace mappings for addresses in ar. // // Preconditions: mm.activeMu must be locked. 
func (mm *MemoryManager) unmapASLocked(ar hostarch.AddrRange) { if mm.as == nil { // No AddressSpace? Force all mappings to be unmapped on the next // Activate. mm.unmapAllOnActivate = true return } // unmapASLocked doesn't require vmas or pmas to exist for ar, so it can be // passed ranges that include addresses that can't be mapped by the // application. ar = ar.Intersect(mm.applicationAddrRange()) // Note that this AddressSpace may or may not be active. If the // platform does not require cooperative sharing of AddressSpaces, they // are retained between Deactivate/Activate calls. Despite not being // active, it is still valid to perform operations on these address // spaces. mm.as.Unmap(ar.Start, uint64(ar.Length())) }
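// Editorial example (not part of the original gVisor source): the
// Activate/Deactivate discipline above in miniature — a CAS fast path that is
// only taken while the count is already non-zero, with a mutex guarding the
// 0<->1 transitions where the expensive resource is created or released. The
// lazyResource type is illustrative; the string pointer stands in for the
// platform.AddressSpace.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type lazyResource struct {
	mu     sync.Mutex
	active int32
	res    *string
}

func (l *lazyResource) acquire() {
	// Fast path: already active, just take another reference.
	for {
		a := atomic.LoadInt32(&l.active)
		if a == 0 {
			break // fall back to the slow path
		}
		if atomic.CompareAndSwapInt32(&l.active, a, a+1) {
			return
		}
	}
	l.mu.Lock()
	defer l.mu.Unlock()
	// Re-check under the lock in case we raced with another acquirer.
	if atomic.LoadInt32(&l.active) > 0 {
		atomic.AddInt32(&l.active, 1)
		return
	}
	s := "expensive resource" // e.g. NewAddressSpace in the code above
	l.res = &s
	atomic.StoreInt32(&l.active, 1)
}

func (l *lazyResource) release() {
	// Fast path: not the last reference.
	for {
		a := atomic.LoadInt32(&l.active)
		if a == 1 {
			break // fall back to the slow path
		}
		if atomic.CompareAndSwapInt32(&l.active, a, a-1) {
			return
		}
	}
	l.mu.Lock()
	defer l.mu.Unlock()
	if atomic.AddInt32(&l.active, -1) > 0 {
		return // raced with another acquirer; still active
	}
	l.res = nil // release the resource
}

func main() {
	var l lazyResource
	l.acquire()
	l.acquire()
	l.release()
	fmt.Println(atomic.LoadInt32(&l.active), l.res != nil) // 1 true
	l.release()
	fmt.Println(atomic.LoadInt32(&l.active), l.res != nil) // 0 false
}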
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package uniqueid defines context.Context keys for obtaining system-wide
// unique identifiers.
package uniqueid

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
)

// contextID is this package's type for context.Context.Value keys.
type contextID int

const (
	// CtxGlobalUniqueID is a Context.Value key for a system-wide
	// unique identifier.
	CtxGlobalUniqueID contextID = iota

	// CtxGlobalUniqueIDProvider is a Context.Value key for a
	// system-wide unique identifier generator.
	CtxGlobalUniqueIDProvider

	// CtxInotifyCookie is a Context.Value key for a unique inotify
	// event cookie.
	CtxInotifyCookie
)

// GlobalFromContext returns a system-wide unique identifier from ctx.
func GlobalFromContext(ctx context.Context) uint64 {
	return ctx.Value(CtxGlobalUniqueID).(uint64)
}

// GlobalProviderFromContext returns a system-wide unique identifier provider
// from ctx.
func GlobalProviderFromContext(ctx context.Context) transport.UniqueIDProvider {
	return ctx.Value(CtxGlobalUniqueIDProvider).(transport.UniqueIDProvider)
}

// InotifyCookie generates a unique inotify event cookie from ctx.
func InotifyCookie(ctx context.Context) uint32 {
	return ctx.Value(CtxInotifyCookie).(uint32)
}
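// Editorial example (not part of the original gVisor source): a toy context
// that satisfies the uniqueid keys. In the sentry the kernel supplies these
// values; here a wrapper around context.Background with an atomic counter
// stands in. The uniqueIDContext type and its counter are illustrative only,
// and this assumes the usual interface-embedding trick for overriding Value.
package main

import (
	"fmt"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
)

type uniqueIDContext struct {
	context.Context
	lastID uint64
}

// Value serves the uniqueid keys and defers everything else to the embedded
// context.
func (c *uniqueIDContext) Value(key interface{}) interface{} {
	switch key {
	case uniqueid.CtxGlobalUniqueID:
		return atomic.AddUint64(&c.lastID, 1)
	case uniqueid.CtxInotifyCookie:
		return uint32(atomic.AddUint64(&c.lastID, 1))
	default:
		return c.Context.Value(key)
	}
}

func main() {
	ctx := &uniqueIDContext{Context: context.Background()}
	fmt.Println(uniqueid.GlobalFromContext(ctx)) // 1
	fmt.Println(uniqueid.InotifyCookie(ctx))     // 2
}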
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ip

import "gvisor.dev/gvisor/pkg/tcpip"

// LINT.IfChange(MultiCounterIPForwardingStats)

// MultiCounterIPForwardingStats holds IP forwarding statistics. Each counter
// may have several versions.
type MultiCounterIPForwardingStats struct {
	// Unrouteable is the number of IP packets received which were dropped
	// because the netstack could not construct a route to their
	// destination.
	Unrouteable tcpip.MultiCounterStat

	// ExhaustedTTL is the number of IP packets received which were dropped
	// because their TTL was exhausted.
	ExhaustedTTL tcpip.MultiCounterStat

	// LinkLocalSource is the number of IP packets which were dropped
	// because they contained a link-local source address.
	LinkLocalSource tcpip.MultiCounterStat

	// LinkLocalDestination is the number of IP packets which were dropped
	// because they contained a link-local destination address.
	LinkLocalDestination tcpip.MultiCounterStat

	// PacketTooBig is the number of IP packets which were dropped because they
	// were too big for the outgoing MTU.
	PacketTooBig tcpip.MultiCounterStat

	// HostUnreachable is the number of IP packets received which could not be
	// successfully forwarded due to an unresolvable next hop.
	HostUnreachable tcpip.MultiCounterStat

	// ExtensionHeaderProblem is the number of IP packets which were dropped
	// because of a problem encountered when processing an IPv6 extension
	// header.
	ExtensionHeaderProblem tcpip.MultiCounterStat

	// Errors is the number of IP packets received which could not be
	// successfully forwarded.
	Errors tcpip.MultiCounterStat
}

// Init sets internal counters to track a and b counters.
func (m *MultiCounterIPForwardingStats) Init(a, b *tcpip.IPForwardingStats) { m.Unrouteable.Init(a.Unrouteable, b.Unrouteable) m.Errors.Init(a.Errors, b.Errors) m.LinkLocalSource.Init(a.LinkLocalSource, b.LinkLocalSource) m.LinkLocalDestination.Init(a.LinkLocalDestination, b.LinkLocalDestination) m.ExtensionHeaderProblem.Init(a.ExtensionHeaderProblem, b.ExtensionHeaderProblem) m.PacketTooBig.Init(a.PacketTooBig, b.PacketTooBig) m.ExhaustedTTL.Init(a.ExhaustedTTL, b.ExhaustedTTL) m.HostUnreachable.Init(a.HostUnreachable, b.HostUnreachable) } // LINT.ThenChange(:MultiCounterIPForwardingStats, ../../../tcpip.go:IPForwardingStats) // LINT.IfChange(MultiCounterIPStats) // MultiCounterIPStats holds IP statistics, each counter may have several // versions. type MultiCounterIPStats struct { // PacketsReceived is the number of IP packets received from the link // layer. PacketsReceived tcpip.MultiCounterStat // ValidPacketsReceived is the number of valid IP packets that reached the IP // layer. ValidPacketsReceived tcpip.MultiCounterStat // DisabledPacketsReceived is the number of IP packets received from // the link layer when the IP layer is disabled. DisabledPacketsReceived tcpip.MultiCounterStat // InvalidDestinationAddressesReceived is the number of IP packets // received with an unknown or invalid destination address. InvalidDestinationAddressesReceived tcpip.MultiCounterStat // InvalidSourceAddressesReceived is the number of IP packets received // with a source address that should never have been received on the // wire. InvalidSourceAddressesReceived tcpip.MultiCounterStat // PacketsDelivered is the number of incoming IP packets successfully // delivered to the transport layer. PacketsDelivered tcpip.MultiCounterStat // PacketsSent is the number of IP packets sent via WritePacket. PacketsSent tcpip.MultiCounterStat // OutgoingPacketErrors is the number of IP packets which failed to // write to a link-layer endpoint. OutgoingPacketErrors tcpip.MultiCounterStat // MalformedPacketsReceived is the number of IP Packets that were // dropped due to the IP packet header failing validation checks. MalformedPacketsReceived tcpip.MultiCounterStat // MalformedFragmentsReceived is the number of IP Fragments that were // dropped due to the fragment failing validation checks. MalformedFragmentsReceived tcpip.MultiCounterStat // IPTablesPreroutingDropped is the number of IP packets dropped in the // Prerouting chain. IPTablesPreroutingDropped tcpip.MultiCounterStat // IPTablesInputDropped is the number of IP packets dropped in the // Input chain. IPTablesInputDropped tcpip.MultiCounterStat // IPTablesForwardDropped is the number of IP packets dropped in the // Forward chain. IPTablesForwardDropped tcpip.MultiCounterStat // IPTablesOutputDropped is the number of IP packets dropped in the // Output chain. IPTablesOutputDropped tcpip.MultiCounterStat // IPTablesPostroutingDropped is the number of IP packets dropped in // the Postrouting chain. IPTablesPostroutingDropped tcpip.MultiCounterStat // TODO(https://gvisor.dev/issues/5529): Move the IPv4-only option // stats out of IPStats. // OptionTimestampReceived is the number of Timestamp options seen. OptionTimestampReceived tcpip.MultiCounterStat // OptionRecordRouteReceived is the number of Record Route options // seen. OptionRecordRouteReceived tcpip.MultiCounterStat // OptionRouterAlertReceived is the number of Router Alert options // seen. OptionRouterAlertReceived tcpip.MultiCounterStat // OptionUnknownReceived is the number of unknown IP options seen. 
OptionUnknownReceived tcpip.MultiCounterStat // Forwarding collects stats related to IP forwarding. Forwarding MultiCounterIPForwardingStats } // Init sets internal counters to track a and b counters. func (m *MultiCounterIPStats) Init(a, b *tcpip.IPStats) { m.PacketsReceived.Init(a.PacketsReceived, b.PacketsReceived) m.ValidPacketsReceived.Init(a.ValidPacketsReceived, b.ValidPacketsReceived) m.DisabledPacketsReceived.Init(a.DisabledPacketsReceived, b.DisabledPacketsReceived) m.InvalidDestinationAddressesReceived.Init(a.InvalidDestinationAddressesReceived, b.InvalidDestinationAddressesReceived) m.InvalidSourceAddressesReceived.Init(a.InvalidSourceAddressesReceived, b.InvalidSourceAddressesReceived) m.PacketsDelivered.Init(a.PacketsDelivered, b.PacketsDelivered) m.PacketsSent.Init(a.PacketsSent, b.PacketsSent) m.OutgoingPacketErrors.Init(a.OutgoingPacketErrors, b.OutgoingPacketErrors) m.MalformedPacketsReceived.Init(a.MalformedPacketsReceived, b.MalformedPacketsReceived) m.MalformedFragmentsReceived.Init(a.MalformedFragmentsReceived, b.MalformedFragmentsReceived) m.IPTablesPreroutingDropped.Init(a.IPTablesPreroutingDropped, b.IPTablesPreroutingDropped) m.IPTablesInputDropped.Init(a.IPTablesInputDropped, b.IPTablesInputDropped) m.IPTablesForwardDropped.Init(a.IPTablesForwardDropped, b.IPTablesForwardDropped) m.IPTablesOutputDropped.Init(a.IPTablesOutputDropped, b.IPTablesOutputDropped) m.IPTablesPostroutingDropped.Init(a.IPTablesPostroutingDropped, b.IPTablesPostroutingDropped) m.OptionTimestampReceived.Init(a.OptionTimestampReceived, b.OptionTimestampReceived) m.OptionRecordRouteReceived.Init(a.OptionRecordRouteReceived, b.OptionRecordRouteReceived) m.OptionRouterAlertReceived.Init(a.OptionRouterAlertReceived, b.OptionRouterAlertReceived) m.OptionUnknownReceived.Init(a.OptionUnknownReceived, b.OptionUnknownReceived) m.Forwarding.Init(&a.Forwarding, &b.Forwarding) } // LINT.ThenChange(:MultiCounterIPStats, ../../../tcpip.go:IPStats)
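// Editorial example (not part of the original gVisor source): what the
// multi-counter pattern above buys. After Init wires a MultiCounterStat to
// two tcpip.StatCounters, a single increment is visible through both, which
// is how the stack keeps aggregate and per-endpoint statistics in sync.
// Increment, IncrementBy, and Value are assumed to be the existing tcpip
// counter methods, as used elsewhere in netstack.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
)

func main() {
	var stackWide, perEndpoint tcpip.StatCounter

	var dropped tcpip.MultiCounterStat
	dropped.Init(&stackWide, &perEndpoint)

	dropped.Increment()
	dropped.IncrementBy(2)

	fmt.Println(stackWide.Value(), perEndpoint.Value()) // 3 3
}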
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/platform"
)

// SignalPanic is used to panic the running threads. It is a signal which
// cannot be used by the application: it must be caught and ignored by the
// runtime (in order to catch possible races).
const SignalPanic = linux.SIGUSR2

// sendExternalSignal is called when an asynchronous signal is sent to the
// sentry ("in sentry context"). On some platforms, it may also be called when
// an asynchronous signal is sent to sandboxed application threads ("in
// application context").
//
// context is used only for debugging to differentiate these cases.
//
// Preconditions: Kernel must have an init process.
func (k *Kernel) sendExternalSignal(info *linux.SignalInfo, context string) {
	switch linux.Signal(info.Signo) {
	case linux.SIGURG:
		// Sent by the Go 1.14+ runtime for asynchronous goroutine preemption.

	case platform.SignalInterrupt:
		// Assume that a call to platform.Context.Interrupt() misfired.

	case SignalPanic:
		// SignalPanic is also specially handled in sentry setup to ensure that
		// it causes a panic even after tasks exit, but SignalPanic may also
		// be sent here if it is received while in app context.
		panic("Signal-induced panic")

	default:
		log.Infof("Received external signal %d in %s context", info.Signo, context)
		if k.globalInit == nil {
			panic(fmt.Sprintf("Received external signal %d before init created", info.Signo))
		}
		k.globalInit.SendSignal(info)
	}
}

// SignalInfoPriv returns a SignalInfo equivalent to Linux's SEND_SIG_PRIV.
func SignalInfoPriv(sig linux.Signal) *linux.SignalInfo {
	return &linux.SignalInfo{
		Signo: int32(sig),
		Code:  linux.SI_KERNEL,
	}
}

// SignalInfoNoInfo returns a SignalInfo equivalent to Linux's SEND_SIG_NOINFO.
func SignalInfoNoInfo(sig linux.Signal, sender, receiver *Task) *linux.SignalInfo {
	info := &linux.SignalInfo{
		Signo: int32(sig),
		Code:  linux.SI_USER,
	}
	info.SetPID(int32(receiver.tg.pidns.IDOfThreadGroup(sender.tg)))
	info.SetUID(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
	return info
}
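// Editorial example (not part of the original gVisor source): the shape of
// the SignalInfo values the helpers above construct. SI_KERNEL marks a
// kernel-generated signal (as in SignalInfoPriv), while SI_USER carries the
// sender's pid/uid (as in SignalInfoNoInfo). SetPID/PID and SetUID/UID are
// assumed to be the existing linux.SignalInfo accessors.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
)

func main() {
	priv := &linux.SignalInfo{Signo: int32(linux.SIGTERM), Code: linux.SI_KERNEL}

	user := &linux.SignalInfo{Signo: int32(linux.SIGUSR1), Code: linux.SI_USER}
	user.SetPID(1234)
	user.SetUID(1000)

	fmt.Println(priv.Signo, priv.Code == linux.SI_KERNEL) // 15 true
	fmt.Println(user.Signo, user.PID(), user.UID())       // 10 1234 1000
}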
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"
	"os"
	"runtime/trace"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/bits"
	"gvisor.dev/gvisor/pkg/errors"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal"
	"gvisor.dev/gvisor/pkg/metric"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/syserror"
)

// SyscallRestartBlock represents the restart block for a syscall restartable
// with a custom function. It encapsulates the state required to restart a
// syscall across a S/R.
type SyscallRestartBlock interface {
	Restart(t *Task) (uintptr, error)
}

// SyscallControl is returned by syscalls to control the behavior of
// Task.doSyscallInvoke.
type SyscallControl struct {
	// next is the state that the task goroutine should switch to. If next is
	// nil, the task goroutine should continue to syscall exit as usual.
	next taskRunState

	// If ignoreReturn is true, Task.doSyscallInvoke should not store any value
	// in the task's syscall return value register.
	ignoreReturn bool
}

var (
	// CtrlDoExit is returned by the implementations of the exit and exit_group
	// syscalls to enter the task exit path directly, skipping syscall exit
	// tracing.
	CtrlDoExit = &SyscallControl{next: (*runExit)(nil), ignoreReturn: true}

	// ctrlStopAndReinvokeSyscall is returned by syscalls using the external
	// feature before syscall execution. This causes Task.doSyscallInvoke
	// to return runSyscallReinvoke, allowing Task.run to check for stops
	// before immediately re-invoking the syscall (skipping the re-checking
	// of seccomp filters and ptrace which would confuse userspace
	// tracing).
	ctrlStopAndReinvokeSyscall = &SyscallControl{next: (*runSyscallReinvoke)(nil), ignoreReturn: true}

	// ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at
	// their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather
	// than tail-calling it, allowing stops to be checked before syscall exit.
	ctrlStopBeforeSyscallExit = &SyscallControl{next: (*runSyscallExit)(nil)}
)

func (t *Task) invokeExternal() {
	t.BeginExternalStop()
	go func() { // S/R-SAFE: External control flow.
		defer t.EndExternalStop()
		t.SyscallTable().External(t.Kernel())
	}()
}

func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) {
	s := t.SyscallTable()

	fe := s.FeatureEnable.Word(sysno)

	var straceContext interface{}
	if bits.IsAnyOn32(fe, StraceEnableBits) {
		straceContext = s.Stracer.SyscallEnter(t, sysno, args, fe)
	}

	if bits.IsOn32(fe, ExternalBeforeEnable) && (s.ExternalFilterBefore == nil || s.ExternalFilterBefore(t, sysno, args)) {
		t.invokeExternal()
		// Ensure we check for stops, then invoke the syscall again.
		ctrl = ctrlStopAndReinvokeSyscall
	} else {
		fn := s.Lookup(sysno)
		var region *trace.Region // Only non-nil if tracing == true.
		if trace.IsEnabled() {
			region = trace.StartRegion(t.traceContext, s.LookupName(sysno))
		}
		if fn != nil {
			// Call our syscall implementation.
			rval, ctrl, err = fn(t, args)
		} else {
			// Use the missing function if not found.
			rval, err = t.SyscallTable().Missing(t, sysno, args)
		}
		if region != nil {
			region.End()
		}
	}

	if bits.IsOn32(fe, ExternalAfterEnable) && (s.ExternalFilterAfter == nil || s.ExternalFilterAfter(t, sysno, args)) {
		t.invokeExternal()
		// Don't reinvoke the syscall.
	}

	if bits.IsAnyOn32(fe, StraceEnableBits) {
		s.Stracer.SyscallExit(straceContext, t, sysno, rval, err)
	}

	return
}

// doSyscall is the entry point for an invocation of a system call specified by
// the current state of t's registers.
//
// The syscall path is very hot; avoid defer.
func (t *Task) doSyscall() taskRunState {
	// Save value of the register which is clobbered in the following
	// t.Arch().SetReturn(-ENOSYS) operation. This is dedicated to arm64.
	//
	// On x86, register rax was shared by syscall number and return
	// value, and at the entry of the syscall handler, the rax was
	// saved to regs.orig_rax which was exposed to userspace.
	// But on arm64, syscall number was passed through X8, and the X0
	// was shared by the first syscall argument and return value. The
	// X0 was saved to regs.orig_x0 which was not exposed to userspace.
	// So we have to do the same operation here to save the X0 value
	// into the task context.
	t.Arch().SyscallSaveOrig()

	sysno := t.Arch().SyscallNo()
	args := t.Arch().SyscallArgs()

	// Tracers expect to see this between when the task traps into the kernel
	// to perform a syscall and when the syscall is actually invoked.
	// This useless-looking temporary is needed because Go.
	tmp := uintptr(unix.ENOSYS)
	t.Arch().SetReturn(-tmp)

	// Check seccomp filters. The nil check is for performance (as seccomp use
	// is rare), not needed for correctness.
	if t.syscallFilters.Load() != nil {
		switch r := t.checkSeccompSyscall(int32(sysno), args, hostarch.Addr(t.Arch().IP())); r {
		case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
			t.Debugf("Syscall %d: denied by seccomp", sysno)
			return (*runSyscallExit)(nil)
		case linux.SECCOMP_RET_ALLOW:
			// ok
		case linux.SECCOMP_RET_KILL_THREAD:
			t.Debugf("Syscall %d: killed by seccomp", sysno)
			t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
			return (*runExit)(nil)
		case linux.SECCOMP_RET_TRACE:
			t.Debugf("Syscall %d: stopping for PTRACE_EVENT_SECCOMP", sysno)
			return (*runSyscallAfterPtraceEventSeccomp)(nil)
		default:
			panic(fmt.Sprintf("Unknown seccomp result %d", r))
		}
	}

	return t.doSyscallEnter(sysno, args)
}

type runSyscallAfterPtraceEventSeccomp struct{}

func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
	if t.killed() {
		// "[S]yscall-exit-stop is not generated prior to death by SIGKILL." -
		// ptrace(2)
		return (*runInterrupt)(nil)
	}
	sysno := t.Arch().SyscallNo()
	// "The tracer can skip the system call by changing the syscall number to
	// -1." - Documentation/prctl/seccomp_filter.txt
	if sysno == ^uintptr(0) {
		return (*runSyscallExit)(nil).execute(t)
	}
	args := t.Arch().SyscallArgs()
	return t.doSyscallEnter(sysno, args)
}

func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState {
	if next, ok := t.ptraceSyscallEnter(); ok {
		return next
	}
	return t.doSyscallInvoke(sysno, args)
}

// +stateify savable
type runSyscallAfterSyscallEnterStop struct{}

func (*runSyscallAfterSyscallEnterStop) execute(t *Task) taskRunState {
	if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
		t.tg.signalHandlers.mu.Lock()
		t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
		t.tg.signalHandlers.mu.Unlock()
	}
	if t.killed() {
		return (*runInterrupt)(nil)
	}
	sysno := t.Arch().SyscallNo()
	if sysno == ^uintptr(0) {
		return (*runSyscallExit)(nil)
	}
	args := t.Arch().SyscallArgs()
	return t.doSyscallInvoke(sysno, args)
}

// +stateify savable
type runSyscallAfterSysemuStop struct{}

func (*runSyscallAfterSysemuStop) execute(t *Task) taskRunState {
	if sig := linux.Signal(t.ptraceCode); sig.IsValid() {
		t.tg.signalHandlers.mu.Lock()
		t.sendSignalLocked(SignalInfoPriv(sig), false /* group */)
		t.tg.signalHandlers.mu.Unlock()
	}
	if t.killed() {
		return (*runInterrupt)(nil)
	}
	return (*runSyscallExit)(nil).execute(t)
}

func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState {
	rval, ctrl, err := t.executeSyscall(sysno, args)

	if ctrl != nil {
		if !ctrl.ignoreReturn {
			t.Arch().SetReturn(rval)
		}
		if ctrl.next != nil {
			return ctrl.next
		}
	} else if err != nil {
		t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno))))
		t.haveSyscallReturn = true
	} else {
		t.Arch().SetReturn(rval)
	}

	return (*runSyscallExit)(nil).execute(t)
}

// +stateify savable
type runSyscallReinvoke struct{}

func (*runSyscallReinvoke) execute(t *Task) taskRunState {
	if t.killed() {
		// It's possible that since the last execution, the task has
		// been forcibly killed. Invoking the system call here could
		// result in an infinite loop if it is again preempted by an
		// external stop and reinvoked.
		return (*runInterrupt)(nil)
	}

	sysno := t.Arch().SyscallNo()
	args := t.Arch().SyscallArgs()
	return t.doSyscallInvoke(sysno, args)
}

// +stateify savable
type runSyscallExit struct{}

func (*runSyscallExit) execute(t *Task) taskRunState {
	t.ptraceSyscallExit()
	return (*runApp)(nil)
}

// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as
// indicated by an execution fault at address addr.
// doVsyscall returns the task's next run state.
func (t *Task) doVsyscall(addr hostarch.Addr, sysno uintptr) taskRunState {
	metric.WeirdnessMetric.Increment("vsyscall_count")

	// Grab the caller up front, to make sure there's a sensible stack.
	caller := t.Arch().Native(uintptr(0))
	if _, err := caller.CopyIn(t, hostarch.Addr(t.Arch().Stack())); err != nil {
		t.Debugf("vsyscall %d: error reading return address from stack: %v", sysno, err)
		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
		return (*runApp)(nil)
	}

	// For _vsyscalls_, there is no need to translate System V calling convention
	// to syscall ABI because they both use RDI, RSI, and RDX for the first three
	// arguments and none of the vsyscalls uses more than two arguments.
	args := t.Arch().SyscallArgs()
	if t.syscallFilters.Load() != nil {
		switch r := t.checkSeccompSyscall(int32(sysno), args, addr); r {
		case linux.SECCOMP_RET_ERRNO, linux.SECCOMP_RET_TRAP:
			t.Debugf("vsyscall %d, caller %x: denied by seccomp", sysno, t.Arch().Value(caller))
			return (*runApp)(nil)
		case linux.SECCOMP_RET_ALLOW:
			// ok
		case linux.SECCOMP_RET_TRACE:
			t.Debugf("vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP", sysno, t.Arch().Value(caller))
			return &runVsyscallAfterPtraceEventSeccomp{addr, sysno, caller}
		case linux.SECCOMP_RET_KILL_THREAD:
			t.Debugf("vsyscall %d: killed by seccomp", sysno)
			t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
			return (*runExit)(nil)
		default:
			panic(fmt.Sprintf("Unknown seccomp result %d", r))
		}
	}

	return t.doVsyscallInvoke(sysno, args, caller)
}

type runVsyscallAfterPtraceEventSeccomp struct {
	addr   hostarch.Addr
	sysno  uintptr
	caller marshal.Marshallable
}

func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
	if t.killed() {
		return (*runInterrupt)(nil)
	}
	sysno := t.Arch().SyscallNo()
	// "... the syscall may not be changed to another system call using the
	// orig_rax register. It may only be changed to -1 order [sic] to skip the
	// currently emulated call. ... The tracer MUST NOT modify rip or rsp." -
	// Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip
	// causes do_exit(SIGSYS), and changing sp is ignored.
	if (sysno != ^uintptr(0) && sysno != r.sysno) || hostarch.Addr(t.Arch().IP()) != r.addr {
		t.PrepareExit(linux.WaitStatusTerminationSignal(linux.SIGSYS))
		return (*runExit)(nil)
	}
	if sysno == ^uintptr(0) {
		return (*runApp)(nil)
	}
	return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
}

func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller marshal.Marshallable) taskRunState {
	rval, ctrl, err := t.executeSyscall(sysno, args)
	if ctrl != nil {
		t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
		// Set the return value. The stack has already been adjusted.
		t.Arch().SetReturn(0)
	} else if err == nil {
		t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller))
		// Set the return value. The stack has already been adjusted.
		t.Arch().SetReturn(uintptr(rval))
	} else {
		t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err)
		if linuxerr.Equals(linuxerr.EFAULT, err) {
			t.forceSignal(linux.SIGSEGV, false /* unconditional */)
			t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
			// A return is not emulated in this case.
return (*runApp)(nil) } t.Arch().SetReturn(uintptr(-ExtractErrno(err, int(sysno)))) } t.Arch().SetIP(t.Arch().Value(caller)) t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width())) return (*runApp)(nil) } // ExtractErrno extracts an integer error number from the error. // The syscall number is purely for context in the error case. Use -1 if // syscall number is unknown. func ExtractErrno(err error, sysno int) int { switch err := err.(type) { case nil: return 0 case unix.Errno: return int(err) case *errors.Error: return int(err.Errno()) case syserror.SyscallRestartErrno: return int(err) case *memmap.BusError: // Bus errors may generate SIGBUS, but for syscalls they still // return EFAULT. See case in task_run.go where the fault is // handled (and the SIGBUS is delivered). return int(unix.EFAULT) case *os.PathError: return ExtractErrno(err.Err, sysno) case *os.LinkError: return ExtractErrno(err.Err, sysno) case *os.SyscallError: return ExtractErrno(err.Err, sysno) default: if errno, ok := syserror.TranslateError(err); ok { return int(errno) } } panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err)) }
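// Editorial example (not part of the original gVisor source): the unwrapping
// strategy ExtractErrno applies to plain Go errors, in isolation — peel the
// os.* wrapper types until an errno surfaces. extractErrnoModel is an
// illustrative reduction, not the sentry function; the real one also handles
// sentry-internal error types and panics on unknown errors.
package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func extractErrnoModel(err error) int {
	switch err := err.(type) {
	case nil:
		return 0
	case unix.Errno:
		return int(err)
	case *os.PathError:
		return extractErrnoModel(err.Err)
	case *os.LinkError:
		return extractErrnoModel(err.Err)
	case *os.SyscallError:
		return extractErrnoModel(err.Err)
	default:
		return int(unix.EIO) // arbitrary fallback for this sketch
	}
}

func main() {
	err := &os.PathError{Op: "open", Path: "/nope", Err: unix.ENOENT}
	fmt.Println(extractErrnoModel(err) == int(unix.ENOENT)) // true
}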
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package header

import (
	"encoding/binary"

	"gvisor.dev/gvisor/pkg/tcpip"
)

const (
	dstMAC  = 0
	srcMAC  = 6
	ethType = 12
)

// EthernetFields contains the fields of an ethernet frame header. It is used
// to describe the fields of a frame that needs to be encoded.
type EthernetFields struct {
	// SrcAddr is the "MAC source" field of an ethernet frame header.
	SrcAddr tcpip.LinkAddress

	// DstAddr is the "MAC destination" field of an ethernet frame header.
	DstAddr tcpip.LinkAddress

	// Type is the "ethertype" field of an ethernet frame header.
	Type tcpip.NetworkProtocolNumber
}

// Ethernet represents an ethernet frame header stored in a byte array.
type Ethernet []byte

const (
	// EthernetMinimumSize is the minimum size of a valid ethernet frame.
	EthernetMinimumSize = 14

	// EthernetAddressSize is the size, in bytes, of an ethernet address.
	EthernetAddressSize = 6

	// unspecifiedEthernetAddress is the unspecified ethernet address
	// (all bits set to 0).
	unspecifiedEthernetAddress = tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00")

	// EthernetBroadcastAddress is an ethernet address that addresses every node
	// on a local link.
	EthernetBroadcastAddress = tcpip.LinkAddress("\xff\xff\xff\xff\xff\xff")

	// unicastMulticastFlagMask is the mask of the least significant bit in
	// the first octet (in network byte order) of an ethernet address that
	// determines whether the address is unicast or multicast: if the masked
	// bit is 1, the address is multicast; otherwise it is unicast.
	//
	// See the IEEE Std 802-2001 document for more details. Specifically,
	// section 9.2.1 of http://ieee802.org/secmail/pdfocSP2xXA6d.pdf:
	// "A 48-bit universal address consists of two parts. The first 24 bits
	// correspond to the OUI as assigned by the IEEE, except that the
	// assignee may set the LSB of the first octet to 1 for group addresses
	// or set it to 0 for individual addresses."
	unicastMulticastFlagMask = 1

	// unicastMulticastFlagByteIdx is the byte that holds the
	// unicast/multicast flag. See unicastMulticastFlagMask.
	unicastMulticastFlagByteIdx = 0
)

const (
	// EthernetProtocolAll is a catch-all for all protocols carried inside
	// an ethernet frame. It is mainly used to create packet sockets that
	// capture all traffic.
EthernetProtocolAll tcpip.NetworkProtocolNumber = 0x0003 // EthernetProtocolPUP is the PARC Universal Packet protocol ethertype. EthernetProtocolPUP tcpip.NetworkProtocolNumber = 0x0200 ) // Ethertypes holds the protocol numbers describing the payload of an ethernet // frame. These types aren't necessarily supported by netstack, but can be used // to catch all traffic of a type via packet endpoints. var Ethertypes = []tcpip.NetworkProtocolNumber{ EthernetProtocolAll, EthernetProtocolPUP, } // SourceAddress returns the "MAC source" field of the ethernet frame header. func (b Ethernet) SourceAddress() tcpip.LinkAddress { return tcpip.LinkAddress(b[srcMAC:][:EthernetAddressSize]) } // DestinationAddress returns the "MAC destination" field of the ethernet frame // header. func (b Ethernet) DestinationAddress() tcpip.LinkAddress { return tcpip.LinkAddress(b[dstMAC:][:EthernetAddressSize]) } // Type returns the "ethertype" field of the ethernet frame header. func (b Ethernet) Type() tcpip.NetworkProtocolNumber { return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(b[ethType:])) } // Encode encodes all the fields of the ethernet frame header. func (b Ethernet) Encode(e *EthernetFields) { binary.BigEndian.PutUint16(b[ethType:], uint16(e.Type)) copy(b[srcMAC:][:EthernetAddressSize], e.SrcAddr) copy(b[dstMAC:][:EthernetAddressSize], e.DstAddr) } // IsMulticastEthernetAddress returns true if the address is a multicast // ethernet address. func IsMulticastEthernetAddress(addr tcpip.LinkAddress) bool { if len(addr) != EthernetAddressSize { return false } return addr[unicastMulticastFlagByteIdx]&unicastMulticastFlagMask != 0 } // IsValidUnicastEthernetAddress returns true if the address is a unicast // ethernet address. func IsValidUnicastEthernetAddress(addr tcpip.LinkAddress) bool { if len(addr) != EthernetAddressSize { return false } if addr == unspecifiedEthernetAddress { return false } if addr[unicastMulticastFlagByteIdx]&unicastMulticastFlagMask != 0 { return false } return true } // EthernetAddressFromMulticastIPv4Address returns a multicast Ethernet address // for a multicast IPv4 address. // // addr MUST be a multicast IPv4 address. func EthernetAddressFromMulticastIPv4Address(addr tcpip.Address) tcpip.LinkAddress { var linkAddrBytes [EthernetAddressSize]byte // RFC 1112 Host Extensions for IP Multicasting // // 6.4. Extensions to an Ethernet Local Network Module: // // An IP host group address is mapped to an Ethernet multicast // address by placing the low-order 23-bits of the IP address // into the low-order 23 bits of the Ethernet multicast address // 01-00-5E-00-00-00 (hex). linkAddrBytes[0] = 0x1 linkAddrBytes[2] = 0x5e linkAddrBytes[3] = addr[1] & 0x7F copy(linkAddrBytes[4:], addr[IPv4AddressSize-2:]) return tcpip.LinkAddress(linkAddrBytes[:]) } // EthernetAddressFromMulticastIPv6Address returns a multicast Ethernet address // for a multicast IPv6 address. // // addr MUST be a multicast IPv6 address. func EthernetAddressFromMulticastIPv6Address(addr tcpip.Address) tcpip.LinkAddress { // RFC 2464 Transmission of IPv6 Packets over Ethernet Networks // // 7. Address Mapping -- Multicast // // An IPv6 packet with a multicast destination address DST, // consisting of the sixteen octets DST[1] through DST[16], is // transmitted to the Ethernet multicast address whose first // two octets are the value 3333 hexadecimal and whose last // four octets are the last four octets of DST.
linkAddrBytes := []byte(addr[IPv6AddressSize-EthernetAddressSize:]) linkAddrBytes[0] = 0x33 linkAddrBytes[1] = 0x33 return tcpip.LinkAddress(linkAddrBytes[:]) }
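Both mapping functions above are fixed-prefix bit transplants: the low-order 23 bits of the IPv4 group address go into 01-00-5e-00-00-00, and the last four octets of the IPv6 address follow 33-33. A self-contained sketch of the same arithmetic with standard-library types (the helper names are illustrative, not netstack API):

package main

import (
	"fmt"
	"net"
)

// multicastMACFromIPv4 mirrors the RFC 1112 mapping: the low-order 23
// bits of the group address land in the fixed prefix 01-00-5e-00-00-00.
func multicastMACFromIPv4(group net.IP) net.HardwareAddr {
	ip4 := group.To4()
	return net.HardwareAddr{0x01, 0x00, 0x5e, ip4[1] & 0x7f, ip4[2], ip4[3]}
}

// multicastMACFromIPv6 mirrors the RFC 2464 mapping: 33-33 followed by
// the last four octets of the IPv6 destination address.
func multicastMACFromIPv6(group net.IP) net.HardwareAddr {
	ip16 := group.To16()
	return net.HardwareAddr{0x33, 0x33, ip16[12], ip16[13], ip16[14], ip16[15]}
}

func main() {
	fmt.Println(multicastMACFromIPv4(net.ParseIP("224.0.0.251"))) // 01:00:5e:00:00:fb (mDNS)
	fmt.Println(multicastMACFromIPv6(net.ParseIP("ff02::fb")))    // 33:33:00:00:00:fb (mDNS)
}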
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" ) const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX // Chmod implements Linux syscall chmod(2). func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() mode := args[1].ModeT() return 0, nil, fchmodat(t, linux.AT_FDCWD, pathAddr, mode) } // Fchmodat implements Linux syscall fchmodat(2).
func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() mode := args[2].ModeT() return 0, nil, fchmodat(t, dirfd, pathAddr, mode) } func fchmodat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint) error { path, err := copyInPath(t, pathAddr) if err != nil { return err } return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_MODE, Mode: uint16(mode & chmodMask), }, }) } // Fchmod implements Linux syscall fchmod(2). func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() mode := args[1].ModeT() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) return 0, nil, file.SetStat(t, vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_MODE, Mode: uint16(mode & chmodMask), }, }) } // Chown implements Linux syscall chown(2). func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() owner := args[1].Int() group := args[2].Int() return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, 0 /* flags */) } // Lchown implements Linux syscall lchown(2). func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() owner := args[1].Int() group := args[2].Int() return 0, nil, fchownat(t, linux.AT_FDCWD, pathAddr, owner, group, linux.AT_SYMLINK_NOFOLLOW) } // Fchownat implements Linux syscall fchownat(2). func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() owner := args[2].Int() group := args[3].Int() flags := args[4].Int() return 0, nil, fchownat(t, dirfd, pathAddr, owner, group, flags) } func fchownat(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, owner, group, flags int32) error { if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { return linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) if err != nil { return err } var opts vfs.SetStatOptions if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { return err } return setstatat(t, dirfd, path, shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) } func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error { userns := t.UserNamespace() if owner != -1 { kuid := userns.MapToKUID(auth.UID(owner)) if !kuid.Ok() { return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_UID opts.Stat.UID = uint32(kuid) } if group != -1 { kgid := userns.MapToKGID(auth.GID(group)) if !kgid.Ok() { return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_GID opts.Stat.GID = uint32(kgid) } return nil } // Fchown implements Linux syscall fchown(2). func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() owner := args[1].Int() group := args[2].Int() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) var opts vfs.SetStatOptions if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil { return 0, nil, err } return 0, nil, file.SetStat(t, opts) } // Truncate implements Linux syscall truncate(2). 
func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].Int64() if length < 0 { return 0, nil, linuxerr.EINVAL } path, err := copyInPath(t, addr) if err != nil { return 0, nil, err } err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_SIZE, Size: uint64(length), }, NeedWritePerm: true, }) return 0, nil, handleSetSizeError(t, err) } // Ftruncate implements Linux syscall ftruncate(2). func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() length := args[1].Int64() if length < 0 { return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if !file.IsWritable() { return 0, nil, linuxerr.EINVAL } err := file.SetStat(t, vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_SIZE, Size: uint64(length), }, }) return 0, nil, handleSetSizeError(t, err) } // Fallocate implements linux system call fallocate(2). func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() mode := args[1].Uint64() offset := args[2].Int64() length := args[3].Int64() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if !file.IsWritable() { return 0, nil, linuxerr.EBADF } if mode != 0 { return 0, nil, linuxerr.ENOTSUP } if offset < 0 || length <= 0 { return 0, nil, linuxerr.EINVAL } size := offset + length if size < 0 { return 0, nil, linuxerr.EFBIG } limit := limits.FromContext(t).Get(limits.FileSize).Cur if uint64(size) >= limit { t.SendSignal(&linux.SignalInfo{ Signo: int32(linux.SIGXFSZ), Code: linux.SI_USER, }) return 0, nil, linuxerr.EFBIG } return 0, nil, file.Allocate(t, mode, uint64(offset), uint64(length)) } // Utime implements Linux syscall utime(2). func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() timesAddr := args[1].Pointer() path, err := copyInPath(t, pathAddr) if err != nil { return 0, nil, err } opts := vfs.SetStatOptions{ Stat: linux.Statx{ Mask: linux.STATX_ATIME | linux.STATX_MTIME, }, } if timesAddr == 0 { opts.Stat.Atime.Nsec = linux.UTIME_NOW opts.Stat.Mtime.Nsec = linux.UTIME_NOW } else { var times linux.Utime if _, err := times.CopyIn(t, timesAddr); err != nil { return 0, nil, err } opts.Stat.Atime.Sec = times.Actime opts.Stat.Mtime.Sec = times.Modtime } return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts) } // Utimes implements Linux syscall utimes(2). func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() timesAddr := args[1].Pointer() path, err := copyInPath(t, pathAddr) if err != nil { return 0, nil, err } var opts vfs.SetStatOptions if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil { return 0, nil, err } return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts) } // Futimesat implements Linux syscall futimesat(2). func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() timesAddr := args[2].Pointer() // "If filename is NULL and dfd refers to an open file, then operate on the // file. 
Otherwise look up filename, possibly using dfd as a starting // point." - fs/utimes.c var path fspath.Path shouldAllowEmptyPath := allowEmptyPath if dirfd == linux.AT_FDCWD || pathAddr != 0 { var err error path, err = copyInPath(t, pathAddr) if err != nil { return 0, nil, err } shouldAllowEmptyPath = disallowEmptyPath } var opts vfs.SetStatOptions if err := populateSetStatOptionsForUtimes(t, timesAddr, &opts); err != nil { return 0, nil, err } return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, followFinalSymlink, &opts) } func populateSetStatOptionsForUtimes(t *kernel.Task, timesAddr hostarch.Addr, opts *vfs.SetStatOptions) error { if timesAddr == 0 { opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME opts.Stat.Atime.Nsec = linux.UTIME_NOW opts.Stat.Mtime.Nsec = linux.UTIME_NOW return nil } var times [2]linux.Timeval if _, err := linux.CopyTimevalSliceIn(t, timesAddr, times[:]); err != nil { return err } if times[0].Usec < 0 || times[0].Usec > 999999 || times[1].Usec < 0 || times[1].Usec > 999999 { return linuxerr.EINVAL } opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME opts.Stat.Atime = linux.StatxTimestamp{ Sec: times[0].Sec, Nsec: uint32(times[0].Usec * 1000), } opts.Stat.Mtime = linux.StatxTimestamp{ Sec: times[1].Sec, Nsec: uint32(times[1].Usec * 1000), } return nil } // Utimensat implements Linux syscall utimensat(2). func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() timesAddr := args[2].Pointer() flags := args[3].Int() // Linux requires that the UTIME_OMIT check occur before checking path or // flags. var opts vfs.SetStatOptions if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil { return 0, nil, err } if opts.Stat.Mask == 0 { return 0, nil, nil } if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 { return 0, nil, linuxerr.EINVAL } // "If filename is NULL and dfd refers to an open file, then operate on the // file. Otherwise look up filename, possibly using dfd as a starting // point." 
- fs/utimes.c var path fspath.Path shouldAllowEmptyPath := allowEmptyPath if dirfd == linux.AT_FDCWD || pathAddr != 0 { var err error path, err = copyInPath(t, pathAddr) if err != nil { return 0, nil, err } shouldAllowEmptyPath = disallowEmptyPath } return 0, nil, setstatat(t, dirfd, path, shouldAllowEmptyPath, shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0), &opts) } func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr hostarch.Addr, opts *vfs.SetStatOptions) error { if timesAddr == 0 { opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME opts.Stat.Atime.Nsec = linux.UTIME_NOW opts.Stat.Mtime.Nsec = linux.UTIME_NOW return nil } var times [2]linux.Timespec if _, err := linux.CopyTimespecSliceIn(t, timesAddr, times[:]); err != nil { return err } if times[0].Nsec != linux.UTIME_OMIT { if times[0].Nsec != linux.UTIME_NOW && (times[0].Nsec < 0 || times[0].Nsec > 999999999) { return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_ATIME opts.Stat.Atime = linux.StatxTimestamp{ Sec: times[0].Sec, Nsec: uint32(times[0].Nsec), } } if times[1].Nsec != linux.UTIME_OMIT { if times[1].Nsec != linux.UTIME_NOW && (times[1].Nsec < 0 || times[1].Nsec > 999999999) { return linuxerr.EINVAL } opts.Stat.Mask |= linux.STATX_MTIME opts.Stat.Mtime = linux.StatxTimestamp{ Sec: times[1].Sec, Nsec: uint32(times[1].Nsec), } } return nil } func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error { root := t.FSContext().RootDirectoryVFS2() defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && !bool(shouldAllowEmptyPath) { return syserror.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() defer start.DecRef(t) } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { return linuxerr.EBADF } if !path.HasComponents() { // Use FileDescription.SetStat() instead of // VirtualFilesystem.SetStatAt(), since the former may be able // to use opened file state to expedite the SetStat. err := dirfile.SetStat(t, *opts) dirfile.DecRef(t) return err } start = dirfile.VirtualDentry() start.IncRef() defer start.DecRef(t) dirfile.DecRef(t) } } return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{ Root: root, Start: start, Path: path, FollowFinalSymlink: bool(shouldFollowFinalSymlink), }, opts) } func handleSetSizeError(t *kernel.Task, err error) error { if err == syserror.ErrExceedsFileSizeLimit { // Convert error to EFBIG and send a SIGXFSZ per setrlimit(2). t.SendSignal(kernel.SignalInfoNoInfo(linux.SIGXFSZ, t, t)) return linuxerr.EFBIG } return err }
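One subtlety in populateSetStatOptionsForUtimens above: UTIME_NOW and UTIME_OMIT are themselves out-of-range nanosecond values (Linux defines them as (1<<30)-1 and (1<<30)-2), so the special-value checks must run before the [0, 999999999] range test. A minimal sketch of just that check, with the constants redefined locally to stay self-contained:

package main

import "fmt"

// These mirror Linux's UTIME_NOW ((1<<30)-1) and UTIME_OMIT ((1<<30)-2);
// redefined here only to keep the sketch self-contained.
const (
	utimeNow  int64 = (1 << 30) - 1
	utimeOmit int64 = (1 << 30) - 2
)

// nsecValid reproduces the per-timestamp test above: the special values
// are accepted first, and everything else must be a real nanosecond count.
func nsecValid(nsec int64) bool {
	if nsec == utimeOmit || nsec == utimeNow {
		return true
	}
	return nsec >= 0 && nsec <= 999999999
}

func main() {
	fmt.Println(nsecValid(utimeNow))   // true: "set to current time"
	fmt.Println(nsecValid(utimeOmit))  // true: "leave unchanged"
	fmt.Println(nsecValid(999999999))  // true: maximum valid nanoseconds
	fmt.Println(nsecValid(1000000000)) // false: would return EINVAL
}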
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) // ptraceOptions are the subset of options controlling a task's ptrace behavior // that are set by ptrace(PTRACE_SETOPTIONS). // // +stateify savable type ptraceOptions struct { // ExitKill is true if the tracee should be sent SIGKILL when the tracer // exits. ExitKill bool // If SysGood is true, set bit 7 in the signal number for // syscall-entry-stop and syscall-exit-stop traps delivered to this task's // tracer.
SysGood bool // TraceClone is true if the tracer wants to receive PTRACE_EVENT_CLONE // events. TraceClone bool // TraceExec is true if the tracer wants to receive PTRACE_EVENT_EXEC // events. TraceExec bool // TraceExit is true if the tracer wants to receive PTRACE_EVENT_EXIT // events. TraceExit bool // TraceFork is true if the tracer wants to receive PTRACE_EVENT_FORK // events. TraceFork bool // TraceSeccomp is true if the tracer wants to receive PTRACE_EVENT_SECCOMP // events. TraceSeccomp bool // TraceVfork is true if the tracer wants to receive PTRACE_EVENT_VFORK // events. TraceVfork bool // TraceVforkDone is true if the tracer wants to receive // PTRACE_EVENT_VFORK_DONE events. TraceVforkDone bool } // ptraceSyscallMode controls the behavior of a ptraced task at syscall entry // and exit. type ptraceSyscallMode int const ( // ptraceSyscallNone indicates that the task has never ptrace-stopped, or // that it was resumed from its last ptrace-stop by PTRACE_CONT or // PTRACE_DETACH. The task's syscalls will not be intercepted. ptraceSyscallNone ptraceSyscallMode = iota // ptraceSyscallIntercept indicates that the task was resumed from its last // ptrace-stop by PTRACE_SYSCALL. The next time the task enters or exits a // syscall, a ptrace-stop will occur. ptraceSyscallIntercept // ptraceSyscallEmu indicates that the task was resumed from its last // ptrace-stop by PTRACE_SYSEMU or PTRACE_SYSEMU_SINGLESTEP. The next time // the task enters a syscall, the syscall will be skipped, and a // ptrace-stop will occur. ptraceSyscallEmu ) // CanTrace checks that t is permitted to access target's state, as defined by // ptrace(2), subsection "Ptrace access mode checking". If attach is true, it // checks for access mode PTRACE_MODE_ATTACH; otherwise, it checks for access // mode PTRACE_MODE_READ. // // In Linux, ptrace access restrictions may be configured by LSMs. While we do // not support LSMs, we do add additional restrictions based on the commoncap // and YAMA LSMs. // // TODO(gvisor.dev/issue/212): The result of CanTrace is immediately stale (e.g., a // racing setuid(2) may change traceability). This may pose a risk when a task // changes from traceable to not traceable. This is only problematic across // execve, where privileges may increase. // // We currently do not implement privileged executables (set-user/group-ID bits // and file capabilities), so that case is not reachable. func (t *Task) CanTrace(target *Task, attach bool) bool { // "If the calling thread and the target thread are in the same thread // group, access is always allowed." - ptrace(2) // // Note: Strictly speaking, prior to 73af963f9f30 ("__ptrace_may_access() // should not deny sub-threads", first released in Linux 3.12), the rule // only applies if t and target are the same task. But, as that commit // message puts it, "[any] security check is pointless when the tasks share // the same ->mm." if t.tg == target.tg { return true } if !t.canTraceStandard(target, attach) { return false } // YAMA only supported for vfs2. if !VFS2Enabled { return true } if atomic.LoadInt32(&t.k.YAMAPtraceScope) == linux.YAMA_SCOPE_RELATIONAL { t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if !t.canTraceYAMALocked(target) { return false } } return true } // canTraceLocked is the same as CanTrace, except the caller must already hold // the TaskSet mutex (for reading or writing). 
func (t *Task) canTraceLocked(target *Task, attach bool) bool { if t.tg == target.tg { return true } if !t.canTraceStandard(target, attach) { return false } // YAMA only supported for vfs2. if !VFS2Enabled { return true } if atomic.LoadInt32(&t.k.YAMAPtraceScope) == linux.YAMA_SCOPE_RELATIONAL { if !t.canTraceYAMALocked(target) { return false } } return true } // canTraceStandard performs standard ptrace access checks as defined by // kernel/ptrace.c:__ptrace_may_access as well as the commoncap LSM // implementation of the security_ptrace_access_check() interface, which is // always invoked. func (t *Task) canTraceStandard(target *Task, attach bool) bool { // """ // TODO(gvisor.dev/issue/260): 1. If the access mode specifies // PTRACE_MODE_FSCREDS (ED: snipped, doesn't exist until Linux 4.5). // // Otherwise, the access mode specifies PTRACE_MODE_REALCREDS, so use the // caller's real UID and GID for the checks in the next step. (Most APIs // that check the caller's UID and GID use the effective IDs. For // historical reasons, the PTRACE_MODE_REALCREDS check uses the real IDs // instead.) // // 2. Deny access if neither of the following is true: // // - The real, effective, and saved-set user IDs of the target match the // caller's user ID, *and* the real, effective, and saved-set group IDs of // the target match the caller's group ID. // // - The caller has the CAP_SYS_PTRACE capability in the user namespace of // the target. // // 3. Deny access if the target process "dumpable" attribute has a value // other than 1 (SUID_DUMP_USER; see the discussion of PR_SET_DUMPABLE in // prctl(2)), and the caller does not have the CAP_SYS_PTRACE capability in // the user namespace of the target process. // // 4. The commoncap LSM performs the following steps: // // a) If the access mode includes PTRACE_MODE_FSCREDS, then use the // caller's effective capability set; otherwise (the access mode specifies // PTRACE_MODE_REALCREDS, so) use the caller's permitted capability set. // // b) Deny access if neither of the following is true: // // - The caller and the target process are in the same user namespace, and // the caller's capabilities are a proper superset of the target process's // permitted capabilities. // // - The caller has the CAP_SYS_PTRACE capability in the target process's // user namespace. // // Note that the commoncap LSM does not distinguish between // PTRACE_MODE_READ and PTRACE_MODE_ATTACH. (ED: From earlier in this // section: "the commoncap LSM ... is always invoked".) // """ callerCreds := t.Credentials() targetCreds := target.Credentials() if callerCreds.HasCapabilityIn(linux.CAP_SYS_PTRACE, targetCreds.UserNamespace) { return true } if cuid := callerCreds.RealKUID; cuid != targetCreds.RealKUID || cuid != targetCreds.EffectiveKUID || cuid != targetCreds.SavedKUID { return false } if cgid := callerCreds.RealKGID; cgid != targetCreds.RealKGID || cgid != targetCreds.EffectiveKGID || cgid != targetCreds.SavedKGID { return false } var targetMM *mm.MemoryManager target.WithMuLocked(func(t *Task) { targetMM = t.MemoryManager() }) if targetMM != nil && targetMM.Dumpability() != mm.UserDumpable { return false } if callerCreds.UserNamespace != targetCreds.UserNamespace { return false } if targetCreds.PermittedCaps&^callerCreds.PermittedCaps != 0 { return false } return true } // canTraceYAMALocked performs ptrace access checks as defined by the YAMA LSM // implementation of the security_ptrace_access_check() interface, with YAMA // configured to mode 1. 
This is a common default among various Linux // distributions. // // It only permits the tracer to proceed if one of the following conditions is // met: // // a) The tracer is already attached to the tracee. // // b) The target is a descendant of the tracer. // // c) The target has explicitly given permission to the tracer through the // PR_SET_PTRACER prctl. // // d) The tracer has CAP_SYS_PTRACE. // // See security/yama/yama_lsm.c:yama_ptrace_access_check. // // Precondition: the TaskSet mutex must be locked (for reading or writing). func (t *Task) canTraceYAMALocked(target *Task) bool { if tracer := target.Tracer(); tracer != nil { if tracer.tg == t.tg { return true } } if target.isYAMADescendantOfLocked(t) { return true } if target.hasYAMAExceptionForLocked(t) { return true } if t.HasCapabilityIn(linux.CAP_SYS_PTRACE, target.UserNamespace()) { return true } return false } // Determines whether t is considered a descendant of ancestor for the purposes // of YAMA permissions (specifically, whether t's thread group is descended from // ancestor's). // // Precondition: the TaskSet mutex must be locked (for reading or writing). func (t *Task) isYAMADescendantOfLocked(ancestor *Task) bool { walker := t for walker != nil { if walker.tg.leader == ancestor.tg.leader { return true } walker = walker.parent } return false } // Precondition: the TaskSet mutex must be locked (for reading or writing). func (t *Task) hasYAMAExceptionForLocked(tracer *Task) bool { allowed, ok := t.k.ptraceExceptions[t.tg.leader] if !ok { return false } return allowed == nil || tracer.isYAMADescendantOfLocked(allowed) } // ClearYAMAException removes any YAMA exception with t as the tracee. func (t *Task) ClearYAMAException() { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() tracee := t.tg.leader delete(t.k.ptraceExceptions, tracee) } // SetYAMAException creates a YAMA exception allowing all descendants of tracer // to trace t. If tracer is nil, then any task is allowed to trace t. // // If there was an existing exception, it is overwritten with the new one. func (t *Task) SetYAMAException(tracer *Task) { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() tracee := t.tg.leader tracee.ptraceYAMAExceptionAdded = true if tracer != nil { tracer.ptraceYAMAExceptionAdded = true } t.k.ptraceExceptions[tracee] = tracer } // Tracer returns t's ptrace Tracer. func (t *Task) Tracer() *Task { return t.ptraceTracer.Load().(*Task) } // hasTracer returns true if t has a ptrace tracer attached. func (t *Task) hasTracer() bool { // This isn't just inlined into callers so that if Task.Tracer() turns out // to be too expensive because of e.g. interface conversion, we can switch // to having a separate atomic flag more easily. return t.Tracer() != nil } // ptraceStop is a TaskStop placed on tasks in a ptrace-stop. // // +stateify savable type ptraceStop struct { // If frozen is true, the stopped task's tracer is currently operating on // it, so Task.Kill should not remove the stop. frozen bool // If listen is true, the stopped task's tracer invoked PTRACE_LISTEN, so // ptraceFreeze should fail. listen bool } // Killable implements TaskStop.Killable. func (s *ptraceStop) Killable() bool { return !s.frozen } // beginPtraceStopLocked initiates an unfrozen ptrace-stop on t. If t has been // killed, the stop is skipped, and beginPtraceStopLocked returns false. // // beginPtraceStopLocked does not signal t's tracer or wake it if it is // waiting. // // Preconditions: // * The TaskSet mutex must be locked. 
// * The caller must be running on the task goroutine. func (t *Task) beginPtraceStopLocked() bool { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() // This is analogous to Linux's kernel/signal.c:ptrace_stop() => ... => // kernel/sched/core.c:__schedule() => signal_pending_state() check, which // is what prevents tasks from entering ptrace-stops after being killed. // Note that if t was SIGKILLed and beginPtraceStopLocked is being called // for PTRACE_EVENT_EXIT, the task will have dequeued the signal before // entering the exit path, so t.killedLocked() will no longer return true. // This is consistent with Linux: "Bugs: ... A SIGKILL signal may still // cause a PTRACE_EVENT_EXIT stop before actual signal death. This may be // changed in the future; SIGKILL is meant to always immediately kill tasks // even under ptrace. Last confirmed on Linux 3.13." - ptrace(2) if t.killedLocked() { return false } t.beginInternalStopLocked(&ptraceStop{}) return true } // Preconditions: The TaskSet mutex must be locked. func (t *Task) ptraceTrapLocked(code int32) { // This is unconditional in ptrace_stop(). t.tg.signalHandlers.mu.Lock() t.trapStopPending = false t.tg.signalHandlers.mu.Unlock() t.ptraceCode = code t.ptraceSiginfo = &linux.SignalInfo{ Signo: int32(linux.SIGTRAP), Code: code, } t.ptraceSiginfo.SetPID(int32(t.tg.pidns.tids[t])) t.ptraceSiginfo.SetUID(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow())) if t.beginPtraceStopLocked() { tracer := t.Tracer() tracer.signalStop(t, linux.CLD_TRAPPED, int32(linux.SIGTRAP)) tracer.tg.eventQueue.Notify(EventTraceeStop) } } // ptraceFreeze checks if t is in a ptraceStop. If so, it freezes the // ptraceStop, temporarily preventing it from being removed by a concurrent // Task.Kill, and returns true. Otherwise it returns false. // // Preconditions: // * The TaskSet mutex must be locked. // * The caller must be running on the task goroutine of t's tracer. func (t *Task) ptraceFreeze() bool { t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() if t.stop == nil { return false } s, ok := t.stop.(*ptraceStop) if !ok { return false } if s.listen { return false } s.frozen = true return true } // ptraceUnfreeze ends the effect of a previous successful call to // ptraceFreeze. // // Preconditions: t must be in a frozen ptraceStop. func (t *Task) ptraceUnfreeze() { // t.tg.signalHandlers is stable because t is in a frozen ptrace-stop, // preventing its thread group from completing execve. t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.ptraceUnfreezeLocked() } // Preconditions: // * t must be in a frozen ptraceStop. // * t's signal mutex must be locked. func (t *Task) ptraceUnfreezeLocked() { // Do this even if the task has been killed to ensure a panic if t.stop is // nil or not a ptraceStop. t.stop.(*ptraceStop).frozen = false if t.killedLocked() { t.endInternalStopLocked() } } // ptraceUnstop implements ptrace request PTRACE_CONT, PTRACE_SYSCALL, // PTRACE_SINGLESTEP, PTRACE_SYSEMU, or PTRACE_SYSEMU_SINGLESTEP depending on // mode and singlestep. // // Preconditions: t must be in a frozen ptrace stop. // // Postconditions: If ptraceUnstop returns nil, t will no longer be in a ptrace // stop.
func (t *Task) ptraceUnstop(mode ptraceSyscallMode, singlestep bool, sig linux.Signal) error { if sig != 0 && !sig.IsValid() { return syserror.EIO } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() t.ptraceCode = int32(sig) t.ptraceSyscallMode = mode t.ptraceSinglestep = singlestep t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.endInternalStopLocked() return nil } func (t *Task) ptraceTraceme() error { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if t.hasTracer() { return linuxerr.EPERM } if t.parent == nil { // In Linux, only init can not have a parent, and init is assumed never // to invoke PTRACE_TRACEME. In the sentry, TGID 1 is an arbitrary user // application that may invoke PTRACE_TRACEME; having no parent can // also occur if all tasks in the parent thread group have exited, and // failed to find a living thread group to reparent to. The former case // is treated as if TGID 1 has an exited parent in an invisible // ancestor PID namespace that is an owner of the root user namespace // (and consequently has CAP_SYS_PTRACE), and the latter case is a // special form of the exited parent case below. In either case, // returning nil here is correct. return nil } if !t.parent.canTraceLocked(t, true) { return linuxerr.EPERM } if t.parent.exitState != TaskExitNone { // Fail silently, as if we were successfully attached but then // immediately detached. This is consistent with Linux. return nil } t.ptraceTracer.Store(t.parent) t.parent.ptraceTracees[t] = struct{}{} return nil } // ptraceAttach implements ptrace(PTRACE_ATTACH, target) if seize is false, and // ptrace(PTRACE_SEIZE, target, 0, opts) if seize is true. t is the caller. func (t *Task) ptraceAttach(target *Task, seize bool, opts uintptr) error { if t.tg == target.tg { return linuxerr.EPERM } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if !t.canTraceLocked(target, true) { return linuxerr.EPERM } if target.hasTracer() { return linuxerr.EPERM } // Attaching to zombies and dead tasks is not permitted; the exit // notification logic relies on this. Linux allows attaching to PF_EXITING // tasks, though. if target.exitState >= TaskExitZombie { return linuxerr.EPERM } if seize { if err := target.ptraceSetOptionsLocked(opts); err != nil { return syserror.EIO } } target.ptraceTracer.Store(t) t.ptraceTracees[target] = struct{}{} target.ptraceSeized = seize target.tg.signalHandlers.mu.Lock() // "Unlike PTRACE_ATTACH, PTRACE_SEIZE does not stop the process." - // ptrace(2) if !seize { target.sendSignalLocked(&linux.SignalInfo{ Signo: int32(linux.SIGSTOP), Code: linux.SI_USER, }, false /* group */) } // Undocumented Linux feature: If the tracee is already group-stopped (and // consequently will not report the SIGSTOP just sent), force it to leave // and re-enter the stop so that it will switch to a ptrace-stop. if target.stop == (*groupStop)(nil) { target.trapStopPending = true target.endInternalStopLocked() // TODO(jamieliu): Linux blocks ptrace_attach() until the task has // entered the ptrace-stop (or exited) via JOBCTL_TRAPPING. } target.tg.signalHandlers.mu.Unlock() return nil } // ptraceDetach implements ptrace(PTRACE_DETACH, target, 0, sig). t is the // caller. // // Preconditions: target must be a tracee of t in a frozen ptrace stop. // // Postconditions: If ptraceDetach returns nil, target will no longer be in a // ptrace stop. 
func (t *Task) ptraceDetach(target *Task, sig linux.Signal) error { if sig != 0 && !sig.IsValid() { return syserror.EIO } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() target.ptraceCode = int32(sig) target.forgetTracerLocked() delete(t.ptraceTracees, target) return nil } // exitPtrace is called in the exit path to detach all of t's tracees. func (t *Task) exitPtrace() { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() for target := range t.ptraceTracees { if target.ptraceOpts.ExitKill { target.tg.signalHandlers.mu.Lock() target.sendSignalLocked(&linux.SignalInfo{ Signo: int32(linux.SIGKILL), }, false /* group */) target.tg.signalHandlers.mu.Unlock() } // Leave ptraceCode unchanged so that if the task is ptrace-stopped, it // observes the ptraceCode it set before it entered the stop. I believe // this is consistent with Linux. target.forgetTracerLocked() } // "nil maps cannot be saved" t.ptraceTracees = make(map[*Task]struct{}) if t.ptraceYAMAExceptionAdded { delete(t.k.ptraceExceptions, t) for tracee, tracer := range t.k.ptraceExceptions { if tracer == t { delete(t.k.ptraceExceptions, tracee) } } } } // forgetTracerLocked detaches t's tracer and ensures that t is no longer // ptrace-stopped. // // Preconditions: The TaskSet mutex must be locked for writing. func (t *Task) forgetTracerLocked() { t.ptraceSeized = false t.ptraceOpts = ptraceOptions{} t.ptraceSyscallMode = ptraceSyscallNone t.ptraceSinglestep = false t.ptraceTracer.Store((*Task)(nil)) if t.exitTracerNotified && !t.exitTracerAcked { t.exitTracerAcked = true t.exitNotifyLocked(true) } t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() // Unset t.trapStopPending, which might have been set by PTRACE_INTERRUPT. If // it wasn't, it will be reset via t.groupStopPending after the following. t.trapStopPending = false // If t's thread group is in a group stop and t is eligible to participate, // make it do so. This is essentially the reverse of the special case in // ptraceAttach, which converts a group stop to a ptrace stop. ("Handling // of restart from group-stop is currently buggy, but the "as planned" // behavior is to leave tracee stopped and waiting for SIGCONT." - // ptrace(2)) if (t.tg.groupStopComplete || t.tg.groupStopPendingCount != 0) && !t.groupStopPending && t.exitState < TaskExitInitiated { t.groupStopPending = true // t already participated in the group stop when it unset // groupStopPending. t.groupStopAcknowledged = true t.interrupt() } if _, ok := t.stop.(*ptraceStop); ok { t.endInternalStopLocked() } } // ptraceSignalLocked is called after signal dequeueing to check if t should // enter ptrace signal-delivery-stop. // // Preconditions: // * The signal mutex must be locked. // * The caller must be running on the task goroutine. // +checklocks:t.tg.signalHandlers.mu func (t *Task) ptraceSignalLocked(info *linux.SignalInfo) bool { if linux.Signal(info.Signo) == linux.SIGKILL { return false } if !t.hasTracer() { return false } // The tracer might change this signal into a stop signal, in which case // any SIGCONT received after the signal was originally dequeued should // cancel it. This is consistent with Linux. t.tg.groupStopDequeued = true // This is unconditional in ptrace_stop(). t.trapStopPending = false // Can't lock the TaskSet mutex while holding a signal mutex. 
t.tg.signalHandlers.mu.Unlock() defer t.tg.signalHandlers.mu.Lock() t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() tracer := t.Tracer() if tracer == nil { return false } t.ptraceCode = info.Signo t.ptraceSiginfo = info t.Debugf("Entering signal-delivery-stop for signal %d", info.Signo) if t.beginPtraceStopLocked() { tracer.signalStop(t, linux.CLD_TRAPPED, info.Signo) tracer.tg.eventQueue.Notify(EventTraceeStop) } return true } // ptraceSeccomp is called when a seccomp-bpf filter returns action // SECCOMP_RET_TRACE to check if t should enter PTRACE_EVENT_SECCOMP stop. data // is the lower 16 bits of the filter's return value. func (t *Task) ptraceSeccomp(data uint16) bool { if !t.hasTracer() { return false } t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if !t.ptraceOpts.TraceSeccomp { return false } t.Debugf("Entering PTRACE_EVENT_SECCOMP stop") t.ptraceEventLocked(linux.PTRACE_EVENT_SECCOMP, uint64(data)) return true } // ptraceSyscallEnter is called immediately before entering a syscall to check // if t should enter ptrace syscall-enter-stop. func (t *Task) ptraceSyscallEnter() (taskRunState, bool) { if !t.hasTracer() { return nil, false } t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() switch t.ptraceSyscallMode { case ptraceSyscallNone: return nil, false case ptraceSyscallIntercept: t.Debugf("Entering syscall-enter-stop from PTRACE_SYSCALL") t.ptraceSyscallStopLocked() return (*runSyscallAfterSyscallEnterStop)(nil), true case ptraceSyscallEmu: t.Debugf("Entering syscall-enter-stop from PTRACE_SYSEMU") t.ptraceSyscallStopLocked() return (*runSyscallAfterSysemuStop)(nil), true } panic(fmt.Sprintf("Unknown ptraceSyscallMode: %v", t.ptraceSyscallMode)) } // ptraceSyscallExit is called immediately after leaving a syscall to check if // t should enter ptrace syscall-exit-stop. func (t *Task) ptraceSyscallExit() { if !t.hasTracer() { return } t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if t.ptraceSyscallMode != ptraceSyscallIntercept { return } t.Debugf("Entering syscall-exit-stop") t.ptraceSyscallStopLocked() } // Preconditions: The TaskSet mutex must be locked. func (t *Task) ptraceSyscallStopLocked() { code := int32(linux.SIGTRAP) if t.ptraceOpts.SysGood { code |= 0x80 } t.ptraceTrapLocked(code) } type ptraceCloneKind int32 const ( // ptraceCloneKindClone represents a call to Task.Clone where // TerminationSignal is not SIGCHLD and Vfork is false. ptraceCloneKindClone ptraceCloneKind = iota // ptraceCloneKindFork represents a call to Task.Clone where // TerminationSignal is SIGCHLD and Vfork is false. ptraceCloneKindFork // ptraceCloneKindVfork represents a call to Task.Clone where Vfork is // true. ptraceCloneKindVfork ) // ptraceClone is called at the end of a clone or fork syscall to check if t // should enter PTRACE_EVENT_CLONE, PTRACE_EVENT_FORK, or PTRACE_EVENT_VFORK // stop. child is the new task. 
func (t *Task) ptraceClone(kind ptraceCloneKind, child *Task, opts *CloneOptions) bool { if !t.hasTracer() { return false } t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() event := false if !opts.Untraced { switch kind { case ptraceCloneKindClone: if t.ptraceOpts.TraceClone { t.Debugf("Entering PTRACE_EVENT_CLONE stop") t.ptraceEventLocked(linux.PTRACE_EVENT_CLONE, uint64(t.tg.pidns.tids[child])) event = true } case ptraceCloneKindFork: if t.ptraceOpts.TraceFork { t.Debugf("Entering PTRACE_EVENT_FORK stop") t.ptraceEventLocked(linux.PTRACE_EVENT_FORK, uint64(t.tg.pidns.tids[child])) event = true } case ptraceCloneKindVfork: if t.ptraceOpts.TraceVfork { t.Debugf("Entering PTRACE_EVENT_VFORK stop") t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK, uint64(t.tg.pidns.tids[child])) event = true } default: panic(fmt.Sprintf("Unknown ptraceCloneKind: %v", kind)) } } // "If the PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or PTRACE_O_TRACECLONE // options are in effect, then children created by, respectively, vfork(2) // or clone(2) with the CLONE_VFORK flag, fork(2) or clone(2) with the exit // signal set to SIGCHLD, and other kinds of clone(2), are automatically // attached to the same tracer which traced their parent. SIGSTOP is // delivered to the children, causing them to enter signal-delivery-stop // after they exit the system call which created them." - ptrace(2) // // clone(2)'s documentation of CLONE_UNTRACED and CLONE_PTRACE is // confusingly wrong; see kernel/fork.c:_do_fork() => copy_process() => // include/linux/ptrace.h:ptrace_init_task(). if event || opts.InheritTracer { tracer := t.Tracer() if tracer != nil { child.ptraceTracer.Store(tracer) tracer.ptraceTracees[child] = struct{}{} // "The "seized" behavior ... is inherited by children that are // automatically attached using PTRACE_O_TRACEFORK, // PTRACE_O_TRACEVFORK, and PTRACE_O_TRACECLONE." - ptrace(2) child.ptraceSeized = t.ptraceSeized // "Flags are inherited by new tracees created and "auto-attached" // via active PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, or // PTRACE_O_TRACECLONE options." - ptrace(2) child.ptraceOpts = t.ptraceOpts child.tg.signalHandlers.mu.Lock() // "PTRACE_SEIZE: ... Automatically attached children stop with // PTRACE_EVENT_STOP and WSTOPSIG(status) returns SIGTRAP instead // of having SIGSTOP signal delivered to them." - ptrace(2) if child.ptraceSeized { child.trapStopPending = true } else { child.pendingSignals.enqueue(&linux.SignalInfo{ Signo: int32(linux.SIGSTOP), }, nil) } // The child will self-interrupt() when its task goroutine starts // running, so we don't have to. child.tg.signalHandlers.mu.Unlock() } } return event } // ptraceVforkDone is called after the end of a vfork stop to check if t should // enter PTRACE_EVENT_VFORK_DONE stop. child is the new task's thread ID in t's // PID namespace. func (t *Task) ptraceVforkDone(child ThreadID) bool { if !t.hasTracer() { return false } t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if !t.ptraceOpts.TraceVforkDone { return false } t.Debugf("Entering PTRACE_EVENT_VFORK_DONE stop") t.ptraceEventLocked(linux.PTRACE_EVENT_VFORK_DONE, uint64(child)) return true } // ptraceExec is called at the end of an execve syscall to check if t should // enter PTRACE_EVENT_EXEC stop. oldTID is t's thread ID, in its *tracer's* PID // namespace, prior to the execve. (If t did not have a tracer at the time // oldTID was read, oldTID may be 0. This is consistent with Linux.) 
func (t *Task) ptraceExec(oldTID ThreadID) { if !t.hasTracer() { return } t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() // Recheck with the TaskSet mutex locked. Most ptrace points don't need to // do this because detaching resets ptrace options, but PTRACE_EVENT_EXEC // is special because both TraceExec and !TraceExec do something if a // tracer is attached. if !t.hasTracer() { return } if t.ptraceOpts.TraceExec { t.Debugf("Entering PTRACE_EVENT_EXEC stop") t.ptraceEventLocked(linux.PTRACE_EVENT_EXEC, uint64(oldTID)) return } // "If the PTRACE_O_TRACEEXEC option is not in effect for the execing // tracee, and if the tracee was PTRACE_ATTACHed rather that [sic] // PTRACE_SEIZEd, the kernel delivers an extra SIGTRAP to the tracee after // execve(2) returns. This is an ordinary signal (similar to one which can // be generated by `kill -TRAP`), not a special kind of ptrace-stop. // Employing PTRACE_GETSIGINFO for this signal returns si_code set to 0 // (SI_USER). This signal may be blocked by signal mask, and thus may be // delivered (much) later." - ptrace(2) if t.ptraceSeized { return } t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.sendSignalLocked(&linux.SignalInfo{ Signo: int32(linux.SIGTRAP), Code: linux.SI_USER, }, false /* group */) } // ptraceExit is called early in the task exit path to check if t should enter // PTRACE_EVENT_EXIT stop. func (t *Task) ptraceExit() { if !t.hasTracer() { return } t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if !t.ptraceOpts.TraceExit { return } t.tg.signalHandlers.mu.Lock() status := t.exitStatus t.tg.signalHandlers.mu.Unlock() t.Debugf("Entering PTRACE_EVENT_EXIT stop") t.ptraceEventLocked(linux.PTRACE_EVENT_EXIT, uint64(status)) } // Preconditions: The TaskSet mutex must be locked. func (t *Task) ptraceEventLocked(event int32, msg uint64) { t.ptraceEventMsg = msg // """ // PTRACE_EVENT stops are observed by the tracer as waitpid(2) returning // with WIFSTOPPED(status), and WSTOPSIG(status) returns SIGTRAP. An // additional bit is set in the higher byte of the status word: the value // status>>8 will be // // (SIGTRAP | PTRACE_EVENT_foo << 8). // // ... // // """ - ptrace(2) t.ptraceTrapLocked(int32(linux.SIGTRAP) | (event << 8)) } // ptraceKill implements ptrace(PTRACE_KILL, target). t is the caller. func (t *Task) ptraceKill(target *Task) error { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if target.Tracer() != t { return linuxerr.ESRCH } target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() // "This operation is deprecated; do not use it! Instead, send a SIGKILL // directly using kill(2) or tgkill(2). The problem with PTRACE_KILL is // that it requires the tracee to be in signal-delivery-stop, otherwise it // may not work (i.e., may complete successfully but won't kill the // tracee)."
- ptrace(2) if target.stop == nil { return nil } if _, ok := target.stop.(*ptraceStop); !ok { return nil } target.ptraceCode = int32(linux.SIGKILL) target.endInternalStopLocked() return nil } func (t *Task) ptraceInterrupt(target *Task) error { t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() if target.Tracer() != t { return linuxerr.ESRCH } if !target.ptraceSeized { return syserror.EIO } target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() if target.killedLocked() || target.exitState >= TaskExitInitiated { return nil } target.trapStopPending = true if s, ok := target.stop.(*ptraceStop); ok && s.listen { target.endInternalStopLocked() } target.interrupt() return nil } // Preconditions: // * The TaskSet mutex must be locked for writing. // * t must have a tracer. func (t *Task) ptraceSetOptionsLocked(opts uintptr) error { const valid = uintptr(linux.PTRACE_O_EXITKILL | linux.PTRACE_O_TRACESYSGOOD | linux.PTRACE_O_TRACECLONE | linux.PTRACE_O_TRACEEXEC | linux.PTRACE_O_TRACEEXIT | linux.PTRACE_O_TRACEFORK | linux.PTRACE_O_TRACESECCOMP | linux.PTRACE_O_TRACEVFORK | linux.PTRACE_O_TRACEVFORKDONE) if opts&^valid != 0 { return linuxerr.EINVAL } t.ptraceOpts = ptraceOptions{ ExitKill: opts&linux.PTRACE_O_EXITKILL != 0, SysGood: opts&linux.PTRACE_O_TRACESYSGOOD != 0, TraceClone: opts&linux.PTRACE_O_TRACECLONE != 0, TraceExec: opts&linux.PTRACE_O_TRACEEXEC != 0, TraceExit: opts&linux.PTRACE_O_TRACEEXIT != 0, TraceFork: opts&linux.PTRACE_O_TRACEFORK != 0, TraceSeccomp: opts&linux.PTRACE_O_TRACESECCOMP != 0, TraceVfork: opts&linux.PTRACE_O_TRACEVFORK != 0, TraceVforkDone: opts&linux.PTRACE_O_TRACEVFORKDONE != 0, } return nil } // Ptrace implements the ptrace system call. func (t *Task) Ptrace(req int64, pid ThreadID, addr, data hostarch.Addr) error { // PTRACE_TRACEME ignores all other arguments. if req == linux.PTRACE_TRACEME { return t.ptraceTraceme() } // All other ptrace requests operate on a current or future tracee // specified by pid. target := t.tg.pidns.TaskWithID(pid) if target == nil { return linuxerr.ESRCH } // PTRACE_ATTACH and PTRACE_SEIZE do not require that target is not already // a tracee. if req == linux.PTRACE_ATTACH || req == linux.PTRACE_SEIZE { seize := req == linux.PTRACE_SEIZE if seize && addr != 0 { return syserror.EIO } return t.ptraceAttach(target, seize, uintptr(data)) } // PTRACE_KILL and PTRACE_INTERRUPT require that the target is a tracee, // but do not require that it is ptrace-stopped. if req == linux.PTRACE_KILL { return t.ptraceKill(target) } if req == linux.PTRACE_INTERRUPT { return t.ptraceInterrupt(target) } // All other ptrace requests require that the target is a ptrace-stopped // tracee, and freeze the ptrace-stop so the tracee can be operated on. t.tg.pidns.owner.mu.RLock() if target.Tracer() != t { t.tg.pidns.owner.mu.RUnlock() return linuxerr.ESRCH } if !target.ptraceFreeze() { t.tg.pidns.owner.mu.RUnlock() // "Most ptrace commands (all except PTRACE_ATTACH, PTRACE_SEIZE, // PTRACE_TRACEME, PTRACE_INTERRUPT, and PTRACE_KILL) require the // tracee to be in a ptrace-stop, otherwise they fail with ESRCH." - // ptrace(2) return linuxerr.ESRCH } t.tg.pidns.owner.mu.RUnlock() // Even if the target has a ptrace-stop active, the tracee's task goroutine // may not yet have reached Task.doStop; wait for it to do so. This is safe // because there's no way for target to initiate a ptrace-stop and then // block (by calling Task.block) before entering it.
// // Caveat: If tasks were just restored, the tracee's first call to // Task.Activate (in Task.run) occurs before its first call to Task.doStop, // which may block if the tracer's address space is active. t.UninterruptibleSleepStart(true) target.waitGoroutineStoppedOrExited() t.UninterruptibleSleepFinish(true) // Resuming commands end the ptrace stop, but only if successful. // PTRACE_LISTEN ends the ptrace stop if trapNotifyPending is already set on the // target. switch req { case linux.PTRACE_DETACH: if err := t.ptraceDetach(target, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil case linux.PTRACE_CONT: if err := target.ptraceUnstop(ptraceSyscallNone, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil case linux.PTRACE_SYSCALL: if err := target.ptraceUnstop(ptraceSyscallIntercept, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil case linux.PTRACE_SINGLESTEP: if err := target.ptraceUnstop(ptraceSyscallNone, true, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil case linux.PTRACE_SYSEMU: if err := target.ptraceUnstop(ptraceSyscallEmu, false, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil case linux.PTRACE_SYSEMU_SINGLESTEP: if err := target.ptraceUnstop(ptraceSyscallEmu, true, linux.Signal(data)); err != nil { target.ptraceUnfreeze() return err } return nil case linux.PTRACE_LISTEN: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if !target.ptraceSeized { return syserror.EIO } if target.ptraceSiginfo == nil { return syserror.EIO } if target.ptraceSiginfo.Code>>8 != linux.PTRACE_EVENT_STOP { return syserror.EIO } target.tg.signalHandlers.mu.Lock() defer target.tg.signalHandlers.mu.Unlock() if target.trapNotifyPending { target.endInternalStopLocked() } else { target.stop.(*ptraceStop).listen = true target.ptraceUnfreezeLocked() } return nil } // All other ptrace requests expect us to unfreeze the stop. defer target.ptraceUnfreeze() switch req { case linux.PTRACE_PEEKTEXT, linux.PTRACE_PEEKDATA: // "At the system call level, the PTRACE_PEEKTEXT, PTRACE_PEEKDATA, and // PTRACE_PEEKUSER requests have a different API: they store the result // at the address specified by the data parameter, and the return value // is the error flag." - ptrace(2) word := t.Arch().Native(0) if _, err := word.CopyIn(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr); err != nil { return err } _, err := word.CopyOut(t, data) return err case linux.PTRACE_POKETEXT, linux.PTRACE_POKEDATA: word := t.Arch().Native(uintptr(data)) _, err := word.CopyOut(target.CopyContext(t, usermem.IOOpts{IgnorePermissions: true}), addr) return err case linux.PTRACE_GETREGSET: // "Read the tracee's registers. addr specifies, in an // architecture-dependent way, the type of registers to be read. ... // data points to a struct iovec, which describes the destination // buffer's location and length. On return, the kernel modifies iov.len // to indicate the actual number of bytes returned." 
- ptrace(2) ars, err := t.CopyInIovecs(data, 1) if err != nil { return err } t.p.PullFullState(t.MemoryManager().AddressSpace(), t.Arch()) ar := ars.Head() n, err := target.Arch().PtraceGetRegSet(uintptr(addr), &usermem.IOReadWriter{ Ctx: t, IO: t.MemoryManager(), Addr: ar.Start, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, }, int(ar.Length())) if err != nil { return err } // Update iovecs to represent the range of the written register set. end, ok := ar.Start.AddLength(uint64(n)) if !ok { panic(fmt.Sprintf("%#x + %#x overflows. Invalid reg size > %#x", ar.Start, n, ar.Length())) } ar.End = end return t.CopyOutIovecs(data, hostarch.AddrRangeSeqOf(ar)) case linux.PTRACE_SETREGSET: ars, err := t.CopyInIovecs(data, 1) if err != nil { return err } mm := t.MemoryManager() t.p.PullFullState(mm.AddressSpace(), t.Arch()) ar := ars.Head() n, err := target.Arch().PtraceSetRegSet(uintptr(addr), &usermem.IOReadWriter{ Ctx: t, IO: mm, Addr: ar.Start, Opts: usermem.IOOpts{ AddressSpaceActive: true, }, }, int(ar.Length())) if err != nil { return err } t.p.FullStateChanged() ar.End -= hostarch.Addr(n) return t.CopyOutIovecs(data, hostarch.AddrRangeSeqOf(ar)) case linux.PTRACE_GETSIGINFO: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if target.ptraceSiginfo == nil { return linuxerr.EINVAL } _, err := target.ptraceSiginfo.CopyOut(t, data) return err case linux.PTRACE_SETSIGINFO: var info linux.SignalInfo if _, err := info.CopyIn(t, data); err != nil { return err } t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() if target.ptraceSiginfo == nil { return linuxerr.EINVAL } target.ptraceSiginfo = &info return nil case linux.PTRACE_GETSIGMASK: if addr != linux.SignalSetSize { return linuxerr.EINVAL } mask := target.SignalMask() _, err := mask.CopyOut(t, data) return err case linux.PTRACE_SETSIGMASK: if addr != linux.SignalSetSize { return linuxerr.EINVAL } var mask linux.SignalSet if _, err := mask.CopyIn(t, data); err != nil { return err } // The target's task goroutine is stopped, so this is safe: target.SetSignalMask(mask &^ UnblockableSignals) return nil case linux.PTRACE_SETOPTIONS: t.tg.pidns.owner.mu.Lock() defer t.tg.pidns.owner.mu.Unlock() return target.ptraceSetOptionsLocked(uintptr(data)) case linux.PTRACE_GETEVENTMSG: t.tg.pidns.owner.mu.RLock() defer t.tg.pidns.owner.mu.RUnlock() _, err := primitive.CopyUint64Out(t, hostarch.Addr(data), target.ptraceEventMsg) return err // PEEKSIGINFO is unimplemented but seems to have no users anywhere. default: return t.ptraceArch(target, req, addr, data) } }
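// Illustrative sketch (not part of the original file): ptraceEventLocked above
// builds the trap code as SIGTRAP | (event << 8), which becomes the status>>8
// value a tracer observes from waitpid(2). A standalone check of that
// arithmetic, using the standard Linux values SIGTRAP=5 and
// PTRACE_EVENT_EXEC=4:
package main

import "fmt"

const (
	sigtrap         = 5 // SIGTRAP on Linux
	ptraceEventExec = 4 // PTRACE_EVENT_EXEC per ptrace(2)
)

func main() {
	code := sigtrap | ptraceEventExec<<8
	// A tracer decodes the pieces back out of the stop code:
	fmt.Println(code & 0xff) // 5: WSTOPSIG yields SIGTRAP
	fmt.Println(code >> 8)   // 4: the PTRACE_EVENT_EXEC event number
}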
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
)

// vdsoParams are the parameters exposed to the VDSO.
//
// They are exposed to the VDSO via a parameter page managed by VDSOParamPage,
// which also includes a sequence counter.
//
// +marshal
type vdsoParams struct {
	monotonicReady      uint64
	monotonicBaseCycles int64
	monotonicBaseRef    int64
	monotonicFrequency  uint64

	realtimeReady      uint64
	realtimeBaseCycles int64
	realtimeBaseRef    int64
	realtimeFrequency  uint64
}

// VDSOParamPage manages a VDSO parameter page.
//
// Its memory layout looks like:
//
//	type page struct {
//		// seq is a sequence counter that protects the fields below.
//		seq uint64
//		vdsoParams
//	}
//
// Everything in the struct is 8 bytes for easy alignment.
//
// It must be kept in sync with params in vdso/vdso_time.cc.
//
// +stateify savable
type VDSOParamPage struct {
	// The parameter page is fr, allocated from mfp.MemoryFile().
	mfp pgalloc.MemoryFileProvider
	fr  memmap.FileRange

	// seq is the current sequence count written to the page.
	//
	// A write is in progress if bit 1 of the counter is set.
	//
	// Timekeeper's updater goroutine may call Write before equality is
	// checked in state_test_util tests, causing this field to change across
	// save / restore.
	seq uint64

	// copyScratchBuffer is a temporary buffer used to marshal the params before
	// copying it to the real parameter page. The parameter page is typically
	// updated at a moderate frequency of ~O(seconds) throughout the lifetime of
	// the sentry, so reusing this buffer is a good tradeoff between memory
	// usage and the cost of allocation.
	copyScratchBuffer []byte
}

// NewVDSOParamPage returns a VDSOParamPage.
//
// Preconditions:
// * fr is a single page allocated from mfp.MemoryFile(). VDSOParamPage does
//   not take ownership of fr; it must remain allocated for the lifetime of the
//   VDSOParamPage.
// * VDSOParamPage must be the only writer to fr.
// * mfp.MemoryFile().MapInternal(fr) must return a single safemem.Block.
func NewVDSOParamPage(mfp pgalloc.MemoryFileProvider, fr memmap.FileRange) *VDSOParamPage {
	return &VDSOParamPage{
		mfp:               mfp,
		fr:                fr,
		copyScratchBuffer: make([]byte, (*vdsoParams)(nil).SizeBytes()),
	}
}

// access returns a mapping of the param page.
func (v *VDSOParamPage) access() (safemem.Block, error) {
	bs, err := v.mfp.MemoryFile().MapInternal(v.fr, hostarch.ReadWrite)
	if err != nil {
		return safemem.Block{}, err
	}
	if bs.NumBlocks() != 1 {
		panic(fmt.Sprintf("Multiple blocks (%d) in VDSO param BlockSeq", bs.NumBlocks()))
	}
	return bs.Head(), nil
}

// incrementSeq increments the sequence counter in the param page.
func (v *VDSOParamPage) incrementSeq(paramPage safemem.Block) error {
	next := v.seq + 1
	old, err := safemem.SwapUint64(paramPage, next)
	if err != nil {
		return err
	}
	if old != v.seq {
		return fmt.Errorf("unexpected VDSOParamPage seq value: got %d expected %d; application may hang or get incorrect time from the VDSO", old, v.seq)
	}
	v.seq = next
	return nil
}

// Write updates the VDSO parameters.
//
// Write starts a write block, calls f to get the new parameters, writes
// out the new parameters, then ends the write block.
func (v *VDSOParamPage) Write(f func() vdsoParams) error {
	paramPage, err := v.access()
	if err != nil {
		return err
	}

	// Write begin.
	next := v.seq + 1
	if next%2 != 1 {
		panic("Out-of-order sequence count")
	}
	err = v.incrementSeq(paramPage)
	if err != nil {
		return err
	}

	// Get the new params.
	p := f()
	buf := v.copyScratchBuffer[:p.SizeBytes()]
	p.MarshalUnsafe(buf)

	// Skip the sequence counter.
	if _, err := safemem.Copy(paramPage.DropFirst(8), safemem.BlockFromSafeSlice(buf)); err != nil {
		panic(fmt.Sprintf("Unable to set VDSO parameters: %v", err))
	}

	// Write end.
	return v.incrementSeq(paramPage)
}
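// Illustrative sketch (not part of the original file): Write above is the
// writer side of a seqcount protocol; seq is odd while the params are being
// rewritten and even once they are consistent. A reader (the VDSO code in
// vdso/vdso_time.cc) retries until it sees an even, unchanged count. The
// types and loads below are simplified stand-ins, not the real VDSO code:
package main

import "sync/atomic"

type page struct {
	seq    uint64
	params [8]uint64 // stand-in for vdsoParams
}

// read retries until it observes an even, unchanged sequence count,
// guaranteeing the params it copied were not torn by a concurrent write.
func read(p *page) [8]uint64 {
	for {
		seq1 := atomic.LoadUint64(&p.seq)
		if seq1%2 != 0 {
			continue // write in progress
		}
		params := p.params // real code needs acquire/release fencing here
		seq2 := atomic.LoadUint64(&p.seq)
		if seq1 == seq2 {
			return params
		}
	}
}

func main() { _ = read(&page{}) }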
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package device defines reserved virtual kernel devices and structures
// for managing them.
package device

import (
	"bytes"
	"fmt"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sync"
)

// Registry tracks all simple devices and related state on the system for
// save/restore.
//
// The set of devices across save/restore must remain consistent. That is, no
// devices may be created or removed on restore relative to the saved
// system. Practically, this means do not create new devices specifically as
// part of restore.
//
// +stateify savable
type Registry struct {
	// lastAnonDeviceMinor is the last minor device number used for an anonymous
	// device. Must be accessed atomically.
	lastAnonDeviceMinor uint64

	// mu protects the fields below.
	mu sync.Mutex `state:"nosave"`

	devices map[ID]*Device
}

// SimpleDevices is the system-wide simple device registry. This is
// saved/restored by kernel.Kernel, but defined here to allow access without
// depending on the kernel package. See kernel.Kernel.deviceRegistry.
var SimpleDevices = newRegistry()

func newRegistry() *Registry {
	return &Registry{
		devices: make(map[ID]*Device),
	}
}

// newAnonID assigns a major and minor number to an anonymous device ID.
func (r *Registry) newAnonID() ID {
	return ID{
		// Anon devices always have a major number of 0.
		Major: 0,
		// Use the next minor number.
		Minor: atomic.AddUint64(&r.lastAnonDeviceMinor, 1),
	}
}

// newAnonDevice allocates a new anonymous device with a unique minor device
// number, and registers it with r.
func (r *Registry) newAnonDevice() *Device {
	r.mu.Lock()
	defer r.mu.Unlock()
	d := &Device{
		ID: r.newAnonID(),
	}
	r.devices[d.ID] = d
	return d
}

// LoadFrom initializes the internal state of all devices in r from other. The
// set of devices in both registries must match. Devices may not be created or
// destroyed across save/restore.
func (r *Registry) LoadFrom(other *Registry) {
	r.mu.Lock()
	defer r.mu.Unlock()
	other.mu.Lock()
	defer other.mu.Unlock()
	if len(r.devices) != len(other.devices) {
		panic(fmt.Sprintf("Devices were added or removed when restoring the registry:\nnew:\n%+v\nold:\n%+v", r.devices, other.devices))
	}
	for id, otherD := range other.devices {
		ourD, ok := r.devices[id]
		if !ok {
			panic(fmt.Sprintf("Device %+v could not be restored as it wasn't defined in the new registry", otherD))
		}
		ourD.loadFrom(otherD)
	}
	atomic.StoreUint64(&r.lastAnonDeviceMinor, atomic.LoadUint64(&other.lastAnonDeviceMinor))
}

// ID identifies a device.
//
// +stateify savable
type ID struct {
	Major uint64
	Minor uint64
}

// DeviceID formats a major and minor device number into a standard device number.
func (i *ID) DeviceID() uint64 {
	return uint64(linux.MakeDeviceID(uint16(i.Major), uint32(i.Minor)))
}

// NewAnonDevice creates a new anonymous device. Packages that require an anonymous
// device should initialize the device in a global variable in a file called device.go:
//
//	var myDevice = device.NewAnonDevice()
func NewAnonDevice() *Device {
	return SimpleDevices.newAnonDevice()
}

// NewAnonMultiDevice creates a new multi-keyed anonymous device. Packages that require
// a multi-key anonymous device should initialize the device in a global variable in a
// file called device.go:
//
//	var myDevice = device.NewAnonMultiDevice()
func NewAnonMultiDevice() *MultiDevice {
	return &MultiDevice{
		ID: SimpleDevices.newAnonID(),
	}
}

// Device is a simple virtual kernel device.
//
// +stateify savable
type Device struct {
	ID

	// last is the last generated inode.
	last uint64
}

// loadFrom initializes d from other. The IDs of both devices must match.
func (d *Device) loadFrom(other *Device) {
	if d.ID != other.ID {
		panic(fmt.Sprintf("Attempting to initialize a device %+v from %+v, but device IDs don't match", d, other))
	}
	atomic.StoreUint64(&d.last, atomic.LoadUint64(&other.last))
}

// NextIno generates a new inode number.
func (d *Device) NextIno() uint64 {
	return atomic.AddUint64(&d.last, 1)
}

// MultiDeviceKey provides a hashable key for a MultiDevice. The key consists
// of a raw device and inode for a resource, which must consistently identify
// the unique resource. It may optionally include a secondary device if
// appropriate.
//
// Note that using the path is not enough, because filesystems may rename a file
// to a different backing resource, at which point the path points to a different
// entity. Using only the inode is also not enough because the inode is assumed
// to be unique only within the device on which the resource exists.
type MultiDeviceKey struct {
	Device          uint64
	SecondaryDevice string
	Inode           uint64
}

// String stringifies the key.
func (m MultiDeviceKey) String() string {
	return fmt.Sprintf("key{device: %d, sdevice: %s, inode: %d}", m.Device, m.SecondaryDevice, m.Inode)
}

// MultiDevice allows for remapping resources that come from a variety of raw
// devices into a single device. The device ID should be one of the static
// Device IDs above and cannot be reused.
type MultiDevice struct {
	ID

	mu     sync.Mutex
	last   uint64
	cache  map[MultiDeviceKey]uint64
	rcache map[uint64]MultiDeviceKey
}

// String stringifies MultiDevice.
func (m *MultiDevice) String() string {
	m.mu.Lock()
	defer m.mu.Unlock()

	buf := bytes.NewBuffer(nil)
	buf.WriteString("cache{")
	for k, v := range m.cache {
		buf.WriteString(fmt.Sprintf("%s -> %d, ", k, v))
	}
	buf.WriteString("}")
	return buf.String()
}

// Map maps a raw device and inode into the inode space of MultiDevice,
// returning a virtualized inode. Raw devices and inodes can be reused;
// in this case, the same virtual inode will be returned.
func (m *MultiDevice) Map(key MultiDeviceKey) uint64 {
	m.mu.Lock()
	defer m.mu.Unlock()

	if m.cache == nil {
		m.cache = make(map[MultiDeviceKey]uint64)
		m.rcache = make(map[uint64]MultiDeviceKey)
	}

	id, ok := m.cache[key]
	if ok {
		return id
	}
	// Step over reserved entries that may have been loaded.
	idx := m.last + 1
	for {
		if _, ok := m.rcache[idx]; !ok {
			break
		}
		idx++
	}
	// We found a non-reserved entry, use it.
	m.last = idx
	m.cache[key] = m.last
	m.rcache[m.last] = key
	return m.last
}

// Load loads a raw device and inode into MultiDevice inode mappings
// with value as the virtual inode.
//
// By design, inodes start from 1 and continue until max uint64. This means
// that the zero value, which is often the uninitialized value, can be rejected
// as invalid.
func (m *MultiDevice) Load(key MultiDeviceKey, value uint64) bool {
	// Reject the uninitialized value; see comment above.
	if value == 0 {
		return false
	}

	m.mu.Lock()
	defer m.mu.Unlock()

	if m.cache == nil {
		m.cache = make(map[MultiDeviceKey]uint64)
		m.rcache = make(map[uint64]MultiDeviceKey)
	}

	if val, exists := m.cache[key]; exists && val != value {
		return false
	}
	if k, exists := m.rcache[value]; exists && k != key {
		// Should never happen.
		panic(fmt.Sprintf("MultiDevice's caches are inconsistent, current: %+v, previous: %+v", key, k))
	}

	// Cache value at key.
	m.cache[key] = value
	// Prevent value from being used by new inode mappings.
	m.rcache[value] = key
	return true
}
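// Illustrative sketch (not part of the original file): Map allocates a stable
// virtual inode per key, and Load pre-reserves mappings (e.g. on restore) so
// that Map skips them. A minimal standalone demonstration of the same
// forward/reverse-cache idea, with hypothetical types:
package main

import "fmt"

type key struct{ dev, ino uint64 }

type multi struct {
	last   uint64
	cache  map[key]uint64
	rcache map[uint64]key
}

func (m *multi) mapKey(k key) uint64 {
	if v, ok := m.cache[k]; ok {
		return v // the same key always yields the same virtual inode
	}
	idx := m.last + 1
	for {
		if _, reserved := m.rcache[idx]; !reserved {
			break // skip inodes reserved by earlier loads
		}
		idx++
	}
	m.last = idx
	m.cache[k] = idx
	m.rcache[idx] = k
	return idx
}

func main() {
	m := &multi{cache: map[key]uint64{}, rcache: map[uint64]key{}}
	// Pretend virtual inode 1 was loaded from a saved state.
	m.cache[key{9, 9}] = 1
	m.rcache[1] = key{9, 9}
	fmt.Println(m.mapKey(key{8, 42})) // 2: inode 1 is reserved, so it is skipped
	fmt.Println(m.mapKey(key{8, 42})) // 2 again: the mapping is stable
}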
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build !arm,!mips,!386

package atomicbitops

import "sync/atomic"

// AlignedAtomicInt64 is an atomic int64 that is guaranteed to be 64-bit
// aligned, even on 32-bit systems. On most architectures, it's just a regular
// int64.
//
// See aligned_unsafe.go in this directory for justification.
//
// +stateify savable
type AlignedAtomicInt64 struct {
	value int64
}

// Load is analogous to atomic.LoadInt64.
func (aa *AlignedAtomicInt64) Load() int64 {
	return atomic.LoadInt64(&aa.value)
}

// Store is analogous to atomic.StoreInt64.
func (aa *AlignedAtomicInt64) Store(v int64) {
	atomic.StoreInt64(&aa.value, v)
}

// Add is analogous to atomic.AddInt64.
func (aa *AlignedAtomicInt64) Add(v int64) int64 {
	return atomic.AddInt64(&aa.value, v)
}

// AlignedAtomicUint64 is an atomic uint64 that is guaranteed to be 64-bit
// aligned, even on 32-bit systems. On most architectures, it's just a regular
// uint64.
//
// See aligned_unsafe.go in this directory for justification.
//
// +stateify savable
type AlignedAtomicUint64 struct {
	value uint64
}

// Load is analogous to atomic.LoadUint64.
func (aa *AlignedAtomicUint64) Load() uint64 {
	return atomic.LoadUint64(&aa.value)
}

// Store is analogous to atomic.StoreUint64.
func (aa *AlignedAtomicUint64) Store(v uint64) {
	atomic.StoreUint64(&aa.value, v)
}

// Add is analogous to atomic.AddUint64.
func (aa *AlignedAtomicUint64) Add(v uint64) uint64 {
	return atomic.AddUint64(&aa.value, v)
}
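// Illustrative sketch (not part of the original file): the wrapper matters on
// 32-bit platforms, where sync/atomic requires 64-bit operands to be 8-byte
// aligned but only guarantees that for the first word in a struct or
// allocation. A layout like the one below can misalign a bare uint64 field on
// a 32-bit build, which is exactly the case the aligned types guard against:
package main

import (
	"fmt"
	"sync/atomic"
	"unsafe"
)

type counters struct {
	flag uint32 // 4 bytes; on a 32-bit build, hits may land at offset 4
	hits uint64 // atomic ops on a misaligned 64-bit field can fault/panic
}

func main() {
	var c counters
	// On 64-bit builds this prints 8 (the compiler pads), so the atomic is
	// safe; on 32-bit builds it can be 4, which is why such fields are
	// routed through AlignedAtomicUint64 instead.
	fmt.Println(unsafe.Offsetof(c.hits))
	atomic.AddUint64(&c.hits, 1)
	fmt.Println(atomic.LoadUint64(&c.hits))
}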
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package auth

// idMapFunctions "implements" generic interface segment.Functions for
// idMapSet. An idMapSet maps non-overlapping ranges of contiguous IDs in one
// user namespace to non-overlapping ranges of contiguous IDs in another user
// namespace. Each such ID mapping is implemented as a range-to-value mapping
// in the set such that [range.Start(), range.End()) => [value, value +
// range.Length()).
type idMapFunctions struct{}

func (idMapFunctions) MinKey() uint32 {
	return 0
}

func (idMapFunctions) MaxKey() uint32 {
	return NoID
}

func (idMapFunctions) ClearValue(*uint32) {}

func (idMapFunctions) Merge(r1 idMapRange, val1 uint32, r2 idMapRange, val2 uint32) (uint32, bool) {
	// Mapped ranges have to be contiguous.
	if val1+r1.Length() != val2 {
		return 0, false
	}
	return val1, true
}

func (idMapFunctions) Split(r idMapRange, val uint32, split uint32) (uint32, uint32) {
	return val, val + (split - r.Start)
}
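// Illustrative sketch (not part of the original file): each segment maps
// [start, end) in one namespace to [value, value+length) in the other, so
// translating an ID is pure offset arithmetic, the same arithmetic Split uses
// above. A minimal standalone version with hypothetical names:
package main

import "fmt"

type mapping struct {
	start, end uint32 // key range [start, end)
	value      uint32 // first mapped ID in the other namespace
}

// translate returns the mapped ID, or false if id falls outside the range.
func translate(m mapping, id uint32) (uint32, bool) {
	if id < m.start || id >= m.end {
		return 0, false
	}
	return m.value + (id - m.start), true
}

func main() {
	// Map container IDs [0, 1000) to host IDs [100000, 101000), the shape
	// of a typical /proc/self/uid_map entry "0 100000 1000".
	m := mapping{start: 0, end: 1000, value: 100000}
	fmt.Println(translate(m, 42)) // 100042 true
}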
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package p9

import (
	"fmt"
	"math"

	"gvisor.dev/gvisor/pkg/fd"
)

// ErrInvalidMsgType is returned when an unsupported message type is found.
type ErrInvalidMsgType struct {
	MsgType
}

// Error returns a useful string.
func (e *ErrInvalidMsgType) Error() string {
	return fmt.Sprintf("invalid message type: %d", e.MsgType)
}

// message is a generic 9P message.
type message interface {
	encoder
	fmt.Stringer

	// Type returns the message type number.
	Type() MsgType
}

// payloader is a special message which may include an inline payload.
type payloader interface {
	// FixedSize returns the size of the fixed portion of this message.
	FixedSize() uint32

	// Payload returns the payload for sending.
	Payload() []byte

	// SetPayload sets the decoded payload.
	//
	// This is going to be total message size - FixedSize. But this should
	// be validated during decode, which will be called after SetPayload.
	SetPayload([]byte)
}

// filer is a message capable of passing a file.
type filer interface {
	// FilePayload returns the file payload.
	FilePayload() *fd.FD

	// SetFilePayload sets the file payload.
	SetFilePayload(*fd.FD)
}

// filePayload embeds a File object.
type filePayload struct {
	File *fd.FD
}

// FilePayload returns the file payload.
func (f *filePayload) FilePayload() *fd.FD {
	return f.File
}

// SetFilePayload sets the received file.
func (f *filePayload) SetFilePayload(file *fd.FD) {
	f.File = file
}

// Tversion is a version request.
type Tversion struct {
	// MSize is the message size to use.
	MSize uint32

	// Version is the version string.
	//
	// For this implementation, this must be 9P2000.L.
	Version string
}

// decode implements encoder.decode.
func (t *Tversion) decode(b *buffer) {
	t.MSize = b.Read32()
	t.Version = b.ReadString()
}

// encode implements encoder.encode.
func (t *Tversion) encode(b *buffer) {
	b.Write32(t.MSize)
	b.WriteString(t.Version)
}

// Type implements message.Type.
func (*Tversion) Type() MsgType {
	return MsgTversion
}

// String implements fmt.Stringer.
func (t *Tversion) String() string {
	return fmt.Sprintf("Tversion{MSize: %d, Version: %s}", t.MSize, t.Version)
}

// Rversion is a version response.
type Rversion struct {
	// MSize is the negotiated size.
	MSize uint32

	// Version is the negotiated version.
	Version string
}

// decode implements encoder.decode.
func (r *Rversion) decode(b *buffer) {
	r.MSize = b.Read32()
	r.Version = b.ReadString()
}

// encode implements encoder.encode.
func (r *Rversion) encode(b *buffer) {
	b.Write32(r.MSize)
	b.WriteString(r.Version)
}

// Type implements message.Type.
func (*Rversion) Type() MsgType {
	return MsgRversion
}

// String implements fmt.Stringer.
func (r *Rversion) String() string {
	return fmt.Sprintf("Rversion{MSize: %d, Version: %s}", r.MSize, r.Version)
}

// Tflush is a flush request.
type Tflush struct {
	// OldTag is the tag to wait on.
	OldTag Tag
}

// decode implements encoder.decode.
func (t *Tflush) decode(b *buffer) {
	t.OldTag = b.ReadTag()
}

// encode implements encoder.encode.
func (t *Tflush) encode(b *buffer) {
	b.WriteTag(t.OldTag)
}

// Type implements message.Type.
func (*Tflush) Type() MsgType {
	return MsgTflush
}

// String implements fmt.Stringer.
func (t *Tflush) String() string {
	return fmt.Sprintf("Tflush{OldTag: %d}", t.OldTag)
}

// Rflush is a flush response.
type Rflush struct {
}

// decode implements encoder.decode.
func (*Rflush) decode(*buffer) {
}

// encode implements encoder.encode.
func (*Rflush) encode(*buffer) {
}

// Type implements message.Type.
func (*Rflush) Type() MsgType {
	return MsgRflush
}

// String implements fmt.Stringer.
func (r *Rflush) String() string {
	return "Rflush{}"
}

// Twalk is a walk request.
type Twalk struct {
	// FID is the FID to be walked.
	FID FID

	// NewFID is the resulting FID.
	NewFID FID

	// Names are the set of names to be walked.
	Names []string
}

// decode implements encoder.decode.
func (t *Twalk) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.NewFID = b.ReadFID()
	n := b.Read16()
	t.Names = t.Names[:0]
	for i := 0; i < int(n); i++ {
		t.Names = append(t.Names, b.ReadString())
	}
}
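// Illustrative sketch (not part of the original file): on the wire every 9P
// message is framed as size[4] type[1] tag[2] followed by the body, and for
// payloader messages the body is the fixed portion followed by the inline
// payload, so payload length = size - 7 - FixedSize(). A standalone framing
// example; the header layout is standard 9P, but the frame helper itself is
// hypothetical:
package main

import (
	"encoding/binary"
	"fmt"
)

const headerLength = 7 // size[4] + type[1] + tag[2]

// frame prepends the standard 9P header to a fixed body and inline payload.
func frame(msgType uint8, tag uint16, fixed, payload []byte) []byte {
	total := uint32(headerLength + len(fixed) + len(payload))
	out := make([]byte, 0, total)
	out = binary.LittleEndian.AppendUint32(out, total)
	out = append(out, msgType)
	out = binary.LittleEndian.AppendUint16(out, tag)
	out = append(out, fixed...)
	return append(out, payload...)
}

func main() {
	// An Rread-shaped message: the fixed part is count[4], the payload is
	// the data itself.
	data := []byte("hello")
	fixed := binary.LittleEndian.AppendUint32(nil, uint32(len(data)))
	msg := frame(117 /* Rread */, 1, fixed, data)
	fmt.Println(len(msg)) // 16 = 7 header + 4 count + 5 data
}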
// encode implements encoder.encode.
func (t *Twalk) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.WriteFID(t.NewFID)
	b.Write16(uint16(len(t.Names)))
	for _, name := range t.Names {
		b.WriteString(name)
	}
}

// Type implements message.Type.
func (*Twalk) Type() MsgType {
	return MsgTwalk
}

// String implements fmt.Stringer.
func (t *Twalk) String() string {
	return fmt.Sprintf("Twalk{FID: %d, NewFID: %d, Names: %v}", t.FID, t.NewFID, t.Names)
}

// Rwalk is a walk response.
type Rwalk struct {
	// QIDs are the set of QIDs returned.
	QIDs []QID
}

// decode implements encoder.decode.
func (r *Rwalk) decode(b *buffer) {
	n := b.Read16()
	r.QIDs = r.QIDs[:0]
	for i := 0; i < int(n); i++ {
		var q QID
		q.decode(b)
		r.QIDs = append(r.QIDs, q)
	}
}

// encode implements encoder.encode.
func (r *Rwalk) encode(b *buffer) {
	b.Write16(uint16(len(r.QIDs)))
	for i := range r.QIDs {
		r.QIDs[i].encode(b)
	}
}

// Type implements message.Type.
func (*Rwalk) Type() MsgType {
	return MsgRwalk
}

// String implements fmt.Stringer.
func (r *Rwalk) String() string {
	return fmt.Sprintf("Rwalk{QIDs: %v}", r.QIDs)
}

// Tclunk is a close request.
type Tclunk struct {
	// FID is the FID to be closed.
	FID FID
}

// decode implements encoder.decode.
func (t *Tclunk) decode(b *buffer) {
	t.FID = b.ReadFID()
}

// encode implements encoder.encode.
func (t *Tclunk) encode(b *buffer) {
	b.WriteFID(t.FID)
}

// Type implements message.Type.
func (*Tclunk) Type() MsgType {
	return MsgTclunk
}

// String implements fmt.Stringer.
func (t *Tclunk) String() string {
	return fmt.Sprintf("Tclunk{FID: %d}", t.FID)
}

// Rclunk is a close response.
type Rclunk struct {
}

// decode implements encoder.decode.
func (*Rclunk) decode(*buffer) {
}

// encode implements encoder.encode.
func (*Rclunk) encode(*buffer) {
}

// Type implements message.Type.
func (*Rclunk) Type() MsgType {
	return MsgRclunk
}

// String implements fmt.Stringer.
func (r *Rclunk) String() string {
	return "Rclunk{}"
}

// Tsetattrclunk is a setattr+close request.
type Tsetattrclunk struct {
	// FID is the FID to change.
	FID FID

	// Valid is the set of bits which will be used.
	Valid SetAttrMask

	// SetAttr is the set request.
	SetAttr SetAttr
}

// decode implements encoder.decode.
func (t *Tsetattrclunk) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Valid.decode(b)
	t.SetAttr.decode(b)
}

// encode implements encoder.encode.
func (t *Tsetattrclunk) encode(b *buffer) {
	b.WriteFID(t.FID)
	t.Valid.encode(b)
	t.SetAttr.encode(b)
}

// Type implements message.Type.
func (*Tsetattrclunk) Type() MsgType {
	return MsgTsetattrclunk
}

// String implements fmt.Stringer.
func (t *Tsetattrclunk) String() string {
	return fmt.Sprintf("Tsetattrclunk{FID: %d, Valid: %v, SetAttr: %s}", t.FID, t.Valid, t.SetAttr)
}

// Rsetattrclunk is a setattr+close response.
type Rsetattrclunk struct {
}

// decode implements encoder.decode.
func (*Rsetattrclunk) decode(*buffer) {
}

// encode implements encoder.encode.
func (*Rsetattrclunk) encode(*buffer) {
}

// Type implements message.Type.
func (*Rsetattrclunk) Type() MsgType {
	return MsgRsetattrclunk
}

// String implements fmt.Stringer.
func (r *Rsetattrclunk) String() string {
	return "Rsetattrclunk{}"
}

// Tremove is a remove request.
//
// This will eventually be replaced by Tunlinkat.
type Tremove struct {
	// FID is the FID to be removed.
	FID FID
}

// decode implements encoder.decode.
func (t *Tremove) decode(b *buffer) {
	t.FID = b.ReadFID()
}

// encode implements encoder.encode.
func (t *Tremove) encode(b *buffer) {
	b.WriteFID(t.FID)
}

// Type implements message.Type.
func (*Tremove) Type() MsgType {
	return MsgTremove
}

// String implements fmt.Stringer.
func (t *Tremove) String() string {
	return fmt.Sprintf("Tremove{FID: %d}", t.FID)
}

// Rremove is a remove response.
type Rremove struct {
}

// decode implements encoder.decode.
func (*Rremove) decode(*buffer) {
}

// encode implements encoder.encode.
func (*Rremove) encode(*buffer) {
}

// Type implements message.Type.
func (*Rremove) Type() MsgType {
	return MsgRremove
}

// String implements fmt.Stringer.
func (r *Rremove) String() string {
	return "Rremove{}"
}

// Rlerror is an error response.
//
// Note that this replaces the error code used in 9p.
type Rlerror struct {
	Error uint32
}

// decode implements encoder.decode.
func (r *Rlerror) decode(b *buffer) {
	r.Error = b.Read32()
}

// encode implements encoder.encode.
func (r *Rlerror) encode(b *buffer) {
	b.Write32(r.Error)
}

// Type implements message.Type.
func (*Rlerror) Type() MsgType {
	return MsgRlerror
}

// String implements fmt.Stringer.
func (r *Rlerror) String() string {
	return fmt.Sprintf("Rlerror{Error: %d}", r.Error)
}

// Tauth is an authentication request.
type Tauth struct {
	// AuthenticationFID is the FID to attach the authentication result.
	AuthenticationFID FID

	// UserName is the user to attach.
	UserName string

	// AttachName is the attach name.
	AttachName string

	// UID is the numeric identifier for UserName.
	UID UID
}

// decode implements encoder.decode.
func (t *Tauth) decode(b *buffer) {
	t.AuthenticationFID = b.ReadFID()
	t.UserName = b.ReadString()
	t.AttachName = b.ReadString()
	t.UID = b.ReadUID()
}

// encode implements encoder.encode.
func (t *Tauth) encode(b *buffer) {
	b.WriteFID(t.AuthenticationFID)
	b.WriteString(t.UserName)
	b.WriteString(t.AttachName)
	b.WriteUID(t.UID)
}

// Type implements message.Type.
func (*Tauth) Type() MsgType {
	return MsgTauth
}

// String implements fmt.Stringer.
func (t *Tauth) String() string {
	return fmt.Sprintf("Tauth{AuthFID: %d, UserName: %s, AttachName: %s, UID: %d}", t.AuthenticationFID, t.UserName, t.AttachName, t.UID)
}

// Rauth is an authentication response.
//
// encode and decode are inherited directly from QID.
type Rauth struct {
	QID
}

// Type implements message.Type.
func (*Rauth) Type() MsgType {
	return MsgRauth
}

// String implements fmt.Stringer.
func (r *Rauth) String() string {
	return fmt.Sprintf("Rauth{QID: %s}", r.QID)
}

// Tattach is an attach request.
type Tattach struct {
	// FID is the FID to be attached.
	FID FID

	// Auth is the embedded authentication request.
	//
	// See client.Attach for information regarding authentication.
	Auth Tauth
}

// decode implements encoder.decode.
func (t *Tattach) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Auth.decode(b)
}

// encode implements encoder.encode.
func (t *Tattach) encode(b *buffer) {
	b.WriteFID(t.FID)
	t.Auth.encode(b)
}

// Type implements message.Type.
func (*Tattach) Type() MsgType {
	return MsgTattach
}

// String implements fmt.Stringer.
func (t *Tattach) String() string {
	return fmt.Sprintf("Tattach{FID: %d, AuthFID: %d, UserName: %s, AttachName: %s, UID: %d}", t.FID, t.Auth.AuthenticationFID, t.Auth.UserName, t.Auth.AttachName, t.Auth.UID)
}

// Rattach is an attach response.
type Rattach struct {
	QID
}

// Type implements message.Type.
func (*Rattach) Type() MsgType {
	return MsgRattach
}

// String implements fmt.Stringer.
func (r *Rattach) String() string {
	return fmt.Sprintf("Rattach{QID: %s}", r.QID)
}

// Tlopen is an open request.
type Tlopen struct {
	// FID is the FID to be opened.
	FID FID

	// Flags are the open flags.
	Flags OpenFlags
}

// decode implements encoder.decode.
func (t *Tlopen) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Flags = b.ReadOpenFlags()
}

// encode implements encoder.encode.
func (t *Tlopen) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.WriteOpenFlags(t.Flags)
}

// Type implements message.Type.
func (*Tlopen) Type() MsgType {
	return MsgTlopen
}

// String implements fmt.Stringer.
func (t *Tlopen) String() string {
	return fmt.Sprintf("Tlopen{FID: %d, Flags: %v}", t.FID, t.Flags)
}

// Rlopen is an open response.
type Rlopen struct {
	// QID is the file's QID.
	QID QID

	// IoUnit is the recommended I/O unit.
	IoUnit uint32

	filePayload
}

// decode implements encoder.decode.
func (r *Rlopen) decode(b *buffer) {
	r.QID.decode(b)
	r.IoUnit = b.Read32()
}

// encode implements encoder.encode.
func (r *Rlopen) encode(b *buffer) {
	r.QID.encode(b)
	b.Write32(r.IoUnit)
}

// Type implements message.Type.
func (*Rlopen) Type() MsgType {
	return MsgRlopen
}

// String implements fmt.Stringer.
func (r *Rlopen) String() string {
	return fmt.Sprintf("Rlopen{QID: %s, IoUnit: %d, File: %v}", r.QID, r.IoUnit, r.File)
}

// Tlcreate is a create request.
type Tlcreate struct {
	// FID is the parent FID.
	//
	// This becomes the new file.
	FID FID

	// Name is the file name to create.
	Name string

	// Mode is the open mode (O_RDWR, etc.).
	//
	// Note that flags like O_TRUNC are ignored, as is O_EXCL. All
	// create operations are exclusive.
	OpenFlags OpenFlags

	// Permissions is the set of permission bits.
	Permissions FileMode

	// GID is the group ID to use for creating the file.
	GID GID
}

// decode implements encoder.decode.
func (t *Tlcreate) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Name = b.ReadString()
	t.OpenFlags = b.ReadOpenFlags()
	t.Permissions = b.ReadPermissions()
	t.GID = b.ReadGID()
}

// encode implements encoder.encode.
func (t *Tlcreate) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.WriteString(t.Name)
	b.WriteOpenFlags(t.OpenFlags)
	b.WritePermissions(t.Permissions)
	b.WriteGID(t.GID)
}

// Type implements message.Type.
func (*Tlcreate) Type() MsgType {
	return MsgTlcreate
}

// String implements fmt.Stringer.
func (t *Tlcreate) String() string {
	return fmt.Sprintf("Tlcreate{FID: %d, Name: %s, OpenFlags: %s, Permissions: 0o%o, GID: %d}", t.FID, t.Name, t.OpenFlags, t.Permissions, t.GID)
}

// Rlcreate is a create response.
//
// The encode, decode, etc. methods are inherited from Rlopen.
type Rlcreate struct {
	Rlopen
}

// Type implements message.Type.
func (*Rlcreate) Type() MsgType {
	return MsgRlcreate
}

// String implements fmt.Stringer.
func (r *Rlcreate) String() string {
	return fmt.Sprintf("Rlcreate{QID: %s, IoUnit: %d, File: %v}", r.QID, r.IoUnit, r.File)
}

// Tsymlink is a symlink request.
type Tsymlink struct {
	// Directory is the directory FID.
	Directory FID

	// Name is the new name in the directory.
	Name string

	// Target is the symlink target.
	Target string

	// GID is the owning group.
	GID GID
}

// decode implements encoder.decode.
func (t *Tsymlink) decode(b *buffer) {
	t.Directory = b.ReadFID()
	t.Name = b.ReadString()
	t.Target = b.ReadString()
	t.GID = b.ReadGID()
}

// encode implements encoder.encode.
func (t *Tsymlink) encode(b *buffer) {
	b.WriteFID(t.Directory)
	b.WriteString(t.Name)
	b.WriteString(t.Target)
	b.WriteGID(t.GID)
}

// Type implements message.Type.
func (*Tsymlink) Type() MsgType {
	return MsgTsymlink
}

// String implements fmt.Stringer.
func (t *Tsymlink) String() string {
	return fmt.Sprintf("Tsymlink{DirectoryFID: %d, Name: %s, Target: %s, GID: %d}", t.Directory, t.Name, t.Target, t.GID)
}

// Rsymlink is a symlink response.
type Rsymlink struct {
	// QID is the new symlink's QID.
	QID QID
}

// decode implements encoder.decode.
func (r *Rsymlink) decode(b *buffer) {
	r.QID.decode(b)
}

// encode implements encoder.encode.
func (r *Rsymlink) encode(b *buffer) {
	r.QID.encode(b)
}

// Type implements message.Type.
func (*Rsymlink) Type() MsgType {
	return MsgRsymlink
}

// String implements fmt.Stringer.
func (r *Rsymlink) String() string {
	return fmt.Sprintf("Rsymlink{QID: %s}", r.QID)
}

// Tlink is a link request.
type Tlink struct {
	// Directory is the directory to contain the link.
	Directory FID

	// Target is the FID of the link target.
	Target FID

	// Name is the new source name.
	Name string
}

// decode implements encoder.decode.
func (t *Tlink) decode(b *buffer) {
	t.Directory = b.ReadFID()
	t.Target = b.ReadFID()
	t.Name = b.ReadString()
}

// encode implements encoder.encode.
func (t *Tlink) encode(b *buffer) {
	b.WriteFID(t.Directory)
	b.WriteFID(t.Target)
	b.WriteString(t.Name)
}

// Type implements message.Type.
func (*Tlink) Type() MsgType {
	return MsgTlink
}

// String implements fmt.Stringer.
func (t *Tlink) String() string {
	return fmt.Sprintf("Tlink{DirectoryFID: %d, TargetFID: %d, Name: %s}", t.Directory, t.Target, t.Name)
}

// Rlink is a link response.
type Rlink struct {
}

// Type implements message.Type.
func (*Rlink) Type() MsgType {
	return MsgRlink
}

// decode implements encoder.decode.
func (*Rlink) decode(*buffer) {
}

// encode implements encoder.encode.
func (*Rlink) encode(*buffer) {
}

// String implements fmt.Stringer.
func (r *Rlink) String() string {
	return "Rlink{}"
}

// Trenameat is a rename request.
type Trenameat struct {
	// OldDirectory is the source directory.
	OldDirectory FID

	// OldName is the source file name.
	OldName string

	// NewDirectory is the target directory.
	NewDirectory FID

	// NewName is the new file name.
	NewName string
}

// decode implements encoder.decode.
func (t *Trenameat) decode(b *buffer) {
	t.OldDirectory = b.ReadFID()
	t.OldName = b.ReadString()
	t.NewDirectory = b.ReadFID()
	t.NewName = b.ReadString()
}

// encode implements encoder.encode.
func (t *Trenameat) encode(b *buffer) {
	b.WriteFID(t.OldDirectory)
	b.WriteString(t.OldName)
	b.WriteFID(t.NewDirectory)
	b.WriteString(t.NewName)
}

// Type implements message.Type.
func (*Trenameat) Type() MsgType {
	return MsgTrenameat
}

// String implements fmt.Stringer.
func (t *Trenameat) String() string {
	return fmt.Sprintf("TrenameAt{OldDirectoryFID: %d, OldName: %s, NewDirectoryFID: %d, NewName: %s}", t.OldDirectory, t.OldName, t.NewDirectory, t.NewName)
}

// Rrenameat is a rename response.
type Rrenameat struct {
}

// decode implements encoder.decode.
func (*Rrenameat) decode(*buffer) {
}

// encode implements encoder.encode.
func (*Rrenameat) encode(*buffer) {
}

// Type implements message.Type.
func (*Rrenameat) Type() MsgType {
	return MsgRrenameat
}

// String implements fmt.Stringer.
func (r *Rrenameat) String() string {
	return "Rrenameat{}"
}

// Tunlinkat is an unlink request.
type Tunlinkat struct {
	// Directory is the originating directory.
	Directory FID

	// Name is the name of the entry to unlink.
	Name string

	// Flags are extra flags (e.g. O_DIRECTORY). These are not interpreted by p9.
	Flags uint32
}

// decode implements encoder.decode.
func (t *Tunlinkat) decode(b *buffer) {
	t.Directory = b.ReadFID()
	t.Name = b.ReadString()
	t.Flags = b.Read32()
}

// encode implements encoder.encode.
func (t *Tunlinkat) encode(b *buffer) {
	b.WriteFID(t.Directory)
	b.WriteString(t.Name)
	b.Write32(t.Flags)
}

// Type implements message.Type.
func (*Tunlinkat) Type() MsgType {
	return MsgTunlinkat
}

// String implements fmt.Stringer.
func (t *Tunlinkat) String() string {
	return fmt.Sprintf("Tunlinkat{DirectoryFID: %d, Name: %s, Flags: 0x%X}", t.Directory, t.Name, t.Flags)
}

// Runlinkat is an unlink response.
type Runlinkat struct {
}

// decode implements encoder.decode.
func (*Runlinkat) decode(*buffer) {
}

// encode implements encoder.encode.
func (*Runlinkat) encode(*buffer) {
}

// Type implements message.Type.
func (*Runlinkat) Type() MsgType {
	return MsgRunlinkat
}

// String implements fmt.Stringer.
func (r *Runlinkat) String() string {
	return "Runlinkat{}"
}

// Trename is a rename request.
//
// Note that this generally isn't used anymore, and ideally all rename calls
// should use Trenameat below.
type Trename struct {
	// FID is the FID to rename.
	FID FID

	// Directory is the target directory.
	Directory FID

	// Name is the new file name.
	Name string
}

// decode implements encoder.decode.
func (t *Trename) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Directory = b.ReadFID()
	t.Name = b.ReadString()
}

// encode implements encoder.encode.
func (t *Trename) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.WriteFID(t.Directory)
	b.WriteString(t.Name)
}

// Type implements message.Type.
func (*Trename) Type() MsgType {
	return MsgTrename
}

// String implements fmt.Stringer.
func (t *Trename) String() string {
	return fmt.Sprintf("Trename{FID: %d, DirectoryFID: %d, Name: %s}", t.FID, t.Directory, t.Name)
}

// Rrename is a rename response.
type Rrename struct {
}

// decode implements encoder.decode.
func (*Rrename) decode(*buffer) {
}

// encode implements encoder.encode.
func (*Rrename) encode(*buffer) {
}

// Type implements message.Type.
func (*Rrename) Type() MsgType {
	return MsgRrename
}

// String implements fmt.Stringer.
func (r *Rrename) String() string {
	return "Rrename{}"
}

// Treadlink is a readlink request.
type Treadlink struct {
	// FID is the symlink.
	FID FID
}

// decode implements encoder.decode.
func (t *Treadlink) decode(b *buffer) {
	t.FID = b.ReadFID()
}

// encode implements encoder.encode.
func (t *Treadlink) encode(b *buffer) {
	b.WriteFID(t.FID)
}

// Type implements message.Type.
func (*Treadlink) Type() MsgType {
	return MsgTreadlink
}

// String implements fmt.Stringer.
func (t *Treadlink) String() string {
	return fmt.Sprintf("Treadlink{FID: %d}", t.FID)
}

// Rreadlink is a readlink response.
type Rreadlink struct {
	// Target is the symlink target.
	Target string
}

// decode implements encoder.decode.
func (r *Rreadlink) decode(b *buffer) {
	r.Target = b.ReadString()
}

// encode implements encoder.encode.
func (r *Rreadlink) encode(b *buffer) {
	b.WriteString(r.Target)
}

// Type implements message.Type.
func (*Rreadlink) Type() MsgType {
	return MsgRreadlink
}

// String implements fmt.Stringer.
func (r *Rreadlink) String() string {
	return fmt.Sprintf("Rreadlink{Target: %s}", r.Target)
}

// Tread is a read request.
type Tread struct {
	// FID is the FID to read.
	FID FID

	// Offset indicates the file offset.
	Offset uint64

	// Count indicates the number of bytes to read.
	Count uint32
}

// decode implements encoder.decode.
func (t *Tread) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Offset = b.Read64()
	t.Count = b.Read32()
}

// encode implements encoder.encode.
func (t *Tread) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.Write64(t.Offset)
	b.Write32(t.Count)
}

// Type implements message.Type.
func (*Tread) Type() MsgType { return MsgTread }

// String implements fmt.Stringer.
func (t *Tread) String() string {
	return fmt.Sprintf("Tread{FID: %d, Offset: %d, Count: %d}", t.FID, t.Offset, t.Count)
}

// Rread is the response for a Tread.
type Rread struct {
	// Data is the resulting data.
	Data []byte
}

// decode implements encoder.decode.
//
// Data is automatically decoded via Payload.
func (r *Rread) decode(b *buffer) {
	count := b.Read32()
	if count != uint32(len(r.Data)) {
		b.markOverrun()
	}
}

// encode implements encoder.encode.
//
// Data is automatically encoded via Payload.
func (r *Rread) encode(b *buffer) { b.Write32(uint32(len(r.Data))) }

// Type implements message.Type.
func (*Rread) Type() MsgType { return MsgRread }

// FixedSize implements payloader.FixedSize.
func (*Rread) FixedSize() uint32 { return 4 }

// Payload implements payloader.Payload.
func (r *Rread) Payload() []byte { return r.Data }

// SetPayload implements payloader.SetPayload.
func (r *Rread) SetPayload(p []byte) { r.Data = p }

// String implements fmt.Stringer.
func (r *Rread) String() string { return fmt.Sprintf("Rread{len(Data): %d}", len(r.Data)) }

// Twrite is a write request.
type Twrite struct {
	// FID is the FID to write.
	FID FID

	// Offset indicates the file offset.
	Offset uint64

	// Data is the data to be written.
	Data []byte
}

// decode implements encoder.decode.
func (t *Twrite) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Offset = b.Read64()
	count := b.Read32()
	if count != uint32(len(t.Data)) {
		b.markOverrun()
	}
}

// encode implements encoder.encode.
//
// This uses the buffer payload to avoid a copy.
func (t *Twrite) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.Write64(t.Offset)
	b.Write32(uint32(len(t.Data)))
}

// Type implements message.Type.
func (*Twrite) Type() MsgType { return MsgTwrite }

// FixedSize implements payloader.FixedSize.
func (*Twrite) FixedSize() uint32 { return 16 }

// Payload implements payloader.Payload.
func (t *Twrite) Payload() []byte { return t.Data }

// SetPayload implements payloader.SetPayload.
func (t *Twrite) SetPayload(p []byte) { t.Data = p }

// String implements fmt.Stringer.
func (t *Twrite) String() string {
	return fmt.Sprintf("Twrite{FID: %v, Offset: %d, len(Data): %d}", t.FID, t.Offset, len(t.Data))
}

// Rwrite is the response for a Twrite.
type Rwrite struct {
	// Count indicates the number of bytes successfully written.
	Count uint32
}

// decode implements encoder.decode.
func (r *Rwrite) decode(b *buffer) { r.Count = b.Read32() }

// encode implements encoder.encode.
func (r *Rwrite) encode(b *buffer) { b.Write32(r.Count) }

// Type implements message.Type.
func (*Rwrite) Type() MsgType { return MsgRwrite }

// String implements fmt.Stringer.
func (r *Rwrite) String() string { return fmt.Sprintf("Rwrite{Count: %d}", r.Count) }

// Tmknod is a mknod request.
type Tmknod struct {
	// Directory is the parent directory.
	Directory FID

	// Name is the device name.
	Name string

	// Mode is the device mode and permissions.
	Mode FileMode

	// Major is the device major number.
	Major uint32

	// Minor is the device minor number.
	Minor uint32

	// GID is the device GID.
	GID GID
}

// decode implements encoder.decode.
func (t *Tmknod) decode(b *buffer) {
	t.Directory = b.ReadFID()
	t.Name = b.ReadString()
	t.Mode = b.ReadFileMode()
	t.Major = b.Read32()
	t.Minor = b.Read32()
	t.GID = b.ReadGID()
}

// encode implements encoder.encode.
func (t *Tmknod) encode(b *buffer) {
	b.WriteFID(t.Directory)
	b.WriteString(t.Name)
	b.WriteFileMode(t.Mode)
	b.Write32(t.Major)
	b.Write32(t.Minor)
	b.WriteGID(t.GID)
}

// Type implements message.Type.
func (*Tmknod) Type() MsgType { return MsgTmknod }

// String implements fmt.Stringer.
func (t *Tmknod) String() string {
	return fmt.Sprintf("Tmknod{DirectoryFID: %d, Name: %s, Mode: 0o%o, Major: %d, Minor: %d, GID: %d}", t.Directory, t.Name, t.Mode, t.Major, t.Minor, t.GID)
}

// Rmknod is a mknod response.
type Rmknod struct {
	// QID is the resulting QID.
	QID QID
}

// decode implements encoder.decode.
func (r *Rmknod) decode(b *buffer) { r.QID.decode(b) }

// encode implements encoder.encode.
func (r *Rmknod) encode(b *buffer) { r.QID.encode(b) }

// Type implements message.Type.
func (*Rmknod) Type() MsgType { return MsgRmknod }

// String implements fmt.Stringer.
func (r *Rmknod) String() string { return fmt.Sprintf("Rmknod{QID: %s}", r.QID) }

// Tmkdir is a mkdir request.
type Tmkdir struct {
	// Directory is the parent directory.
	Directory FID

	// Name is the new directory name.
	Name string

	// Permissions is the set of permission bits.
	Permissions FileMode

	// GID is the owning group.
	GID GID
}

// decode implements encoder.decode.
func (t *Tmkdir) decode(b *buffer) {
	t.Directory = b.ReadFID()
	t.Name = b.ReadString()
	t.Permissions = b.ReadPermissions()
	t.GID = b.ReadGID()
}

// encode implements encoder.encode.
func (t *Tmkdir) encode(b *buffer) {
	b.WriteFID(t.Directory)
	b.WriteString(t.Name)
	b.WritePermissions(t.Permissions)
	b.WriteGID(t.GID)
}

// Type implements message.Type.
func (*Tmkdir) Type() MsgType { return MsgTmkdir }

// String implements fmt.Stringer.
func (t *Tmkdir) String() string {
	return fmt.Sprintf("Tmkdir{DirectoryFID: %d, Name: %s, Permissions: 0o%o, GID: %d}", t.Directory, t.Name, t.Permissions, t.GID)
}

// Rmkdir is a mkdir response.
type Rmkdir struct {
	// QID is the resulting QID.
	QID QID
}

// decode implements encoder.decode.
func (r *Rmkdir) decode(b *buffer) { r.QID.decode(b) }

// encode implements encoder.encode.
func (r *Rmkdir) encode(b *buffer) { r.QID.encode(b) }

// Type implements message.Type.
func (*Rmkdir) Type() MsgType { return MsgRmkdir }

// String implements fmt.Stringer.
func (r *Rmkdir) String() string { return fmt.Sprintf("Rmkdir{QID: %s}", r.QID) }

// Tgetattr is a getattr request.
type Tgetattr struct {
	// FID is the FID to get attributes for.
	FID FID

	// AttrMask is the set of attributes to get.
	AttrMask AttrMask
}

// decode implements encoder.decode.
func (t *Tgetattr) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.AttrMask.decode(b)
}

// encode implements encoder.encode.
func (t *Tgetattr) encode(b *buffer) {
	b.WriteFID(t.FID)
	t.AttrMask.encode(b)
}

// Type implements message.Type.
func (*Tgetattr) Type() MsgType { return MsgTgetattr }

// String implements fmt.Stringer.
func (t *Tgetattr) String() string {
	return fmt.Sprintf("Tgetattr{FID: %d, AttrMask: %s}", t.FID, t.AttrMask)
}

// Rgetattr is a getattr response.
type Rgetattr struct {
	// Valid indicates which fields are valid.
	Valid AttrMask

	// QID is the QID for this file.
	QID

	// Attr is the set of attributes.
	Attr Attr
}

// decode implements encoder.decode.
func (r *Rgetattr) decode(b *buffer) {
	r.Valid.decode(b)
	r.QID.decode(b)
	r.Attr.decode(b)
}

// encode implements encoder.encode.
func (r *Rgetattr) encode(b *buffer) {
	r.Valid.encode(b)
	r.QID.encode(b)
	r.Attr.encode(b)
}

// Type implements message.Type.
func (*Rgetattr) Type() MsgType { return MsgRgetattr }

// String implements fmt.Stringer.
func (r *Rgetattr) String() string {
	return fmt.Sprintf("Rgetattr{Valid: %v, QID: %s, Attr: %s}", r.Valid, r.QID, r.Attr)
}

// Tsetattr is a setattr request.
type Tsetattr struct {
	// FID is the FID to change.
	FID FID

	// Valid is the set of bits which will be used.
	Valid SetAttrMask

	// SetAttr is the set request.
	SetAttr SetAttr
}

// decode implements encoder.decode.
func (t *Tsetattr) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Valid.decode(b)
	t.SetAttr.decode(b)
}

// encode implements encoder.encode.
func (t *Tsetattr) encode(b *buffer) {
	b.WriteFID(t.FID)
	t.Valid.encode(b)
	t.SetAttr.encode(b)
}

// Type implements message.Type.
func (*Tsetattr) Type() MsgType { return MsgTsetattr }

// String implements fmt.Stringer.
func (t *Tsetattr) String() string {
	return fmt.Sprintf("Tsetattr{FID: %d, Valid: %v, SetAttr: %s}", t.FID, t.Valid, t.SetAttr)
}

// Rsetattr is a setattr response.
type Rsetattr struct {
}

// decode implements encoder.decode.
func (*Rsetattr) decode(*buffer) {}

// encode implements encoder.encode.
func (*Rsetattr) encode(*buffer) {}

// Type implements message.Type.
func (*Rsetattr) Type() MsgType { return MsgRsetattr }

// String implements fmt.Stringer.
func (r *Rsetattr) String() string { return "Rsetattr{}" }

// Tallocate is an allocate request. This is an extension to the 9P protocol,
// not present in the 9P2000.L standard.
type Tallocate struct {
	FID    FID
	Mode   AllocateMode
	Offset uint64
	Length uint64
}

// decode implements encoder.decode.
func (t *Tallocate) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Mode.decode(b)
	t.Offset = b.Read64()
	t.Length = b.Read64()
}

// encode implements encoder.encode.
func (t *Tallocate) encode(b *buffer) {
	b.WriteFID(t.FID)
	t.Mode.encode(b)
	b.Write64(t.Offset)
	b.Write64(t.Length)
}

// Type implements message.Type.
func (*Tallocate) Type() MsgType { return MsgTallocate }

// String implements fmt.Stringer.
func (t *Tallocate) String() string {
	return fmt.Sprintf("Tallocate{FID: %d, Offset: %d, Length: %d}", t.FID, t.Offset, t.Length)
}

// Rallocate is an allocate response.
type Rallocate struct {
}

// decode implements encoder.decode.
func (*Rallocate) decode(*buffer) {}

// encode implements encoder.encode.
func (*Rallocate) encode(*buffer) {}

// Type implements message.Type.
func (*Rallocate) Type() MsgType { return MsgRallocate }

// String implements fmt.Stringer.
func (r *Rallocate) String() string { return "Rallocate{}" }

// Tlistxattr is a listxattr request.
type Tlistxattr struct {
	// FID refers to the file on which to list xattrs.
	FID FID

	// Size is the buffer size for the xattr list.
	Size uint64
}

// decode implements encoder.decode.
func (t *Tlistxattr) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Size = b.Read64()
}

// encode implements encoder.encode.
func (t *Tlistxattr) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.Write64(t.Size)
}

// Type implements message.Type.
func (*Tlistxattr) Type() MsgType { return MsgTlistxattr }

// String implements fmt.Stringer.
func (t *Tlistxattr) String() string {
	return fmt.Sprintf("Tlistxattr{FID: %d, Size: %d}", t.FID, t.Size)
}

// Rlistxattr is a listxattr response.
type Rlistxattr struct {
	// Xattrs is a list of extended attribute names.
	Xattrs []string
}

// decode implements encoder.decode.
func (r *Rlistxattr) decode(b *buffer) {
	n := b.Read16()
	r.Xattrs = r.Xattrs[:0]
	for i := 0; i < int(n); i++ {
		r.Xattrs = append(r.Xattrs, b.ReadString())
	}
}

// encode implements encoder.encode.
func (r *Rlistxattr) encode(b *buffer) {
	b.Write16(uint16(len(r.Xattrs)))
	for _, x := range r.Xattrs {
		b.WriteString(x)
	}
}

// Type implements message.Type.
func (*Rlistxattr) Type() MsgType { return MsgRlistxattr }

// String implements fmt.Stringer.
func (r *Rlistxattr) String() string { return fmt.Sprintf("Rlistxattr{Xattrs: %v}", r.Xattrs) }

// Txattrwalk walks extended attributes.
type Txattrwalk struct {
	// FID is the FID to check for attributes.
	FID FID

	// NewFID is the new FID associated with the attributes.
	NewFID FID

	// Name is the attribute name.
	Name string
}

// decode implements encoder.decode.
func (t *Txattrwalk) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.NewFID = b.ReadFID()
	t.Name = b.ReadString()
}

// encode implements encoder.encode.
func (t *Txattrwalk) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.WriteFID(t.NewFID)
	b.WriteString(t.Name)
}

// Type implements message.Type.
func (*Txattrwalk) Type() MsgType { return MsgTxattrwalk }

// String implements fmt.Stringer.
func (t *Txattrwalk) String() string {
	return fmt.Sprintf("Txattrwalk{FID: %d, NewFID: %d, Name: %s}", t.FID, t.NewFID, t.Name)
}

// Rxattrwalk is a xattrwalk response.
type Rxattrwalk struct {
	// Size is the size of the extended attribute.
	Size uint64
}

// decode implements encoder.decode.
func (r *Rxattrwalk) decode(b *buffer) { r.Size = b.Read64() }

// encode implements encoder.encode.
func (r *Rxattrwalk) encode(b *buffer) { b.Write64(r.Size) }

// Type implements message.Type.
func (*Rxattrwalk) Type() MsgType { return MsgRxattrwalk }

// String implements fmt.Stringer.
func (r *Rxattrwalk) String() string { return fmt.Sprintf("Rxattrwalk{Size: %d}", r.Size) }

// Txattrcreate prepares to set extended attributes.
type Txattrcreate struct {
	// FID is an input/output parameter: it identifies the file on which
	// extended attributes will be set, and after a successful Rxattrcreate
	// it is used to write the extended attribute value.
	FID FID

	// Name is the attribute name.
	Name string

	// AttrSize is the size of the attribute value. When the FID is clunked,
	// it has to match the number of bytes written to the FID.
	AttrSize uint64

	// Flags are the Linux setxattr(2) flags.
	Flags uint32
}

// decode implements encoder.decode.
func (t *Txattrcreate) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Name = b.ReadString()
	t.AttrSize = b.Read64()
	t.Flags = b.Read32()
}

// encode implements encoder.encode.
func (t *Txattrcreate) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.WriteString(t.Name)
	b.Write64(t.AttrSize)
	b.Write32(t.Flags)
}

// Type implements message.Type.
func (*Txattrcreate) Type() MsgType { return MsgTxattrcreate }

// String implements fmt.Stringer.
func (t *Txattrcreate) String() string {
	return fmt.Sprintf("Txattrcreate{FID: %d, Name: %s, AttrSize: %d, Flags: %d}", t.FID, t.Name, t.AttrSize, t.Flags)
}

// Rxattrcreate is a xattrcreate response.
type Rxattrcreate struct {
}

// decode implements encoder.decode.
func (r *Rxattrcreate) decode(*buffer) {}

// encode implements encoder.encode.
func (r *Rxattrcreate) encode(*buffer) {}

// Type implements message.Type.
func (*Rxattrcreate) Type() MsgType { return MsgRxattrcreate }

// String implements fmt.Stringer.
func (r *Rxattrcreate) String() string { return "Rxattrcreate{}" }

// Tgetxattr is a getxattr request.
type Tgetxattr struct {
	// FID refers to the file for which to get xattrs.
	FID FID

	// Name is the xattr to get.
	Name string

	// Size is the buffer size for the xattr to get.
	Size uint64
}

// decode implements encoder.decode.
func (t *Tgetxattr) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Name = b.ReadString()
	t.Size = b.Read64()
}

// encode implements encoder.encode.
func (t *Tgetxattr) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.WriteString(t.Name)
	b.Write64(t.Size)
}

// Type implements message.Type.
func (*Tgetxattr) Type() MsgType { return MsgTgetxattr }

// String implements fmt.Stringer.
func (t *Tgetxattr) String() string {
	return fmt.Sprintf("Tgetxattr{FID: %d, Name: %s, Size: %d}", t.FID, t.Name, t.Size)
}

// Rgetxattr is a getxattr response.
type Rgetxattr struct {
	// Value is the extended attribute value.
	Value string
}

// decode implements encoder.decode.
func (r *Rgetxattr) decode(b *buffer) { r.Value = b.ReadString() }

// encode implements encoder.encode.
func (r *Rgetxattr) encode(b *buffer) { b.WriteString(r.Value) }

// Type implements message.Type.
func (*Rgetxattr) Type() MsgType { return MsgRgetxattr }

// String implements fmt.Stringer.
func (r *Rgetxattr) String() string { return fmt.Sprintf("Rgetxattr{Value: %s}", r.Value) }

// Tsetxattr sets extended attributes.
type Tsetxattr struct {
	// FID refers to the file on which to set xattrs.
	FID FID

	// Name is the attribute name.
	Name string

	// Value is the attribute value.
	Value string

	// Flags are the Linux setxattr(2) flags.
	Flags uint32
}

// decode implements encoder.decode.
func (t *Tsetxattr) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Name = b.ReadString()
	t.Value = b.ReadString()
	t.Flags = b.Read32()
}

// encode implements encoder.encode.
func (t *Tsetxattr) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.WriteString(t.Name)
	b.WriteString(t.Value)
	b.Write32(t.Flags)
}

// Type implements message.Type.
func (*Tsetxattr) Type() MsgType { return MsgTsetxattr }

// String implements fmt.Stringer.
func (t *Tsetxattr) String() string {
	return fmt.Sprintf("Tsetxattr{FID: %d, Name: %s, Value: %s, Flags: %d}", t.FID, t.Name, t.Value, t.Flags)
}

// Rsetxattr is a setxattr response.
type Rsetxattr struct {
}

// decode implements encoder.decode.
func (r *Rsetxattr) decode(*buffer) {}

// encode implements encoder.encode.
func (r *Rsetxattr) encode(*buffer) {}

// Type implements message.Type.
func (*Rsetxattr) Type() MsgType { return MsgRsetxattr }

// String implements fmt.Stringer.
func (r *Rsetxattr) String() string { return "Rsetxattr{}" }

// Tremovexattr is a removexattr request.
type Tremovexattr struct {
	// FID refers to the file from which to remove xattrs.
	FID FID

	// Name is the attribute name.
	Name string
}

// decode implements encoder.decode.
func (t *Tremovexattr) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Name = b.ReadString()
}

// encode implements encoder.encode.
func (t *Tremovexattr) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.WriteString(t.Name)
}

// Type implements message.Type.
func (*Tremovexattr) Type() MsgType { return MsgTremovexattr }

// String implements fmt.Stringer.
func (t *Tremovexattr) String() string {
	return fmt.Sprintf("Tremovexattr{FID: %d, Name: %s}", t.FID, t.Name)
}

// Rremovexattr is a removexattr response.
type Rremovexattr struct {
}

// decode implements encoder.decode.
func (r *Rremovexattr) decode(*buffer) {}

// encode implements encoder.encode.
func (r *Rremovexattr) encode(*buffer) {}

// Type implements message.Type.
func (*Rremovexattr) Type() MsgType { return MsgRremovexattr }

// String implements fmt.Stringer.
func (r *Rremovexattr) String() string { return "Rremovexattr{}" }

// Treaddir is a readdir request.
type Treaddir struct {
	// Directory is the directory FID to read.
	Directory FID

	// Offset is the offset to read at.
	Offset uint64

	// Count is the number of bytes to read.
	Count uint32
}

// decode implements encoder.decode.
func (t *Treaddir) decode(b *buffer) {
	t.Directory = b.ReadFID()
	t.Offset = b.Read64()
	t.Count = b.Read32()
}

// encode implements encoder.encode.
func (t *Treaddir) encode(b *buffer) {
	b.WriteFID(t.Directory)
	b.Write64(t.Offset)
	b.Write32(t.Count)
}

// Type implements message.Type.
func (*Treaddir) Type() MsgType { return MsgTreaddir }

// String implements fmt.Stringer.
func (t *Treaddir) String() string {
	return fmt.Sprintf("Treaddir{DirectoryFID: %d, Offset: %d, Count: %d}", t.Directory, t.Offset, t.Count)
}

// Rreaddir is a readdir response.
type Rreaddir struct {
	// Count is the byte limit.
	//
	// This should always be set from the Treaddir request.
	Count uint32

	// Entries are the resulting entries.
	//
	// This may be constructed in decode.
	Entries []Dirent

	// payload is the encoded payload.
	//
	// This is constructed by encode.
	payload []byte
}

// decode implements encoder.decode.
func (r *Rreaddir) decode(b *buffer) {
	r.Count = b.Read32()
	entriesBuf := buffer{data: r.payload}
	r.Entries = r.Entries[:0]
	for {
		var d Dirent
		d.decode(&entriesBuf)
		if entriesBuf.isOverrun() {
			// Couldn't decode a complete entry.
			break
		}
		r.Entries = append(r.Entries, d)
	}
}

// encode implements encoder.encode.
func (r *Rreaddir) encode(b *buffer) {
	entriesBuf := buffer{}
	payloadSize := 0
	for _, d := range r.Entries {
		d.encode(&entriesBuf)
		if len(entriesBuf.data) > int(r.Count) {
			break
		}
		payloadSize = len(entriesBuf.data)
	}
	r.Count = uint32(payloadSize)
	r.payload = entriesBuf.data[:payloadSize]
	b.Write32(r.Count)
}

// Type implements message.Type.
func (*Rreaddir) Type() MsgType { return MsgRreaddir }

// FixedSize implements payloader.FixedSize.
func (*Rreaddir) FixedSize() uint32 { return 4 }

// Payload implements payloader.Payload.
func (r *Rreaddir) Payload() []byte { return r.payload }

// SetPayload implements payloader.SetPayload.
func (r *Rreaddir) SetPayload(p []byte) { r.payload = p }

// String implements fmt.Stringer.
func (r *Rreaddir) String() string {
	return fmt.Sprintf("Rreaddir{Count: %d, Entries: %s}", r.Count, r.Entries)
}

// Tfsync is an fsync request.
type Tfsync struct {
	// FID is the fid to sync.
	FID FID
}

// decode implements encoder.decode.
func (t *Tfsync) decode(b *buffer) { t.FID = b.ReadFID() }

// encode implements encoder.encode.
func (t *Tfsync) encode(b *buffer) { b.WriteFID(t.FID) }

// Type implements message.Type.
func (*Tfsync) Type() MsgType { return MsgTfsync }

// String implements fmt.Stringer.
func (t *Tfsync) String() string { return fmt.Sprintf("Tfsync{FID: %d}", t.FID) }

// Rfsync is an fsync response.
type Rfsync struct {
}

// decode implements encoder.decode.
func (*Rfsync) decode(*buffer) {}

// encode implements encoder.encode.
func (*Rfsync) encode(*buffer) {}

// Type implements message.Type.
func (*Rfsync) Type() MsgType { return MsgRfsync }

// String implements fmt.Stringer.
func (r *Rfsync) String() string { return "Rfsync{}" }

// Tstatfs is a stat request.
type Tstatfs struct {
	// FID is the root.
	FID FID
}

// decode implements encoder.decode.
func (t *Tstatfs) decode(b *buffer) { t.FID = b.ReadFID() }

// encode implements encoder.encode.
func (t *Tstatfs) encode(b *buffer) { b.WriteFID(t.FID) }

// Type implements message.Type.
func (*Tstatfs) Type() MsgType { return MsgTstatfs }

// String implements fmt.Stringer.
func (t *Tstatfs) String() string { return fmt.Sprintf("Tstatfs{FID: %d}", t.FID) }

// Rstatfs is the response for a Tstatfs.
type Rstatfs struct {
	// FSStat is the stat result.
	FSStat FSStat
}

// decode implements encoder.decode.
func (r *Rstatfs) decode(b *buffer) { r.FSStat.decode(b) }

// encode implements encoder.encode.
func (r *Rstatfs) encode(b *buffer) { r.FSStat.encode(b) }

// Type implements message.Type.
func (*Rstatfs) Type() MsgType { return MsgRstatfs }

// String implements fmt.Stringer.
func (r *Rstatfs) String() string { return fmt.Sprintf("Rstatfs{FSStat: %v}", r.FSStat) }

// Tflushf is a flush file request, not to be confused with Tflush.
type Tflushf struct {
	// FID is the FID to be flushed.
	FID FID
}

// decode implements encoder.decode.
func (t *Tflushf) decode(b *buffer) { t.FID = b.ReadFID() }

// encode implements encoder.encode.
func (t *Tflushf) encode(b *buffer) { b.WriteFID(t.FID) }

// Type implements message.Type.
func (*Tflushf) Type() MsgType { return MsgTflushf }

// String implements fmt.Stringer.
func (t *Tflushf) String() string { return fmt.Sprintf("Tflushf{FID: %d}", t.FID) }

// Rflushf is a flush file response.
type Rflushf struct {
}

// decode implements encoder.decode.
func (*Rflushf) decode(*buffer) {}

// encode implements encoder.encode.
func (*Rflushf) encode(*buffer) {}

// Type implements message.Type.
func (*Rflushf) Type() MsgType { return MsgRflushf }

// String implements fmt.Stringer.
func (*Rflushf) String() string { return "Rflushf{}" }

// Twalkgetattr is a walk request.
type Twalkgetattr struct {
	// FID is the FID to be walked.
	FID FID

	// NewFID is the resulting FID.
	NewFID FID

	// Names are the set of names to be walked.
	Names []string
}

// decode implements encoder.decode.
func (t *Twalkgetattr) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.NewFID = b.ReadFID()
	n := b.Read16()
	t.Names = t.Names[:0]
	for i := 0; i < int(n); i++ {
		t.Names = append(t.Names, b.ReadString())
	}
}

// encode implements encoder.encode.
func (t *Twalkgetattr) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.WriteFID(t.NewFID)
	b.Write16(uint16(len(t.Names)))
	for _, name := range t.Names {
		b.WriteString(name)
	}
}

// Type implements message.Type.
func (*Twalkgetattr) Type() MsgType { return MsgTwalkgetattr }

// String implements fmt.Stringer.
func (t *Twalkgetattr) String() string {
	return fmt.Sprintf("Twalkgetattr{FID: %d, NewFID: %d, Names: %v}", t.FID, t.NewFID, t.Names)
}

// Rwalkgetattr is a walk response.
type Rwalkgetattr struct {
	// Valid indicates which fields are valid in the Attr below.
	Valid AttrMask

	// Attr is the set of attributes for the last QID (the file walked to).
	Attr Attr

	// QIDs are the set of QIDs returned.
	QIDs []QID
}

// decode implements encoder.decode.
func (r *Rwalkgetattr) decode(b *buffer) {
	r.Valid.decode(b)
	r.Attr.decode(b)
	n := b.Read16()
	r.QIDs = r.QIDs[:0]
	for i := 0; i < int(n); i++ {
		var q QID
		q.decode(b)
		r.QIDs = append(r.QIDs, q)
	}
}

// encode implements encoder.encode.
func (r *Rwalkgetattr) encode(b *buffer) {
	r.Valid.encode(b)
	r.Attr.encode(b)
	b.Write16(uint16(len(r.QIDs)))
	for i := range r.QIDs {
		r.QIDs[i].encode(b)
	}
}

// Type implements message.Type.
func (*Rwalkgetattr) Type() MsgType { return MsgRwalkgetattr }

// String implements fmt.Stringer.
func (r *Rwalkgetattr) String() string {
	return fmt.Sprintf("Rwalkgetattr{Valid: %s, Attr: %s, QIDs: %v}", r.Valid, r.Attr, r.QIDs)
}

// Tucreate is a Tlcreate message that includes a UID.
type Tucreate struct {
	Tlcreate

	// UID is the UID to use as the effective UID in creation messages.
	UID UID
}

// decode implements encoder.decode.
func (t *Tucreate) decode(b *buffer) {
	t.Tlcreate.decode(b)
	t.UID = b.ReadUID()
}

// encode implements encoder.encode.
func (t *Tucreate) encode(b *buffer) {
	t.Tlcreate.encode(b)
	b.WriteUID(t.UID)
}

// Type implements message.Type.
func (t *Tucreate) Type() MsgType { return MsgTucreate }

// String implements fmt.Stringer.
func (t *Tucreate) String() string {
	return fmt.Sprintf("Tucreate{Tlcreate: %v, UID: %d}", &t.Tlcreate, t.UID)
}

// Rucreate is a file creation response.
type Rucreate struct {
	Rlcreate
}

// Type implements message.Type.
func (*Rucreate) Type() MsgType { return MsgRucreate }

// String implements fmt.Stringer.
func (r *Rucreate) String() string { return fmt.Sprintf("Rucreate{%v}", &r.Rlcreate) }

// Tumkdir is a Tmkdir message that includes a UID.
type Tumkdir struct {
	Tmkdir

	// UID is the UID to use as the effective UID in creation messages.
	UID UID
}

// decode implements encoder.decode.
func (t *Tumkdir) decode(b *buffer) {
	t.Tmkdir.decode(b)
	t.UID = b.ReadUID()
}

// encode implements encoder.encode.
func (t *Tumkdir) encode(b *buffer) {
	t.Tmkdir.encode(b)
	b.WriteUID(t.UID)
}

// Type implements message.Type.
func (t *Tumkdir) Type() MsgType { return MsgTumkdir }

// String implements fmt.Stringer.
func (t *Tumkdir) String() string {
	return fmt.Sprintf("Tumkdir{Tmkdir: %v, UID: %d}", &t.Tmkdir, t.UID)
}

// Rumkdir is a umkdir response.
type Rumkdir struct {
	Rmkdir
}

// Type implements message.Type.
func (*Rumkdir) Type() MsgType { return MsgRumkdir }

// String implements fmt.Stringer.
func (r *Rumkdir) String() string { return fmt.Sprintf("Rumkdir{%v}", &r.Rmkdir) }

// Tumknod is a Tmknod message that includes a UID.
type Tumknod struct {
	Tmknod

	// UID is the UID to use as the effective UID in creation messages.
	UID UID
}

// decode implements encoder.decode.
func (t *Tumknod) decode(b *buffer) {
	t.Tmknod.decode(b)
	t.UID = b.ReadUID()
}

// encode implements encoder.encode.
func (t *Tumknod) encode(b *buffer) {
	t.Tmknod.encode(b)
	b.WriteUID(t.UID)
}

// Type implements message.Type.
func (t *Tumknod) Type() MsgType { return MsgTumknod }

// String implements fmt.Stringer.
func (t *Tumknod) String() string {
	return fmt.Sprintf("Tumknod{Tmknod: %v, UID: %d}", &t.Tmknod, t.UID)
}

// Rumknod is a umknod response.
type Rumknod struct {
	Rmknod
}

// Type implements message.Type.
func (*Rumknod) Type() MsgType { return MsgRumknod }

// String implements fmt.Stringer.
func (r *Rumknod) String() string { return fmt.Sprintf("Rumknod{%v}", &r.Rmknod) }

// Tusymlink is a Tsymlink message that includes a UID.
type Tusymlink struct {
	Tsymlink

	// UID is the UID to use as the effective UID in creation messages.
	UID UID
}

// decode implements encoder.decode.
func (t *Tusymlink) decode(b *buffer) {
	t.Tsymlink.decode(b)
	t.UID = b.ReadUID()
}

// encode implements encoder.encode.
func (t *Tusymlink) encode(b *buffer) {
	t.Tsymlink.encode(b)
	b.WriteUID(t.UID)
}

// Type implements message.Type.
func (t *Tusymlink) Type() MsgType { return MsgTusymlink }

// String implements fmt.Stringer.
func (t *Tusymlink) String() string {
	return fmt.Sprintf("Tusymlink{Tsymlink: %v, UID: %d}", &t.Tsymlink, t.UID)
}

// Rusymlink is a usymlink response.
type Rusymlink struct {
	Rsymlink
}

// Type implements message.Type.
func (*Rusymlink) Type() MsgType { return MsgRusymlink }

// String implements fmt.Stringer.
func (r *Rusymlink) String() string { return fmt.Sprintf("Rusymlink{%v}", &r.Rsymlink) }

// Tlconnect is a connect request.
type Tlconnect struct {
	// FID is the FID to be connected.
	FID FID

	// Flags are the connect flags.
	Flags ConnectFlags
}

// decode implements encoder.decode.
func (t *Tlconnect) decode(b *buffer) {
	t.FID = b.ReadFID()
	t.Flags = b.ReadConnectFlags()
}

// encode implements encoder.encode.
func (t *Tlconnect) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.WriteConnectFlags(t.Flags)
}

// Type implements message.Type.
func (*Tlconnect) Type() MsgType { return MsgTlconnect }

// String implements fmt.Stringer.
func (t *Tlconnect) String() string {
	return fmt.Sprintf("Tlconnect{FID: %d, Flags: %v}", t.FID, t.Flags)
}

// Rlconnect is a connect response.
type Rlconnect struct {
	filePayload
}

// decode implements encoder.decode.
func (r *Rlconnect) decode(*buffer) {}

// encode implements encoder.encode.
func (r *Rlconnect) encode(*buffer) {}

// Type implements message.Type.
func (*Rlconnect) Type() MsgType { return MsgRlconnect }

// String implements fmt.Stringer.
func (r *Rlconnect) String() string { return fmt.Sprintf("Rlconnect{File: %v}", r.File) }

// Tchannel creates a new channel.
type Tchannel struct {
	// ID is the channel ID.
	ID uint32

	// Control is 0 if the Rchannel response should provide the flipcall
	// component of the channel, and 1 if the Rchannel response should
	// provide the fdchannel component of the channel.
	Control uint32
}

// decode implements encoder.decode.
func (t *Tchannel) decode(b *buffer) {
	t.ID = b.Read32()
	t.Control = b.Read32()
}

// encode implements encoder.encode.
func (t *Tchannel) encode(b *buffer) {
	b.Write32(t.ID)
	b.Write32(t.Control)
}

// Type implements message.Type.
func (*Tchannel) Type() MsgType { return MsgTchannel }

// String implements fmt.Stringer.
func (t *Tchannel) String() string {
	return fmt.Sprintf("Tchannel{ID: %d, Control: %d}", t.ID, t.Control)
}

// Rchannel is the channel response.
type Rchannel struct {
	Offset uint64
	Length uint64
	filePayload
}

// decode implements encoder.decode.
func (r *Rchannel) decode(b *buffer) {
	r.Offset = b.Read64()
	r.Length = b.Read64()
}

// encode implements encoder.encode.
func (r *Rchannel) encode(b *buffer) {
	b.Write64(r.Offset)
	b.Write64(r.Length)
}

// Type implements message.Type.
func (*Rchannel) Type() MsgType { return MsgRchannel }

// String implements fmt.Stringer.
func (r *Rchannel) String() string {
	return fmt.Sprintf("Rchannel{Offset: %d, Length: %d}", r.Offset, r.Length)
}

// Tmultigetattr is a multi-getattr request.
type Tmultigetattr struct {
	// FID is the FID to be walked.
	FID FID

	// Names are the set of names to be walked.
	Names []string
}

// decode implements encoder.decode.
func (t *Tmultigetattr) decode(b *buffer) {
	t.FID = b.ReadFID()
	n := b.Read16()
	t.Names = t.Names[:0]
	for i := 0; i < int(n); i++ {
		t.Names = append(t.Names, b.ReadString())
	}
}

// encode implements encoder.encode.
func (t *Tmultigetattr) encode(b *buffer) {
	b.WriteFID(t.FID)
	b.Write16(uint16(len(t.Names)))
	for _, name := range t.Names {
		b.WriteString(name)
	}
}

// Type implements message.Type.
func (*Tmultigetattr) Type() MsgType { return MsgTmultigetattr }

// String implements fmt.Stringer.
func (t *Tmultigetattr) String() string {
	return fmt.Sprintf("Tmultigetattr{FID: %d, Names: %v}", t.FID, t.Names)
}

// Rmultigetattr is a multi-getattr response.
type Rmultigetattr struct {
	// Stats are the set of FullStat returned for each of the names in the
	// request.
	Stats []FullStat
}

// decode implements encoder.decode.
func (r *Rmultigetattr) decode(b *buffer) {
	n := b.Read16()
	r.Stats = r.Stats[:0]
	for i := 0; i < int(n); i++ {
		var fs FullStat
		fs.decode(b)
		r.Stats = append(r.Stats, fs)
	}
}

// encode implements encoder.encode.
func (r *Rmultigetattr) encode(b *buffer) {
	b.Write16(uint16(len(r.Stats)))
	for i := range r.Stats {
		r.Stats[i].encode(b)
	}
}

// Type implements message.Type.
func (*Rmultigetattr) Type() MsgType { return MsgRmultigetattr }

// String implements fmt.Stringer.
func (r *Rmultigetattr) String() string {
	return fmt.Sprintf("Rmultigetattr{Stats: %v}", r.Stats)
}

const maxCacheSize = 3

// msgFactory is used to reduce allocations by caching messages for reuse.
type msgFactory struct {
	create func() message
	cache  chan message
}

// msgRegistry indexes all message factories by type.
var msgRegistry registry

type registry struct {
	factories [math.MaxUint8 + 1]msgFactory

	// largestFixedSize is computed so that given some message size M, you can
	// compute the maximum payload size (e.g. for Twrite, Rread) with
	// M-largestFixedSize. You could do this individually on a per-message
	// basis, but it's easier to compute a single maximum safe payload.
	largestFixedSize uint32
}

// get returns a new message by type.
//
// An error is returned in the case of an unknown message.
//
// This takes, and ignores, a message tag so that it may be used directly as a
// lookupTagAndType function for recv (by design).
func (r *registry) get(_ Tag, t MsgType) (message, error) {
	entry := &r.factories[t]
	if entry.create == nil {
		return nil, &ErrInvalidMsgType{t}
	}

	select {
	case msg := <-entry.cache:
		return msg, nil
	default:
		return entry.create(), nil
	}
}

func (r *registry) put(msg message) {
	if p, ok := msg.(payloader); ok {
		p.SetPayload(nil)
	}
	if f, ok := msg.(filer); ok {
		f.SetFilePayload(nil)
	}

	entry := &r.factories[msg.Type()]
	select {
	case entry.cache <- msg:
	default:
	}
}

// register registers the given message type.
//
// This may cause panic on failure and should only be used from init.
func (r *registry) register(t MsgType, fn func() message) {
	if int(t) >= len(r.factories) {
		panic(fmt.Sprintf("message type %d is too large. It must be smaller than %d", t, len(r.factories)))
	}
	if r.factories[t].create != nil {
		panic(fmt.Sprintf("duplicate message type %d: first is %T, second is %T", t, r.factories[t].create(), fn()))
	}
	r.factories[t] = msgFactory{
		create: fn,
		cache:  make(chan message, maxCacheSize),
	}

	if size := calculateSize(fn()); size > r.largestFixedSize {
		r.largestFixedSize = size
	}
}

func calculateSize(m message) uint32 {
	if p, ok := m.(payloader); ok {
		return p.FixedSize()
	}
	var dataBuf buffer
	m.encode(&dataBuf)
	return uint32(len(dataBuf.data))
}

func init() {
	msgRegistry.register(MsgRlerror, func() message { return &Rlerror{} })
	msgRegistry.register(MsgTstatfs, func() message { return &Tstatfs{} })
	msgRegistry.register(MsgRstatfs, func() message { return &Rstatfs{} })
	msgRegistry.register(MsgTlopen, func() message { return &Tlopen{} })
	msgRegistry.register(MsgRlopen, func() message { return &Rlopen{} })
	msgRegistry.register(MsgTlcreate, func() message { return &Tlcreate{} })
	msgRegistry.register(MsgRlcreate, func() message { return &Rlcreate{} })
	msgRegistry.register(MsgTsymlink, func() message { return &Tsymlink{} })
	msgRegistry.register(MsgRsymlink, func() message { return &Rsymlink{} })
	msgRegistry.register(MsgTmknod, func() message { return &Tmknod{} })
	msgRegistry.register(MsgRmknod, func() message { return &Rmknod{} })
	msgRegistry.register(MsgTrename, func() message { return &Trename{} })
	msgRegistry.register(MsgRrename, func() message { return &Rrename{} })
	msgRegistry.register(MsgTreadlink, func() message { return &Treadlink{} })
	msgRegistry.register(MsgRreadlink, func() message { return &Rreadlink{} })
	msgRegistry.register(MsgTgetattr, func() message { return &Tgetattr{} })
	msgRegistry.register(MsgRgetattr, func() message { return &Rgetattr{} })
	msgRegistry.register(MsgTsetattr, func() message { return &Tsetattr{} })
	msgRegistry.register(MsgRsetattr, func() message { return &Rsetattr{} })
	msgRegistry.register(MsgTlistxattr, func() message { return &Tlistxattr{} })
	msgRegistry.register(MsgRlistxattr, func() message { return &Rlistxattr{} })
	msgRegistry.register(MsgTxattrwalk, func() message { return &Txattrwalk{} })
	msgRegistry.register(MsgRxattrwalk, func() message { return &Rxattrwalk{} })
	msgRegistry.register(MsgTxattrcreate, func() message { return &Txattrcreate{} })
	msgRegistry.register(MsgRxattrcreate, func() message { return &Rxattrcreate{} })
	msgRegistry.register(MsgTgetxattr, func() message { return &Tgetxattr{} })
	msgRegistry.register(MsgRgetxattr, func() message { return &Rgetxattr{} })
	msgRegistry.register(MsgTsetxattr, func() message { return &Tsetxattr{} })
	msgRegistry.register(MsgRsetxattr, func() message { return &Rsetxattr{} })
	msgRegistry.register(MsgTremovexattr, func() message { return &Tremovexattr{} })
	msgRegistry.register(MsgRremovexattr, func() message { return &Rremovexattr{} })
	msgRegistry.register(MsgTreaddir, func() message { return &Treaddir{} })
	msgRegistry.register(MsgRreaddir, func() message { return &Rreaddir{} })
	msgRegistry.register(MsgTfsync, func() message { return &Tfsync{} })
	msgRegistry.register(MsgRfsync, func() message { return &Rfsync{} })
	msgRegistry.register(MsgTlink, func() message { return &Tlink{} })
	msgRegistry.register(MsgRlink, func() message { return &Rlink{} })
	msgRegistry.register(MsgTmkdir, func() message { return &Tmkdir{} })
	msgRegistry.register(MsgRmkdir, func() message { return &Rmkdir{} })
	msgRegistry.register(MsgTrenameat, func() message { return &Trenameat{} })
	msgRegistry.register(MsgRrenameat, func() message { return &Rrenameat{} })
	msgRegistry.register(MsgTunlinkat, func() message { return &Tunlinkat{} })
	msgRegistry.register(MsgRunlinkat, func() message { return &Runlinkat{} })
	msgRegistry.register(MsgTversion, func() message { return &Tversion{} })
	msgRegistry.register(MsgRversion, func() message { return &Rversion{} })
	msgRegistry.register(MsgTauth, func() message { return &Tauth{} })
	msgRegistry.register(MsgRauth, func() message { return &Rauth{} })
	msgRegistry.register(MsgTattach, func() message { return &Tattach{} })
	msgRegistry.register(MsgRattach, func() message { return &Rattach{} })
	msgRegistry.register(MsgTflush, func() message { return &Tflush{} })
	msgRegistry.register(MsgRflush, func() message { return &Rflush{} })
	msgRegistry.register(MsgTwalk, func() message { return &Twalk{} })
	msgRegistry.register(MsgRwalk, func() message { return &Rwalk{} })
	msgRegistry.register(MsgTread, func() message { return &Tread{} })
	msgRegistry.register(MsgRread, func() message { return &Rread{} })
	msgRegistry.register(MsgTwrite, func() message { return &Twrite{} })
	msgRegistry.register(MsgRwrite, func() message { return &Rwrite{} })
	msgRegistry.register(MsgTclunk, func() message { return &Tclunk{} })
	msgRegistry.register(MsgRclunk, func() message { return &Rclunk{} })
	msgRegistry.register(MsgTremove, func() message { return &Tremove{} })
	msgRegistry.register(MsgRremove, func() message { return &Rremove{} })
	msgRegistry.register(MsgTflushf, func() message { return &Tflushf{} })
	msgRegistry.register(MsgRflushf, func() message { return &Rflushf{} })
	msgRegistry.register(MsgTwalkgetattr, func() message { return &Twalkgetattr{} })
	msgRegistry.register(MsgRwalkgetattr, func() message { return &Rwalkgetattr{} })
	msgRegistry.register(MsgTucreate, func() message { return &Tucreate{} })
	msgRegistry.register(MsgRucreate, func() message { return &Rucreate{} })
	msgRegistry.register(MsgTumkdir, func() message { return &Tumkdir{} })
	msgRegistry.register(MsgRumkdir, func() message { return &Rumkdir{} })
	msgRegistry.register(MsgTumknod, func() message { return &Tumknod{} })
	msgRegistry.register(MsgRumknod, func() message { return &Rumknod{} })
	msgRegistry.register(MsgTusymlink, func() message { return &Tusymlink{} })
	msgRegistry.register(MsgRusymlink, func() message { return &Rusymlink{} })
	msgRegistry.register(MsgTlconnect, func() message { return &Tlconnect{} })
	msgRegistry.register(MsgRlconnect, func() message { return &Rlconnect{} })
	msgRegistry.register(MsgTallocate, func() message { return &Tallocate{} })
	msgRegistry.register(MsgRallocate, func() message { return &Rallocate{} })
	msgRegistry.register(MsgTsetattrclunk, func() message { return &Tsetattrclunk{} })
	msgRegistry.register(MsgRsetattrclunk, func() message { return &Rsetattrclunk{} })
	msgRegistry.register(MsgTmultigetattr, func() message { return &Tmultigetattr{} })
	msgRegistry.register(MsgRmultigetattr, func() message { return &Rmultigetattr{} })
	msgRegistry.register(MsgTchannel, func() message { return &Tchannel{} })
	msgRegistry.register(MsgRchannel, func() message { return &Rchannel{} })
}
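// A minimal usage sketch of the registry cache above (not part of the
// original file; names match the definitions above, and the surrounding
// transport loop is assumed). get() and put() form a round-trip that recycles
// up to maxCacheSize decoded messages per type instead of reallocating them:
//
//	msg, err := msgRegistry.get(tag, msgType)
//	if err != nil {
//		return err // unknown message type.
//	}
//	msg.decode(&dataBuf) // payloads are attached separately via SetPayload.
//	// ... handle the message ...
//	msgRegistry.put(msg) // strips any payload, then caches or drops the message.
//
// Messages that do not fit in the per-type cache channel are simply left for
// the garbage collector, so put() never blocks.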
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package cleanup provides utilities to clean "stuff" on defers.
package cleanup

// Cleanup allows defers to be aborted when cleanup needs to happen
// conditionally. Usage:
//
//	cu := cleanup.Make(func() { f.Close() })
//	defer cu.Clean() // failure before Release is called will close the file.
//	...
//	cu.Add(func() { f2.Close() }) // Adds another cleanup function.
//	...
//	cu.Release() // on success, aborts closing the file.
//	return f
type Cleanup struct {
	cleaners []func()
}

// Make creates a new Cleanup object.
func Make(f func()) Cleanup {
	return Cleanup{cleaners: []func(){f}}
}

// Add adds a new function to be called on Clean().
func (c *Cleanup) Add(f func()) {
	c.cleaners = append(c.cleaners, f)
}

// Clean calls all cleanup functions in reverse order.
func (c *Cleanup) Clean() {
	clean(c.cleaners)
	c.cleaners = nil
}

// Release releases the cleanup from its duties, i.e. cleanup functions are not
// called after this point. Returns a function that calls all registered
// functions in case the caller has use for them.
func (c *Cleanup) Release() func() {
	old := c.cleaners
	c.cleaners = nil
	return func() { clean(old) }
}

func clean(cleaners []func()) {
	for i := len(cleaners) - 1; i >= 0; i-- {
		cleaners[i]()
	}
}
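// A short sketch of the Release pattern (not part of the original file;
// Wrapper and its destroy field are hypothetical). Release lets a constructor
// hand ownership of its accumulated cleanups to the object it returns:
//
//	func NewWrapper(f *os.File) (*Wrapper, error) {
//		cu := cleanup.Make(func() { f.Close() })
//		defer cu.Clean() // runs (and closes f) on any early error return.
//		if err := setup(f); err != nil {
//			return nil, err
//		}
//		// Success: abort the deferred Clean and keep the cleanups for later.
//		return &Wrapper{file: f, destroy: cu.Release()}, nil
//	}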
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package memdev

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

const fullDevMinor = 7

// fullDevice implements vfs.Device for /dev/full.
//
// +stateify savable
type fullDevice struct{}

// Open implements vfs.Device.Open.
func (fullDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	fd := &fullFD{}
	if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
		UseDentryMetadata: true,
	}); err != nil {
		return nil, err
	}
	return &fd.vfsfd, nil
}

// fullFD implements vfs.FileDescriptionImpl for /dev/full.
//
// +stateify savable
type fullFD struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.NoLockFD
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *fullFD) Release(context.Context) {
	// noop
}

// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *fullFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	return dst.ZeroOut(ctx, dst.NumBytes())
}

// Read implements vfs.FileDescriptionImpl.Read.
func (fd *fullFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	return dst.ZeroOut(ctx, dst.NumBytes())
}

// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *fullFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	return 0, syserror.ENOSPC
}

// Write implements vfs.FileDescriptionImpl.Write.
func (fd *fullFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	return 0, syserror.ENOSPC
}

// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *fullFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	return 0, nil
}
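// For reference, a sketch of the behavior a caller of the file above would
// observe (not part of the original file; ctx, dst, and src are assumed to be
// in scope). /dev/full mirrors the Linux device of the same name: reads
// succeed and zero-fill the destination, writes always fail with ENOSPC:
//
//	n, err := fd.Read(ctx, dst, vfs.ReadOptions{})  // n == dst.NumBytes(), err == nil, dst zeroed.
//	_, err = fd.Write(ctx, src, vfs.WriteOptions{}) // err == syserror.ENOSPC.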
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"gvisor.dev/gvisor/pkg/context"
)

// contextID is this package's type for context.Context.Value keys.
type contextID int

const (
	// CtxMountNamespace is a Context.Value key for a MountNamespace.
	CtxMountNamespace contextID = iota

	// CtxRoot is a Context.Value key for a VFS root.
	CtxRoot
)

// MountNamespaceFromContext returns the MountNamespace used by ctx. If ctx is
// not associated with a MountNamespace, MountNamespaceFromContext returns nil.
//
// A reference is taken on the returned MountNamespace.
func MountNamespaceFromContext(ctx context.Context) *MountNamespace {
	if v := ctx.Value(CtxMountNamespace); v != nil {
		return v.(*MountNamespace)
	}
	return nil
}

type mountNamespaceContext struct {
	context.Context
	mntns *MountNamespace
}

// Value implements Context.Value.
func (mc mountNamespaceContext) Value(key interface{}) interface{} {
	switch key {
	case CtxMountNamespace:
		mc.mntns.IncRef()
		return mc.mntns
	default:
		return mc.Context.Value(key)
	}
}

// WithMountNamespace returns a copy of ctx with the given MountNamespace.
func WithMountNamespace(ctx context.Context, mntns *MountNamespace) context.Context {
	return &mountNamespaceContext{
		Context: ctx,
		mntns:   mntns,
	}
}

// RootFromContext returns the VFS root used by ctx. It takes a reference on
// the returned VirtualDentry. If ctx does not have a specific VFS root,
// RootFromContext returns a zero-value VirtualDentry.
func RootFromContext(ctx context.Context) VirtualDentry {
	if v := ctx.Value(CtxRoot); v != nil {
		return v.(VirtualDentry)
	}
	return VirtualDentry{}
}

type rootContext struct {
	context.Context
	root VirtualDentry
}

// WithRoot returns a copy of ctx with the given root.
func WithRoot(ctx context.Context, root VirtualDentry) context.Context {
	return &rootContext{
		Context: ctx,
		root:    root,
	}
}

// Value implements Context.Value.
func (rc rootContext) Value(key interface{}) interface{} {
	switch key {
	case CtxRoot:
		rc.root.IncRef()
		return rc.root
	}
	return rc.Context.Value(key)
}
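// A usage sketch for the helpers above (not part of the original file; ctx,
// mntns, and root are assumed to exist in the caller's scope). Note that each
// lookup takes a reference, so callers must pair it with a DecRef:
//
//	ctx = vfs.WithRoot(vfs.WithMountNamespace(ctx, mntns), root)
//	...
//	if mntns := vfs.MountNamespaceFromContext(ctx); mntns != nil {
//		defer mntns.DecRef(ctx)
//		// ... resolve paths relative to vfs.RootFromContext(ctx) ...
//	}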
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package port provides port ID allocation for netlink sockets.
//
// A netlink port is any int32 value. Positive ports are typically equivalent
// to the PID of the binding process. If that port is unavailable, negative
// ports are searched to find a free port that will not conflict with other
// PIDs.
package port

import (
	"fmt"
	"math"
	"math/rand"

	"gvisor.dev/gvisor/pkg/sync"
)

// maxPorts is a sanity limit on the maximum number of ports to allocate per
// protocol.
const maxPorts = 10000

// Manager allocates netlink port IDs.
//
// +stateify savable
type Manager struct {
	// mu protects the fields below.
	mu sync.Mutex `state:"nosave"`

	// ports contains a map of allocated ports for each protocol.
	ports map[int]map[int32]struct{}
}

// New creates a new Manager.
func New() *Manager {
	return &Manager{
		ports: make(map[int]map[int32]struct{}),
	}
}

// Allocate reserves a new port ID for protocol. hint will be taken if
// available.
func (m *Manager) Allocate(protocol int, hint int32) (int32, bool) {
	m.mu.Lock()
	defer m.mu.Unlock()

	proto, ok := m.ports[protocol]
	if !ok {
		proto = make(map[int32]struct{})
		// Port 0 is reserved for the kernel.
		proto[0] = struct{}{}
		m.ports[protocol] = proto
	}

	if len(proto) >= maxPorts {
		return 0, false
	}

	if _, ok := proto[hint]; !ok {
		// Hint is available, reserve it.
		proto[hint] = struct{}{}
		return hint, true
	}

	// Search for any free port in [math.MinInt32, -4096). The positive
	// port space is left open for pid-based allocations. This behavior is
	// consistent with Linux.
	start := int32(math.MinInt32 + rand.Int63n(math.MaxInt32-4096+1))
	curr := start
	for {
		if _, ok := proto[curr]; !ok {
			proto[curr] = struct{}{}
			return curr, true
		}

		curr--
		if curr >= -4096 {
			curr = -4097
		}
		if curr == start {
			// Nothing found. We should always find a free port
			// because maxPorts < -4096 - MinInt32.
			panic(fmt.Sprintf("No free port found in %+v", proto))
		}
	}
}

// Release frees the specified port for protocol.
//
// Preconditions: port is already allocated.
func (m *Manager) Release(protocol int, port int32) {
	m.mu.Lock()
	defer m.mu.Unlock()

	proto, ok := m.ports[protocol]
	if !ok {
		panic(fmt.Sprintf("Released port %d for protocol %d which has no allocations", port, protocol))
	}
	if _, ok := proto[port]; !ok {
		panic(fmt.Sprintf("Released port %d for protocol %d is not allocated", port, protocol))
	}
	delete(proto, port)
}
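// An allocation round-trip sketch for the Manager above (not part of the
// original file; protocol and pid are assumed to be in scope). A netlink
// socket typically passes its PID as the hint; when the hint is taken, a
// negative port from [math.MinInt32, -4096) is returned instead:
//
//	m := port.New()
//	p, ok := m.Allocate(protocol, pid) // p == pid when the hint is free.
//	if !ok {
//		// All maxPorts (10000) ports for this protocol are in use.
//	}
//	defer m.Release(protocol, p)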
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package stack provides the glue between networking protocols and the
// consumers of the networking stack.
//
// For consumers, the only function of interest is New(), everything else is
// provided by the tcpip/public package.
package stack

import (
	"encoding/binary"
	"fmt"
	"io"
	"math/rand"
	"sync/atomic"
	"time"

	"golang.org/x/time/rate"
	"gvisor.dev/gvisor/pkg/atomicbitops"
	cryptorand "gvisor.dev/gvisor/pkg/rand"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/ports"
	"gvisor.dev/gvisor/pkg/waiter"
)

const (
	// DefaultTOS is the default type of service value for network endpoints.
	DefaultTOS = 0
)

type transportProtocolState struct {
	proto          TransportProtocol
	defaultHandler func(id TransportEndpointID, pkt *PacketBuffer) bool
}

// ResumableEndpoint is an endpoint that needs to be resumed after restore.
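//
// A minimal hypothetical implementation, for illustration only (demoEndpoint
// and its work field are assumed names, not part of this package), might
// simply restart a worker goroutine:
//
//	type demoEndpoint struct{ work func() }
//
//	func (e *demoEndpoint) Resume(*Stack) { go e.work() }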
type ResumableEndpoint interface {
	// Resume resumes an endpoint after restore. This can be used to restart
	// background workers such as protocol goroutines. This must be called after
	// all indirect dependencies of the endpoint have been restored, which
	// generally implies at the end of the restore process.
	Resume(*Stack)
}

// uniqueIDGenerator is a default unique ID generator.
type uniqueIDGenerator atomicbitops.AlignedAtomicUint64

func (u *uniqueIDGenerator) UniqueID() uint64 {
	return ((*atomicbitops.AlignedAtomicUint64)(u)).Add(1)
}

// Stack is a networking stack, with all supported protocols, NICs, and route
// table.
type Stack struct {
	transportProtocols map[tcpip.TransportProtocolNumber]*transportProtocolState
	networkProtocols   map[tcpip.NetworkProtocolNumber]NetworkProtocol

	// rawFactory creates raw endpoints. If nil, raw endpoints are
	// disabled. It is set during Stack creation and is immutable.
	rawFactory RawFactory

	demux *transportDemuxer

	stats tcpip.Stats

	// LOCK ORDERING: mu > route.mu.
	route struct {
		mu struct {
			sync.RWMutex

			table []tcpip.Route
		}
	}

	mu                       sync.RWMutex
	nics                     map[tcpip.NICID]*nic
	defaultForwardingEnabled map[tcpip.NetworkProtocolNumber]struct{}

	// cleanupEndpointsMu protects cleanupEndpoints.
	cleanupEndpointsMu sync.Mutex
	cleanupEndpoints   map[TransportEndpoint]struct{}

	*ports.PortManager

	// If not nil, then any new endpoints will have this probe function
	// invoked every time they receive a TCP segment.
	tcpProbeFunc atomic.Value // TCPProbeFunc

	// clock is used to generate user-visible times.
	clock tcpip.Clock

	// handleLocal allows non-loopback interfaces to loop packets.
	handleLocal bool

	// tables are the iptables packet filtering and manipulation rules.
	// TODO(gvisor.dev/issue/4595): S/R this field.
	tables *IPTables

	// resumableEndpoints is a list of endpoints that need to be resumed if the
	// stack is being restored.
	resumableEndpoints []ResumableEndpoint

	// icmpRateLimiter is a global rate limiter for all ICMP messages generated
	// by the stack.
	icmpRateLimiter *ICMPRateLimiter

	// seed is a one-time random value initialized at stack startup and is
	// used to seed the TCP port picking on active connections.
	//
	// TODO(gvisor.dev/issue/940): S/R this field.
	seed uint32

	// nudConfigs holds the default NUD configurations used by interfaces.
	nudConfigs NUDConfigurations

	// nudDisp is the NUD event dispatcher that is used to send the netstack
	// integrator NUD related events.
	nudDisp NUDDispatcher

	// uniqueIDGenerator is a generator of unique identifiers.
	uniqueIDGenerator UniqueID

	// randomGenerator is an injectable pseudo random generator that can be
	// used when a random number is required.
	randomGenerator *rand.Rand

	// secureRNG is a cryptographically secure random number generator.
	secureRNG io.Reader

	// sendBufferSize holds the min/default/max send buffer sizes for
	// endpoints other than TCP.
	sendBufferSize tcpip.SendBufferSizeOption

	// receiveBufferSize holds the min/default/max receive buffer sizes for
	// endpoints other than TCP.
	receiveBufferSize tcpip.ReceiveBufferSizeOption

	// tcpInvalidRateLimit is the maximal rate for sending duplicate
	// acknowledgements in response to incoming TCP packets that are for an
	// existing connection but that are invalid due to any of the following
	// reasons:
	//
	//   a) out-of-window sequence number.
	//   b) out-of-window acknowledgement number.
	//   c) PAWS check failure (when implemented).
	//
	// This is required to prevent potential ACK loops.
	// Setting this to 0 will disable all rate limiting.
	tcpInvalidRateLimit time.Duration
}

// UniqueID is an abstract generator of unique identifiers.
type UniqueID interface {
	UniqueID() uint64
}

// NetworkProtocolFactory instantiates a network protocol.
//
// NetworkProtocolFactory must not attempt to modify the stack, it may only
// query the stack.
type NetworkProtocolFactory func(*Stack) NetworkProtocol

// TransportProtocolFactory instantiates a transport protocol.
//
// TransportProtocolFactory must not attempt to modify the stack, it may only
// query the stack.
type TransportProtocolFactory func(*Stack) TransportProtocol

// Options contains optional Stack configuration.
type Options struct {
	// NetworkProtocols lists the network protocols to enable.
	NetworkProtocols []NetworkProtocolFactory

	// TransportProtocols lists the transport protocols to enable.
	TransportProtocols []TransportProtocolFactory

	// Clock is an optional clock used for timekeeping.
	//
	// If Clock is nil, tcpip.NewStdClock() will be used.
	Clock tcpip.Clock

	// Stats are optional statistic counters.
	Stats tcpip.Stats

	// HandleLocal indicates whether packets destined to their source
	// should be handled by the stack internally (true) or outside the
	// stack (false).
	HandleLocal bool

	// UniqueID is an optional generator of unique identifiers.
	UniqueID UniqueID

	// NUDConfigs holds the default NUD configurations used by interfaces.
	NUDConfigs NUDConfigurations

	// NUDDisp is the NUD event dispatcher that an integrator can provide to
	// receive NUD related events.
	NUDDisp NUDDispatcher

	// RawFactory produces raw endpoints. Raw endpoints are enabled only if
	// this is non-nil.
	RawFactory RawFactory

	// RandSource is an optional source to use to generate random
	// numbers. If omitted it defaults to a Source seeded by the data
	// returned by the stack secure RNG.
	//
	// RandSource must be thread-safe.
	RandSource rand.Source

	// IPTables are the initial iptables rules. If nil, DefaultIPTables will be
	// used to construct the initial iptables rules.
	IPTables *IPTables

	// DefaultIPTables is an optional iptables rules constructor that is called
	// if IPTables is nil. If both fields are nil, iptables will allow all
	// traffic.
	DefaultIPTables func(uint32) *IPTables

	// SecureRNG is a cryptographically secure random number generator.
	SecureRNG io.Reader
}

// TransportEndpointInfo holds useful information about a transport endpoint
// which can be queried by monitoring tools.
//
// +stateify savable
type TransportEndpointInfo struct {
	// The following fields are initialized at creation time and are
	// immutable.
	NetProto   tcpip.NetworkProtocolNumber
	TransProto tcpip.TransportProtocolNumber

	// The following fields are protected by endpoint mu.
	ID TransportEndpointID

	// BindNICID and bindAddr are set via calls to Bind(). They are used to
	// reject attempts to send data or connect via a different NIC or
	// address.
	BindNICID tcpip.NICID
	BindAddr  tcpip.Address

	// RegisterNICID is the default NICID registered as a side-effect of
	// connect or datagram write.
	RegisterNICID tcpip.NICID
}

// AddrNetProtoLocked unwraps the specified address if it is a V4-mapped V6
// address and returns the network protocol number to be used to communicate
// with the specified address. It returns an error if the passed address is
// incompatible with the receiver.
//
// Precondition: the parent endpoint mu must be held while calling this method.
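//
// e.g. a hedged sketch for a dual-stack endpoint (names and values here are
// illustrative, not requirements):
// // dst.Addr is a 16-byte V4-mapped IPv6 address (::ffff:10.0.0.1).
// addr, netProto, err := t.AddrNetProtoLocked(dst, false /* v6only */)
// // On success, netProto is header.IPv4ProtocolNumber and addr.Addr holds
// // the embedded 4-byte IPv4 address.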
func (t *TransportEndpointInfo) AddrNetProtoLocked(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { netProto := t.NetProto switch len(addr.Addr) { case header.IPv4AddressSize: netProto = header.IPv4ProtocolNumber case header.IPv6AddressSize: if header.IsV4MappedAddress(addr.Addr) { netProto = header.IPv4ProtocolNumber addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:] if addr.Addr == header.IPv4Any { addr.Addr = "" } } } switch len(t.ID.LocalAddress) { case header.IPv4AddressSize: if len(addr.Addr) == header.IPv6AddressSize { return tcpip.FullAddress{}, 0, &tcpip.ErrInvalidEndpointState{} } case header.IPv6AddressSize: if len(addr.Addr) == header.IPv4AddressSize { return tcpip.FullAddress{}, 0, &tcpip.ErrNetworkUnreachable{} } } switch { case netProto == t.NetProto: case netProto == header.IPv4ProtocolNumber && t.NetProto == header.IPv6ProtocolNumber: if v6only { return tcpip.FullAddress{}, 0, &tcpip.ErrNoRoute{} } default: return tcpip.FullAddress{}, 0, &tcpip.ErrInvalidEndpointState{} } return addr, netProto, nil } // IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo // marker interface. func (*TransportEndpointInfo) IsEndpointInfo() {} // New allocates a new networking stack with only the requested networking and // transport protocols configured with default options. // // Note, NDPConfigurations will be fixed before being used by the Stack. That // is, if an invalid value was provided, it will be reset to the default value. // // Protocol options can be changed by calling the // SetNetworkProtocolOption/SetTransportProtocolOption methods provided by the // stack. Please refer to individual protocol implementations as to what options // are supported. func New(opts Options) *Stack { clock := opts.Clock if clock == nil { clock = tcpip.NewStdClock() } if opts.UniqueID == nil { opts.UniqueID = new(uniqueIDGenerator) } if opts.SecureRNG == nil { opts.SecureRNG = cryptorand.Reader } randSrc := opts.RandSource if randSrc == nil { var v int64 if err := binary.Read(opts.SecureRNG, binary.LittleEndian, &v); err != nil { panic(err) } // Source provided by rand.NewSource is not thread-safe so // we wrap it in a simple thread-safe version. 
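		// (A hedged sketch of such a wrapper, assuming a sync.Mutex guard;
		// the actual lockedRandomSource is defined elsewhere in this package:
		//
		//	type lockedRandomSource struct {
		//		mu  sync.Mutex
		//		src rand.Source
		//	}
		//
		//	func (r *lockedRandomSource) Int63() int64 {
		//		r.mu.Lock()
		//		defer r.mu.Unlock()
		//		return r.src.Int63()
		//	}
		//
		// with Seed guarded the same way.)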
randSrc = &lockedRandomSource{src: rand.NewSource(v)} } randomGenerator := rand.New(randSrc) seed := randomGenerator.Uint32() if opts.IPTables == nil { if opts.DefaultIPTables == nil { opts.DefaultIPTables = DefaultTables } opts.IPTables = opts.DefaultIPTables(seed) } opts.NUDConfigs.resetInvalidFields() s := &Stack{ transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState), networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol), nics: make(map[tcpip.NICID]*nic), defaultForwardingEnabled: make(map[tcpip.NetworkProtocolNumber]struct{}), cleanupEndpoints: make(map[TransportEndpoint]struct{}), PortManager: ports.NewPortManager(), clock: clock, stats: opts.Stats.FillIn(), handleLocal: opts.HandleLocal, tables: opts.IPTables, icmpRateLimiter: NewICMPRateLimiter(), seed: seed, nudConfigs: opts.NUDConfigs, uniqueIDGenerator: opts.UniqueID, nudDisp: opts.NUDDisp, randomGenerator: randomGenerator, secureRNG: opts.SecureRNG, sendBufferSize: tcpip.SendBufferSizeOption{ Min: MinBufferSize, Default: DefaultBufferSize, Max: DefaultMaxBufferSize, }, receiveBufferSize: tcpip.ReceiveBufferSizeOption{ Min: MinBufferSize, Default: DefaultBufferSize, Max: DefaultMaxBufferSize, }, tcpInvalidRateLimit: defaultTCPInvalidRateLimit, } // Add specified network protocols. for _, netProtoFactory := range opts.NetworkProtocols { netProto := netProtoFactory(s) s.networkProtocols[netProto.Number()] = netProto } // Add specified transport protocols. for _, transProtoFactory := range opts.TransportProtocols { transProto := transProtoFactory(s) s.transportProtocols[transProto.Number()] = &transportProtocolState{ proto: transProto, } } // Add the factory for raw endpoints, if present. s.rawFactory = opts.RawFactory // Create the global transport demuxer. s.demux = newTransportDemuxer(s) return s } // newJob returns a tcpip.Job using the Stack clock. func (s *Stack) newJob(l sync.Locker, f func()) *tcpip.Job { return tcpip.NewJob(s.clock, l, f) } // UniqueID returns a unique identifier. func (s *Stack) UniqueID() uint64 { return s.uniqueIDGenerator.UniqueID() } // SetNetworkProtocolOption allows configuring individual protocol level // options. This method returns an error if the protocol is not supported or // option is not supported by the protocol implementation or the provided value // is incorrect. func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, option tcpip.SettableNetworkProtocolOption) tcpip.Error { netProto, ok := s.networkProtocols[network] if !ok { return &tcpip.ErrUnknownProtocol{} } return netProto.SetOption(option) } // NetworkProtocolOption allows retrieving individual protocol level option // values. This method returns an error if the protocol is not supported or // option is not supported by the protocol implementation. // e.g. // var v ipv4.MyOption // err := s.NetworkProtocolOption(tcpip.IPv4ProtocolNumber, &v) // if err != nil { // ... // } func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, option tcpip.GettableNetworkProtocolOption) tcpip.Error { netProto, ok := s.networkProtocols[network] if !ok { return &tcpip.ErrUnknownProtocol{} } return netProto.Option(option) } // SetTransportProtocolOption allows configuring individual protocol level // options. This method returns an error if the protocol is not supported or // option is not supported by the protocol implementation or the provided value // is incorrect. 
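// e.g. (tcp.SACKEnabled is shown for illustration; any settable option works):
// var v tcp.SACKEnabled = true
// if err := s.SetTransportProtocolOption(tcpip.TCPProtocolNumber, &v); err != nil {
//   ...
// }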
func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumber, option tcpip.SettableTransportProtocolOption) tcpip.Error { transProtoState, ok := s.transportProtocols[transport] if !ok { return &tcpip.ErrUnknownProtocol{} } return transProtoState.proto.SetOption(option) } // TransportProtocolOption allows retrieving individual protocol level option // values. This method returns an error if the protocol is not supported or // option is not supported by the protocol implementation. // var v tcp.SACKEnabled // if err := s.TransportProtocolOption(tcpip.TCPProtocolNumber, &v); err != nil { // ... // } func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, option tcpip.GettableTransportProtocolOption) tcpip.Error { transProtoState, ok := s.transportProtocols[transport] if !ok { return &tcpip.ErrUnknownProtocol{} } return transProtoState.proto.Option(option) } // SetTransportProtocolHandler sets the per-stack default handler for the given // protocol. // // It must be called only during initialization of the stack. Changing it as the // stack is operating is not supported. func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(TransportEndpointID, *PacketBuffer) bool) { state := s.transportProtocols[p] if state != nil { state.defaultHandler = h } } // Clock returns the Stack's clock for retrieving the current time and // scheduling work. func (s *Stack) Clock() tcpip.Clock { return s.clock } // Stats returns a mutable copy of the current stats. // // This is not generally exported via the public interface, but is available // internally. func (s *Stack) Stats() tcpip.Stats { return s.stats } // SetNICForwarding enables or disables packet forwarding on the specified NIC // for the passed protocol. func (s *Stack) SetNICForwarding(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, enable bool) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return &tcpip.ErrUnknownNICID{} } return nic.setForwarding(protocol, enable) } // NICForwarding returns the forwarding configuration for the specified NIC. func (s *Stack) NICForwarding(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (bool, tcpip.Error) { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return false, &tcpip.ErrUnknownNICID{} } return nic.forwarding(protocol) } // SetForwardingDefaultAndAllNICs sets packet forwarding for all NICs for the // passed protocol and sets the default setting for newly created NICs. func (s *Stack) SetForwardingDefaultAndAllNICs(protocol tcpip.NetworkProtocolNumber, enable bool) tcpip.Error { s.mu.Lock() defer s.mu.Unlock() doneOnce := false for id, nic := range s.nics { if err := nic.setForwarding(protocol, enable); err != nil { // Expect forwarding to be settable on all interfaces if it was set on // one. if doneOnce { panic(fmt.Sprintf("nic(id=%d).setForwarding(%d, %t): %s", id, protocol, enable, err)) } return err } doneOnce = true } if enable { s.defaultForwardingEnabled[protocol] = struct{}{} } else { delete(s.defaultForwardingEnabled, protocol) } return nil } // PortRange returns the UDP and TCP inclusive range of ephemeral ports used in // both IPv4 and IPv6. func (s *Stack) PortRange() (uint16, uint16) { return s.PortManager.PortRange() } // SetPortRange sets the UDP and TCP IPv4 and IPv6 ephemeral port range // (inclusive). 
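//
// e.g. (the bounds below are arbitrary illustrative values):
// if err := s.SetPortRange(16000, 32000); err != nil {
//   ...
// }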
func (s *Stack) SetPortRange(start uint16, end uint16) tcpip.Error { return s.PortManager.SetPortRange(start, end) } // SetRouteTable assigns the route table to be used by this stack. It // specifies which NIC to use for given destination address ranges. // // This method takes ownership of the table. func (s *Stack) SetRouteTable(table []tcpip.Route) { s.route.mu.Lock() defer s.route.mu.Unlock() s.route.mu.table = table } // GetRouteTable returns the route table which is currently in use. func (s *Stack) GetRouteTable() []tcpip.Route { s.route.mu.RLock() defer s.route.mu.RUnlock() return append([]tcpip.Route(nil), s.route.mu.table...) } // AddRoute appends a route to the route table. func (s *Stack) AddRoute(route tcpip.Route) { s.route.mu.Lock() defer s.route.mu.Unlock() s.route.mu.table = append(s.route.mu.table, route) } // RemoveRoutes removes matching routes from the route table. func (s *Stack) RemoveRoutes(match func(tcpip.Route) bool) { s.route.mu.Lock() defer s.route.mu.Unlock() var filteredRoutes []tcpip.Route for _, route := range s.route.mu.table { if !match(route) { filteredRoutes = append(filteredRoutes, route) } } s.route.mu.table = filteredRoutes } // NewEndpoint creates a new transport layer endpoint of the given protocol. func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { t, ok := s.transportProtocols[transport] if !ok { return nil, &tcpip.ErrUnknownProtocol{} } return t.proto.NewEndpoint(network, waiterQueue) } // NewRawEndpoint creates a new raw transport layer endpoint of the given // protocol. Raw endpoints receive all traffic for a given protocol regardless // of address. func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, tcpip.Error) { if s.rawFactory == nil { return nil, &tcpip.ErrNotPermitted{} } if !associated { return s.rawFactory.NewUnassociatedEndpoint(s, network, transport, waiterQueue) } t, ok := s.transportProtocols[transport] if !ok { return nil, &tcpip.ErrUnknownProtocol{} } return t.proto.NewRawEndpoint(network, waiterQueue) } // NewPacketEndpoint creates a new packet endpoint listening for the given // netProto. func (s *Stack) NewPacketEndpoint(cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { if s.rawFactory == nil { return nil, &tcpip.ErrNotPermitted{} } return s.rawFactory.NewPacketEndpoint(s, cooked, netProto, waiterQueue) } // NICContext is an opaque pointer used to store client-supplied NIC metadata. type NICContext interface{} // NICOptions specifies the configuration of a NIC as it is being created. // The zero value creates an enabled, unnamed NIC. type NICOptions struct { // Name specifies the name of the NIC. Name string // Disabled specifies whether to avoid calling Attach on the passed // LinkEndpoint. Disabled bool // Context specifies user-defined data that will be returned in stack.NICInfo // for the NIC. Clients of this library can use it to add metadata that // should be tracked alongside a NIC, to avoid having to keep a // map[tcpip.NICID]metadata mirroring stack.Stack's nic map. Context NICContext } // CreateNICWithOptions creates a NIC with the provided id, LinkEndpoint, and // NICOptions. See the documentation on type NICOptions for details on how // NICs can be configured. 
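//
// e.g. from a consumer's perspective (the NIC ID, name, and loopback link
// endpoint are illustrative assumptions):
// if err := s.CreateNICWithOptions(1, loopback.New(), stack.NICOptions{Name: "lo0"}); err != nil {
//   ...
// }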
// // LinkEndpoint.Attach will be called to bind ep with a NetworkDispatcher. func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOptions) tcpip.Error { s.mu.Lock() defer s.mu.Unlock() // Make sure id is unique. if _, ok := s.nics[id]; ok { return &tcpip.ErrDuplicateNICID{} } // Make sure name is unique, unless unnamed. if opts.Name != "" { for _, n := range s.nics { if n.Name() == opts.Name { return &tcpip.ErrDuplicateNICID{} } } } n := newNIC(s, id, opts.Name, ep, opts.Context) for proto := range s.defaultForwardingEnabled { if err := n.setForwarding(proto, true); err != nil { panic(fmt.Sprintf("newNIC(%d, ...).setForwarding(%d, true): %s", id, proto, err)) } } s.nics[id] = n if !opts.Disabled { return n.enable() } return nil } // CreateNIC creates a NIC with the provided id and LinkEndpoint and calls // LinkEndpoint.Attach to bind ep with a NetworkDispatcher. func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) tcpip.Error { return s.CreateNICWithOptions(id, ep, NICOptions{}) } // GetLinkEndpointByName gets the link endpoint specified by name. func (s *Stack) GetLinkEndpointByName(name string) LinkEndpoint { s.mu.RLock() defer s.mu.RUnlock() for _, nic := range s.nics { if nic.Name() == name { return nic.LinkEndpoint } } return nil } // EnableNIC enables the given NIC so that the link-layer endpoint can start // delivering packets to it. func (s *Stack) EnableNIC(id tcpip.NICID) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return &tcpip.ErrUnknownNICID{} } return nic.enable() } // DisableNIC disables the given NIC. func (s *Stack) DisableNIC(id tcpip.NICID) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return &tcpip.ErrUnknownNICID{} } nic.disable() return nil } // CheckNIC checks if a NIC is usable. func (s *Stack) CheckNIC(id tcpip.NICID) bool { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return false } return nic.Enabled() } // RemoveNIC removes NIC and all related routes from the network stack. func (s *Stack) RemoveNIC(id tcpip.NICID) tcpip.Error { s.mu.Lock() defer s.mu.Unlock() return s.removeNICLocked(id) } // removeNICLocked removes NIC and all related routes from the network stack. // // s.mu must be locked. func (s *Stack) removeNICLocked(id tcpip.NICID) tcpip.Error { nic, ok := s.nics[id] if !ok { return &tcpip.ErrUnknownNICID{} } delete(s.nics, id) // Remove routes in-place. n tracks the number of routes written. s.route.mu.Lock() n := 0 for i, r := range s.route.mu.table { s.route.mu.table[i] = tcpip.Route{} if r.NIC != id { // Keep this route. s.route.mu.table[n] = r n++ } } s.route.mu.table = s.route.mu.table[:n] s.route.mu.Unlock() return nic.remove() } // NICInfo captures the name and addresses assigned to a NIC. type NICInfo struct { Name string LinkAddress tcpip.LinkAddress ProtocolAddresses []tcpip.ProtocolAddress // Flags indicate the state of the NIC. Flags NICStateFlags // MTU is the maximum transmission unit. MTU uint32 Stats tcpip.NICStats // NetworkStats holds the stats of each NetworkEndpoint bound to the NIC. NetworkStats map[tcpip.NetworkProtocolNumber]NetworkEndpointStats // Context is user-supplied data optionally supplied in CreateNICWithOptions. // See type NICOptions for more details. Context NICContext // ARPHardwareType holds the ARP Hardware type of the NIC. This is the // value sent in haType field of an ARP Request sent by this NIC and the // value expected in the haType field of an ARP response. 
ARPHardwareType header.ARPHardwareType // Forwarding holds the forwarding status for each network endpoint that // supports forwarding. Forwarding map[tcpip.NetworkProtocolNumber]bool } // HasNIC returns true if the NICID is defined in the stack. func (s *Stack) HasNIC(id tcpip.NICID) bool { s.mu.RLock() _, ok := s.nics[id] s.mu.RUnlock() return ok } // NICInfo returns a map of NICIDs to their associated information. func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo { s.mu.RLock() defer s.mu.RUnlock() nics := make(map[tcpip.NICID]NICInfo) for id, nic := range s.nics { flags := NICStateFlags{ Up: true, // Netstack interfaces are always up. Running: nic.Enabled(), Promiscuous: nic.Promiscuous(), Loopback: nic.IsLoopback(), } netStats := make(map[tcpip.NetworkProtocolNumber]NetworkEndpointStats) for proto, netEP := range nic.networkEndpoints { netStats[proto] = netEP.Stats() } info := NICInfo{ Name: nic.name, LinkAddress: nic.LinkEndpoint.LinkAddress(), ProtocolAddresses: nic.primaryAddresses(), Flags: flags, MTU: nic.LinkEndpoint.MTU(), Stats: nic.stats.local, NetworkStats: netStats, Context: nic.context, ARPHardwareType: nic.LinkEndpoint.ARPHardwareType(), Forwarding: make(map[tcpip.NetworkProtocolNumber]bool), } for proto := range s.networkProtocols { switch forwarding, err := nic.forwarding(proto); err.(type) { case nil: info.Forwarding[proto] = forwarding case *tcpip.ErrUnknownProtocol: panic(fmt.Sprintf("expected network protocol %d to be available on NIC %d", proto, nic.ID())) case *tcpip.ErrNotSupported: // Not all network protocols support forwarding. default: panic(fmt.Sprintf("nic(id=%d).forwarding(%d): %s", nic.ID(), proto, err)) } } nics[id] = info } return nics } // NICStateFlags holds information about the state of an NIC. type NICStateFlags struct { // Up indicates whether the interface is running. Up bool // Running indicates whether resources are allocated. Running bool // Promiscuous indicates whether the interface is in promiscuous mode. Promiscuous bool // Loopback indicates whether the interface is a loopback. Loopback bool } // AddAddress adds a new network-layer address to the specified NIC. func (s *Stack) AddAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.Error { return s.AddAddressWithOptions(id, protocol, addr, CanBePrimaryEndpoint) } // AddAddressWithPrefix is the same as AddAddress, but allows you to specify // the address prefix. func (s *Stack) AddAddressWithPrefix(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.AddressWithPrefix) tcpip.Error { ap := tcpip.ProtocolAddress{ Protocol: protocol, AddressWithPrefix: addr, } return s.AddProtocolAddressWithOptions(id, ap, CanBePrimaryEndpoint) } // AddProtocolAddress adds a new network-layer protocol address to the // specified NIC. func (s *Stack) AddProtocolAddress(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress) tcpip.Error { return s.AddProtocolAddressWithOptions(id, protocolAddress, CanBePrimaryEndpoint) } // AddAddressWithOptions is the same as AddAddress, but allows you to specify // whether the new endpoint can be primary or not. 
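//
// e.g. (the NIC ID, address, and behavior below are illustrative only):
// // 10.0.0.1 as a raw 4-byte tcpip.Address.
// err := s.AddAddressWithOptions(1, ipv4.ProtocolNumber, "\x0a\x00\x00\x01", stack.FirstPrimaryEndpoint)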
func (s *Stack) AddAddressWithOptions(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, peb PrimaryEndpointBehavior) tcpip.Error { netProto, ok := s.networkProtocols[protocol] if !ok { return &tcpip.ErrUnknownProtocol{} } return s.AddProtocolAddressWithOptions(id, tcpip.ProtocolAddress{ Protocol: protocol, AddressWithPrefix: tcpip.AddressWithPrefix{ Address: addr, PrefixLen: netProto.DefaultPrefixLen(), }, }, peb) } // AddProtocolAddressWithOptions is the same as AddProtocolAddress, but allows // you to specify whether the new endpoint can be primary or not. func (s *Stack) AddProtocolAddressWithOptions(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return &tcpip.ErrUnknownNICID{} } return nic.addAddress(protocolAddress, peb) } // RemoveAddress removes an existing network-layer address from the specified // NIC. func (s *Stack) RemoveAddress(id tcpip.NICID, addr tcpip.Address) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() if nic, ok := s.nics[id]; ok { return nic.removeAddress(addr) } return &tcpip.ErrUnknownNICID{} } // AllAddresses returns a map of NICIDs to their protocol addresses (primary // and non-primary). func (s *Stack) AllAddresses() map[tcpip.NICID][]tcpip.ProtocolAddress { s.mu.RLock() defer s.mu.RUnlock() nics := make(map[tcpip.NICID][]tcpip.ProtocolAddress) for id, nic := range s.nics { nics[id] = nic.allPermanentAddresses() } return nics } // GetMainNICAddress returns the first non-deprecated primary address and prefix // for the given NIC and protocol. If no non-deprecated primary addresses exist, // a deprecated address will be returned. If no deprecated addresses exist, the // zero value will be returned. func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, tcpip.Error) { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return tcpip.AddressWithPrefix{}, &tcpip.ErrUnknownNICID{} } return nic.PrimaryAddress(protocol) } func (s *Stack) getAddressEP(nic *nic, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) AssignableAddressEndpoint { if len(localAddr) == 0 { return nic.primaryEndpoint(netProto, remoteAddr) } return nic.findEndpoint(netProto, localAddr, CanBePrimaryEndpoint) } // findLocalRouteFromNICRLocked is like findLocalRouteRLocked but finds a route // from the specified NIC. // // Precondition: s.mu must be read locked. func (s *Stack) findLocalRouteFromNICRLocked(localAddressNIC *nic, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) *Route { localAddressEndpoint := localAddressNIC.getAddressOrCreateTempInner(netProto, localAddr, false /* createTemp */, NeverPrimaryEndpoint) if localAddressEndpoint == nil { return nil } var outgoingNIC *nic // Prefer a local route to the same interface as the local address. if localAddressNIC.hasAddress(netProto, remoteAddr) { outgoingNIC = localAddressNIC } // If the remote address isn't owned by the local address's NIC, check all // NICs. if outgoingNIC == nil { for _, nic := range s.nics { if nic.hasAddress(netProto, remoteAddr) { outgoingNIC = nic break } } } // If the remote address is not owned by the stack, we can't return a local // route. 
	if outgoingNIC == nil {
		localAddressEndpoint.DecRef()
		return nil
	}

	r := makeLocalRoute(
		netProto,
		localAddr,
		remoteAddr,
		outgoingNIC,
		localAddressNIC,
		localAddressEndpoint,
	)

	if r.IsOutboundBroadcast() {
		r.Release()
		return nil
	}

	return r
}

// findLocalRouteRLocked returns a local route.
//
// A local route is a route to some remote address which the stack owns. That
// is, a local route is a route where packets never have to leave the stack.
//
// Precondition: s.mu must be read locked.
func (s *Stack) findLocalRouteRLocked(localAddressNICID tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) *Route {
	if len(localAddr) == 0 {
		localAddr = remoteAddr
	}

	if localAddressNICID == 0 {
		for _, localAddressNIC := range s.nics {
			if r := s.findLocalRouteFromNICRLocked(localAddressNIC, localAddr, remoteAddr, netProto); r != nil {
				return r
			}
		}

		return nil
	}

	if localAddressNIC, ok := s.nics[localAddressNICID]; ok {
		return s.findLocalRouteFromNICRLocked(localAddressNIC, localAddr, remoteAddr, netProto)
	}

	return nil
}

// HandleLocal returns true if non-loopback interfaces are allowed to loop
// packets.
func (s *Stack) HandleLocal() bool {
	return s.handleLocal
}

func isNICForwarding(nic *nic, proto tcpip.NetworkProtocolNumber) bool {
	switch forwarding, err := nic.forwarding(proto); err.(type) {
	case nil:
		return forwarding
	case *tcpip.ErrUnknownProtocol:
		panic(fmt.Sprintf("expected network protocol %d to be available on NIC %d", proto, nic.ID()))
	case *tcpip.ErrNotSupported:
		// Not all network protocols support forwarding.
		return false
	default:
		panic(fmt.Sprintf("nic(id=%d).forwarding(%d): %s", nic.ID(), proto, err))
	}
}

// FindRoute creates a route to the given destination address, leaving through
// the given NIC and local address (if provided).
//
// If a NIC is not specified, the returned route will leave through the same
// NIC as the NIC that has the local address assigned when forwarding is
// disabled. If forwarding is enabled and the NIC is unspecified, the route may
// leave through any interface unless the route is link-local.
//
// If no local address is provided, the stack will select a local address. If
// no remote address is provided, the stack will use a remote address equal to
// the local address.
func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber, multicastLoop bool) (*Route, tcpip.Error) {
	s.mu.RLock()
	defer s.mu.RUnlock()

	isLinkLocal := header.IsV6LinkLocalUnicastAddress(remoteAddr) || header.IsV6LinkLocalMulticastAddress(remoteAddr)
	isLocalBroadcast := remoteAddr == header.IPv4Broadcast
	isMulticast := header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)
	isLoopback := header.IsV4LoopbackAddress(remoteAddr) || header.IsV6LoopbackAddress(remoteAddr)
	needRoute := !(isLocalBroadcast || isMulticast || isLinkLocal || isLoopback)

	if s.handleLocal && !isMulticast && !isLocalBroadcast {
		if r := s.findLocalRouteRLocked(id, localAddr, remoteAddr, netProto); r != nil {
			return r, nil
		}
	}

	// If the interface is specified and we do not need a route, return a route
	// through the interface if the interface is valid and enabled.
	if id != 0 && !needRoute {
		if nic, ok := s.nics[id]; ok && nic.Enabled() {
			if addressEndpoint := s.getAddressEP(nic, localAddr, remoteAddr, netProto); addressEndpoint != nil {
				return makeRoute(
					netProto,
					"", /* gateway */
					localAddr,
					remoteAddr,
					nic, /* outboundNIC */
					nic, /* localAddressNIC */
					addressEndpoint,
					s.handleLocal,
					multicastLoop,
				), nil
			}
		}

		if isLoopback {
			return nil, &tcpip.ErrBadLocalAddress{}
		}
		return nil, &tcpip.ErrNetworkUnreachable{}
	}

	onlyGlobalAddresses := !header.IsV6LinkLocalUnicastAddress(localAddr) && !isLinkLocal

	// Find a route to the remote with the route table.
	var chosenRoute tcpip.Route
	if r := func() *Route {
		s.route.mu.RLock()
		defer s.route.mu.RUnlock()

		for _, route := range s.route.mu.table {
			if len(remoteAddr) != 0 && !route.Destination.Contains(remoteAddr) {
				continue
			}

			nic, ok := s.nics[route.NIC]
			if !ok || !nic.Enabled() {
				continue
			}

			if id == 0 || id == route.NIC {
				if addressEndpoint := s.getAddressEP(nic, localAddr, remoteAddr, netProto); addressEndpoint != nil {
					var gateway tcpip.Address
					if needRoute {
						gateway = route.Gateway
					}
					r := constructAndValidateRoute(netProto, addressEndpoint, nic /* localAddressNIC */, nic /* outgoingNIC */, gateway, localAddr, remoteAddr, s.handleLocal, multicastLoop)
					if r == nil {
						panic(fmt.Sprintf("non-forwarding route validation failed with route table entry = %#v, id = %d, localAddr = %s, remoteAddr = %s", route, id, localAddr, remoteAddr))
					}
					return r
				}
			}

			// If the stack has forwarding enabled and we haven't found a valid route
			// to the remote address yet, keep track of the first valid route. We
			// keep iterating because we prefer routes that let us use a local
			// address that is assigned to the outgoing interface. There is no
			// requirement to do this from any RFC but simply a choice made to better
			// follow a strong host model which the netstack follows at the time of
			// writing.
			if onlyGlobalAddresses && chosenRoute == (tcpip.Route{}) && isNICForwarding(nic, netProto) {
				chosenRoute = route
			}
		}

		return nil
	}(); r != nil {
		return r, nil
	}

	if chosenRoute != (tcpip.Route{}) {
		// At this point we know the stack has forwarding enabled since chosenRoute
		// is only set when forwarding is enabled.
		nic, ok := s.nics[chosenRoute.NIC]
		if !ok {
			// If the route's NIC was invalid, we should not have chosen the route.
			panic(fmt.Sprintf("chosen route must have a valid NIC with ID = %d", chosenRoute.NIC))
		}

		var gateway tcpip.Address
		if needRoute {
			gateway = chosenRoute.Gateway
		}

		// Use the specified NIC to get the local address endpoint.
		if id != 0 {
			if aNIC, ok := s.nics[id]; ok {
				if addressEndpoint := s.getAddressEP(aNIC, localAddr, remoteAddr, netProto); addressEndpoint != nil {
					if r := constructAndValidateRoute(netProto, addressEndpoint, aNIC /* localAddressNIC */, nic /* outgoingNIC */, gateway, localAddr, remoteAddr, s.handleLocal, multicastLoop); r != nil {
						return r, nil
					}
				}
			}

			return nil, &tcpip.ErrNoRoute{}
		}

		if id == 0 {
			// If an interface is not specified, try to find a NIC that holds the local
			// address endpoint to construct a route.
for _, aNIC := range s.nics { addressEndpoint := s.getAddressEP(aNIC, localAddr, remoteAddr, netProto) if addressEndpoint == nil { continue } if r := constructAndValidateRoute(netProto, addressEndpoint, aNIC /* localAddressNIC */, nic /* outgoingNIC */, gateway, localAddr, remoteAddr, s.handleLocal, multicastLoop); r != nil { return r, nil } } } } if needRoute { return nil, &tcpip.ErrNoRoute{} } if header.IsV6LoopbackAddress(remoteAddr) { return nil, &tcpip.ErrBadLocalAddress{} } return nil, &tcpip.ErrNetworkUnreachable{} } // CheckNetworkProtocol checks if a given network protocol is enabled in the // stack. func (s *Stack) CheckNetworkProtocol(protocol tcpip.NetworkProtocolNumber) bool { _, ok := s.networkProtocols[protocol] return ok } // CheckDuplicateAddress performs duplicate address detection for the address on // the specified interface. func (s *Stack) CheckDuplicateAddress(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, h DADCompletionHandler) (DADCheckAddressDisposition, tcpip.Error) { nic, ok := s.nics[nicID] if !ok { return 0, &tcpip.ErrUnknownNICID{} } return nic.checkDuplicateAddress(protocol, addr, h) } // CheckLocalAddress determines if the given local address exists, and if it // does, returns the id of the NIC it's bound to. Returns 0 if the address // does not exist. func (s *Stack) CheckLocalAddress(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID { s.mu.RLock() defer s.mu.RUnlock() // If a NIC is specified, we try to find the address there only. if nicID != 0 { nic, ok := s.nics[nicID] if !ok { return 0 } if nic.CheckLocalAddress(protocol, addr) { return nic.id } return 0 } // Go through all the NICs. for _, nic := range s.nics { if nic.CheckLocalAddress(protocol, addr) { return nic.id } } return 0 } // SetPromiscuousMode enables or disables promiscuous mode in the given NIC. func (s *Stack) SetPromiscuousMode(nicID tcpip.NICID, enable bool) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[nicID] if !ok { return &tcpip.ErrUnknownNICID{} } nic.setPromiscuousMode(enable) return nil } // SetSpoofing enables or disables address spoofing in the given NIC, allowing // endpoints to bind to any address in the NIC. func (s *Stack) SetSpoofing(nicID tcpip.NICID, enable bool) tcpip.Error { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[nicID] if !ok { return &tcpip.ErrUnknownNICID{} } nic.setSpoofing(enable) return nil } // LinkResolutionResult is the result of a link address resolution attempt. type LinkResolutionResult struct { LinkAddress tcpip.LinkAddress Err tcpip.Error } // GetLinkAddress finds the link address corresponding to a network address. // // Returns ErrNotSupported if the stack is not configured with a link address // resolver for the specified network protocol. // // Returns ErrWouldBlock if the link address is not readily available, along // with a notification channel for the caller to block on. Triggers address // resolution asynchronously. // // onResolve will be called either immediately, if resolution is not required, // or when address resolution is complete, with the resolved link address and // whether resolution succeeded. // // If specified, the local address must be an address local to the interface // the neighbor cache belongs to. The local address is the source address of // a packet prompting NUD/link address resolution. 
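//
// A hedged sketch of the asynchronous usage pattern (nicID, remoteAddr, and
// localAddr are assumed to be valid for the caller's setup):
// ch := make(chan stack.LinkResolutionResult, 1)
// err := s.GetLinkAddress(nicID, remoteAddr, localAddr, ipv4.ProtocolNumber, func(r stack.LinkResolutionResult) {
//   ch <- r
// })
// // A nil err means onResolve already ran; &tcpip.ErrWouldBlock{} means it
// // will run once resolution completes (or fails).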
func (s *Stack) GetLinkAddress(nicID tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, onResolve func(LinkResolutionResult)) tcpip.Error {
	s.mu.RLock()
	nic, ok := s.nics[nicID]
	s.mu.RUnlock()
	if !ok {
		return &tcpip.ErrUnknownNICID{}
	}

	return nic.getLinkAddress(addr, localAddr, protocol, onResolve)
}

// Neighbors returns all IP to MAC address associations.
func (s *Stack) Neighbors(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber) ([]NeighborEntry, tcpip.Error) {
	s.mu.RLock()
	nic, ok := s.nics[nicID]
	s.mu.RUnlock()

	if !ok {
		return nil, &tcpip.ErrUnknownNICID{}
	}

	return nic.neighbors(protocol)
}

// AddStaticNeighbor statically associates an IP address to a MAC address.
func (s *Stack) AddStaticNeighbor(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, linkAddr tcpip.LinkAddress) tcpip.Error {
	s.mu.RLock()
	nic, ok := s.nics[nicID]
	s.mu.RUnlock()

	if !ok {
		return &tcpip.ErrUnknownNICID{}
	}

	return nic.addStaticNeighbor(addr, protocol, linkAddr)
}

// RemoveNeighbor removes an IP to MAC address association previously created
// either automatically or by AddStaticNeighbor. Returns ErrBadAddress if
// there is no association with the provided address.
func (s *Stack) RemoveNeighbor(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.Error {
	s.mu.RLock()
	nic, ok := s.nics[nicID]
	s.mu.RUnlock()

	if !ok {
		return &tcpip.ErrUnknownNICID{}
	}

	return nic.removeNeighbor(protocol, addr)
}

// ClearNeighbors removes all IP to MAC address associations.
func (s *Stack) ClearNeighbors(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber) tcpip.Error {
	s.mu.RLock()
	nic, ok := s.nics[nicID]
	s.mu.RUnlock()

	if !ok {
		return &tcpip.ErrUnknownNICID{}
	}

	return nic.clearNeighbors(protocol)
}

// RegisterTransportEndpoint registers the given endpoint with the stack
// transport dispatcher. Received packets that match the provided id will be
// delivered to the given endpoint; specifying a nic is optional, but
// nic-specific IDs have precedence over global ones.
func (s *Stack) RegisterTransportEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error {
	return s.demux.registerEndpoint(netProtos, protocol, id, ep, flags, bindToDevice)
}

// CheckRegisterTransportEndpoint checks if an endpoint can be registered with
// the stack transport dispatcher.
func (s *Stack) CheckRegisterTransportEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) tcpip.Error {
	return s.demux.checkEndpoint(netProtos, protocol, id, flags, bindToDevice)
}

// UnregisterTransportEndpoint removes the endpoint with the given id from the
// stack transport dispatcher.
func (s *Stack) UnregisterTransportEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) {
	s.demux.unregisterEndpoint(netProtos, protocol, id, ep, flags, bindToDevice)
}

// StartTransportEndpointCleanup removes the endpoint with the given id from
// the stack transport dispatcher. It also transitions it to the cleanup stage.
func (s *Stack) StartTransportEndpointCleanup(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { s.cleanupEndpointsMu.Lock() s.cleanupEndpoints[ep] = struct{}{} s.cleanupEndpointsMu.Unlock() s.demux.unregisterEndpoint(netProtos, protocol, id, ep, flags, bindToDevice) } // CompleteTransportEndpointCleanup removes the endpoint from the cleanup // stage. func (s *Stack) CompleteTransportEndpointCleanup(ep TransportEndpoint) { s.cleanupEndpointsMu.Lock() delete(s.cleanupEndpoints, ep) s.cleanupEndpointsMu.Unlock() } // FindTransportEndpoint finds an endpoint that most closely matches the provided // id. If no endpoint is found it returns nil. func (s *Stack) FindTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, nicID tcpip.NICID) TransportEndpoint { return s.demux.findTransportEndpoint(netProto, transProto, id, nicID) } // RegisterRawTransportEndpoint registers the given endpoint with the stack // transport dispatcher. Received packets that match the provided transport // protocol will be delivered to the given endpoint. func (s *Stack) RegisterRawTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) tcpip.Error { return s.demux.registerRawEndpoint(netProto, transProto, ep) } // UnregisterRawTransportEndpoint removes the endpoint for the transport // protocol from the stack transport dispatcher. func (s *Stack) UnregisterRawTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) { s.demux.unregisterRawEndpoint(netProto, transProto, ep) } // RegisterRestoredEndpoint records e as an endpoint that has been restored on // this stack. func (s *Stack) RegisterRestoredEndpoint(e ResumableEndpoint) { s.mu.Lock() s.resumableEndpoints = append(s.resumableEndpoints, e) s.mu.Unlock() } // RegisteredEndpoints returns all endpoints which are currently registered. func (s *Stack) RegisteredEndpoints() []TransportEndpoint { s.mu.Lock() defer s.mu.Unlock() var es []TransportEndpoint for _, e := range s.demux.protocol { es = append(es, e.transportEndpoints()...) } return es } // CleanupEndpoints returns endpoints currently in the cleanup state. func (s *Stack) CleanupEndpoints() []TransportEndpoint { s.cleanupEndpointsMu.Lock() es := make([]TransportEndpoint, 0, len(s.cleanupEndpoints)) for e := range s.cleanupEndpoints { es = append(es, e) } s.cleanupEndpointsMu.Unlock() return es } // RestoreCleanupEndpoints adds endpoints to cleanup tracking. This is useful // for restoring a stack after a save. func (s *Stack) RestoreCleanupEndpoints(es []TransportEndpoint) { s.cleanupEndpointsMu.Lock() for _, e := range es { s.cleanupEndpoints[e] = struct{}{} } s.cleanupEndpointsMu.Unlock() } // Close closes all currently registered transport endpoints. // // Endpoints created or modified during this call may not get closed. func (s *Stack) Close() { for _, e := range s.RegisteredEndpoints() { e.Abort() } for _, p := range s.transportProtocols { p.proto.Close() } for _, p := range s.networkProtocols { p.Close() } } // Wait waits for all transport and link endpoints to halt their worker // goroutines. // // Endpoints created or modified during this call may not get waited on. // // Note that link endpoints must be stopped via an implementation specific // mechanism. 
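//
// e.g. a typical shutdown sequence, sketched for illustration:
// s.Close() // Abort all registered endpoints.
// s.Wait()  // Block until their worker goroutines have exited.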
func (s *Stack) Wait() { for _, e := range s.RegisteredEndpoints() { e.Wait() } for _, e := range s.CleanupEndpoints() { e.Wait() } for _, p := range s.transportProtocols { p.proto.Wait() } for _, p := range s.networkProtocols { p.Wait() } s.mu.RLock() defer s.mu.RUnlock() for _, n := range s.nics { n.LinkEndpoint.Wait() } } // Resume restarts the stack after a restore. This must be called after the // entire system has been restored. func (s *Stack) Resume() { // ResumableEndpoint.Resume() may call other methods on s, so we can't hold // s.mu while resuming the endpoints. s.mu.Lock() eps := s.resumableEndpoints s.resumableEndpoints = nil s.mu.Unlock() for _, e := range eps { e.Resume(s) } } // RegisterPacketEndpoint registers ep with the stack, causing it to receive // all traffic of the specified netProto on the given NIC. If nicID is 0, it // receives traffic from every NIC. func (s *Stack) RegisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) tcpip.Error { s.mu.Lock() defer s.mu.Unlock() // If no NIC is specified, capture on all devices. if nicID == 0 { // Register with each NIC. for _, nic := range s.nics { if err := nic.registerPacketEndpoint(netProto, ep); err != nil { s.unregisterPacketEndpointLocked(0, netProto, ep) return err } } return nil } // Capture on a specific device. nic, ok := s.nics[nicID] if !ok { return &tcpip.ErrUnknownNICID{} } if err := nic.registerPacketEndpoint(netProto, ep); err != nil { return err } return nil } // UnregisterPacketEndpoint unregisters ep for packets of the specified // netProto from the specified NIC. If nicID is 0, ep is unregistered from all // NICs. func (s *Stack) UnregisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) { s.mu.Lock() defer s.mu.Unlock() s.unregisterPacketEndpointLocked(nicID, netProto, ep) } func (s *Stack) unregisterPacketEndpointLocked(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) { // If no NIC is specified, unregister on all devices. if nicID == 0 { // Unregister with each NIC. for _, nic := range s.nics { nic.unregisterPacketEndpoint(netProto, ep) } return } // Unregister in a single device. nic, ok := s.nics[nicID] if !ok { return } nic.unregisterPacketEndpoint(netProto, ep) } // WritePacketToRemote writes a payload on the specified NIC using the provided // network protocol and remote link address. func (s *Stack) WritePacketToRemote(nicID tcpip.NICID, remote tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) tcpip.Error { s.mu.Lock() nic, ok := s.nics[nicID] s.mu.Unlock() if !ok { return &tcpip.ErrUnknownDevice{} } pkt := NewPacketBuffer(PacketBufferOptions{ ReserveHeaderBytes: int(nic.MaxHeaderLength()), Data: payload, }) return nic.WritePacketToRemote(remote, netProto, pkt) } // NetworkProtocolInstance returns the protocol instance in the stack for the // specified network protocol. This method is public for protocol implementers // and tests to use. func (s *Stack) NetworkProtocolInstance(num tcpip.NetworkProtocolNumber) NetworkProtocol { if p, ok := s.networkProtocols[num]; ok { return p } return nil } // TransportProtocolInstance returns the protocol instance in the stack for the // specified transport protocol. This method is public for protocol implementers // and tests to use. 
func (s *Stack) TransportProtocolInstance(num tcpip.TransportProtocolNumber) TransportProtocol {
	if pState, ok := s.transportProtocols[num]; ok {
		return pState.proto
	}
	return nil
}

// AddTCPProbe installs a probe function that will be invoked on every segment
// received by a given TCP endpoint. The probe function is passed a copy of the
// TCP endpoint state before and after processing of the segment.
//
// NOTE: TCPProbe is added only to endpoints created after this call. Endpoints
// created prior to this call will not call the probe function.
//
// Further, installing two different probes back to back can result in some
// endpoints calling the first one and some the second one. There is no
// guarantee provided on which probe will be invoked. Ideally this should only
// be called once per stack.
func (s *Stack) AddTCPProbe(probe TCPProbeFunc) {
	s.tcpProbeFunc.Store(probe)
}

// GetTCPProbe returns the TCPProbeFunc if installed with AddTCPProbe, nil
// otherwise.
func (s *Stack) GetTCPProbe() TCPProbeFunc {
	p := s.tcpProbeFunc.Load()
	if p == nil {
		return nil
	}
	return p.(TCPProbeFunc)
}

// RemoveTCPProbe removes an installed TCP probe.
//
// NOTE: This only ensures that endpoints created after this call do not
// have a probe attached. Endpoints already created will continue to invoke
// the TCP probe.
func (s *Stack) RemoveTCPProbe() {
	// This must be TCPProbeFunc(nil) because atomic.Value.Store(nil) panics.
	s.tcpProbeFunc.Store(TCPProbeFunc(nil))
}

// JoinGroup joins the given multicast group on the given NIC.
func (s *Stack) JoinGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) tcpip.Error {
	s.mu.RLock()
	defer s.mu.RUnlock()

	if nic, ok := s.nics[nicID]; ok {
		return nic.joinGroup(protocol, multicastAddr)
	}
	return &tcpip.ErrUnknownNICID{}
}

// LeaveGroup leaves the given multicast group on the given NIC.
func (s *Stack) LeaveGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) tcpip.Error {
	s.mu.RLock()
	defer s.mu.RUnlock()

	if nic, ok := s.nics[nicID]; ok {
		return nic.leaveGroup(protocol, multicastAddr)
	}
	return &tcpip.ErrUnknownNICID{}
}

// IsInGroup returns true if the NIC with ID nicID has joined the multicast
// group multicastAddr.
func (s *Stack) IsInGroup(nicID tcpip.NICID, multicastAddr tcpip.Address) (bool, tcpip.Error) {
	s.mu.RLock()
	defer s.mu.RUnlock()

	if nic, ok := s.nics[nicID]; ok {
		return nic.isInGroup(multicastAddr), nil
	}
	return false, &tcpip.ErrUnknownNICID{}
}

// IPTables returns the stack's iptables.
func (s *Stack) IPTables() *IPTables {
	return s.tables
}

// ICMPLimit returns the maximum number of ICMP messages that can be sent
// in one second.
func (s *Stack) ICMPLimit() rate.Limit {
	return s.icmpRateLimiter.Limit()
}

// SetICMPLimit sets the maximum number of ICMP messages that can be sent
// in one second.
func (s *Stack) SetICMPLimit(newLimit rate.Limit) {
	s.icmpRateLimiter.SetLimit(newLimit)
}

// ICMPBurst returns the maximum number of ICMP messages that can be sent
// in a single burst.
func (s *Stack) ICMPBurst() int {
	return s.icmpRateLimiter.Burst()
}

// SetICMPBurst sets the maximum number of ICMP messages that can be sent
// in a single burst.
func (s *Stack) SetICMPBurst(burst int) {
	s.icmpRateLimiter.SetBurst(burst)
}

// AllowICMPMessage returns true if the rate limiter allows at least one
// ICMP message to be sent at this instant.
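//
// e.g. (a hypothetical caller deciding whether to emit an ICMP error):
// if !s.AllowICMPMessage() {
//   return // Drop the message rather than exceed the rate limit.
// }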
func (s *Stack) AllowICMPMessage() bool { return s.icmpRateLimiter.Allow() } // GetNetworkEndpoint returns the NetworkEndpoint with the specified protocol // number installed on the specified NIC. func (s *Stack) GetNetworkEndpoint(nicID tcpip.NICID, proto tcpip.NetworkProtocolNumber) (NetworkEndpoint, tcpip.Error) { s.mu.Lock() defer s.mu.Unlock() nic, ok := s.nics[nicID] if !ok { return nil, &tcpip.ErrUnknownNICID{} } return nic.getNetworkEndpoint(proto), nil } // NUDConfigurations gets the per-interface NUD configurations. func (s *Stack) NUDConfigurations(id tcpip.NICID, proto tcpip.NetworkProtocolNumber) (NUDConfigurations, tcpip.Error) { s.mu.RLock() nic, ok := s.nics[id] s.mu.RUnlock() if !ok { return NUDConfigurations{}, &tcpip.ErrUnknownNICID{} } return nic.nudConfigs(proto) } // SetNUDConfigurations sets the per-interface NUD configurations. // // Note, if c contains invalid NUD configuration values, it will be fixed to // use default values for the erroneous values. func (s *Stack) SetNUDConfigurations(id tcpip.NICID, proto tcpip.NetworkProtocolNumber, c NUDConfigurations) tcpip.Error { s.mu.RLock() nic, ok := s.nics[id] s.mu.RUnlock() if !ok { return &tcpip.ErrUnknownNICID{} } return nic.setNUDConfigs(proto, c) } // Seed returns a 32 bit value that can be used as a seed value for port // picking, ISN generation etc. // // NOTE: The seed is generated once during stack initialization only. func (s *Stack) Seed() uint32 { return s.seed } // Rand returns a reference to a pseudo random generator that can be used // to generate random numbers as required. func (s *Stack) Rand() *rand.Rand { return s.randomGenerator } // SecureRNG returns the stack's cryptographically secure random number // generator. func (s *Stack) SecureRNG() io.Reader { return s.secureRNG } // FindNICNameFromID returns the name of the NIC for the given NICID. func (s *Stack) FindNICNameFromID(id tcpip.NICID) string { s.mu.RLock() defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { return "" } return nic.Name() } // NewJob returns a new tcpip.Job using the stack's clock. func (s *Stack) NewJob(l sync.Locker, f func()) *tcpip.Job { return tcpip.NewJob(s.clock, l, f) } // ParseResult indicates the result of a parsing attempt. type ParseResult int const ( // ParsedOK indicates that a packet was successfully parsed. ParsedOK ParseResult = iota // UnknownTransportProtocol indicates that the transport protocol is unknown. UnknownTransportProtocol // TransportLayerParseError indicates that the transport packet was not // successfully parsed. TransportLayerParseError ) // ParsePacketBufferTransport parses the provided packet buffer's transport // header. func (s *Stack) ParsePacketBufferTransport(protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) ParseResult { // ICMP packets don't have their TransportHeader fields set yet, parse it // here. See icmp/protocol.go:protocol.Parse for a full explanation. if protocol == header.ICMPv4ProtocolNumber || protocol == header.ICMPv6ProtocolNumber { return ParsedOK } pkt.TransportProtocolNumber = protocol // Parse the transport header if present. state, ok := s.transportProtocols[protocol] if !ok { return UnknownTransportProtocol } if !state.proto.Parse(pkt) { return TransportLayerParseError } return ParsedOK } // networkProtocolNumbers returns the network protocol numbers the stack is // configured with. 
func (s *Stack) networkProtocolNumbers() []tcpip.NetworkProtocolNumber { protos := make([]tcpip.NetworkProtocolNumber, 0, len(s.networkProtocols)) for p := range s.networkProtocols { protos = append(protos, p) } return protos } func isSubnetBroadcastOnNIC(nic *nic, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { addressEndpoint := nic.getAddressOrCreateTempInner(protocol, addr, false /* createTemp */, NeverPrimaryEndpoint) if addressEndpoint == nil { return false } subnet := addressEndpoint.Subnet() addressEndpoint.DecRef() return subnet.IsBroadcast(addr) } // IsSubnetBroadcast returns true if the provided address is a subnet-local // broadcast address on the specified NIC and protocol. // // Returns false if the NIC is unknown or if the protocol is unknown or does // not support addressing. // // If the NIC is not specified, the stack will check all NICs. func (s *Stack) IsSubnetBroadcast(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) bool { s.mu.RLock() defer s.mu.RUnlock() if nicID != 0 { nic, ok := s.nics[nicID] if !ok { return false } return isSubnetBroadcastOnNIC(nic, protocol, addr) } for _, nic := range s.nics { if isSubnetBroadcastOnNIC(nic, protocol, addr) { return true } } return false }
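// Illustrative sketch (not part of the original source): how the multicast
// membership APIs above compose. The NIC ID and group address here are
// assumptions chosen for the example; each call returns
// *tcpip.ErrUnknownNICID if the NIC does not exist.
func exampleMulticastMembership(s *Stack) tcpip.Error {
    const nicID = tcpip.NICID(1)
    group := tcpip.Address("\xe0\x00\x00\xfb") // 224.0.0.251 (mDNS), example only.

    if err := s.JoinGroup(header.IPv4ProtocolNumber, nicID, group); err != nil {
        return err
    }
    // IsInGroup reports membership; it only fails for an unknown NIC.
    if isIn, err := s.IsInGroup(nicID, group); err != nil {
        return err
    } else if isIn {
        return s.LeaveGroup(header.IPv4ProtocolNumber, nicID, group)
    }
    return nil
}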
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package netfilter helps the sentry interact with netstack's netfilter // capabilities. package netfilter import ( "bytes" "errors" "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // enableLogging controls whether to log the (de)serialization of netfilter // structs between userspace and netstack. These logs are useful when // developing iptables, but can pollute sentry logs otherwise. const enableLogging = false // nflog logs messages related to the writing and reading of iptables. func nflog(format string, args ...interface{}) { if enableLogging && log.IsLogging(log.Debug) { log.Debugf("netfilter: "+format, args...) } } // Table names. const ( natTable = "nat" mangleTable = "mangle" filterTable = "filter" ) // nameToID is immutable. var nameToID = map[string]stack.TableID{ natTable: stack.NATID, mangleTable: stack.MangleID, filterTable: stack.FilterID, } // DefaultLinuxTables returns the rules of stack.DefaultTables() wrapped for // compatibility with netfilter extensions.
func DefaultLinuxTables(seed uint32) *stack.IPTables { tables := stack.DefaultTables(seed) tables.VisitTargets(func(oldTarget stack.Target) stack.Target { switch val := oldTarget.(type) { case *stack.AcceptTarget: return &acceptTarget{AcceptTarget: *val} case *stack.DropTarget: return &dropTarget{DropTarget: *val} case *stack.ErrorTarget: return &errorTarget{ErrorTarget: *val} case *stack.UserChainTarget: return &userChainTarget{UserChainTarget: *val} case *stack.ReturnTarget: return &returnTarget{ReturnTarget: *val} case *stack.RedirectTarget: return &redirectTarget{RedirectTarget: *val} default: panic(fmt.Sprintf("Unknown rule in default iptables of type %T", val)) } }) return tables } // GetInfo returns information about iptables. func GetInfo(t *kernel.Task, stack *stack.Stack, outPtr hostarch.Addr, ipv6 bool) (linux.IPTGetinfo, *syserr.Error) { // Read in the struct and table name. var info linux.IPTGetinfo if _, err := info.CopyIn(t, outPtr); err != nil { return linux.IPTGetinfo{}, syserr.FromError(err) } var err error if ipv6 { _, info, err = convertNetstackToBinary6(stack, info.Name) } else { _, info, err = convertNetstackToBinary4(stack, info.Name) } if err != nil { nflog("couldn't convert iptables: %v", err) return linux.IPTGetinfo{}, syserr.ErrInvalidArgument } nflog("returning info: %+v", info) return info, nil } // GetEntries4 returns netstack's iptables rules. func GetEntries4(t *kernel.Task, stack *stack.Stack, outPtr hostarch.Addr, outLen int) (linux.KernelIPTGetEntries, *syserr.Error) { // Read in the struct and table name. var userEntries linux.IPTGetEntries if _, err := userEntries.CopyIn(t, outPtr); err != nil { nflog("couldn't copy in entries %q", userEntries.Name) return linux.KernelIPTGetEntries{}, syserr.FromError(err) } // Convert netstack's iptables rules to something that the iptables // tool can understand. entries, _, err := convertNetstackToBinary4(stack, userEntries.Name) if err != nil { nflog("couldn't read entries: %v", err) return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument } if entries.SizeBytes() > outLen { nflog("insufficient GetEntries output size: %d", uintptr(outLen)) return linux.KernelIPTGetEntries{}, syserr.ErrInvalidArgument } return entries, nil } // GetEntries6 returns netstack's ip6tables rules. func GetEntries6(t *kernel.Task, stack *stack.Stack, outPtr hostarch.Addr, outLen int) (linux.KernelIP6TGetEntries, *syserr.Error) { // Read in the struct and table name. IPv4 and IPv6 utilize structs // with the same layout. var userEntries linux.IPTGetEntries if _, err := userEntries.CopyIn(t, outPtr); err != nil { nflog("couldn't copy in entries %q", userEntries.Name) return linux.KernelIP6TGetEntries{}, syserr.FromError(err) } // Convert netstack's iptables rules to something that the iptables // tool can understand. entries, _, err := convertNetstackToBinary6(stack, userEntries.Name) if err != nil { nflog("couldn't read entries: %v", err) return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument } if entries.SizeBytes() > outLen { nflog("insufficient GetEntries output size: %d", uintptr(outLen)) return linux.KernelIP6TGetEntries{}, syserr.ErrInvalidArgument } return entries, nil } // setHooksAndUnderflow checks whether the rule at ruleIdx is a hook entrypoint // or underflow, in which case it fills in info.HookEntry and info.Underflows. func setHooksAndUnderflow(info *linux.IPTGetinfo, table stack.Table, offset uint32, ruleIdx int) { // Is this a chain entry point? 
for hook, hookRuleIdx := range table.BuiltinChains { if hookRuleIdx == ruleIdx { nflog("convert to binary: found hook %d at offset %d", hook, offset) info.HookEntry[hook] = offset } } // Is this a chain underflow point? for underflow, underflowRuleIdx := range table.Underflows { if underflowRuleIdx == ruleIdx { nflog("convert to binary: found underflow %d at offset %d", underflow, offset) info.Underflow[underflow] = offset } } } // SetEntries sets iptables rules for a single table. See // net/ipv4/netfilter/ip_tables.c:translate_table for reference. func SetEntries(task *kernel.Task, stk *stack.Stack, optVal []byte, ipv6 bool) *syserr.Error { var replace linux.IPTReplace replaceBuf := optVal[:linux.SizeOfIPTReplace] optVal = optVal[linux.SizeOfIPTReplace:] replace.UnmarshalBytes(replaceBuf) var table stack.Table switch replace.Name.String() { case filterTable: table = stack.EmptyFilterTable() case natTable: table = stack.EmptyNATTable() default: nflog("unknown iptables table %q", replace.Name.String()) return syserr.ErrInvalidArgument } var err *syserr.Error var offsets map[uint32]int if ipv6 { offsets, err = modifyEntries6(task, stk, optVal, &replace, &table) } else { offsets, err = modifyEntries4(task, stk, optVal, &replace, &table) } if err != nil { return err } // Go through the list of supported hooks for this table and, for each // one, set the rule it corresponds to. for hook := range replace.HookEntry { if table.ValidHooks()&(1<<hook) != 0 { hk := hookFromLinux(hook) table.BuiltinChains[hk] = stack.HookUnset table.Underflows[hk] = stack.HookUnset for offset, ruleIdx := range offsets { if offset == replace.HookEntry[hook] { table.BuiltinChains[hk] = ruleIdx } if offset == replace.Underflow[hook] { if !validUnderflow(table.Rules[ruleIdx], ipv6) { nflog("underflow for hook %d isn't an unconditional ACCEPT or DROP: %+v", hook, table.Rules[ruleIdx]) return syserr.ErrInvalidArgument } table.Underflows[hk] = ruleIdx } } if ruleIdx := table.BuiltinChains[hk]; ruleIdx == stack.HookUnset { nflog("hook %v is unset.", hk) return syserr.ErrInvalidArgument } if ruleIdx := table.Underflows[hk]; ruleIdx == stack.HookUnset { nflog("underflow %v is unset.", hk) return syserr.ErrInvalidArgument } } } // Check the user chains. for ruleIdx, rule := range table.Rules { if _, ok := rule.Target.(*stack.UserChainTarget); !ok { continue } // We found a user chain. Before inserting it into the table, // check that: // - There's some other rule after it. // - There are no matchers. if ruleIdx == len(table.Rules)-1 { nflog("user chain must have a rule or default policy") return syserr.ErrInvalidArgument } if len(table.Rules[ruleIdx].Matchers) != 0 { nflog("user chain's first node must have no matchers") return syserr.ErrInvalidArgument } } // Set each jump to point to the appropriate rule. Right now they hold byte // offsets. for ruleIdx, rule := range table.Rules { jump, ok := rule.Target.(*JumpTarget) if !ok { continue } // Find the rule corresponding to the jump rule offset. jumpTo, ok := offsets[jump.Offset] if !ok { nflog("failed to find a rule to jump to") return syserr.ErrInvalidArgument } jump.RuleNum = jumpTo rule.Target = jump table.Rules[ruleIdx] = rule } // Since we don't support FORWARD yet, make sure any FORWARD chain points to // an unconditional ACCEPT rule.
for hook, ruleIdx := range table.BuiltinChains { if hook := stack.Hook(hook); hook == stack.Forward { if ruleIdx == stack.HookUnset { continue } if !isUnconditionalAccept(table.Rules[ruleIdx], ipv6) { nflog("hook %d is unsupported.", hook) return syserr.ErrInvalidArgument } } } // TODO(gvisor.dev/issue/6167): Check the following conditions: // - There are no loops. // - There are no chains without an unconditional final rule. // - There are no chains without an unconditional underflow rule. return syserr.TranslateNetstackError(stk.IPTables().ReplaceTable(nameToID[replace.Name.String()], table, ipv6)) } // parseMatchers parses 0 or more matchers from optVal. optVal should contain // only the matchers. func parseMatchers(task *kernel.Task, filter stack.IPHeaderFilter, optVal []byte) ([]stack.Matcher, error) { nflog("set entries: parsing matchers of size %d", len(optVal)) var matchers []stack.Matcher for len(optVal) > 0 { nflog("set entries: optVal has len %d", len(optVal)) // Get the XTEntryMatch. if len(optVal) < linux.SizeOfXTEntryMatch { return nil, fmt.Errorf("optVal has insufficient size for entry match: %d", len(optVal)) } var match linux.XTEntryMatch buf := optVal[:match.SizeBytes()] match.UnmarshalUnsafe(buf) nflog("set entries: parsed entry match %q: %+v", match.Name.String(), match) // Check some invariants. if match.MatchSize < linux.SizeOfXTEntryMatch { return nil, fmt.Errorf("match size is too small, must be at least %d", linux.SizeOfXTEntryMatch) } if len(optVal) < int(match.MatchSize) { return nil, fmt.Errorf("optVal has insufficient size for match: %d", len(optVal)) } // Parse the specific matcher. matcher, err := unmarshalMatcher(task, match, filter, optVal[linux.SizeOfXTEntryMatch:match.MatchSize]) if err != nil { return nil, fmt.Errorf("failed to create matcher: %v", err) } matchers = append(matchers, matcher) // TODO(gvisor.dev/issue/6167): Check the revision field. optVal = optVal[match.MatchSize:] } if len(optVal) != 0 { return nil, errors.New("optVal should be exhausted after parsing matchers") } return matchers, nil } func validUnderflow(rule stack.Rule, ipv6 bool) bool { if len(rule.Matchers) != 0 { return false } if (ipv6 && rule.Filter != emptyIPv6Filter) || (!ipv6 && rule.Filter != emptyIPv4Filter) { return false } switch rule.Target.(type) { case *acceptTarget, *dropTarget: return true default: return false } } func isUnconditionalAccept(rule stack.Rule, ipv6 bool) bool { if !validUnderflow(rule, ipv6) { return false } _, ok := rule.Target.(*acceptTarget) return ok } func hookFromLinux(hook int) stack.Hook { switch hook { case linux.NF_INET_PRE_ROUTING: return stack.Prerouting case linux.NF_INET_LOCAL_IN: return stack.Input case linux.NF_INET_FORWARD: return stack.Forward case linux.NF_INET_LOCAL_OUT: return stack.Output case linux.NF_INET_POST_ROUTING: return stack.Postrouting } panic(fmt.Sprintf("Unknown hook %d does not correspond to a builtin chain", hook)) } // TargetRevision returns a linux.XTGetRevision for a given target. It sets // Revision to the highest supported value, unless the provided revision number // is larger. func TargetRevision(t *kernel.Task, revPtr hostarch.Addr, netProto tcpip.NetworkProtocolNumber) (linux.XTGetRevision, *syserr.Error) { // Read in the target name and version. 
var rev linux.XTGetRevision if _, err := rev.CopyIn(t, revPtr); err != nil { return linux.XTGetRevision{}, syserr.FromError(err) } maxSupported, ok := targetRevision(rev.Name.String(), netProto, rev.Revision) if !ok { return linux.XTGetRevision{}, syserr.ErrProtocolNotSupported } rev.Revision = maxSupported return rev, nil } func trimNullBytes(b []byte) []byte { n := bytes.IndexByte(b, 0) if n == -1 { n = len(b) } return b[:n] }
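// Illustrative sketch (not part of the original source): trimNullBytes
// treats its input as a NUL-terminated C string, which is how the fixed-size
// name fields copied in from userspace (e.g. table names) are decoded.
func exampleTrimNullBytes() {
    raw := []byte{'n', 'a', 't', 0, 0, 0}  // A fixed-width field from userspace.
    name := string(trimNullBytes(raw))     // "nat": everything after the first NUL drops.
    _ = nameToID[name]                     // Resolves to stack.NATID.
}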
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package hostarch import "golang.org/x/sys/unix" // AccessType specifies memory access types. This is used for // setting mapping permissions, as well as communicating faults. // // +stateify savable type AccessType struct { // Read is read access. Read bool // Write is write access. Write bool // Execute is executable access. Execute bool } // String returns a pretty representation of access. This looks like the // familiar r-x, rw-, etc. and can be relied on as such. func (a AccessType) String() string { bits := [3]byte{'-', '-', '-'} if a.Read { bits[0] = 'r' } if a.Write { bits[1] = 'w' } if a.Execute { bits[2] = 'x' } return string(bits[:]) } // Any returns true iff at least one of Read, Write or Execute is true. func (a AccessType) Any() bool { return a.Read || a.Write || a.Execute } // Prot returns the system prot (unix.PROT_READ, etc.) for this access. func (a AccessType) Prot() int { var prot int if a.Read { prot |= unix.PROT_READ } if a.Write { prot |= unix.PROT_WRITE } if a.Execute { prot |= unix.PROT_EXEC } return prot } // SupersetOf returns true iff the access types in a are a superset of the // access types in other. func (a AccessType) SupersetOf(other AccessType) bool { if !a.Read && other.Read { return false } if !a.Write && other.Write { return false } if !a.Execute && other.Execute { return false } return true } // Intersect returns the access types set in both a and other. func (a AccessType) Intersect(other AccessType) AccessType { return AccessType{ Read: a.Read && other.Read, Write: a.Write && other.Write, Execute: a.Execute && other.Execute, } } // Union returns the access types set in either a or other. func (a AccessType) Union(other AccessType) AccessType { return AccessType{ Read: a.Read || other.Read, Write: a.Write || other.Write, Execute: a.Execute || other.Execute, } } // Effective returns the set of effective access types allowed by a, even if // some types are not explicitly allowed. func (a AccessType) Effective() AccessType { // In Linux, Write and Execute access generally imply Read access. See // mm/mmap.c:protection_map. // // The notable exception is get_user_pages, which only checks against // the original vma flags. That said, most user memory accesses do not // use GUP. if a.Write || a.Execute { a.Read = true } return a } // Convenient access types.
var ( NoAccess = AccessType{} Read = AccessType{Read: true} Write = AccessType{Write: true} Execute = AccessType{Execute: true} ReadWrite = AccessType{Read: true, Write: true} AnyAccess = AccessType{Read: true, Write: true, Execute: true} )
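// Illustrative sketch (not part of the original source): how the AccessType
// helpers above compose. All names are defined in this file.
func exampleAccessType() {
    a := Execute.Effective()       // Execute implies Read, so a is r-x.
    _ = a.String()                 // "r-x"
    _ = a.Prot()                   // unix.PROT_READ | unix.PROT_EXEC
    _ = ReadWrite.SupersetOf(Read) // true: rw- covers r--.
    _ = Read.Union(Write)          // Equivalent to ReadWrite.
}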
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "io" "sync" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" ) // kcovAreaSizeMax is the maximum number of uint64 entries allowed in the kcov // area. On Linux, the maximum is INT_MAX / 8. const kcovAreaSizeMax = 10 * 1024 * 1024 // Kcov provides kernel coverage data to userspace through a memory-mapped // region, as kcov does in Linux. // // To give the illusion that the data is always up to date, we update the shared // memory every time before we return to userspace. type Kcov struct { // mfp provides application memory. It is immutable after creation. mfp pgalloc.MemoryFileProvider // mu protects all of the fields below. mu sync.RWMutex // mode is the current kcov mode. mode uint8 // size is the size of the mapping through which the kernel conveys coverage // information to userspace. size uint64 // owningTask is the task that currently owns coverage data on the system. The // interface for kcov essentially requires that coverage is only going to a // single task. Note that kcov should only generate coverage data for the // owning task, but we currently generate global coverage. owningTask *Task // count is a locally cached version of the first uint64 in the kcov data, // which is the number of subsequent entries representing PCs.
// // It is used with kcovInode.countBlock(), to copy in/out the first element of // the actual data in an efficient manner, avoid boilerplate, and prevent // accidental garbage escapes by the temporary counts. count uint64 mappable *mm.SpecialMappable } // NewKcov creates and returns a Kcov instance. func (k *Kernel) NewKcov() *Kcov { return &Kcov{ mfp: k, } } var coveragePool = sync.Pool{ New: func() interface{} { return make([]byte, 0) }, } // TaskWork implements TaskWorker.TaskWork. func (kcov *Kcov) TaskWork(t *Task) { kcov.mu.Lock() defer kcov.mu.Unlock() if kcov.mode != linux.KCOV_MODE_TRACE_PC { return } rw := &kcovReadWriter{ mf: kcov.mfp.MemoryFile(), fr: kcov.mappable.FileRange(), } // Read in the PC count. if _, err := safemem.ReadFullToBlocks(rw, kcov.countBlock()); err != nil { panic(fmt.Sprintf("Internal error reading count from kcov area: %v", err)) } rw.off = 8 * (1 + kcov.count) n := coverage.ConsumeCoverageData(&kcovIOWriter{rw}) // Update the pc count, based on the number of entries written. Note that if // we reached the end of the kcov area, we may not have written everything in // output. kcov.count += uint64(n / 8) rw.off = 0 if _, err := safemem.WriteFullFromBlocks(rw, kcov.countBlock()); err != nil { panic(fmt.Sprintf("Internal error writing count to kcov area: %v", err)) } // Re-register for future work. t.RegisterWork(kcov) } // InitTrace performs the KCOV_INIT_TRACE ioctl. func (kcov *Kcov) InitTrace(size uint64) error { kcov.mu.Lock() defer kcov.mu.Unlock() if kcov.mode != linux.KCOV_MODE_DISABLED { return linuxerr.EBUSY } // To simplify all the logic around mapping, we require that the length of the // shared region is a multiple of the system page size. if (8*size)&(hostarch.PageSize-1) != 0 { return linuxerr.EINVAL } // We need space for at least two uint64s to hold current position and a // single PC. if size < 2 || size > kcovAreaSizeMax { return linuxerr.EINVAL } kcov.size = size kcov.mode = linux.KCOV_MODE_INIT return nil } // EnableTrace performs the KCOV_ENABLE_TRACE ioctl. func (kcov *Kcov) EnableTrace(ctx context.Context, traceKind uint8) error { t := TaskFromContext(ctx) if t == nil { panic("kcovInode.EnableTrace() cannot be used outside of a task goroutine") } kcov.mu.Lock() defer kcov.mu.Unlock() // KCOV_ENABLE must be preceded by KCOV_INIT_TRACE and an mmap call. if kcov.mode != linux.KCOV_MODE_INIT || kcov.mappable == nil { return linuxerr.EINVAL } switch traceKind { case linux.KCOV_TRACE_PC: kcov.mode = linux.KCOV_MODE_TRACE_PC case linux.KCOV_TRACE_CMP: // We do not support KCOV_MODE_TRACE_CMP. return linuxerr.ENOTSUP default: return linuxerr.EINVAL } if kcov.owningTask != nil && kcov.owningTask != t { return linuxerr.EBUSY } kcov.owningTask = t t.SetKcov(kcov) t.RegisterWork(kcov) // Clear existing coverage data; the task expects to read only coverage data // from the time it is activated. coverage.ClearCoverageData() return nil } // DisableTrace performs the KCOV_DISABLE_TRACE ioctl. func (kcov *Kcov) DisableTrace(ctx context.Context) error { kcov.mu.Lock() defer kcov.mu.Unlock() t := TaskFromContext(ctx) if t == nil { panic("kcovInode.DisableTrace() cannot be used outside of a task goroutine") } if t != kcov.owningTask { return linuxerr.EINVAL } kcov.mode = linux.KCOV_MODE_INIT kcov.owningTask = nil if kcov.mappable != nil { kcov.mappable.DecRef(ctx) kcov.mappable = nil } return nil } // Clear resets the mode and clears the owning task and memory mapping for kcov. // It is called when the fd corresponding to kcov is closed.
Note that the mode // needs to be set so that the next call to kcov.TaskWork() will exit early. func (kcov *Kcov) Clear(ctx context.Context) { kcov.mu.Lock() kcov.mode = linux.KCOV_MODE_INIT kcov.owningTask = nil if kcov.mappable != nil { kcov.mappable.DecRef(ctx) kcov.mappable = nil } kcov.mu.Unlock() } // OnTaskExit is called when the owning task exits. It is similar to // kcov.Clear(), except the memory mapping is not cleared, so that the same // mapping can be used in the future if kcov is enabled again by another task. func (kcov *Kcov) OnTaskExit() { kcov.mu.Lock() kcov.mode = linux.KCOV_MODE_INIT kcov.owningTask = nil kcov.mu.Unlock() } // ConfigureMMap is called by the vfs.FileDescription for this kcov instance to // implement vfs.FileDescription.ConfigureMMap. func (kcov *Kcov) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { kcov.mu.Lock() defer kcov.mu.Unlock() if kcov.mode != linux.KCOV_MODE_INIT { return linuxerr.EINVAL } if kcov.mappable == nil { // Set up the kcov area. fr, err := kcov.mfp.MemoryFile().Allocate(kcov.size*8, usage.Anonymous) if err != nil { return err } // Get the thread id for the mmap name. t := TaskFromContext(ctx) if t == nil { panic("TaskFromContext returned nil") } // For convenience, a special mappable is used here. Note that these mappings // will look different under /proc/[pid]/maps than they do on Linux. kcov.mappable = mm.NewSpecialMappable(fmt.Sprintf("[kcov:%d]", t.ThreadID()), kcov.mfp, fr) } kcov.mappable.IncRef() opts.Mappable = kcov.mappable opts.MappingIdentity = kcov.mappable return nil } // kcovReadWriter implements safemem.Reader and safemem.Writer. type kcovReadWriter struct { off uint64 mf *pgalloc.MemoryFile fr memmap.FileRange } // ReadToBlocks implements safemem.Reader.ReadToBlocks. func (rw *kcovReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) { if dsts.IsEmpty() { return 0, nil } // Limit the read to the kcov range and check for overflow. if rw.fr.Length() <= rw.off { return 0, io.EOF } start := rw.fr.Start + rw.off end := rw.fr.Start + rw.fr.Length() if rend := start + dsts.NumBytes(); rend < end { end = rend } // Get internal mappings. bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, hostarch.Read) if err != nil { return 0, err } // Copy from internal mappings. n, err := safemem.CopySeq(dsts, bs) rw.off += n return n, err } // WriteFromBlocks implements safemem.Writer.WriteFromBlocks. func (rw *kcovReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) { if srcs.IsEmpty() { return 0, nil } // Limit the write to the kcov area and check for overflow. if rw.fr.Length() <= rw.off { return 0, io.EOF } start := rw.fr.Start + rw.off end := rw.fr.Start + rw.fr.Length() if wend := start + srcs.NumBytes(); wend < end { end = wend } // Get internal mapping. bs, err := rw.mf.MapInternal(memmap.FileRange{start, end}, hostarch.Write) if err != nil { return 0, err } // Copy to internal mapping. n, err := safemem.CopySeq(bs, srcs) rw.off += n return n, err } // kcovIOWriter implements io.Writer as a basic wrapper over kcovReadWriter. type kcovIOWriter struct { rw *kcovReadWriter } // Write implements io.Writer.Write. func (w *kcovIOWriter) Write(p []byte) (int, error) { bs := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(p)) n, err := safemem.WriteFullFromBlocks(w.rw, bs) return int(n), err }
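// Illustrative sketch (not part of the original source): the layout of the
// shared kcov area that TaskWork maintains. Entry 0 holds the PC count;
// entries 1..count hold the PCs themselves, which is why TaskWork resumes
// writing at byte offset 8*(1+count). decodeKcovArea is a hypothetical helper.
func decodeKcovArea(area []uint64) (count uint64, pcs []uint64) {
    if len(area) == 0 {
        return 0, nil
    }
    count = area[0]
    if max := uint64(len(area) - 1); count > max {
        count = max // Clamp a corrupt header; illustrative only.
    }
    return count, area[1 : 1+count]
}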
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package unimpl contains the interface to emit events about unimplemented // features. package unimpl import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" ) // contextID is the unimpl package's type for context.Context.Value keys. type contextID int const ( // CtxEvents is a Context.Value key for an Events. CtxEvents contextID = iota ) // Events is the interface that defines the method to emit unsupported events. type Events interface { EmitUnimplementedEvent(context.Context) } // EmitUnimplementedEvent emits an unsupported syscall event to the context. func EmitUnimplementedEvent(ctx context.Context) { e := ctx.Value(CtxEvents) if e == nil { log.Warningf("Context.Value(CtxEvents) not present, unimplemented syscall event not reported.") return } e.(Events).EmitUnimplementedEvent(ctx) }
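// Illustrative sketch (not part of the original source): a minimal Events
// implementation. A context whose Value(CtxEvents) returns one of these
// routes EmitUnimplementedEvent above to it instead of logging the missing
// handler warning. loggingEvents is a hypothetical name, not part of this
// package.
type loggingEvents struct{}

// EmitUnimplementedEvent implements Events.EmitUnimplementedEvent.
func (loggingEvents) EmitUnimplementedEvent(ctx context.Context) {
    log.Infof("unimplemented feature invoked")
}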
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserror" ) // Sync implements Linux syscall sync(2). func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, t.Kernel().VFS().SyncAllFilesystems(t) } // Syncfs implements Linux syscall syncfs(2). func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if file.StatusFlags()&linux.O_PATH != 0 { return 0, nil, linuxerr.EBADF } return 0, nil, file.SyncFS(t) } // Fsync implements Linux syscall fsync(2). func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) return 0, nil, file.Sync(t) } // Fdatasync implements Linux syscall fdatasync(2). func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // TODO(gvisor.dev/issue/1897): Avoid writeback of unnecessary metadata. return Fsync(t, args) } // SyncFileRange implements Linux syscall sync_file_range(2). func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() offset := args[1].Int64() nbytes := args[2].Int64() flags := args[3].Uint() // Check for negative values and overflow. if offset < 0 || offset+nbytes < 0 { return 0, nil, linuxerr.EINVAL } if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 { return 0, nil, linuxerr.EINVAL } file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // TODO(gvisor.dev/issue/1897): Currently, the only file syncing we support // is a full-file sync, i.e. fsync(2). As a result, there are severe // limitations on how much we support sync_file_range: // - In Linux, sync_file_range(2) doesn't write out the file's metadata, even // if the file size is changed. We do. // - We always sync the entire file instead of [offset, offset+nbytes). // - We do not support the use of WAIT_BEFORE without WAIT_AFTER.
For // correctness, we would have to perform a write-out every time WAIT_BEFORE // was used, but this would be much more expensive than expected if there // were no write-out operations in progress. // - Whenever WAIT_AFTER is used, we sync the file. // - Ignore WRITE. If this flag is used with WAIT_AFTER, then the file will // be synced anyway. If this flag is used without WAIT_AFTER, then it is // safe (and less expensive) to do nothing, because the syscall will not // wait for the write-out to complete--we only need to make sure that the // next time WAIT_BEFORE or WAIT_AFTER are used, the write-out completes. // - According to fs/sync.c, WAIT_BEFORE|WAIT_AFTER "will detect any I/O // errors or ENOSPC conditions and will return those to the caller, after // clearing the EIO and ENOSPC flags in the address_space." We don't do // this. if flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 && flags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 { t.Kernel().EmitUnimplementedEvent(t) return 0, nil, syserror.ENOSYS } if flags&linux.SYNC_FILE_RANGE_WAIT_AFTER != 0 { if err := file.Sync(t); err != nil { return 0, nil, syserror.ConvertIntr(err, syserror.ERESTARTSYS) } } return 0, nil, nil }
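// Illustrative sketch (not part of the original source): the flag
// combinations SyncFileRange above accepts. syncFileRangeFlagsSupported is a
// hypothetical helper that mirrors its two flag checks.
func syncFileRangeFlagsSupported(flags uint) bool {
    // Unknown flag bits are rejected with EINVAL.
    if flags&^(linux.SYNC_FILE_RANGE_WAIT_BEFORE|linux.SYNC_FILE_RANGE_WRITE|linux.SYNC_FILE_RANGE_WAIT_AFTER) != 0 {
        return false
    }
    // WAIT_BEFORE without WAIT_AFTER is unimplemented and yields ENOSYS.
    if flags&linux.SYNC_FILE_RANGE_WAIT_BEFORE != 0 && flags&linux.SYNC_FILE_RANGE_WAIT_AFTER == 0 {
        return false
    }
    return true
}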
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "fmt" mrand "math/rand" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/syserror" ) // HandleUserFault handles an application page fault. sp is the faulting // application thread's stack pointer. // // Preconditions: mm.as != nil.
func (mm *MemoryManager) HandleUserFault(ctx context.Context, addr hostarch.Addr, at hostarch.AccessType, sp hostarch.Addr) error { ar, ok := addr.RoundDown().ToRange(hostarch.PageSize) if !ok { return linuxerr.EFAULT } // Don't bother trying existingPMAsLocked; in most cases, if we did have // existing pmas, we wouldn't have faulted. // Ensure that we have a usable vma. Here and below, since we are only // asking for a single page, there is no possibility of partial success, // and any error is immediately fatal. mm.mappingMu.RLock() vseg, _, err := mm.getVMAsLocked(ctx, ar, at, false) if err != nil { mm.mappingMu.RUnlock() return err } // Ensure that we have a usable pma. mm.activeMu.Lock() pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, at) mm.mappingMu.RUnlock() if err != nil { mm.activeMu.Unlock() return err } // Downgrade to a read-lock on activeMu since we don't need to mutate pmas // anymore. mm.activeMu.DowngradeLock() // Map the faulted page into the active AddressSpace. err = mm.mapASLocked(pseg, ar, false) mm.activeMu.RUnlock() return err } // MMap establishes a memory mapping. func (mm *MemoryManager) MMap(ctx context.Context, opts memmap.MMapOpts) (hostarch.Addr, error) { if opts.Length == 0 { return 0, linuxerr.EINVAL } length, ok := hostarch.Addr(opts.Length).RoundUp() if !ok { return 0, syserror.ENOMEM } opts.Length = uint64(length) if opts.Mappable != nil { // Offset must be aligned. if hostarch.Addr(opts.Offset).RoundDown() != hostarch.Addr(opts.Offset) { return 0, linuxerr.EINVAL } // Offset + length must not overflow. if end := opts.Offset + opts.Length; end < opts.Offset { return 0, syserror.ENOMEM } } else { opts.Offset = 0 } if opts.Addr.RoundDown() != opts.Addr { // MAP_FIXED requires addr to be page-aligned; non-fixed mappings // don't. if opts.Fixed { return 0, linuxerr.EINVAL } opts.Addr = opts.Addr.RoundDown() } if !opts.MaxPerms.SupersetOf(opts.Perms) { return 0, linuxerr.EACCES } if opts.Unmap && !opts.Fixed { return 0, linuxerr.EINVAL } if opts.GrowsDown && opts.Mappable != nil { return 0, linuxerr.EINVAL } // Get the new vma. mm.mappingMu.Lock() if opts.MLockMode < mm.defMLockMode { opts.MLockMode = mm.defMLockMode } vseg, ar, err := mm.createVMALocked(ctx, opts) if err != nil { mm.mappingMu.Unlock() return 0, err } // TODO(jamieliu): In Linux, VM_LOCKONFAULT (which may be set on the new // vma by mlockall(MCL_FUTURE|MCL_ONFAULT) => mm_struct::def_flags) appears // to effectively disable MAP_POPULATE by unsetting FOLL_POPULATE in // mm/util.c:vm_mmap_pgoff() => mm/gup.c:__mm_populate() => // populate_vma_page_range(). Confirm this behavior. switch { case opts.Precommit || opts.MLockMode == memmap.MLockEager: // Get pmas and map with precommit as requested. mm.populateVMAAndUnlock(ctx, vseg, ar, true) case opts.Mappable == nil && length <= privateAllocUnit: // NOTE(b/63077076, b/63360184): Get pmas and map eagerly in the hope // that doing so will save on future page faults. We only do this for // anonymous mappings, since otherwise the cost of // memmap.Mappable.Translate is unknown; and only for small mappings, // to avoid needing to allocate large amounts of memory that we may // subsequently need to checkpoint. mm.populateVMAAndUnlock(ctx, vseg, ar, false) default: mm.mappingMu.Unlock() } return ar.Start, nil } // populateVMA obtains pmas for addresses in ar in the given vma, and maps them // into mm.as if it is active. // // Preconditions: // * mm.mappingMu must be locked. // * vseg.Range().IsSupersetOf(ar). 
func (mm *MemoryManager) populateVMA(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, precommit bool) { if !vseg.ValuePtr().effectivePerms.Any() { // Linux doesn't populate inaccessible pages. See // mm/gup.c:populate_vma_page_range. return } mm.activeMu.Lock() // Can't defer mm.activeMu.Unlock(); see below. // Even if we get new pmas, we can't actually map them if we don't have an // AddressSpace. if mm.as == nil { mm.activeMu.Unlock() return } // Ensure that we have usable pmas. pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, hostarch.NoAccess) if err != nil { // mm/util.c:vm_mmap_pgoff() ignores the error, if any, from // mm/gup.c:mm_populate(). If it matters, we'll get it again when // userspace actually tries to use the failing page. mm.activeMu.Unlock() return } // Downgrade to a read-lock on activeMu since we don't need to mutate pmas // anymore. mm.activeMu.DowngradeLock() // As above, errors are silently ignored. mm.mapASLocked(pseg, ar, precommit) mm.activeMu.RUnlock() } // populateVMAAndUnlock is equivalent to populateVMA, but also unconditionally // unlocks mm.mappingMu. In cases where populateVMAAndUnlock is usable, it is // preferable to populateVMA since it unlocks mm.mappingMu before performing // expensive operations that don't require it to be locked. // // Preconditions: // * mm.mappingMu must be locked for writing. // * vseg.Range().IsSupersetOf(ar). // // Postconditions: mm.mappingMu will be unlocked. // +checklocksrelease:mm.mappingMu func (mm *MemoryManager) populateVMAAndUnlock(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, precommit bool) { // See populateVMA above for commentary. if !vseg.ValuePtr().effectivePerms.Any() { mm.mappingMu.Unlock() return } mm.activeMu.Lock() if mm.as == nil { mm.activeMu.Unlock() mm.mappingMu.Unlock() return } // mm.mappingMu doesn't need to be write-locked for getPMAsLocked, and it // isn't needed at all for mapASLocked. mm.mappingMu.DowngradeLock() pseg, _, err := mm.getPMAsLocked(ctx, vseg, ar, hostarch.NoAccess) mm.mappingMu.RUnlock() if err != nil { mm.activeMu.Unlock() return } mm.activeMu.DowngradeLock() mm.mapASLocked(pseg, ar, precommit) mm.activeMu.RUnlock() } // MapStack allocates the initial process stack. func (mm *MemoryManager) MapStack(ctx context.Context) (hostarch.AddrRange, error) { // maxStackSize is the maximum supported process stack size in bytes. // // This limit exists because stack growing isn't implemented, so the entire // process stack must be mapped up-front. const maxStackSize = 128 << 20 stackSize := limits.FromContext(ctx).Get(limits.Stack) r, ok := hostarch.Addr(stackSize.Cur).RoundUp() sz := uint64(r) if !ok { // RLIM_INFINITY rounds up to 0. sz = linux.DefaultStackSoftLimit } else if sz > maxStackSize { ctx.Warningf("Capping stack size from RLIMIT_STACK of %v down to %v.", sz, maxStackSize) sz = maxStackSize } else if sz == 0 { return hostarch.AddrRange{}, syserror.ENOMEM } szaddr := hostarch.Addr(sz) ctx.Debugf("Allocating stack with size of %v bytes", sz) // Determine the stack's desired location. Unlike Linux, address // randomization can't be disabled. 
stackEnd := mm.layout.MaxAddr - hostarch.Addr(mrand.Int63n(int64(mm.layout.MaxStackRand))).RoundDown() if stackEnd < szaddr { return hostarch.AddrRange{}, syserror.ENOMEM } stackStart := stackEnd - szaddr mm.mappingMu.Lock() defer mm.mappingMu.Unlock() _, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: sz, Addr: stackStart, Perms: hostarch.ReadWrite, MaxPerms: hostarch.AnyAccess, Private: true, GrowsDown: true, MLockMode: mm.defMLockMode, Hint: "[stack]", }) return ar, err } // MUnmap implements the semantics of Linux's munmap(2). func (mm *MemoryManager) MUnmap(ctx context.Context, addr hostarch.Addr, length uint64) error { if addr != addr.RoundDown() { return linuxerr.EINVAL } if length == 0 { return linuxerr.EINVAL } la, ok := hostarch.Addr(length).RoundUp() if !ok { return linuxerr.EINVAL } ar, ok := addr.ToRange(uint64(la)) if !ok { return linuxerr.EINVAL } mm.mappingMu.Lock() defer mm.mappingMu.Unlock() mm.unmapLocked(ctx, ar) return nil } // MRemapOpts specifies options to MRemap. type MRemapOpts struct { // Move controls whether MRemap moves the remapped mapping to a new address. Move MRemapMoveMode // NewAddr is the new address for the remapping. NewAddr is ignored unless // Move is MMRemapMustMove. NewAddr hostarch.Addr } // MRemapMoveMode controls MRemap's moving behavior. type MRemapMoveMode int const ( // MRemapNoMove prevents MRemap from moving the remapped mapping. MRemapNoMove MRemapMoveMode = iota // MRemapMayMove allows MRemap to move the remapped mapping. MRemapMayMove // MRemapMustMove requires MRemap to move the remapped mapping to // MRemapOpts.NewAddr, replacing any existing mappings in the remapped // range. MRemapMustMove ) // MRemap implements the semantics of Linux's mremap(2). func (mm *MemoryManager) MRemap(ctx context.Context, oldAddr hostarch.Addr, oldSize uint64, newSize uint64, opts MRemapOpts) (hostarch.Addr, error) { // "Note that old_address has to be page aligned." - mremap(2) if oldAddr.RoundDown() != oldAddr { return 0, linuxerr.EINVAL } // Linux treats an old_size that rounds up to 0 as 0, which is otherwise a // valid size. However, new_size can't be 0 after rounding. oldSizeAddr, _ := hostarch.Addr(oldSize).RoundUp() oldSize = uint64(oldSizeAddr) newSizeAddr, ok := hostarch.Addr(newSize).RoundUp() if !ok || newSizeAddr == 0 { return 0, linuxerr.EINVAL } newSize = uint64(newSizeAddr) oldEnd, ok := oldAddr.AddLength(oldSize) if !ok { return 0, linuxerr.EINVAL } mm.mappingMu.Lock() defer mm.mappingMu.Unlock() // All cases require that a vma exists at oldAddr. vseg := mm.vmas.FindSegment(oldAddr) if !vseg.Ok() { return 0, linuxerr.EFAULT } // Behavior matrix: // // Move | oldSize = 0 | oldSize < newSize | oldSize = newSize | oldSize > newSize // ---------+-------------+-------------------+-------------------+------------------ // NoMove | ENOMEM [1] | Grow in-place | No-op | Shrink in-place // MayMove | Copy [1] | Grow in-place or | No-op | Shrink in-place // | | move | | // MustMove | Copy | Move and grow | Move | Shrink and move // // [1] In-place growth is impossible because the vma at oldAddr already // occupies at least part of the destination. Thus the NoMove case always // fails and the MayMove case always falls back to copying. if vma := vseg.ValuePtr(); newSize > oldSize && vma.mlockMode != memmap.MLockNone { // Check against RLIMIT_MEMLOCK. Unlike mmap, mlock, and mlockall, // mremap in Linux does not check mm/mlock.c:can_do_mlock() and // therefore does not return EPERM if RLIMIT_MEMLOCK is 0 and // !CAP_IPC_LOCK. 
mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { if newLockedAS := mm.lockedAS - oldSize + newSize; newLockedAS > mlockLimit { return 0, linuxerr.EAGAIN } } } if opts.Move != MRemapMustMove { // Handle no-ops and in-place shrinking. These cases don't care if // [oldAddr, oldEnd) maps to a single vma, or is even mapped at all // (aside from oldAddr). if newSize <= oldSize { if newSize < oldSize { // If oldAddr+oldSize didn't overflow, oldAddr+newSize can't // either. newEnd := oldAddr + hostarch.Addr(newSize) mm.unmapLocked(ctx, hostarch.AddrRange{newEnd, oldEnd}) } return oldAddr, nil } // Handle in-place growing. // Check that oldEnd maps to the same vma as oldAddr. if vseg.End() < oldEnd { return 0, linuxerr.EFAULT } // "Grow" the existing vma by creating a new mergeable one. vma := vseg.ValuePtr() var newOffset uint64 if vma.mappable != nil { newOffset = vseg.mappableRange().End } vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: newSize - oldSize, MappingIdentity: vma.id, Mappable: vma.mappable, Offset: newOffset, Addr: oldEnd, Fixed: true, Perms: vma.realPerms, MaxPerms: vma.maxPerms, Private: vma.private, GrowsDown: vma.growsDown, MLockMode: vma.mlockMode, Hint: vma.hint, }) if err == nil { if vma.mlockMode == memmap.MLockEager { mm.populateVMA(ctx, vseg, ar, true) } return oldAddr, nil } // In-place growth failed. In the MRemapMayMove case, fall through to // copying/moving below. if opts.Move == MRemapNoMove { return 0, err } } // Find a location for the new mapping. var newAR hostarch.AddrRange switch opts.Move { case MRemapMayMove: newAddr, err := mm.findAvailableLocked(newSize, findAvailableOpts{}) if err != nil { return 0, err } newAR, _ = newAddr.ToRange(newSize) case MRemapMustMove: newAddr := opts.NewAddr if newAddr.RoundDown() != newAddr { return 0, linuxerr.EINVAL } var ok bool newAR, ok = newAddr.ToRange(newSize) if !ok { return 0, linuxerr.EINVAL } if (hostarch.AddrRange{oldAddr, oldEnd}).Overlaps(newAR) { return 0, linuxerr.EINVAL } // Check that the new region is valid. _, err := mm.findAvailableLocked(newSize, findAvailableOpts{ Addr: newAddr, Fixed: true, Unmap: true, }) if err != nil { return 0, err } // Unmap any mappings at the destination. mm.unmapLocked(ctx, newAR) // If the sizes specify shrinking, unmap everything between the new and // old sizes at the source. Unmapping before the following checks is // correct: compare Linux's mm/mremap.c:mremap_to() => do_munmap(), // vma_to_resize(). if newSize < oldSize { oldNewEnd := oldAddr + hostarch.Addr(newSize) mm.unmapLocked(ctx, hostarch.AddrRange{oldNewEnd, oldEnd}) oldEnd = oldNewEnd } // unmapLocked may have invalidated vseg; look it up again. vseg = mm.vmas.FindSegment(oldAddr) } oldAR := hostarch.AddrRange{oldAddr, oldEnd} // Check that oldEnd maps to the same vma as oldAddr. if vseg.End() < oldEnd { return 0, linuxerr.EFAULT } // Check against RLIMIT_AS. newUsageAS := mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) if limitAS := limits.FromContext(ctx).Get(limits.AS).Cur; newUsageAS > limitAS { return 0, syserror.ENOMEM } if vma := vseg.ValuePtr(); vma.mappable != nil { // Check that offset+length does not overflow. if vma.off+uint64(newAR.Length()) < vma.off { return 0, linuxerr.EINVAL } // Inform the Mappable, if any, of the new mapping. 
if err := vma.mappable.CopyMapping(ctx, mm, oldAR, newAR, vseg.mappableOffsetAt(oldAR.Start), vma.canWriteMappableLocked()); err != nil { return 0, err } } if oldSize == 0 { // Handle copying. // // We can't use createVMALocked because it calls Mappable.AddMapping, // whereas we've already called Mappable.CopyMapping (which is // consistent with Linux). Call vseg.Value() (rather than // vseg.ValuePtr()) to make a copy of the vma. vma := vseg.Value() if vma.mappable != nil { vma.off = vseg.mappableOffsetAt(oldAR.Start) } if vma.id != nil { vma.id.IncRef() } vseg := mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) mm.usageAS += uint64(newAR.Length()) if vma.isPrivateDataLocked() { mm.dataAS += uint64(newAR.Length()) } if vma.mlockMode != memmap.MLockNone { mm.lockedAS += uint64(newAR.Length()) if vma.mlockMode == memmap.MLockEager { mm.populateVMA(ctx, vseg, newAR, true) } } return newAR.Start, nil } // Handle moving. // // Remove the existing vma before inserting the new one to minimize // iterator invalidation. We do this directly (instead of calling // removeVMAsLocked) because: // // 1. We can't drop the reference on vma.id, which will be transferred to // the new vma. // // 2. We can't call vma.mappable.RemoveMapping, because pmas are still at // oldAR, so calling RemoveMapping could cause us to miss an invalidation // overlapping oldAR. // // Call vseg.Value() (rather than vseg.ValuePtr()) to make a copy of the // vma. vseg = mm.vmas.Isolate(vseg, oldAR) vma := vseg.Value() mm.vmas.Remove(vseg) vseg = mm.vmas.Insert(mm.vmas.FindGap(newAR.Start), newAR, vma) mm.usageAS = mm.usageAS - uint64(oldAR.Length()) + uint64(newAR.Length()) if vma.isPrivateDataLocked() { mm.dataAS = mm.dataAS - uint64(oldAR.Length()) + uint64(newAR.Length()) } if vma.mlockMode != memmap.MLockNone { mm.lockedAS = mm.lockedAS - uint64(oldAR.Length()) + uint64(newAR.Length()) } // Move pmas. This is technically optional for non-private pmas, which // could just go through memmap.Mappable.Translate again, but it's required // for private pmas. mm.activeMu.Lock() mm.movePMAsLocked(oldAR, newAR) mm.activeMu.Unlock() // Now that pmas have been moved to newAR, we can notify vma.mappable that // oldAR is no longer mapped. if vma.mappable != nil { vma.mappable.RemoveMapping(ctx, mm, oldAR, vma.off, vma.canWriteMappableLocked()) } if vma.mlockMode == memmap.MLockEager { mm.populateVMA(ctx, vseg, newAR, true) } return newAR.Start, nil } // MProtect implements the semantics of Linux's mprotect(2). func (mm *MemoryManager) MProtect(addr hostarch.Addr, length uint64, realPerms hostarch.AccessType, growsDown bool) error { if addr.RoundDown() != addr { return linuxerr.EINVAL } if length == 0 { return nil } rlength, ok := hostarch.Addr(length).RoundUp() if !ok { return syserror.ENOMEM } ar, ok := addr.ToRange(uint64(rlength)) if !ok { return syserror.ENOMEM } effectivePerms := realPerms.Effective() mm.mappingMu.Lock() defer mm.mappingMu.Unlock() // Non-growsDown mprotect requires that all of ar is mapped, and stops at // the first non-empty gap. growsDown mprotect requires that the first vma // be growsDown, but does not require it to extend all the way to ar.Start; // vmas after the first must be contiguous but need not be growsDown, like // the non-growsDown case. 
vseg := mm.vmas.LowerBoundSegment(ar.Start) if !vseg.Ok() { return syserror.ENOMEM } if growsDown { if !vseg.ValuePtr().growsDown { return linuxerr.EINVAL } if ar.End <= vseg.Start() { return syserror.ENOMEM } ar.Start = vseg.Start() } else { if ar.Start < vseg.Start() { return syserror.ENOMEM } } mm.activeMu.Lock() defer mm.activeMu.Unlock() defer func() { mm.vmas.MergeRange(ar) mm.vmas.MergeAdjacent(ar) mm.pmas.MergeRange(ar) mm.pmas.MergeAdjacent(ar) }() pseg := mm.pmas.LowerBoundSegment(ar.Start) var didUnmapAS bool for { // Check for permission validity before splitting vmas, for consistency // with Linux. if !vseg.ValuePtr().maxPerms.SupersetOf(effectivePerms) { return linuxerr.EACCES } vseg = mm.vmas.Isolate(vseg, ar) // Update vma permissions. vma := vseg.ValuePtr() vmaLength := vseg.Range().Length() if vma.isPrivateDataLocked() { mm.dataAS -= uint64(vmaLength) } vma.realPerms = realPerms vma.effectivePerms = effectivePerms if vma.isPrivateDataLocked() { mm.dataAS += uint64(vmaLength) } // Propagate vma permission changes to pmas. for pseg.Ok() && pseg.Start() < vseg.End() { if pseg.Range().Overlaps(vseg.Range()) { pseg = mm.pmas.Isolate(pseg, vseg.Range()) pma := pseg.ValuePtr() if !effectivePerms.SupersetOf(pma.effectivePerms) && !didUnmapAS { // Unmap all of ar, not just vseg.Range(), to minimize host // syscalls. mm.unmapASLocked(ar) didUnmapAS = true } pma.effectivePerms = effectivePerms.Intersect(pma.translatePerms) if pma.needCOW { pma.effectivePerms.Write = false } } pseg = pseg.NextSegment() } // Continue to the next vma. if ar.End <= vseg.End() { return nil } vseg, _ = vseg.NextNonEmpty() if !vseg.Ok() { return syserror.ENOMEM } } } // BrkSetup sets mm's brk address to addr and its brk size to 0. func (mm *MemoryManager) BrkSetup(ctx context.Context, addr hostarch.Addr) { mm.mappingMu.Lock() defer mm.mappingMu.Unlock() // Unmap the existing brk. if mm.brk.Length() != 0 { mm.unmapLocked(ctx, mm.brk) } mm.brk = hostarch.AddrRange{addr, addr} } // Brk implements the semantics of Linux's brk(2), except that it returns an // error on failure. func (mm *MemoryManager) Brk(ctx context.Context, addr hostarch.Addr) (hostarch.Addr, error) { mm.mappingMu.Lock() // Can't defer mm.mappingMu.Unlock(); see below. if addr < mm.brk.Start { addr = mm.brk.End mm.mappingMu.Unlock() return addr, linuxerr.EINVAL } // TODO(gvisor.dev/issue/156): This enforces RLIMIT_DATA, but is // slightly more permissive than the usual data limit. In particular, // this only limits the size of the heap; a true RLIMIT_DATA limits the // size of heap + data + bss. The segment sizes need to be plumbed from // the loader package to fully enforce RLIMIT_DATA. if uint64(addr-mm.brk.Start) > limits.FromContext(ctx).Get(limits.Data).Cur { addr = mm.brk.End mm.mappingMu.Unlock() return addr, syserror.ENOMEM } oldbrkpg, _ := mm.brk.End.RoundUp() newbrkpg, ok := addr.RoundUp() if !ok { addr = mm.brk.End mm.mappingMu.Unlock() return addr, linuxerr.EFAULT } switch { case oldbrkpg < newbrkpg: vseg, ar, err := mm.createVMALocked(ctx, memmap.MMapOpts{ Length: uint64(newbrkpg - oldbrkpg), Addr: oldbrkpg, Fixed: true, // Compare Linux's // arch/x86/include/asm/page_types.h:VM_DATA_DEFAULT_FLAGS. Perms: hostarch.ReadWrite, MaxPerms: hostarch.AnyAccess, Private: true, // Linux: mm/mmap.c:sys_brk() => do_brk_flags() includes // mm->def_flags. 
MLockMode: mm.defMLockMode, Hint: "[heap]", }) if err != nil { addr = mm.brk.End mm.mappingMu.Unlock() return addr, err } mm.brk.End = addr if mm.defMLockMode == memmap.MLockEager { mm.populateVMAAndUnlock(ctx, vseg, ar, true) } else { mm.mappingMu.Unlock() } case newbrkpg < oldbrkpg: mm.unmapLocked(ctx, hostarch.AddrRange{newbrkpg, oldbrkpg}) fallthrough default: mm.brk.End = addr mm.mappingMu.Unlock() } return addr, nil } // MLock implements the semantics of Linux's mlock()/mlock2()/munlock(), // depending on mode. func (mm *MemoryManager) MLock(ctx context.Context, addr hostarch.Addr, length uint64, mode memmap.MLockMode) error { // Linux allows this to overflow. la, _ := hostarch.Addr(length + addr.PageOffset()).RoundUp() ar, ok := addr.RoundDown().ToRange(uint64(la)) if !ok { return linuxerr.EINVAL } mm.mappingMu.Lock() // Can't defer mm.mappingMu.Unlock(); see below. if mode != memmap.MLockNone { // Check against RLIMIT_MEMLOCK. if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if mlockLimit == 0 { mm.mappingMu.Unlock() return linuxerr.EPERM } if newLockedAS := mm.lockedAS + uint64(ar.Length()) - mm.mlockedBytesRangeLocked(ar); newLockedAS > mlockLimit { mm.mappingMu.Unlock() return syserror.ENOMEM } } } // Check this after RLIMIT_MEMLOCK for consistency with Linux. if ar.Length() == 0 { mm.mappingMu.Unlock() return nil } // Apply the new mlock mode to vmas. var unmapped bool vseg := mm.vmas.FindSegment(ar.Start) for { if !vseg.Ok() { unmapped = true break } vseg = mm.vmas.Isolate(vseg, ar) vma := vseg.ValuePtr() prevMode := vma.mlockMode vma.mlockMode = mode if mode != memmap.MLockNone && prevMode == memmap.MLockNone { mm.lockedAS += uint64(vseg.Range().Length()) } else if mode == memmap.MLockNone && prevMode != memmap.MLockNone { mm.lockedAS -= uint64(vseg.Range().Length()) } if ar.End <= vseg.End() { break } vseg, _ = vseg.NextNonEmpty() } mm.vmas.MergeRange(ar) mm.vmas.MergeAdjacent(ar) if unmapped { mm.mappingMu.Unlock() return syserror.ENOMEM } if mode == memmap.MLockEager { // Ensure that we have usable pmas. Since we didn't return ENOMEM // above, ar must be fully covered by vmas, so we can just use // NextSegment below. mm.activeMu.Lock() mm.mappingMu.DowngradeLock() for vseg := mm.vmas.FindSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { if !vseg.ValuePtr().effectivePerms.Any() { // Linux: mm/gup.c:__get_user_pages() returns EFAULT in this // case, which is converted to ENOMEM by mlock. mm.activeMu.Unlock() mm.mappingMu.RUnlock() return syserror.ENOMEM } _, _, err := mm.getPMAsLocked(ctx, vseg, vseg.Range().Intersect(ar), hostarch.NoAccess) if err != nil { mm.activeMu.Unlock() mm.mappingMu.RUnlock() // Linux: mm/mlock.c:__mlock_posix_error_return() if linuxerr.Equals(linuxerr.EFAULT, err) { return syserror.ENOMEM } if linuxerr.Equals(linuxerr.ENOMEM, err) { return linuxerr.EAGAIN } return err } } // Map pmas into the active AddressSpace, if we have one. mm.mappingMu.RUnlock() if mm.as != nil { mm.activeMu.DowngradeLock() err := mm.mapASLocked(mm.pmas.LowerBoundSegment(ar.Start), ar, true /* precommit */) mm.activeMu.RUnlock() if err != nil { return err } } else { mm.activeMu.Unlock() } } else { mm.mappingMu.Unlock() } return nil } // MLockAllOpts holds options to MLockAll. type MLockAllOpts struct { // If Current is true, change the memory-locking behavior of all mappings // to Mode. 
If Future is true, upgrade the memory-locking behavior of all // future mappings to Mode. At least one of Current or Future must be true. Current bool Future bool Mode memmap.MLockMode } // MLockAll implements the semantics of Linux's mlockall()/munlockall(), // depending on opts. func (mm *MemoryManager) MLockAll(ctx context.Context, opts MLockAllOpts) error { if !opts.Current && !opts.Future { return linuxerr.EINVAL } mm.mappingMu.Lock() // Can't defer mm.mappingMu.Unlock(); see below. if opts.Current { if opts.Mode != memmap.MLockNone { // Check against RLIMIT_MEMLOCK. if creds := auth.CredentialsFromContext(ctx); !creds.HasCapabilityIn(linux.CAP_IPC_LOCK, creds.UserNamespace.Root()) { mlockLimit := limits.FromContext(ctx).Get(limits.MemoryLocked).Cur if mlockLimit == 0 { mm.mappingMu.Unlock() return linuxerr.EPERM } if uint64(mm.vmas.Span()) > mlockLimit { mm.mappingMu.Unlock() return syserror.ENOMEM } } } for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { vma := vseg.ValuePtr() prevMode := vma.mlockMode vma.mlockMode = opts.Mode if opts.Mode != memmap.MLockNone && prevMode == memmap.MLockNone { mm.lockedAS += uint64(vseg.Range().Length()) } else if opts.Mode == memmap.MLockNone && prevMode != memmap.MLockNone { mm.lockedAS -= uint64(vseg.Range().Length()) } } } if opts.Future { mm.defMLockMode = opts.Mode } if opts.Current && opts.Mode == memmap.MLockEager { // Linux: mm/mlock.c:sys_mlockall() => include/linux/mm.h:mm_populate() // ignores the return value of __mm_populate(), so all errors below are // ignored. // // Try to get usable pmas. mm.activeMu.Lock() mm.mappingMu.DowngradeLock() for vseg := mm.vmas.FirstSegment(); vseg.Ok(); vseg = vseg.NextSegment() { if vseg.ValuePtr().effectivePerms.Any() { mm.getPMAsLocked(ctx, vseg, vseg.Range(), hostarch.NoAccess) } } // Map all pmas into the active AddressSpace, if we have one. mm.mappingMu.RUnlock() if mm.as != nil { mm.activeMu.DowngradeLock() mm.mapASLocked(mm.pmas.FirstSegment(), mm.applicationAddrRange(), true /* precommit */) mm.activeMu.RUnlock() } else { mm.activeMu.Unlock() } } else { mm.mappingMu.Unlock() } return nil } // NumaPolicy implements the semantics of Linux's get_mempolicy(MPOL_F_ADDR). func (mm *MemoryManager) NumaPolicy(addr hostarch.Addr) (linux.NumaPolicy, uint64, error) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() vseg := mm.vmas.FindSegment(addr) if !vseg.Ok() { return 0, 0, linuxerr.EFAULT } vma := vseg.ValuePtr() return vma.numaPolicy, vma.numaNodemask, nil } // SetNumaPolicy implements the semantics of Linux's mbind(). func (mm *MemoryManager) SetNumaPolicy(addr hostarch.Addr, length uint64, policy linux.NumaPolicy, nodemask uint64) error { if !addr.IsPageAligned() { return linuxerr.EINVAL } // Linux allows this to overflow. la, _ := hostarch.Addr(length).RoundUp() ar, ok := addr.ToRange(uint64(la)) if !ok { return linuxerr.EINVAL } if ar.Length() == 0 { return nil } mm.mappingMu.Lock() defer mm.mappingMu.Unlock() defer func() { mm.vmas.MergeRange(ar) mm.vmas.MergeAdjacent(ar) }() vseg := mm.vmas.LowerBoundSegment(ar.Start) lastEnd := ar.Start for { if !vseg.Ok() || lastEnd < vseg.Start() { // "EFAULT: ... there was an unmapped hole in the specified memory // range specified [sic] by addr and len." 
- mbind(2) return linuxerr.EFAULT } vseg = mm.vmas.Isolate(vseg, ar) vma := vseg.ValuePtr() vma.numaPolicy = policy vma.numaNodemask = nodemask lastEnd = vseg.End() if ar.End <= lastEnd { return nil } vseg, _ = vseg.NextNonEmpty() } } // SetDontFork implements the semantics of madvise MADV_DONTFORK. func (mm *MemoryManager) SetDontFork(addr hostarch.Addr, length uint64, dontfork bool) error { ar, ok := addr.ToRange(length) if !ok { return linuxerr.EINVAL } mm.mappingMu.Lock() defer mm.mappingMu.Unlock() defer func() { mm.vmas.MergeRange(ar) mm.vmas.MergeAdjacent(ar) }() for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { vseg = mm.vmas.Isolate(vseg, ar) vma := vseg.ValuePtr() vma.dontfork = dontfork } if mm.vmas.SpanRange(ar) != ar.Length() { return syserror.ENOMEM } return nil } // Decommit implements the semantics of Linux's madvise(MADV_DONTNEED). func (mm *MemoryManager) Decommit(addr hostarch.Addr, length uint64) error { ar, ok := addr.ToRange(length) if !ok { return linuxerr.EINVAL } mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() mm.activeMu.Lock() defer mm.activeMu.Unlock() // This is invalidateLocked(invalidatePrivate=true, invalidateShared=true), // with the additional wrinkle that we must refuse to invalidate pmas under // mlocked vmas. var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) for vseg := mm.vmas.LowerBoundSegment(ar.Start); vseg.Ok() && vseg.Start() < ar.End; vseg = vseg.NextSegment() { vma := vseg.ValuePtr() if vma.mlockMode != memmap.MLockNone { return linuxerr.EINVAL } vsegAR := vseg.Range().Intersect(ar) // pseg should already correspond to either this vma or a later one, // since there can't be a pma without a corresponding vma. if checkInvariants { if pseg.Ok() && pseg.End() <= vsegAR.Start { panic(fmt.Sprintf("pma %v precedes vma %v", pseg.Range(), vsegAR)) } } for pseg.Ok() && pseg.Start() < vsegAR.End { pseg = mm.pmas.Isolate(pseg, vsegAR) pma := pseg.ValuePtr() if !didUnmapAS { // Unmap all of ar, not just pseg.Range(), to minimize host // syscalls. AddressSpace mappings must be removed before // mm.decPrivateRef(). mm.unmapASLocked(ar) didUnmapAS = true } if pma.private { mm.decPrivateRef(pseg.fileRange()) } pma.file.DecRef(pseg.fileRange()) mm.removeRSSLocked(pseg.Range()) pseg = mm.pmas.Remove(pseg).NextSegment() } } // "If there are some parts of the specified address space that are not // mapped, the Linux version of madvise() ignores them and applies the call // to the rest (but returns ENOMEM from the system call, as it should)." - // madvise(2) if mm.vmas.SpanRange(ar) != ar.Length() { return syserror.ENOMEM } return nil } // MSyncOpts holds options to MSync. type MSyncOpts struct { // Sync has the semantics of MS_SYNC. Sync bool // Invalidate has the semantics of MS_INVALIDATE. Invalidate bool } // MSync implements the semantics of Linux's msync(). func (mm *MemoryManager) MSync(ctx context.Context, addr hostarch.Addr, length uint64, opts MSyncOpts) error { if addr != addr.RoundDown() { return linuxerr.EINVAL } if length == 0 { return nil } la, ok := hostarch.Addr(length).RoundUp() if !ok { return syserror.ENOMEM } ar, ok := addr.ToRange(uint64(la)) if !ok { return syserror.ENOMEM } mm.mappingMu.RLock() // Can't defer mm.mappingMu.RUnlock(); see below. 
vseg := mm.vmas.LowerBoundSegment(ar.Start) if !vseg.Ok() { mm.mappingMu.RUnlock() return syserror.ENOMEM } var unmapped bool lastEnd := ar.Start for { if !vseg.Ok() { mm.mappingMu.RUnlock() unmapped = true break } if lastEnd < vseg.Start() { unmapped = true } lastEnd = vseg.End() vma := vseg.ValuePtr() if opts.Invalidate && vma.mlockMode != memmap.MLockNone { mm.mappingMu.RUnlock() return linuxerr.EBUSY } // It's only possible to have dirtied the Mappable through a shared // mapping. Don't check if the mapping is writable, because mprotect // may have changed this, and also because Linux doesn't. if id := vma.id; opts.Sync && id != nil && vma.mappable != nil && !vma.private { // We can't call memmap.MappingIdentity.Msync while holding // mm.mappingMu since it may take fs locks that precede it in the // lock order. id.IncRef() mr := vseg.mappableRangeOf(vseg.Range().Intersect(ar)) mm.mappingMu.RUnlock() err := id.Msync(ctx, mr) id.DecRef(ctx) if err != nil { return err } if lastEnd >= ar.End { break } mm.mappingMu.RLock() vseg = mm.vmas.LowerBoundSegment(lastEnd) } else { if lastEnd >= ar.End { mm.mappingMu.RUnlock() break } vseg = vseg.NextSegment() } } if unmapped { return syserror.ENOMEM } return nil } // GetSharedFutexKey is used by kernel.Task.GetSharedKey. func (mm *MemoryManager) GetSharedFutexKey(ctx context.Context, addr hostarch.Addr) (futex.Key, error) { ar, ok := addr.ToRange(4) // sizeof(int32). if !ok { return futex.Key{}, linuxerr.EFAULT } mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() vseg, _, err := mm.getVMAsLocked(ctx, ar, hostarch.Read, false) if err != nil { return futex.Key{}, err } vma := vseg.ValuePtr() if vma.private { return futex.Key{ Kind: futex.KindSharedPrivate, Offset: uint64(addr), }, nil } if vma.id != nil { vma.id.IncRef() } return futex.Key{ Kind: futex.KindSharedMappable, Mappable: vma.mappable, MappingIdentity: vma.id, Offset: vseg.mappableOffsetAt(addr), }, nil } // VirtualMemorySize returns the combined length in bytes of all mappings in // mm. func (mm *MemoryManager) VirtualMemorySize() uint64 { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() return mm.usageAS } // VirtualMemorySizeRange returns the combined length in bytes of all mappings // in ar in mm. func (mm *MemoryManager) VirtualMemorySizeRange(ar hostarch.AddrRange) uint64 { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() return uint64(mm.vmas.SpanRange(ar)) } // ResidentSetSize returns the value advertised as mm's RSS in bytes. func (mm *MemoryManager) ResidentSetSize() uint64 { mm.activeMu.RLock() defer mm.activeMu.RUnlock() return mm.curRSS } // MaxResidentSetSize returns the value advertised as mm's max RSS in bytes. func (mm *MemoryManager) MaxResidentSetSize() uint64 { mm.activeMu.RLock() defer mm.activeMu.RUnlock() return mm.maxRSS } // VirtualDataSize returns the size of private data segments in mm. func (mm *MemoryManager) VirtualDataSize() uint64 { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() return mm.dataAS } // EnableMembarrierPrivate causes future calls to IsMembarrierPrivateEnabled to // return true. func (mm *MemoryManager) EnableMembarrierPrivate() { atomic.StoreUint32(&mm.membarrierPrivateEnabled, 1) } // IsMembarrierPrivateEnabled returns true if mm.EnableMembarrierPrivate() has // previously been called. func (mm *MemoryManager) IsMembarrierPrivateEnabled() bool { return atomic.LoadUint32(&mm.membarrierPrivateEnabled) != 0 } // EnableMembarrierRSeq causes future calls to IsMembarrierRSeqEnabled to // return true. 
func (mm *MemoryManager) EnableMembarrierRSeq() { atomic.StoreUint32(&mm.membarrierRSeqEnabled, 1) } // IsMembarrierRSeqEnabled returns true if mm.EnableMembarrierRSeq() has // previously been called. func (mm *MemoryManager) IsMembarrierRSeqEnabled() bool { return atomic.LoadUint32(&mm.membarrierRSeqEnabled) != 0 }
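// The two membarrier flags above share a write-once idiom: a uint32 set with
// atomic.StoreUint32 and read with atomic.LoadUint32, which avoids taking a
// mutex on hot paths. A minimal standalone sketch of the same idiom (the
// featureFlags type and its method names are illustrative only, not part of
// the mm package):
//
//	package main
//
//	import (
//		"fmt"
//		"sync/atomic"
//	)
//
//	type featureFlags struct {
//		enabledFlag uint32 // accessed atomically; holds 0 or 1
//	}
//
//	// enable mirrors the shape of EnableMembarrierPrivate above.
//	func (f *featureFlags) enable() { atomic.StoreUint32(&f.enabledFlag, 1) }
//
//	// enabled mirrors the shape of IsMembarrierPrivateEnabled above.
//	func (f *featureFlags) enabled() bool {
//		return atomic.LoadUint32(&f.enabledFlag) != 0
//	}
//
//	func main() {
//		var f featureFlags
//		fmt.Println(f.enabled()) // false
//		f.enable()
//		fmt.Println(f.enabled()) // true
//	}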
// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) type errPartialRevalidation struct{} // Error implements error.Error. func (errPartialRevalidation) Error() string { return "partial revalidation" } type errRevalidationStepDone struct{} // Error implements error.Error. func (errRevalidationStepDone) Error() string { return "stop revalidation" } // revalidatePath checks cached dentries for external modification. File // attributes are refreshed and cache is invalidated in case the dentry has been // deleted, or a new file/directory created in its place. // // Revalidation stops at symlinks and mount points. The caller is responsible // for revalidating again after symlinks are resolved and after changing to // different mounts. // // Preconditions: // * fs.renameMu must be locked. func (fs *filesystem) revalidatePath(ctx context.Context, rpOrig *vfs.ResolvingPath, start *dentry, ds **[]*dentry) error { // Revalidation is done even if start is synthetic in case the path is // something like: ../non_synthetic_file. if fs.opts.interop != InteropModeShared { return nil } // Copy resolving path to walk the path for revalidation.
rp := rpOrig.Copy() err := fs.revalidate(ctx, rp, start, rp.Done, ds) rp.Release(ctx) return err } // revalidateParentDir does the same as revalidatePath, but stops at the parent. // // Preconditions: // * fs.renameMu must be locked. func (fs *filesystem) revalidateParentDir(ctx context.Context, rpOrig *vfs.ResolvingPath, start *dentry, ds **[]*dentry) error { // Revalidation is done even if start is synthetic in case the path is // something like: ../non_synthetic_file and the parent is non-synthetic. if fs.opts.interop != InteropModeShared { return nil } // Copy resolving path to walk the path for revalidation. rp := rpOrig.Copy() err := fs.revalidate(ctx, rp, start, rp.Final, ds) rp.Release(ctx) return err } // revalidateOne does the same as revalidatePath, but checks a single dentry. // // Preconditions: // * fs.renameMu must be locked. func (fs *filesystem) revalidateOne(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, ds **[]*dentry) error { // Skip revalidation for interop modes other than InteropModeShared, or // if the parent is synthetic (child must be synthetic too, but it cannot be // replaced without first replacing the parent). if parent.cachedMetadataAuthoritative() { return nil } parent.dirMu.Lock() child, ok := parent.children[name] parent.dirMu.Unlock() if !ok { return nil } state := makeRevalidateState(parent) defer state.release() state.add(name, child) return fs.revalidateHelper(ctx, vfsObj, state, ds) } // revalidate revalidates path components in rp until done returns true, or // until a mount point or symlink is reached. It may send multiple MultiGetAttr // calls to the gofer to handle ".." in the path. // // Preconditions: // * fs.renameMu must be locked. // * InteropModeShared is in effect. func (fs *filesystem) revalidate(ctx context.Context, rp *vfs.ResolvingPath, start *dentry, done func() bool, ds **[]*dentry) error { state := makeRevalidateState(start) defer state.release() // Skip synthetic dentries because the start dentry cannot be replaced in case // it has been created in the remote file system. if !start.isSynthetic() { state.add("", start) } done: for cur := start; !done(); { var err error cur, err = fs.revalidateStep(ctx, rp, cur, state) if err != nil { switch err.(type) { case errPartialRevalidation: if err := fs.revalidateHelper(ctx, rp.VirtualFilesystem(), state, ds); err != nil { return err } // Reset state to release any remaining locks and restart from where // stepping stopped. state.reset() state.start = cur // Skip synthetic dentries because the start dentry cannot be replaced in // case it has been created in the remote file system. if !cur.isSynthetic() { state.add("", cur) } case errRevalidationStepDone: break done default: return err } } } return fs.revalidateHelper(ctx, rp.VirtualFilesystem(), state, ds) } // revalidateStep walks one element of the path and updates revalidationState // with the dentry if needed. It may also stop the stepping or ask for a // partial revalidation. Partial revalidation requires the caller to revalidate // the current revalidationState, release all locks, and resume stepping. // In case a symlink is hit, revalidation stops and the caller is responsible // for calling revalidate again after the symlink is resolved. Revalidation may // also stop for other reasons, like hitting a child not in the cache. // // Returns: // * (dentry, nil): step worked, continue stepping. // * (dentry, errPartialRevalidation): revalidation should be done with the // state gathered so far.
Then continue stepping with the remainder of the // path, starting at `dentry`. // * (nil, errRevalidationStepDone): revalidation doesn't need to step any // further. It hit a symlink, a mount point, or an uncached dentry. // // Preconditions: // * fs.renameMu must be locked. // * !rp.Done(). // * InteropModeShared is in effect (assumes no negative dentries). func (fs *filesystem) revalidateStep(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, state *revalidateState) (*dentry, error) { switch name := rp.Component(); name { case ".": // Do nothing. case "..": // Partial revalidation is required when ".." is hit because metadata locks // can only be acquired from parent to child to avoid deadlocks. if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { return nil, errRevalidationStepDone{} } else if isRoot || d.parent == nil { rp.Advance() return d, errPartialRevalidation{} } // We must assume that d.parent is correct, because if d has been moved // elsewhere in the remote filesystem so that its parent has changed, // we have no way of determining its new parent's location in the // filesystem. // // Call rp.CheckMount() before updating d.parent's metadata, since if // we traverse to another mount then d.parent's metadata is irrelevant. if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { return nil, errRevalidationStepDone{} } rp.Advance() return d.parent, errPartialRevalidation{} default: d.dirMu.Lock() child, ok := d.children[name] d.dirMu.Unlock() if !ok { // child is not cached, no need to validate any further. return nil, errRevalidationStepDone{} } state.add(name, child) // Symlink must be resolved before continuing with revalidation. if child.isSymlink() { return nil, errRevalidationStepDone{} } d = child } rp.Advance() return d, nil } // revalidateHelper calls the gofer to stat all dentries in `state`. It will // update or invalidate dentries in the cache based on the result. // // Preconditions: // * fs.renameMu must be locked. // * InteropModeShared is in effect. func (fs *filesystem) revalidateHelper(ctx context.Context, vfsObj *vfs.VirtualFilesystem, state *revalidateState, ds **[]*dentry) error { if len(state.names) == 0 { return nil } // Lock metadata on all dentries *before* getting attributes for them. state.lockAllMetadata() stats, err := state.start.file.multiGetAttr(ctx, state.names) if err != nil { return err } i := -1 for d := state.popFront(); d != nil; d = state.popFront() { i++ found := i < len(stats) if i == 0 && len(state.names[0]) == 0 { if found && !d.isSynthetic() { // First dentry is where the search is starting; just update attributes // since it cannot be replaced. d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: acquired by lockAllMetadata. } d.metadataMu.Unlock() // +checklocksforce: see above. continue } // Note that synthetic dentries will always fail the comparison check // below. if !found || d.qidPath != stats[i].QID.Path { d.metadataMu.Unlock() // +checklocksforce: see above. if !found && d.isSynthetic() { // We have a synthetic file, and no remote file has arisen to replace // it. return nil } // The file at this path has changed or no longer exists. Mark the // dentry invalidated, and re-evaluate its caching status (i.e. if it // has 0 references, drop it). The dentry will be reloaded next time it's // accessed.
vfsObj.InvalidateDentry(ctx, &d.vfsd) name := state.names[i] d.parent.dirMu.Lock() if d.isSynthetic() { // Normally we don't mark invalidated dentries as deleted since // they may still exist (but at a different path), and also for // consistency with Linux. However, synthetic files are guaranteed // to become unreachable if their dentries are invalidated, so // treat their invalidation as deletion. d.setDeleted() d.decRefNoCaching() *ds = appendDentry(*ds, d) d.parent.syntheticChildren-- d.parent.dirents = nil } // Since the dirMu was released and reacquired, re-check that the // parent's child with this name is still the same. Do not touch it if // it has been replaced with a different one. if child := d.parent.children[name]; child == d { // Invalidate dentry so it gets reloaded next time it's accessed. delete(d.parent.children, name) } d.parent.dirMu.Unlock() return nil } // The file at this path hasn't changed. Just update cached metadata. d.updateFromP9AttrsLocked(stats[i].Valid, &stats[i].Attr) // +checklocksforce: see above. d.metadataMu.Unlock() } return nil } // revalidateStatePool caches revalidateState instances to save array // allocations for dentries and names. var revalidateStatePool = sync.Pool{ New: func() interface{} { return &revalidateState{} }, } // revalidateState keeps state related to a revalidation request. It keeps track // of the {name, dentry} list being revalidated, as well as metadata locks on the // dentries. The list must be in ancestry order: entry `n` must be a child of // entry `n-1`. type revalidateState struct { // start is the dentry from which the attributes search starts. start *dentry // List of names of entries whose attributes are to be refreshed. Names length // must be the same as dentries length. They are kept in separate slices // because names is used to call File.MultiGetAttr(). names []string // dentries is the list of dentries that correspond to the names above. // dentry.metadataMu is acquired as each dentry is added to this list. dentries []*dentry // locked indicates if metadata lock has been acquired on dentries. locked bool } func makeRevalidateState(start *dentry) *revalidateState { r := revalidateStatePool.Get().(*revalidateState) r.start = start return r } // release must be called after the caller is done with this object. It releases // all metadata locks and resources. func (r *revalidateState) release() { r.reset() revalidateStatePool.Put(r) } // Preconditions: // * d is a descendant of all dentries in r.dentries. func (r *revalidateState) add(name string, d *dentry) { r.names = append(r.names, name) r.dentries = append(r.dentries, d) } // +checklocksignore func (r *revalidateState) lockAllMetadata() { for _, d := range r.dentries { d.metadataMu.Lock() } r.locked = true } func (r *revalidateState) popFront() *dentry { if len(r.dentries) == 0 { return nil } d := r.dentries[0] r.dentries = r.dentries[1:] return d } // reset releases all metadata locks and resets all fields to allow this // instance to be reused. // +checklocksignore func (r *revalidateState) reset() { if r.locked { // Unlock any remaining dentries. for _, d := range r.dentries { d.metadataMu.Unlock() } r.locked = false } r.start = nil r.names = r.names[:0] r.dentries = r.dentries[:0] }
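// revalidateStatePool above follows the standard sync.Pool recipe: Get a
// value, use it, then truncate its slices to zero length before Put so that
// the backing arrays are reused by the next caller. A minimal standalone
// sketch of that recipe (the state type below is illustrative, not this
// package's revalidateState):
//
//	package main
//
//	import (
//		"fmt"
//		"sync"
//	)
//
//	type state struct {
//		names []string
//	}
//
//	var statePool = sync.Pool{
//		New: func() interface{} { return &state{} },
//	}
//
//	func main() {
//		s := statePool.Get().(*state)
//		s.names = append(s.names, "a", "b")
//		fmt.Println(len(s.names)) // 2
//		s.names = s.names[:0] // reset length, keep capacity for reuse
//		statePool.Put(s)      // a later Get may return s with its array intact
//	}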
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package vfs implements a virtual filesystem layer. // // Lock order: // // EpollInstance.interestMu // FileDescription.epollMu // FilesystemImpl/FileDescriptionImpl locks // VirtualFilesystem.mountMu // Dentry.mu // Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry // VirtualFilesystem.filesystemsMu // fdnotifier.notifier.mu // EpollInstance.mu // Locks acquired by FileDescriptionImpl.Readiness // Inotify.mu // Watches.mu // Inotify.evMu // VirtualFilesystem.fsTypesMu // // Locking Dentry.mu in multiple Dentries requires holding // VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple // EpollInstances requires holding epollCycleMu. package vfs import ( "fmt" "path" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) // A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts. // // There is no analogue to the VirtualFilesystem type in Linux, as the // equivalent state in Linux is global. // // +stateify savable type VirtualFilesystem struct { // mountMu serializes mount mutations. // // mountMu is analogous to Linux's namespace_sem. mountMu sync.Mutex `state:"nosave"` // mounts maps (mount parent, mount point) pairs to mounts. (Since mounts // are uniquely namespaced, including mount parent in the key correctly // handles both bind mounts and mount namespaces; Linux does the same.) // Synchronization between mutators and readers is provided by mounts.seq; // synchronization between mutators is provided by mountMu. // // mounts is used to follow mount points during path traversal. We use a // single table rather than per-Dentry tables to reduce size (and therefore // cache footprint) for the vast majority of Dentries that are not mount // points. // // mounts is analogous to Linux's mount_hashtable. mounts mountTable `state:".([]*Mount)"` // mountpoints maps mount points to mounts at those points in all // namespaces. mountpoints is protected by mountMu. // // mountpoints is used to find mounts that must be umounted due to // removal of a mount point Dentry from another mount namespace.
("A file // or directory that is a mount point in one namespace that is not a mount // point in another namespace, may be renamed, unlinked, or removed // (rmdir(2)) in the mount namespace in which it is not a mount point // (subject to the usual permission checks)." - mount_namespaces(7)) // // mountpoints is analogous to Linux's mountpoint_hashtable. mountpoints map[*Dentry]map[*Mount]struct{} // lastMountID is the last allocated mount ID. lastMountID is accessed // using atomic memory operations. lastMountID uint64 // anonMount is a Mount, not included in mounts or mountpoints, // representing an anonFilesystem. anonMount is used to back // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry(). // anonMount is immutable. // // anonMount is analogous to Linux's anon_inode_mnt. anonMount *Mount // devices contains all registered Devices. devices is protected by // devicesMu. devicesMu sync.RWMutex `state:"nosave"` devices map[devTuple]*registeredDevice // anonBlockDevMinor contains all allocated anonymous block device minor // numbers. anonBlockDevMinorNext is a lower bound for the smallest // unallocated anonymous block device number. anonBlockDevMinorNext and // anonBlockDevMinor are protected by anonBlockDevMinorMu. anonBlockDevMinorMu sync.Mutex `state:"nosave"` anonBlockDevMinorNext uint32 anonBlockDevMinor map[uint32]struct{} // fsTypes contains all registered FilesystemTypes. fsTypes is protected by // fsTypesMu. fsTypesMu sync.RWMutex `state:"nosave"` fsTypes map[string]*registeredFilesystemType // filesystems contains all Filesystems. filesystems is protected by // filesystemsMu. filesystemsMu sync.Mutex `state:"nosave"` filesystems map[*Filesystem]struct{} } // Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes. func (vfs *VirtualFilesystem) Init(ctx context.Context) error { if vfs.mountpoints != nil { panic("VFS already initialized") } vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{}) vfs.devices = make(map[devTuple]*registeredDevice) vfs.anonBlockDevMinorNext = 1 vfs.anonBlockDevMinor = make(map[uint32]struct{}) vfs.fsTypes = make(map[string]*registeredFilesystemType) vfs.filesystems = make(map[*Filesystem]struct{}) vfs.mounts.Init() // Construct vfs.anonMount. anonfsDevMinor, err := vfs.GetAnonBlockDevMinor() if err != nil { // This shouldn't be possible since anonBlockDevMinorNext was // initialized to 1 above (no device numbers have been allocated yet). panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err)) } anonfs := anonFilesystem{ devMinor: anonfsDevMinor, } anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs) defer anonfs.vfsfs.DecRef(ctx) anonMount, err := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{}) if err != nil { // We should not be passing any MountOptions that would cause // construction of this mount to fail. panic(fmt.Sprintf("VirtualFilesystem.Init: anonfs mount failed: %v", err)) } vfs.anonMount = anonMount return nil } // Release drops references on filesystem objects held by vfs. // // Precondition: This must be called after VFS.Init() has succeeded. func (vfs *VirtualFilesystem) Release(ctx context.Context) { vfs.anonMount.DecRef(ctx) for _, fst := range vfs.fsTypes { fst.fsType.Release(ctx) } } // PathOperation specifies the path operated on by a VFS method. // // PathOperation is passed to VFS methods by pointer to reduce memory copying: // it's somewhat large and should never escape. 
(Options structs are passed by // pointer to VFS and FileDescription methods for the same reason.) // // +stateify savable type PathOperation struct { // Root is the VFS root. References on Root are borrowed from the provider // of the PathOperation. // // Invariants: Root.Ok(). Root VirtualDentry // Start is the starting point for the path traversal. References on Start // are borrowed from the provider of the PathOperation (i.e. the caller of // the VFS method to which the PathOperation was passed). // // Invariants: Start.Ok(). If Path.Absolute, then Start == Root. Start VirtualDentry // Path is the pathname traversed by this operation. Path fspath.Path // If FollowFinalSymlink is true, and the Dentry traversed by the final // path component represents a symbolic link, the symbolic link should be // followed. FollowFinalSymlink bool } // AccessAt checks whether a user with creds has access to the file at // the given path. func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error { rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats) if err == nil { rp.Release(ctx) return nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // GetDentryAt returns a VirtualDentry representing the given path, at which a // file must exist. A reference is taken on the returned VirtualDentry. func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { rp := vfs.getResolvingPath(creds, pop) for { d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) if err == nil { vd := VirtualDentry{ mount: rp.mount, dentry: d, } rp.mount.IncRef() rp.Release(ctx) return vd, nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return VirtualDentry{}, err } } } // Preconditions: pop.Path.Begin.Ok(). func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) { rp := vfs.getResolvingPath(creds, pop) for { parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp) if err == nil { parentVD := VirtualDentry{ mount: rp.mount, dentry: parent, } rp.mount.IncRef() name := rp.Component() rp.Release(ctx) return parentVD, name, nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) return VirtualDentry{}, "", err } } } // LinkAt creates a hard link at newpop representing the existing file at // oldpop. 
func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error { oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{}) if err != nil { return err } if !newpop.Path.Begin.Ok() { oldVD.DecRef(ctx) if newpop.Path.Absolute { return linuxerr.EEXIST } return syserror.ENOENT } if newpop.FollowFinalSymlink { oldVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, newpop) for { err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) if err == nil { rp.Release(ctx) oldVD.DecRef(ctx) return nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) oldVD.DecRef(ctx) return err } } } // MkdirAt creates a directory at the given path. func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { if !pop.Path.Begin.Ok() { // pop.Path should not be empty in operations that create/delete files. // This is consistent with mkdirat(dirfd, "", mode). if pop.Path.Absolute { return linuxerr.EEXIST } return syserror.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink") return linuxerr.EINVAL } // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is // also honored." - mkdir(2) opts.Mode &= 0777 | linux.S_ISVTX rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) return nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // MknodAt creates a file of the given mode at the given path. It returns an // error from the syserror package. func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { if !pop.Path.Begin.Ok() { // pop.Path should not be empty in operations that create/delete files. // This is consistent with mknodat(dirfd, "", mode, dev). if pop.Path.Absolute { return linuxerr.EEXIST } return syserror.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink") return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) return nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // OpenAt returns a FileDescription providing access to the file at the given // path. A reference is taken on the returned FileDescription. func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { fsmetric.Opens.Increment() // Remove: // // - O_CLOEXEC, which affects file descriptors and therefore must be // handled outside of VFS. // // - Unknown flags. 
opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_LARGEFILE | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. if opts.Flags&linux.O_SYNC != 0 { opts.Flags |= linux.O_DSYNC } // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified // with O_DIRECTORY and a writable access mode (to ensure that it fails on // filesystem implementations that do not support it). if opts.Flags&linux.O_TMPFILE != 0 { if opts.Flags&linux.O_DIRECTORY == 0 { return nil, linuxerr.EINVAL } if opts.Flags&linux.O_CREAT != 0 { return nil, linuxerr.EINVAL } if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { return nil, linuxerr.EINVAL } } // O_PATH causes most other flags to be ignored. if opts.Flags&linux.O_PATH != 0 { opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH } // "On Linux, the following bits are also honored in mode: [S_ISUID, // S_ISGID, S_ISVTX]" - open(2) opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX if opts.Flags&linux.O_NOFOLLOW != 0 { pop.FollowFinalSymlink = false } rp := vfs.getResolvingPath(creds, pop) if opts.Flags&linux.O_DIRECTORY != 0 { rp.mustBeDir = true } if opts.Flags&linux.O_PATH != 0 { vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) if err != nil { return nil, err } fd := &opathFD{} if err := fd.vfsfd.Init(fd, opts.Flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{}); err != nil { return nil, err } vd.DecRef(ctx) return &fd.vfsfd, err } for { fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) if opts.FileExec { if fd.Mount().Flags.NoExec { fd.DecRef(ctx) return nil, linuxerr.EACCES } // Only a regular file can be executed. stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) if err != nil { fd.DecRef(ctx) return nil, err } if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { fd.DecRef(ctx) return nil, linuxerr.EACCES } } fd.Dentry().InotifyWithParent(ctx, linux.IN_OPEN, 0, PathEvent) return fd, nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return nil, err } } } // ReadlinkAt returns the target of the symbolic link at the given path. func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) { rp := vfs.getResolvingPath(creds, pop) for { target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) if err == nil { rp.Release(ctx) return target, nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return "", err } } } // RenameAt renames the file at oldpop to newpop. func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error { if !oldpop.Path.Begin.Ok() { if oldpop.Path.Absolute { return linuxerr.EBUSY } return syserror.ENOENT } if oldpop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink") return linuxerr.EINVAL } oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop) if err != nil { return err } if oldName == "." || oldName == ".." 
{ oldParentVD.DecRef(ctx) return linuxerr.EBUSY } if !newpop.Path.Begin.Ok() { oldParentVD.DecRef(ctx) if newpop.Path.Absolute { return linuxerr.EBUSY } return syserror.ENOENT } if newpop.FollowFinalSymlink { oldParentVD.DecRef(ctx) ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, newpop) renameOpts := *opts if oldpop.Path.Dir { renameOpts.MustBeDir = true } for { err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts) if err == nil { rp.Release(ctx) oldParentVD.DecRef(ctx) return nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) oldParentVD.DecRef(ctx) return err } } } // RmdirAt removes the directory at the given path. func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { if !pop.Path.Begin.Ok() { // pop.Path should not be empty in operations that create/delete files. // This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR). if pop.Path.Absolute { return linuxerr.EBUSY } return syserror.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink") return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.RmdirAt(ctx, rp) if err == nil { rp.Release(ctx) return nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // SetStatAt changes metadata for the file at the given path. func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error { rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) return nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // StatAt returns metadata for the file at the given path. func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { rp := vfs.getResolvingPath(creds, pop) for { stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) return stat, nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return linux.Statx{}, err } } } // StatFSAt returns metadata for the filesystem containing the file at the // given path. func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) { rp := vfs.getResolvingPath(creds, pop) for { statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) if err == nil { rp.Release(ctx) return statfs, nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return linux.Statfs{}, err } } } // SymlinkAt creates a symbolic link at the given path with the given target. func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error { if !pop.Path.Begin.Ok() { // pop.Path should not be empty in operations that create/delete files. // This is consistent with symlinkat(oldpath, newdirfd, ""). 
if pop.Path.Absolute { return linuxerr.EEXIST } return syserror.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink") return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) if err == nil { rp.Release(ctx) return nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // UnlinkAt deletes the non-directory file at the given path. func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { if !pop.Path.Begin.Ok() { // pop.Path should not be empty in operations that create/delete files. // This is consistent with unlinkat(dirfd, "", 0). if pop.Path.Absolute { return linuxerr.EBUSY } return syserror.ENOENT } if pop.FollowFinalSymlink { ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink") return linuxerr.EINVAL } rp := vfs.getResolvingPath(creds, pop) for { err := rp.mount.fs.impl.UnlinkAt(ctx, rp) if err == nil { rp.Release(ctx) return nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) return err } } } // BoundEndpointAt gets the bound endpoint at the given path, if one exists. func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) { rp := vfs.getResolvingPath(creds, pop) for { bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) return bep, nil } if checkInvariants { if rp.canHandleError(err) && rp.Done() { panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) } } if !rp.handleError(ctx, err) { rp.Release(ctx) return nil, err } } } // ListXattrAt returns all extended attribute names for the file at the given // path. func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { rp := vfs.getResolvingPath(creds, pop) for { names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size) if err == nil { rp.Release(ctx) return names, nil } if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { // Linux doesn't actually return EOPNOTSUPP in this case; instead, // fs/xattr.c:vfs_listxattr() falls back to allowing the security // subsystem to return security extended attributes, which by // default don't exist. rp.Release(ctx) return nil, nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return nil, err } } } // GetXattrAt returns the value associated with the given extended attribute // for the file at the given path. func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) { rp := vfs.getResolvingPath(creds, pop) for { val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts) if err == nil { rp.Release(ctx) return val, nil } if !rp.handleError(ctx, err) { rp.Release(ctx) return "", err } } } // SetXattrAt changes the value associated with the given extended attribute // for the file at the given path. 
func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error {
	rp := vfs.getResolvingPath(creds, pop)
	for {
		err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
		if err == nil {
			rp.Release(ctx)
			return nil
		}
		if !rp.handleError(ctx, err) {
			rp.Release(ctx)
			return err
		}
	}
}

// RemoveXattrAt removes the given extended attribute from the file at rp.
func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
	rp := vfs.getResolvingPath(creds, pop)
	for {
		err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
		if err == nil {
			rp.Release(ctx)
			return nil
		}
		if !rp.handleError(ctx, err) {
			rp.Release(ctx)
			return err
		}
	}
}

// SyncAllFilesystems has the semantics of Linux's sync(2).
func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
	var retErr error
	for fs := range vfs.getFilesystems() {
		if err := fs.impl.Sync(ctx); err != nil && retErr == nil {
			retErr = err
		}
		fs.DecRef(ctx)
	}
	return retErr
}

func (vfs *VirtualFilesystem) getFilesystems() map[*Filesystem]struct{} {
	fss := make(map[*Filesystem]struct{})
	vfs.filesystemsMu.Lock()
	defer vfs.filesystemsMu.Unlock()
	for fs := range vfs.filesystems {
		if !fs.TryIncRef() {
			continue
		}
		fss[fs] = struct{}{}
	}
	return fss
}

// MkdirAllAt recursively creates non-existent directories on the given path
// (including the last component).
func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string, root VirtualDentry, creds *auth.Credentials, mkdirOpts *MkdirOptions) error {
	pop := &PathOperation{
		Root:  root,
		Start: root,
		Path:  fspath.Parse(currentPath),
	}
	stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE})
	switch {
	case err == nil:
		if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory {
			return linuxerr.ENOTDIR
		}
		// Directory already exists.
		return nil
	case linuxerr.Equals(linuxerr.ENOENT, err):
		// Expected, we will create the dir.
	default:
		return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err)
	}

	// Recurse to ensure parent is created and then create the final directory.
	if err := vfs.MkdirAllAt(ctx, path.Dir(currentPath), root, creds, mkdirOpts); err != nil {
		return err
	}
	if err := vfs.MkdirAt(ctx, creds, pop, mkdirOpts); err != nil {
		return fmt.Errorf("failed to create directory %q: %w", currentPath, err)
	}
	return nil
}

// MakeSyntheticMountpoint creates parent directories of target if they do not
// exist and attempts to create a directory for the mountpoint. If a
// non-directory file already exists there then we allow it.
func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error {
	mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true}

	// Make sure the parent directory of target exists.
	if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts); err != nil {
		return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err)
	}

	// Attempt to mkdir the final component. If a file (of any type) exists
	// then we allow mounting on top of it because we do not require the
	// target to be an existing directory, unlike Linux mount(2).
if err := vfs.MkdirAt(ctx, creds, &PathOperation{ Root: root, Start: root, Path: fspath.Parse(target), }, mkdirOpts); err != nil && !linuxerr.Equals(linuxerr.EEXIST, err) { return fmt.Errorf("failed to create mountpoint %q: %w", target, err) } return nil } // A VirtualDentry represents a node in a VFS tree, by combining a Dentry // (which represents a node in a Filesystem's tree) and a Mount (which // represents the Filesystem's position in a VFS mount tree). // // VirtualDentry's semantics are similar to that of a Go interface object // representing a pointer: it is a copyable value type that represents // references to another entity. The zero value of VirtualDentry is an "empty // VirtualDentry", directly analogous to a nil interface object. // VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless // otherwise specified, all other VirtualDentry methods require // VirtualDentry.Ok() == true. // // Mounts and Dentries are reference-counted, requiring that users call // VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to // references on the Mount and Dentry referred to by a VirtualDentry as // references on the VirtualDentry itself. Unless otherwise specified, all // VirtualDentry methods require that a reference is held on the VirtualDentry. // // VirtualDentry is analogous to Linux's struct path. // // +stateify savable type VirtualDentry struct { mount *Mount dentry *Dentry } // MakeVirtualDentry creates a VirtualDentry. func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry { return VirtualDentry{ mount: mount, dentry: dentry, } } // Ok returns true if vd is not empty. It does not require that a reference is // held. func (vd VirtualDentry) Ok() bool { return vd.mount != nil } // IncRef increments the reference counts on the Mount and Dentry represented // by vd. func (vd VirtualDentry) IncRef() { vd.mount.IncRef() vd.dentry.IncRef() } // DecRef decrements the reference counts on the Mount and Dentry represented // by vd. func (vd VirtualDentry) DecRef(ctx context.Context) { vd.dentry.DecRef(ctx) vd.mount.DecRef(ctx) } // Mount returns the Mount associated with vd. It does not take a reference on // the returned Mount. func (vd VirtualDentry) Mount() *Mount { return vd.mount } // Dentry returns the Dentry associated with vd. It does not take a reference // on the returned Dentry. func (vd VirtualDentry) Dentry() *Dentry { return vd.dentry }
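The reference-counting contract above is easy to get wrong, so here is a minimal sketch (not from the tree) of the expected discipline around a resolved VirtualDentry; lookup stands in for any API, such as GetDentryAt, that returns a VirtualDentry with a reference held:

// resolveAndUse shows the VirtualDentry ownership discipline: whoever
// receives a VirtualDentry with a reference must DecRef it exactly once,
// and must IncRef before storing an extra copy elsewhere.
func resolveAndUse(ctx context.Context, lookup func() (VirtualDentry, error)) error {
	vd, err := lookup() // assumed to return a VirtualDentry with a reference held
	if err != nil {
		return err
	}
	defer vd.DecRef(ctx) // release our reference on all paths

	if !vd.Ok() {
		return linuxerr.ENOENT // zero value: nothing was resolved
	}

	// Storing the VirtualDentry beyond this call requires its own reference.
	saved := vd
	saved.IncRef()
	defer saved.DecRef(ctx)
	_ = saved.Mount() // Mount() and Dentry() do not transfer references
	return nil
}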
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64 arm64

package bits

// TrailingZeros64 returns the number of bits before the least significant 1
// bit in x; in other words, it returns the index of the least significant 1
// bit in x. If x is 0, TrailingZeros64 returns 64.
func TrailingZeros64(x uint64) int

// MostSignificantOne64 returns the index of the most significant 1 bit in
// x. If x is 0, MostSignificantOne64 returns 64.
func MostSignificantOne64(x uint64) int

// ForEachSetBit64 calls f once for each set bit in x, with argument i equal to
// the set bit's index.
func ForEachSetBit64(x uint64, f func(i int)) {
	for x != 0 {
		i := TrailingZeros64(x)
		f(i)
		x &^= MaskOf64(i)
	}
}
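As a usage sketch (not from the tree): ForEachSetBit64 visits bits in ascending index order, since TrailingZeros64 always finds the lowest remaining set bit and MaskOf64(i), defined elsewhere in this package, is assumed here to be 1<<i.

// collectSetBits returns the indices of the set bits of x in ascending
// order, e.g. collectSetBits(0b101001) -> [0 3 5].
func collectSetBits(x uint64) []int {
	var idxs []int
	ForEachSetBit64(x, func(i int) {
		idxs = append(idxs, i)
	})
	return idxs
}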
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ipv6

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/network/internal/ip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

var _ stack.IPNetworkEndpointStats = (*Stats)(nil)

// Stats holds statistics related to the IPv6 protocol family.
type Stats struct {
	// IP holds IPv6 statistics.
	IP tcpip.IPStats

	// ICMP holds ICMPv6 statistics.
	ICMP tcpip.ICMPv6Stats

	// UnhandledRouterAdvertisements is the number of Router Advertisements that
	// were observed but not handled.
	UnhandledRouterAdvertisements *tcpip.StatCounter
}

// IsNetworkEndpointStats implements stack.NetworkEndpointStats.
func (*Stats) IsNetworkEndpointStats() {}

// IPStats implements stack.IPNetworkEndpointStats.
func (s *Stats) IPStats() *tcpip.IPStats {
	return &s.IP
}

type sharedStats struct {
	localStats Stats
	ip         ip.MultiCounterIPStats
	icmp       multiCounterICMPv6Stats
}

// LINT.IfChange(multiCounterICMPv6PacketStats)

type multiCounterICMPv6PacketStats struct {
	echoRequest             tcpip.MultiCounterStat
	echoReply               tcpip.MultiCounterStat
	dstUnreachable          tcpip.MultiCounterStat
	packetTooBig            tcpip.MultiCounterStat
	timeExceeded            tcpip.MultiCounterStat
	paramProblem            tcpip.MultiCounterStat
	routerSolicit           tcpip.MultiCounterStat
	routerAdvert            tcpip.MultiCounterStat
	neighborSolicit         tcpip.MultiCounterStat
	neighborAdvert          tcpip.MultiCounterStat
	redirectMsg             tcpip.MultiCounterStat
	multicastListenerQuery  tcpip.MultiCounterStat
	multicastListenerReport tcpip.MultiCounterStat
	multicastListenerDone   tcpip.MultiCounterStat
}

func (m *multiCounterICMPv6PacketStats) init(a, b *tcpip.ICMPv6PacketStats) {
	m.echoRequest.Init(a.EchoRequest, b.EchoRequest)
	m.echoReply.Init(a.EchoReply, b.EchoReply)
	m.dstUnreachable.Init(a.DstUnreachable, b.DstUnreachable)
	m.packetTooBig.Init(a.PacketTooBig, b.PacketTooBig)
	m.timeExceeded.Init(a.TimeExceeded, b.TimeExceeded)
	m.paramProblem.Init(a.ParamProblem, b.ParamProblem)
	m.routerSolicit.Init(a.RouterSolicit, b.RouterSolicit)
	m.routerAdvert.Init(a.RouterAdvert, b.RouterAdvert)
	m.neighborSolicit.Init(a.NeighborSolicit, b.NeighborSolicit)
	m.neighborAdvert.Init(a.NeighborAdvert, b.NeighborAdvert)
	m.redirectMsg.Init(a.RedirectMsg, b.RedirectMsg)
	m.multicastListenerQuery.Init(a.MulticastListenerQuery, b.MulticastListenerQuery)
	m.multicastListenerReport.Init(a.MulticastListenerReport, b.MulticastListenerReport)
	m.multicastListenerDone.Init(a.MulticastListenerDone, b.MulticastListenerDone)
}

// LINT.ThenChange(../../tcpip.go:ICMPv6PacketStats)

// LINT.IfChange(multiCounterICMPv6SentPacketStats)

type multiCounterICMPv6SentPacketStats struct {
	multiCounterICMPv6PacketStats
	dropped     tcpip.MultiCounterStat
	rateLimited tcpip.MultiCounterStat
}

func (m *multiCounterICMPv6SentPacketStats) init(a, b *tcpip.ICMPv6SentPacketStats) {
	m.multiCounterICMPv6PacketStats.init(&a.ICMPv6PacketStats, &b.ICMPv6PacketStats)
	m.dropped.Init(a.Dropped, b.Dropped)
	m.rateLimited.Init(a.RateLimited, b.RateLimited)
}

// LINT.ThenChange(../../tcpip.go:ICMPv6SentPacketStats)

// LINT.IfChange(multiCounterICMPv6ReceivedPacketStats)

type multiCounterICMPv6ReceivedPacketStats struct {
	multiCounterICMPv6PacketStats
	unrecognized                   tcpip.MultiCounterStat
	invalid                        tcpip.MultiCounterStat
	routerOnlyPacketsDroppedByHost tcpip.MultiCounterStat
}

func (m *multiCounterICMPv6ReceivedPacketStats) init(a, b *tcpip.ICMPv6ReceivedPacketStats) {
	m.multiCounterICMPv6PacketStats.init(&a.ICMPv6PacketStats, &b.ICMPv6PacketStats)
	m.unrecognized.Init(a.Unrecognized, b.Unrecognized)
	m.invalid.Init(a.Invalid, b.Invalid)
	m.routerOnlyPacketsDroppedByHost.Init(a.RouterOnlyPacketsDroppedByHost, b.RouterOnlyPacketsDroppedByHost)
}

// LINT.ThenChange(../../tcpip.go:ICMPv6ReceivedPacketStats)

// LINT.IfChange(multiCounterICMPv6Stats)

type multiCounterICMPv6Stats struct {
	packetsSent     multiCounterICMPv6SentPacketStats
	packetsReceived multiCounterICMPv6ReceivedPacketStats
}

func (m *multiCounterICMPv6Stats) init(a, b *tcpip.ICMPv6Stats) {
	m.packetsSent.init(&a.PacketsSent, &b.PacketsSent)
	m.packetsReceived.init(&a.PacketsReceived, &b.PacketsReceived)
}

// LINT.ThenChange(../../tcpip.go:ICMPv6Stats)
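All of the multi-counter types above follow one pattern: init wires each MultiCounterStat to two underlying tcpip.StatCounters (the per-endpoint counter and the stack-wide aggregate), so a single increment is visible through both. A toy sketch of the pattern, assuming tcpip.MultiCounterStat.Increment bumps both wired counters:

// toyStats mirrors the layout above with a single counter.
type toyStats struct {
	echoRequest tcpip.MultiCounterStat
}

// initToyStats wires the multi-counter to a local and a global StatCounter;
// incrementing it once is then reflected in both.
func initToyStats(local, global *tcpip.StatCounter) *toyStats {
	var s toyStats
	s.echoRequest.Init(local, global)
	s.echoRequest.Increment() // assumed API: bumps local and global together
	return &s
}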
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"bytes"
	"fmt"
	"sort"
	"strings"

	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/log"
)

// EnterInitialCgroups moves t into an initial set of cgroups.
//
// Precondition: t isn't in any cgroups yet, i.e. t.cgroups is empty.
func (t *Task) EnterInitialCgroups(parent *Task) {
	var inherit map[Cgroup]struct{}
	if parent != nil {
		parent.mu.Lock()
		defer parent.mu.Unlock()
		inherit = parent.cgroups
	}
	joinSet := t.k.cgroupRegistry.computeInitialGroups(inherit)

	t.mu.Lock()
	defer t.mu.Unlock()
	// Transfer ownership of joinSet refs to the task's cgset.
	t.cgroups = joinSet
	for c := range t.cgroups {
		// Since t isn't in any cgroup yet, we can skip the check against
		// existing cgroups.
		c.Enter(t)
	}
}

// EnterCgroup moves t into c.
func (t *Task) EnterCgroup(c Cgroup) error {
	newControllers := make(map[CgroupControllerType]struct{})
	for _, ctl := range c.Controllers() {
		newControllers[ctl.Type()] = struct{}{}
	}

	t.mu.Lock()
	defer t.mu.Unlock()

	for oldCG := range t.cgroups {
		for _, oldCtl := range oldCG.Controllers() {
			if _, ok := newControllers[oldCtl.Type()]; ok {
				// Already in a cgroup with the same controller as one of the
				// new ones. Requires migration between cgroups.
				//
				// TODO(b/183137098): Implement cgroup migration.
				log.Warningf("Cgroup migration is not implemented")
				return linuxerr.EBUSY
			}
		}
	}

	// No migration required.
	t.enterCgroupLocked(c)

	return nil
}

// +checklocks:t.mu
func (t *Task) enterCgroupLocked(c Cgroup) {
	c.IncRef()
	t.cgroups[c] = struct{}{}
	c.Enter(t)
}

// +checklocks:t.mu
func (t *Task) enterCgroupIfNotYetLocked(c Cgroup) {
	if _, ok := t.cgroups[c]; ok {
		return
	}
	t.enterCgroupLocked(c)
}

// LeaveCgroups removes t from all its cgroups.
func (t *Task) LeaveCgroups() {
	t.mu.Lock()
	defer t.mu.Unlock()
	for c := range t.cgroups {
		t.leaveCgroupLocked(c)
	}
}

// +checklocks:t.mu
func (t *Task) leaveCgroupLocked(c Cgroup) {
	c.Leave(t)
	delete(t.cgroups, c)
	c.decRef()
}

// taskCgroupEntry represents a line in /proc/<pid>/cgroup, and is used to
// format a cgroup for display.
type taskCgroupEntry struct {
	hierarchyID uint32
	controllers string
	path        string
}

// GenerateProcTaskCgroup writes the contents of /proc/<pid>/cgroup for t to buf.
func (t *Task) GenerateProcTaskCgroup(buf *bytes.Buffer) {
	t.mu.Lock()
	defer t.mu.Unlock()

	cgEntries := make([]taskCgroupEntry, 0, len(t.cgroups))
	for c := range t.cgroups {
		ctls := c.Controllers()
		ctlNames := make([]string, 0, len(ctls))
		for _, ctl := range ctls {
			ctlNames = append(ctlNames, string(ctl.Type()))
		}

		cgEntries = append(cgEntries, taskCgroupEntry{
			// Note: We're guaranteed to have at least one controller, and all
			// controllers are guaranteed to be on the same hierarchy.
			hierarchyID: ctls[0].HierarchyID(),
			controllers: strings.Join(ctlNames, ","),
			path:        c.Path(),
		})
	}

	sort.Slice(cgEntries, func(i, j int) bool { return cgEntries[i].hierarchyID > cgEntries[j].hierarchyID })

	for _, cgE := range cgEntries {
		fmt.Fprintf(buf, "%d:%s:%s\n", cgE.hierarchyID, cgE.controllers, cgE.path)
	}
}
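For illustration, GenerateProcTaskCgroup emits one hierarchyID:controllers:path line per hierarchy, sorted by descending hierarchy ID, matching Linux's /proc/<pid>/cgroup format. A task in a memory hierarchy (ID 4) and a cpu,cpuacct hierarchy (ID 2) would produce something like the following (IDs and paths hypothetical):

4:memory:/user.slice
2:cpu,cpuacct:/user.slice/app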
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package proc

import (
	"bytes"
	"fmt"
	"sort"
	"strconv"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
)

func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) {
	var (
		file  *vfs.FileDescription
		flags kernel.FDFlags
	)
	t.WithMuLocked(func(t *kernel.Task) {
		if fdt := t.FDTable(); fdt != nil {
			file, flags = fdt.GetVFS2(fd)
		}
	})
	return file, flags
}

func taskFDExists(ctx context.Context, fs *filesystem, t *kernel.Task, fd int32) bool {
	file, _ := getTaskFD(t, fd)
	if file == nil {
		return false
	}
	fs.SafeDecRefFD(ctx, file)
	return true
}

// +stateify savable
type fdDir struct {
	locks vfs.FileLocks

	fs   *filesystem
	task *kernel.Task

	// When produceSymlink is set, dirents produced for the FDs are reported
	// as symlinks. Otherwise, they are reported as regular files.
	produceSymlink bool
}

// IterDirents implements kernfs.inodeDirectory.IterDirents.
func (i *fdDir) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
	var fds []int32
	i.task.WithMuLocked(func(t *kernel.Task) {
		if fdTable := t.FDTable(); fdTable != nil {
			fds = fdTable.GetFDs(ctx)
		}
	})

	typ := uint8(linux.DT_REG)
	if i.produceSymlink {
		typ = linux.DT_LNK
	}

	// Find the appropriate starting point.
	idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(relOffset) })
	if idx >= len(fds) {
		return offset, nil
	}
	for _, fd := range fds[idx:] {
		dirent := vfs.Dirent{
			Name:    strconv.FormatUint(uint64(fd), 10),
			Type:    typ,
			Ino:     i.fs.NextIno(),
			NextOff: int64(fd) + 3,
		}
		if err := cb.Handle(dirent); err != nil {
			// Getdents should iterate correctly despite mutation
			// of fds, so we return the next fd to serialize plus
			// 2 (which accounts for the "." and ".." tracked by
			// kernfs) as the offset.
			return int64(fd) + 2, err
		}
	}
	// We serialized them all. Next offset should be higher than last
	// serialized fd.
	return int64(fds[len(fds)-1]) + 3, nil
}

// fdDirInode represents the inode for /proc/[pid]/fd directory.
//
// +stateify savable
type fdDirInode struct {
	fdDir
	fdDirInodeRefs
	implStatFS
	kernfs.InodeAlwaysValid
	kernfs.InodeAttrs
	kernfs.InodeDirectoryNoNewChildren
	kernfs.InodeNotSymlink
	kernfs.InodeTemporary
	kernfs.OrderedChildren
}

var _ kernfs.Inode = (*fdDirInode)(nil)

func (fs *filesystem) newFDDirInode(ctx context.Context, task *kernel.Task) kernfs.Inode {
	inode := &fdDirInode{
		fdDir: fdDir{
			fs:             fs,
			task:           task,
			produceSymlink: true,
		},
	}
	inode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
	inode.InitRefs()
	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
	return inode
}

// IterDirents implements kernfs.inodeDirectory.IterDirents.
func (i *fdDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
	return i.fdDir.IterDirents(ctx, mnt, cb, offset, relOffset)
}

// Lookup implements kernfs.inodeDirectory.Lookup.
func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
	fdInt, err := strconv.ParseInt(name, 10, 32)
	if err != nil {
		return nil, syserror.ENOENT
	}
	fd := int32(fdInt)
	if !taskFDExists(ctx, i.fs, i.task, fd) {
		return nil, syserror.ENOENT
	}
	return i.fs.newFDSymlink(ctx, i.task, fd, i.fs.NextIno()), nil
}

// Open implements kernfs.Inode.Open.
func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
		SeekEnd: kernfs.SeekEndZero,
	})
	if err != nil {
		return nil, err
	}
	return fd.VFSFileDescription(), nil
}

// CheckPermissions implements kernfs.Inode.CheckPermissions.
//
// This is to match Linux, which uses a special permission handler to guarantee
// that a process can still access /proc/self/fd after it has executed
// setuid. See fs/proc/fd.c:proc_fd_permission.
func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
	err := i.InodeAttrs.CheckPermissions(ctx, creds, ats)
	if err == nil {
		// Access granted, no extra check needed.
		return nil
	}
	if t := kernel.TaskFromContext(ctx); t != nil {
		// Allow access if the task trying to access it is in the thread group
		// corresponding to this directory.
		if i.task.ThreadGroup() == t.ThreadGroup() {
			// Access granted (overridden).
			return nil
		}
	}
	return err
}

// DecRef implements kernfs.Inode.DecRef.
func (i *fdDirInode) DecRef(ctx context.Context) {
	i.fdDirInodeRefs.DecRef(func() { i.Destroy(ctx) })
}

// fdSymlink is a symlink for the /proc/[pid]/fd/[fd] file.
// // +stateify savable type fdSymlink struct { implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeSymlink fs *filesystem task *kernel.Task fd int32 } var _ kernfs.Inode = (*fdSymlink)(nil) func (fs *filesystem) newFDSymlink(ctx context.Context, task *kernel.Task, fd int32, ino uint64) kernfs.Inode { inode := &fdSymlink{ fs: fs, task: task, fd: fd, } inode.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777) return inode } func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { return "", syserror.ENOENT } defer s.fs.SafeDecRefFD(ctx, file) root := vfs.RootFromContext(ctx) defer s.fs.SafeDecRef(ctx, root) // Note: it's safe to reenter kernfs from Readlink if needed to resolve path. return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry()) } func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { file, _ := getTaskFD(s.task, s.fd) if file == nil { return vfs.VirtualDentry{}, "", syserror.ENOENT } defer s.fs.SafeDecRefFD(ctx, file) vd := file.VirtualDentry() vd.IncRef() return vd, "", nil } // Valid implements kernfs.Inode.Valid. func (s *fdSymlink) Valid(ctx context.Context) bool { return taskFDExists(ctx, s.fs, s.task, s.fd) } // fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory. // // +stateify savable type fdInfoDirInode struct { fdDir fdInfoDirInodeRefs implStatFS kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink kernfs.InodeTemporary kernfs.OrderedChildren } var _ kernfs.Inode = (*fdInfoDirInode)(nil) func (fs *filesystem) newFDInfoDirInode(ctx context.Context, task *kernel.Task) kernfs.Inode { inode := &fdInfoDirInode{ fdDir: fdDir{ fs: fs, task: task, }, } inode.InodeAttrs.Init(ctx, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555) inode.InitRefs() inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) return inode } // Lookup implements kernfs.inodeDirectory.Lookup. func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { fdInt, err := strconv.ParseInt(name, 10, 32) if err != nil { return nil, syserror.ENOENT } fd := int32(fdInt) if !taskFDExists(ctx, i.fs, i.task, fd) { return nil, syserror.ENOENT } data := &fdInfoData{ fs: i.fs, task: i.task, fd: fd, } return i.fs.newTaskOwnedInode(ctx, i.task, i.fs.NextIno(), 0444, data), nil } // IterDirents implements Inode.IterDirents. func (i *fdInfoDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { return i.fdDir.IterDirents(ctx, mnt, cb, offset, relOffset) } // Open implements kernfs.Inode.Open. func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndZero, }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } // DecRef implements kernfs.Inode.DecRef. func (i *fdInfoDirInode) DecRef(ctx context.Context) { i.fdInfoDirInodeRefs.DecRef(func() { i.Destroy(ctx) }) } // fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd]. 
// // +stateify savable type fdInfoData struct { kernfs.DynamicBytesFile fs *filesystem task *kernel.Task fd int32 } var _ dynamicInode = (*fdInfoData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error { file, descriptorFlags := getTaskFD(d.task, d.fd) if file == nil { return syserror.ENOENT } defer d.fs.SafeDecRefFD(ctx, file) // TODO(b/121266871): Include pos, locks, and other data. For now we only // have flags. // See https://www.kernel.org/doc/Documentation/filesystems/proc.txt flags := uint(file.StatusFlags()) | descriptorFlags.ToLinuxFileFlags() fmt.Fprintf(buf, "flags:\t0%o\n", flags) return nil } // Valid implements kernfs.Inode.Valid. func (d *fdInfoData) Valid(ctx context.Context) bool { return taskFDExists(ctx, d.fs, d.task, d.fd) }
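Per the Generate implementation above, /proc/[pid]/fdinfo/[fd] currently carries only the flags line, printed as octal with a leading zero. For a descriptor opened O_RDWR with close-on-exec set, the file's content would look roughly like this (the exact value depends on the open flags; this one is hypothetical):

flags:	02000002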
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64

package vfs2

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)

// This takes both input and output as pointer arguments to avoid copying large
// structs.
func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
	// Linux just copies fields from struct kstat without regard to struct
	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
	userns := t.UserNamespace()
	*stat = linux.Stat{
		Dev:     uint64(linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)),
		Ino:     statx.Ino,
		Nlink:   uint64(statx.Nlink),
		Mode:    uint32(statx.Mode),
		UID:     uint32(auth.KUID(statx.UID).In(userns).OrOverflow()),
		GID:     uint32(auth.KGID(statx.GID).In(userns).OrOverflow()),
		Rdev:    uint64(linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)),
		Size:    int64(statx.Size),
		Blksize: int64(statx.Blksize),
		Blocks:  int64(statx.Blocks),
		ATime:   timespecFromStatxTimestamp(statx.Atime),
		MTime:   timespecFromStatxTimestamp(statx.Mtime),
		CTime:   timespecFromStatxTimestamp(statx.Ctime),
	}
}
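A sketch of how a stat-family handler might drive this conversion; the handler below is hypothetical, but the CopyOut call mirrors the marshalling pattern used elsewhere in this document (linux.Stat is a marshallable type, and the Task serves as the copy context):

// doFstatSketch converts a previously-filled linux.Statx into the ABI
// struct the application expects, then copies it out to user memory.
func doFstatSketch(t *kernel.Task, statx *linux.Statx, addr hostarch.Addr) error {
	var stat linux.Stat
	convertStatxToUserStat(t, statx, &stat)
	_, err := stat.CopyOut(t, addr) // Task implements marshal.CopyContext
	return err
}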
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package log

import (
	"fmt"
	"os"
	"runtime"
	"strings"
	"time"
)

// GoogleEmitter is a wrapper that emits logs in a format compatible with
// package github.com/golang/glog.
type GoogleEmitter struct {
	*Writer
}

// pid is used for the threadid component of the header.
var pid = os.Getpid()

// Emit emits the message, google-style.
//
// Log lines have this form:
//	Lmmdd hh:mm:ss.uuuuuu threadid file:line] msg...
//
// where the fields are defined as follows:
//	L                A single character, representing the log level (eg 'I' for INFO)
//	mm               The month (zero padded; ie May is '05')
//	dd               The day (zero padded)
//	hh:mm:ss.uuuuuu  Time in hours, minutes and fractional seconds
//	threadid         The space-padded thread ID as returned by GetTID()
//	file             The file name
//	line             The line number
//	msg              The user-supplied message
func (g GoogleEmitter) Emit(depth int, level Level, timestamp time.Time, format string, args ...interface{}) {
	// Log level.
	prefix := byte('?')
	switch level {
	case Debug:
		prefix = byte('D')
	case Info:
		prefix = byte('I')
	case Warning:
		prefix = byte('W')
	}

	// Timestamp.
	_, month, day := timestamp.Date()
	hour, minute, second := timestamp.Clock()
	microsecond := int(timestamp.Nanosecond() / 1000)

	// 0 = this frame.
	_, file, line, ok := runtime.Caller(depth + 1)
	if ok {
		// Trim any directory path from the file.
		slash := strings.LastIndexByte(file, byte('/'))
		if slash >= 0 {
			file = file[slash+1:]
		}
	} else {
		// We don't have a filename.
		file = "???"
		line = 0
	}

	// Generate the message.
	message := fmt.Sprintf(format, args...)

	// Emit the formatted result.
	fmt.Fprintf(g.Writer, "%c%02d%02d %02d:%02d:%02d.%06d % 7d %s:%d] %s\n", prefix, int(month), day, hour, minute, second, microsecond, pid, file, line, message)
}
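Putting the format string together: a Warning logged on June 21 at 14:35:02.123456 from main.go:42 in a process with pid 1234 (all values hypothetical) would be emitted as:

W0621 14:35:02.123456    1234 main.go:42] something went wrong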
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package syscalls is the interface from the application to the kernel.
// Traditionally, syscalls is the interface that is used by applications to
// request services from the kernel of an operating system. We provide a
// user-mode kernel that needs to handle those requests coming from unmodified
// applications. Therefore, we still use the term "syscalls" to denote this
// interface.
//
// Note that the stubs in this package may merely provide the interface, not
// the actual implementation. It just makes writing syscall stubs
// straightforward.
package syscalls

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/syserror"
)

// Supported returns a syscall that is fully supported.
func Supported(name string, fn kernel.SyscallFn) kernel.Syscall {
	return kernel.Syscall{
		Name:         name,
		Fn:           fn,
		SupportLevel: kernel.SupportFull,
		Note:         "Fully Supported.",
	}
}

// PartiallySupported returns a syscall that has a partial implementation.
func PartiallySupported(name string, fn kernel.SyscallFn, note string, urls []string) kernel.Syscall {
	return kernel.Syscall{
		Name:         name,
		Fn:           fn,
		SupportLevel: kernel.SupportPartial,
		Note:         note,
		URLs:         urls,
	}
}

// Error returns a syscall handler that will always give the passed error.
func Error(name string, err error, note string, urls []string) kernel.Syscall {
	if note != "" {
		note = note + "; "
	}
	return kernel.Syscall{
		Name: name,
		Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
			return 0, nil, err
		},
		SupportLevel: kernel.SupportUnimplemented,
		Note:         fmt.Sprintf("%sReturns %q.", note, err.Error()),
		URLs:         urls,
	}
}

// ErrorWithEvent gives a syscall function that sends an unimplemented
// syscall event via the event channel and returns the passed error.
func ErrorWithEvent(name string, err error, note string, urls []string) kernel.Syscall {
	if note != "" {
		note = note + "; "
	}
	return kernel.Syscall{
		Name: name,
		Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
			t.Kernel().EmitUnimplementedEvent(t)
			return 0, nil, err
		},
		SupportLevel: kernel.SupportUnimplemented,
		Note:         fmt.Sprintf("%sReturns %q.", note, err.Error()),
		URLs:         urls,
	}
}

// CapError gives a syscall function that checks for capability c. If the task
// has the capability, it returns ENOSYS, otherwise EPERM. To unprivileged
// tasks, it will seem like there is an implementation.
func CapError(name string, c linux.Capability, note string, urls []string) kernel.Syscall { if note != "" { note = note + "; " } return kernel.Syscall{ Name: name, Fn: func(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if !t.HasCapability(c) { return 0, nil, linuxerr.EPERM } t.Kernel().EmitUnimplementedEvent(t) return 0, nil, syserror.ENOSYS }, SupportLevel: kernel.SupportUnimplemented, Note: fmt.Sprintf("%sReturns %q if the process does not have %s; %q otherwise.", note, linuxerr.EPERM, c.String(), syserror.ENOSYS), URLs: urls, } }
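A sketch of how these constructors are consumed when building a syscall table; the table fragment and the Getpid handler are hypothetical, but the constructor signatures match the ones defined above:

// exampleEntries shows the intent of each constructor: full support for
// getpid, and a capability-gated stub for syslog.
var exampleEntries = map[uintptr]kernel.Syscall{
	39:  Supported("getpid", Getpid), // Getpid is a hypothetical kernel.SyscallFn
	103: CapError("syslog", linux.CAP_SYSLOG, "", nil),
}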
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64

package kernel

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// ptraceArch implements arch-specific ptrace commands.
func (t *Task) ptraceArch(target *Task, req int64, addr, data hostarch.Addr) error {
	switch req {
	case linux.PTRACE_PEEKUSR: // aka PTRACE_PEEKUSER
		n, err := target.Arch().PtracePeekUser(uintptr(addr))
		if err != nil {
			return err
		}
		_, err = n.CopyOut(t, data)
		return err

	case linux.PTRACE_POKEUSR: // aka PTRACE_POKEUSER
		return target.Arch().PtracePokeUser(uintptr(addr), uintptr(data))

	case linux.PTRACE_GETREGS:
		// "Copy the tracee's general-purpose ... registers ... to the address
		// data in the tracer. ... (addr is ignored.) Note that SPARC systems
		// have the meaning of data and addr reversed ..."
		_, err := target.Arch().PtraceGetRegs(&usermem.IOReadWriter{
			Ctx:  t,
			IO:   t.MemoryManager(),
			Addr: data,
			Opts: usermem.IOOpts{
				AddressSpaceActive: true,
			},
		})
		return err

	case linux.PTRACE_GETFPREGS:
		s := target.Arch().FloatingPointData()
		_, err := s.PtraceGetFPRegs(&usermem.IOReadWriter{
			Ctx:  t,
			IO:   t.MemoryManager(),
			Addr: data,
			Opts: usermem.IOOpts{
				AddressSpaceActive: true,
			},
		}, len(*s))
		return err

	case linux.PTRACE_SETREGS:
		_, err := target.Arch().PtraceSetRegs(&usermem.IOReadWriter{
			Ctx:  t,
			IO:   t.MemoryManager(),
			Addr: data,
			Opts: usermem.IOOpts{
				AddressSpaceActive: true,
			},
		})
		return err

	case linux.PTRACE_SETFPREGS:
		s := target.Arch().FloatingPointData()
		_, err := s.PtraceSetFPRegs(&usermem.IOReadWriter{
			Ctx:  t,
			IO:   t.MemoryManager(),
			Addr: data,
			Opts: usermem.IOOpts{
				AddressSpaceActive: true,
			},
		}, len(*s))
		return err

	default:
		return syserror.EIO
	}
}
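For context, the cases above are the kernel-side halves of requests a tracer issues from userspace. A minimal tracer-side sketch using golang.org/x/sys/unix, assuming the caller has already PTRACE_ATTACHed to pid and waited for it to stop:

import "golang.org/x/sys/unix"

// readRegsSketch fetches the tracee's general-purpose registers; inside the
// sentry this request is serviced by the PTRACE_GETREGS arm above.
func readRegsSketch(pid int) (unix.PtraceRegs, error) {
	var regs unix.PtraceRegs
	err := unix.PtraceGetRegs(pid, &regs)
	return regs, err
}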
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package stack

import (
	"gvisor.dev/gvisor/pkg/tcpip"
)

type sharedStats struct {
	local tcpip.NICStats
	multiCounterNICStats
}

// LINT.IfChange(multiCounterNICPacketStats)

type multiCounterNICPacketStats struct {
	packets tcpip.MultiCounterStat
	bytes   tcpip.MultiCounterStat
}

func (m *multiCounterNICPacketStats) init(a, b *tcpip.NICPacketStats) {
	m.packets.Init(a.Packets, b.Packets)
	m.bytes.Init(a.Bytes, b.Bytes)
}

// LINT.ThenChange(../../tcpip.go:NICPacketStats)

// LINT.IfChange(multiCounterNICNeighborStats)

type multiCounterNICNeighborStats struct {
	unreachableEntryLookups tcpip.MultiCounterStat
}

func (m *multiCounterNICNeighborStats) init(a, b *tcpip.NICNeighborStats) {
	m.unreachableEntryLookups.Init(a.UnreachableEntryLookups, b.UnreachableEntryLookups)
}

// LINT.ThenChange(../../tcpip.go:NICNeighborStats)

// LINT.IfChange(multiCounterNICStats)

type multiCounterNICStats struct {
	unknownL3ProtocolRcvdPackets tcpip.MultiCounterStat
	unknownL4ProtocolRcvdPackets tcpip.MultiCounterStat
	malformedL4RcvdPackets       tcpip.MultiCounterStat
	tx                           multiCounterNICPacketStats
	rx                           multiCounterNICPacketStats
	disabledRx                   multiCounterNICPacketStats
	neighbor                     multiCounterNICNeighborStats
}

func (m *multiCounterNICStats) init(a, b *tcpip.NICStats) {
	m.unknownL3ProtocolRcvdPackets.Init(a.UnknownL3ProtocolRcvdPackets, b.UnknownL3ProtocolRcvdPackets)
	m.unknownL4ProtocolRcvdPackets.Init(a.UnknownL4ProtocolRcvdPackets, b.UnknownL4ProtocolRcvdPackets)
	m.malformedL4RcvdPackets.Init(a.MalformedL4RcvdPackets, b.MalformedL4RcvdPackets)
	m.tx.init(&a.Tx, &b.Tx)
	m.rx.init(&a.Rx, &b.Rx)
	m.disabledRx.init(&a.DisabledRx, &b.DisabledRx)
	m.neighbor.init(&a.Neighbor, &b.Neighbor)
}

// LINT.ThenChange(../../tcpip.go:NICStats)
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package primitive defines marshal.Marshallable implementations for primitive
// types.
package primitive

import (
	"io"

	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal"
)

// Int8 is a marshal.Marshallable implementation for int8.
//
// +marshal slice:Int8Slice:inner
type Int8 int8

// Uint8 is a marshal.Marshallable implementation for uint8.
//
// +marshal slice:Uint8Slice:inner
type Uint8 uint8

// Int16 is a marshal.Marshallable implementation for int16.
//
// +marshal slice:Int16Slice:inner
type Int16 int16

// Uint16 is a marshal.Marshallable implementation for uint16.
//
// +marshal slice:Uint16Slice:inner
type Uint16 uint16

// Int32 is a marshal.Marshallable implementation for int32.
//
// +marshal slice:Int32Slice:inner
type Int32 int32

// Uint32 is a marshal.Marshallable implementation for uint32.
//
// +marshal slice:Uint32Slice:inner
type Uint32 uint32

// Int64 is a marshal.Marshallable implementation for int64.
//
// +marshal slice:Int64Slice:inner
type Int64 int64

// Uint64 is a marshal.Marshallable implementation for uint64.
//
// +marshal slice:Uint64Slice:inner
type Uint64 uint64

// ByteSlice is a marshal.Marshallable implementation for []byte.
// This is a convenience wrapper around a dynamically sized type, and can't be
// embedded in other marshallable types because it breaks assumptions made by
// go-marshal internals.
It violates the "no dynamically-sized types" // constraint of the go-marshal library. type ByteSlice []byte // SizeBytes implements marshal.Marshallable.SizeBytes. func (b *ByteSlice) SizeBytes() int { return len(*b) } // MarshalBytes implements marshal.Marshallable.MarshalBytes. func (b *ByteSlice) MarshalBytes(dst []byte) { copy(dst, *b) } // UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes. func (b *ByteSlice) UnmarshalBytes(src []byte) { copy(*b, src) } // Packed implements marshal.Marshallable.Packed. func (b *ByteSlice) Packed() bool { return false } // MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe. func (b *ByteSlice) MarshalUnsafe(dst []byte) { b.MarshalBytes(dst) } // UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe. func (b *ByteSlice) UnmarshalUnsafe(src []byte) { b.UnmarshalBytes(src) } // CopyIn implements marshal.Marshallable.CopyIn. func (b *ByteSlice) CopyIn(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return cc.CopyInBytes(addr, *b) } // CopyOut implements marshal.Marshallable.CopyOut. func (b *ByteSlice) CopyOut(cc marshal.CopyContext, addr hostarch.Addr) (int, error) { return cc.CopyOutBytes(addr, *b) } // CopyOutN implements marshal.Marshallable.CopyOutN. func (b *ByteSlice) CopyOutN(cc marshal.CopyContext, addr hostarch.Addr, limit int) (int, error) { return cc.CopyOutBytes(addr, (*b)[:limit]) } // WriteTo implements io.WriterTo.WriteTo. func (b *ByteSlice) WriteTo(w io.Writer) (int64, error) { n, err := w.Write(*b) return int64(n), err } var _ marshal.Marshallable = (*ByteSlice)(nil) // The following set of functions are convenient shorthands for wrapping a // built-in type in a marshallable primitive type. For example: // // func useMarshallable(m marshal.Marshallable) { ... } // // // Compare: // // buf = []byte{...} // // useMarshallable(&primitive.ByteSlice(buf)) // Not allowed, can't address temp value. // bufP := primitive.ByteSlice(buf) // useMarshallable(&bufP) // // // Vs: // // useMarshallable(AsByteSlice(buf)) // // Note that the argument to these function escapes, so avoid using them on very // hot code paths. But generally if a function accepts an interface as an // argument, the argument escapes anyways. // AllocateInt8 returns x as a marshallable. func AllocateInt8(x int8) marshal.Marshallable { p := Int8(x) return &p } // AllocateUint8 returns x as a marshallable. func AllocateUint8(x uint8) marshal.Marshallable { p := Uint8(x) return &p } // AllocateInt16 returns x as a marshallable. func AllocateInt16(x int16) marshal.Marshallable { p := Int16(x) return &p } // AllocateUint16 returns x as a marshallable. func AllocateUint16(x uint16) marshal.Marshallable { p := Uint16(x) return &p } // AllocateInt32 returns x as a marshallable. func AllocateInt32(x int32) marshal.Marshallable { p := Int32(x) return &p } // AllocateUint32 returns x as a marshallable. func AllocateUint32(x uint32) marshal.Marshallable { p := Uint32(x) return &p } // AllocateInt64 returns x as a marshallable. func AllocateInt64(x int64) marshal.Marshallable { p := Int64(x) return &p } // AllocateUint64 returns x as a marshallable. func AllocateUint64(x uint64) marshal.Marshallable { p := Uint64(x) return &p } // AsByteSlice returns b as a marshallable. Note that this allocates a new slice // header, but does not copy the slice contents. 
func AsByteSlice(b []byte) marshal.Marshallable { bs := ByteSlice(b) return &bs } // Below, we define some convenience functions for marshalling primitive types // using the newtypes above, without requiring superfluous casts. // 8-bit integers // CopyInt8In is a convenient wrapper for copying in an int8 from the task's // memory. func CopyInt8In(cc marshal.CopyContext, addr hostarch.Addr, dst *int8) (int, error) { var buf Int8 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = int8(buf) return n, nil } // CopyInt8Out is a convenient wrapper for copying out an int8 to the task's // memory. func CopyInt8Out(cc marshal.CopyContext, addr hostarch.Addr, src int8) (int, error) { srcP := Int8(src) return srcP.CopyOut(cc, addr) } // CopyUint8In is a convenient wrapper for copying in a uint8 from the task's // memory. func CopyUint8In(cc marshal.CopyContext, addr hostarch.Addr, dst *uint8) (int, error) { var buf Uint8 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = uint8(buf) return n, nil } // CopyUint8Out is a convenient wrapper for copying out a uint8 to the task's // memory. func CopyUint8Out(cc marshal.CopyContext, addr hostarch.Addr, src uint8) (int, error) { srcP := Uint8(src) return srcP.CopyOut(cc, addr) } // 16-bit integers // CopyInt16In is a convenient wrapper for copying in an int16 from the task's // memory. func CopyInt16In(cc marshal.CopyContext, addr hostarch.Addr, dst *int16) (int, error) { var buf Int16 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = int16(buf) return n, nil } // CopyInt16Out is a convenient wrapper for copying out an int16 to the task's // memory. func CopyInt16Out(cc marshal.CopyContext, addr hostarch.Addr, src int16) (int, error) { srcP := Int16(src) return srcP.CopyOut(cc, addr) } // CopyUint16In is a convenient wrapper for copying in a uint16 from the task's // memory. func CopyUint16In(cc marshal.CopyContext, addr hostarch.Addr, dst *uint16) (int, error) { var buf Uint16 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = uint16(buf) return n, nil } // CopyUint16Out is a convenient wrapper for copying out a uint16 to the task's // memory. func CopyUint16Out(cc marshal.CopyContext, addr hostarch.Addr, src uint16) (int, error) { srcP := Uint16(src) return srcP.CopyOut(cc, addr) } // 32-bit integers // CopyInt32In is a convenient wrapper for copying in an int32 from the task's // memory. func CopyInt32In(cc marshal.CopyContext, addr hostarch.Addr, dst *int32) (int, error) { var buf Int32 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = int32(buf) return n, nil } // CopyInt32Out is a convenient wrapper for copying out an int32 to the task's // memory. func CopyInt32Out(cc marshal.CopyContext, addr hostarch.Addr, src int32) (int, error) { srcP := Int32(src) return srcP.CopyOut(cc, addr) } // CopyUint32In is a convenient wrapper for copying in a uint32 from the task's // memory. func CopyUint32In(cc marshal.CopyContext, addr hostarch.Addr, dst *uint32) (int, error) { var buf Uint32 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = uint32(buf) return n, nil } // CopyUint32Out is a convenient wrapper for copying out a uint32 to the task's // memory. func CopyUint32Out(cc marshal.CopyContext, addr hostarch.Addr, src uint32) (int, error) { srcP := Uint32(src) return srcP.CopyOut(cc, addr) } // 64-bit integers // CopyInt64In is a convenient wrapper for copying in an int64 from the task's // memory. 
func CopyInt64In(cc marshal.CopyContext, addr hostarch.Addr, dst *int64) (int, error) { var buf Int64 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = int64(buf) return n, nil } // CopyInt64Out is a convenient wrapper for copying out an int64 to the task's // memory. func CopyInt64Out(cc marshal.CopyContext, addr hostarch.Addr, src int64) (int, error) { srcP := Int64(src) return srcP.CopyOut(cc, addr) } // CopyUint64In is a convenient wrapper for copying in a uint64 from the task's // memory. func CopyUint64In(cc marshal.CopyContext, addr hostarch.Addr, dst *uint64) (int, error) { var buf Uint64 n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = uint64(buf) return n, nil } // CopyUint64Out is a convenient wrapper for copying out a uint64 to the task's // memory. func CopyUint64Out(cc marshal.CopyContext, addr hostarch.Addr, src uint64) (int, error) { srcP := Uint64(src) return srcP.CopyOut(cc, addr) } // CopyByteSliceIn is a convenient wrapper for copying in a []byte from the // task's memory. func CopyByteSliceIn(cc marshal.CopyContext, addr hostarch.Addr, dst *[]byte) (int, error) { var buf ByteSlice n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = []byte(buf) return n, nil } // CopyByteSliceOut is a convenient wrapper for copying out a []byte to the // task's memory. func CopyByteSliceOut(cc marshal.CopyContext, addr hostarch.Addr, src []byte) (int, error) { srcP := ByteSlice(src) return srcP.CopyOut(cc, addr) } // CopyStringIn is a convenient wrapper for copying in a string from the // task's memory. func CopyStringIn(cc marshal.CopyContext, addr hostarch.Addr, dst *string) (int, error) { var buf ByteSlice n, err := buf.CopyIn(cc, addr) if err != nil { return n, err } *dst = string(buf) return n, nil } // CopyStringOut is a convenient wrapper for copying out a string to the task's // memory. func CopyStringOut(cc marshal.CopyContext, addr hostarch.Addr, src string) (int, error) { srcP := ByteSlice(src) return srcP.CopyOut(cc, addr) }
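A sketch of the helpers above in use; the length-prefixed layout is invented for illustration, but every call is to an API defined in this file (cc would typically be the Task):

// readNameLen copies a length-prefixed name out of the tracee: first a
// uint32 length at addr, then that many bytes of string at addr+4.
func readNameLen(cc marshal.CopyContext, addr hostarch.Addr) (string, error) {
	var nameLen uint32
	if _, err := CopyUint32In(cc, addr, &nameLen); err != nil {
		return "", err
	}
	// ByteSlice.CopyIn reads exactly len(bs) bytes into the backing slice.
	bs := ByteSlice(make([]byte, nameLen))
	if _, err := bs.CopyIn(cc, addr+4); err != nil {
		return "", err
	}
	return string(bs), nil
}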
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package limits

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
)

// FromLinuxResource maps linux resources to sentry LimitTypes.
var FromLinuxResource = map[int]LimitType{
	linux.RLIMIT_CPU:        CPU,
	linux.RLIMIT_FSIZE:      FileSize,
	linux.RLIMIT_DATA:       Data,
	linux.RLIMIT_STACK:      Stack,
	linux.RLIMIT_CORE:       Core,
	linux.RLIMIT_RSS:        Rss,
	linux.RLIMIT_NPROC:      ProcessCount,
	linux.RLIMIT_NOFILE:     NumberOfFiles,
	linux.RLIMIT_MEMLOCK:    MemoryLocked,
	linux.RLIMIT_AS:         AS,
	linux.RLIMIT_LOCKS:      Locks,
	linux.RLIMIT_SIGPENDING: SignalsPending,
	linux.RLIMIT_MSGQUEUE:   MessageQueueBytes,
	linux.RLIMIT_NICE:       Nice,
	linux.RLIMIT_RTPRIO:     RealTimePriority,
	linux.RLIMIT_RTTIME:     Rttime,
}

// FromLinux maps linux rlimit values to sentry Limits, being careful to handle
// infinities.
func FromLinux(rl uint64) uint64 {
	if rl == linux.RLimInfinity {
		return Infinity
	}
	return rl
}

// ToLinux maps sentry Limits to linux rlimit values, being careful to handle
// infinities.
func ToLinux(l uint64) uint64 {
	if l == Infinity {
		return linux.RLimInfinity
	}
	return l
}

// NewLinuxLimitSet returns a LimitSet whose values match the default rlimits
// in Linux.
func NewLinuxLimitSet() (*LimitSet, error) {
	ls := NewLimitSet()
	for rlt, rl := range linux.InitRLimits {
		lt, ok := FromLinuxResource[rlt]
		if !ok {
			return nil, fmt.Errorf("unknown rlimit type %v", rlt)
		}
		ls.SetUnchecked(lt, Limit{
			Cur: FromLinux(rl.Cur),
			Max: FromLinux(rl.Max),
		})
	}
	return ls, nil
}

// NewLinuxDistroLimitSet returns a new LimitSet whose values are typical
// for a booted Linux distro.
//
// Many Linux init systems adjust the default Linux limits to values more
// expected by the rest of the userspace. NewLinuxDistroLimitSet returns a
// LimitSet with sensible defaults for applications that aren't starting
// their own init system.
func NewLinuxDistroLimitSet() (*LimitSet, error) {
	ls, err := NewLinuxLimitSet()
	if err != nil {
		return nil, err
	}
	// Adjust ProcessCount to a lower value because GNU bash allocates 16
	// bytes per proc and OOMs if this number is set too high. Value was
	// picked arbitrarily.
	//
	// 1,048,576 ought to be enough for anyone.
	l := ls.Get(ProcessCount)
	l.Cur = 1 << 20
	ls.Set(ProcessCount, l, true /* privileged */)
	return ls, nil
}
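A short usage sketch (illustrative only) that builds the distro-style set and then tightens one limit via the same Get/Set pair used above:

// distroLimits builds the distro-style LimitSet and then lowers the
// open-files soft limit, mirroring what a container runtime might do.
func distroLimits() (*LimitSet, error) {
	ls, err := NewLinuxDistroLimitSet()
	if err != nil {
		return nil, err
	}
	l := ls.Get(NumberOfFiles)
	l.Cur = 1024 // hypothetical soft cap
	ls.Set(NumberOfFiles, l, true /* privileged */)
	return ls, nil
}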
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sync"
)

// Dentry represents a node in a Filesystem tree at which a file exists.
//
// Dentries are reference-counted. Unless otherwise specified, all Dentry
// methods require that a reference is held.
//
// Dentry is loosely analogous to Linux's struct dentry, but:
//
// - VFS does not associate Dentries with inodes. gVisor interacts primarily
// with filesystems that are accessed through filesystem APIs (as opposed to
// raw block devices); many such APIs support only paths and file descriptors,
// and not inodes. Furthermore, when parties outside the scope of VFS can
// rename inodes on such filesystems, VFS generally cannot "follow" the rename,
// both due to synchronization issues and because it may not even be able to
// name the destination path; this implies that it would in fact be incorrect
// for Dentries to be associated with inodes on such filesystems. Consequently,
// operations that are inode operations in Linux are FilesystemImpl methods
// and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do
// support inodes may store appropriate state in implementations of DentryImpl.
//
// - VFS does not require that Dentries are instantiated for all paths accessed
// through VFS, only those that are tracked beyond the scope of a single
// Filesystem operation. This includes file descriptions, mount points, mount
// roots, process working directories, and chroots.
This avoids instantiation // of Dentries for operations on mutable remote filesystems that can't actually // cache any state in the Dentry. // // - VFS does not track filesystem structure (i.e. relationships between // Dentries), since both the relevant state and synchronization are // filesystem-specific. // // - For the reasons above, VFS is not directly responsible for managing Dentry // lifetime. Dentry reference counts only indicate the extent to which VFS // requires Dentries to exist; Filesystems may elect to cache or discard // Dentries with zero references. // // +stateify savable type Dentry struct { // mu synchronizes deletion/invalidation and mounting over this Dentry. mu sync.Mutex `state:"nosave"` // dead is true if the file represented by this Dentry has been deleted (by // CommitDeleteDentry or CommitRenameReplaceDentry) or invalidated (by // InvalidateDentry). dead is protected by mu. dead bool // mounts is the number of Mounts for which this Dentry is Mount.point. // mounts is accessed using atomic memory operations. mounts uint32 // impl is the DentryImpl associated with this Dentry. impl is immutable. // This should be the last field in Dentry. impl DentryImpl } // Init must be called before first use of d. func (d *Dentry) Init(impl DentryImpl) { d.impl = impl } // Impl returns the DentryImpl associated with d. func (d *Dentry) Impl() DentryImpl { return d.impl } // DentryImpl contains implementation details for a Dentry. Implementations of // DentryImpl should contain their associated Dentry by value as their first // field. // // +stateify savable type DentryImpl interface { // IncRef increments the Dentry's reference count. A Dentry with a non-zero // reference count must remain coherent with the state of the filesystem. IncRef() // TryIncRef increments the Dentry's reference count and returns true. If // the Dentry's reference count is zero, TryIncRef may do nothing and // return false. (It is also permitted to succeed if it can restore the // guarantee that the Dentry is coherent with the state of the filesystem.) // // TryIncRef does not require that a reference is held on the Dentry. TryIncRef() bool // DecRef decrements the Dentry's reference count. DecRef(ctx context.Context) // InotifyWithParent notifies all watches on the targets represented by this // dentry and its parent. The parent's watches are notified first, followed // by this dentry's. // // InotifyWithParent automatically adds the IN_ISDIR flag for dentries // representing directories. // // Note that the events may not actually propagate up to the user, depending // on the event masks. InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) // Watches returns the set of inotify watches for the file corresponding to // the Dentry. Dentries that are hard links to the same underlying file // share the same watches. // // Watches may return nil if the dentry belongs to a FilesystemImpl that // does not support inotify. If an implementation returns a non-nil watch // set, it must always return a non-nil watch set. Likewise, if an // implementation returns a nil watch set, it must always return a nil watch // set. // // The caller does not need to hold a reference on the dentry. Watches() *Watches // OnZeroWatches is called whenever the number of watches on a dentry drops // to zero. This is needed by some FilesystemImpls (e.g. gofer) to manage // dentry lifetime. // // The caller does not need to hold a reference on the dentry. 
OnZeroWatches // may acquire inotify locks, so to prevent deadlock, no inotify locks should // be held by the caller. OnZeroWatches(ctx context.Context) } // IncRef increments d's reference count. func (d *Dentry) IncRef() { d.impl.IncRef() } // TryIncRef increments d's reference count and returns true. If d's reference // count is zero, TryIncRef may instead do nothing and return false. func (d *Dentry) TryIncRef() bool { return d.impl.TryIncRef() } // DecRef decrements d's reference count. func (d *Dentry) DecRef(ctx context.Context) { d.impl.DecRef(ctx) } // IsDead returns true if d has been deleted or invalidated by its owning // filesystem. func (d *Dentry) IsDead() bool { d.mu.Lock() defer d.mu.Unlock() return d.dead } func (d *Dentry) isMounted() bool { return atomic.LoadUint32(&d.mounts) != 0 } // InotifyWithParent notifies all watches on the targets represented by d and // its parent of events. func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) { d.impl.InotifyWithParent(ctx, events, cookie, et) } // Watches returns the set of inotify watches associated with d. // // Watches will return nil if d belongs to a FilesystemImpl that does not // support inotify. func (d *Dentry) Watches() *Watches { return d.impl.Watches() } // OnZeroWatches performs cleanup tasks whenever the number of watches on a // dentry drops to zero. func (d *Dentry) OnZeroWatches(ctx context.Context) { d.impl.OnZeroWatches(ctx) } // The following functions are exported so that filesystem implementations can // use them. The vfs package, and users of VFS, should not call these // functions. // PrepareDeleteDentry must be called before attempting to delete the file // represented by d. If PrepareDeleteDentry succeeds, the caller must call // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome. // +checklocksacquire:d.mu func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dentry) error { vfs.mountMu.Lock() if mntns.mountpoints[d] != 0 { vfs.mountMu.Unlock() return linuxerr.EBUSY // +checklocksforce: inconsistent return. } d.mu.Lock() vfs.mountMu.Unlock() // Return with d.mu locked to block attempts to mount over it; it will be // unlocked by AbortDeleteDentry or CommitDeleteDentry. return nil } // AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion // fails. // +checklocksrelease:d.mu func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) { d.mu.Unlock() } // CommitDeleteDentry must be called after PrepareDeleteDentry if the deletion // succeeds. // +checklocksrelease:d.mu func (vfs *VirtualFilesystem) CommitDeleteDentry(ctx context.Context, d *Dentry) { d.dead = true d.mu.Unlock() if d.isMounted() { vfs.forgetDeadMountpoint(ctx, d) } } // InvalidateDentry is called when d ceases to represent the file it formerly // did for reasons outside of VFS' control (e.g. d represents the local state // of a file on a remote filesystem on which the file has already been // deleted). func (vfs *VirtualFilesystem) InvalidateDentry(ctx context.Context, d *Dentry) { d.mu.Lock() d.dead = true d.mu.Unlock() if d.isMounted() { vfs.forgetDeadMountpoint(ctx, d) } } // PrepareRenameDentry must be called before attempting to rename the file // represented by from. If to is not nil, it represents the file that will be // replaced or exchanged by the rename. 
If PrepareRenameDentry succeeds, the // caller must call AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry depending on the rename's outcome. // // Preconditions: // * If to is not nil, it must be a child Dentry from the same Filesystem. // * from != to. // +checklocksacquire:from.mu // +checklocksacquire:to.mu func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error { vfs.mountMu.Lock() if mntns.mountpoints[from] != 0 { vfs.mountMu.Unlock() return linuxerr.EBUSY // +checklocksforce: no locks acquired. } if to != nil { if mntns.mountpoints[to] != 0 { vfs.mountMu.Unlock() return linuxerr.EBUSY // +checklocksforce: no locks acquired. } to.mu.Lock() } from.mu.Lock() vfs.mountMu.Unlock() // Return with from.mu and to.mu locked, which will be unlocked by // AbortRenameDentry, CommitRenameReplaceDentry, or // CommitRenameExchangeDentry. return nil // +checklocksforce: to may not be acquired. } // AbortRenameDentry must be called after PrepareRenameDentry if the rename // fails. // +checklocksrelease:from.mu // +checklocksrelease:to.mu func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) { from.mu.Unlock() if to != nil { to.mu.Unlock() } } // CommitRenameReplaceDentry must be called after the file represented by from // is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file // that was replaced by from. // // Preconditions: PrepareRenameDentry was previously called on from and to. // +checklocksrelease:from.mu // +checklocksrelease:to.mu func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(ctx context.Context, from, to *Dentry) { from.mu.Unlock() if to != nil { to.dead = true to.mu.Unlock() if to.isMounted() { vfs.forgetDeadMountpoint(ctx, to) } } } // CommitRenameExchangeDentry must be called after the files represented by // from and to are exchanged by rename(RENAME_EXCHANGE). // // Preconditions: PrepareRenameDentry was previously called on from and to. // +checklocksrelease:from.mu // +checklocksrelease:to.mu func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) { from.mu.Unlock() to.mu.Unlock() } // forgetDeadMountpoint is called when a mount point is deleted or invalidated // to umount all mounts using it in all other mount namespaces. // // forgetDeadMountpoint is analogous to Linux's // fs/namespace.c:__detach_mounts(). func (vfs *VirtualFilesystem) forgetDeadMountpoint(ctx context.Context, d *Dentry) { var ( vdsToDecRef []VirtualDentry mountsToDecRef []*Mount ) vfs.mountMu.Lock() vfs.mounts.seq.BeginWrite() for mnt := range vfs.mountpoints[d] { vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(mnt, &umountRecursiveOptions{}, vdsToDecRef, mountsToDecRef) } vfs.mounts.seq.EndWrite() vfs.mountMu.Unlock() for _, vd := range vdsToDecRef { vd.DecRef(ctx) } for _, mnt := range mountsToDecRef { mnt.DecRef(ctx) } }
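// Editor's example (illustrative sketch, not part of the original sources):
// a minimal DentryImpl built on a plain atomic reference count, to make the
// interface contract above concrete. All names here (simpleDentry,
// newSimpleDentry) are hypothetical; real implementations such as tmpfs or
// gofer carry far more state and may cache dentries at zero references
// rather than destroying them.
package example

import (
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

type simpleDentry struct {
	// vfsd must be the first field, per the DentryImpl contract above.
	vfsd vfs.Dentry

	// refs is accessed using atomic memory operations. A new dentry starts
	// with one reference.
	refs int64
}

func newSimpleDentry() *simpleDentry {
	d := &simpleDentry{refs: 1}
	d.vfsd.Init(d)
	return d
}

// IncRef implements vfs.DentryImpl.IncRef.
func (d *simpleDentry) IncRef() {
	atomic.AddInt64(&d.refs, 1)
}

// TryIncRef implements vfs.DentryImpl.TryIncRef. It only succeeds while
// another reference is still held, so the dentry remains coherent.
func (d *simpleDentry) TryIncRef() bool {
	for {
		refs := atomic.LoadInt64(&d.refs)
		if refs == 0 {
			return false
		}
		if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
			return true
		}
	}
}

// DecRef implements vfs.DentryImpl.DecRef.
func (d *simpleDentry) DecRef(ctx context.Context) {
	if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
		// Release any filesystem-specific resources here.
	} else if refs < 0 {
		panic("simpleDentry.DecRef() called without holding a reference")
	}
}

// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. This sketch
// does not support inotify, so it is a no-op.
func (d *simpleDentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
}

// Watches implements vfs.DentryImpl.Watches. Returning nil is permitted for
// filesystems without inotify support, and must then be done consistently.
func (d *simpleDentry) Watches() *vfs.Watches {
	return nil
}

// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
func (d *simpleDentry) OnZeroWatches(ctx context.Context) {}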
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package buffer

import (
	"fmt"
	"io"
)

// Buffer is an alias to View.
type Buffer = View

// View is a non-linear buffer.
//
// All methods are thread compatible.
//
// +stateify savable
type View struct {
	data bufferList
	size int64
	pool pool
}

// TrimFront removes the first count bytes from the buffer.
func (v *View) TrimFront(count int64) {
	if count >= v.size {
		v.advanceRead(v.size)
	} else {
		v.advanceRead(count)
	}
}

// Remove deletes data at the specified location in v. It returns false if the
// specified range does not fully reside in v.
func (v *View) Remove(offset, length int) bool { if offset < 0 || length < 0 { return false } tgt := Range{begin: offset, end: offset + length} if tgt.Len() != tgt.Intersect(Range{end: int(v.size)}).Len() { return false } // Scan through each buffer and remove intersections. var curr Range for buf := v.data.Front(); buf != nil; { origLen := buf.ReadSize() curr.end = curr.begin + origLen if x := curr.Intersect(tgt); x.Len() > 0 { if !buf.Remove(x.Offset(-curr.begin)) { panic("buf.Remove() failed") } if buf.ReadSize() == 0 { // buf fully removed, removing it from the list. oldBuf := buf buf = buf.Next() v.data.Remove(oldBuf) v.pool.put(oldBuf) } else { // Only partial data intersects, moving on to next one. buf = buf.Next() } v.size -= int64(x.Len()) } else { // This buffer is not in range, moving on to next one. buf = buf.Next() } curr.begin += origLen if curr.begin >= tgt.end { break } } return true } // ReadAt implements io.ReaderAt.ReadAt. func (v *View) ReadAt(p []byte, offset int64) (int, error) { var ( skipped int64 done int64 ) for buf := v.data.Front(); buf != nil && done < int64(len(p)); buf = buf.Next() { needToSkip := int(offset - skipped) if sz := buf.ReadSize(); sz <= needToSkip { skipped += int64(sz) continue } // Actually read data. n := copy(p[done:], buf.ReadSlice()[needToSkip:]) skipped += int64(needToSkip) done += int64(n) } if int(done) < len(p) || offset+done == v.size { return int(done), io.EOF } return int(done), nil } // advanceRead advances the view's read index. // // Precondition: there must be sufficient bytes in the buffer. func (v *View) advanceRead(count int64) { for buf := v.data.Front(); buf != nil && count > 0; { sz := int64(buf.ReadSize()) if sz > count { // There is still data for reading. buf.ReadMove(int(count)) v.size -= count count = 0 break } // Consume the whole buffer. oldBuf := buf buf = buf.Next() // Iterate. v.data.Remove(oldBuf) v.pool.put(oldBuf) // Update counts. count -= sz v.size -= sz } if count > 0 { panic(fmt.Sprintf("advanceRead still has %d bytes remaining", count)) } } // Truncate truncates the view to the given bytes. // // This will not grow the view, only shrink it. If a length is passed that is // greater than the current size of the view, then nothing will happen. // // Precondition: length must be >= 0. func (v *View) Truncate(length int64) { if length < 0 { panic("negative length provided") } if length >= v.size { return // Nothing to do. } for buf := v.data.Back(); buf != nil && v.size > length; buf = v.data.Back() { sz := int64(buf.ReadSize()) if after := v.size - sz; after < length { // Truncate the buffer locally. left := (length - after) buf.write = buf.read + int(left) v.size = length break } // Drop the buffer completely; see above. v.data.Remove(buf) v.pool.put(buf) v.size -= sz } } // Grow grows the given view to the number of bytes, which will be appended. If // zero is true, all these bytes will be zero. If zero is false, then this is // the caller's responsibility. // // Precondition: length must be >= 0. func (v *View) Grow(length int64, zero bool) { if length < 0 { panic("negative length provided") } for v.size < length { buf := v.data.Back() // Is there some space in the last buffer? if buf == nil || buf.Full() { buf = v.pool.get() v.data.PushBack(buf) } // Write up to length bytes. sz := buf.WriteSize() if int64(sz) > length-v.size { sz = int(length - v.size) } // Zero the written section; note that this pattern is // specifically recognized and optimized by the compiler. 
if zero { for i := buf.write; i < buf.write+sz; i++ { buf.data[i] = 0 } } // Advance the index. buf.WriteMove(sz) v.size += int64(sz) } } // Prepend prepends the given data. func (v *View) Prepend(data []byte) { // Is there any space in the first buffer? if buf := v.data.Front(); buf != nil && buf.read > 0 { // Fill up before the first write. avail := buf.read bStart := 0 dStart := len(data) - avail if avail > len(data) { bStart = avail - len(data) dStart = 0 } n := copy(buf.data[bStart:], data[dStart:]) data = data[:dStart] v.size += int64(n) buf.read -= n } for len(data) > 0 { // Do we need an empty buffer? buf := v.pool.get() v.data.PushFront(buf) // The buffer is empty; copy last chunk. avail := len(buf.data) bStart := 0 dStart := len(data) - avail if avail > len(data) { bStart = avail - len(data) dStart = 0 } // We have to put the data at the end of the current // buffer in order to ensure that the next prepend will // correctly fill up the beginning of this buffer. n := copy(buf.data[bStart:], data[dStart:]) data = data[:dStart] v.size += int64(n) buf.read = len(buf.data) - n buf.write = len(buf.data) } } // Append appends the given data. func (v *View) Append(data []byte) { for done := 0; done < len(data); { buf := v.data.Back() // Ensure there's a buffer with space. if buf == nil || buf.Full() { buf = v.pool.get() v.data.PushBack(buf) } // Copy in to the given buffer. n := copy(buf.WriteSlice(), data[done:]) done += n buf.WriteMove(n) v.size += int64(n) } } // AppendOwned takes ownership of data and appends it to v. func (v *View) AppendOwned(data []byte) { if len(data) > 0 { buf := v.pool.getNoInit() buf.initWithData(data) v.data.PushBack(buf) v.size += int64(len(data)) } } // PullUp makes the specified range contiguous and returns the backing memory. func (v *View) PullUp(offset, length int) ([]byte, bool) { if length == 0 { return nil, true } tgt := Range{begin: offset, end: offset + length} if tgt.Intersect(Range{end: int(v.size)}).Len() != length { return nil, false } curr := Range{} buf := v.data.Front() for ; buf != nil; buf = buf.Next() { origLen := buf.ReadSize() curr.end = curr.begin + origLen if x := curr.Intersect(tgt); x.Len() == tgt.Len() { // buf covers the whole requested target range. sub := x.Offset(-curr.begin) return buf.ReadSlice()[sub.begin:sub.end], true } else if x.Len() > 0 { // buf is pointing at the starting buffer we want to merge. break } curr.begin += origLen } // Calculate the total merged length. totLen := 0 for n := buf; n != nil; n = n.Next() { totLen += n.ReadSize() if curr.begin+totLen >= tgt.end { break } } // Merge the buffers. data := make([]byte, totLen) off := 0 for n := buf; n != nil && off < totLen; { copy(data[off:], n.ReadSlice()) off += n.ReadSize() // Remove buffers except for the first one, which will be reused. if n == buf { n = n.Next() } else { old := n n = n.Next() v.data.Remove(old) v.pool.put(old) } } // Update the first buffer with merged data. buf.initWithData(data) r := tgt.Offset(-curr.begin) return buf.data[r.begin:r.end], true } // Flatten returns a flattened copy of this data. // // This method should not be used in any performance-sensitive paths. It may // allocate a fresh byte slice sufficiently large to contain all the data in // the buffer. This is principally for debugging. // // N.B. Tee data still belongs to this view, as if there is a single buffer // present, then it will be returned directly. This should be used for // temporary use only, and a reference to the given slice should not be held. 
func (v *View) Flatten() []byte { if buf := v.data.Front(); buf == nil { return nil // No data at all. } else if buf.Next() == nil { return buf.ReadSlice() // Only one buffer. } data := make([]byte, 0, v.size) // Need to flatten. for buf := v.data.Front(); buf != nil; buf = buf.Next() { // Copy to the allocated slice. data = append(data, buf.ReadSlice()...) } return data } // Size indicates the total amount of data available in this view. func (v *View) Size() int64 { return v.size } // Copy makes a strict copy of this view. func (v *View) Copy() (other View) { for buf := v.data.Front(); buf != nil; buf = buf.Next() { other.Append(buf.ReadSlice()) } return } // Apply applies the given function across all valid data. func (v *View) Apply(fn func([]byte)) { for buf := v.data.Front(); buf != nil; buf = buf.Next() { fn(buf.ReadSlice()) } } // SubApply applies fn to a given range of data in v. Any part of the range // outside of v is ignored. func (v *View) SubApply(offset, length int, fn func([]byte)) { for buf := v.data.Front(); length > 0 && buf != nil; buf = buf.Next() { d := buf.ReadSlice() if offset >= len(d) { offset -= len(d) continue } if offset > 0 { d = d[offset:] offset = 0 } if length < len(d) { d = d[:length] } fn(d) length -= len(d) } } // Merge merges the provided View with this one. // // The other view will be appended to v, and other will be empty after this // operation completes. func (v *View) Merge(other *View) { // Copy over all buffers. for buf := other.data.Front(); buf != nil; buf = other.data.Front() { other.data.Remove(buf) v.data.PushBack(buf) } // Adjust sizes. v.size += other.size other.size = 0 } // WriteFromReader writes to the buffer from an io.Reader. // // A minimum read size equal to unsafe.Sizeof(unintptr) is enforced, // provided that count is greater than or equal to unsafe.Sizeof(uintptr). func (v *View) WriteFromReader(r io.Reader, count int64) (int64, error) { var ( done int64 n int err error ) for done < count { buf := v.data.Back() // Ensure we have an empty buffer. if buf == nil || buf.Full() { buf = v.pool.get() v.data.PushBack(buf) } // Is this less than the minimum batch? if buf.WriteSize() < minBatch && (count-done) >= int64(minBatch) { tmp := make([]byte, minBatch) n, err = r.Read(tmp) v.Append(tmp[:n]) done += int64(n) if err != nil { break } continue } // Limit the read, if necessary. sz := buf.WriteSize() if left := count - done; int64(sz) > left { sz = int(left) } // Pass the relevant portion of the buffer. n, err = r.Read(buf.WriteSlice()[:sz]) buf.WriteMove(n) done += int64(n) v.size += int64(n) if err == io.EOF { err = nil // Short write allowed. break } else if err != nil { break } } return done, err } // ReadToWriter reads from the buffer into an io.Writer. // // N.B. This does not consume the bytes read. TrimFront should // be called appropriately after this call in order to do so. // // A minimum write size equal to unsafe.Sizeof(unintptr) is enforced, // provided that count is greater than or equal to unsafe.Sizeof(uintptr). func (v *View) ReadToWriter(w io.Writer, count int64) (int64, error) { var ( done int64 n int err error ) offset := 0 // Spill-over for batching. for buf := v.data.Front(); buf != nil && done < count; buf = buf.Next() { // Has this been consumed? Skip it. sz := buf.ReadSize() if sz <= offset { offset -= sz continue } sz -= offset // Is this less than the minimum batch? 
left := count - done if sz < minBatch && left >= int64(minBatch) && (v.size-done) >= int64(minBatch) { tmp := make([]byte, minBatch) n, err = v.ReadAt(tmp, done) w.Write(tmp[:n]) done += int64(n) offset = n - sz // Reset below. if err != nil { break } continue } // Limit the write if necessary. if int64(sz) >= left { sz = int(left) } // Perform the actual write. n, err = w.Write(buf.ReadSlice()[offset : offset+sz]) done += int64(n) if err != nil { break } // Reset spill-over. offset = 0 } return done, err } // A Range specifies a range of buffer. type Range struct { begin int end int } // Intersect returns the intersection of x and y. func (x Range) Intersect(y Range) Range { if x.begin < y.begin { x.begin = y.begin } if x.end > y.end { x.end = y.end } if x.begin >= x.end { return Range{} } return x } // Offset returns x offset by off. func (x Range) Offset(off int) Range { x.begin += off x.end += off return x } // Len returns the length of x. func (x Range) Len() int { l := x.end - x.begin if l < 0 { l = 0 } return l }
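// Editor's example (illustrative sketch, not part of the original sources):
// basic use of the non-linear View shown above. Note that ReadAt does not
// consume bytes, while TrimFront does. This assumes the zero-value View is
// usable, as it is elsewhere in gVisor (the pool lazily allocates backing
// buffers on first use); the import path is also an assumption.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/buffer" // assumed import path for this package
)

func main() {
	var v buffer.View
	v.Append([]byte("world"))
	v.Prepend([]byte("hello, ")) // lands in a separate front buffer

	// Non-destructive random access at offset 7.
	p := make([]byte, 5)
	n, _ := v.ReadAt(p, 7)
	fmt.Println(string(p[:n])) // "world"

	// Destructive consumption of the prefix.
	v.TrimFront(7)
	fmt.Println(v.Size())            // 5
	fmt.Println(string(v.Flatten())) // "world"
}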
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package stack

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
)

const neighborCacheSize = 512 // max entries per interface

// NeighborStats holds metrics for the neighbor table.
type NeighborStats struct {
	// UnreachableEntryLookups counts the number of lookups performed on an
	// entry in Unreachable state.
	UnreachableEntryLookups *tcpip.StatCounter
}

// neighborCache maps IP addresses to link addresses. It uses the Least
// Recently Used (LRU) eviction strategy to implement a bounded cache for
// dynamically acquired entries. It contains the state machine and
// configuration for running Neighbor Unreachability Detection (NUD).
//
// There are two types of entries in the neighbor cache:
//  1. Dynamic entries are discovered automatically by neighbor discovery
//     protocols (e.g. ARP, NDP). These protocols will attempt to reconfirm
//     reachability with the device once the entry's state becomes Stale.
//  2. Static entries are explicitly added by a user and have no expiration.
//     Their state is always Static. The number of static entries stored in
//     the cache is unbounded.
type neighborCache struct {
	nic     *nic
	state   *NUDState
	linkRes LinkAddressResolver

	mu struct {
		sync.RWMutex

		cache   map[tcpip.Address]*neighborEntry
		dynamic struct {
			lru neighborEntryList

			// count tracks the number of dynamic entries in the cache. This
			// is needed since static entries do not count towards the LRU
			// cache eviction strategy.
			count uint16
		}
	}
}

// getOrCreateEntry retrieves a cache entry associated with addr. The
// returned entry is always refreshed in the cache (it is reachable via the
// map, and its place is bumped in LRU).
//
// If a matching entry exists in the cache, it is returned.
If no matching // entry exists and the cache is full, an existing entry is evicted via LRU, // reset to state incomplete, and returned. If no matching entry exists and the // cache is not full, a new entry with state incomplete is allocated and // returned. func (n *neighborCache) getOrCreateEntry(remoteAddr tcpip.Address) *neighborEntry { n.mu.Lock() defer n.mu.Unlock() if entry, ok := n.mu.cache[remoteAddr]; ok { entry.mu.RLock() if entry.mu.neigh.State != Static { n.mu.dynamic.lru.Remove(entry) n.mu.dynamic.lru.PushFront(entry) } entry.mu.RUnlock() return entry } // The entry that needs to be created must be dynamic since all static // entries are directly added to the cache via addStaticEntry. entry := newNeighborEntry(n, remoteAddr, n.state) if n.mu.dynamic.count == neighborCacheSize { e := n.mu.dynamic.lru.Back() e.mu.Lock() delete(n.mu.cache, e.mu.neigh.Addr) n.mu.dynamic.lru.Remove(e) n.mu.dynamic.count-- e.removeLocked() e.mu.Unlock() } n.mu.cache[remoteAddr] = entry n.mu.dynamic.lru.PushFront(entry) n.mu.dynamic.count++ return entry } // entry looks up neighbor information matching the remote address, and returns // it if readily available. // // Returns ErrWouldBlock if the link address is not readily available, along // with a notification channel for the caller to block on. Triggers address // resolution asynchronously. // // If onResolve is provided, it will be called either immediately, if resolution // is not required, or when address resolution is complete, with the resolved // link address and whether resolution succeeded. After any callbacks have been // called, the returned notification channel is closed. // // NB: if a callback is provided, it should not call into the neighbor cache. // // If specified, the local address must be an address local to the interface the // neighbor cache belongs to. The local address is the source address of a // packet prompting NUD/link address resolution. // // TODO(gvisor.dev/issue/5151): Don't return the neighbor entry. func (n *neighborCache) entry(remoteAddr, localAddr tcpip.Address, onResolve func(LinkResolutionResult)) (NeighborEntry, <-chan struct{}, tcpip.Error) { entry := n.getOrCreateEntry(remoteAddr) entry.mu.Lock() defer entry.mu.Unlock() switch s := entry.mu.neigh.State; s { case Stale: entry.handlePacketQueuedLocked(localAddr) fallthrough case Reachable, Static, Delay, Probe: // As per RFC 4861 section 7.3.3: // "Neighbor Unreachability Detection operates in parallel with the sending // of packets to a neighbor. While reasserting a neighbor's reachability, // a node continues sending packets to that neighbor using the cached // link-layer address." if onResolve != nil { onResolve(LinkResolutionResult{LinkAddress: entry.mu.neigh.LinkAddr, Err: nil}) } return entry.mu.neigh, nil, nil case Unknown, Incomplete, Unreachable: if onResolve != nil { entry.mu.onResolve = append(entry.mu.onResolve, onResolve) } if entry.mu.done == nil { // Address resolution needs to be initiated. entry.mu.done = make(chan struct{}) } entry.handlePacketQueuedLocked(localAddr) return entry.mu.neigh, entry.mu.done, &tcpip.ErrWouldBlock{} default: panic(fmt.Sprintf("Invalid cache entry state: %s", s)) } } // entries returns all entries in the neighbor cache. 
func (n *neighborCache) entries() []NeighborEntry { n.mu.RLock() defer n.mu.RUnlock() entries := make([]NeighborEntry, 0, len(n.mu.cache)) for _, entry := range n.mu.cache { entry.mu.RLock() entries = append(entries, entry.mu.neigh) entry.mu.RUnlock() } return entries } // addStaticEntry adds a static entry to the neighbor cache, mapping an IP // address to a link address. If a dynamic entry exists in the neighbor cache // with the same address, it will be replaced with this static entry. If a // static entry exists with the same address but different link address, it // will be updated with the new link address. If a static entry exists with the // same address and link address, nothing will happen. func (n *neighborCache) addStaticEntry(addr tcpip.Address, linkAddr tcpip.LinkAddress) { n.mu.Lock() defer n.mu.Unlock() if entry, ok := n.mu.cache[addr]; ok { entry.mu.Lock() if entry.mu.neigh.State != Static { // Dynamic entry found with the same address. n.mu.dynamic.lru.Remove(entry) n.mu.dynamic.count-- } else if entry.mu.neigh.LinkAddr == linkAddr { // Static entry found with the same address and link address. entry.mu.Unlock() return } else { // Static entry found with the same address but different link address. entry.mu.neigh.LinkAddr = linkAddr entry.dispatchChangeEventLocked() entry.mu.Unlock() return } entry.removeLocked() entry.mu.Unlock() } entry := newStaticNeighborEntry(n, addr, linkAddr, n.state) n.mu.cache[addr] = entry entry.mu.Lock() defer entry.mu.Unlock() entry.dispatchAddEventLocked() } // removeEntry removes a dynamic or static entry by address from the neighbor // cache. Returns true if the entry was found and deleted. func (n *neighborCache) removeEntry(addr tcpip.Address) bool { n.mu.Lock() defer n.mu.Unlock() entry, ok := n.mu.cache[addr] if !ok { return false } entry.mu.Lock() defer entry.mu.Unlock() if entry.mu.neigh.State != Static { n.mu.dynamic.lru.Remove(entry) n.mu.dynamic.count-- } entry.removeLocked() delete(n.mu.cache, entry.mu.neigh.Addr) return true } // clear removes all dynamic and static entries from the neighbor cache. func (n *neighborCache) clear() { n.mu.Lock() defer n.mu.Unlock() for _, entry := range n.mu.cache { entry.mu.Lock() entry.removeLocked() entry.mu.Unlock() } n.mu.dynamic.lru = neighborEntryList{} n.mu.cache = make(map[tcpip.Address]*neighborEntry) n.mu.dynamic.count = 0 } // config returns the NUD configuration. func (n *neighborCache) config() NUDConfigurations { return n.state.Config() } // setConfig changes the NUD configuration. // // If config contains invalid NUD configuration values, it will be fixed to // use default values for the erroneous values. func (n *neighborCache) setConfig(config NUDConfigurations) { config.resetInvalidFields() n.state.SetConfig(config) } // handleProbe handles a neighbor probe as defined by RFC 4861 section 7.2.3. // // Validation of the probe is expected to be handled by the caller. func (n *neighborCache) handleProbe(remoteAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) { entry := n.getOrCreateEntry(remoteAddr) entry.mu.Lock() entry.handleProbeLocked(remoteLinkAddr) entry.mu.Unlock() } // handleConfirmation handles a neighbor confirmation as defined by // RFC 4861 section 7.2.5. // // Validation of the confirmation is expected to be handled by the caller. 
func (n *neighborCache) handleConfirmation(addr tcpip.Address, linkAddr tcpip.LinkAddress, flags ReachabilityConfirmationFlags) {
	n.mu.RLock()
	entry, ok := n.mu.cache[addr]
	n.mu.RUnlock()
	if ok {
		entry.mu.Lock()
		entry.handleConfirmationLocked(linkAddr, flags)
		entry.mu.Unlock()
	}
	// The confirmation SHOULD be silently discarded if the recipient did not
	// initiate any communication with the target. This is indicated if there
	// is no matching entry for the remote address.
}

// handleUpperLevelConfirmation processes a confirmation of reachability from
// some protocol that operates at a layer above the IP/link layer.
func (n *neighborCache) handleUpperLevelConfirmation(addr tcpip.Address) {
	n.mu.RLock()
	entry, ok := n.mu.cache[addr]
	n.mu.RUnlock()
	if ok {
		entry.mu.Lock()
		entry.handleUpperLevelConfirmationLocked()
		entry.mu.Unlock()
	}
}

func (n *neighborCache) init(nic *nic, r LinkAddressResolver) {
	*n = neighborCache{
		nic:     nic,
		state:   NewNUDState(nic.stack.nudConfigs, nic.stack.clock, nic.stack.randomGenerator),
		linkRes: r,
	}
	n.mu.Lock()
	n.mu.cache = make(map[tcpip.Address]*neighborEntry, neighborCacheSize)
	n.mu.Unlock()
}
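// Editor's example (illustrative sketch, not part of the original sources):
// the map-plus-LRU-list pattern that getOrCreateEntry implements above,
// reduced to a self-contained form on container/list. The real cache uses an
// intrusive list, per-entry locks, and exempts Static entries from eviction;
// this sketch shows only the bump-on-access and evict-from-back mechanics.
package main

import (
	"container/list"
	"fmt"
)

type lruEntry struct {
	key, val string
}

type lruCache struct {
	max   int
	order *list.List               // front is most recently used
	items map[string]*list.Element // element values are *lruEntry
}

func newLRUCache(max int) *lruCache {
	return &lruCache{
		max:   max,
		order: list.New(),
		items: make(map[string]*list.Element),
	}
}

// getOrCreate returns the cached value for key, bumping it to the front of
// the LRU order; on a miss it evicts the least recently used entry if the
// cache is full, mirroring getOrCreateEntry above.
func (c *lruCache) getOrCreate(key, val string) string {
	if elem, ok := c.items[key]; ok {
		c.order.MoveToFront(elem)
		return elem.Value.(*lruEntry).val
	}
	if c.order.Len() == c.max {
		back := c.order.Back()
		delete(c.items, back.Value.(*lruEntry).key)
		c.order.Remove(back)
	}
	c.items[key] = c.order.PushFront(&lruEntry{key, val})
	return val
}

func main() {
	c := newLRUCache(2)
	c.getOrCreate("a", "1")
	c.getOrCreate("b", "2")
	c.getOrCreate("a", "1") // bump "a" ahead of "b"
	c.getOrCreate("c", "3") // evicts "b"
	_, ok := c.items["b"]
	fmt.Println(ok) // false
}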
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/usermem"
)

// We unconditionally report a single NUMA node. This also means that our
// "nodemask_t" is a single unsigned long (uint64).
const (
	maxNodes        = 1
	allowedNodemask = (1 << maxNodes) - 1
)

func copyInNodemask(t *kernel.Task, addr hostarch.Addr, maxnode uint32) (uint64, error) {
	// "nodemask points to a bit mask of node IDs that contains up to maxnode
	// bits. The bit mask size is rounded to the next multiple of
	// sizeof(unsigned long), but the kernel will use bits only up to maxnode.
	// A NULL value of nodemask or a maxnode value of zero specifies the empty
	// set of nodes. If the value of maxnode is zero, the nodemask argument is
	// ignored." - set_mempolicy(2). Unfortunately, most of this is inaccurate
	// because of what appears to be a bug: mm/mempolicy.c:get_nodes() uses
	// maxnode-1, not maxnode, as the number of bits.
	bits := maxnode - 1
	if bits > hostarch.PageSize*8 { // also handles overflow from maxnode == 0
		return 0, linuxerr.EINVAL
	}
	if bits == 0 {
		return 0, nil
	}
	// Copy in the whole nodemask.
	numUint64 := (bits + 63) / 64
	buf := t.CopyScratchBuffer(int(numUint64) * 8)
	if _, err := t.CopyInBytes(addr, buf); err != nil {
		return 0, err
	}
	val := hostarch.ByteOrder.Uint64(buf)
	// Check that only allowed bits in the first unsigned long in the nodemask
	// are set.
	if val&^allowedNodemask != 0 {
		return 0, linuxerr.EINVAL
	}
	// Check that all remaining bits in the nodemask are 0.
for i := 8; i < len(buf); i++ { if buf[i] != 0 { return 0, linuxerr.EINVAL } } return val, nil } func copyOutNodemask(t *kernel.Task, addr hostarch.Addr, maxnode uint32, val uint64) error { // mm/mempolicy.c:copy_nodes_to_user() also uses maxnode-1 as the number of // bits. bits := maxnode - 1 if bits > hostarch.PageSize*8 { // also handles overflow from maxnode == 0 return linuxerr.EINVAL } if bits == 0 { return nil } // Copy out the first unsigned long in the nodemask. buf := t.CopyScratchBuffer(8) hostarch.ByteOrder.PutUint64(buf, val) if _, err := t.CopyOutBytes(addr, buf); err != nil { return err } // Zero out remaining unsigned longs in the nodemask. if bits > 64 { remAddr, ok := addr.AddLength(8) if !ok { return linuxerr.EFAULT } remUint64 := (bits - 1) / 64 if _, err := t.MemoryManager().ZeroOut(t, remAddr, int64(remUint64)*8, usermem.IOOpts{ AddressSpaceActive: true, }); err != nil { return err } } return nil } // GetMempolicy implements the syscall get_mempolicy(2). func GetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { mode := args[0].Pointer() nodemask := args[1].Pointer() maxnode := args[2].Uint() addr := args[3].Pointer() flags := args[4].Uint() if flags&^(linux.MPOL_F_NODE|linux.MPOL_F_ADDR|linux.MPOL_F_MEMS_ALLOWED) != 0 { return 0, nil, linuxerr.EINVAL } nodeFlag := flags&linux.MPOL_F_NODE != 0 addrFlag := flags&linux.MPOL_F_ADDR != 0 memsAllowed := flags&linux.MPOL_F_MEMS_ALLOWED != 0 // "EINVAL: The value specified by maxnode is less than the number of node // IDs supported by the system." - get_mempolicy(2) if nodemask != 0 && maxnode < maxNodes { return 0, nil, linuxerr.EINVAL } // "If flags specifies MPOL_F_MEMS_ALLOWED [...], the mode argument is // ignored and the set of nodes (memories) that the thread is allowed to // specify in subsequent calls to mbind(2) or set_mempolicy(2) (in the // absence of any mode flags) is returned in nodemask." if memsAllowed { // "It is not permitted to combine MPOL_F_MEMS_ALLOWED with either // MPOL_F_ADDR or MPOL_F_NODE." if nodeFlag || addrFlag { return 0, nil, linuxerr.EINVAL } if err := copyOutNodemask(t, nodemask, maxnode, allowedNodemask); err != nil { return 0, nil, err } return 0, nil, nil } // "If flags specifies MPOL_F_ADDR, then information is returned about the // policy governing the memory address given in addr. ... If the mode // argument is not NULL, then get_mempolicy() will store the policy mode // and any optional mode flags of the requested NUMA policy in the location // pointed to by this argument. If nodemask is not NULL, then the nodemask // associated with the policy will be stored in the location pointed to by // this argument." if addrFlag { policy, nodemaskVal, err := t.MemoryManager().NumaPolicy(addr) if err != nil { return 0, nil, err } if nodeFlag { // "If flags specifies both MPOL_F_NODE and MPOL_F_ADDR, // get_mempolicy() will return the node ID of the node on which the // address addr is allocated into the location pointed to by mode. // If no page has yet been allocated for the specified address, // get_mempolicy() will allocate a page as if the thread had // performed a read (load) access to that address, and return the // ID of the node where that page was allocated." 
buf := t.CopyScratchBuffer(1) _, err := t.CopyInBytes(addr, buf) if err != nil { return 0, nil, err } policy = linux.MPOL_DEFAULT // maxNodes == 1 } if mode != 0 { if _, err := policy.CopyOut(t, mode); err != nil { return 0, nil, err } } if nodemask != 0 { if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil { return 0, nil, err } } return 0, nil, nil } // "EINVAL: ... flags specified MPOL_F_ADDR and addr is NULL, or flags did // not specify MPOL_F_ADDR and addr is not NULL." This is partially // inaccurate: if flags specifies MPOL_F_ADDR, // mm/mempolicy.c:do_get_mempolicy() doesn't special-case NULL; it will // just (usually) fail to find a VMA at address 0 and return EFAULT. if addr != 0 { return 0, nil, linuxerr.EINVAL } // "If flags is specified as 0, then information about the calling thread's // default policy (as set by set_mempolicy(2)) is returned, in the buffers // pointed to by mode and nodemask. ... If flags specifies MPOL_F_NODE, but // not MPOL_F_ADDR, and the thread's current policy is MPOL_INTERLEAVE, // then get_mempolicy() will return in the location pointed to by a // non-NULL mode argument, the node ID of the next node that will be used // for interleaving of internal kernel pages allocated on behalf of the // thread." policy, nodemaskVal := t.NumaPolicy() if nodeFlag { if policy&^linux.MPOL_MODE_FLAGS != linux.MPOL_INTERLEAVE { return 0, nil, linuxerr.EINVAL } policy = linux.MPOL_DEFAULT // maxNodes == 1 } if mode != 0 { if _, err := policy.CopyOut(t, mode); err != nil { return 0, nil, err } } if nodemask != 0 { if err := copyOutNodemask(t, nodemask, maxnode, nodemaskVal); err != nil { return 0, nil, err } } return 0, nil, nil } // SetMempolicy implements the syscall set_mempolicy(2). func SetMempolicy(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { modeWithFlags := linux.NumaPolicy(args[0].Int()) nodemask := args[1].Pointer() maxnode := args[2].Uint() modeWithFlags, nodemaskVal, err := copyInMempolicyNodemask(t, modeWithFlags, nodemask, maxnode) if err != nil { return 0, nil, err } t.SetNumaPolicy(modeWithFlags, nodemaskVal) return 0, nil, nil } // Mbind implements the syscall mbind(2). func Mbind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].Uint64() mode := linux.NumaPolicy(args[2].Int()) nodemask := args[3].Pointer() maxnode := args[4].Uint() flags := args[5].Uint() if flags&^linux.MPOL_MF_VALID != 0 { return 0, nil, linuxerr.EINVAL } // "If MPOL_MF_MOVE_ALL is passed in flags ... [the] calling thread must be // privileged (CAP_SYS_NICE) to use this flag." - mbind(2) if flags&linux.MPOL_MF_MOVE_ALL != 0 && !t.HasCapability(linux.CAP_SYS_NICE) { return 0, nil, linuxerr.EPERM } mode, nodemaskVal, err := copyInMempolicyNodemask(t, mode, nodemask, maxnode) if err != nil { return 0, nil, err } // Since we claim to have only a single node, all flags can be ignored // (since all pages must already be on that single node). err = t.MemoryManager().SetNumaPolicy(addr, length, mode, nodemaskVal) return 0, nil, err } func copyInMempolicyNodemask(t *kernel.Task, modeWithFlags linux.NumaPolicy, nodemask hostarch.Addr, maxnode uint32) (linux.NumaPolicy, uint64, error) { flags := linux.NumaPolicy(modeWithFlags & linux.MPOL_MODE_FLAGS) mode := linux.NumaPolicy(modeWithFlags &^ linux.MPOL_MODE_FLAGS) if flags == linux.MPOL_MODE_FLAGS { // Can't specify both mode flags simultaneously. 
return 0, 0, linuxerr.EINVAL } if mode < 0 || mode >= linux.MPOL_MAX { // Must specify a valid mode. return 0, 0, linuxerr.EINVAL } var nodemaskVal uint64 if nodemask != 0 { var err error nodemaskVal, err = copyInNodemask(t, nodemask, maxnode) if err != nil { return 0, 0, err } } switch mode { case linux.MPOL_DEFAULT: // "nodemask must be specified as NULL." - set_mempolicy(2). This is inaccurate; // Linux allows a nodemask to be specified, as long as it is empty. if nodemaskVal != 0 { return 0, 0, linuxerr.EINVAL } case linux.MPOL_BIND, linux.MPOL_INTERLEAVE: // These require a non-empty nodemask. if nodemaskVal == 0 { return 0, 0, linuxerr.EINVAL } case linux.MPOL_PREFERRED: // This permits an empty nodemask, as long as no flags are set. if nodemaskVal == 0 && flags != 0 { return 0, 0, linuxerr.EINVAL } case linux.MPOL_LOCAL: // This requires an empty nodemask and no flags set ... if nodemaskVal != 0 || flags != 0 { return 0, 0, linuxerr.EINVAL } // ... and is implemented as MPOL_PREFERRED. mode = linux.MPOL_PREFERRED default: // Unknown mode, which we should have rejected above. panic(fmt.Sprintf("unknown mode: %v", mode)) } return mode | flags, nodemaskVal, nil }
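// Editor's example (illustrative sketch, not part of the original sources):
// the nodemask arithmetic from copyInNodemask above, extracted into a
// standalone check. It assumes a little-endian host in place of
// hostarch.ByteOrder, and omits the page-size overflow guard and the
// scratch-buffer copy-in; validNodemask is a hypothetical name.
package main

import (
	"encoding/binary"
	"fmt"
)

const (
	maxNodes        = 1
	allowedNodemask = (1 << maxNodes) - 1
)

// validNodemask reports whether buf encodes a nodemask acceptable to a
// single-node system, using maxnode-1 bits as Linux's get_nodes() does.
func validNodemask(buf []byte, maxnode uint32) bool {
	bits := maxnode - 1
	if bits == 0 {
		return true // the empty set of nodes is always acceptable
	}
	// Only node 0 may be set in the first unsigned long.
	if val := binary.LittleEndian.Uint64(buf[:8]); val&^allowedNodemask != 0 {
		return false
	}
	// Every byte past the first unsigned long must be zero.
	for _, b := range buf[8:] {
		if b != 0 {
			return false
		}
	}
	return true
}

func main() {
	buf := make([]byte, 16)
	buf[0] = 1                           // nodemask = {0}
	fmt.Println(validNodemask(buf, 128)) // true
	buf[9] = 1                           // sets a node ID >= 72
	fmt.Println(validNodemask(buf, 128)) // false
}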
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package syserror contains syscall error codes exported as error interface
// instead of Errno. This allows for fast comparison and returns when the
// comparand or return value is of type error because there is no need to
// convert from Errno to an interface, i.e., runtime.convT2I isn't called.
package syserror

import (
	"errors"

	"golang.org/x/sys/unix"
)

// The following variables have the same meaning as their syscall equivalent.
var (
	EIDRM    = error(unix.EIDRM)
	EINTR    = error(unix.EINTR)
	EIO      = error(unix.EIO)
	EISDIR   = error(unix.EISDIR)
	ENOENT   = error(unix.ENOENT)
	ENOEXEC  = error(unix.ENOEXEC)
	ENOMEM   = error(unix.ENOMEM)
	ENOTSOCK = error(unix.ENOTSOCK)
	ENOSPC   = error(unix.ENOSPC)
	ENOSYS   = error(unix.ENOSYS)
)

var (
	// ErrWouldBlock is an internal error used to indicate that an operation
	// cannot be satisfied immediately, and should be retried at a later
	// time, possibly when the caller has received a notification that the
	// operation may be able to complete. It is used by implementations of
	// the kio.File interface.
	ErrWouldBlock = errors.New("request would block")

	// ErrInterrupted is returned if a request is interrupted before it can
	// complete.
	ErrInterrupted = errors.New("request was interrupted")

	// ErrExceedsFileSizeLimit is returned if a request would exceed the
	// file's size limit.
	ErrExceedsFileSizeLimit = errors.New("exceeds file size limit")
)

// errorMap is the map used to convert generic errors into errnos.
var errorMap = map[error]unix.Errno{}

// errorUnwrappers is an array of unwrap functions to extract typed errors.
var errorUnwrappers = []func(error) (unix.Errno, bool){}

// AddErrorTranslation allows modules to populate the error map by adding
// their own translations during initialization. It returns whether the
// translation was accepted; a pre-existing translation will not be
// overwritten by the new translation.
func AddErrorTranslation(from error, to unix.Errno) bool {
	if _, ok := errorMap[from]; ok {
		return false
	}
	errorMap[from] = to
	return true
}

// AddErrorUnwrapper registers an unwrap method that can extract a concrete
// error from a typed, but not initialized, error.
func AddErrorUnwrapper(unwrap func(e error) (unix.Errno, bool)) { errorUnwrappers = append(errorUnwrappers, unwrap) } // TranslateError translates errors to errnos, it will return false if // the error was not registered. func TranslateError(from error) (unix.Errno, bool) { if err, ok := errorMap[from]; ok { return err, true } // Try to unwrap the error if we couldn't match an error // exactly. This might mean that a package has its own // error type. for _, unwrap := range errorUnwrappers { if err, ok := unwrap(from); ok { return err, true } } return 0, false } // ConvertIntr converts the provided error code (err) to another one (intr) if // the first error corresponds to an interrupted operation. func ConvertIntr(err, intr error) error { if err == ErrInterrupted { return intr } return err } // SyscallRestartErrno represents a ERESTART* errno defined in the Linux's kernel // include/linux/errno.h. These errnos are never returned to userspace // directly, but are used to communicate the expected behavior of an // interrupted syscall from the syscall to signal handling. type SyscallRestartErrno int // These numeric values are significant because ptrace syscall exit tracing can // observe them. // // For all of the following errnos, if the syscall is not interrupted by a // signal delivered to a user handler, the syscall is restarted. const ( // ERESTARTSYS is returned by an interrupted syscall to indicate that it // should be converted to EINTR if interrupted by a signal delivered to a // user handler without SA_RESTART set, and restarted otherwise. ERESTARTSYS = SyscallRestartErrno(512) // ERESTARTNOINTR is returned by an interrupted syscall to indicate that it // should always be restarted. ERESTARTNOINTR = SyscallRestartErrno(513) // ERESTARTNOHAND is returned by an interrupted syscall to indicate that it // should be converted to EINTR if interrupted by a signal delivered to a // user handler, and restarted otherwise. ERESTARTNOHAND = SyscallRestartErrno(514) // ERESTART_RESTARTBLOCK is returned by an interrupted syscall to indicate // that it should be restarted using a custom function. The interrupted // syscall must register a custom restart function by calling // Task.SetRestartSyscallFn. ERESTART_RESTARTBLOCK = SyscallRestartErrno(516) ) // Error implements error.Error. func (e SyscallRestartErrno) Error() string { // Descriptions are borrowed from strace. switch e { case ERESTARTSYS: return "to be restarted if SA_RESTART is set" case ERESTARTNOINTR: return "to be restarted" case ERESTARTNOHAND: return "to be restarted if no handler" case ERESTART_RESTARTBLOCK: return "interrupted by signal" default: return "(unknown interrupt error)" } } // SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by // rv, the value in a syscall return register. func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) { switch int(rv) { case -int(ERESTARTSYS): return ERESTARTSYS, true case -int(ERESTARTNOINTR): return ERESTARTNOINTR, true case -int(ERESTARTNOHAND): return ERESTARTNOHAND, true case -int(ERESTART_RESTARTBLOCK): return ERESTART_RESTARTBLOCK, true default: return 0, false } } func init() { AddErrorTranslation(ErrWouldBlock, unix.EWOULDBLOCK) AddErrorTranslation(ErrInterrupted, unix.EINTR) AddErrorTranslation(ErrExceedsFileSizeLimit, unix.EFBIG) }
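// Editor's example (illustrative sketch, not part of the original sources):
// how a module might register and translate its own sentinel error using
// this package's API. ErrDeviceGone is hypothetical; registering at init
// time mirrors the package's own init() above.
package main

import (
	"errors"
	"fmt"

	"golang.org/x/sys/unix"

	"gvisor.dev/gvisor/pkg/syserror"
)

// ErrDeviceGone is a hypothetical module-level sentinel error.
var ErrDeviceGone = errors.New("device was removed")

func init() {
	// AddErrorTranslation returns false (and keeps the existing translation)
	// if ErrDeviceGone had already been registered.
	syserror.AddErrorTranslation(ErrDeviceGone, unix.ENODEV)
}

func main() {
	errno, ok := syserror.TranslateError(ErrDeviceGone)
	fmt.Println(ok, errno == unix.ENODEV) // true true
}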
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package stack

import (
	"fmt"
	"sync"
	"time"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
)

const (
	// immediateDuration is a duration of zero for scheduling work that needs
	// to be done immediately but asynchronously to avoid deadlock.
	immediateDuration time.Duration = 0
)

// NeighborEntry describes a neighboring device in the local network.
type NeighborEntry struct {
	Addr      tcpip.Address
	LinkAddr  tcpip.LinkAddress
	State     NeighborState
	UpdatedAt time.Time
}

// NeighborState defines the state of a NeighborEntry within the Neighbor
// Unreachability Detection state machine, as per RFC 4861 section 7.3.2 and
// RFC 7048.
type NeighborState uint8

const (
	// Unknown means reachability has not been verified yet. This is the initial
	// state of entries that have been created automatically by the Neighbor
	// Unreachability Detection state machine.
	Unknown NeighborState = iota
	// Incomplete means that there is an outstanding request to resolve the
	// address.
	Incomplete
	// Reachable means the path to the neighbor is functioning properly for both
	// receive and transmit paths.
	Reachable
	// Stale means reachability to the neighbor is unknown, but packets are still
	// able to be transmitted to the possibly stale link address.
	Stale
	// Delay means reachability to the neighbor is unknown and pending
	// confirmation from an upper-level protocol like TCP, but packets are still
	// able to be transmitted to the possibly stale link address.
	Delay
	// Probe means a reachability confirmation is actively being sought by
	// periodically retransmitting reachability probes until a reachability
	// confirmation is received, or until the maximum number of probes has been
	// sent.
	Probe
	// Static describes entries that have been explicitly added by the user. They
	// do not expire and are not deleted until explicitly removed.
	Static
	// Unreachable means reachability confirmation failed; the maximum number of
	// reachability probes has been sent and no replies have been received.
	//
	// TODO(gvisor.dev/issue/5472): Add the following sentence when we implement
	// RFC 7048: "Packets continue to be sent to the neighbor while
	// re-attempting to resolve the address."
	Unreachable
)

type timer struct {
	// done indicates to the timer that the timer was stopped.
	done *bool

	timer tcpip.Timer
}

// neighborEntry implements a neighbor entry's individual node behavior, as per
// RFC 4861 section 7.3.3. Neighbor Unreachability Detection operates in
// parallel with the sending of packets to a neighbor, necessitating the
// entry's lock to be acquired for all operations.
type neighborEntry struct {
	neighborEntryEntry

	cache *neighborCache

	// nudState points to the Neighbor Unreachability Detection configuration.
	nudState *NUDState

	mu struct {
		sync.RWMutex

		neigh NeighborEntry

		// done is closed when address resolution is complete. It is nil iff the
		// entry is incomplete and resolution is not yet in progress.
		done chan struct{}

		// onResolve is called with the result of address resolution.
		onResolve []func(LinkResolutionResult)

		isRouter bool

		timer timer
	}
}

// newNeighborEntry creates a neighbor cache entry starting at the default
// state, Unknown. Transition out of Unknown by calling either
// `handlePacketQueuedLocked` or `handleProbeLocked` on the newly created
// neighborEntry.
func newNeighborEntry(cache *neighborCache, remoteAddr tcpip.Address, nudState *NUDState) *neighborEntry {
	n := &neighborEntry{
		cache:    cache,
		nudState: nudState,
	}
	n.mu.Lock()
	n.mu.neigh = NeighborEntry{
		Addr:  remoteAddr,
		State: Unknown,
	}
	n.mu.Unlock()
	return n
}

// newStaticNeighborEntry creates a neighbor cache entry starting at the
// Static state. The entry can only transition out of Static by directly
// calling `setStateLocked`.
func newStaticNeighborEntry(cache *neighborCache, addr tcpip.Address, linkAddr tcpip.LinkAddress, state *NUDState) *neighborEntry {
	entry := NeighborEntry{
		Addr:      addr,
		LinkAddr:  linkAddr,
		State:     Static,
		UpdatedAt: cache.nic.stack.clock.Now(),
	}
	n := &neighborEntry{
		cache:    cache,
		nudState: state,
	}
	n.mu.Lock()
	n.mu.neigh = entry
	n.mu.Unlock()
	return n
}

// notifyCompletionLocked notifies those waiting for address resolution, with
// the link address if resolution completed successfully.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) notifyCompletionLocked(err tcpip.Error) {
	res := LinkResolutionResult{LinkAddress: e.mu.neigh.LinkAddr, Err: err}
	for _, callback := range e.mu.onResolve {
		callback(res)
	}
	e.mu.onResolve = nil
	if ch := e.mu.done; ch != nil {
		close(ch)
		e.mu.done = nil
		// Dequeue the pending packets asynchronously to not hold up the current
		// goroutine as writing packets may be a costly operation.
		//
		// At the time of writing, when writing packets, a neighbor's link address
		// is resolved (which ends up obtaining the entry's lock) while holding the
		// link resolution queue's lock. Dequeuing packets asynchronously avoids a
		// lock ordering violation.
		//
		// NB: this is equivalent to spawning a goroutine directly using the go
		// keyword but allows tests that use manual clocks to deterministically
		// wait for this work to complete.
		e.cache.nic.stack.clock.AfterFunc(0, func() {
			e.cache.nic.linkResQueue.dequeue(ch, e.mu.neigh.LinkAddr, err)
		})
	}
}

// dispatchAddEventLocked signals to stack's NUD Dispatcher that the entry has
// been added.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) dispatchAddEventLocked() {
	if nudDisp := e.cache.nic.stack.nudDisp; nudDisp != nil {
		nudDisp.OnNeighborAdded(e.cache.nic.id, e.mu.neigh)
	}
}

// dispatchChangeEventLocked signals to stack's NUD Dispatcher that the entry
// has changed state or link-layer address.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) dispatchChangeEventLocked() {
	if nudDisp := e.cache.nic.stack.nudDisp; nudDisp != nil {
		nudDisp.OnNeighborChanged(e.cache.nic.id, e.mu.neigh)
	}
}

// dispatchRemoveEventLocked signals to stack's NUD Dispatcher that the entry
// has been removed.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) dispatchRemoveEventLocked() {
	if nudDisp := e.cache.nic.stack.nudDisp; nudDisp != nil {
		nudDisp.OnNeighborRemoved(e.cache.nic.id, e.mu.neigh)
	}
}

// cancelTimerLocked cancels the currently scheduled action, if there is one.
// Entries in Unknown, Stale, or Static state do not have a scheduled action.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) cancelTimerLocked() {
	if e.mu.timer.timer != nil {
		e.mu.timer.timer.Stop()
		*e.mu.timer.done = true

		e.mu.timer = timer{}
	}
}

// removeLocked prepares the entry for removal.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) removeLocked() {
	e.mu.neigh.UpdatedAt = e.cache.nic.stack.clock.Now()
	e.dispatchRemoveEventLocked()
	e.cancelTimerLocked()
	// TODO(https://gvisor.dev/issues/5583): test the case where this function is
	// called during resolution; that can happen in at least these scenarios:
	//
	//   - manual address removal during resolution
	//
	//   - neighbor cache eviction during resolution
	e.notifyCompletionLocked(&tcpip.ErrAborted{})
}

// setStateLocked transitions the entry to the specified state immediately.
//
// Follows the logic defined in RFC 4861 section 7.3.3.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) setStateLocked(next NeighborState) {
	e.cancelTimerLocked()

	prev := e.mu.neigh.State
	e.mu.neigh.State = next
	e.mu.neigh.UpdatedAt = e.cache.nic.stack.clock.Now()
	config := e.nudState.Config()

	switch next {
	case Incomplete:
		panic(fmt.Sprintf("should never transition to Incomplete with setStateLocked; neigh = %#v, prev state = %s", e.mu.neigh, prev))

	case Reachable:
		// Protected by e.mu.
		done := false

		e.mu.timer = timer{
			done: &done,
			timer: e.cache.nic.stack.Clock().AfterFunc(e.nudState.ReachableTime(), func() {
				e.mu.Lock()
				defer e.mu.Unlock()

				if done {
					// The timer was stopped because the entry changed state.
					return
				}

				e.setStateLocked(Stale)
				e.dispatchChangeEventLocked()
			}),
		}

	case Delay:
		// Protected by e.mu.
		done := false

		e.mu.timer = timer{
			done: &done,
			timer: e.cache.nic.stack.Clock().AfterFunc(config.DelayFirstProbeTime, func() {
				e.mu.Lock()
				defer e.mu.Unlock()

				if done {
					// The timer was stopped because the entry changed state.
					return
				}

				e.setStateLocked(Probe)
				e.dispatchChangeEventLocked()
			}),
		}

	case Probe:
		// Protected by e.mu.
		done := false

		remaining := config.MaxUnicastProbes
		addr := e.mu.neigh.Addr
		linkAddr := e.mu.neigh.LinkAddr

		// Send a probe in another goroutine to free this thread of execution
		// for finishing the state transition. This is necessary to escape the
		// currently held lock so we can send the probe message without holding
		// a shared lock.
		e.mu.timer = timer{
			done: &done,
			timer: e.cache.nic.stack.Clock().AfterFunc(immediateDuration, func() {
				var err tcpip.Error = &tcpip.ErrTimeout{}
				if remaining != 0 {
					err = e.cache.linkRes.LinkAddressRequest(addr, "" /* localAddr */, linkAddr)
				}

				e.mu.Lock()
				defer e.mu.Unlock()

				if done {
					// The timer was stopped because the entry changed state.
					return
				}

				if err != nil {
					e.setStateLocked(Unreachable)
					e.notifyCompletionLocked(err)
					e.dispatchChangeEventLocked()
					return
				}

				remaining--
				e.mu.timer.timer.Reset(config.RetransmitTimer)
			}),
		}

	case Unreachable:

	case Unknown, Stale, Static:
		// Do nothing

	default:
		panic(fmt.Sprintf("Invalid state transition from %q to %q", prev, next))
	}
}

// handlePacketQueuedLocked advances the state machine according to a packet
// being queued for outgoing transmission.
//
// Follows the logic defined in RFC 4861 section 7.3.3.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) handlePacketQueuedLocked(localAddr tcpip.Address) {
	switch e.mu.neigh.State {
	case Unknown, Unreachable:
		prev := e.mu.neigh.State
		e.mu.neigh.State = Incomplete
		e.mu.neigh.UpdatedAt = e.cache.nic.stack.clock.Now()

		switch prev {
		case Unknown:
			e.dispatchAddEventLocked()
		case Unreachable:
			e.dispatchChangeEventLocked()
			e.cache.nic.stats.neighbor.unreachableEntryLookups.Increment()
		}

		config := e.nudState.Config()

		// Protected by e.mu.
		done := false

		remaining := config.MaxMulticastProbes
		addr := e.mu.neigh.Addr

		// Send a probe in another goroutine to free this thread of execution
		// for finishing the state transition. This is necessary to escape the
		// currently held lock so we can send the probe message without holding
		// a shared lock.
		e.mu.timer = timer{
			done: &done,
			timer: e.cache.nic.stack.Clock().AfterFunc(immediateDuration, func() {
				var err tcpip.Error = &tcpip.ErrTimeout{}
				if remaining != 0 {
					// As per RFC 4861 section 7.2.2:
					//
					//  If the source address of the packet prompting the solicitation is
					//  the same as one of the addresses assigned to the outgoing interface,
					//  that address SHOULD be placed in the IP Source Address of the
					//  outgoing solicitation.
					//
					err = e.cache.linkRes.LinkAddressRequest(addr, localAddr, "" /* linkAddr */)
				}

				e.mu.Lock()
				defer e.mu.Unlock()

				if done {
					// The timer was stopped because the entry changed state.
					return
				}

				if err != nil {
					e.setStateLocked(Unreachable)
					e.notifyCompletionLocked(err)
					e.dispatchChangeEventLocked()
					return
				}

				remaining--
				e.mu.timer.timer.Reset(config.RetransmitTimer)
			}),
		}

	case Stale:
		e.setStateLocked(Delay)
		e.dispatchChangeEventLocked()

	case Incomplete, Reachable, Delay, Probe, Static:
		// Do nothing

	default:
		panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State))
	}
}

// handleProbeLocked processes an incoming neighbor probe (e.g. ARP request or
// Neighbor Solicitation for ARP or NDP, respectively).
//
// Follows the logic defined in RFC 4861 section 7.2.3.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) handleProbeLocked(remoteLinkAddr tcpip.LinkAddress) {
	// Probes MUST be silently discarded if the target address is tentative, does
	// not exist, or not bound to the NIC as per RFC 4861 section 7.2.3. These
	// checks MUST be done by the NetworkEndpoint.

	switch e.mu.neigh.State {
	case Unknown:
		e.mu.neigh.LinkAddr = remoteLinkAddr
		e.setStateLocked(Stale)
		e.dispatchAddEventLocked()

	case Incomplete:
		// "If an entry already exists, and the cached link-layer address
		// differs from the one in the received Source Link-Layer option, the
		// cached address should be replaced by the received address, and the
		// entry's reachability state MUST be set to STALE."
		//   - RFC 4861 section 7.2.3
		e.mu.neigh.LinkAddr = remoteLinkAddr
		e.setStateLocked(Stale)
		e.notifyCompletionLocked(nil)
		e.dispatchChangeEventLocked()

	case Reachable, Delay, Probe:
		if e.mu.neigh.LinkAddr != remoteLinkAddr {
			e.mu.neigh.LinkAddr = remoteLinkAddr
			e.setStateLocked(Stale)
			e.dispatchChangeEventLocked()
		}

	case Stale:
		if e.mu.neigh.LinkAddr != remoteLinkAddr {
			e.mu.neigh.LinkAddr = remoteLinkAddr
			e.dispatchChangeEventLocked()
		}

	case Unreachable:
		// TODO(gvisor.dev/issue/5472): Do not change the entry if the link
		// address is the same, as per RFC 7048.
		e.mu.neigh.LinkAddr = remoteLinkAddr
		e.setStateLocked(Stale)
		e.dispatchChangeEventLocked()

	case Static:
		// Do nothing

	default:
		panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State))
	}
}

// handleConfirmationLocked processes an incoming neighbor confirmation
// (e.g. ARP reply or Neighbor Advertisement for ARP or NDP, respectively).
//
// Follows the state machine defined by RFC 4861 section 7.2.5.
//
// TODO(gvisor.dev/issue/2277): To protect against ARP poisoning and other
// attacks against NDP functions, Secure Neighbor Discovery (SEND) Protocol
// should be deployed where preventing access to the broadcast segment might
// not be possible. SEND uses RSA key pairs to produce Cryptographically
// Generated Addresses (CGA), as defined in RFC 3972. This ensures that the
// claimed source of an NDP message is the owner of the claimed address.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, flags ReachabilityConfirmationFlags) {
	switch e.mu.neigh.State {
	case Incomplete:
		if len(linkAddr) == 0 {
			// "If the link layer has addresses and no Target Link-Layer Address
			// option is included, the receiving node SHOULD silently discard the
			// received advertisement."
			//   - RFC 4861 section 7.2.5
			break
		}

		e.mu.neigh.LinkAddr = linkAddr
		if flags.Solicited {
			e.setStateLocked(Reachable)
		} else {
			e.setStateLocked(Stale)
		}
		e.dispatchChangeEventLocked()
		e.mu.isRouter = flags.IsRouter
		e.notifyCompletionLocked(nil)

		// "Note that the Override flag is ignored if the entry is in the
		// INCOMPLETE state." - RFC 4861 section 7.2.5

	case Reachable, Stale, Delay, Probe:
		isLinkAddrDifferent := len(linkAddr) != 0 && e.mu.neigh.LinkAddr != linkAddr

		if isLinkAddrDifferent {
			if !flags.Override {
				if e.mu.neigh.State == Reachable {
					e.setStateLocked(Stale)
					e.dispatchChangeEventLocked()
				}
				break
			}

			e.mu.neigh.LinkAddr = linkAddr

			if !flags.Solicited {
				if e.mu.neigh.State != Stale {
					e.setStateLocked(Stale)
					e.dispatchChangeEventLocked()
				} else {
					// Notify the LinkAddr change, even though NUD state hasn't changed.
					e.dispatchChangeEventLocked()
				}
				break
			}
		}

		if flags.Solicited && (flags.Override || !isLinkAddrDifferent) {
			wasReachable := e.mu.neigh.State == Reachable
			// Set state to Reachable again to refresh timers.
			e.setStateLocked(Reachable)
			e.notifyCompletionLocked(nil)
			if !wasReachable {
				e.dispatchChangeEventLocked()
			}
		}

		if e.mu.isRouter && !flags.IsRouter && header.IsV6UnicastAddress(e.mu.neigh.Addr) {
			// "In those cases where the IsRouter flag changes from TRUE to FALSE as
			// a result of this update, the node MUST remove that router from the
			// Default Router List and update the Destination Cache entries for all
			// destinations using that neighbor as a router as specified in Section
			// 7.3.3. This is needed to detect when a node that is used as a router
			// stops forwarding packets due to being configured as a host."
			//   - RFC 4861 section 7.2.5
			//
			// TODO(gvisor.dev/issue/4085): Remove the special casing we do for IPv6
			// here.
			ep, ok := e.cache.nic.networkEndpoints[header.IPv6ProtocolNumber]
			if !ok {
				panic(fmt.Sprintf("have a neighbor entry for an IPv6 router but no IPv6 network endpoint"))
			}

			if ndpEP, ok := ep.(NDPEndpoint); ok {
				ndpEP.InvalidateDefaultRouter(e.mu.neigh.Addr)
			}
		}
		e.mu.isRouter = flags.IsRouter

	case Unknown, Unreachable, Static:
		// Do nothing

	default:
		panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State))
	}
}

// handleUpperLevelConfirmationLocked processes an incoming upper-level protocol
// (e.g. TCP acknowledgements) reachability confirmation.
//
// Precondition: e.mu MUST be locked.
func (e *neighborEntry) handleUpperLevelConfirmationLocked() {
	switch e.mu.neigh.State {
	case Reachable, Stale, Delay, Probe:
		wasReachable := e.mu.neigh.State == Reachable
		// Set state to Reachable again to refresh timers.
		e.setStateLocked(Reachable)
		if !wasReachable {
			e.dispatchChangeEventLocked()
		}

	case Unknown, Incomplete, Unreachable, Static:
		// Do nothing

	default:
		panic(fmt.Sprintf("Invalid cache entry state: %s", e.mu.neigh.State))
	}
}
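// The Solicited/Override handling in handleConfirmationLocked can be hard to
// follow inline. The following standalone sketch (not part of gVisor; every
// name in it is hypothetical) reduces the RFC 4861 section 7.2.5 rules for an
// existing entry to a pure function, ignoring timers, events, the Incomplete
// case, and the router flag.
package main

import "fmt"

type nudState int

const (
	nudReachable nudState = iota
	nudStale
)

// confirm returns the next state and cached link address after receiving a
// confirmation carrying linkAddr with the given Solicited/Override flags.
func confirm(state nudState, cached, linkAddr string, solicited, override bool) (nudState, string) {
	different := linkAddr != "" && linkAddr != cached
	if different && !override {
		// Conflicting but non-overriding: Reachable degrades to Stale; the
		// cached address is kept and other states are left alone.
		if state == nudReachable {
			return nudStale, cached
		}
		return state, cached
	}
	if different {
		cached = linkAddr
		if !solicited {
			// New address but unsolicited: record it, but mark Stale.
			return nudStale, cached
		}
	}
	if solicited {
		// Solicited confirmation of the (now) matching address: Reachable.
		return nudReachable, cached
	}
	return state, cached
}

func main() {
	st, addr := confirm(nudStale, "aa", "bb", true, true)
	fmt.Println(st == nudReachable, addr == "bb") // true true
}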
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package header

import "gvisor.dev/gvisor/pkg/tcpip"

// NDPNeighborAdvert is an NDP Neighbor Advertisement message. It will
// only contain the body of an ICMPv6 packet.
//
// See RFC 4861 section 4.4 for more details.
type NDPNeighborAdvert []byte

const (
	// NDPNAMinimumSize is the minimum size of a valid NDP Neighbor
	// Advertisement message (body of an ICMPv6 packet).
	NDPNAMinimumSize = 20

	// ndpNATargetAddressOffset is the start of the Target Address
	// field within an NDPNeighborAdvert.
	ndpNATargetAddressOffset = 4

	// ndpNAOptionsOffset is the start of the NDP options in an
	// NDPNeighborAdvert.
	ndpNAOptionsOffset = ndpNATargetAddressOffset + IPv6AddressSize

	// ndpNAFlagsOffset is the offset of the flags within an
	// NDPNeighborAdvert.
	ndpNAFlagsOffset = 0

	// ndpNARouterFlagMask is the mask of the Router Flag field in
	// the flags byte within an NDPNeighborAdvert.
	ndpNARouterFlagMask = (1 << 7)

	// ndpNASolicitedFlagMask is the mask of the Solicited Flag field in
	// the flags byte within an NDPNeighborAdvert.
	ndpNASolicitedFlagMask = (1 << 6)

	// ndpNAOverrideFlagMask is the mask of the Override Flag field in
	// the flags byte within an NDPNeighborAdvert.
	ndpNAOverrideFlagMask = (1 << 5)
)

// TargetAddress returns the value within the Target Address field.
func (b NDPNeighborAdvert) TargetAddress() tcpip.Address {
	return tcpip.Address(b[ndpNATargetAddressOffset:][:IPv6AddressSize])
}

// SetTargetAddress sets the value within the Target Address field.
func (b NDPNeighborAdvert) SetTargetAddress(addr tcpip.Address) {
	copy(b[ndpNATargetAddressOffset:][:IPv6AddressSize], addr)
}

// RouterFlag returns the value of the Router Flag field.
func (b NDPNeighborAdvert) RouterFlag() bool {
	return b[ndpNAFlagsOffset]&ndpNARouterFlagMask != 0
}

// SetRouterFlag sets the value in the Router Flag field.
func (b NDPNeighborAdvert) SetRouterFlag(f bool) {
	if f {
		b[ndpNAFlagsOffset] |= ndpNARouterFlagMask
	} else {
		b[ndpNAFlagsOffset] &^= ndpNARouterFlagMask
	}
}

// SolicitedFlag returns the value of the Solicited Flag field.
func (b NDPNeighborAdvert) SolicitedFlag() bool {
	return b[ndpNAFlagsOffset]&ndpNASolicitedFlagMask != 0
}

// SetSolicitedFlag sets the value in the Solicited Flag field.
func (b NDPNeighborAdvert) SetSolicitedFlag(f bool) {
	if f {
		b[ndpNAFlagsOffset] |= ndpNASolicitedFlagMask
	} else {
		b[ndpNAFlagsOffset] &^= ndpNASolicitedFlagMask
	}
}

// OverrideFlag returns the value of the Override Flag field.
func (b NDPNeighborAdvert) OverrideFlag() bool {
	return b[ndpNAFlagsOffset]&ndpNAOverrideFlagMask != 0
}

// SetOverrideFlag sets the value in the Override Flag field.
func (b NDPNeighborAdvert) SetOverrideFlag(f bool) {
	if f {
		b[ndpNAFlagsOffset] |= ndpNAOverrideFlagMask
	} else {
		b[ndpNAFlagsOffset] &^= ndpNAOverrideFlagMask
	}
}

// Options returns an NDPOptions of the options body.
func (b NDPNeighborAdvert) Options() NDPOptions {
	return NDPOptions(b[ndpNAOptionsOffset:])
}
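// For a feel for how these accessors compose, here is a minimal standalone
// sketch that builds a solicited, overriding Neighbor Advertisement body and
// reads the fields back. It assumes the import paths below and a fixed
// example target address; it is an illustration, not gVisor test code.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
)

func main() {
	// Allocate a minimal NA body (flags + target address, no options).
	na := header.NDPNeighborAdvert(make([]byte, header.NDPNAMinimumSize))

	// The target is the address whose reachability is being advertised
	// (fe80::1 here).
	target := tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
	na.SetTargetAddress(target)

	// Mark the advertisement as a response to a solicitation that should
	// override any cached link-layer address.
	na.SetSolicitedFlag(true)
	na.SetOverrideFlag(true)

	fmt.Println(na.TargetAddress() == target)                           // true
	fmt.Println(na.SolicitedFlag(), na.OverrideFlag(), na.RouterFlag()) // true true false
}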
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"math"
	"time"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)

const opsMax = 500 // SEMOPM

// Semget handles: semget(key_t key, int nsems, int semflg)
func Semget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	key := args[0].Int()
	nsems := args[1].Int()
	flag := args[2].Int()

	private := key == linux.IPC_PRIVATE
	create := flag&linux.IPC_CREAT == linux.IPC_CREAT
	exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL
	mode := linux.FileMode(flag & 0777)

	r := t.IPCNamespace().SemaphoreRegistry()
	set, err := r.FindOrCreate(t, key, nsems, mode, private, create, exclusive)
	if err != nil {
		return 0, nil, err
	}
	return uintptr(set.ID), nil, nil
}

// Semtimedop handles: semtimedop(int semid, struct sembuf *sops, size_t nsops, const struct timespec *timeout)
func Semtimedop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// If the timeout argument is NULL, then semtimedop() behaves exactly like semop().
	if args[3].Pointer() == 0 {
		return Semop(t, args)
	}

	id := args[0].Int()
	sembufAddr := args[1].Pointer()
	nsops := args[2].SizeT()
	timespecAddr := args[3].Pointer()
	if nsops <= 0 {
		return 0, nil, linuxerr.EINVAL
	}
	if nsops > opsMax {
		return 0, nil, linuxerr.E2BIG
	}

	ops := make([]linux.Sembuf, nsops)
	if _, err := linux.CopySembufSliceIn(t, sembufAddr, ops); err != nil {
		return 0, nil, err
	}

	var timeout linux.Timespec
	if _, err := timeout.CopyIn(t, timespecAddr); err != nil {
		return 0, nil, err
	}
	if timeout.Sec < 0 || timeout.Nsec < 0 || timeout.Nsec >= 1e9 {
		return 0, nil, linuxerr.EINVAL
	}

	if err := semTimedOp(t, id, ops, true, timeout.ToDuration()); err != nil {
		if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
			return 0, nil, linuxerr.EAGAIN
		}
		return 0, nil, err
	}
	return 0, nil, nil
}

// Semop handles: semop(int semid, struct sembuf *sops, size_t nsops)
func Semop(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	id := args[0].Int()
	sembufAddr := args[1].Pointer()
	nsops := args[2].SizeT()

	if nsops <= 0 {
		return 0, nil, linuxerr.EINVAL
	}
	if nsops > opsMax {
		return 0, nil, linuxerr.E2BIG
	}

	ops := make([]linux.Sembuf, nsops)
	if _, err := linux.CopySembufSliceIn(t, sembufAddr, ops); err != nil {
		return 0, nil, err
	}
	return 0, nil, semTimedOp(t, id, ops, false, time.Second)
}

func semTimedOp(t *kernel.Task, id int32, ops []linux.Sembuf, haveTimeout bool, timeout time.Duration) error {
	set := t.IPCNamespace().SemaphoreRegistry().FindByID(id)
	if set == nil {
		return linuxerr.EINVAL
	}
	creds := auth.CredentialsFromContext(t)
	pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup())
	for {
		ch, num, err := set.ExecuteOps(t, ops, creds, int32(pid))
		if ch == nil || err != nil {
			return err
		}

		if _, err = t.BlockWithTimeout(ch, haveTimeout, timeout); err != nil {
			set.AbortWait(num, ch)
			return err
		}
	}
}

// Semctl handles: semctl(int semid, int semnum, int cmd, ...)
func Semctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	id := args[0].Int()
	num := args[1].Int()
	cmd := args[2].Int()

	switch cmd {
	case linux.SETVAL:
		val := args[3].Int()
		if val > math.MaxInt16 {
			return 0, nil, linuxerr.ERANGE
		}
		return 0, nil, setVal(t, id, num, int16(val))

	case linux.SETALL:
		array := args[3].Pointer()
		return 0, nil, setValAll(t, id, array)

	case linux.GETVAL:
		v, err := getVal(t, id, num)
		return uintptr(v), nil, err

	case linux.GETALL:
		array := args[3].Pointer()
		return 0, nil, getValAll(t, id, array)

	case linux.IPC_RMID:
		return 0, nil, remove(t, id)

	case linux.IPC_SET:
		arg := args[3].Pointer()
		var s linux.SemidDS
		if _, err := s.CopyIn(t, arg); err != nil {
			return 0, nil, err
		}

		perms := fs.FilePermsFromMode(linux.FileMode(s.SemPerm.Mode & 0777))
		return 0, nil, ipcSet(t, id, auth.UID(s.SemPerm.UID), auth.GID(s.SemPerm.GID), perms)

	case linux.GETPID:
		v, err := getPID(t, id, num)
		return uintptr(v), nil, err

	case linux.IPC_STAT:
		arg := args[3].Pointer()
		ds, err := ipcStat(t, id)
		if err == nil {
			_, err = ds.CopyOut(t, arg)
		}
		return 0, nil, err

	case linux.GETZCNT:
		v, err := getZCnt(t, id, num)
		return uintptr(v), nil, err

	case linux.GETNCNT:
		v, err := getNCnt(t, id, num)
		return uintptr(v), nil, err

	case linux.IPC_INFO:
		buf := args[3].Pointer()
		r := t.IPCNamespace().SemaphoreRegistry()
		info := r.IPCInfo()
		if _, err := info.CopyOut(t, buf); err != nil {
			return 0, nil, err
		}
		return uintptr(r.HighestIndex()), nil, nil

	case linux.SEM_INFO:
		buf := args[3].Pointer()
		r := t.IPCNamespace().SemaphoreRegistry()
		info := r.SemInfo()
		if _, err := info.CopyOut(t, buf); err != nil {
			return 0, nil, err
		}
		return uintptr(r.HighestIndex()), nil, nil

	case linux.SEM_STAT:
		arg := args[3].Pointer()
		// id is an index in SEM_STAT.
		semid, ds, err := semStat(t, id)
		if err != nil {
			return 0, nil, err
		}
		if _, err := ds.CopyOut(t, arg); err != nil {
			return 0, nil, err
		}
		return uintptr(semid), nil, err

	case linux.SEM_STAT_ANY:
		arg := args[3].Pointer()
		// id is an index in SEM_STAT.
		semid, ds, err := semStatAny(t, id)
		if err != nil {
			return 0, nil, err
		}
		if _, err := ds.CopyOut(t, arg); err != nil {
			return 0, nil, err
		}
		return uintptr(semid), nil, err

	default:
		return 0, nil, linuxerr.EINVAL
	}
}

func remove(t *kernel.Task, id int32) error {
	r := t.IPCNamespace().SemaphoreRegistry()
	creds := auth.CredentialsFromContext(t)
	return r.RemoveID(id, creds)
}

func ipcSet(t *kernel.Task, id int32, uid auth.UID, gid auth.GID, perms fs.FilePermissions) error {
	r := t.IPCNamespace().SemaphoreRegistry()
	set := r.FindByID(id)
	if set == nil {
		return linuxerr.EINVAL
	}

	creds := auth.CredentialsFromContext(t)
	kuid := creds.UserNamespace.MapToKUID(uid)
	if !kuid.Ok() {
		return linuxerr.EINVAL
	}
	kgid := creds.UserNamespace.MapToKGID(gid)
	if !kgid.Ok() {
		return linuxerr.EINVAL
	}
	owner := fs.FileOwner{UID: kuid, GID: kgid}
	return set.Change(t, creds, owner, perms)
}

func ipcStat(t *kernel.Task, id int32) (*linux.SemidDS, error) {
	r := t.IPCNamespace().SemaphoreRegistry()
	set := r.FindByID(id)
	if set == nil {
		return nil, linuxerr.EINVAL
	}
	creds := auth.CredentialsFromContext(t)
	return set.GetStat(creds)
}

func semStat(t *kernel.Task, index int32) (int32, *linux.SemidDS, error) {
	r := t.IPCNamespace().SemaphoreRegistry()
	set := r.FindByIndex(index)
	if set == nil {
		return 0, nil, linuxerr.EINVAL
	}
	creds := auth.CredentialsFromContext(t)
	ds, err := set.GetStat(creds)
	if err != nil {
		return 0, ds, err
	}
	return set.ID, ds, nil
}

func semStatAny(t *kernel.Task, index int32) (int32, *linux.SemidDS, error) {
	set := t.IPCNamespace().SemaphoreRegistry().FindByIndex(index)
	if set == nil {
		return 0, nil, linuxerr.EINVAL
	}
	creds := auth.CredentialsFromContext(t)
	ds, err := set.GetStatAny(creds)
	if err != nil {
		return 0, ds, err
	}
	return set.ID, ds, nil
}

func setVal(t *kernel.Task, id int32, num int32, val int16) error {
	r := t.IPCNamespace().SemaphoreRegistry()
	set := r.FindByID(id)
	if set == nil {
		return linuxerr.EINVAL
	}
	creds := auth.CredentialsFromContext(t)
	pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup())
	return set.SetVal(t, num, val, creds, int32(pid))
}

func setValAll(t *kernel.Task, id int32, array hostarch.Addr) error {
	r := t.IPCNamespace().SemaphoreRegistry()
	set := r.FindByID(id)
	if set == nil {
		return linuxerr.EINVAL
	}
	vals := make([]uint16, set.Size())
	if _, err := primitive.CopyUint16SliceIn(t, array, vals); err != nil {
		return err
	}
	creds := auth.CredentialsFromContext(t)
	pid := t.Kernel().GlobalInit().PIDNamespace().IDOfThreadGroup(t.ThreadGroup())
	return set.SetValAll(t, vals, creds, int32(pid))
}

func getVal(t *kernel.Task, id int32, num int32) (int16, error) {
	r := t.IPCNamespace().SemaphoreRegistry()
	set := r.FindByID(id)
	if set == nil {
		return 0, linuxerr.EINVAL
	}
	creds := auth.CredentialsFromContext(t)
	return set.GetVal(num, creds)
}

func getValAll(t *kernel.Task, id int32, array hostarch.Addr) error {
	r := t.IPCNamespace().SemaphoreRegistry()
	set := r.FindByID(id)
	if set == nil {
		return linuxerr.EINVAL
	}
	creds := auth.CredentialsFromContext(t)
	vals, err := set.GetValAll(creds)
	if err != nil {
		return err
	}
	_, err = primitive.CopyUint16SliceOut(t, array, vals)
	return err
}

func getPID(t *kernel.Task, id int32, num int32) (int32, error) {
	r := t.IPCNamespace().SemaphoreRegistry()
	set := r.FindByID(id)
	if set == nil {
		return 0, linuxerr.EINVAL
	}
	creds := auth.CredentialsFromContext(t)
	gpid, err := set.GetPID(num, creds)
	if err != nil {
		return 0, err
	}
	// Convert pid from init namespace to the caller's namespace.
	tg := t.PIDNamespace().ThreadGroupWithID(kernel.ThreadID(gpid))
	if tg == nil {
		return 0, nil
	}
	return int32(tg.ID()), nil
}

func getZCnt(t *kernel.Task, id int32, num int32) (uint16, error) {
	r := t.IPCNamespace().SemaphoreRegistry()
	set := r.FindByID(id)
	if set == nil {
		return 0, linuxerr.EINVAL
	}
	creds := auth.CredentialsFromContext(t)
	return set.CountZeroWaiters(num, creds)
}

func getNCnt(t *kernel.Task, id int32, num int32) (uint16, error) {
	r := t.IPCNamespace().SemaphoreRegistry()
	set := r.FindByID(id)
	if set == nil {
		return 0, linuxerr.EINVAL
	}
	creds := auth.CredentialsFromContext(t)
	return set.CountNegativeWaiters(num, creds)
}
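// The handlers above implement the kernel side of the System V semaphore
// calls. For orientation, the following standalone, Linux-specific sketch
// (raw amd64 syscall numbers via golang.org/x/sys/unix; the sembuf layout and
// everything else here is illustrative, not gVisor code) drives the same
// interface from userspace: create a private set, run a V then a P operation
// in one semop call, and remove the set.
package main

import (
	"fmt"
	"unsafe"

	"golang.org/x/sys/unix"
)

// sembuf mirrors struct sembuf from <sys/sem.h>.
type sembuf struct {
	semNum uint16
	semOp  int16
	semFlg int16
}

func main() {
	// semget(IPC_PRIVATE, 1, IPC_CREAT|0600): a private set with one semaphore.
	id, _, errno := unix.Syscall(unix.SYS_SEMGET, 0 /* IPC_PRIVATE */, 1, unix.IPC_CREAT|0600)
	if errno != 0 {
		panic(errno)
	}
	// semctl(id, 0, IPC_RMID): destroy the set when done.
	defer unix.Syscall(unix.SYS_SEMCTL, id, 0, unix.IPC_RMID)

	// semop(id, ops, 2): increment then decrement semaphore 0 atomically-ish
	// in a single call; neither operation blocks here.
	ops := [2]sembuf{{semNum: 0, semOp: 1}, {semNum: 0, semOp: -1}}
	if _, _, errno := unix.Syscall(unix.SYS_SEMOP, id, uintptr(unsafe.Pointer(&ops[0])), uintptr(len(ops))); errno != 0 {
		panic(errno)
	}
	fmt.Println("semop sequence completed on set", id)
}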
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package errors holds the standardized error definition for gVisor.
package errors

import (
	"gvisor.dev/gvisor/pkg/abi/linux/errno"
)

// Error represents a syscall errno with a descriptive message.
type Error struct {
	errno   errno.Errno
	message string
}

// New creates a new *Error.
func New(err errno.Errno, message string) *Error {
	return &Error{
		errno:   err,
		message: message,
	}
}

// Error implements error.Error.
func (e *Error) Error() string { return e.message }

// Errno returns the underlying errno.Errno value.
func (e *Error) Errno() errno.Errno { return e.errno }
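// A short sketch of how this package composes with an errno: construct an
// *Error once and reuse it, in the style of gVisor's linuxerr package. The
// message, the variable name, and the choice of errno.EAGAIN are assumptions
// made for illustration.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux/errno"
	gvisorerrors "gvisor.dev/gvisor/pkg/errors"
)

// errNotReady pairs a human-readable message with a precise errno.
var errNotReady = gvisorerrors.New(errno.EAGAIN, "try again")

func main() {
	fmt.Println(errNotReady.Error()) // "try again"
	fmt.Println(errNotReady.Errno()) // the numeric errno handed back to userspace
}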
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package header

import (
	"crypto/sha256"
	"encoding/binary"
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
)

const (
	versTCFL = 0
	// IPv6PayloadLenOffset is the offset of the PayloadLength field in
	// IPv6 header.
	IPv6PayloadLenOffset = 4
	// IPv6NextHeaderOffset is the offset of the NextHeader field in
	// IPv6 header.
	IPv6NextHeaderOffset = 6
	hopLimit             = 7
	v6SrcAddr            = 8
	v6DstAddr            = v6SrcAddr + IPv6AddressSize

	// IPv6FixedHeaderSize is the size of the fixed header.
	IPv6FixedHeaderSize = v6DstAddr + IPv6AddressSize
)

// IPv6Fields contains the fields of an IPv6 packet. It is used to describe the
// fields of a packet that needs to be encoded.
type IPv6Fields struct {
	// TrafficClass is the "traffic class" field of an IPv6 packet.
	TrafficClass uint8

	// FlowLabel is the "flow label" field of an IPv6 packet.
	FlowLabel uint32

	// PayloadLength is the "payload length" field of an IPv6 packet, including
	// the length of all extension headers.
	PayloadLength uint16

	// TransportProtocol is the transport layer protocol number. Serialized in the
	// last "next header" field of the IPv6 header + extension headers.
	TransportProtocol tcpip.TransportProtocolNumber

	// HopLimit is the "Hop Limit" field of an IPv6 packet.
	HopLimit uint8

	// SrcAddr is the "source ip address" of an IPv6 packet.
	SrcAddr tcpip.Address

	// DstAddr is the "destination ip address" of an IPv6 packet.
	DstAddr tcpip.Address

	// ExtensionHeaders are the extension headers following the IPv6 header.
	ExtensionHeaders IPv6ExtHdrSerializer
}

// IPv6 represents an ipv6 header stored in a byte array.
// Most of the methods of IPv6 access the underlying slice without checking
// the boundaries and could panic because of 'index out of range'. Always call
// IsValid() to validate an instance of IPv6 before using other methods.
type IPv6 []byte

const (
	// IPv6MinimumSize is the minimum size of a valid IPv6 packet.
	IPv6MinimumSize = IPv6FixedHeaderSize

	// IPv6AddressSize is the size, in bytes, of an IPv6 address.
	IPv6AddressSize = 16

	// IPv6MaximumPayloadSize is the maximum size of a valid IPv6 payload per
	// RFC 8200 Section 4.5.
	IPv6MaximumPayloadSize = 65535

	// IPv6ProtocolNumber is IPv6's network protocol number.
	IPv6ProtocolNumber tcpip.NetworkProtocolNumber = 0x86dd

	// IPv6Version is the version of the ipv6 protocol.
	IPv6Version = 6

	// IPv6AllNodesMulticastAddress is a link-local multicast group that
	// all IPv6 nodes MUST join, as per RFC 4291, section 2.8. Packets
	// destined to this address will reach all nodes on a link.
	//
	// The address is ff02::1.
	IPv6AllNodesMulticastAddress tcpip.Address = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"

	// IPv6AllRoutersInterfaceLocalMulticastAddress is an interface-local
	// multicast group that all IPv6 routers MUST join, as per RFC 4291, section
	// 2.8. Packets destined to this address will reach the router on an
	// interface.
	//
	// The address is ff01::2.
	IPv6AllRoutersInterfaceLocalMulticastAddress tcpip.Address = "\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"

	// IPv6AllRoutersLinkLocalMulticastAddress is a link-local multicast group
	// that all IPv6 routers MUST join, as per RFC 4291, section 2.8. Packets
	// destined to this address will reach all routers on a link.
	//
	// The address is ff02::2.
	IPv6AllRoutersLinkLocalMulticastAddress tcpip.Address = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"

	// IPv6AllRoutersSiteLocalMulticastAddress is a site-local multicast group
	// that all IPv6 routers MUST join, as per RFC 4291, section 2.8. Packets
	// destined to this address will reach all routers in a site.
	//
	// The address is ff05::2.
	IPv6AllRoutersSiteLocalMulticastAddress tcpip.Address = "\xff\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"

	// IPv6MinimumMTU is the minimum MTU required by IPv6, per RFC 8200,
	// section 5:
	//   IPv6 requires that every link in the Internet have an MTU of 1280 octets
	//   or greater. This is known as the IPv6 minimum link MTU.
	IPv6MinimumMTU = 1280

	// IPv6Loopback is the IPv6 Loopback address.
	IPv6Loopback tcpip.Address = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"

	// IPv6Any is the non-routable IPv6 "any" meta address. It is also
	// known as the unspecified address.
	IPv6Any tcpip.Address = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"

	// IIDSize is the size of an interface identifier (IID), in bytes, as
	// defined by RFC 4291 section 2.5.1.
	IIDSize = 8

	// IIDOffsetInIPv6Address is the offset, in bytes, from the start
	// of an IPv6 address to the beginning of the interface identifier
	// (IID) for auto-generated addresses. That is, all bytes before
	// the IIDOffsetInIPv6Address-th byte are the prefix bytes, and all
	// bytes including and after the IIDOffsetInIPv6Address-th byte are
	// for the IID.
	IIDOffsetInIPv6Address = 8

	// OpaqueIIDSecretKeyMinBytes is the recommended minimum number of bytes
	// for the secret key used to generate an opaque interface identifier as
	// outlined by RFC 7217.
	OpaqueIIDSecretKeyMinBytes = 16

	// ipv6MulticastAddressScopeByteIdx is the byte where the scope (scop) field
	// is located within a multicast IPv6 address, as per RFC 4291 section 2.7.
	ipv6MulticastAddressScopeByteIdx = 1

	// ipv6MulticastAddressScopeMask is the mask for the scope (scop) field,
	// within the byte holding the field, as per RFC 4291 section 2.7.
	ipv6MulticastAddressScopeMask = 0xF
)

// IPv6EmptySubnet is the empty IPv6 subnet. It may also be known as the
// catch-all or wildcard subnet. That is, all IPv6 addresses are considered to
// be contained within this subnet.
var IPv6EmptySubnet = tcpip.AddressWithPrefix{
	Address:   IPv6Any,
	PrefixLen: 0,
}.Subnet()

// IPv4MappedIPv6Subnet is the prefix for an IPv4 mapped IPv6 address as defined
// by RFC 4291 section 2.5.5.
var IPv4MappedIPv6Subnet = tcpip.AddressWithPrefix{
	Address:   "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00",
	PrefixLen: 96,
}.Subnet()

// IPv6LinkLocalPrefix is the prefix for IPv6 link-local addresses, as defined
// by RFC 4291 section 2.5.6.
//
// The prefix is fe80::/64.
var IPv6LinkLocalPrefix = tcpip.AddressWithPrefix{
	Address:   "\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
	PrefixLen: 64,
}

// PayloadLength returns the value of the "payload length" field of the ipv6
// header.
func (b IPv6) PayloadLength() uint16 {
	return binary.BigEndian.Uint16(b[IPv6PayloadLenOffset:])
}

// HopLimit returns the value of the "Hop Limit" field of the ipv6 header.
func (b IPv6) HopLimit() uint8 {
	return b[hopLimit]
}

// NextHeader returns the value of the "next header" field of the ipv6 header.
func (b IPv6) NextHeader() uint8 {
	return b[IPv6NextHeaderOffset]
}

// TransportProtocol implements Network.TransportProtocol.
func (b IPv6) TransportProtocol() tcpip.TransportProtocolNumber {
	return tcpip.TransportProtocolNumber(b.NextHeader())
}

// Payload implements Network.Payload.
func (b IPv6) Payload() []byte {
	return b[IPv6MinimumSize:][:b.PayloadLength()]
}

// SourceAddress returns the "source address" field of the ipv6 header.
func (b IPv6) SourceAddress() tcpip.Address {
	return tcpip.Address(b[v6SrcAddr:][:IPv6AddressSize])
}

// DestinationAddress returns the "destination address" field of the ipv6
// header.
func (b IPv6) DestinationAddress() tcpip.Address {
	return tcpip.Address(b[v6DstAddr:][:IPv6AddressSize])
}

// Checksum implements Network.Checksum. Given that IPv6 doesn't have a
// checksum, it just returns 0.
func (IPv6) Checksum() uint16 {
	return 0
}

// TOS returns the "traffic class" and "flow label" fields of the ipv6 header.
func (b IPv6) TOS() (uint8, uint32) {
	v := binary.BigEndian.Uint32(b[versTCFL:])
	return uint8(v >> 20), v & 0xfffff
}

// SetTOS sets the "traffic class" and "flow label" fields of the ipv6 header.
func (b IPv6) SetTOS(t uint8, l uint32) {
	vtf := (6 << 28) | (uint32(t) << 20) | (l & 0xfffff)
	binary.BigEndian.PutUint32(b[versTCFL:], vtf)
}

// SetPayloadLength sets the "payload length" field of the ipv6 header.
func (b IPv6) SetPayloadLength(payloadLength uint16) {
	binary.BigEndian.PutUint16(b[IPv6PayloadLenOffset:], payloadLength)
}

// SetSourceAddress sets the "source address" field of the ipv6 header.
func (b IPv6) SetSourceAddress(addr tcpip.Address) {
	copy(b[v6SrcAddr:][:IPv6AddressSize], addr)
}

// SetDestinationAddress sets the "destination address" field of the ipv6
// header.
func (b IPv6) SetDestinationAddress(addr tcpip.Address) {
	copy(b[v6DstAddr:][:IPv6AddressSize], addr)
}

// SetHopLimit sets the value of the "Hop Limit" field.
func (b IPv6) SetHopLimit(v uint8) {
	b[hopLimit] = v
}

// SetNextHeader sets the value of the "next header" field of the ipv6 header.
func (b IPv6) SetNextHeader(v uint8) {
	b[IPv6NextHeaderOffset] = v
}

// SetChecksum implements Network.SetChecksum. Given that IPv6 doesn't have a
// checksum, it is empty.
func (IPv6) SetChecksum(uint16) {
}

// Encode encodes all the fields of the ipv6 header.
func (b IPv6) Encode(i *IPv6Fields) {
	extHdr := b[IPv6MinimumSize:]
	b.SetTOS(i.TrafficClass, i.FlowLabel)
	b.SetPayloadLength(i.PayloadLength)
	b[hopLimit] = i.HopLimit
	b.SetSourceAddress(i.SrcAddr)
	b.SetDestinationAddress(i.DstAddr)
	nextHeader, _ := i.ExtensionHeaders.Serialize(i.TransportProtocol, extHdr)
	b[IPv6NextHeaderOffset] = nextHeader
}

// IsValid performs basic validation on the packet.
func (b IPv6) IsValid(pktSize int) bool {
	if len(b) < IPv6MinimumSize {
		return false
	}

	dlen := int(b.PayloadLength())
	if dlen > pktSize-IPv6MinimumSize {
		return false
	}

	if IPVersion(b) != IPv6Version {
		return false
	}

	return true
}

// IsV4MappedAddress determines if the provided address is an IPv4 mapped
// address by checking if its prefix is 0:0:0:0:0:ffff::/96.
func IsV4MappedAddress(addr tcpip.Address) bool {
	if len(addr) != IPv6AddressSize {
		return false
	}

	return IPv4MappedIPv6Subnet.Contains(addr)
}

// IsV6MulticastAddress determines if the provided address is an IPv6
// multicast address (anything starting with FF).
func IsV6MulticastAddress(addr tcpip.Address) bool {
	if len(addr) != IPv6AddressSize {
		return false
	}
	return addr[0] == 0xff
}

// IsV6UnicastAddress determines if the provided address is a valid IPv6
// unicast (and specified) address. That is, IsV6UnicastAddress returns
// true if addr contains IPv6AddressSize bytes, is not the unspecified
// address and is not a multicast address.
func IsV6UnicastAddress(addr tcpip.Address) bool {
	if len(addr) != IPv6AddressSize {
		return false
	}

	// Must not be unspecified
	if addr == IPv6Any {
		return false
	}

	// Return if not a multicast.
	return addr[0] != 0xff
}

const solicitedNodeMulticastPrefix = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff"

// SolicitedNodeAddr computes the solicited-node multicast address. This is
// used for NDP. Described in RFC 4291. The argument must be a full-length IPv6
// address.
func SolicitedNodeAddr(addr tcpip.Address) tcpip.Address {
	return solicitedNodeMulticastPrefix + addr[len(addr)-3:]
}

// IsSolicitedNodeAddr determines whether the address is a solicited-node
// multicast address.
func IsSolicitedNodeAddr(addr tcpip.Address) bool {
	return solicitedNodeMulticastPrefix == addr[:len(addr)-3]
}

// EthernetAdddressToModifiedEUI64IntoBuf populates buf with a modified EUI-64
// from a 48-bit Ethernet/MAC address, as per RFC 4291 section 2.5.1.
//
// buf MUST be at least 8 bytes.
func EthernetAdddressToModifiedEUI64IntoBuf(linkAddr tcpip.LinkAddress, buf []byte) {
	buf[0] = linkAddr[0] ^ 2
	buf[1] = linkAddr[1]
	buf[2] = linkAddr[2]
	buf[3] = 0xFF
	buf[4] = 0xFE
	buf[5] = linkAddr[3]
	buf[6] = linkAddr[4]
	buf[7] = linkAddr[5]
}

// EthernetAddressToModifiedEUI64 computes a modified EUI-64 from a 48-bit
// Ethernet/MAC address, as per RFC 4291 section 2.5.1.
func EthernetAddressToModifiedEUI64(linkAddr tcpip.LinkAddress) [IIDSize]byte {
	var buf [IIDSize]byte
	EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, buf[:])
	return buf
}

// LinkLocalAddr computes the default IPv6 link-local address from a link-layer
// (MAC) address.
func LinkLocalAddr(linkAddr tcpip.LinkAddress) tcpip.Address {
	// Convert a 48-bit MAC to a modified EUI-64 and then prepend the
	// link-local header, FE80::.
	//
	// The conversion is very nearly:
	//	aa:bb:cc:dd:ee:ff => FE80::Aabb:ccFF:FEdd:eeff
	// Note the capital A. The conversion aa->Aa involves a bit flip.
	lladdrb := [IPv6AddressSize]byte{
		0: 0xFE,
		1: 0x80,
	}
	EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, lladdrb[IIDOffsetInIPv6Address:])
	return tcpip.Address(lladdrb[:])
}

// IsV6LinkLocalUnicastAddress returns true iff the provided address is an IPv6
// link-local unicast address, as defined by RFC 4291 section 2.5.6.
func IsV6LinkLocalUnicastAddress(addr tcpip.Address) bool {
	if len(addr) != IPv6AddressSize {
		return false
	}
	return addr[0] == 0xfe && (addr[1]&0xc0) == 0x80
}

// IsV6LoopbackAddress returns true iff the provided address is an IPv6 loopback
// address, as defined by RFC 4291 section 2.5.3.
func IsV6LoopbackAddress(addr tcpip.Address) bool {
	return addr == IPv6Loopback
}

// IsV6LinkLocalMulticastAddress returns true iff the provided address is an
// IPv6 link-local multicast address, as defined by RFC 4291 section 2.7.
func IsV6LinkLocalMulticastAddress(addr tcpip.Address) bool {
	return IsV6MulticastAddress(addr) && V6MulticastScope(addr) == IPv6LinkLocalMulticastScope
}

// AppendOpaqueInterfaceIdentifier appends a 64-bit opaque interface identifier
// (IID) to buf as outlined by RFC 7217 and returns the extended buffer.
//
// The opaque IID is generated from the cryptographic hash of the concatenation
// of the prefix, NIC's name, DAD counter (DAD retry counter) and the secret
// key. The secret key SHOULD be at least OpaqueIIDSecretKeyMinBytes bytes and
// MUST be a pseudo-random number. See RFC 4086 for randomness requirements
// for security.
//
// If buf has enough capacity for the IID (IIDSize bytes), a new underlying
// array for the buffer will not be allocated.
func AppendOpaqueInterfaceIdentifier(buf []byte, prefix tcpip.Subnet, nicName string, dadCounter uint8, secretKey []byte) []byte {
	// As per RFC 7217 section 5, the opaque identifier can be generated as a
	// cryptographic hash of the concatenation of each of the function parameters.
	// Note, we omit the optional Network_ID field.
	h := sha256.New()
	// h.Write never returns an error.
	h.Write([]byte(prefix.ID()[:IIDOffsetInIPv6Address]))
	h.Write([]byte(nicName))
	h.Write([]byte{dadCounter})
	h.Write(secretKey)

	var sumBuf [sha256.Size]byte
	sum := h.Sum(sumBuf[:0])

	return append(buf, sum[:IIDSize]...)
}

// LinkLocalAddrWithOpaqueIID computes the default IPv6 link-local address with
// an opaque IID.
func LinkLocalAddrWithOpaqueIID(nicName string, dadCounter uint8, secretKey []byte) tcpip.Address {
	lladdrb := [IPv6AddressSize]byte{
		0: 0xFE,
		1: 0x80,
	}

	return tcpip.Address(AppendOpaqueInterfaceIdentifier(lladdrb[:IIDOffsetInIPv6Address], IPv6LinkLocalPrefix.Subnet(), nicName, dadCounter, secretKey))
}

// IPv6AddressScope is the scope of an IPv6 address.
type IPv6AddressScope int

const (
	// LinkLocalScope indicates a link-local address.
	LinkLocalScope IPv6AddressScope = iota

	// GlobalScope indicates a global address.
	GlobalScope
)

// ScopeForIPv6Address returns the scope for an IPv6 address.
func ScopeForIPv6Address(addr tcpip.Address) (IPv6AddressScope, tcpip.Error) {
	if len(addr) != IPv6AddressSize {
		return GlobalScope, &tcpip.ErrBadAddress{}
	}

	switch {
	case IsV6LinkLocalMulticastAddress(addr):
		return LinkLocalScope, nil

	case IsV6LinkLocalUnicastAddress(addr):
		return LinkLocalScope, nil

	default:
		return GlobalScope, nil
	}
}

// InitialTempIID generates the initial temporary IID history value to generate
// temporary SLAAC addresses with.
//
// Panics if initialTempIIDHistory is not at least IIDSize bytes.
func InitialTempIID(initialTempIIDHistory []byte, seed []byte, nicID tcpip.NICID) {
	h := sha256.New()
	// h.Write never returns an error.
	h.Write(seed)
	var nicIDBuf [4]byte
	binary.BigEndian.PutUint32(nicIDBuf[:], uint32(nicID))
	h.Write(nicIDBuf[:])

	var sumBuf [sha256.Size]byte
	sum := h.Sum(sumBuf[:0])

	if n := copy(initialTempIIDHistory, sum[sha256.Size-IIDSize:]); n != IIDSize {
		panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, IIDSize))
	}
}

// GenerateTempIPv6SLAACAddr generates a temporary SLAAC IPv6 address for an
// associated stable/permanent SLAAC address.
//
// GenerateTempIPv6SLAACAddr will update the temporary IID history value to be
// used when generating a new temporary IID.
//
// Panics if tempIIDHistory is not at least IIDSize bytes.
func GenerateTempIPv6SLAACAddr(tempIIDHistory []byte, stableAddr tcpip.Address) tcpip.AddressWithPrefix {
	addrBytes := []byte(stableAddr)
	h := sha256.New()
	h.Write(tempIIDHistory)
	h.Write(addrBytes[IIDOffsetInIPv6Address:])

	var sumBuf [sha256.Size]byte
	sum := h.Sum(sumBuf[:0])

	// The rightmost 64 bits of sum are saved for the next iteration.
	if n := copy(tempIIDHistory, sum[sha256.Size-IIDSize:]); n != IIDSize {
		panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, IIDSize))
	}

	// The leftmost 64 bits of sum are used as the IID.
	if n := copy(addrBytes[IIDOffsetInIPv6Address:], sum); n != IIDSize {
		panic(fmt.Sprintf("copied %d IID bytes, expected %d bytes", n, IIDSize))
	}

	return tcpip.AddressWithPrefix{
		Address:   tcpip.Address(addrBytes),
		PrefixLen: IIDOffsetInIPv6Address * 8,
	}
}

// IPv6MulticastScope is the scope of a multicast IPv6 address, as defined by
// RFC 7346 section 2.
type IPv6MulticastScope uint8

// The various values for IPv6 multicast scopes, as per RFC 7346 section 2:
//
//   +------+--------------------------+-------------------------+
//   | scop | NAME                     | REFERENCE               |
//   +------+--------------------------+-------------------------+
//   |  0   | Reserved                 | [RFC4291], RFC 7346     |
//   |  1   | Interface-Local scope    | [RFC4291], RFC 7346     |
//   |  2   | Link-Local scope         | [RFC4291], RFC 7346     |
//   |  3   | Realm-Local scope        | [RFC4291], RFC 7346     |
//   |  4   | Admin-Local scope        | [RFC4291], RFC 7346     |
//   |  5   | Site-Local scope         | [RFC4291], RFC 7346     |
//   |  6   | Unassigned               |                         |
//   |  7   | Unassigned               |                         |
//   |  8   | Organization-Local scope | [RFC4291], RFC 7346     |
//   |  9   | Unassigned               |                         |
//   |  A   | Unassigned               |                         |
//   |  B   | Unassigned               |                         |
//   |  C   | Unassigned               |                         |
//   |  D   | Unassigned               |                         |
//   |  E   | Global scope             | [RFC4291], RFC 7346     |
//   |  F   | Reserved                 | [RFC4291], RFC 7346     |
//   +------+--------------------------+-------------------------+
const (
	IPv6Reserved0MulticastScope         = IPv6MulticastScope(0x0)
	IPv6InterfaceLocalMulticastScope    = IPv6MulticastScope(0x1)
	IPv6LinkLocalMulticastScope         = IPv6MulticastScope(0x2)
	IPv6RealmLocalMulticastScope        = IPv6MulticastScope(0x3)
	IPv6AdminLocalMulticastScope        = IPv6MulticastScope(0x4)
	IPv6SiteLocalMulticastScope         = IPv6MulticastScope(0x5)
	IPv6OrganizationLocalMulticastScope = IPv6MulticastScope(0x8)
	IPv6GlobalMulticastScope            = IPv6MulticastScope(0xE)
	IPv6ReservedFMulticastScope         = IPv6MulticastScope(0xF)
)

// V6MulticastScope returns the scope of a multicast address.
func V6MulticastScope(addr tcpip.Address) IPv6MulticastScope {
	return IPv6MulticastScope(addr[ipv6MulticastAddressScopeByteIdx] & ipv6MulticastAddressScopeMask)
}
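// The helpers above chain together when deriving addresses for a NIC. The
// following standalone sketch (assuming the import paths shown and an
// arbitrary example MAC) derives the link-local address for that MAC and then
// the solicited-node multicast address NDP would listen on for it.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
)

func main() {
	mac := tcpip.LinkAddress("\x52\x54\x00\x12\x34\x56")

	// fe80:: prefix + modified EUI-64 of the MAC (universal/local bit flipped).
	lladdr := header.LinkLocalAddr(mac)

	// ff02::1:ff00:0/104 prefix + low 24 bits of the unicast address.
	snm := header.SolicitedNodeAddr(lladdr)

	fmt.Println(header.IsV6LinkLocalUnicastAddress(lladdr))                         // true
	fmt.Println(header.IsSolicitedNodeAddr(snm))                                    // true
	fmt.Println(header.V6MulticastScope(snm) == header.IPv6LinkLocalMulticastScope) // true
}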
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package transport

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/waiter"
)

// queue is a buffer queue.
//
// +stateify savable
type queue struct {
	queueRefs

	ReaderQueue *waiter.Queue
	WriterQueue *waiter.Queue

	mu       sync.Mutex `state:"nosave"`
	closed   bool
	unread   bool
	used     int64
	limit    int64
	dataList messageList
}

// Close closes q for reading and writing. It is immediately not writable and
// will become unreadable when no more data is pending.
//
// Both the read and write queues must be notified after closing:
// q.ReaderQueue.Notify(waiter.ReadableEvents)
// q.WriterQueue.Notify(waiter.WritableEvents)
func (q *queue) Close() {
	q.mu.Lock()
	q.closed = true
	q.mu.Unlock()
}

// Reset empties the queue and releases all of the entries.
//
// Both the read and write queues must be notified after resetting:
// q.ReaderQueue.Notify(waiter.ReadableEvents)
// q.WriterQueue.Notify(waiter.WritableEvents)
func (q *queue) Reset(ctx context.Context) {
	q.mu.Lock()
	for cur := q.dataList.Front(); cur != nil; cur = cur.Next() {
		cur.Release(ctx)
	}
	q.dataList.Reset()
	q.used = 0
	q.mu.Unlock()
}

// DecRef implements RefCounter.DecRef.
func (q *queue) DecRef(ctx context.Context) {
	q.queueRefs.DecRef(func() {
		// We don't need to notify after resetting because no one cares about
		// this queue after all references have been dropped.
		q.Reset(ctx)
	})
}

// IsReadable determines if q is currently readable.
func (q *queue) IsReadable() bool {
	q.mu.Lock()
	defer q.mu.Unlock()

	return q.closed || q.dataList.Front() != nil
}

// bufWritable returns true if there is space for writing.
//
// N.B. Linux only considers a unix socket "writable" if >75% of the buffer is
// free.
//
// See net/unix/af_unix.c:unix_writeable.
func (q *queue) bufWritable() bool { return 4*q.used < q.limit } // IsWritable determines if q is currently writable. func (q *queue) IsWritable() bool { q.mu.Lock() defer q.mu.Unlock() return q.closed || q.bufWritable() } // Enqueue adds an entry to the data queue if room is available. // // If discardEmpty is true and there are zero bytes of data, the packet is // dropped. // // If truncate is true, Enqueue may truncate the message before enqueuing it. // Otherwise, the entire message must fit. If l is less than the size of data, // err indicates why. // // If notify is true, ReaderQueue.Notify must be called: // q.ReaderQueue.Notify(waiter.ReadableEvents) func (q *queue) Enqueue(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress, discardEmpty bool, truncate bool) (l int64, notify bool, err *syserr.Error) { q.mu.Lock() if q.closed { q.mu.Unlock() return 0, false, syserr.ErrClosedForSend } for _, d := range data { l += int64(len(d)) } if discardEmpty && l == 0 { q.mu.Unlock() c.Release(ctx) return 0, false, nil } free := q.limit - q.used if l > free && truncate { if free == 0 { // Message can't fit right now. q.mu.Unlock() return 0, false, syserr.ErrWouldBlock } l = free err = syserr.ErrWouldBlock } if l > q.limit { // Message is too big to ever fit. q.mu.Unlock() return 0, false, syserr.ErrMessageTooLong } if l > free { // Message can't fit right now, and could not be truncated. q.mu.Unlock() return 0, false, syserr.ErrWouldBlock } // Aggregate l bytes of data. This will truncate the data if l is less than // the total bytes held in data. v := make([]byte, l) for i, b := 0, v; i < len(data) && len(b) > 0; i++ { n := copy(b, data[i]) b = b[n:] } notify = q.dataList.Front() == nil q.used += l q.dataList.PushBack(&message{ Data: buffer.View(v), Control: c, Address: from, }) q.mu.Unlock() return l, notify, err } // Dequeue removes the first entry in the data queue, if one exists. // // If notify is true, WriterQueue.Notify must be called: // q.WriterQueue.Notify(waiter.WritableEvents) func (q *queue) Dequeue() (e *message, notify bool, err *syserr.Error) { q.mu.Lock() if q.dataList.Front() == nil { err := syserr.ErrWouldBlock if q.closed { err = syserr.ErrClosedForReceive if q.unread { err = syserr.ErrConnectionReset } } q.mu.Unlock() return nil, false, err } notify = !q.bufWritable() e = q.dataList.Front() q.dataList.Remove(e) q.used -= e.Length() notify = notify && q.bufWritable() q.mu.Unlock() return e, notify, nil } // Peek returns the first entry in the data queue, if one exists. func (q *queue) Peek() (*message, *syserr.Error) { q.mu.Lock() defer q.mu.Unlock() if q.dataList.Front() == nil { err := syserr.ErrWouldBlock if q.closed { if err = syserr.ErrClosedForReceive; q.unread { err = syserr.ErrConnectionReset } } return nil, err } return q.dataList.Front().Peek(), nil } // QueuedSize returns the number of bytes currently in the queue, that is, the // number of readable bytes. func (q *queue) QueuedSize() int64 { q.mu.Lock() defer q.mu.Unlock() return q.used } // MaxQueueSize returns the maximum number of bytes storable in the queue. func (q *queue) MaxQueueSize() int64 { q.mu.Lock() defer q.mu.Unlock() return q.limit } // SetMaxQueueSize sets the maximum number of bytes storable in the queue. func (q *queue) SetMaxQueueSize(v int64) { q.mu.Lock() defer q.mu.Unlock() q.limit = v } // CloseUnread sets a flag to indicate that the peer was closed (not shut down) // with unread data, so a read on this queue will return an ECONNRESET error.
func (q *queue) CloseUnread() { q.mu.Lock() defer q.mu.Unlock() q.unread = true }
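// Illustrative sketch (not part of the original source): bufWritable above
// mirrors Linux's unix_writable check, which reports a unix socket writable
// only while more than 75% of the buffer is free, i.e. used < limit/4,
// expressed without division as 4*used < limit. With hypothetical numbers:
//
//	package main
//
//	import "fmt"
//
//	func bufWritable(used, limit int64) bool { return 4*used < limit }
//
//	func main() {
//		const limit = 16 * 1024 // 16 KiB queue limit
//		for _, used := range []int64{0, 4095, 4096, 12288} {
//			fmt.Printf("used=%5d writable=%t\n", used, bufWritable(used, limit))
//		}
//	}
//
// Only the first two report writable: at used == 4096 (exactly 25% of the
// buffer), 4*used == limit and the queue stops being considered writable.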
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // limitations under the License. // Package shm implements sysv shared memory segments. // // Known missing features: // // - SHM_LOCK/SHM_UNLOCK are no-ops. The sentry currently doesn't implement // memory locking in general. // // - SHM_HUGETLB and related flags for shmget(2) are ignored. There's no easy // way to implement hugetlb support on a per-map basis, and it has no impact // on correctness. // // - SHM_NORESERVE for shmget(2) is ignored; the sentry doesn't implement swap // so it's meaningless to reserve space for swap. // // - No per-process segment size enforcement. This feature probably isn't used // much anyway, since Linux sets the per-process limits to the system-wide // limits by default. // // Lock ordering: mm.mappingMu -> shm registry lock -> shm lock package shm import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) // Key represents a shm segment key. Analogous to a file name. type Key int32 // ID represents the opaque handle for a shm segment. Analogous to an fd. type ID int32 // Registry tracks all shared memory segments in an IPC namespace. The registry // provides the mechanisms for creating and finding segments, and reporting // global shm parameters. // // +stateify savable type Registry struct { // userNS owns the IPC namespace this registry belongs to. Immutable. userNS *auth.UserNamespace // mu protects all fields below. mu sync.Mutex `state:"nosave"` // shms maps segment ids to segments. // // shms holds all referenced segments, which are removed on the last // DecRef. Thus, it cannot itself hold a reference on the Shm. // // Since removal only occurs after the last (unlocked) DecRef, there // exists a short window during which a Shm still exists in shms, but is // unreferenced. Users must use TryIncRef to determine if the Shm is // still valid. shms map[ID]*Shm // keysToShms maps segment keys to segments. // // Shms in keysToShms are guaranteed to be referenced, as they are // removed by dissociateKey before the last DecRef. keysToShms map[Key]*Shm // Sum of the sizes of all existing segments rounded up to page size, in // units of page size. totalPages uint64 // ID assigned to the last created segment. Used to quickly find the next // unused ID. lastIDUsed ID } // NewRegistry creates a new shm registry. func NewRegistry(userNS *auth.UserNamespace) *Registry { return &Registry{ userNS: userNS, shms: make(map[ID]*Shm), keysToShms: make(map[Key]*Shm), } } // FindByID looks up a segment given an ID. // // FindByID returns a reference on Shm. func (r *Registry) FindByID(id ID) *Shm { r.mu.Lock() defer r.mu.Unlock() s := r.shms[id] // Take a reference on s. If TryIncRef fails, s has reached the last // DecRef, but hasn't quite been removed from r.shms yet. if s != nil && s.TryIncRef() { return s } return nil } // dissociateKey removes the association between a segment and its key, // preventing it from being discovered in the registry. This doesn't necessarily // mean the segment is about to be destroyed.
This is analogous to unlinking a // file; the segment can still be used by a process already referencing it, but // cannot be discovered by a new process. func (r *Registry) dissociateKey(s *Shm) { r.mu.Lock() defer r.mu.Unlock() s.mu.Lock() defer s.mu.Unlock() if s.key != linux.IPC_PRIVATE { delete(r.keysToShms, s.key) s.key = linux.IPC_PRIVATE } } // FindOrCreate looks up or creates a segment in the registry. It's functionally // analogous to open(2). // // FindOrCreate returns a reference on Shm. func (r *Registry) FindOrCreate(ctx context.Context, pid int32, key Key, size uint64, mode linux.FileMode, private, create, exclusive bool) (*Shm, error) { if (create || private) && (size < linux.SHMMIN || size > linux.SHMMAX) { // "A new segment was to be created and size is less than SHMMIN or // greater than SHMMAX." - man shmget(2) // // Note that 'private' always implies the creation of a new segment // whether IPC_CREAT is specified or not. return nil, linuxerr.EINVAL } r.mu.Lock() defer r.mu.Unlock() if len(r.shms) >= linux.SHMMNI { // "All possible shared memory IDs have been taken (SHMMNI) ..." // - man shmget(2) return nil, syserror.ENOSPC } if !private { // Look up an existing segment. if shm := r.keysToShms[key]; shm != nil { shm.mu.Lock() defer shm.mu.Unlock() // Check that caller can access the segment. if !shm.checkPermissions(ctx, fs.PermsFromMode(mode)) { // "The user does not have permission to access the shared // memory segment, and does not have the CAP_IPC_OWNER // capability in the user namespace that governs its IPC // namespace." - man shmget(2) return nil, linuxerr.EACCES } if size > shm.size { // "A segment for the given key exists, but size is greater than // the size of that segment." - man shmget(2) return nil, linuxerr.EINVAL } if create && exclusive { // "IPC_CREAT and IPC_EXCL were specified in shmflg, but a // shared memory segment already exists for key." // - man shmget(2) return nil, linuxerr.EEXIST } shm.IncRef() return shm, nil } if !create { // "No segment exists for the given key, and IPC_CREAT was not // specified." - man shmget(2) return nil, syserror.ENOENT } } var sizeAligned uint64 if val, ok := hostarch.Addr(size).RoundUp(); ok { sizeAligned = uint64(val) } else { return nil, linuxerr.EINVAL } if numPages := sizeAligned / hostarch.PageSize; r.totalPages+numPages > linux.SHMALL { // "... allocating a segment of the requested size would cause the // system to exceed the system-wide limit on shared memory (SHMALL)." // - man shmget(2) return nil, syserror.ENOSPC } // Need to create a new segment. creator := fs.FileOwnerFromContext(ctx) perms := fs.FilePermsFromMode(mode) s, err := r.newShm(ctx, pid, key, creator, perms, size) if err != nil { return nil, err } // The initial reference is held by s itself. Take another to return to // the caller. s.IncRef() return s, nil } // newShm creates a new segment in the registry. // // Precondition: Caller must hold r.mu. 
func (r *Registry) newShm(ctx context.Context, pid int32, key Key, creator fs.FileOwner, perms fs.FilePermissions, size uint64) (*Shm, error) { mfp := pgalloc.MemoryFileProviderFromContext(ctx) if mfp == nil { panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider)) } effectiveSize := uint64(hostarch.Addr(size).MustRoundUp()) fr, err := mfp.MemoryFile().Allocate(effectiveSize, usage.Anonymous) if err != nil { return nil, err } shm := &Shm{ mfp: mfp, registry: r, creator: creator, size: size, effectiveSize: effectiveSize, fr: fr, key: key, perms: perms, owner: creator, creatorPID: pid, changeTime: ktime.NowFromContext(ctx), } shm.InitRefs() // Find the next available ID. for id := r.lastIDUsed + 1; id != r.lastIDUsed; id++ { // Handle wrap around. if id < 0 { id = 0 continue } if r.shms[id] == nil { r.lastIDUsed = id shm.ID = id r.shms[id] = shm r.keysToShms[key] = shm r.totalPages += effectiveSize / hostarch.PageSize return shm, nil } } log.Warningf("Shm ids exhausted, they may be leaking") return nil, syserror.ENOSPC } // IPCInfo reports global parameters for sysv shared memory segments on this // system. See shmctl(IPC_INFO). func (r *Registry) IPCInfo() *linux.ShmParams { return &linux.ShmParams{ ShmMax: linux.SHMMAX, ShmMin: linux.SHMMIN, ShmMni: linux.SHMMNI, ShmSeg: linux.SHMSEG, ShmAll: linux.SHMALL, } } // ShmInfo reports linux-specific global parameters for sysv shared memory // segments on this system. See shmctl(SHM_INFO). func (r *Registry) ShmInfo() *linux.ShmInfo { r.mu.Lock() defer r.mu.Unlock() return &linux.ShmInfo{ UsedIDs: int32(r.lastIDUsed), ShmTot: r.totalPages, ShmRss: r.totalPages, // We could probably get a better estimate from memory accounting. ShmSwp: 0, // No reclaim at the moment. } } // remove deletes a segment from this registry, deaccounting the memory used by // the segment. // // Precondition: Must follow a call to r.dissociateKey(s). func (r *Registry) remove(s *Shm) { r.mu.Lock() defer r.mu.Unlock() s.mu.Lock() defer s.mu.Unlock() if s.key != linux.IPC_PRIVATE { panic(fmt.Sprintf("Attempted to remove %s from the registry whose key is still associated", s.debugLocked())) } delete(r.shms, s.ID) r.totalPages -= s.effectiveSize / hostarch.PageSize } // Release drops the self-reference of each active shm segment in the registry. // It is called when the kernel.IPCNamespace containing r is being destroyed. func (r *Registry) Release(ctx context.Context) { // Because Shm.DecRef() may acquire the same locks, collect the segments to // release first. Note that this should not race with any updates to r, since // the IPC namespace containing it has no more references. toRelease := make([]*Shm, 0) r.mu.Lock() for _, s := range r.keysToShms { s.mu.Lock() if !s.pendingDestruction { toRelease = append(toRelease, s) } s.mu.Unlock() } r.mu.Unlock() for _, s := range toRelease { r.dissociateKey(s) s.DecRef(ctx) } } // Shm represents a single shared memory segment. // // Shm segments are backed directly by an allocation from platform memory. // Segments are always mapped as a whole, greatly simplifying how mappings are // tracked. However note that mremap and munmap calls may cause the vma for a // segment to become fragmented, which requires special care when unmapping a // segment. See mm/shm.go. // // Segments persist until they are explicitly marked for destruction via // MarkDestroyed(). // // Shm implements memmap.Mappable and memmap.MappingIdentity.
// // +stateify savable type Shm struct { // ShmRefs tracks the number of references to this segment. // // A segment holds a reference to itself until it is marked for // destruction. // // In addition to direct users, the MemoryManager will hold references // via MappingIdentity. ShmRefs mfp pgalloc.MemoryFileProvider // registry points to the shm registry containing this segment. Immutable. registry *Registry // ID is the kernel identifier for this segment. Immutable. ID ID // creator is the user that created the segment. Immutable. creator fs.FileOwner // size is the requested size of the segment at creation, in // bytes. Immutable. size uint64 // effectiveSize of the segment, rounded up to the next page // boundary. Immutable. // // Invariant: effectiveSize must be a multiple of hostarch.PageSize. effectiveSize uint64 // fr is the offset into mfp.MemoryFile() that backs the contents of this // segment. Immutable. fr memmap.FileRange // mu protects all fields below. mu sync.Mutex `state:"nosave"` // key is the public identifier for this segment. key Key // perms is the access permissions for the segment. perms fs.FilePermissions // owner of this segment. owner fs.FileOwner // attachTime is updated on every successful shmat. attachTime ktime.Time // detachTime is updated on every successful shmdt. detachTime ktime.Time // changeTime is updated on every successful change to the segment via // shmctl(IPC_SET). changeTime ktime.Time // creatorPID is the PID of the process that created the segment. creatorPID int32 // lastAttachDetachPID is the pid of the process that issued the last shmat // or shmdt syscall. lastAttachDetachPID int32 // pendingDestruction indicates the segment was marked as destroyed through // shmctl(IPC_RMID). When marked as destroyed, the segment will not be found // in the registry and can no longer be attached. When the last user // detaches from the segment, it is destroyed. pendingDestruction bool } // Precondition: Caller must hold s.mu. func (s *Shm) debugLocked() string { return fmt.Sprintf("Shm{id: %d, key: %d, size: %d bytes, refs: %d, destroyed: %v}", s.ID, s.key, s.size, s.ReadRefs(), s.pendingDestruction) } // MappedName implements memmap.MappingIdentity.MappedName. func (s *Shm) MappedName(ctx context.Context) string { s.mu.Lock() defer s.mu.Unlock() return fmt.Sprintf("SYSV%08d", s.key) } // DeviceID implements memmap.MappingIdentity.DeviceID. func (s *Shm) DeviceID() uint64 { return shmDevice.DeviceID() } // InodeID implements memmap.MappingIdentity.InodeID. func (s *Shm) InodeID() uint64 { // "shmid gets reported as "inode#" in /proc/pid/maps. proc-ps tools use // this. Changing this will break them." -- Linux, ipc/shm.c:newseg() return uint64(s.ID) } // DecRef drops a reference on s. // // Precondition: Caller must not hold s.mu. func (s *Shm) DecRef(ctx context.Context) { s.ShmRefs.DecRef(func() { s.mfp.MemoryFile().DecRef(s.fr) s.registry.remove(s) }) } // Msync implements memmap.MappingIdentity.Msync. Msync is a no-op for shm // segments. func (s *Shm) Msync(context.Context, memmap.MappableRange) error { return nil } // AddMapping implements memmap.Mappable.AddMapping. func (s *Shm) AddMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) error { s.mu.Lock() defer s.mu.Unlock() s.attachTime = ktime.NowFromContext(ctx) if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { s.lastAttachDetachPID = pid } else { // AddMapping is called during a syscall, so ctx should always be a task // context.
log.Warningf("Adding mapping to %s but couldn't get the current pid; not updating the last attach pid", s.debugLocked()) } return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (s *Shm) RemoveMapping(ctx context.Context, _ memmap.MappingSpace, _ hostarch.AddrRange, _ uint64, _ bool) { s.mu.Lock() defer s.mu.Unlock() // RemoveMapping may be called during task exit, when ctx // is context.Background. Gracefully handle missing clocks. Failing to // update the detach time in these cases is ok, since no one can observe the // omission. if clock := ktime.RealtimeClockFromContext(ctx); clock != nil { s.detachTime = clock.Now() } // If called from a non-task context we also won't have a threadgroup // id. Silently skip updating the lastAttachDetachPid in that case. if pid, ok := context.ThreadGroupIDFromContext(ctx); ok { s.lastAttachDetachPID = pid } else { log.Debugf("Couldn't obtain pid when removing mapping to %s, not updating the last detach pid.", s.debugLocked()) } } // CopyMapping implements memmap.Mappable.CopyMapping. func (*Shm) CopyMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, hostarch.AddrRange, uint64, bool) error { return nil } // Translate implements memmap.Mappable.Translate. func (s *Shm) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { var err error if required.End > s.fr.Length() { err = &memmap.BusError{linuxerr.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, s.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ { Source: source, File: s.mfp.MemoryFile(), Offset: s.fr.Start + source.Start, Perms: hostarch.AnyAccess, }, }, err } return nil, err } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (s *Shm) InvalidateUnsavable(ctx context.Context) error { return nil } // AttachOpts describes various flags passed to shmat(2). type AttachOpts struct { Execute bool Readonly bool Remap bool } // ConfigureAttach creates an mmap configuration for the segment with the // requested attach options. // // Postconditions: The returned MMapOpts are valid only as long as a reference // continues to be held on s. func (s *Shm) ConfigureAttach(ctx context.Context, addr hostarch.Addr, opts AttachOpts) (memmap.MMapOpts, error) { s.mu.Lock() defer s.mu.Unlock() if s.pendingDestruction && s.ReadRefs() == 0 { return memmap.MMapOpts{}, syserror.EIDRM } if !s.checkPermissions(ctx, fs.PermMask{ Read: true, Write: !opts.Readonly, Execute: opts.Execute, }) { // "The calling process does not have the required permissions for the // requested attach type, and does not have the CAP_IPC_OWNER capability // in the user namespace that governs its IPC namespace." - man shmat(2) return memmap.MMapOpts{}, linuxerr.EACCES } return memmap.MMapOpts{ Length: s.size, Offset: 0, Addr: addr, Fixed: opts.Remap, Perms: hostarch.AccessType{ Read: true, Write: !opts.Readonly, Execute: opts.Execute, }, MaxPerms: hostarch.AnyAccess, Mappable: s, MappingIdentity: s, }, nil } // EffectiveSize returns the size of the underlying shared memory segment. This // may be larger than the requested size at creation, due to rounding to page // boundaries. func (s *Shm) EffectiveSize() uint64 { return s.effectiveSize } // IPCStat returns information about a shm. See shmctl(IPC_STAT). func (s *Shm) IPCStat(ctx context.Context) (*linux.ShmidDS, error) { s.mu.Lock() defer s.mu.Unlock() // "The caller must have read permission on the shared memory segment." 
// - man shmctl(2) if !s.checkPermissions(ctx, fs.PermMask{Read: true}) { // "IPC_STAT or SHM_STAT is requested and shm_perm.mode does not allow // read access for shmid, and the calling process does not have the // CAP_IPC_OWNER capability in the user namespace that governs its IPC // namespace." - man shmctl(2) return nil, linuxerr.EACCES } var mode uint16 if s.pendingDestruction { mode |= linux.SHM_DEST } creds := auth.CredentialsFromContext(ctx) // Use the reference count as a rudimentary count of the number of // attaches. We exclude: // // 1. The reference the caller holds. // 2. The self-reference held by s prior to destruction. // // Note that this may still overcount by including transient references // used in concurrent calls. nattach := uint64(s.ReadRefs()) - 1 if !s.pendingDestruction { nattach-- } ds := &linux.ShmidDS{ ShmPerm: linux.IPCPerm{ Key: uint32(s.key), UID: uint32(creds.UserNamespace.MapFromKUID(s.owner.UID)), GID: uint32(creds.UserNamespace.MapFromKGID(s.owner.GID)), CUID: uint32(creds.UserNamespace.MapFromKUID(s.creator.UID)), CGID: uint32(creds.UserNamespace.MapFromKGID(s.creator.GID)), Mode: mode | uint16(s.perms.LinuxMode()), Seq: 0, // IPC sequences not supported. }, ShmSegsz: s.size, ShmAtime: s.attachTime.TimeT(), ShmDtime: s.detachTime.TimeT(), ShmCtime: s.changeTime.TimeT(), ShmCpid: s.creatorPID, ShmLpid: s.lastAttachDetachPID, ShmNattach: nattach, } return ds, nil } // Set modifies attributes for a segment. See shmctl(IPC_SET). func (s *Shm) Set(ctx context.Context, ds *linux.ShmidDS) error { s.mu.Lock() defer s.mu.Unlock() if !s.checkOwnership(ctx) { return linuxerr.EPERM } creds := auth.CredentialsFromContext(ctx) uid := creds.UserNamespace.MapToKUID(auth.UID(ds.ShmPerm.UID)) gid := creds.UserNamespace.MapToKGID(auth.GID(ds.ShmPerm.GID)) if !uid.Ok() || !gid.Ok() { return linuxerr.EINVAL } // User may only modify the lower 9 bits of the mode. All the other bits are // always 0 for the underlying inode. mode := linux.FileMode(ds.ShmPerm.Mode & 0x1ff) s.perms = fs.FilePermsFromMode(mode) s.owner.UID = uid s.owner.GID = gid s.changeTime = ktime.NowFromContext(ctx) return nil } // MarkDestroyed marks a segment for destruction. The segment is actually // destroyed once it has no references. MarkDestroyed may be called multiple // times, and is safe to call after a segment has already been destroyed. See // shmctl(IPC_RMID). func (s *Shm) MarkDestroyed(ctx context.Context) { s.registry.dissociateKey(s) s.mu.Lock() if s.pendingDestruction { s.mu.Unlock() return } s.pendingDestruction = true s.mu.Unlock() // Drop the self-reference so destruction occurs when all // external references are gone. // // N.B. This cannot be the final DecRef, as the caller also // holds a reference. s.DecRef(ctx) return } // checkOwnership verifies whether a segment may be accessed by ctx as an // owner. See ipc/util.c:ipcctl_pre_down_nolock() in Linux. // // Precondition: Caller must hold s.mu. func (s *Shm) checkOwnership(ctx context.Context) bool { creds := auth.CredentialsFromContext(ctx) if s.owner.UID == creds.EffectiveKUID || s.creator.UID == creds.EffectiveKUID { return true } // Tasks with CAP_SYS_ADMIN may bypass ownership checks. Strangely, Linux // doesn't use CAP_IPC_OWNER for this despite CAP_IPC_OWNER being documented // for use to "override IPC ownership checks". return creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, s.registry.userNS) } // checkPermissions verifies whether a segment is accessible by ctx for access // described by req. See ipc/util.c:ipcperms() in Linux. 
// // Precondition: Caller must hold s.mu. func (s *Shm) checkPermissions(ctx context.Context, req fs.PermMask) bool { creds := auth.CredentialsFromContext(ctx) p := s.perms.Other if s.owner.UID == creds.EffectiveKUID { p = s.perms.User } else if creds.InGroup(s.owner.GID) { p = s.perms.Group } if p.SupersetOf(req) { return true } // Tasks with CAP_IPC_OWNER may bypass permission checks. return creds.HasCapabilityIn(linux.CAP_IPC_OWNER, s.registry.userNS) }
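// Illustrative sketch (not part of the original source): checkPermissions
// above performs the classic Unix owner/group/other selection: exactly one
// permission class applies, chosen by identity, even if another class would
// grant more access. A minimal standalone model (PermMask and the helpers
// below are stand-ins for the real fs types, not the gVisor API):
//
//	package main
//
//	import "fmt"
//
//	type PermMask struct{ Read, Write, Execute bool }
//
//	// SupersetOf reports whether p grants every permission requested in req.
//	func (p PermMask) SupersetOf(req PermMask) bool {
//		return (p.Read || !req.Read) && (p.Write || !req.Write) && (p.Execute || !req.Execute)
//	}
//
//	// check selects the user, group, or other class by identity, then tests req.
//	func check(uid, gid, ownerUID, ownerGID uint32, user, group, other, req PermMask) bool {
//		p := other
//		if uid == ownerUID {
//			p = user
//		} else if gid == ownerGID {
//			p = group
//		}
//		return p.SupersetOf(req)
//	}
//
//	func main() {
//		user, group, other := PermMask{Read: true, Write: true}, PermMask{Read: true}, PermMask{}
//		fmt.Println(check(1000, 1000, 1000, 1000, user, group, other, PermMask{Write: true})) // true: owner may write
//		fmt.Println(check(1001, 1000, 1000, 1000, user, group, other, PermMask{Write: true})) // false: group is read-only
//	}
//
// The CAP_IPC_OWNER escape hatch in the real code is deliberately omitted
// here.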
// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ipv6 import ( "fmt" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // icmpv6DestinationUnreachableSockError is a general ICMPv6 Destination // Unreachable error. // // +stateify savable type icmpv6DestinationUnreachableSockError struct{} // Origin implements tcpip.SockErrorCause. func (*icmpv6DestinationUnreachableSockError) Origin() tcpip.SockErrOrigin { return tcpip.SockExtErrorOriginICMP6 } // Type implements tcpip.SockErrorCause. func (*icmpv6DestinationUnreachableSockError) Type() uint8 { return uint8(header.ICMPv6DstUnreachable) } // Info implements tcpip.SockErrorCause.
func (*icmpv6DestinationUnreachableSockError) Info() uint32 { return 0 } var _ stack.TransportError = (*icmpv6DestinationNetworkUnreachableSockError)(nil) // icmpv6DestinationNetworkUnreachableSockError is an ICMPv6 Destination Network // Unreachable error. // // It indicates that the destination network is unreachable. // // +stateify savable type icmpv6DestinationNetworkUnreachableSockError struct { icmpv6DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv6DestinationNetworkUnreachableSockError) Code() uint8 { return uint8(header.ICMPv6NetworkUnreachable) } // Kind implements stack.TransportError. func (*icmpv6DestinationNetworkUnreachableSockError) Kind() stack.TransportErrorKind { return stack.DestinationNetworkUnreachableTransportError } var _ stack.TransportError = (*icmpv6DestinationPortUnreachableSockError)(nil) // icmpv6DestinationPortUnreachableSockError is an ICMPv6 Destination Port // Unreachable error. // // It indicates that a packet reached the destination host, but the transport // protocol was not active on the destination port. // // +stateify savable type icmpv6DestinationPortUnreachableSockError struct { icmpv6DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv6DestinationPortUnreachableSockError) Code() uint8 { return uint8(header.ICMPv6PortUnreachable) } // Kind implements stack.TransportError. func (*icmpv6DestinationPortUnreachableSockError) Kind() stack.TransportErrorKind { return stack.DestinationPortUnreachableTransportError } var _ stack.TransportError = (*icmpv6DestinationAddressUnreachableSockError)(nil) // icmpv6DestinationAddressUnreachableSockError is an ICMPv6 Destination Address // Unreachable error. // // It indicates that a packet was not able to reach the destination. // // +stateify savable type icmpv6DestinationAddressUnreachableSockError struct { icmpv6DestinationUnreachableSockError } // Code implements tcpip.SockErrorCause. func (*icmpv6DestinationAddressUnreachableSockError) Code() uint8 { return uint8(header.ICMPv6AddressUnreachable) } // Kind implements stack.TransportError. func (*icmpv6DestinationAddressUnreachableSockError) Kind() stack.TransportErrorKind { return stack.DestinationHostUnreachableTransportError } var _ stack.TransportError = (*icmpv6PacketTooBigSockError)(nil) // icmpv6PacketTooBigSockError is an ICMPv6 Packet Too Big error. // // It indicates that a link exists on the path to the destination with an MTU // that is too small to carry the packet. // // +stateify savable type icmpv6PacketTooBigSockError struct { mtu uint32 } // Origin implements tcpip.SockErrorCause. func (*icmpv6PacketTooBigSockError) Origin() tcpip.SockErrOrigin { return tcpip.SockExtErrorOriginICMP6 } // Type implements tcpip.SockErrorCause. func (*icmpv6PacketTooBigSockError) Type() uint8 { return uint8(header.ICMPv6PacketTooBig) } // Code implements tcpip.SockErrorCause. func (*icmpv6PacketTooBigSockError) Code() uint8 { return uint8(header.ICMPv6UnusedCode) } // Info implements tcpip.SockErrorCause. func (e *icmpv6PacketTooBigSockError) Info() uint32 { return e.mtu } // Kind implements stack.TransportError. 
func (*icmpv6PacketTooBigSockError) Kind() stack.TransportErrorKind { return stack.PacketTooBigTransportError } func (e *endpoint) checkLocalAddress(addr tcpip.Address) bool { if e.nic.Spoofing() { return true } if addressEndpoint := e.AcquireAssignedAddress(addr, false, stack.NeverPrimaryEndpoint); addressEndpoint != nil { addressEndpoint.DecRef() return true } return false } // handleControl handles the case when an ICMP packet contains the headers of // the original packet that caused the ICMP one to be sent. This information is // used to find out which transport endpoint must be notified about the ICMP // packet. func (e *endpoint) handleControl(transErr stack.TransportError, pkt *stack.PacketBuffer) { h, ok := pkt.Data().PullUp(header.IPv6MinimumSize) if !ok { return } hdr := header.IPv6(h) // We don't use IsValid() here because ICMP only requires that up to // 1280 bytes of the original packet be included. So it's likely that it // is truncated, which would cause IsValid to return false. // // Drop packet if it doesn't have the basic IPv6 header or if the // original source address doesn't match an address we own. srcAddr := hdr.SourceAddress() if !e.checkLocalAddress(srcAddr) { return } // Keep needed information before trimming header. p := hdr.TransportProtocol() dstAddr := hdr.DestinationAddress() // Skip the IP header, then handle the fragmentation header if there // is one. pkt.Data().DeleteFront(header.IPv6MinimumSize) if p == header.IPv6FragmentHeader { f, ok := pkt.Data().PullUp(header.IPv6FragmentHeaderSize) if !ok { return } fragHdr := header.IPv6Fragment(f) if !fragHdr.IsValid() || fragHdr.FragmentOffset() != 0 { // We can't handle fragments that aren't at offset 0 // because they don't have the transport headers. return } p = fragHdr.TransportProtocol() // Skip fragmentation header and find out the actual protocol // number. pkt.Data().DeleteFront(header.IPv6FragmentHeaderSize) } e.dispatcher.DeliverTransportError(srcAddr, dstAddr, ProtocolNumber, p, transErr, pkt) } // getLinkAddrOption searches NDP options for a given link address option using // the provided getAddr function as a filter. Returns the link address if // found; otherwise, returns the zero link address value. Also returns true if // the options are valid as per the wire format, false otherwise. func getLinkAddrOption(it header.NDPOptionIterator, getAddr func(header.NDPOption) tcpip.LinkAddress) (tcpip.LinkAddress, bool) { var linkAddr tcpip.LinkAddress for { opt, done, err := it.Next() if err != nil { return "", false } if done { break } if addr := getAddr(opt); len(addr) != 0 { // No RFCs define what to do when an NDP message has multiple Link-Layer // Address options. Since no interface can have multiple link-layer // addresses, we consider such messages invalid. if len(linkAddr) != 0 { return "", false } linkAddr = addr } } return linkAddr, true } // getSourceLinkAddr searches NDP options for the source link address option. // Returns the link address if found; otherwise, returns the zero link address // value. Also returns true if the options are valid as per the wire format, // false otherwise. func getSourceLinkAddr(it header.NDPOptionIterator) (tcpip.LinkAddress, bool) { return getLinkAddrOption(it, func(opt header.NDPOption) tcpip.LinkAddress { if src, ok := opt.(header.NDPSourceLinkLayerAddressOption); ok { return src.EthernetAddress() } return "" }) } // getTargetLinkAddr searches NDP options for the target link address option. 
// Returns the link address if found; otherwise, returns the zero link address // value. Also returns true if the options are valid as per the wire format, // false otherwise. func getTargetLinkAddr(it header.NDPOptionIterator) (tcpip.LinkAddress, bool) { return getLinkAddrOption(it, func(opt header.NDPOption) tcpip.LinkAddress { if dst, ok := opt.(header.NDPTargetLinkLayerAddressOption); ok { return dst.EthernetAddress() } return "" }) } func isMLDValid(pkt *stack.PacketBuffer, iph header.IPv6, routerAlert *header.IPv6RouterAlertOption) bool { // As per RFC 2710 section 3: // All MLD messages described in this document are sent with a link-local // IPv6 Source Address, an IPv6 Hop Limit of 1, and an IPv6 Router Alert // option in a Hop-by-Hop Options header. if routerAlert == nil || routerAlert.Value != header.IPv6RouterAlertMLD { return false } if pkt.Data().Size() < header.ICMPv6HeaderSize+header.MLDMinimumSize { return false } if iph.HopLimit() != header.MLDHopLimit { return false } if !header.IsV6LinkLocalUnicastAddress(iph.SourceAddress()) { return false } return true } func (e *endpoint) handleICMP(pkt *stack.PacketBuffer, hasFragmentHeader bool, routerAlert *header.IPv6RouterAlertOption) { sent := e.stats.icmp.packetsSent received := e.stats.icmp.packetsReceived // ICMP packets don't have their TransportHeader fields set. See // icmp/protocol.go:protocol.Parse for a full explanation. v, ok := pkt.Data().PullUp(header.ICMPv6HeaderSize) if !ok { received.invalid.Increment() return } h := header.ICMPv6(v) iph := header.IPv6(pkt.NetworkHeader().View()) srcAddr := iph.SourceAddress() dstAddr := iph.DestinationAddress() // Validate ICMPv6 checksum before processing the packet. payload := pkt.Data().AsRange().SubRange(len(h)) if got, want := h.Checksum(), header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: h, Src: srcAddr, Dst: dstAddr, PayloadCsum: payload.Checksum(), PayloadLen: payload.Size(), }); got != want { received.invalid.Increment() return } isNDPValid := func() bool { // As per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1, 7.1.2 and // 8.1, nodes MUST silently drop NDP packets where the Hop Limit field // in the IPv6 header is not set to 255, or the ICMPv6 Code field is not // set to 0. // // As per RFC 6980 section 5, nodes MUST silently drop NDP messages if the // packet includes a fragmentation header. return !hasFragmentHeader && iph.HopLimit() == header.NDPHopLimit && h.Code() == 0 } // TODO(b/112892170): Meaningfully handle all ICMP types. 
switch icmpType := h.Type(); icmpType { case header.ICMPv6PacketTooBig: received.packetTooBig.Increment() hdr, ok := pkt.Data().PullUp(header.ICMPv6PacketTooBigMinimumSize) if !ok { received.invalid.Increment() return } networkMTU, err := calculateNetworkMTU(header.ICMPv6(hdr).MTU(), header.IPv6MinimumSize) if err != nil { networkMTU = 0 } pkt.Data().DeleteFront(header.ICMPv6PacketTooBigMinimumSize) e.handleControl(&icmpv6PacketTooBigSockError{mtu: networkMTU}, pkt) case header.ICMPv6DstUnreachable: received.dstUnreachable.Increment() hdr, ok := pkt.Data().PullUp(header.ICMPv6DstUnreachableMinimumSize) if !ok { received.invalid.Increment() return } code := header.ICMPv6(hdr).Code() pkt.Data().DeleteFront(header.ICMPv6DstUnreachableMinimumSize) switch code { case header.ICMPv6NetworkUnreachable: e.handleControl(&icmpv6DestinationNetworkUnreachableSockError{}, pkt) case header.ICMPv6PortUnreachable: e.handleControl(&icmpv6DestinationPortUnreachableSockError{}, pkt) } case header.ICMPv6NeighborSolicit: received.neighborSolicit.Increment() if !isNDPValid() || pkt.Data().Size() < header.ICMPv6NeighborSolicitMinimumSize { received.invalid.Increment() return } // The remainder of payload must be only the neighbor solicitation, so // payload.AsView() always returns the solicitation. Per RFC 6980 section 5, // NDP messages cannot be fragmented. Also note that in the common case NDP // datagrams are very small and AsView() will not incur allocations. ns := header.NDPNeighborSolicit(payload.AsView()) targetAddr := ns.TargetAddress() // As per RFC 4861 section 4.3, the Target Address MUST NOT be a multicast // address. if header.IsV6MulticastAddress(targetAddr) { received.invalid.Increment() return } var it header.NDPOptionIterator { var err error it, err = ns.Options().Iter(false /* check */) if err != nil { // Options are not valid as per the wire format, silently drop the // packet. received.invalid.Increment() return } } if e.hasTentativeAddr(targetAddr) { // If the target address is tentative and the source of the packet is a // unicast (specified) address, then the source of the packet is // attempting to perform address resolution on the target. In this case, // the solicitation is silently ignored, as per RFC 4862 section 5.4.3. // // If the target address is tentative and the source of the packet is the // unspecified address (::), then we know another node is also performing // DAD for the same address (since the target address is tentative for us, // we know we are also performing DAD on it). In this case we let the // stack know so it can handle such a scenario and do nothing further with // the NS. if srcAddr == header.IPv6Any { var nonce []byte for { opt, done, err := it.Next() if err != nil { received.invalid.Increment() return } if done { break } if n, ok := opt.(header.NDPNonceOption); ok { nonce = n.Nonce() break } } // Since this is a DAD message we know the sender does not actually hold // the target address so there is no "holder". var holderLinkAddress tcpip.LinkAddress // We would get an error if the address no longer exists or the address // is no longer tentative (DAD resolved between the call to // hasTentativeAddr and this point). Both of these are valid scenarios: // 1) An address may be removed at any time. 
// 2) As per RFC 4862 section 5.4, DAD is not perfect: // "Note that the method for detecting duplicates // is not completely reliable, and it is possible that duplicate // addresses will still exist" // // TODO(gvisor.dev/issue/4046): Handle the scenario when a duplicate // address is detected for an assigned address. switch err := e.dupTentativeAddrDetected(targetAddr, holderLinkAddress, nonce); err.(type) { case nil, *tcpip.ErrBadAddress, *tcpip.ErrInvalidEndpointState: default: panic(fmt.Sprintf("unexpected error handling duplicate tentative address: %s", err)) } } // Do not handle neighbor solicitations targeted to an address that is // tentative on the NIC any further. return } // At this point we know that the target address is not tentative on the NIC // so the packet is processed as defined in RFC 4861, as per RFC 4862 // section 5.4.3. // Is the NS targeting us? if !e.checkLocalAddress(targetAddr) { return } sourceLinkAddr, ok := getSourceLinkAddr(it) if !ok { received.invalid.Increment() return } // As per RFC 4861 section 4.3, the Source Link-Layer Address Option MUST // NOT be included when the source IP address is the unspecified address. // Otherwise, on link layers that have addresses this option MUST be // included in multicast solicitations and SHOULD be included in unicast // solicitations. unspecifiedSource := srcAddr == header.IPv6Any if len(sourceLinkAddr) == 0 { if header.IsV6MulticastAddress(dstAddr) && !unspecifiedSource { received.invalid.Increment() return } } else if unspecifiedSource { received.invalid.Increment() return } else { switch err := e.nic.HandleNeighborProbe(ProtocolNumber, srcAddr, sourceLinkAddr); err.(type) { case nil: case *tcpip.ErrNotSupported: // The stack may support ICMPv6 but the NIC may not need link resolution. default: panic(fmt.Sprintf("unexpected error when informing NIC of neighbor probe message: %s", err)) } } // As per RFC 4861 section 7.1.1: // A node MUST silently discard any received Neighbor Solicitation // messages that do not satisfy all of the following validity checks: // ... // - If the IP source address is the unspecified address, the IP // destination address is a solicited-node multicast address. if unspecifiedSource && !header.IsSolicitedNodeAddr(dstAddr) { received.invalid.Increment() return } // As per RFC 4861 section 7.2.4: // // If the source of the solicitation is the unspecified address, the node // MUST [...] and multicast the advertisement to the all-nodes address. // remoteAddr := srcAddr if unspecifiedSource { remoteAddr = header.IPv6AllNodesMulticastAddress } // Even if we were able to receive a packet from some remote, we may not // have a route to it - the remote may be blocked via routing rules. We must // always consult our routing table and find a route to the remote before // sending any packet. r, err := e.protocol.stack.FindRoute(e.nic.ID(), targetAddr, remoteAddr, ProtocolNumber, false /* multicastLoop */) if err != nil { // If we cannot find a route to the destination, silently drop the packet. return } defer r.Release() // If the NS has a source link-layer option, resolve the route immediately // to avoid querying the neighbor table when the neighbor entry was updated // as probing the neighbor table for a link address will transition the // entry's state from stale to delay. // // Note, if the source link address is unspecified and this is a unicast // solicitation, we may need to perform neighbor discovery to send the // neighbor advertisement response.
This is expected as per RFC 4861 section // 7.2.4: // // Because unicast Neighbor Solicitations are not required to include a // Source Link-Layer Address, it is possible that a node sending a // solicited Neighbor Advertisement does not have a corresponding link- // layer address for its neighbor in its Neighbor Cache. In such // situations, a node will first have to use Neighbor Discovery to // determine the link-layer address of its neighbor (i.e., send out a // multicast Neighbor Solicitation). // if len(sourceLinkAddr) != 0 { r.ResolveWith(sourceLinkAddr) } optsSerializer := header.NDPOptionsSerializer{ header.NDPTargetLinkLayerAddressOption(e.nic.LinkAddress()), } neighborAdvertSize := header.ICMPv6NeighborAdvertMinimumSize + optsSerializer.Length() pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(r.MaxHeaderLength()) + neighborAdvertSize, }) pkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber packet := header.ICMPv6(pkt.TransportHeader().Push(neighborAdvertSize)) packet.SetType(header.ICMPv6NeighborAdvert) na := header.NDPNeighborAdvert(packet.MessageBody()) // As per RFC 4861 section 7.2.4: // // If the source of the solicitation is the unspecified address, the node // MUST set the Solicited flag to zero and [..]. Otherwise, the node MUST // set the Solicited flag to one and [..]. // na.SetSolicitedFlag(!unspecifiedSource) na.SetOverrideFlag(true) na.SetTargetAddress(targetAddr) na.Options().Serialize(optsSerializer) packet.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: packet, Src: r.LocalAddress(), Dst: r.RemoteAddress(), })) // RFC 4861 Neighbor Discovery for IP version 6 (IPv6) // // 7.1.2. Validation of Neighbor Advertisements // // The IP Hop Limit field has a value of 255, i.e., the packet // could not possibly have been forwarded by a router. if err := r.WritePacket(stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}, pkt); err != nil { sent.dropped.Increment() return } sent.neighborAdvert.Increment() case header.ICMPv6NeighborAdvert: received.neighborAdvert.Increment() if !isNDPValid() || pkt.Data().Size() < header.ICMPv6NeighborAdvertMinimumSize { received.invalid.Increment() return } // The remainder of payload must be only the neighbor advertisement, so // payload.AsView() always returns the advertisement. Per RFC 6980 section // 5, NDP messages cannot be fragmented. Also note that in the common case // NDP datagrams are very small and AsView() will not incur allocations. na := header.NDPNeighborAdvert(payload.AsView()) it, err := na.Options().Iter(false /* check */) if err != nil { // If we have a malformed NDP NA option, drop the packet. received.invalid.Increment() return } targetLinkAddr, ok := getTargetLinkAddr(it) if !ok { received.invalid.Increment() return } targetAddr := na.TargetAddress() e.dad.mu.Lock() e.dad.mu.dad.StopLocked(targetAddr, &stack.DADDupAddrDetected{HolderLinkAddress: targetLinkAddr}) e.dad.mu.Unlock() if e.hasTentativeAddr(targetAddr) { // We only send a nonce value in DAD messages to check for looped-back // messages so we use the empty nonce value here. var nonce []byte // We just got an NA from a node that owns an address we are performing // DAD on, implying the address is not unique. In this case we let the // stack know so it can handle such a scenario and do nothing further with // the NDP NA.
// // We would get an error if the address no longer exists or the address // is no longer tentative (DAD resolved between the call to // hasTentativeAddr and this point). Both of these are valid scenarios: // 1) An address may be removed at any time. // 2) As per RFC 4862 section 5.4, DAD is not perfect: // "Note that the method for detecting duplicates // is not completely reliable, and it is possible that duplicate // addresses will still exist" // // TODO(gvisor.dev/issue/4046): Handle the scenario when a duplicate // address is detected for an assigned address. switch err := e.dupTentativeAddrDetected(targetAddr, targetLinkAddr, nonce); err.(type) { case nil, *tcpip.ErrBadAddress, *tcpip.ErrInvalidEndpointState: return default: panic(fmt.Sprintf("unexpected error handling duplicate tentative address: %s", err)) } } // At this point we know that the target address is not tentative on the // NIC. However, the target address may still be assigned to the NIC but not // tentative (it could be permanent). Such a scenario is beyond the scope of // RFC 4862. As such, we simply ignore such a scenario for now and proceed // as normal. // // TODO(b/143147598): Handle the scenario described above. Also inform the // netstack integration that a duplicate address was detected outside of // DAD. // As per RFC 4861 section 7.1.2: // A node MUST silently discard any received Neighbor Advertisement // messages that do not satisfy all of the following validity checks: // ... // - If the IP Destination Address is a multicast address the // Solicited flag is zero. if header.IsV6MulticastAddress(dstAddr) && na.SolicitedFlag() { received.invalid.Increment() return } // If the NA message has the target link layer option, update the link // address cache with the link address for the target of the message. switch err := e.nic.HandleNeighborConfirmation(ProtocolNumber, targetAddr, targetLinkAddr, stack.ReachabilityConfirmationFlags{ Solicited: na.SolicitedFlag(), Override: na.OverrideFlag(), IsRouter: na.RouterFlag(), }); err.(type) { case nil: case *tcpip.ErrNotSupported: // The stack may support ICMPv6 but the NIC may not need link resolution. default: panic(fmt.Sprintf("unexpected error when informing NIC of neighbor confirmation message: %s", err)) } case header.ICMPv6EchoRequest: received.echoRequest.Increment() icmpHdr, ok := pkt.TransportHeader().Consume(header.ICMPv6EchoMinimumSize) if !ok { received.invalid.Increment() return } // As per RFC 4291 section 2.7, multicast addresses must not be used as // source addresses in IPv6 packets. localAddr := dstAddr if header.IsV6MulticastAddress(dstAddr) { localAddr = "" } r, err := e.protocol.stack.FindRoute(e.nic.ID(), localAddr, srcAddr, ProtocolNumber, false /* multicastLoop */) if err != nil { // If we cannot find a route to the destination, silently drop the packet.
return } defer r.Release() replyPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize, Data: pkt.Data().ExtractVV(), }) icmp := header.ICMPv6(replyPkt.TransportHeader().Push(header.ICMPv6EchoMinimumSize)) replyPkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber copy(icmp, icmpHdr) icmp.SetType(header.ICMPv6EchoReply) dataRange := replyPkt.Data().AsRange() icmp.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: icmp, Src: r.LocalAddress(), Dst: r.RemoteAddress(), PayloadCsum: dataRange.Checksum(), PayloadLen: dataRange.Size(), })) if err := r.WritePacket(stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS, }, replyPkt); err != nil { sent.dropped.Increment() return } sent.echoReply.Increment() case header.ICMPv6EchoReply: received.echoReply.Increment() if pkt.Data().Size() < header.ICMPv6EchoMinimumSize { received.invalid.Increment() return } e.dispatcher.DeliverTransportPacket(header.ICMPv6ProtocolNumber, pkt) case header.ICMPv6TimeExceeded: received.timeExceeded.Increment() case header.ICMPv6ParamProblem: received.paramProblem.Increment() case header.ICMPv6RouterSolicit: received.routerSolicit.Increment() // // Validate the RS as per RFC 4861 section 6.1.1. // // Is the NDP payload of sufficient size to hold a Router Solicitation? if !isNDPValid() || pkt.Data().Size()-header.ICMPv6HeaderSize < header.NDPRSMinimumSize { received.invalid.Increment() return } if !e.Forwarding() { received.routerOnlyPacketsDroppedByHost.Increment() return } // Note that in the common case NDP datagrams are very small and AsView() // will not incur allocations. rs := header.NDPRouterSolicit(payload.AsView()) it, err := rs.Options().Iter(false /* check */) if err != nil { // Options are not valid as per the wire format, silently drop the packet. received.invalid.Increment() return } sourceLinkAddr, ok := getSourceLinkAddr(it) if !ok { received.invalid.Increment() return } // If the RS message has the source link layer option, update the link // address cache with the link address for the source of the message. if len(sourceLinkAddr) != 0 { // As per RFC 4861 section 4.1, the Source Link-Layer Address Option MUST // NOT be included when the source IP address is the unspecified address. // Otherwise, it SHOULD be included on link layers that have addresses. if srcAddr == header.IPv6Any { received.invalid.Increment() return } // An RS with a specified source IP address modifies the neighbor table // in the same way a regular probe would. switch err := e.nic.HandleNeighborProbe(ProtocolNumber, srcAddr, sourceLinkAddr); err.(type) { case nil: case *tcpip.ErrNotSupported: // The stack may support ICMPv6 but the NIC may not need link resolution. default: panic(fmt.Sprintf("unexpected error when informing NIC of neighbor probe message: %s", err)) } } case header.ICMPv6RouterAdvert: received.routerAdvert.Increment() // // Validate the RA as per RFC 4861 section 6.1.2. // // Is the NDP payload of sufficient size to hold a Router Advertisement? if !isNDPValid() || pkt.Data().Size()-header.ICMPv6HeaderSize < header.NDPRAMinimumSize { received.invalid.Increment() return } routerAddr := srcAddr // Is the IP Source Address a link-local address? if !header.IsV6LinkLocalUnicastAddress(routerAddr) { // ...No, silently drop the packet. received.invalid.Increment() return } // Note that in the common case NDP datagrams are very small and AsView() // will not incur allocations.
ra := header.NDPRouterAdvert(payload.AsView()) it, err := ra.Options().Iter(false /* check */) if err != nil { // Options are not valid as per the wire format, silently drop the packet. received.invalid.Increment() return } sourceLinkAddr, ok := getSourceLinkAddr(it) if !ok { received.invalid.Increment() return } // // At this point, we have a valid Router Advertisement, as far // as RFC 4861 section 6.1.2 is concerned. // // If the RA has the source link layer option, update the link address // cache with the link address for the advertised router. if len(sourceLinkAddr) != 0 { switch err := e.nic.HandleNeighborProbe(ProtocolNumber, routerAddr, sourceLinkAddr); err.(type) { case nil: case *tcpip.ErrNotSupported: // The stack may support ICMPv6 but the NIC may not need link resolution. default: panic(fmt.Sprintf("unexpected error when informing NIC of neighbor probe message: %s", err)) } } e.mu.Lock() e.mu.ndp.handleRA(routerAddr, ra) e.mu.Unlock() case header.ICMPv6RedirectMsg: // TODO(gvisor.dev/issue/2285): Call `e.nud.HandleProbe` after validating // this redirect message, as per RFC 4861 section 7.3.3: // // "A Neighbor Cache entry enters the STALE state when created as a // result of receiving packets other than solicited Neighbor // Advertisements (i.e., Router Solicitations, Router Advertisements, // Redirects, and Neighbor Solicitations). These packets contain the // link-layer address of either the sender or, in the case of Redirect, // the redirection target. However, receipt of these link-layer // addresses does not confirm reachability of the forward-direction path // to that node. Placing a newly created Neighbor Cache entry for which // the link-layer address is known in the STALE state provides assurance // that path failures are detected quickly. In addition, should a cached // link-layer address be modified due to receiving one of the above // messages, the state SHOULD also be set to STALE to provide prompt // verification that the path to the new link-layer address is working." received.redirectMsg.Increment() if !isNDPValid() { received.invalid.Increment() return } case header.ICMPv6MulticastListenerQuery, header.ICMPv6MulticastListenerReport, header.ICMPv6MulticastListenerDone: switch icmpType { case header.ICMPv6MulticastListenerQuery: received.multicastListenerQuery.Increment() case header.ICMPv6MulticastListenerReport: received.multicastListenerReport.Increment() case header.ICMPv6MulticastListenerDone: received.multicastListenerDone.Increment() default: panic(fmt.Sprintf("unrecognized MLD message = %d", icmpType)) } if !isMLDValid(pkt, iph, routerAlert) { received.invalid.Increment() return } switch icmpType { case header.ICMPv6MulticastListenerQuery: e.mu.Lock() e.mu.mld.handleMulticastListenerQuery(header.MLD(payload.AsView())) e.mu.Unlock() case header.ICMPv6MulticastListenerReport: e.mu.Lock() e.mu.mld.handleMulticastListenerReport(header.MLD(payload.AsView())) e.mu.Unlock() case header.ICMPv6MulticastListenerDone: default: panic(fmt.Sprintf("unrecognized MLD message = %d", icmpType)) } default: received.unrecognized.Increment() } } // LinkAddressProtocol implements stack.LinkAddressResolver. func (*endpoint) LinkAddressProtocol() tcpip.NetworkProtocolNumber { return header.IPv6ProtocolNumber } // LinkAddressRequest implements stack.LinkAddressResolver.
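//
// A minimal usage sketch (hypothetical caller; targetAddr stands in for a
// unicast IPv6 address to resolve). Passing an empty localAddr and
// remoteLinkAddr lets the endpoint pick a source address and multicast the
// solicitation to targetAddr's solicited-node address, per the logic below:
//
//	_ = e.LinkAddressRequest(targetAddr, "" /* localAddr */, "" /* remoteLinkAddr */)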
func (e *endpoint) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) tcpip.Error { remoteAddr := targetAddr if len(remoteLinkAddr) == 0 { remoteAddr = header.SolicitedNodeAddr(targetAddr) remoteLinkAddr = header.EthernetAddressFromMulticastIPv6Address(remoteAddr) } if len(localAddr) == 0 { // Find an address that we can use as our source address. addressEndpoint := e.AcquireOutgoingPrimaryAddress(remoteAddr, false /* allowExpired */) if addressEndpoint == nil { return &tcpip.ErrNetworkUnreachable{} } localAddr = addressEndpoint.AddressWithPrefix().Address addressEndpoint.DecRef() } else if !e.checkLocalAddress(localAddr) { // The provided local address is not assigned to us. return &tcpip.ErrBadLocalAddress{} } return e.sendNDPNS(localAddr, remoteAddr, targetAddr, remoteLinkAddr, header.NDPOptionsSerializer{ header.NDPSourceLinkLayerAddressOption(e.nic.LinkAddress()), }) } // ResolveStaticAddress implements stack.LinkAddressResolver. func (*endpoint) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { if header.IsV6MulticastAddress(addr) { return header.EthernetAddressFromMulticastIPv6Address(addr), true } return tcpip.LinkAddress([]byte(nil)), false } // ======= ICMP Error packet generation ========= // icmpReason is a marker interface for IPv6 specific ICMP errors. type icmpReason interface { isICMPReason() // isForwarding indicates whether or not the error arose while attempting to // forward a packet. isForwarding() bool // respondToMulticast indicates whether this error falls under the exception // outlined by RFC 4443 section 2.4 point e.3 exception 2: // // (e.3) A packet destined to an IPv6 multicast address. (There are two // exceptions to this rule: (1) the Packet Too Big Message (Section 3.2) to // allow Path MTU discovery to work for IPv6 multicast, and (2) the Parameter // Problem Message, Code 2 (Section 3.4) reporting an unrecognized IPv6 // option (see Section 4.2 of [IPv6]) that has the Option Type highest- // order two bits set to 10). respondsToMulticast() bool } // icmpReasonParameterProblem is an error during processing of extension headers // or the fixed header defined in RFC 4443 section 3.4. type icmpReasonParameterProblem struct { code header.ICMPv6Code // pointer is defined in RFC 4443 section 3.4 which reads: // // Pointer Identifies the octet offset within the invoking packet // where the error was detected. // // The pointer will point beyond the end of the ICMPv6 // packet if the field in error is beyond what can fit // in the maximum size of an ICMPv6 error message. pointer uint32 forwarding bool respondToMulticast bool } func (*icmpReasonParameterProblem) isICMPReason() {} func (p *icmpReasonParameterProblem) isForwarding() bool { return p.forwarding } func (p *icmpReasonParameterProblem) respondsToMulticast() bool { return p.respondToMulticast } // icmpReasonPortUnreachable is an error where the transport protocol has no // listener and no alternative means to inform the sender. type icmpReasonPortUnreachable struct{} func (*icmpReasonPortUnreachable) isICMPReason() {} func (*icmpReasonPortUnreachable) isForwarding() bool { return false } func (*icmpReasonPortUnreachable) respondsToMulticast() bool { return false } // icmpReasonNetUnreachable is an error where no route can be found to the // network of the final destination.
type icmpReasonNetUnreachable struct{} func (*icmpReasonNetUnreachable) isICMPReason() {} func (*icmpReasonNetUnreachable) isForwarding() bool { // If we hit a Network Unreachable error, then we also know we are // operating as a router. As per RFC 4443 section 3.1: // // If the reason for the failure to deliver is lack of a matching // entry in the forwarding node's routing table, the Code field is // set to 0 (Network Unreachable). return true } func (*icmpReasonNetUnreachable) respondsToMulticast() bool { return false } // icmpReasonHostUnreachable is an error in which the host specified in the // internet destination field of the datagram is unreachable. type icmpReasonHostUnreachable struct{} func (*icmpReasonHostUnreachable) isICMPReason() {} func (*icmpReasonHostUnreachable) isForwarding() bool { // If we hit a Host Unreachable error, then we know we are operating as a // router. As per RFC 4443 page 8, Destination Unreachable Message, // // If the reason for the failure to deliver cannot be mapped to any of // other codes, the Code field is set to 3. Example of such cases are // an inability to resolve the IPv6 destination address into a // corresponding link address, or a link-specific problem of some sort. return true } func (*icmpReasonHostUnreachable) respondsToMulticast() bool { return false } // icmpReasonPacketTooBig is an error where a packet is too big to be sent // out through the outgoing MTU, as per RFC 4443 page 9, Packet Too Big Message. type icmpReasonPacketTooBig struct{} func (*icmpReasonPacketTooBig) isICMPReason() {} func (*icmpReasonPacketTooBig) isForwarding() bool { // If we hit a Packet Too Big error, then we know we are operating as a router. // As per RFC 4443 section 3.2: // // A Packet Too Big MUST be sent by a router in response to a packet that it // cannot forward because the packet is larger than the MTU of the outgoing // link. return true } func (*icmpReasonPacketTooBig) respondsToMulticast() bool { return true } // icmpReasonHopLimitExceeded is an error where a packet's hop limit was // exceeded in transit to its final destination, as per RFC 4443 section 3.3. type icmpReasonHopLimitExceeded struct{} func (*icmpReasonHopLimitExceeded) isICMPReason() {} func (*icmpReasonHopLimitExceeded) isForwarding() bool { // If we hit a Hop Limit Exceeded error, then we know we are operating // as a router. As per RFC 4443 section 3.3: // // If a router receives a packet with a Hop Limit of zero, or if a // router decrements a packet's Hop Limit to zero, it MUST discard // the packet and originate an ICMPv6 Time Exceeded message with Code // 0 to the source of the packet. This indicates either a routing // loop or too small an initial Hop Limit value. return true } func (*icmpReasonHopLimitExceeded) respondsToMulticast() bool { return false } // icmpReasonReassemblyTimeout is an error where insufficient fragments are // received to complete reassembly of a packet within a configured time after // the reception of the first-arriving fragment of that packet. type icmpReasonReassemblyTimeout struct{} func (*icmpReasonReassemblyTimeout) isICMPReason() {} func (*icmpReasonReassemblyTimeout) isForwarding() bool { return false } func (*icmpReasonReassemblyTimeout) respondsToMulticast() bool { return false } // returnError takes an error descriptor and generates the appropriate ICMP // error packet for IPv6 and sends it.
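//
// A hedged usage sketch, mirroring the reassembly-timeout caller at the
// bottom of this file: a transport endpoint with no listener for a
// datagram's destination port could respond with
//
//	_ = p.returnError(&icmpReasonPortUnreachable{}, pkt)
//
// where pkt is the offending *stack.PacketBuffer.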
func (p *protocol) returnError(reason icmpReason, pkt *stack.PacketBuffer) tcpip.Error { origIPHdr := header.IPv6(pkt.NetworkHeader().View()) origIPHdrSrc := origIPHdr.SourceAddress() origIPHdrDst := origIPHdr.DestinationAddress() // Only send ICMP error if the address is not a multicast v6 // address and the source is not the unspecified address. // // There are exceptions to this rule. // See: point e.3) RFC 4443 section-2.4 // // (e) An ICMPv6 error message MUST NOT be originated as a result of // receiving the following: // // (e.1) An ICMPv6 error message. // // (e.2) An ICMPv6 redirect message [IPv6-DISC]. // // (e.3) A packet destined to an IPv6 multicast address. (There are // two exceptions to this rule: (1) the Packet Too Big Message // (Section 3.2) to allow Path MTU discovery to work for IPv6 // multicast, and (2) the Parameter Problem Message, Code 2 // (Section 3.4) reporting an unrecognized IPv6 option (see // Section 4.2 of [IPv6]) that has the Option Type highest- // order two bits set to 10). // allowResponseToMulticast := reason.respondsToMulticast() isOrigDstMulticast := header.IsV6MulticastAddress(origIPHdrDst) if (!allowResponseToMulticast && isOrigDstMulticast) || origIPHdrSrc == header.IPv6Any { return nil } // If we are operating as a router, do not use the packet's destination // address as the response's source address as we should not own the // destination address of a packet we are forwarding. // // If the packet was originally destined to a multicast address, then do not // use the packet's destination address as the source for the response ICMP // packet as "multicast addresses must not be used as source addresses in IPv6 // packets", as per RFC 4291 section 2.7. localAddr := origIPHdrDst if reason.isForwarding() || isOrigDstMulticast { localAddr = "" } // Even if we were able to receive a packet from some remote, we may not have // a route to it - the remote may be blocked via routing rules. We must always // consult our routing table and find a route to the remote before sending any // packet. route, err := p.stack.FindRoute(pkt.NICID, localAddr, origIPHdrSrc, ProtocolNumber, false /* multicastLoop */) if err != nil { return err } defer route.Release() p.mu.Lock() // We retrieve an endpoint using the newly constructed route's NICID rather // than the packet's NICID. The packet's NICID corresponds to the NIC on // which it arrived, which isn't necessarily the same as the NIC on which it // will be transmitted. On the other hand, the route's NIC *is* guaranteed // to be the NIC on which the packet will be transmitted. netEP, ok := p.mu.eps[route.NICID()] p.mu.Unlock() if !ok { return &tcpip.ErrNotConnected{} } sent := netEP.stats.icmp.packetsSent if !p.stack.AllowICMPMessage() { sent.rateLimited.Increment() return nil } if pkt.TransportProtocolNumber == header.ICMPv6ProtocolNumber { // TODO(gvisor.dev/issues/3810): Sort this out when ICMP headers are stored. // Unfortunately at this time ICMP Packets do not have a transport // header separated out. It is in the Data part so we need to // separate it out now. We will just pretend it is a minimal length // ICMP packet as we don't really care if any later bits of a // larger ICMP packet are in the header view or in the Data view. 
transport, ok := pkt.TransportHeader().Consume(header.ICMPv6MinimumSize) if !ok { return nil } typ := header.ICMPv6(transport).Type() if typ.IsErrorType() || typ == header.ICMPv6RedirectMsg { return nil } } network, transport := pkt.NetworkHeader().View(), pkt.TransportHeader().View() // As per RFC 4443 section 2.4 // // (c) Every ICMPv6 error message (type < 128) MUST include // as much of the IPv6 offending (invoking) packet (the // packet that caused the error) as possible without making // the error message packet exceed the minimum IPv6 MTU // [IPv6]. mtu := int(route.MTU()) const maxIPv6Data = header.IPv6MinimumMTU - header.IPv6FixedHeaderSize if mtu > maxIPv6Data { mtu = maxIPv6Data } available := mtu - header.ICMPv6ErrorHeaderSize if available < header.IPv6MinimumSize { return nil } payloadLen := network.Size() + transport.Size() + pkt.Data().Size() if payloadLen > available { payloadLen = available } payload := network.ToVectorisedView() payload.AppendView(transport) payload.Append(pkt.Data().ExtractVV()) payload.CapLength(payloadLen) newPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(route.MaxHeaderLength()) + header.ICMPv6ErrorHeaderSize, Data: payload, }) newPkt.TransportProtocolNumber = header.ICMPv6ProtocolNumber icmpHdr := header.ICMPv6(newPkt.TransportHeader().Push(header.ICMPv6DstUnreachableMinimumSize)) var counter tcpip.MultiCounterStat switch reason := reason.(type) { case *icmpReasonParameterProblem: icmpHdr.SetType(header.ICMPv6ParamProblem) icmpHdr.SetCode(reason.code) icmpHdr.SetTypeSpecific(reason.pointer) counter = sent.paramProblem case *icmpReasonPortUnreachable: icmpHdr.SetType(header.ICMPv6DstUnreachable) icmpHdr.SetCode(header.ICMPv6PortUnreachable) counter = sent.dstUnreachable case *icmpReasonNetUnreachable: icmpHdr.SetType(header.ICMPv6DstUnreachable) icmpHdr.SetCode(header.ICMPv6NetworkUnreachable) counter = sent.dstUnreachable case *icmpReasonHostUnreachable: icmpHdr.SetType(header.ICMPv6DstUnreachable) icmpHdr.SetCode(header.ICMPv6AddressUnreachable) counter = sent.dstUnreachable case *icmpReasonPacketTooBig: icmpHdr.SetType(header.ICMPv6PacketTooBig) icmpHdr.SetCode(header.ICMPv6UnusedCode) counter = sent.packetTooBig case *icmpReasonHopLimitExceeded: icmpHdr.SetType(header.ICMPv6TimeExceeded) icmpHdr.SetCode(header.ICMPv6HopLimitExceeded) counter = sent.timeExceeded case *icmpReasonReassemblyTimeout: icmpHdr.SetType(header.ICMPv6TimeExceeded) icmpHdr.SetCode(header.ICMPv6ReassemblyTimeout) counter = sent.timeExceeded default: panic(fmt.Sprintf("unsupported ICMP type %T", reason)) } dataRange := newPkt.Data().AsRange() icmpHdr.SetChecksum(header.ICMPv6Checksum(header.ICMPv6ChecksumParams{ Header: icmpHdr, Src: route.LocalAddress(), Dst: route.RemoteAddress(), PayloadCsum: dataRange.Checksum(), PayloadLen: dataRange.Size(), })) if err := route.WritePacket( stack.NetworkHeaderParams{ Protocol: header.ICMPv6ProtocolNumber, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS, }, newPkt, ); err != nil { sent.dropped.Increment() return err } counter.Increment() return nil } // OnReassemblyTimeout implements fragmentation.TimeoutHandler. func (p *protocol) OnReassemblyTimeout(pkt *stack.PacketBuffer) { // OnReassemblyTimeout sends a Time Exceeded Message as per RFC 2460 Section // 4.5: // // If the first fragment (i.e., the one with a Fragment Offset of zero) has // been received, an ICMP Time Exceeded -- Fragment Reassembly Time Exceeded // message should be sent to the source of that fragment. 
if pkt != nil { p.returnError(&icmpReasonReassemblyTimeout{}, pkt) } }
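// The icmpReason interface above makes adding a new error class mechanical.
// A hedged sketch (hypothetical type, not part of this package) for RFC 4443
// Destination Unreachable, Code 1 (communication administratively
// prohibited):
//
//	type icmpReasonAdminProhibited struct{}
//
//	func (*icmpReasonAdminProhibited) isICMPReason()             {}
//	func (*icmpReasonAdminProhibited) isForwarding() bool        { return true }
//	func (*icmpReasonAdminProhibited) respondsToMulticast() bool { return false }
//
// plus a matching case in returnError's switch that sets
// header.ICMPv6DstUnreachable with the appropriate code and counter.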
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package buffer provides the implementation of a buffer view. // // A view is a flexible buffer, supporting the safecopy operations natively as // well as the ability to grow via either prepend or append, as well as shrink. package buffer // buffer encapsulates a queueable byte buffer. // // +stateify savable type buffer struct { data []byte read int write int bufferEntry } // init performs in-place initialization for zero value. func (b *buffer) init(size int) { b.data = make([]byte, size) } // initWithData initializes b with data, taking ownership. func (b *buffer) initWithData(data []byte) { b.data = data b.read = 0 b.write = len(data) } // Reset resets read and write locations, effectively emptying the buffer. func (b *buffer) Reset() { b.read = 0 b.write = 0 } // Remove removes r from the unread portion. It returns false if r does not // fully reside in b. func (b *buffer) Remove(r Range) bool { sz := b.ReadSize() switch { case r.Len() != r.Intersect(Range{end: sz}).Len(): return false case r.Len() == 0: // Noop case r.begin == 0: b.read += r.end case r.end == sz: b.write -= r.Len() default: // Remove from the middle of b.data. copy(b.data[b.read+r.begin:], b.data[b.read+r.end:b.write]) b.write -= r.Len() } return true } // Full indicates the buffer is full. // // This indicates there is no capacity left to write. func (b *buffer) Full() bool { return b.write == len(b.data) } // ReadSize returns the number of bytes available for reading. func (b *buffer) ReadSize() int { return b.write - b.read } // ReadMove advances the read index by the given amount. func (b *buffer) ReadMove(n int) { b.read += n } // ReadSlice returns the read slice for this buffer. func (b *buffer) ReadSlice() []byte { return b.data[b.read:b.write] } // WriteSize returns the number of bytes available for writing. func (b *buffer) WriteSize() int { return len(b.data) - b.write } // WriteMove advances the write index by the given amount. func (b *buffer) WriteMove(n int) { b.write += n } // WriteSlice returns the write slice for this buffer. func (b *buffer) WriteSlice() []byte { return b.data[b.write:] }
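// A minimal sketch of the intended write-then-read flow for a single buffer
// (illustrative only; in practice these buffers are managed by a higher-level
// view type rather than used directly):
//
//	var b buffer
//	b.init(8)                        // 8-byte backing store
//	n := copy(b.WriteSlice(), "abc") // stage bytes into the free region
//	b.WriteMove(n)                   // commit them; ReadSize() is now 3
//	_ = b.ReadSlice()                // "abc"
//	b.ReadMove(n)                    // consume; ReadSize() is now 0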
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netlink import ( "fmt" "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" ) // alignPad returns the length of padding required for alignment. // // Preconditions: align is a power of two. func alignPad(length int, align uint) int { return bits.AlignUp(length, align) - length } // Message contains a complete serialized netlink message. type Message struct { hdr linux.NetlinkMessageHeader buf []byte } // NewMessage creates a new Message containing the passed header. // // The header length will be updated by Finalize. func NewMessage(hdr linux.NetlinkMessageHeader) *Message { return &Message{ hdr: hdr, buf: marshal.Marshal(&hdr), } } // ParseMessage parses the first message seen at buf, returning the rest of the // buffer. If the message is malformed, ok is false. For the last message, the // padding check is loose: if there isn't enough padding, the whole buf is // consumed and ok is set to true. func ParseMessage(buf []byte) (msg *Message, rest []byte, ok bool) { b := BytesView(buf) hdrBytes, ok := b.Extract(linux.NetlinkMessageHeaderSize) if !ok { return } var hdr linux.NetlinkMessageHeader hdr.UnmarshalUnsafe(hdrBytes) // Msg portion. totalMsgLen := int(hdr.Length) _, ok = b.Extract(totalMsgLen - linux.NetlinkMessageHeaderSize) if !ok { return } // Padding. numPad := alignPad(totalMsgLen, linux.NLMSG_ALIGNTO) // Linux permits the last message not being aligned, just consume all of it. // Ref: net/netlink/af_netlink.c:netlink_rcv_skb if numPad > len(b) { numPad = len(b) } _, ok = b.Extract(numPad) if !ok { return } return &Message{ hdr: hdr, buf: buf[:totalMsgLen], }, []byte(b), true } // Header returns the header of this message.
func (m *Message) Header() linux.NetlinkMessageHeader { return m.hdr } // GetData unmarshals the payload message header from this netlink message, and // returns the attributes portion. func (m *Message) GetData(msg marshal.Marshallable) (AttrsView, bool) { b := BytesView(m.buf) _, ok := b.Extract(linux.NetlinkMessageHeaderSize) if !ok { return nil, false } size := msg.SizeBytes() msgBytes, ok := b.Extract(size) if !ok { return nil, false } msg.UnmarshalUnsafe(msgBytes) numPad := alignPad(linux.NetlinkMessageHeaderSize+size, linux.NLMSG_ALIGNTO) // Linux permits the last message not being aligned, just consume all of it. // Ref: net/netlink/af_netlink.c:netlink_rcv_skb if numPad > len(b) { numPad = len(b) } _, ok = b.Extract(numPad) if !ok { return nil, false } return AttrsView(b), true } // Finalize returns the []byte containing the entire message, with the total // length set in the message header. The Message must not be modified after // calling Finalize. func (m *Message) Finalize() []byte { // Update length, which is the first 4 bytes of the header. hostarch.ByteOrder.PutUint32(m.buf, uint32(len(m.buf))) // Align the message. Note that the message length in the header (set // above) is the useful length of the message, not the total aligned // length. See net/netlink/af_netlink.c:__nlmsg_put. aligned := bits.AlignUp(len(m.buf), linux.NLMSG_ALIGNTO) m.putZeros(aligned - len(m.buf)) return m.buf } // putZeros adds n zeros to the message. func (m *Message) putZeros(n int) { for n > 0 { m.buf = append(m.buf, 0) n-- } } // Put serializes v into the message. func (m *Message) Put(v marshal.Marshallable) { m.buf = append(m.buf, marshal.Marshal(v)...) } // PutAttr adds v to the message as a netlink attribute. // // Preconditions: The serialized attribute (linux.NetlinkAttrHeaderSize + // v.SizeBytes()) fits in math.MaxUint16 bytes. func (m *Message) PutAttr(atype uint16, v marshal.Marshallable) { l := linux.NetlinkAttrHeaderSize + v.SizeBytes() if l > math.MaxUint16 { panic(fmt.Sprintf("attribute too large: %d", l)) } m.Put(&linux.NetlinkAttrHeader{ Type: atype, Length: uint16(l), }) m.Put(v) // Align the attribute. aligned := bits.AlignUp(l, linux.NLA_ALIGNTO) m.putZeros(aligned - l) } // PutAttrString adds s to the message as a netlink attribute. func (m *Message) PutAttrString(atype uint16, s string) { l := linux.NetlinkAttrHeaderSize + len(s) + 1 m.Put(&linux.NetlinkAttrHeader{ Type: atype, Length: uint16(l), }) // String + NUL-termination. m.Put(primitive.AsByteSlice([]byte(s))) m.putZeros(1) // Align the attribute. aligned := bits.AlignUp(l, linux.NLA_ALIGNTO) m.putZeros(aligned - l) } // MessageSet contains a series of netlink messages. type MessageSet struct { // Multi indicates that this is a multi-part message, to be terminated by // NLMSG_DONE. NLMSG_DONE is sent even if the set contains only one // Message. // // If Multi is set, all added messages will have NLM_F_MULTI set. Multi bool // PortID is the destination port for all messages. PortID int32 // Seq is the sequence counter for all messages in the set. Seq uint32 // Messages contains the messages in the set. Messages []*Message } // NewMessageSet creates a new MessageSet. // // portID is the destination port to set as PortID in all messages. // // seq is the sequence counter to set as seq in all messages in the set. func NewMessageSet(portID int32, seq uint32) *MessageSet { return &MessageSet{ PortID: portID, Seq: seq, } } // AddMessage adds a new message to the set and returns it for further // additions.
// // The passed header will have Seq, PortID and the multi flag set // automatically. func (ms *MessageSet) AddMessage(hdr linux.NetlinkMessageHeader) *Message { hdr.Seq = ms.Seq hdr.PortID = uint32(ms.PortID) if ms.Multi { hdr.Flags |= linux.NLM_F_MULTI } m := NewMessage(hdr) ms.Messages = append(ms.Messages, m) return m } // AttrsView is a view into the attributes portion of a netlink message. type AttrsView []byte // Empty returns whether there is no attribute left in v. func (v AttrsView) Empty() bool { return len(v) == 0 } // ParseFirst parses the first netlink attribute at the beginning of v. func (v AttrsView) ParseFirst() (hdr linux.NetlinkAttrHeader, value []byte, rest AttrsView, ok bool) { b := BytesView(v) hdrBytes, ok := b.Extract(linux.NetlinkAttrHeaderSize) if !ok { return } hdr.UnmarshalUnsafe(hdrBytes) value, ok = b.Extract(int(hdr.Length) - linux.NetlinkAttrHeaderSize) if !ok { return } _, ok = b.Extract(alignPad(int(hdr.Length), linux.NLA_ALIGNTO)) if !ok { return } return hdr, value, AttrsView(b), ok } // BytesView supports extracting data from a byte slice with bounds checking. type BytesView []byte // Extract removes the first n bytes from v and returns them. If n is out of // bounds, it returns false. func (v *BytesView) Extract(n int) ([]byte, bool) { if n < 0 || n > len(*v) { return nil, false } extracted := (*v)[:n] *v = (*v)[n:] return extracted, true }
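// A minimal round-trip sketch (hedged; attribute type 1 is a placeholder for
// a real attribute constant, and linux.RTM_GETLINK for a real message type):
//
//	m := NewMessage(linux.NetlinkMessageHeader{Type: linux.RTM_GETLINK})
//	m.PutAttrString(1, "lo")
//	wire := m.Finalize() // header length fixed up, buffer NLMSG-aligned
//
//	parsed, rest, ok := ParseMessage(wire)
//	// ok is true, rest is empty, and parsed.Header().Type matches.
//	_, _, _ = parsed, rest, ok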
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "bytes" "encoding/binary" "errors" "fmt" "io" "math" "time" "gvisor.dev/gvisor/pkg/tcpip" ) // ndpOptionIdentifier is an NDP option type identifier. type ndpOptionIdentifier uint8 const ( // ndpSourceLinkLayerAddressOptionType is the type of the Source Link Layer // Address option, as per RFC 4861 section 4.6.1. ndpSourceLinkLayerAddressOptionType ndpOptionIdentifier = 1 // ndpTargetLinkLayerAddressOptionType is the type of the Target Link Layer // Address option, as per RFC 4861 section 4.6.1. ndpTargetLinkLayerAddressOptionType ndpOptionIdentifier = 2 // ndpPrefixInformationType is the type of the Prefix Information // option, as per RFC 4861 section 4.6.2. ndpPrefixInformationType ndpOptionIdentifier = 3 // ndpNonceOptionType is the type of the Nonce option, as per // RFC 3971 section 5.3.2. ndpNonceOptionType ndpOptionIdentifier = 14 // ndpRecursiveDNSServerOptionType is the type of the Recursive DNS // Server option, as per RFC 8106 section 5.1. ndpRecursiveDNSServerOptionType ndpOptionIdentifier = 25 // ndpDNSSearchListOptionType is the type of the DNS Search List option, // as per RFC 8106 section 5.2. ndpDNSSearchListOptionType ndpOptionIdentifier = 31 ) const ( // NDPLinkLayerAddressSize is the size of a Source or Target Link Layer // Address option for an Ethernet address. NDPLinkLayerAddressSize = 8 // ndpPrefixInformationLength is the expected length, in bytes, of the // body of an NDP Prefix Information option, as per RFC 4861 section // 4.6.2 which specifies that the Length field is 4. Given this, the // expected length, in bytes, is 30 because 4 * lengthByteUnits (8) - 2 // (Type & Length) = 30. ndpPrefixInformationLength = 30 // ndpPrefixInformationPrefixLengthOffset is the offset of the Prefix // Length field within an NDPPrefixInformation. ndpPrefixInformationPrefixLengthOffset = 0 // ndpPrefixInformationFlagsOffset is the offset of the flags byte // within an NDPPrefixInformation. ndpPrefixInformationFlagsOffset = 1 // ndpPrefixInformationOnLinkFlagMask is the mask of the On-Link Flag // field in the flags byte within an NDPPrefixInformation.
ndpPrefixInformationOnLinkFlagMask = 1 << 7 // ndpPrefixInformationAutoAddrConfFlagMask is the mask of the // Autonomous Address-Configuration flag field in the flags byte within // an NDPPrefixInformation. ndpPrefixInformationAutoAddrConfFlagMask = 1 << 6 // ndpPrefixInformationReserved1FlagsMask is the mask of the Reserved1 // field in the flags byte within an NDPPrefixInformation. ndpPrefixInformationReserved1FlagsMask = 63 // ndpPrefixInformationValidLifetimeOffset is the start of the 4-byte // Valid Lifetime field within an NDPPrefixInformation. ndpPrefixInformationValidLifetimeOffset = 2 // ndpPrefixInformationPreferredLifetimeOffset is the start of the // 4-byte Preferred Lifetime field within an NDPPrefixInformation. ndpPrefixInformationPreferredLifetimeOffset = 6 // ndpPrefixInformationReserved2Offset is the start of the 4-byte // Reserved2 field within an NDPPrefixInformation. ndpPrefixInformationReserved2Offset = 10 // ndpPrefixInformationReserved2Length is the length of the Reserved2 // field. // // It is 4 bytes. ndpPrefixInformationReserved2Length = 4 // ndpPrefixInformationPrefixOffset is the start of the Prefix field // within an NDPPrefixInformation. ndpPrefixInformationPrefixOffset = 14 // ndpRecursiveDNSServerLifetimeOffset is the start of the 4-byte // Lifetime field within an NDPRecursiveDNSServer. ndpRecursiveDNSServerLifetimeOffset = 2 // ndpRecursiveDNSServerAddressesOffset is the start of the addresses // for IPv6 Recursive DNS Servers within an NDPRecursiveDNSServer. ndpRecursiveDNSServerAddressesOffset = 6 // minNDPRecursiveDNSServerBodySize is the minimum NDP Recursive DNS Server // option's body size when it contains at least one IPv6 address, as per // RFC 8106 section 5.3.1. minNDPRecursiveDNSServerBodySize = 22 // ndpDNSSearchListLifetimeOffset is the start of the 4-byte // Lifetime field within an NDPDNSSearchList. ndpDNSSearchListLifetimeOffset = 2 // ndpDNSSearchListDomainNamesOffset is the start of the DNS search list // domain names within an NDPDNSSearchList. ndpDNSSearchListDomainNamesOffset = 6 // minNDPDNSSearchListBodySize is the minimum NDP DNS Search List option's // body size when it contains at least one domain name, as per RFC 8106 // section 5.3.1. minNDPDNSSearchListBodySize = 14 // maxDomainNameLabelLength is the maximum length of a domain name // label, as per RFC 1035 section 3.1. maxDomainNameLabelLength = 63 // maxDomainNameLength is the maximum length of a domain name, including // label AND label length octet, as per RFC 1035 section 3.1. maxDomainNameLength = 255 // lengthByteUnits is the multiplier factor for the Length field of an // NDP option. That is, the length field for NDP options is in units of // 8 octets, as per RFC 4861 section 4.6. lengthByteUnits = 8 // NDPInfiniteLifetime is a value that represents infinity for the // 4-byte lifetime fields found in various NDP options. Its value is // (2^32 - 1)s = 4294967295s. NDPInfiniteLifetime = time.Second * math.MaxUint32 ) // NDPOptionIterator is an iterator of NDPOption. // // Note, between when an NDPOptionIterator is obtained and last used, no changes // to the NDPOptions may happen. Doing so may cause undefined and unexpected // behaviour. It is fine to obtain an NDPOptionIterator, iterate over the first // few NDPOption then modify the backing NDPOptions so long as the // NDPOptionIterator obtained before modification is no longer used. type NDPOptionIterator struct { opts *bytes.Buffer } // Potential errors when iterating over an NDPOptions.
var ( ErrNDPOptMalformedBody = errors.New("NDP option has a malformed body") ErrNDPOptMalformedHeader = errors.New("NDP option has a malformed header") ) // Next returns the next element in the backing NDPOptions, whether iteration // is done, and an error if one occurred. // // The return can be read as option, done, error. Note, option should only be // used if done is false and error is nil. func (i *NDPOptionIterator) Next() (NDPOption, bool, error) { for { // Do we still have elements to look at? if i.opts.Len() == 0 { return nil, true, nil } // Get the Type field. temp, err := i.opts.ReadByte() if err != nil { if err != io.EOF { // ReadByte should only ever return nil or io.EOF. panic(fmt.Sprintf("unexpected error when reading the option's Type field: %s", err)) } // We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected once // we start parsing an option; we expect the buffer to contain enough // bytes for the whole option. return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Type field: %w", io.ErrUnexpectedEOF) } kind := ndpOptionIdentifier(temp) // Get the Length field. length, err := i.opts.ReadByte() if err != nil { if err != io.EOF { panic(fmt.Sprintf("unexpected error when reading the option's Length field for %s: %s", kind, err)) } return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Length field for %s: %w", kind, io.ErrUnexpectedEOF) } // This would indicate an erroneous NDP option as the Length field should // never be 0. if length == 0 { return nil, true, fmt.Errorf("zero valued Length field for %s: %w", kind, ErrNDPOptMalformedHeader) } // Get the body. numBytes := int(length) * lengthByteUnits numBodyBytes := numBytes - 2 body := i.opts.Next(numBodyBytes) if len(body) < numBodyBytes { return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Body for %s: %w", kind, io.ErrUnexpectedEOF) } switch kind { case ndpSourceLinkLayerAddressOptionType: return NDPSourceLinkLayerAddressOption(body), false, nil case ndpTargetLinkLayerAddressOptionType: return NDPTargetLinkLayerAddressOption(body), false, nil case ndpNonceOptionType: return NDPNonceOption(body), false, nil case ndpRouteInformationType: if numBodyBytes > ndpRouteInformationMaxLength { return nil, true, fmt.Errorf("got %d bytes for NDP Route Information option's body, expected at max %d bytes: %w", numBodyBytes, ndpRouteInformationMaxLength, ErrNDPOptMalformedBody) } opt := NDPRouteInformation(body) if err := opt.hasError(); err != nil { return nil, true, err } return opt, false, nil case ndpPrefixInformationType: // Make sure the length of a Prefix Information option // body is ndpPrefixInformationLength, as per RFC 4861 // section 4.6.2. if numBodyBytes != ndpPrefixInformationLength { return nil, true, fmt.Errorf("got %d bytes for NDP Prefix Information option's body, expected %d bytes: %w", numBodyBytes, ndpPrefixInformationLength, ErrNDPOptMalformedBody) } return NDPPrefixInformation(body), false, nil case ndpRecursiveDNSServerOptionType: opt := NDPRecursiveDNSServer(body) if err := opt.checkAddresses(); err != nil { return nil, true, err } return opt, false, nil case ndpDNSSearchListOptionType: opt := NDPDNSSearchList(body) if err := opt.checkDomainNames(); err != nil { return nil, true, err } return opt, false, nil default: // We do not yet recognize the option, just skip for // now. This is okay because RFC 4861 allows us to // skip/ignore any unrecognized options.
However, // we MUST recognize all the options in RFC 4861. // // TODO(b/141487990): Handle all NDP options as defined // by RFC 4861. } } } // NDPOptions is a buffer of NDP options as defined by RFC 4861 section 4.6. type NDPOptions []byte // Iter returns an iterator of NDPOption. // // If check is true, Iter will do an integrity check on the options by iterating // over it and returning an error if detected. // // See NDPOptionIterator for more information. func (b NDPOptions) Iter(check bool) (NDPOptionIterator, error) { it := NDPOptionIterator{ opts: bytes.NewBuffer(b), } if check { it2 := NDPOptionIterator{ opts: bytes.NewBuffer(b), } for { if _, done, err := it2.Next(); err != nil || done { return it, err } } } return it, nil } // Serialize serializes the provided list of NDP options into b. // // Note, b must be of sufficient size to hold all the options in s. See // NDPOptionsSerializer.Length for details on getting the total size // of a serialized NDPOptionsSerializer. // // Serialize may panic if b is not of sufficient size to hold all the options // in s. func (b NDPOptions) Serialize(s NDPOptionsSerializer) int { done := 0 for _, o := range s { l := paddedLength(o) if l == 0 { continue } b[0] = byte(o.kind()) // We know this is safe because paddedLength would have returned // 0 if o had an invalid length (> 255 * lengthByteUnits). b[1] = uint8(l / lengthByteUnits) // Serialize NDP option body. used := o.serializeInto(b[2:]) // Zero out remaining (padding) bytes, if any exist. for i := used + 2; i < l; i++ { b[i] = 0 } b = b[l:] done += l } return done } // NDPOption is the set of functions to be implemented by all NDP option types. type NDPOption interface { fmt.Stringer // kind returns the type of the receiver. kind() ndpOptionIdentifier // length returns the length of the body of the receiver, in bytes. length() int // serializeInto serializes the receiver into the provided byte // buffer. // // Note, the caller MUST provide a byte buffer with size of at least // Length. Implementers of this function may assume that the byte buffer // is of sufficient size. serializeInto MAY panic if the provided byte // buffer is not of sufficient size. // // serializeInto will return the number of bytes that was used to // serialize the receiver. Implementers must only use the number of // bytes required to serialize the receiver. Callers MAY provide a // larger buffer than required to serialize into. serializeInto([]byte) int } // paddedLength returns the length of o, in bytes, with any padding bytes, if // required. func paddedLength(o NDPOption) int { l := o.length() if l == 0 { return 0 } // Length excludes the 2 Type and Length bytes. l += 2 // Add extra bytes if needed to make sure the option is // lengthByteUnits-byte aligned. We do this by adding lengthByteUnits-1 // to l and then stripping off the last few LSBits from l. This will // make sure that l is rounded up to the nearest unit of // lengthByteUnits. This works since lengthByteUnits is a power of 2 // (= 8). mask := lengthByteUnits - 1 l += mask l &^= mask if l/lengthByteUnits > 255 { // Should never happen because an option can only have a max // value of 255 for its Length field, so just return 0 so this // option does not get serialized. // // Returning 0 here will make sure that this option does not get // serialized when NDPOptions.Serialize is called with the // NDPOptionsSerializer that holds this option, effectively // skipping this option during serialization.
Also note that // a value of zero for the Length field in an NDP option is // invalid so this is another sign to the caller that this NDP // option is malformed, as per RFC 4861 section 4.6. return 0 } return l } // NDPOptionsSerializer is a serializer for NDP options. type NDPOptionsSerializer []NDPOption // Length returns the total number of bytes required to serialize. func (b NDPOptionsSerializer) Length() int { l := 0 for _, o := range b { l += paddedLength(o) } return l } // NDPNonceOption is the NDP Nonce Option as defined by RFC 3971 section 5.3.2. // // It is the first X bytes following the NDP option's Type and Length field // where X is the value in Length multiplied by lengthByteUnits - 2 bytes. type NDPNonceOption []byte // kind implements NDPOption. func (o NDPNonceOption) kind() ndpOptionIdentifier { return ndpNonceOptionType } // length implements NDPOption. func (o NDPNonceOption) length() int { return len(o) } // serializeInto implements NDPOption. func (o NDPNonceOption) serializeInto(b []byte) int { return copy(b, o) } // String implements fmt.Stringer. func (o NDPNonceOption) String() string { return fmt.Sprintf("%T(%x)", o, []byte(o)) } // Nonce returns the nonce value this option holds. func (o NDPNonceOption) Nonce() []byte { return o } // NDPSourceLinkLayerAddressOption is the NDP Source Link Layer Option // as defined by RFC 4861 section 4.6.1. // // It is the first X bytes following the NDP option's Type and Length field // where X is the value in Length multiplied by lengthByteUnits - 2 bytes. type NDPSourceLinkLayerAddressOption tcpip.LinkAddress // kind implements NDPOption. func (o NDPSourceLinkLayerAddressOption) kind() ndpOptionIdentifier { return ndpSourceLinkLayerAddressOptionType } // length implements NDPOption. func (o NDPSourceLinkLayerAddressOption) length() int { return len(o) } // serializeInto implements NDPOption. func (o NDPSourceLinkLayerAddressOption) serializeInto(b []byte) int { return copy(b, o) } // String implements fmt.Stringer. func (o NDPSourceLinkLayerAddressOption) String() string { return fmt.Sprintf("%T(%s)", o, tcpip.LinkAddress(o)) } // EthernetAddress will return an ethernet (MAC) address if the // NDPSourceLinkLayerAddressOption's body has at minimum EthernetAddressSize // bytes. If the body has more than EthernetAddressSize bytes, only the first // EthernetAddressSize bytes are returned as that is all that is needed for an // Ethernet address. func (o NDPSourceLinkLayerAddressOption) EthernetAddress() tcpip.LinkAddress { if len(o) >= EthernetAddressSize { return tcpip.LinkAddress(o[:EthernetAddressSize]) } return tcpip.LinkAddress([]byte(nil)) } // NDPTargetLinkLayerAddressOption is the NDP Target Link Layer Option // as defined by RFC 4861 section 4.6.1. // // It is the first X bytes following the NDP option's Type and Length field // where X is the value in Length multiplied by lengthByteUnits - 2 bytes. type NDPTargetLinkLayerAddressOption tcpip.LinkAddress // kind implements NDPOption. func (o NDPTargetLinkLayerAddressOption) kind() ndpOptionIdentifier { return ndpTargetLinkLayerAddressOptionType } // length implements NDPOption. func (o NDPTargetLinkLayerAddressOption) length() int { return len(o) } // serializeInto implements NDPOption. func (o NDPTargetLinkLayerAddressOption) serializeInto(b []byte) int { return copy(b, o) } // String implements fmt.Stringer. 
func (o NDPTargetLinkLayerAddressOption) String() string { return fmt.Sprintf("%T(%s)", o, tcpip.LinkAddress(o)) } // EthernetAddress will return an ethernet (MAC) address if the // NDPTargetLinkLayerAddressOption's body has at minimum EthernetAddressSize // bytes. If the body has more than EthernetAddressSize bytes, only the first // EthernetAddressSize bytes are returned as that is all that is needed for an // Ethernet address. func (o NDPTargetLinkLayerAddressOption) EthernetAddress() tcpip.LinkAddress { if len(o) >= EthernetAddressSize { return tcpip.LinkAddress(o[:EthernetAddressSize]) } return tcpip.LinkAddress([]byte(nil)) } // NDPPrefixInformation is the NDP Prefix Information option as defined by // RFC 4861 section 4.6.2. // // The length, in bytes, of a valid NDP Prefix Information option body MUST be // ndpPrefixInformationLength bytes. type NDPPrefixInformation []byte // kind implements NDPOption. func (o NDPPrefixInformation) kind() ndpOptionIdentifier { return ndpPrefixInformationType } // length implements NDPOption. func (o NDPPrefixInformation) length() int { return ndpPrefixInformationLength } // serializeInto implements NDPOption. func (o NDPPrefixInformation) serializeInto(b []byte) int { used := copy(b, o) // Zero out the Reserved1 field. b[ndpPrefixInformationFlagsOffset] &^= ndpPrefixInformationReserved1FlagsMask // Zero out the Reserved2 field. reserved2 := b[ndpPrefixInformationReserved2Offset:][:ndpPrefixInformationReserved2Length] for i := range reserved2 { reserved2[i] = 0 } return used } // String implements fmt.Stringer. func (o NDPPrefixInformation) String() string { return fmt.Sprintf("%T(O=%t, A=%t, PL=%s, VL=%s, Prefix=%s)", o, o.OnLinkFlag(), o.AutonomousAddressConfigurationFlag(), o.PreferredLifetime(), o.ValidLifetime(), o.Subnet()) } // PrefixLength returns the number of leading bits in the Prefix that are // valid. // // Valid values are in the range [0, 128], but o may not always contain valid // values. It is up to the caller to validate the Prefix Information option. func (o NDPPrefixInformation) PrefixLength() uint8 { return o[ndpPrefixInformationPrefixLengthOffset] } // OnLinkFlag returns true if the prefix is considered on-link. On-link means // that a forwarding node is not needed to send packets to other nodes on the // same prefix. // // Note, when this function returns false, no statement is made about the // on-link property of a prefix. That is, if OnLinkFlag returns false, the // caller MUST NOT conclude that the prefix is off-link and MUST NOT update any // previously stored state for this prefix about its on-link status. func (o NDPPrefixInformation) OnLinkFlag() bool { return o[ndpPrefixInformationFlagsOffset]&ndpPrefixInformationOnLinkFlagMask != 0 } // AutonomousAddressConfigurationFlag returns true if the prefix can be used for // Stateless Address Auto-Configuration (as specified in RFC 4862). func (o NDPPrefixInformation) AutonomousAddressConfigurationFlag() bool { return o[ndpPrefixInformationFlagsOffset]&ndpPrefixInformationAutoAddrConfFlagMask != 0 } // ValidLifetime returns the length of time that the prefix is valid for the // purpose of on-link determination. This value is relative to the send time of // the packet that the Prefix Information option was present in. // // Note, a value of 0 implies the prefix should not be considered as on-link, // and a value of infinity/forever is represented by // NDPInfiniteLifetime.
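//
// As a hedged illustration of the encoding: a raw Valid Lifetime field of
// 0xffffffff decodes to NDPInfiniteLifetime, since
//
//	time.Second * time.Duration(math.MaxUint32) == NDPInfiniteLifetime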
func (o NDPPrefixInformation) ValidLifetime() time.Duration { // The field is the time in seconds, as per RFC 4861 section 4.6.2. return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpPrefixInformationValidLifetimeOffset:])) } // PreferredLifetime returns the length of time that an address generated from // the prefix via Stateless Address Auto-Configuration remains preferred. This // value is relative to the send time of the packet that the Prefix Information // option was present in. // // Note, a value of 0 implies that addresses generated from the prefix should // no longer remain preferred, and a value of infinity is represented by // NDPInfiniteLifetime. // // Also note that the value of this field MUST NOT exceed the Valid Lifetime // field to avoid preferring addresses that are no longer valid, for the // purpose of Stateless Address Auto-Configuration. func (o NDPPrefixInformation) PreferredLifetime() time.Duration { // The field is the time in seconds, as per RFC 4861 section 4.6.2. return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpPrefixInformationPreferredLifetimeOffset:])) } // Prefix returns an IPv6 address or a prefix of an IPv6 address. The Prefix // Length field (see NDPPrefixInformation.PrefixLength) contains the number // of valid leading bits in the prefix. // // Hosts SHOULD ignore an NDP Prefix Information option where the Prefix field // holds the link-local prefix (fe80::). func (o NDPPrefixInformation) Prefix() tcpip.Address { return tcpip.Address(o[ndpPrefixInformationPrefixOffset:][:IPv6AddressSize]) } // Subnet returns the Prefix field and Prefix Length field represented in a // tcpip.Subnet. func (o NDPPrefixInformation) Subnet() tcpip.Subnet { addrWithPrefix := tcpip.AddressWithPrefix{ Address: o.Prefix(), PrefixLen: int(o.PrefixLength()), } return addrWithPrefix.Subnet() } // NDPRecursiveDNSServer is the NDP Recursive DNS Server option, as defined by // RFC 8106 section 5.1. // // To make sure that the option meets its minimum length and does not end in the // middle of a DNS server's IPv6 address, the length of a valid // NDPRecursiveDNSServer must meet the following constraint: // (Length - ndpRecursiveDNSServerAddressesOffset) % IPv6AddressSize == 0 type NDPRecursiveDNSServer []byte // kind implements NDPOption. func (NDPRecursiveDNSServer) kind() ndpOptionIdentifier { return ndpRecursiveDNSServerOptionType } // length implements NDPOption. func (o NDPRecursiveDNSServer) length() int { return len(o) } // serializeInto implements NDPOption. func (o NDPRecursiveDNSServer) serializeInto(b []byte) int { used := copy(b, o) // Zero out the reserved bytes that are before the Lifetime field. for i := 0; i < ndpRecursiveDNSServerLifetimeOffset; i++ { b[i] = 0 } return used } // String implements fmt.Stringer. func (o NDPRecursiveDNSServer) String() string { lt := o.Lifetime() addrs, err := o.Addresses() if err != nil { return fmt.Sprintf("%T([] valid for %s; err = %s)", o, lt, err) } return fmt.Sprintf("%T(%s valid for %s)", o, addrs, lt) } // Lifetime returns the length of time that the DNS server addresses // in this option may be used for name resolution. // // Note, a value of 0 implies the addresses should no longer be used, // and a value of infinity/forever is represented by NDPInfiniteLifetime. // // Lifetime may panic if o does not have enough bytes to hold the Lifetime // field.
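//
// A hedged sketch of the option body layout consumed here (offsets are
// relative to the start of the body, after the Type and Length bytes):
//
//	[0:2] reserved
//	[2:6] Lifetime, in seconds, big endian
//	[6:]  one or more 16-byte IPv6 addresses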
func (o NDPRecursiveDNSServer) Lifetime() time.Duration { // The field is the time in seconds, as per RFC 8106 section 5.1. return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpRecursiveDNSServerLifetimeOffset:])) } // Addresses returns the recursive DNS server IPv6 addresses that may be // used for name resolution. // // Note, the addresses MAY be link-local addresses. func (o NDPRecursiveDNSServer) Addresses() ([]tcpip.Address, error) { var addrs []tcpip.Address return addrs, o.iterAddresses(func(addr tcpip.Address) { addrs = append(addrs, addr) }) } // checkAddresses iterates over the addresses in an NDP Recursive DNS Server // option and returns any error it encounters. func (o NDPRecursiveDNSServer) checkAddresses() error { return o.iterAddresses(nil) } // iterAddresses iterates over the addresses in an NDP Recursive DNS Server // option and calls a function with each valid unicast IPv6 address. // // Note, the addresses MAY be link-local addresses. func (o NDPRecursiveDNSServer) iterAddresses(fn func(tcpip.Address)) error { if l := len(o); l < minNDPRecursiveDNSServerBodySize { return fmt.Errorf("got %d bytes for NDP Recursive DNS Server option's body, expected at least %d bytes: %w", l, minNDPRecursiveDNSServerBodySize, io.ErrUnexpectedEOF) } o = o[ndpRecursiveDNSServerAddressesOffset:] l := len(o) if l%IPv6AddressSize != 0 { return fmt.Errorf("NDP Recursive DNS Server option's body ends in the middle of an IPv6 address (addresses body size = %d bytes): %w", l, ErrNDPOptMalformedBody) } for i := 0; len(o) != 0; i++ { addr := tcpip.Address(o[:IPv6AddressSize]) if !IsV6UnicastAddress(addr) { return fmt.Errorf("%d-th address (%s) in NDP Recursive DNS Server option is not a valid unicast IPv6 address: %w", i, addr, ErrNDPOptMalformedBody) } if fn != nil { fn(addr) } o = o[IPv6AddressSize:] } return nil } // NDPDNSSearchList is the NDP DNS Search List option, as defined by // RFC 8106 section 5.2. type NDPDNSSearchList []byte // kind implements NDPOption. func (o NDPDNSSearchList) kind() ndpOptionIdentifier { return ndpDNSSearchListOptionType } // length implements NDPOption. func (o NDPDNSSearchList) length() int { return len(o) } // serializeInto implements NDPOption. func (o NDPDNSSearchList) serializeInto(b []byte) int { used := copy(b, o) // Zero out the reserved bytes that are before the Lifetime field. for i := 0; i < ndpDNSSearchListLifetimeOffset; i++ { b[i] = 0 } return used } // String implements fmt.Stringer. func (o NDPDNSSearchList) String() string { lt := o.Lifetime() domainNames, err := o.DomainNames() if err != nil { return fmt.Sprintf("%T([] valid for %s; err = %s)", o, lt, err) } return fmt.Sprintf("%T(%s valid for %s)", o, domainNames, lt) } // Lifetime returns the length of time that the DNS search list of domain names // in this option may be used for name resolution. // // Note, a value of 0 implies the domain names should no longer be used, // and a value of infinity/forever is represented by NDPInfiniteLifetime. func (o NDPDNSSearchList) Lifetime() time.Duration { // The field is the time in seconds, as per RFC 8106 section 5.2. return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpDNSSearchListLifetimeOffset:])) } // DomainNames returns a DNS search list of domain names. // // DomainNames will parse the backing buffer as outlined by RFC 1035 section // 3.1 and return a list of strings, with all domain names in lower case.
func (o NDPDNSSearchList) DomainNames() ([]string, error) { var domainNames []string return domainNames, o.iterDomainNames(func(domainName string) { domainNames = append(domainNames, domainName) }) } // checkDomainNames iterates over the domain names in an NDP DNS Search List // option and returns any error it encounters. func (o NDPDNSSearchList) checkDomainNames() error { return o.iterDomainNames(nil) } // iterDomainNames iterates over the domain names in an NDP DNS Search List // option and calls a function with each valid domain name. func (o NDPDNSSearchList) iterDomainNames(fn func(string)) error { if l := len(o); l < minNDPDNSSearchListBodySize { return fmt.Errorf("got %d bytes for NDP DNS Search List option's body, expected at least %d bytes: %w", l, minNDPDNSSearchListBodySize, io.ErrUnexpectedEOF) } var searchList bytes.Reader searchList.Reset(o[ndpDNSSearchListDomainNamesOffset:]) var scratch [maxDomainNameLength]byte domainName := bytes.NewBuffer(scratch[:]) // Parse the domain names, as per RFC 1035 section 3.1. for searchList.Len() != 0 { domainName.Reset() // Parse a label within a domain name, as per RFC 1035 section 3.1. for { // The first byte is the label length. labelLenByte, err := searchList.ReadByte() if err != nil { if err != io.EOF { // ReadByte should only ever return nil or io.EOF. panic(fmt.Sprintf("unexpected error when reading a label's length: %s", err)) } // We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected // once we start parsing a domain name; we expect the buffer to contain // enough bytes for the whole domain name. return fmt.Errorf("unexpected exhausted buffer while parsing a new label for a domain from NDP Search List option: %w", io.ErrUnexpectedEOF) } labelLen := int(labelLenByte) // A zero-length label implies the end of a domain name. if labelLen == 0 { // If the domain name is empty or we have no callback function, do // nothing further with the current domain name. if domainName.Len() == 0 || fn == nil { break } // Ignore the trailing period in the parsed domain name. domainName.Truncate(domainName.Len() - 1) fn(domainName.String()) break } // The label's length must not exceed the maximum length for a label. if labelLen > maxDomainNameLabelLength { return fmt.Errorf("label length of %d bytes is greater than the max label length of %d bytes for an NDP Search List option: %w", labelLen, maxDomainNameLabelLength, ErrNDPOptMalformedBody) } // The label (and trailing period) must not make the domain name too long. if labelLen+1 > domainName.Cap()-domainName.Len() { return fmt.Errorf("label would make an NDP Search List option's domain name longer than the max domain name length of %d bytes: %w", maxDomainNameLength, ErrNDPOptMalformedBody) } // Copy the label and add a trailing period. 
for i := 0; i < labelLen; i++ { b, err := searchList.ReadByte() if err != nil { if err != io.EOF { panic(fmt.Sprintf("unexpected error when reading domain name's label: %s", err)) } return fmt.Errorf("read %d out of %d bytes for a domain name's label from NDP Search List option: %w", i, labelLen, io.ErrUnexpectedEOF) } // As per RFC 1035 section 2.3.1: // 1) the label must only contain ASCII letters, digits and // hyphens // 2) the first character in a label must be a letter // 3) the last character in a label must be a letter or digit if !isLetter(b) { if i == 0 { return fmt.Errorf("first character of a domain name's label in an NDP Search List option must be a letter, got character code = %d: %w", b, ErrNDPOptMalformedBody) } if b == '-' { if i == labelLen-1 { return fmt.Errorf("last character of a domain name's label in an NDP Search List option must not be a hyphen (-): %w", ErrNDPOptMalformedBody) } } else if !isDigit(b) { return fmt.Errorf("domain name's label in an NDP Search List option may only contain letters, digits and hyphens, got character code = %d: %w", b, ErrNDPOptMalformedBody) } } // If b is an upper case character, make it lower case. if isUpperLetter(b) { b = b - 'A' + 'a' } if err := domainName.WriteByte(b); err != nil { panic(fmt.Sprintf("unexpected error writing label to domain name buffer: %s", err)) } } if err := domainName.WriteByte('.'); err != nil { panic(fmt.Sprintf("unexpected error writing trailing period to domain name buffer: %s", err)) } } } return nil } func isLetter(b byte) bool { return b >= 'a' && b <= 'z' || isUpperLetter(b) } func isUpperLetter(b byte) bool { return b >= 'A' && b <= 'Z' } func isDigit(b byte) bool { return b >= '0' && b <= '9' } // As per RFC 4191 section 2.3, // // 2.3. Route Information Option // // 0 1 2 3 // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Type | Length | Prefix Length |Resvd|Prf|Resvd| // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Route Lifetime | // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // | Prefix (Variable Length) | // . . // . . // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ // // Fields: // // Type 24 // // // Length 8-bit unsigned integer. The length of the option // (including the Type and Length fields) in units of 8 // octets. The Length field is 1, 2, or 3 depending on the // Prefix Length. If Prefix Length is greater than 64, then // Length must be 3. If Prefix Length is greater than 0, // then Length must be 2 or 3. If Prefix Length is zero, // then Length must be 1, 2, or 3. const ( ndpRouteInformationType = ndpOptionIdentifier(24) ndpRouteInformationMaxLength = 22 ndpRouteInformationPrefixLengthIdx = 0 ndpRouteInformationFlagsIdx = 1 ndpRouteInformationPrfShift = 3 ndpRouteInformationPrfMask = 3 << ndpRouteInformationPrfShift ndpRouteInformationRouteLifetimeIdx = 2 ndpRouteInformationRoutePrefixIdx = 6 ) // NDPRouteInformation is the NDP Route Information option, as defined by // RFC 4191 section 2.3. type NDPRouteInformation []byte func (NDPRouteInformation) kind() ndpOptionIdentifier { return ndpRouteInformationType } func (o NDPRouteInformation) length() int { return len(o) } func (o NDPRouteInformation) serializeInto(b []byte) int { return copy(b, o) } // String implements fmt.Stringer. func (o NDPRouteInformation) String() string { return fmt.Sprintf("%T", o) } // PrefixLength returns the length of the prefix.
func (o NDPRouteInformation) PrefixLength() uint8 { return o[ndpRouteInformationPrefixLengthIdx] } // RoutePreference returns the preference of the route over other routes to the // same destination but through a different router. func (o NDPRouteInformation) RoutePreference() NDPRoutePreference { return NDPRoutePreference((o[ndpRouteInformationFlagsIdx] & ndpRouteInformationPrfMask) >> ndpRouteInformationPrfShift) } // RouteLifetime returns the lifetime of the route. // // Note, a value of 0 implies the route is now invalid and a value of // infinity/forever is represented by NDPInfiniteLifetime. func (o NDPRouteInformation) RouteLifetime() time.Duration { return time.Second * time.Duration(binary.BigEndian.Uint32(o[ndpRouteInformationRouteLifetimeIdx:])) } // Prefix returns the prefix of the destination subnet this route is for. func (o NDPRouteInformation) Prefix() (tcpip.Subnet, error) { prefixLength := int(o.PrefixLength()) if max := IPv6AddressSize * 8; prefixLength > max { return tcpip.Subnet{}, fmt.Errorf("got prefix length = %d, want <= %d", prefixLength, max) } prefix := o[ndpRouteInformationRoutePrefixIdx:] var addrBytes [IPv6AddressSize]byte if n := copy(addrBytes[:], prefix); n != len(prefix) { panic(fmt.Sprintf("got copy(addrBytes, prefix) = %d, want = %d", n, len(prefix))) } return tcpip.AddressWithPrefix{ Address: tcpip.Address(addrBytes[:]), PrefixLen: prefixLength, }.Subnet(), nil } func (o NDPRouteInformation) hasError() error { l := len(o) if l < ndpRouteInformationRoutePrefixIdx { return fmt.Errorf("%T too small, got = %d bytes: %w", o, l, ErrNDPOptMalformedBody) } prefixLength := int(o.PrefixLength()) if max := IPv6AddressSize * 8; prefixLength > max { return fmt.Errorf("got prefix length = %d, want <= %d: %w", prefixLength, max, ErrNDPOptMalformedBody) } // Length 8-bit unsigned integer. The length of the option // (including the Type and Length fields) in units of 8 // octets. The Length field is 1, 2, or 3 depending on the // Prefix Length. If Prefix Length is greater than 64, then // Length must be 3. If Prefix Length is greater than 0, // then Length must be 2 or 3. If Prefix Length is zero, // then Length must be 1, 2, or 3. l += 2 // Add 2 bytes for the type and length bytes. lengthField := l / lengthByteUnits if prefixLength > 64 { if lengthField != 3 { return fmt.Errorf("Length field must be 3 when Prefix Length (%d) is > 64 (got = %d): %w", prefixLength, lengthField, ErrNDPOptMalformedBody) } } else if prefixLength > 0 { if lengthField != 2 && lengthField != 3 { return fmt.Errorf("Length field must be 2 or 3 when Prefix Length (%d) is between 0 and 64 (got = %d): %w", prefixLength, lengthField, ErrNDPOptMalformedBody) } } else if lengthField == 0 || lengthField > 3 { return fmt.Errorf("Length field must be 1, 2, or 3 when Prefix Length is zero (got = %d): %w", lengthField, ErrNDPOptMalformedBody) } return nil }
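// The Length-field rules enforced by hasError above can be exercised in
// isolation. The following is a minimal, self-contained sketch of the
// RFC 4191 check; routeInfoLengthValid is an illustrative helper, not part
// of this package.
package main

import "fmt"

// routeInfoLengthValid reports whether a Route Information option's Length
// field (in units of 8 octets, covering the Type and Length bytes) is
// consistent with its Prefix Length field, per RFC 4191 section 2.3.
func routeInfoLengthValid(prefixLength uint8, lengthField int) bool {
	switch {
	case prefixLength > 128:
		// An IPv6 prefix cannot be longer than 128 bits.
		return false
	case prefixLength > 64:
		// The full 16-byte Prefix field is needed.
		return lengthField == 3
	case prefixLength > 0:
		return lengthField == 2 || lengthField == 3
	default:
		return lengthField >= 1 && lengthField <= 3
	}
}

func main() {
	// A /96 prefix requires Length = 3: 8 bytes of header and lifetime plus
	// 16 bytes of prefix is 24 bytes, i.e. 3 units of 8 octets.
	fmt.Println(routeInfoLengthValid(96, 3)) // true
	fmt.Println(routeInfoLengthValid(96, 2)) // false
}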
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "bytes" "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fs/proc/seqfile" "gvisor.dev/gvisor/pkg/sentry/memmap" ) const ( // devMinorBits is the number of minor bits in a device number. Linux: // include/linux/kdev_t.h:MINORBITS devMinorBits = 20 vsyscallEnd = hostarch.Addr(0xffffffffff601000) vsyscallMapsEntry = "ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n" vsyscallSmapsEntry = vsyscallMapsEntry + "Size: 4 kB\n" + "Rss: 0 kB\n" + "Pss: 0 kB\n" + "Shared_Clean: 0 kB\n" + "Shared_Dirty: 0 kB\n" + "Private_Clean: 0 kB\n" + "Private_Dirty: 0 kB\n" + "Referenced: 0 kB\n" + "Anonymous: 0 kB\n" + "AnonHugePages: 0 kB\n" + "Shared_Hugetlb: 0 kB\n" + "Private_Hugetlb: 0 kB\n" + "Swap: 0 kB\n" + "SwapPss: 0 kB\n" + "KernelPageSize: 4 kB\n" + "MMUPageSize: 4 kB\n" + "Locked: 0 kB\n" + "VmFlags: rd ex \n" ) // NeedsUpdate implements seqfile.SeqSource.NeedsUpdate. func (mm *MemoryManager) NeedsUpdate(generation int64) bool { return true } // ReadMapsDataInto is called by fsimpl/proc.mapsData.Generate to // implement /proc/[pid]/maps. func (mm *MemoryManager) ReadMapsDataInto(ctx context.Context, buf *bytes.Buffer) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() var start hostarch.Addr for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { mm.appendVMAMapsEntryLocked(ctx, vseg, buf) } // We always emulate vsyscall, so advertise it here. Everything about a // vsyscall region is static, so just hard code the maps entry since we // don't have a real vma backing it.
The vsyscall region is at the end of // the virtual address space so nothing should be mapped after it (if // something is really mapped in the tiny ~10 MiB segment afterwards, we'll // get the sorting on the maps file wrong at worst; but that's not possible // on any current platform). // // Artificially adjust the seqfile handle so we only output the vsyscall entry once. if start != vsyscallEnd { buf.WriteString(vsyscallMapsEntry) } } // ReadMapsSeqFileData is called by fs/proc.mapsData.ReadSeqFileData to // implement /proc/[pid]/maps. func (mm *MemoryManager) ReadMapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() var data []seqfile.SeqData var start hostarch.Addr if handle != nil { start = *handle.(*hostarch.Addr) } for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { vmaAddr := vseg.End() data = append(data, seqfile.SeqData{ Buf: mm.vmaMapsEntryLocked(ctx, vseg), Handle: &vmaAddr, }) } // We always emulate vsyscall, so advertise it here. Everything about a // vsyscall region is static, so just hard code the maps entry since we // don't have a real vma backing it. The vsyscall region is at the end of // the virtual address space so nothing should be mapped after it (if // something is really mapped in the tiny ~10 MiB segment afterwards, we'll // get the sorting on the maps file wrong at worst; but that's not possible // on any current platform). // // Artificially adjust the seqfile handle so we only output the vsyscall entry once. if start != vsyscallEnd { vmaAddr := vsyscallEnd data = append(data, seqfile.SeqData{ Buf: []byte(vsyscallMapsEntry), Handle: &vmaAddr, }) } return data, 1 } // vmaMapsEntryLocked returns a /proc/[pid]/maps entry for the vma iterated by // vseg, including the trailing newline. // // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) vmaMapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { var b bytes.Buffer mm.appendVMAMapsEntryLocked(ctx, vseg, &b) return b.Bytes() } // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) appendVMAMapsEntryLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) { vma := vseg.ValuePtr() private := "p" if !vma.private { private = "s" } var dev, ino uint64 if vma.id != nil { dev = vma.id.DeviceID() ino = vma.id.InodeID() } devMajor := uint32(dev >> devMinorBits) devMinor := uint32(dev & ((1 << devMinorBits) - 1)) // Do not include the guard page: fs/proc/task_mmu.c:show_map_vma() => // stack_guard_page_start(). lineLen, _ := fmt.Fprintf(b, "%08x-%08x %s%s %08x %02x:%02x %d ", vseg.Start(), vseg.End(), vma.realPerms, private, vma.off, devMajor, devMinor, ino) // Figure out our filename or hint. var s string if vma.hint != "" { s = vma.hint } else if vma.id != nil { // FIXME(jamieliu): We are holding mm.mappingMu here, which is // consistent with Linux's holding mmap_sem in // fs/proc/task_mmu.c:show_map_vma() => fs/seq_file.c:seq_file_path(). // However, it's not clear that fs.File.MappedName() is actually // consistent with this lock order. s = vma.id.MappedName(ctx) } if s != "" { // Per Linux, we pad until the 74th character. for pad := 73 - lineLen; pad > 0; pad-- { b.WriteByte(' ') } b.WriteString(s) } b.WriteByte('\n') } // ReadSmapsDataInto is called by fsimpl/proc.smapsData.Generate to // implement /proc/[pid]/smaps.
func (mm *MemoryManager) ReadSmapsDataInto(ctx context.Context, buf *bytes.Buffer) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() var start hostarch.Addr for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { mm.vmaSmapsEntryIntoLocked(ctx, vseg, buf) } // We always emulate vsyscall, so advertise it here. See // ReadMapsSeqFileData for additional commentary. if start != vsyscallEnd { buf.WriteString(vsyscallSmapsEntry) } } // ReadSmapsSeqFileData is called by fs/proc.smapsData.ReadSeqFileData to // implement /proc/[pid]/smaps. func (mm *MemoryManager) ReadSmapsSeqFileData(ctx context.Context, handle seqfile.SeqHandle) ([]seqfile.SeqData, int64) { mm.mappingMu.RLock() defer mm.mappingMu.RUnlock() var data []seqfile.SeqData var start hostarch.Addr if handle != nil { start = *handle.(*hostarch.Addr) } for vseg := mm.vmas.LowerBoundSegment(start); vseg.Ok(); vseg = vseg.NextSegment() { vmaAddr := vseg.End() data = append(data, seqfile.SeqData{ Buf: mm.vmaSmapsEntryLocked(ctx, vseg), Handle: &vmaAddr, }) } // We always emulate vsyscall, so advertise it here. See // ReadMapsSeqFileData for additional commentary. if start != vsyscallEnd { vmaAddr := vsyscallEnd data = append(data, seqfile.SeqData{ Buf: []byte(vsyscallSmapsEntry), Handle: &vmaAddr, }) } return data, 1 } // vmaSmapsEntryLocked returns a /proc/[pid]/smaps entry for the vma iterated // by vseg, including the trailing newline. // // Preconditions: mm.mappingMu must be locked. func (mm *MemoryManager) vmaSmapsEntryLocked(ctx context.Context, vseg vmaIterator) []byte { var b bytes.Buffer mm.vmaSmapsEntryIntoLocked(ctx, vseg, &b) return b.Bytes() } func (mm *MemoryManager) vmaSmapsEntryIntoLocked(ctx context.Context, vseg vmaIterator, b *bytes.Buffer) { mm.appendVMAMapsEntryLocked(ctx, vseg, b) vma := vseg.ValuePtr() // We take mm.activeMu here in each call to vmaSmapsEntryLocked, instead of // requiring it to be locked as a precondition, to reduce the latency // impact of reading /proc/[pid]/smaps on concurrent performance-sensitive // operations requiring activeMu for writing like faults. mm.activeMu.RLock() var rss uint64 var anon uint64 vsegAR := vseg.Range() for pseg := mm.pmas.LowerBoundSegment(vsegAR.Start); pseg.Ok() && pseg.Start() < vsegAR.End; pseg = pseg.NextSegment() { psegAR := pseg.Range().Intersect(vsegAR) size := uint64(psegAR.Length()) rss += size if pseg.ValuePtr().private { anon += size } } mm.activeMu.RUnlock() fmt.Fprintf(b, "Size: %8d kB\n", vseg.Range().Length()/1024) fmt.Fprintf(b, "Rss: %8d kB\n", rss/1024) // Currently we report PSS = RSS, i.e. we pretend each page mapped by a pma // is only mapped by that pma. This avoids having to query memmap.Mappables // for reference count information on each page. As a corollary, all pages // are accounted as "private" whether or not the vma is private; compare // Linux's fs/proc/task_mmu.c:smaps_account(). fmt.Fprintf(b, "Pss: %8d kB\n", rss/1024) fmt.Fprintf(b, "Shared_Clean: %8d kB\n", 0) fmt.Fprintf(b, "Shared_Dirty: %8d kB\n", 0) // Pretend that all pages are dirty if the vma is writable, and clean otherwise. clean := rss if vma.effectivePerms.Write { clean = 0 } fmt.Fprintf(b, "Private_Clean: %8d kB\n", clean/1024) fmt.Fprintf(b, "Private_Dirty: %8d kB\n", (rss-clean)/1024) // Pretend that all pages are "referenced" (recently touched). fmt.Fprintf(b, "Referenced: %8d kB\n", rss/1024) fmt.Fprintf(b, "Anonymous: %8d kB\n", anon/1024) // Hugepages (hugetlb and THP) are not implemented. 
fmt.Fprintf(b, "AnonHugePages: %8d kB\n", 0) fmt.Fprintf(b, "Shared_Hugetlb: %8d kB\n", 0) fmt.Fprintf(b, "Private_Hugetlb: %7d kB\n", 0) // Swap is not implemented. fmt.Fprintf(b, "Swap: %8d kB\n", 0) fmt.Fprintf(b, "SwapPss: %8d kB\n", 0) fmt.Fprintf(b, "KernelPageSize: %8d kB\n", hostarch.PageSize/1024) fmt.Fprintf(b, "MMUPageSize: %8d kB\n", hostarch.PageSize/1024) locked := rss if vma.mlockMode == memmap.MLockNone { locked = 0 } fmt.Fprintf(b, "Locked: %8d kB\n", locked/1024) b.WriteString("VmFlags: ") if vma.realPerms.Read { b.WriteString("rd ") } if vma.realPerms.Write { b.WriteString("wr ") } if vma.realPerms.Execute { b.WriteString("ex ") } if vma.canWriteMappableLocked() { // VM_SHARED b.WriteString("sh ") } if vma.maxPerms.Read { b.WriteString("mr ") } if vma.maxPerms.Write { b.WriteString("mw ") } if vma.maxPerms.Execute { b.WriteString("me ") } if !vma.private { // VM_MAYSHARE b.WriteString("ms ") } if vma.growsDown { b.WriteString("gd ") } if vma.mlockMode != memmap.MLockNone { // VM_LOCKED b.WriteString("lo ") } if vma.mlockMode == memmap.MLockLazy { // VM_LOCKONFAULT b.WriteString("?? ") // no explicit encoding in fs/proc/task_mmu.c:show_smap_vma_flags() } if vma.private && vma.effectivePerms.Write { // VM_ACCOUNT b.WriteString("ac ") } b.WriteString("\n") }
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // +build amd64 package arch import ( "math" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch/fpu" ) // SignalContext64 is equivalent to struct sigcontext, the type passed as the // second argument to signal handlers set by signal(2). // // +marshal type SignalContext64 struct { R8 uint64 R9 uint64 R10 uint64 R11 uint64 R12 uint64 R13 uint64 R14 uint64 R15 uint64 Rdi uint64 Rsi uint64 Rbp uint64 Rbx uint64 Rdx uint64 Rax uint64 Rcx uint64 Rsp uint64 Rip uint64 Eflags uint64 Cs uint16 Gs uint16 // always 0 on amd64. Fs uint16 // always 0 on amd64. Ss uint16 // only restored if _UC_STRICT_RESTORE_SS (unsupported). Err uint64 Trapno uint64 Oldmask linux.SignalSet Cr2 uint64 // Pointer to a struct _fpstate. See b/33003106#comment8. Fpstate uint64 Reserved [8]uint64 } // Flags for UContext64.Flags. const ( _UC_FP_XSTATE = 1 _UC_SIGCONTEXT_SS = 2 _UC_STRICT_RESTORE_SS = 4 ) // UContext64 is equivalent to ucontext_t on 64-bit x86. // // +marshal type UContext64 struct { Flags uint64 Link uint64 Stack linux.SignalStack MContext SignalContext64 Sigset linux.SignalSet } // From Linux 'arch/x86/include/uapi/asm/sigcontext.h' the following is the // size of the magic cookie at the end of the xsave frame. // // NOTE(b/33003106#comment11): Currently we don't actually populate the fpstate // on the signal stack. const _FP_XSTATE_MAGIC2_SIZE = 4 func (c *context64) fpuFrameSize() (size int, useXsave bool) { size = len(c.fpState) if size > 512 { // Make room for the magic cookie at the end of the xsave frame. size += _FP_XSTATE_MAGIC2_SIZE useXsave = true } return size, useXsave } // SignalSetup implements Context.SignalSetup. (Compare to Linux's // arch/x86/kernel/signal.c:__setup_rt_frame().)
func (c *context64) SignalSetup(st *Stack, act *linux.SigAction, info *linux.SignalInfo, alt *linux.SignalStack, sigset linux.SignalSet) error { sp := st.Bottom // "The 128-byte area beyond the location pointed to by %rsp is considered // to be reserved and shall not be modified by signal or interrupt // handlers. ... leaf functions may use this area for their entire stack // frame, rather than adjusting the stack pointer in the prologue and // epilogue." - AMD64 ABI // // (But this doesn't apply if we're starting at the top of the signal // stack, in which case there is no following stack frame.) if !(alt.IsEnabled() && sp == alt.Top()) { sp -= 128 } // Allocate space for floating point state on the stack. // // This isn't strictly necessary because we don't actually populate // the fpstate. However we do store the floating point state of the // interrupted thread inside the sentry. Simply accounting for this // space on the user stack naturally caps the amount of memory the // sentry will allocate for this purpose. fpSize, _ := c.fpuFrameSize() sp = (sp - hostarch.Addr(fpSize)) & ^hostarch.Addr(63) // Construct the UContext64 now since we need its size. uc := &UContext64{ // No _UC_FP_XSTATE: see Fpstate above. // No _UC_STRICT_RESTORE_SS: we don't allow SS changes. Flags: _UC_SIGCONTEXT_SS, Stack: *alt, MContext: SignalContext64{ R8: c.Regs.R8, R9: c.Regs.R9, R10: c.Regs.R10, R11: c.Regs.R11, R12: c.Regs.R12, R13: c.Regs.R13, R14: c.Regs.R14, R15: c.Regs.R15, Rdi: c.Regs.Rdi, Rsi: c.Regs.Rsi, Rbp: c.Regs.Rbp, Rbx: c.Regs.Rbx, Rdx: c.Regs.Rdx, Rax: c.Regs.Rax, Rcx: c.Regs.Rcx, Rsp: c.Regs.Rsp, Rip: c.Regs.Rip, Eflags: c.Regs.Eflags, Cs: uint16(c.Regs.Cs), Ss: uint16(c.Regs.Ss), Oldmask: sigset, }, Sigset: sigset, } // TODO(gvisor.dev/issue/159): Set SignalContext64.Err, Trapno, and Cr2 // based on the fault that caused the signal. For now, leave Err and // Trapno unset and assume CR2 == info.Addr() for SIGSEGVs and // SIGBUSes. if linux.Signal(info.Signo) == linux.SIGSEGV || linux.Signal(info.Signo) == linux.SIGBUS { uc.MContext.Cr2 = info.Addr() } // "... the value (%rsp+8) is always a multiple of 16 (...) when // control is transferred to the function entry point." - AMD64 ABI ucSize := uc.SizeBytes() // st.Arch.Width() is for the restorer address. sizeof(siginfo) == 128. frameSize := int(st.Arch.Width()) + ucSize + 128 frameBottom := (sp-hostarch.Addr(frameSize)) & ^hostarch.Addr(15) - 8 sp = frameBottom + hostarch.Addr(frameSize) st.Bottom = sp // Prior to proceeding, figure out if the frame will exhaust the range // for the signal stack. This is not allowed, and should immediately // force signal delivery (reverting to the default handler). if act.Flags&linux.SA_ONSTACK != 0 && alt.IsEnabled() && !alt.Contains(frameBottom) { return unix.EFAULT } // Adjust the code. info.FixSignalCodeForUser() // Set up the stack frame. if _, err := info.CopyOut(st, StackBottomMagic); err != nil { return err } infoAddr := st.Bottom if _, err := uc.CopyOut(st, StackBottomMagic); err != nil { return err } ucAddr := st.Bottom if act.Flags&linux.SA_RESTORER != 0 { // Push the restorer return address. // Note that this doesn't need to be popped. if _, err := primitive.CopyUint64Out(st, StackBottomMagic, act.Restorer); err != nil { return err } } else { // amd64 requires a restorer. return unix.EFAULT } // Set up registers. 
c.Regs.Rip = act.Handler c.Regs.Rsp = uint64(st.Bottom) c.Regs.Rdi = uint64(info.Signo) c.Regs.Rsi = uint64(infoAddr) c.Regs.Rdx = uint64(ucAddr) c.Regs.Rax = 0 c.Regs.Ds = userDS c.Regs.Es = userDS c.Regs.Cs = userCS c.Regs.Ss = userDS // Save the thread's floating point state. c.sigFPState = append(c.sigFPState, c.fpState) // Signal handler gets a clean floating point state. c.fpState = fpu.NewState() return nil } // SignalRestore implements Context.SignalRestore. (Compare to Linux's // arch/x86/kernel/signal.c:sys_rt_sigreturn().) func (c *context64) SignalRestore(st *Stack, rt bool) (linux.SignalSet, linux.SignalStack, error) { // Copy out the stack frame. var uc UContext64 if _, err := uc.CopyIn(st, StackBottomMagic); err != nil { return 0, linux.SignalStack{}, err } var info linux.SignalInfo if _, err := info.CopyIn(st, StackBottomMagic); err != nil { return 0, linux.SignalStack{}, err } // Restore registers. c.Regs.R8 = uc.MContext.R8 c.Regs.R9 = uc.MContext.R9 c.Regs.R10 = uc.MContext.R10 c.Regs.R11 = uc.MContext.R11 c.Regs.R12 = uc.MContext.R12 c.Regs.R13 = uc.MContext.R13 c.Regs.R14 = uc.MContext.R14 c.Regs.R15 = uc.MContext.R15 c.Regs.Rdi = uc.MContext.Rdi c.Regs.Rsi = uc.MContext.Rsi c.Regs.Rbp = uc.MContext.Rbp c.Regs.Rbx = uc.MContext.Rbx c.Regs.Rdx = uc.MContext.Rdx c.Regs.Rax = uc.MContext.Rax c.Regs.Rcx = uc.MContext.Rcx c.Regs.Rsp = uc.MContext.Rsp c.Regs.Rip = uc.MContext.Rip c.Regs.Eflags = (c.Regs.Eflags & ^eflagsRestorable) | (uc.MContext.Eflags & eflagsRestorable) c.Regs.Cs = uint64(uc.MContext.Cs) | 3 // N.B. _UC_STRICT_RESTORE_SS not supported. c.Regs.Orig_rax = math.MaxUint64 // Restore floating point state. l := len(c.sigFPState) if l > 0 { c.fpState = c.sigFPState[l-1] // NOTE(cl/133042258): State save requires that any slice // elements from '[len:cap]' be the zero value. c.sigFPState[l-1] = nil c.sigFPState = c.sigFPState[0 : l-1] } else { // This might happen if sigreturn(2) calls are unbalanced with // respect to signal handler entries. This is not expected so // don't bother to do anything fancy with the floating point // state. log.Infof("sigreturn unable to restore application fpstate") } return uc.Sigset, uc.Stack, nil }
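// The stack arithmetic in SignalSetup above follows from the AMD64 ABI rule
// that %rsp+8 is a multiple of 16 at function entry: the frame bottom is
// rounded down to a 16-byte boundary and then offset by 8, so once the frame
// is pushed the handler starts with a correctly aligned stack. Below is a
// standalone sketch of just that calculation; the values are illustrative.
package main

import "fmt"

// alignSignalFrame returns where a frame of frameSize bytes would end
// (frameBottom) when pushed below sp with the alignment used above.
func alignSignalFrame(sp, frameSize uint64) (frameBottom uint64) {
	return (sp-frameSize)&^15 - 8
}

func main() {
	frameBottom := alignSignalFrame(0x7fffffffe000, 0x5d8)
	// The handler's entry %rsp is frameBottom, so %rsp+8 is 16-byte aligned.
	fmt.Printf("frameBottom=%#x, (frameBottom+8)%%16=%d\n", frameBottom, (frameBottom+8)%16)
}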
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernfs import ( "fmt" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) // InodeNoopRefCount partially implements the Inode interface, specifically the // inodeRefs sub interface. InodeNoopRefCount implements a simple reference // count for inodes, performing no extra actions when references are obtained or // released. This is suitable for simple file inodes that don't reference any // resources. // // +stateify savable type InodeNoopRefCount struct { InodeTemporary } // IncRef implements Inode.IncRef. func (InodeNoopRefCount) IncRef() { } // DecRef implements Inode.DecRef. func (InodeNoopRefCount) DecRef(context.Context) { } // TryIncRef implements Inode.TryIncRef. func (InodeNoopRefCount) TryIncRef() bool { return true } // InodeDirectoryNoNewChildren partially implements the Inode interface. // InodeDirectoryNoNewChildren represents a directory inode which does not // support creation of new children. // // +stateify savable type InodeDirectoryNoNewChildren struct{} // NewFile implements Inode.NewFile. func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) { return nil, linuxerr.EPERM } // NewDir implements Inode.NewDir. func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) { return nil, linuxerr.EPERM } // NewLink implements Inode.NewLink. func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (Inode, error) { return nil, linuxerr.EPERM } // NewSymlink implements Inode.NewSymlink. func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (Inode, error) { return nil, linuxerr.EPERM } // NewNode implements Inode.NewNode. func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) { return nil, linuxerr.EPERM } // InodeNotDirectory partially implements the Inode interface, specifically the // inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not // represent directories can embed this to provide no-op implementations for // directory-related functions. // // +stateify savable type InodeNotDirectory struct { InodeAlwaysValid } // HasChildren implements Inode.HasChildren. func (InodeNotDirectory) HasChildren() bool { return false } // NewFile implements Inode.NewFile. func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) { panic("NewFile called on non-directory inode") } // NewDir implements Inode.NewDir. func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) { panic("NewDir called on non-directory inode") } // NewLink implements Inode.NewLink. func (InodeNotDirectory) NewLink(context.Context, string, Inode) (Inode, error) { panic("NewLink called on non-directory inode") } // NewSymlink implements Inode.NewSymlink.
func (InodeNotDirectory) NewSymlink(context.Context, string, string) (Inode, error) { panic("NewSymlink called on non-directory inode") } // NewNode implements Inode.NewNode. func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) { panic("NewNode called on non-directory inode") } // Unlink implements Inode.Unlink. func (InodeNotDirectory) Unlink(context.Context, string, Inode) error { panic("Unlink called on non-directory inode") } // RmDir implements Inode.RmDir. func (InodeNotDirectory) RmDir(context.Context, string, Inode) error { panic("RmDir called on non-directory inode") } // Rename implements Inode.Rename. func (InodeNotDirectory) Rename(context.Context, string, string, Inode, Inode) error { panic("Rename called on non-directory inode") } // Lookup implements Inode.Lookup. func (InodeNotDirectory) Lookup(ctx context.Context, name string) (Inode, error) { panic("Lookup called on non-directory inode") } // IterDirents implements Inode.IterDirents. func (InodeNotDirectory) IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { panic("IterDirents called on non-directory inode") } // InodeNotSymlink partially implements the Inode interface, specifically the // inodeSymlink sub interface. All inodes that are not symlinks may embed this // to return the appropriate errors from symlink-related functions. // // +stateify savable type InodeNotSymlink struct{} // Readlink implements Inode.Readlink. func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) { return "", linuxerr.EINVAL } // Getlink implements Inode.Getlink. func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) { return vfs.VirtualDentry{}, "", linuxerr.EINVAL } // InodeAttrs partially implements the Inode interface, specifically the // inodeMetadata sub interface. InodeAttrs provides functionality related to // inode attributes. // // Must be initialized by Init prior to first use. // // +stateify savable type InodeAttrs struct { devMajor uint32 devMinor uint32 ino uint64 mode uint32 uid uint32 gid uint32 nlink uint32 blockSize uint32 // Timestamps, all nsecs from the Unix epoch. atime int64 mtime int64 ctime int64 } // Init initializes this InodeAttrs. func (a *InodeAttrs) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) { if mode.FileType() == 0 { panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode)) } nlink := uint32(1) if mode.FileType() == linux.ModeDirectory { nlink = 2 } a.devMajor = devMajor a.devMinor = devMinor atomic.StoreUint64(&a.ino, ino) atomic.StoreUint32(&a.mode, uint32(mode)) atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID)) atomic.StoreUint32(&a.gid, uint32(creds.EffectiveKGID)) atomic.StoreUint32(&a.nlink, nlink) atomic.StoreUint32(&a.blockSize, hostarch.PageSize) now := ktime.NowFromContext(ctx).Nanoseconds() atomic.StoreInt64(&a.atime, now) atomic.StoreInt64(&a.mtime, now) atomic.StoreInt64(&a.ctime, now) } // DevMajor returns the device major number. func (a *InodeAttrs) DevMajor() uint32 { return a.devMajor } // DevMinor returns the device minor number. func (a *InodeAttrs) DevMinor() uint32 { return a.devMinor } // Ino returns the inode id. func (a *InodeAttrs) Ino() uint64 { return atomic.LoadUint64(&a.ino) } // Mode implements Inode.Mode. 
func (a *InodeAttrs) Mode() linux.FileMode { return linux.FileMode(atomic.LoadUint32(&a.mode)) } // TouchAtime updates a.atime to the current time. func (a *InodeAttrs) TouchAtime(ctx context.Context, mnt *vfs.Mount) { if mnt.Flags.NoATime || mnt.ReadOnly() { return } if err := mnt.CheckBeginWrite(); err != nil { return } atomic.StoreInt64(&a.atime, ktime.NowFromContext(ctx).Nanoseconds()) mnt.EndWrite() } // TouchCMtime updates a.{c/m}time to the current time. The caller should // synchronize calls to this so that ctime and mtime are updated to the same // value. func (a *InodeAttrs) TouchCMtime(ctx context.Context) { now := ktime.NowFromContext(ctx).Nanoseconds() atomic.StoreInt64(&a.mtime, now) atomic.StoreInt64(&a.ctime, now) } // Stat partially implements Inode.Stat. Note that this function doesn't provide // all the stat fields, and the embedder should consider extending the result // with filesystem-specific fields. func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME stat.DevMajor = a.devMajor stat.DevMinor = a.devMinor stat.Ino = atomic.LoadUint64(&a.ino) stat.Mode = uint16(a.Mode()) stat.UID = atomic.LoadUint32(&a.uid) stat.GID = atomic.LoadUint32(&a.gid) stat.Nlink = atomic.LoadUint32(&a.nlink) stat.Blksize = atomic.LoadUint32(&a.blockSize) stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.atime)) stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.mtime)) stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.ctime)) return stat, nil } // SetStat implements Inode.SetStat. func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask == 0 { return nil } // Note that not all fields are modifiable. For example, the file type and // inode numbers are immutable after node creation. Setting the size is often // allowed by kernfs files but does not do anything. If some other behavior is // needed, the embedder should consider extending SetStat. if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 { return linuxerr.EPERM } if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() { return syserror.EISDIR } if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil { return err } clearSID := false stat := opts.Stat if stat.Mask&linux.STATX_UID != 0 { atomic.StoreUint32(&a.uid, stat.UID) clearSID = true } if stat.Mask&linux.STATX_GID != 0 { atomic.StoreUint32(&a.gid, stat.GID) clearSID = true } if stat.Mask&linux.STATX_MODE != 0 { for { old := atomic.LoadUint32(&a.mode) ft := old & linux.S_IFMT newMode := ft | uint32(stat.Mode & ^uint16(linux.S_IFMT)) if clearSID { newMode = vfs.ClearSUIDAndSGID(newMode) } if swapped := atomic.CompareAndSwapUint32(&a.mode, old, newMode); swapped { clearSID = false break } } } // We may have to clear the SUID/SGID bits, but didn't do so as part of // STATX_MODE. 
if clearSID { for { old := atomic.LoadUint32(&a.mode) newMode := vfs.ClearSUIDAndSGID(old) if swapped := atomic.CompareAndSwapUint32(&a.mode, old, newMode); swapped { break } } } now := ktime.NowFromContext(ctx).Nanoseconds() if stat.Mask&linux.STATX_ATIME != 0 { if stat.Atime.Nsec == linux.UTIME_NOW { stat.Atime = linux.NsecToStatxTimestamp(now) } atomic.StoreInt64(&a.atime, stat.Atime.ToNsec()) } if stat.Mask&linux.STATX_MTIME != 0 { if stat.Mtime.Nsec == linux.UTIME_NOW { stat.Mtime = linux.NsecToStatxTimestamp(now) } atomic.StoreInt64(&a.mtime, stat.Mtime.ToNsec()) } return nil } // CheckPermissions implements Inode.CheckPermissions. func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { return vfs.GenericCheckPermissions( creds, ats, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid)), ) } // IncLinks implements Inode.IncLinks. func (a *InodeAttrs) IncLinks(n uint32) { if atomic.AddUint32(&a.nlink, n) <= n { panic("InodeLink.IncLinks called with no existing links") } } // DecLinks implements Inode.DecLinks. func (a *InodeAttrs) DecLinks() { if nlink := atomic.AddUint32(&a.nlink, ^uint32(0)); nlink == ^uint32(0) { // Negative overflow panic("Inode.DecLinks called at 0 links") } } // +stateify savable type slot struct { name string inode Inode static bool slotEntry } // OrderedChildrenOptions contains initialization options for OrderedChildren. // // +stateify savable type OrderedChildrenOptions struct { // Writable indicates whether vfs.FilesystemImpl methods implemented by // OrderedChildren may modify the tracked children. This applies to // operations related to rename, unlink and rmdir. If an OrderedChildren is // not writable, these operations all fail with EPERM. // // Note that writable users must implement the sticky bit (I_SVTX). Writable bool } // OrderedChildren partially implements the Inode interface. OrderedChildren can // be embedded in directory inodes to keep track of children in the // directory, and can then be used to implement a generic directory FD -- see // GenericDirectoryFD. // // OrderedChildren can represent a node in an Inode tree. The children inodes // might be directories themselves using OrderedChildren; thus extending the // tree. The parent inode (OrderedChildren user) holds a ref on all its static // children. This lets the static inodes outlive their associated dentry. // While the dentry might have to be regenerated via a Lookup() call, we can // keep reusing the same static inode. These static children inodes are finally // DecRef'd when this directory inode is being destroyed. This makes // OrderedChildren suitable for static directory entries as well. // // Must be initialized with Init before first use. // // +stateify savable type OrderedChildren struct { // Can children be modified by user syscalls? If set to false, interface // methods that would modify the children return EPERM. Immutable. writable bool mu sync.RWMutex `state:"nosave"` order slotList set map[string]*slot } // Init initializes an OrderedChildren. func (o *OrderedChildren) Init(opts OrderedChildrenOptions) { o.writable = opts.Writable o.set = make(map[string]*slot) } // Destroy clears the children stored in o. It should be called by structs // embedding OrderedChildren upon destruction, i.e. when their reference count // reaches zero. func (o *OrderedChildren) Destroy(ctx context.Context) { o.mu.Lock() defer o.mu.Unlock() // Drop the ref that o owns on the static inodes it holds.
for _, s := range o.set { if s.static { s.inode.DecRef(ctx) } } o.order.Reset() o.set = nil } // Populate inserts static children into this OrderedChildren. // Populate returns the number of directories inserted, which the caller // may use to update the link count for the parent directory. // // Precondition: // * o must represent a directory inode. // * children must not contain any conflicting entries already in o. // * Caller must hold a reference on all inodes passed. // // Postcondition: Caller's references on inodes are transferred to o. func (o *OrderedChildren) Populate(children map[string]Inode) uint32 { var links uint32 for name, child := range children { if child.Mode().IsDir() { links++ } if err := o.insert(name, child, true); err != nil { panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v)", name, child)) } } return links } // Lookup implements Inode.Lookup. func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error) { o.mu.RLock() defer o.mu.RUnlock() s, ok := o.set[name] if !ok { return nil, syserror.ENOENT } s.inode.IncRef() // This ref is passed to the dentry upon creation via Init. return s.inode, nil } // IterDirents implements Inode.IterDirents. func (o *OrderedChildren) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) { // All entries from OrderedChildren have already been handled in // GenericDirectoryFD.IterDirents. return offset, nil } // HasChildren implements Inode.HasChildren. func (o *OrderedChildren) HasChildren() bool { o.mu.RLock() defer o.mu.RUnlock() return len(o.set) > 0 } // Insert inserts a dynamic child into o. This ignores the writability of o, as // this is not part of the vfs.FilesystemImpl interface, and is a lower-level operation. func (o *OrderedChildren) Insert(name string, child Inode) error { return o.insert(name, child, false) } // insert inserts child into o. // // Precondition: Caller must be holding a ref on child if static is true. // // Postcondition: Caller's ref on child is transferred to o if static is true. func (o *OrderedChildren) insert(name string, child Inode, static bool) error { o.mu.Lock() defer o.mu.Unlock() if _, ok := o.set[name]; ok { return linuxerr.EEXIST } s := &slot{ name: name, inode: child, static: static, } o.order.PushBack(s) o.set[name] = s return nil } // Precondition: caller must hold o.mu for writing. func (o *OrderedChildren) removeLocked(name string) { if s, ok := o.set[name]; ok { if s.static { panic(fmt.Sprintf("removeLocked called on a static inode: %v", s.inode)) } delete(o.set, name) o.order.Remove(s) } } // Precondition: caller must hold o.mu for writing. func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, newI Inode) { if s, ok := o.set[name]; ok { if s.static { panic(fmt.Sprintf("replacing a static inode: %v", s.inode)) } // Existing slot with given name, simply replace the inode. s.inode = newI return } // No existing slot with given name, create and hash new slot. s := &slot{ name: name, inode: newI, static: false, } o.order.PushBack(s) o.set[name] = s } // Precondition: caller must hold o.mu for reading or writing. func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error { s, ok := o.set[name] if !ok { return syserror.ENOENT } if s.inode != child { panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child)) } return nil } // Unlink implements Inode.Unlink.
func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) error { if !o.writable { return linuxerr.EPERM } o.mu.Lock() defer o.mu.Unlock() if err := o.checkExistingLocked(name, child); err != nil { return err } o.removeLocked(name) return nil } // RmDir implements Inode.RmDir. func (o *OrderedChildren) RmDir(ctx context.Context, name string, child Inode) error { // We're not responsible for checking that child is a directory, that it's // empty, or updating any link counts; so this is the same as unlink. return o.Unlink(ctx, name, child) } // Rename implements Inode.Rename. // // Precondition: Rename may only be called across two directory inodes with // identical implementations of Rename. Practically, this means filesystems that // implement Rename by embedding OrderedChildren for any directory // implementation must use OrderedChildren for all directory implementations // that will support Rename. // // Postcondition: reference on any replaced dentry transferred to caller. func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error { if !o.writable { return linuxerr.EPERM } dst, ok := dstDir.(interface{}).(*OrderedChildren) if !ok { return linuxerr.EXDEV } if !dst.writable { return linuxerr.EPERM } // Note: There's a potential deadlock below if concurrent calls to Rename // refer to the same src and dst directories in reverse. We avoid any // ordering issues because the caller is required to serialize concurrent // calls to Rename in accordance with the interface declaration. o.mu.Lock() defer o.mu.Unlock() if dst != o { dst.mu.Lock() defer dst.mu.Unlock() } if err := o.checkExistingLocked(oldname, child); err != nil { return err } o.removeLocked(oldname) dst.replaceChildLocked(ctx, newname, child) return nil } // nthLocked returns an iterator to the nth child tracked by this object. The // iterator is valid until the caller releases o.mu. Returns nil if the // requested index falls out of bounds. // // Precondition: Caller must hold o.mu for reading. func (o *OrderedChildren) nthLocked(i int64) *slot { for it := o.order.Front(); it != nil && i >= 0; it = it.Next() { if i == 0 { return it } i-- } return nil } // InodeSymlink partially implements the Inode interface for symlinks. // // +stateify savable type InodeSymlink struct { InodeNotDirectory } // Open implements Inode.Open. func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { return nil, linuxerr.ELOOP } // StaticDirectory is a standard implementation of a directory with static // contents. // // +stateify savable type StaticDirectory struct { InodeAlwaysValid InodeAttrs InodeDirectoryNoNewChildren InodeNoStatFS InodeNotSymlink InodeTemporary OrderedChildren StaticDirectoryRefs locks vfs.FileLocks fdOpts GenericDirectoryFDOptions } var _ Inode = (*StaticDirectory)(nil) // NewStaticDir creates a new static directory and returns its inode. func NewStaticDir(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]Inode, fdOpts GenericDirectoryFDOptions) Inode { inode := &StaticDirectory{} inode.Init(ctx, creds, devMajor, devMinor, ino, perm, fdOpts) inode.InitRefs() inode.OrderedChildren.Init(OrderedChildrenOptions{}) links := inode.OrderedChildren.Populate(children) inode.IncLinks(links) return inode } // Init initializes StaticDirectory.
func (s *StaticDirectory) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, fdOpts GenericDirectoryFDOptions) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission bits may be set, got: %x", perm)) } s.fdOpts = fdOpts s.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeDirectory|perm) } // Open implements Inode.Open. func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := NewGenericDirectoryFD(rp.Mount(), d, &s.OrderedChildren, &s.locks, &opts, s.fdOpts) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } // SetStat implements Inode.SetStat, disallowing changes to inode attributes. func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // DecRef implements Inode.DecRef. func (s *StaticDirectory) DecRef(ctx context.Context) { s.StaticDirectoryRefs.DecRef(func() { s.Destroy(ctx) }) } // InodeAlwaysValid partially implements Inode. // // +stateify savable type InodeAlwaysValid struct{} // Valid implements Inode.Valid. func (*InodeAlwaysValid) Valid(context.Context) bool { return true } // InodeTemporary partially implements Inode. // // +stateify savable type InodeTemporary struct{} // Keep implements Inode.Keep. func (*InodeTemporary) Keep() bool { return false } // InodeNoStatFS partially implements the Inode interface, where the client // filesystem doesn't support statfs(2). // // +stateify savable type InodeNoStatFS struct{} // StatFS implements Inode.StatFS. func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { return linux.Statfs{}, syserror.ENOSYS }
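// Editorial sketch (not part of the original source): a client filesystem
// typically builds a static subtree by handing NewStaticDir a map of
// already-constructed children. The leaf parameter and the device/inode
// numbers below are hypothetical; NewStaticDir, Inode, and
// GenericDirectoryFDOptions are the names defined above.
//
//	func newExampleTree(ctx context.Context, creds *auth.Credentials, leaf Inode) Inode {
//		children := map[string]Inode{
//			// NewStaticDir takes over the caller's reference on each
//			// child via OrderedChildren.Populate.
//			"status": leaf,
//		}
//		return NewStaticDir(ctx, creds, 0 /* devMajor */, 0 /* devMinor */,
//			1 /* ino */, 0555, children, GenericDirectoryFDOptions{})
//	}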
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package proc import ( "bytes" "fmt" "math" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/usermem" ) // +stateify savable type tcpMemDir int const ( tcpRMem tcpMemDir = iota tcpWMem ) // newSysDir returns the dentry corresponding to /proc/sys directory.
func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode { return fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "hostname": fs.newInode(ctx, root, 0444, &hostnameData{}), "sem": fs.newInode(ctx, root, 0444, newStaticFile(fmt.Sprintf("%d\t%d\t%d\t%d\n", linux.SEMMSL, linux.SEMMNS, linux.SEMOPM, linux.SEMMNI))), "shmall": fs.newInode(ctx, root, 0444, shmData(linux.SHMALL)), "shmmax": fs.newInode(ctx, root, 0444, shmData(linux.SHMMAX)), "shmmni": fs.newInode(ctx, root, 0444, shmData(linux.SHMMNI)), "yama": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "ptrace_scope": fs.newYAMAPtraceScopeFile(ctx, k, root), }), }), "vm": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "max_map_count": fs.newInode(ctx, root, 0444, newStaticFile("2147483647\n")), "mmap_min_addr": fs.newInode(ctx, root, 0444, &mmapMinAddrData{k: k}), "overcommit_memory": fs.newInode(ctx, root, 0444, newStaticFile("0\n")), }), "net": fs.newSysNetDir(ctx, root, k), }) } // newSysNetDir returns the dentry corresponding to /proc/sys/net directory. func (fs *filesystem) newSysNetDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode { var contents map[string]kernfs.Inode // TODO(gvisor.dev/issue/1833): Support for using the network stack in the // network namespace of the calling process. if stack := k.RootNetworkNamespace().Stack(); stack != nil { contents = map[string]kernfs.Inode{ "ipv4": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "ip_forward": fs.newInode(ctx, root, 0444, &ipForwarding{stack: stack}), "ip_local_port_range": fs.newInode(ctx, root, 0644, &portRange{stack: stack}), "tcp_recovery": fs.newInode(ctx, root, 0644, &tcpRecoveryData{stack: stack}), "tcp_rmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}), "tcp_sack": fs.newInode(ctx, root, 0644, &tcpSackData{stack: stack}), "tcp_wmem": fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}), // The following files are simple stubs until they are implemented in // netstack; most of these files are configuration related. We use the // value closest to the actual netstack behavior or an empty file; all // of these files have mode 0444 (read-only for all users). "ip_local_reserved_ports": fs.newInode(ctx, root, 0444, newStaticFile("")), "ipfrag_time": fs.newInode(ctx, root, 0444, newStaticFile("30")), "ip_nonlocal_bind": fs.newInode(ctx, root, 0444, newStaticFile("0")), "ip_no_pmtu_disc": fs.newInode(ctx, root, 0444, newStaticFile("1")), // tcp_allowed_congestion_control tells the user what they are able to // do as an unprivileged process, so we leave it empty. "tcp_allowed_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("")), "tcp_available_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")), "tcp_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")), // Many of the following stub files are features netstack doesn't // support. The unsupported features return "0" to indicate they are // disabled.
"tcp_base_mss": fs.newInode(ctx, root, 0444, newStaticFile("1280")), "tcp_dsack": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_early_retrans": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_fack": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_fastopen": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_fastopen_key": fs.newInode(ctx, root, 0444, newStaticFile("")), "tcp_invalid_ratelimit": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_keepalive_intvl": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_keepalive_probes": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_keepalive_time": fs.newInode(ctx, root, 0444, newStaticFile("7200")), "tcp_mtu_probing": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_no_metrics_save": fs.newInode(ctx, root, 0444, newStaticFile("1")), "tcp_probe_interval": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_probe_threshold": fs.newInode(ctx, root, 0444, newStaticFile("0")), "tcp_retries1": fs.newInode(ctx, root, 0444, newStaticFile("3")), "tcp_retries2": fs.newInode(ctx, root, 0444, newStaticFile("15")), "tcp_rfc1337": fs.newInode(ctx, root, 0444, newStaticFile("1")), "tcp_slow_start_after_idle": fs.newInode(ctx, root, 0444, newStaticFile("1")), "tcp_synack_retries": fs.newInode(ctx, root, 0444, newStaticFile("5")), "tcp_syn_retries": fs.newInode(ctx, root, 0444, newStaticFile("3")), "tcp_timestamps": fs.newInode(ctx, root, 0444, newStaticFile("1")), }), "core": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{ "default_qdisc": fs.newInode(ctx, root, 0444, newStaticFile("pfifo_fast")), "message_burst": fs.newInode(ctx, root, 0444, newStaticFile("10")), "message_cost": fs.newInode(ctx, root, 0444, newStaticFile("5")), "optmem_max": fs.newInode(ctx, root, 0444, newStaticFile("0")), "rmem_default": fs.newInode(ctx, root, 0444, newStaticFile("212992")), "rmem_max": fs.newInode(ctx, root, 0444, newStaticFile("212992")), "somaxconn": fs.newInode(ctx, root, 0444, newStaticFile("128")), "wmem_default": fs.newInode(ctx, root, 0444, newStaticFile("212992")), "wmem_max": fs.newInode(ctx, root, 0444, newStaticFile("212992")), }), } } return fs.newStaticDir(ctx, root, contents) } // mmapMinAddrData implements vfs.DynamicBytesSource for // /proc/sys/vm/mmap_min_addr. // // +stateify savable type mmapMinAddrData struct { kernfs.DynamicBytesFile k *kernel.Kernel } var _ dynamicInode = (*mmapMinAddrData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress()) return nil } // hostnameData implements vfs.DynamicBytesSource for /proc/sys/kernel/hostname. // // +stateify savable type hostnameData struct { kernfs.DynamicBytesFile } var _ dynamicInode = (*hostnameData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (*hostnameData) Generate(ctx context.Context, buf *bytes.Buffer) error { utsns := kernel.UTSNamespaceFromContext(ctx) buf.WriteString(utsns.HostName()) buf.WriteString("\n") return nil } // tcpSackData implements vfs.WritableDynamicBytesSource for // /proc/sys/net/tcp_sack. // // +stateify savable type tcpSackData struct { kernfs.DynamicBytesFile stack inet.Stack `state:"wait"` enabled *bool } var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. 
func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error { if d.enabled == nil { sack, err := d.stack.TCPSACKEnabled() if err != nil { return err } d.enabled = &sack } val := "0\n" if *d.enabled { // Technically, this is not quite compatible with Linux. Linux stores these // as an integer, so if you write "2" into tcp_sack, you should get 2 back. // Tough luck. val = "1\n" } _, err := buf.WriteString(val) return err } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil } // Limit the amount of memory allocated. src = src.TakeFirst(hostarch.PageSize - 1) var v int32 n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) if err != nil { return 0, err } if d.enabled == nil { d.enabled = new(bool) } *d.enabled = v != 0 return n, d.stack.SetTCPSACKEnabled(*d.enabled) } // tcpRecoveryData implements vfs.WritableDynamicBytesSource for // /proc/sys/net/ipv4/tcp_recovery. // // +stateify savable type tcpRecoveryData struct { kernfs.DynamicBytesFile stack inet.Stack `state:"wait"` } var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error { recovery, err := d.stack.TCPRecovery() if err != nil { return err } _, err = buf.WriteString(fmt.Sprintf("%d\n", recovery)) return err } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil } // Limit the amount of memory allocated. src = src.TakeFirst(hostarch.PageSize - 1) var v int32 n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) if err != nil { return 0, err } if err := d.stack.SetTCPRecovery(inet.TCPLossRecovery(v)); err != nil { return 0, err } return n, nil } // tcpMemData implements vfs.WritableDynamicBytesSource for // /proc/sys/net/ipv4/tcp_rmem and /proc/sys/net/ipv4/tcp_wmem. // // +stateify savable type tcpMemData struct { kernfs.DynamicBytesFile dir tcpMemDir stack inet.Stack `state:"wait"` // mu protects against concurrent reads/writes to FDs based on the dentry // backing this byte source. mu sync.Mutex `state:"nosave"` } var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error { d.mu.Lock() defer d.mu.Unlock() size, err := d.readSizeLocked() if err != nil { return err } _, err = buf.WriteString(fmt.Sprintf("%d\t%d\t%d\n", size.Min, size.Default, size.Max)) return err } // Write implements vfs.WritableDynamicBytesSource.Write. func (d *tcpMemData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil } d.mu.Lock() defer d.mu.Unlock() // Limit the amount of memory allocated. 
src = src.TakeFirst(hostarch.PageSize - 1) size, err := d.readSizeLocked() if err != nil { return 0, err } buf := []int32{int32(size.Min), int32(size.Default), int32(size.Max)} n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, buf, src.Opts) if err != nil { return 0, err } newSize := inet.TCPBufferSize{ Min: int(buf[0]), Default: int(buf[1]), Max: int(buf[2]), } if err := d.writeSizeLocked(newSize); err != nil { return 0, err } return n, nil } // Precondition: d.mu must be locked. func (d *tcpMemData) readSizeLocked() (inet.TCPBufferSize, error) { switch d.dir { case tcpRMem: return d.stack.TCPReceiveBufferSize() case tcpWMem: return d.stack.TCPSendBufferSize() default: panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir)) } } // Precondition: d.mu must be locked. func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error { switch d.dir { case tcpRMem: return d.stack.SetTCPReceiveBufferSize(size) case tcpWMem: return d.stack.SetTCPSendBufferSize(size) default: panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir)) } } // ipForwarding implements vfs.WritableDynamicBytesSource for // /proc/sys/net/ipv4/ip_forward. // // +stateify savable type ipForwarding struct { kernfs.DynamicBytesFile stack inet.Stack `state:"wait"` enabled bool } var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error { val := "0\n" if ipf.enabled { // Technically, this is not quite compatible with Linux. Linux stores these // as an integer, so if you write "2" into ip_forward, you should get 2 back. // Tough luck. val = "1\n" } buf.WriteString(val) return nil } // Write implements vfs.WritableDynamicBytesSource.Write. func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil } // Limit input size so as not to impact performance if input size is large. src = src.TakeFirst(hostarch.PageSize - 1) var v int32 n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts) if err != nil { return 0, err } ipf.enabled = v != 0 if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, ipf.enabled); err != nil { return 0, err } return n, nil } // portRange implements vfs.WritableDynamicBytesSource for // /proc/sys/net/ipv4/ip_local_port_range. // // +stateify savable type portRange struct { kernfs.DynamicBytesFile stack inet.Stack `state:"wait"` // start and end store the port range. We must save/restore this here, // since a netstack instance is created on restore. start *uint16 end *uint16 } var _ vfs.WritableDynamicBytesSource = (*portRange)(nil) // Generate implements vfs.DynamicBytesSource.Generate. func (pr *portRange) Generate(ctx context.Context, buf *bytes.Buffer) error { if pr.start == nil { start, end := pr.stack.PortRange() pr.start = &start pr.end = &end } _, err := fmt.Fprintf(buf, "%d %d\n", *pr.start, *pr.end) return err } // Write implements vfs.WritableDynamicBytesSource.Write. func (pr *portRange) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) { if offset != 0 { // No need to handle partial writes thus far. return 0, linuxerr.EINVAL } if src.NumBytes() == 0 { return 0, nil } // Limit input size so as not to impact performance if input size is // large.
src = src.TakeFirst(hostarch.PageSize - 1) ports := make([]int32, 2) n, err := usermem.CopyInt32StringsInVec(ctx, src.IO, src.Addrs, ports, src.Opts) if err != nil { return 0, err } // Port numbers must be uint16s. if ports[0] < 0 || ports[1] < 0 || ports[0] > math.MaxUint16 || ports[1] > math.MaxUint16 { return 0, linuxerr.EINVAL } if err := pr.stack.SetPortRange(uint16(ports[0]), uint16(ports[1])); err != nil { return 0, err } if pr.start == nil { pr.start = new(uint16) pr.end = new(uint16) } *pr.start = uint16(ports[0]) *pr.end = uint16(ports[1]) return n, nil }
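// Editorial sketch (not part of the original source): new writable sysctl
// files follow the same shape as ipForwarding above: embed
// kernfs.DynamicBytesFile, emit the current value in Generate, and parse at
// most one page of input in Write. Everything referenced below exists in this
// file or its imports except the hypothetical boolSysctl type.
//
//	type boolSysctl struct {
//		kernfs.DynamicBytesFile
//		enabled bool
//	}
//
//	// Generate implements vfs.DynamicBytesSource.Generate.
//	func (b *boolSysctl) Generate(ctx context.Context, buf *bytes.Buffer) error {
//		if b.enabled {
//			buf.WriteString("1\n")
//		} else {
//			buf.WriteString("0\n")
//		}
//		return nil
//	}
//
//	// Write implements vfs.WritableDynamicBytesSource.Write.
//	func (b *boolSysctl) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
//		if offset != 0 {
//			return 0, linuxerr.EINVAL // partial writes are not handled
//		}
//		// Bound the allocation, mirroring the handlers above.
//		src = src.TakeFirst(hostarch.PageSize - 1)
//		var v int32
//		n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
//		if err != nil {
//			return 0, err
//		}
//		b.enabled = v != 0
//		return n, nil
//	}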
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package limits import ( "gvisor.dev/gvisor/pkg/context" ) // contextID is the limit package's type for context.Context.Value keys. type contextID int const ( // CtxLimits is a Context.Value key for a LimitSet. CtxLimits contextID = iota ) // FromContext returns the limits that apply to ctx. func FromContext(ctx context.Context) *LimitSet { if v := ctx.Value(CtxLimits); v != nil { return v.(*LimitSet) } return nil } // FromContextOrDie returns FromContext(ctx) if the latter is not nil; // otherwise it panics. func FromContextOrDie(ctx context.Context) *LimitSet { if v := ctx.Value(CtxLimits); v != nil { return v.(*LimitSet) } panic("failed to create limit set from context") }
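// Editorial sketch (not part of the original source): FromContext relies on
// the context owner exposing a LimitSet under CtxLimits. A minimal
// (hypothetical) carrier looks like this:
//
//	type limitContext struct {
//		context.Context
//		limits *LimitSet
//	}
//
//	func (c *limitContext) Value(key interface{}) interface{} {
//		if key == CtxLimits {
//			return c.limits
//		}
//		return c.Context.Value(key)
//	}
//
// FromContext(&limitContext{ctx, ls}) then returns ls, and FromContextOrDie
// no longer panics.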
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fuse import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) const fuseDevMinor = 229 // fuseDevice implements vfs.Device for /dev/fuse. // // +stateify savable type fuseDevice struct{} // Open implements vfs.Device.Open. func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { if !kernel.FUSEEnabled { return nil, syserror.ENOENT } var fd DeviceFD if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { return nil, err } return &fd.vfsfd, nil } // DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse. // // +stateify savable type DeviceFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD // nextOpID is used to create new requests.
nextOpID linux.FUSEOpID // queue is the list of requests that need to be processed by the FUSE server. queue requestList // numActiveRequests is the number of requests made by the Sentry that have // yet to be responded to. numActiveRequests uint64 // completions is used to map a request to its response. A Writer will use this // to notify the caller of a completed response. completions map[linux.FUSEOpID]*futureResponse // writeCursor tracks how many bytes of the current response have been // copied in from the server so far. writeCursor uint32 // writeBuf is the memory buffer used to copy in the FUSE out header from // userspace. writeBuf []byte // writeCursorFR is the current futureResponse being copied from the server. writeCursorFR *futureResponse // mu protects all the queues, maps, buffers, cursors, and nextOpID. mu sync.Mutex `state:"nosave"` // waitQueue is used to notify interested parties when the device becomes // readable or writable. waitQueue waiter.Queue // fullQueueCh is a channel used to synchronize the readers with the writers. // Writers (inbound requests to the filesystem) block if there are too many // unprocessed in-flight requests. fullQueueCh chan struct{} `state:".(int)"` // fs is the FUSE filesystem that this FD is being used for. A reference is // held on fs. fs *filesystem } func (fd *DeviceFD) saveFullQueueCh() int { return cap(fd.fullQueueCh) } func (fd *DeviceFD) loadFullQueueCh(capacity int) { fd.fullQueueCh = make(chan struct{}, capacity) } // Release implements vfs.FileDescriptionImpl.Release. func (fd *DeviceFD) Release(ctx context.Context) { if fd.fs != nil { fd.fs.conn.mu.Lock() fd.fs.conn.connected = false fd.fs.conn.mu.Unlock() fd.fs.VFSFilesystem().DecRef(ctx) fd.fs = nil } } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if fd.fs == nil { return 0, linuxerr.EPERM } return 0, syserror.ENOSYS } // Read implements vfs.FileDescriptionImpl.Read. func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if fd.fs == nil { return 0, linuxerr.EPERM } // We require that any Read done on this filesystem have a sane minimum // read buffer. It must have the capacity for the fixed parts of any request // header (Linux uses the request header and the FUSEWriteIn header for this // calculation) + the negotiated MaxWrite room for the data. minBuffSize := linux.FUSE_MIN_READ_BUFFER inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes()) writeHdrLen := uint32((*linux.FUSEWriteIn)(nil).SizeBytes()) negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.maxWrite if minBuffSize < negotiatedMinBuffSize { minBuffSize = negotiatedMinBuffSize } // If the read buffer is too small, error out. if dst.NumBytes() < int64(minBuffSize) { return 0, linuxerr.EINVAL } fd.mu.Lock() defer fd.mu.Unlock() return fd.readLocked(ctx, dst, opts) } // readLocked implements the reading of the fuse device while locked with DeviceFD.mu. // // Preconditions: dst is large enough for any reasonable request. func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { var req *Request // Find the first valid request. // In the normal case this loop executes only once. for !fd.queue.Empty() { req = fd.queue.Front() if int64(req.hdr.Len)+int64(len(req.payload)) <= dst.NumBytes() { break } // The request is too large.
Cannot process it. All requests must be smaller than the // negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT // handshake. errno := -int32(unix.EIO) if req.hdr.Opcode == linux.FUSE_SETXATTR { errno = -int32(unix.E2BIG) } // Return the error to the calling task. if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil { return 0, err } // We're done with this request. fd.queue.Remove(req) req = nil } if req == nil { return 0, syserror.ErrWouldBlock } // We already checked the size: dst must be able to fit the whole request. // Now we write the marshalled header, the payload, // and the potential additional payload // to the user memory IOSequence. n, err := dst.CopyOut(ctx, req.data) if err != nil { return 0, err } if n != len(req.data) { return 0, syserror.EIO } if req.hdr.Opcode == linux.FUSE_WRITE { written, err := dst.DropFirst(n).CopyOut(ctx, req.payload) if err != nil { return 0, err } if written != len(req.payload) { return 0, syserror.EIO } n += int(written) } // Fully done with this req, remove it from the queue. fd.queue.Remove(req) // Remove noReply requests from the map of requests expecting a reply. if req.noReply { fd.numActiveRequests-- delete(fd.completions, req.hdr.Unique) } return int64(n), nil } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if fd.fs == nil { return 0, linuxerr.EPERM } return 0, syserror.ENOSYS } // Write implements vfs.FileDescriptionImpl.Write. func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() return fd.writeLocked(ctx, src, opts) } // writeLocked implements writing to the fuse device while locked with DeviceFD.mu. func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if fd.fs == nil { return 0, linuxerr.EPERM } // Return ENODEV if the filesystem is umounted. if fd.fs.umounted { return 0, linuxerr.ENODEV } var cn, n int64 hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) for src.NumBytes() > 0 { if fd.writeCursorFR != nil { // Already have the common header, and we're now copying the payload. wantBytes := fd.writeCursorFR.hdr.Len // Note that the FR data doesn't have the header. Copy it over if necessary. if fd.writeCursorFR.data == nil { fd.writeCursorFR.data = make([]byte, wantBytes) } bytesCopied, err := src.CopyIn(ctx, fd.writeCursorFR.data[fd.writeCursor:wantBytes]) if err != nil { return 0, err } src = src.DropFirst(bytesCopied) cn = int64(bytesCopied) n += cn fd.writeCursor += uint32(cn) if fd.writeCursor == wantBytes { // Done reading this full response. Clean up and unblock the // initiator. break } // Check if we have more data in src. continue } // Assert that the header isn't read into the writeBuf yet. if fd.writeCursor >= hdrLen { return 0, linuxerr.EINVAL } // We don't have the full common response header yet; copy in up to the // remaining header bytes. (Slicing to hdrLen rather than to the number // of wanted bytes keeps partial header writes correct.) bytesCopied, err := src.CopyIn(ctx, fd.writeBuf[fd.writeCursor:hdrLen]) if err != nil { return 0, err } src = src.DropFirst(bytesCopied) cn = int64(bytesCopied) n += cn fd.writeCursor += uint32(cn) if fd.writeCursor == hdrLen { // Have full header in the writeBuf.
Use it to fetch the actual futureResponse // from the device's completions map. var hdr linux.FUSEHeaderOut hdr.UnmarshalBytes(fd.writeBuf) // We have the header now and so the writeBuf has served its purpose. // We could reset it manually here, but instead the writeCursor is set // to 0 at the end of the write, allowing the next response to // overwrite what's in the buffer. fut, ok := fd.completions[hdr.Unique] if !ok { // Server sent us a response for a request we never sent, // or for which we already received a reply (e.g. aborted); an unlikely event. return 0, linuxerr.EINVAL } delete(fd.completions, hdr.Unique) // Copy over the header into the future response. The rest of the payload // will be copied over to the FR's data in the next iteration. fut.hdr = &hdr fd.writeCursorFR = fut // The next iteration will try to read the complete response payload, if // src has any data remaining. Otherwise we're done. } } if fd.writeCursorFR != nil { if err := fd.sendResponse(ctx, fd.writeCursorFR); err != nil { return 0, err } // Ready the device for the next request. fd.writeCursorFR = nil fd.writeCursor = 0 } return n, nil } // Readiness implements vfs.FileDescriptionImpl.Readiness. func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask { fd.mu.Lock() defer fd.mu.Unlock() return fd.readinessLocked(mask) } // readinessLocked implements checking the readiness of the fuse device while // locked with DeviceFD.mu. func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask { var ready waiter.EventMask if fd.fs == nil || fd.fs.umounted { ready |= waiter.EventErr return ready & mask } // FD is always writable. ready |= waiter.WritableEvents if !fd.queue.Empty() { // Have reqs available, FD is readable. ready |= waiter.ReadableEvents } return ready & mask } // EventRegister implements waiter.Waitable.EventRegister. func (fd *DeviceFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) { fd.waitQueue.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. func (fd *DeviceFD) EventUnregister(e *waiter.Entry) { fd.waitQueue.EventUnregister(e) } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { // Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted. if fd.fs == nil { return 0, linuxerr.EPERM } return 0, syserror.ENOSYS } // sendResponse sends a response to the waiting task (if any). // // Preconditions: fd.mu must be held. func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error { // Signal the task waiting on a response if any. defer close(fut.ch) // Signal that the queue is no longer full. select { case fd.fullQueueCh <- struct{}{}: default: } fd.numActiveRequests-- if fut.async { return fd.asyncCallBack(ctx, fut.getResponse()) } return nil } // sendError sends an error response to the waiting task (if any) by calling sendResponse(). // // Preconditions: fd.mu must be held. func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error { // Return the error to the calling task. outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) respHdr := linux.FUSEHeaderOut{ Len: outHdrLen, Error: errno, Unique: unique, } fut, ok := fd.completions[respHdr.Unique] if !ok { // A response for a request we never sent, // or for which we already received a reply (e.g. aborted).
return linuxerr.EINVAL } delete(fd.completions, respHdr.Unique) fut.hdr = &respHdr return fd.sendResponse(ctx, fut) } // asyncCallBack executes the pre-defined callback for async requests. // Currently used by: FUSE_INIT. func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error { switch r.opcode { case linux.FUSE_INIT: creds := auth.CredentialsFromContext(ctx) rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace() return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs)) // TODO(gvisor.dev/issue/3247): support async read: correctly process the response. } return nil }
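// Editorial sketch (not part of the original source): the wire format that
// writeLocked above expects from the FUSE server. Each response is a
// FUSEHeaderOut followed by the opcode-specific payload; hdr.Len counts the
// whole message including the header, and hdr.Unique must echo the request ID
// so the response can be matched in fd.completions. req and payloadLen are
// hypothetical names:
//
//	hdr := linux.FUSEHeaderOut{
//		Len:    uint32((*linux.FUSEHeaderOut)(nil).SizeBytes()) + payloadLen,
//		Error:  0,          // or a negated errno on failure
//		Unique: req.Unique, // copied from the request's FUSEHeaderIn
//	}
//
// A short write carrying only part of the header accumulates in fd.writeBuf;
// once the header is complete, the remaining bytes are copied into the
// matching futureResponse's data.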
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" ) // NewAnonVirtualDentry returns a VirtualDentry with the given synthetic name, // consistent with Linux's fs/anon_inodes.c:anon_inode_getfile(). References // are taken on the returned VirtualDentry. func (vfs *VirtualFilesystem) NewAnonVirtualDentry(name string) VirtualDentry { d := anonDentry{ name: name, } d.vfsd.Init(&d) vfs.anonMount.IncRef() // anonDentry no-ops refcounting. return VirtualDentry{ mount: vfs.anonMount, dentry: &d.vfsd, } } const ( anonfsBlockSize = hostarch.PageSize // via fs/libfs.c:pseudo_fs_fill_super() // Mode, UID, and GID for a generic anonfs file. anonFileMode = 0600 // no type is correct anonFileUID = auth.RootKUID anonFileGID = auth.RootKGID ) // anonFilesystemType implements FilesystemType. // // +stateify savable type anonFilesystemType struct{} // GetFilesystem implements FilesystemType.GetFilesystem. func (anonFilesystemType) GetFilesystem(context.Context, *VirtualFilesystem, *auth.Credentials, string, GetFilesystemOptions) (*Filesystem, *Dentry, error) { panic("cannot instantiate an anon filesystem") } // Name implements FilesystemType.Name. func (anonFilesystemType) Name() string { return "none" } // Release implements FilesystemType.Release.
func (anonFilesystemType) Release(ctx context.Context) {} // anonFilesystem is the implementation of FilesystemImpl that backs // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry(). // // Since all Dentries in anonFilesystem are non-directories, all FilesystemImpl // methods that would require an anonDentry to be a directory return ENOTDIR. // // +stateify savable type anonFilesystem struct { vfsfs Filesystem devMinor uint32 } // +stateify savable type anonDentry struct { vfsd Dentry name string } // Release implements FilesystemImpl.Release. func (fs *anonFilesystem) Release(ctx context.Context) { } // Sync implements FilesystemImpl.Sync. func (fs *anonFilesystem) Sync(ctx context.Context) error { return nil } // AccessAt implements FilesystemImpl.AccessAt. func (fs *anonFilesystem) AccessAt(ctx context.Context, rp *ResolvingPath, creds *auth.Credentials, ats AccessTypes) error { if !rp.Done() { return linuxerr.ENOTDIR } return GenericCheckPermissions(creds, ats, anonFileMode, anonFileUID, anonFileGID) } // GetDentryAt implements FilesystemImpl.GetDentryAt. func (fs *anonFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error) { if !rp.Done() { return nil, linuxerr.ENOTDIR } if opts.CheckSearchable { return nil, linuxerr.ENOTDIR } // anonDentry no-ops refcounting. return rp.Start(), nil } // GetParentDentryAt implements FilesystemImpl.GetParentDentryAt. func (fs *anonFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) { if !rp.Final() { return nil, linuxerr.ENOTDIR } // anonDentry no-ops refcounting. return rp.Start(), nil } // LinkAt implements FilesystemImpl.LinkAt. func (fs *anonFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error { if !rp.Final() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // MkdirAt implements FilesystemImpl.MkdirAt. func (fs *anonFilesystem) MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error { if !rp.Final() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // MknodAt implements FilesystemImpl.MknodAt. func (fs *anonFilesystem) MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error { if !rp.Final() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // OpenAt implements FilesystemImpl.OpenAt. func (fs *anonFilesystem) OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error) { if !rp.Done() { return nil, linuxerr.ENOTDIR } return nil, linuxerr.ENODEV } // ReadlinkAt implements FilesystemImpl.ReadlinkAt. func (fs *anonFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error) { if !rp.Done() { return "", linuxerr.ENOTDIR } return "", linuxerr.EINVAL } // RenameAt implements FilesystemImpl.RenameAt. func (fs *anonFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error { if !rp.Final() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // RmdirAt implements FilesystemImpl.RmdirAt. func (fs *anonFilesystem) RmdirAt(ctx context.Context, rp *ResolvingPath) error { if !rp.Final() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // SetStatAt implements FilesystemImpl.SetStatAt. func (fs *anonFilesystem) SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error { if !rp.Done() { return linuxerr.ENOTDIR } // Linux actually permits anon_inode_inode's metadata to be set, which is // visible to all users of anon_inode_inode.
We just silently ignore // metadata changes. return nil } // StatAt implements FilesystemImpl.StatAt. func (fs *anonFilesystem) StatAt(ctx context.Context, rp *ResolvingPath, opts StatOptions) (linux.Statx, error) { if !rp.Done() { return linux.Statx{}, linuxerr.ENOTDIR } // See fs/anon_inodes.c:anon_inode_init() => fs/libfs.c:alloc_anon_inode(). return linux.Statx{ Mask: linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS, Blksize: anonfsBlockSize, Nlink: 1, UID: uint32(anonFileUID), GID: uint32(anonFileGID), Mode: anonFileMode, Ino: 1, Size: 0, Blocks: 0, DevMajor: linux.UNNAMED_MAJOR, DevMinor: fs.devMinor, }, nil } // StatFSAt implements FilesystemImpl.StatFSAt. func (fs *anonFilesystem) StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error) { if !rp.Done() { return linux.Statfs{}, linuxerr.ENOTDIR } return linux.Statfs{ Type: linux.ANON_INODE_FS_MAGIC, BlockSize: anonfsBlockSize, }, nil } // SymlinkAt implements FilesystemImpl.SymlinkAt. func (fs *anonFilesystem) SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error { if !rp.Final() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // UnlinkAt implements FilesystemImpl.UnlinkAt. func (fs *anonFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) error { if !rp.Final() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // BoundEndpointAt implements FilesystemImpl.BoundEndpointAt. func (fs *anonFilesystem) BoundEndpointAt(ctx context.Context, rp *ResolvingPath, opts BoundEndpointOptions) (transport.BoundEndpoint, error) { if !rp.Final() { return nil, linuxerr.ENOTDIR } if err := GenericCheckPermissions(rp.Credentials(), MayWrite, anonFileMode, anonFileUID, anonFileGID); err != nil { return nil, err } return nil, linuxerr.ECONNREFUSED } // ListXattrAt implements FilesystemImpl.ListXattrAt. func (fs *anonFilesystem) ListXattrAt(ctx context.Context, rp *ResolvingPath, size uint64) ([]string, error) { if !rp.Done() { return nil, linuxerr.ENOTDIR } return nil, nil } // GetXattrAt implements FilesystemImpl.GetXattrAt. func (fs *anonFilesystem) GetXattrAt(ctx context.Context, rp *ResolvingPath, opts GetXattrOptions) (string, error) { if !rp.Done() { return "", linuxerr.ENOTDIR } return "", linuxerr.ENOTSUP } // SetXattrAt implements FilesystemImpl.SetXattrAt. func (fs *anonFilesystem) SetXattrAt(ctx context.Context, rp *ResolvingPath, opts SetXattrOptions) error { if !rp.Done() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // RemoveXattrAt implements FilesystemImpl.RemoveXattrAt. func (fs *anonFilesystem) RemoveXattrAt(ctx context.Context, rp *ResolvingPath, name string) error { if !rp.Done() { return linuxerr.ENOTDIR } return linuxerr.EPERM } // PrependPath implements FilesystemImpl.PrependPath. func (fs *anonFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error { b.PrependComponent(fmt.Sprintf("anon_inode:%s", vd.dentry.impl.(*anonDentry).name)) return PrependPathSyntheticError{} } // MountOptions implements FilesystemImpl.MountOptions. func (fs *anonFilesystem) MountOptions() string { return "" } // IncRef implements DentryImpl.IncRef. func (d *anonDentry) IncRef() { // no-op } // TryIncRef implements DentryImpl.TryIncRef. func (d *anonDentry) TryIncRef() bool { return true } // DecRef implements DentryImpl.DecRef. func (d *anonDentry) DecRef(ctx context.Context) { // no-op } // InotifyWithParent implements DentryImpl.InotifyWithParent. 
// // Although Linux technically supports inotify on pseudo filesystems (inotify // is implemented at the vfs layer), it is not particularly useful. It is left // unimplemented until someone actually needs it. func (d *anonDentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et EventType) {} // Watches implements DentryImpl.Watches. func (d *anonDentry) Watches() *Watches { return nil } // OnZeroWatches implements DentryImpl.OnZeroWatches. func (d *anonDentry) OnZeroWatches(context.Context) {}
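// Editorial sketch (not part of the original source): the intended use of
// NewAnonVirtualDentry, mirroring how eventfd-style file descriptions borrow
// an anonymous dentry. impl and flags are hypothetical:
//
//	vd := vfsObj.NewAnonVirtualDentry("[eventfd]")
//	// FileDescription.Init takes its own references on the mount and
//	// dentry, so the caller can drop its reference when done.
//	defer vd.DecRef(ctx)
//	if err := impl.vfsfd.Init(impl, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{
//		UseDentryMetadata: true,
//	}); err != nil {
//		return nil, err
//	}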
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
package linux import ( "math" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/signalfd" "gvisor.dev/gvisor/pkg/syserror" ) // "For a process to have permission to send a signal it must // - either be privileged (CAP_KILL), or // - the real or effective user ID of the sending process must be equal to the // real or saved set-user-ID of the target process. // // In the case of SIGCONT it suffices when the sending and receiving processes // belong to the same session." - kill(2) // // Equivalent to kernel/signal.c:check_kill_permission. func mayKill(t *kernel.Task, target *kernel.Task, sig linux.Signal) bool { // kernel/signal.c:check_kill_permission also allows a signal if the // sending and receiving tasks share a thread group, which is not // mentioned in kill(2) since kill does not allow task-level // granularity in signal sending. if t.ThreadGroup() == target.ThreadGroup() { return true } if t.HasCapabilityIn(linux.CAP_KILL, target.UserNamespace()) { return true } creds := t.Credentials() tcreds := target.Credentials() if creds.EffectiveKUID == tcreds.SavedKUID || creds.EffectiveKUID == tcreds.RealKUID || creds.RealKUID == tcreds.SavedKUID || creds.RealKUID == tcreds.RealKUID { return true } if sig == linux.SIGCONT && target.ThreadGroup().Session() == t.ThreadGroup().Session() { return true } return false } // Kill implements linux syscall kill(2). func Kill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := kernel.ThreadID(args[0].Int()) sig := linux.Signal(args[1].Int()) switch { case pid > 0: // "If pid is positive, then signal sig is sent to the process with the // ID specified by pid." - kill(2) // This loops to handle races with execve where target dies between // TaskWithID and SendGroupSignal. Compare Linux's // kernel/signal.c:kill_pid_info(). for { target := t.PIDNamespace().TaskWithID(pid) if target == nil { return 0, nil, linuxerr.ESRCH } if !mayKill(t, target, sig) { return 0, nil, linuxerr.EPERM } info := &linux.SignalInfo{ Signo: int32(sig), Code: linux.SI_USER, } info.SetPID(int32(target.PIDNamespace().IDOfTask(t))) info.SetUID(int32(t.Credentials().RealKUID.In(target.UserNamespace()).OrOverflow())) if err := target.SendGroupSignal(info); !linuxerr.Equals(linuxerr.ESRCH, err) { return 0, nil, err } } case pid == -1: // "If pid equals -1, then sig is sent to every process for which the // calling process has permission to send signals, except for process 1 // (init), but see below. ... POSIX.1-2001 requires that kill(-1,sig) // send sig to all processes that the calling process may send signals // to, except possibly for some implementation-defined system // processes. Linux allows a process to signal itself, but on Linux the // call kill(-1,sig) does not signal the calling process." var ( lastErr error delivered int ) for _, tg := range t.PIDNamespace().ThreadGroups() { if tg == t.ThreadGroup() { continue } if t.PIDNamespace().IDOfThreadGroup(tg) == kernel.InitTID { continue } // If pid == -1, the returned error is the last non-EPERM error // from any call to group_send_sig_info. if !mayKill(t, tg.Leader(), sig) { continue } // Here and below, whether or not kill returns an error may // depend on the iteration order. 
We at least implement the // semantics documented by the man page: "On success (at least // one signal was sent), zero is returned." info := &linux.SignalInfo{ Signo: int32(sig), Code: linux.SI_USER, } info.SetPID(int32(tg.PIDNamespace().IDOfTask(t))) info.SetUID(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) err := tg.SendSignal(info) if linuxerr.Equals(linuxerr.ESRCH, err) { // ESRCH is ignored because it means the task // exited while we were iterating. This is a // race which would not normally exist on // Linux, so we suppress it. continue } delivered++ if err != nil { lastErr = err } } if delivered > 0 { return 0, nil, lastErr } return 0, nil, linuxerr.ESRCH default: // "If pid equals 0, then sig is sent to every process in the process // group of the calling process." // // "If pid is less than -1, then sig is sent to every process // in the process group whose ID is -pid." pgid := kernel.ProcessGroupID(-pid) if pgid == 0 { pgid = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup()) } // If pid != -1 (i.e. signalling a process group), the returned error // is the last error from any call to group_send_sig_info. lastErr := error(linuxerr.ESRCH) for _, tg := range t.PIDNamespace().ThreadGroups() { if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid { if !mayKill(t, tg.Leader(), sig) { lastErr = linuxerr.EPERM continue } info := &linux.SignalInfo{ Signo: int32(sig), Code: linux.SI_USER, } info.SetPID(int32(tg.PIDNamespace().IDOfTask(t))) info.SetUID(int32(t.Credentials().RealKUID.In(tg.Leader().UserNamespace()).OrOverflow())) // See the note above regarding the ESRCH race. if err := tg.SendSignal(info); !linuxerr.Equals(linuxerr.ESRCH, err) { lastErr = err } } } return 0, nil, lastErr } } func tkillSigInfo(sender, receiver *kernel.Task, sig linux.Signal) *linux.SignalInfo { info := &linux.SignalInfo{ Signo: int32(sig), Code: linux.SI_TKILL, } info.SetPID(int32(receiver.PIDNamespace().IDOfThreadGroup(sender.ThreadGroup()))) info.SetUID(int32(sender.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow())) return info } // Tkill implements linux syscall tkill(2). func Tkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tid := kernel.ThreadID(args[0].Int()) sig := linux.Signal(args[1].Int()) // N.B. Inconsistent with the man page, Linux actually rejects calls with // tid <= 0 with EINVAL. This isn't the same for all signal calls. if tid <= 0 { return 0, nil, linuxerr.EINVAL } target := t.PIDNamespace().TaskWithID(tid) if target == nil { return 0, nil, linuxerr.ESRCH } if !mayKill(t, target, sig) { return 0, nil, linuxerr.EPERM } return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig)) } // Tgkill implements linux syscall tgkill(2). func Tgkill(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tgid := kernel.ThreadID(args[0].Int()) tid := kernel.ThreadID(args[1].Int()) sig := linux.Signal(args[2].Int()) // N.B. Inconsistent with the man page, Linux actually rejects calls with // tgid/tid <= 0 with EINVAL. This isn't the same for all signal calls.
if tgid <= 0 || tid <= 0 { return 0, nil, linuxerr.EINVAL } targetTG := t.PIDNamespace().ThreadGroupWithID(tgid) target := t.PIDNamespace().TaskWithID(tid) if targetTG == nil || target == nil || target.ThreadGroup() != targetTG { return 0, nil, linuxerr.ESRCH } if !mayKill(t, target, sig) { return 0, nil, linuxerr.EPERM } return 0, nil, target.SendSignal(tkillSigInfo(t, target, sig)) } // RtSigaction implements linux syscall rt_sigaction(2). func RtSigaction(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { sig := linux.Signal(args[0].Int()) newactarg := args[1].Pointer() oldactarg := args[2].Pointer() sigsetsize := args[3].SizeT() if sigsetsize != linux.SignalSetSize { return 0, nil, linuxerr.EINVAL } var newactptr *linux.SigAction if newactarg != 0 { var newact linux.SigAction if _, err := newact.CopyIn(t, newactarg); err != nil { return 0, nil, err } newactptr = &newact } oldact, err := t.ThreadGroup().SetSigAction(sig, newactptr) if err != nil { return 0, nil, err } if oldactarg != 0 { if _, err := oldact.CopyOut(t, oldactarg); err != nil { return 0, nil, err } } return 0, nil, nil } // Sigreturn implements linux syscall sigreturn(2). func Sigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { ctrl, err := t.SignalReturn(false) return 0, ctrl, err } // RtSigreturn implements linux syscall rt_sigreturn(2). func RtSigreturn(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { ctrl, err := t.SignalReturn(true) return 0, ctrl, err } // RtSigprocmask implements linux syscall rt_sigprocmask(2). func RtSigprocmask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { how := args[0].Int() setaddr := args[1].Pointer() oldaddr := args[2].Pointer() sigsetsize := args[3].SizeT() if sigsetsize != linux.SignalSetSize { return 0, nil, linuxerr.EINVAL } oldmask := t.SignalMask() if setaddr != 0 { mask, err := CopyInSigSet(t, setaddr, sigsetsize) if err != nil { return 0, nil, err } switch how { case linux.SIG_BLOCK: t.SetSignalMask(oldmask | mask) case linux.SIG_UNBLOCK: t.SetSignalMask(oldmask &^ mask) case linux.SIG_SETMASK: t.SetSignalMask(mask) default: return 0, nil, linuxerr.EINVAL } } if oldaddr != 0 { return 0, nil, copyOutSigSet(t, oldaddr, oldmask) } return 0, nil, nil } // Sigaltstack implements linux syscall sigaltstack(2). func Sigaltstack(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { setaddr := args[0].Pointer() oldaddr := args[1].Pointer() alt := t.SignalStack() if oldaddr != 0 { if _, err := alt.CopyOut(t, oldaddr); err != nil { return 0, nil, err } } if setaddr != 0 { if _, err := alt.CopyIn(t, setaddr); err != nil { return 0, nil, err } // The signal stack cannot be changed if the task is currently // on the stack. This is enforced at the lowest level because // these semantics apply to changing the signal stack via a // ucontext during a signal handler. if !t.SetSignalStack(alt) { return 0, nil, linuxerr.EPERM } } return 0, nil, nil } // Pause implements linux syscall pause(2). func Pause(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, syserror.ConvertIntr(t.Block(nil), syserror.ERESTARTNOHAND) } // RtSigpending implements linux syscall rt_sigpending(2). 
func RtSigpending(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() pending := t.PendingSignals() _, err := pending.CopyOut(t, addr) return 0, nil, err } // RtSigtimedwait implements linux syscall rt_sigtimedwait(2). func RtSigtimedwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { sigset := args[0].Pointer() siginfo := args[1].Pointer() timespec := args[2].Pointer() sigsetsize := args[3].SizeT() mask, err := CopyInSigSet(t, sigset, sigsetsize) if err != nil { return 0, nil, err } var timeout time.Duration if timespec != 0 { d, err := copyTimespecIn(t, timespec) if err != nil { return 0, nil, err } if !d.Valid() { return 0, nil, linuxerr.EINVAL } timeout = time.Duration(d.ToNsecCapped()) } else { timeout = time.Duration(math.MaxInt64) } si, err := t.Sigtimedwait(mask, timeout) if err != nil { return 0, nil, err } if siginfo != 0 { si.FixSignalCodeForUser() if _, err := si.CopyOut(t, siginfo); err != nil { return 0, nil, err } } return uintptr(si.Signo), nil, nil } // RtSigqueueinfo implements linux syscall rt_sigqueueinfo(2). func RtSigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := kernel.ThreadID(args[0].Int()) sig := linux.Signal(args[1].Int()) infoAddr := args[2].Pointer() // Copy in the info. // // We must ensure that the Signo is set (Linux overrides this in the // same way), and that the code is in the allowed set. This same logic // appears below in RtTgsigqueueinfo and should be kept in sync. var info linux.SignalInfo if _, err := info.CopyIn(t, infoAddr); err != nil { return 0, nil, err } info.Signo = int32(sig) // This must loop to handle the race with execve described in Kill. for { // Deliver to the given task's thread group. target := t.PIDNamespace().TaskWithID(pid) if target == nil { return 0, nil, linuxerr.ESRCH } // If the sender is not the receiver, it can't use si_codes used by the // kernel or SI_TKILL. if (info.Code >= 0 || info.Code == linux.SI_TKILL) && target != t { return 0, nil, linuxerr.EPERM } if !mayKill(t, target, sig) { return 0, nil, linuxerr.EPERM } if err := target.SendGroupSignal(&info); !linuxerr.Equals(linuxerr.ESRCH, err) { return 0, nil, err } } } // RtTgsigqueueinfo implements linux syscall rt_tgsigqueueinfo(2). func RtTgsigqueueinfo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tgid := kernel.ThreadID(args[0].Int()) tid := kernel.ThreadID(args[1].Int()) sig := linux.Signal(args[2].Int()) infoAddr := args[3].Pointer() // N.B. Inconsistent with the man page, linux actually rejects calls with // tgid/tid <= 0 with EINVAL. This isn't the same for all signal calls. if tgid <= 0 || tid <= 0 { return 0, nil, linuxerr.EINVAL } // Copy in the info. See RtSigqueueinfo above. var info linux.SignalInfo if _, err := info.CopyIn(t, infoAddr); err != nil { return 0, nil, err } info.Signo = int32(sig) // Deliver to the given task. targetTG := t.PIDNamespace().ThreadGroupWithID(tgid) target := t.PIDNamespace().TaskWithID(tid) if targetTG == nil || target == nil || target.ThreadGroup() != targetTG { return 0, nil, linuxerr.ESRCH } // If the sender is not the receiver, it can't use si_codes used by the // kernel or SI_TKILL.
if (info.Code >= 0 || info.Code == linux.SI_TKILL) && target != t { return 0, nil, linuxerr.EPERM } if !mayKill(t, target, sig) { return 0, nil, linuxerr.EPERM } return 0, nil, target.SendSignal(&info) } // RtSigsuspend implements linux syscall rt_sigsuspend(2). func RtSigsuspend(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { sigset := args[0].Pointer() // Copy in the signal mask. var mask linux.SignalSet if _, err := mask.CopyIn(t, sigset); err != nil { return 0, nil, err } mask &^= kernel.UnblockableSignals // Swap the mask. oldmask := t.SignalMask() t.SetSignalMask(mask) t.SetSavedSignalMask(oldmask) // Perform the wait. return 0, nil, syserror.ConvertIntr(t.Block(nil), syserror.ERESTARTNOHAND) } // RestartSyscall implements the linux syscall restart_syscall(2). func RestartSyscall(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { if r := t.SyscallRestartBlock(); r != nil { n, err := r.Restart(t) return n, nil, err } // The restart block should never be nil here, but it's possible // ERESTART_RESTARTBLOCK was set by ptrace without the current syscall // setting up a restart block. If ptrace didn't manipulate the return value, // finding a nil restart block is a bug. Linux ensures that the restart // function is never null by (re)initializing it with one that translates // the restart into EINTR. We'll emulate that behaviour. t.Debugf("Restart block missing in restart_syscall(2). Did ptrace inject a return value of ERESTART_RESTARTBLOCK?") return 0, nil, syserror.EINTR } // sharedSignalfd is shared between the two calls. func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) { // Copy in the signal mask. mask, err := CopyInSigSet(t, sigset, sigsetsize) if err != nil { return 0, nil, err } // Always check for valid flags, even if not creating. if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 { return 0, nil, linuxerr.EINVAL } // Is this a change to an existing signalfd? // // The spec indicates that this should adjust the mask. if fd != -1 { file := t.GetFile(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Is this a signalfd? if s, ok := file.FileOperations.(*signalfd.SignalOperations); ok { s.SetMask(mask) return 0, nil, nil } // Not a signalfd. return 0, nil, linuxerr.EINVAL } // Create a new file. file, err := signalfd.New(t, mask) if err != nil { return 0, nil, err } defer file.DecRef(t) // Set appropriate flags. file.SetFlags(fs.SettableFileFlags{ NonBlocking: flags&linux.SFD_NONBLOCK != 0, }) // Create a new descriptor. fd, err = t.NewFDFrom(0, file, kernel.FDFlags{ CloseOnExec: flags&linux.SFD_CLOEXEC != 0, }) if err != nil { return 0, nil, err } // Done. return uintptr(fd), nil, nil } // Signalfd implements the linux syscall signalfd(2). func Signalfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() sigset := args[1].Pointer() sigsetsize := args[2].SizeT() return sharedSignalfd(t, fd, sigset, sigsetsize, 0) } // Signalfd4 implements the linux syscall signalfd4(2). func Signalfd4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() sigset := args[1].Pointer() sigsetsize := args[2].SizeT() flags := args[3].Int() return sharedSignalfd(t, fd, sigset, sigsetsize, flags) }
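// Illustrative sketch (not part of gVisor): the mask arithmetic performed by
// RtSigprocmask above, in standalone form. SIG_BLOCK ORs the new set into the
// old mask, SIG_UNBLOCK clears it with AND-NOT, and SIG_SETMASK replaces the
// mask wholesale. SignalSet and the how constants here are local stand-ins for
// the linux package equivalents, chosen only for the example.
package main

import "fmt"

type SignalSet uint64

const (
	sigBlock = iota
	sigUnblock
	sigSetmask
)

func applyMask(old, set SignalSet, how int) (SignalSet, error) {
	switch how {
	case sigBlock:
		return old | set, nil // block the union of both sets
	case sigUnblock:
		return old &^ set, nil // clear exactly the requested signals
	case sigSetmask:
		return set, nil // replace the mask wholesale
	default:
		return old, fmt.Errorf("invalid how: %d", how)
	}
}

func main() {
	old := SignalSet(0b1010)
	blocked, _ := applyMask(old, 0b0110, sigBlock)
	unblocked, _ := applyMask(old, 0b0010, sigUnblock)
	fmt.Printf("%04b %04b\n", blocked, unblocked) // 1110 1000
}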
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) const ( // maxPendingResolutions is the maximum number of pending link-address // resolutions. maxPendingResolutions = 64 maxPendingPacketsPerResolution = 256 ) // pendingPacketBuffer is a pending packet buffer. // // TODO(gvisor.dev/issue/5331): Drop this when we drop WritePacket and only use // WritePackets so we can use a PacketBufferList everywhere. type pendingPacketBuffer interface { len() int } func (*PacketBuffer) len() int { return 1 } func (p *PacketBufferList) len() int { return p.Len() } type pendingPacket struct { routeInfo RouteInfo proto tcpip.NetworkProtocolNumber pkt pendingPacketBuffer } // packetsPendingLinkResolution is a queue of packets pending link resolution. // // Once link resolution completes successfully, the packets will be written. type packetsPendingLinkResolution struct { nic *nic mu struct { sync.Mutex // The packets to send once the resolver completes. // // The link resolution channel is used as the key for this map. packets map[<-chan struct{}][]pendingPacket // FIFO of channels used to cancel the oldest goroutine waiting for // link-address resolution. // // cancelChans holds the same channels that are used as keys to packets. cancelChans []<-chan struct{} } } func (f *packetsPendingLinkResolution) incrementOutgoingPacketErrors(proto tcpip.NetworkProtocolNumber, pkt pendingPacketBuffer) { n := uint64(pkt.len()) f.nic.stack.stats.IP.OutgoingPacketErrors.IncrementBy(n) if ipEndpointStats, ok := f.nic.getNetworkEndpoint(proto).Stats().(IPNetworkEndpointStats); ok { ipEndpointStats.IPStats().OutgoingPacketErrors.IncrementBy(n) } } func (f *packetsPendingLinkResolution) init(nic *nic) { f.mu.Lock() defer f.mu.Unlock() f.nic = nic f.mu.packets = make(map[<-chan struct{}][]pendingPacket) } // dequeue any pending packets associated with ch. // // If err is nil, packets will be written and sent to the given remote link // address.
func (f *packetsPendingLinkResolution) dequeue(ch <-chan struct{}, linkAddr tcpip.LinkAddress, err tcpip.Error) { f.mu.Lock() packets, ok := f.mu.packets[ch] delete(f.mu.packets, ch) if ok { for i, cancelChan := range f.mu.cancelChans { if cancelChan == ch { f.mu.cancelChans = append(f.mu.cancelChans[:i], f.mu.cancelChans[i+1:]...) break } } } f.mu.Unlock() if ok { f.dequeuePackets(packets, linkAddr, err) } } // enqueue a packet to be sent once link resolution completes. // // If the maximum number of pending resolutions is reached, the packets // associated with the oldest link resolution will be dequeued as if they failed // link resolution. func (f *packetsPendingLinkResolution) enqueue(r *Route, proto tcpip.NetworkProtocolNumber, pkt pendingPacketBuffer) (int, tcpip.Error) { f.mu.Lock() // Make sure we attempt resolution while holding f's lock so that we avoid // a race where link resolution completes before we enqueue the packets. // // A @ T1: Call ResolvedFields (get link resolution channel) // B @ T2: Complete link resolution, dequeue pending packets // C @ T1: Enqueue packet that already completed link resolution (which will // never dequeue) // // To make sure B does not interleave with A and C, we make sure A and C are // done while holding the lock. routeInfo, ch, err := r.resolvedFields(nil) switch err.(type) { case nil: // The route resolved immediately, so we don't need to wait for link // resolution to send the packet. f.mu.Unlock() return f.nic.writePacketBuffer(routeInfo, proto, pkt) case *tcpip.ErrWouldBlock: // We need to wait for link resolution to complete. default: f.mu.Unlock() return 0, err } defer f.mu.Unlock() packets, ok := f.mu.packets[ch] packets = append(packets, pendingPacket{ routeInfo: routeInfo, proto: proto, pkt: pkt, }) if len(packets) > maxPendingPacketsPerResolution { f.incrementOutgoingPacketErrors(packets[0].proto, packets[0].pkt) packets[0] = pendingPacket{} packets = packets[1:] if numPackets := len(packets); numPackets != maxPendingPacketsPerResolution { panic(fmt.Sprintf("holding more queued packets than expected; got = %d, want <= %d", numPackets, maxPendingPacketsPerResolution)) } } f.mu.packets[ch] = packets if ok { return pkt.len(), nil } cancelledPackets := f.newCancelChannelLocked(ch) if len(cancelledPackets) != 0 { // Dequeue the pending packets in a new goroutine to not hold up the current // goroutine as handling link resolution failures may be a costly operation. go f.dequeuePackets(cancelledPackets, "" /* linkAddr */, &tcpip.ErrAborted{}) } return pkt.len(), nil } // newCancelChannelLocked appends the link resolution channel to a FIFO. If the // maximum number of pending resolutions is reached, the oldest channel will be // removed and its associated pending packets will be returned.
func (f *packetsPendingLinkResolution) newCancelChannelLocked(newCH <-chan struct{}) []pendingPacket { f.mu.cancelChans = append(f.mu.cancelChans, newCH) if len(f.mu.cancelChans) <= maxPendingResolutions { return nil } ch := f.mu.cancelChans[0] f.mu.cancelChans[0] = nil f.mu.cancelChans = f.mu.cancelChans[1:] if l := len(f.mu.cancelChans); l > maxPendingResolutions { panic(fmt.Sprintf("max pending resolutions reached; got %d active resolutions, max = %d", l, maxPendingResolutions)) } packets, ok := f.mu.packets[ch] if !ok { panic("must have a packet queue for an uncancelled channel") } delete(f.mu.packets, ch) return packets } func (f *packetsPendingLinkResolution) dequeuePackets(packets []pendingPacket, linkAddr tcpip.LinkAddress, err tcpip.Error) { for _, p := range packets { if err == nil { p.routeInfo.RemoteLinkAddress = linkAddr _, _ = f.nic.writePacketBuffer(p.routeInfo, p.proto, p.pkt) } else { f.incrementOutgoingPacketErrors(p.proto, p.pkt) if linkResolvableEP, ok := f.nic.getNetworkEndpoint(p.proto).(LinkResolvableNetworkEndpoint); ok { switch pkt := p.pkt.(type) { case *PacketBuffer: linkResolvableEP.HandleLinkResolutionFailure(pkt) case *PacketBufferList: for pb := pkt.Front(); pb != nil; pb = pb.Next() { linkResolvableEP.HandleLinkResolutionFailure(pb) } default: panic(fmt.Sprintf("unrecognized pending packet buffer type = %T", p.pkt)) } } } } }
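// Illustrative sketch (not part of gVisor) of the bounded bookkeeping used by
// enqueue/newCancelChannelLocked above: pending work is keyed by a completion
// channel, a FIFO remembers insertion order, and once maxPending keys exist
// the oldest key is evicted and its queued items are handed back to the caller
// for failure handling. The types and limits are simplified stand-ins.
package main

import "fmt"

const maxPending = 2

type queue struct {
	packets map[chan struct{}][]string
	fifo    []chan struct{}
}

// enqueue adds an item under ch, evicting the oldest resolution's items if
// the number of in-flight resolutions would exceed maxPending.
func (q *queue) enqueue(ch chan struct{}, item string) (evicted []string) {
	if _, ok := q.packets[ch]; !ok {
		q.fifo = append(q.fifo, ch)
		if len(q.fifo) > maxPending {
			oldest := q.fifo[0]
			q.fifo = q.fifo[1:]
			evicted = q.packets[oldest]
			delete(q.packets, oldest)
		}
	}
	q.packets[ch] = append(q.packets[ch], item)
	return evicted
}

func main() {
	q := &queue{packets: map[chan struct{}][]string{}}
	a, b, c := make(chan struct{}), make(chan struct{}), make(chan struct{})
	q.enqueue(a, "pkt1")
	q.enqueue(b, "pkt2")
	fmt.Println(q.enqueue(c, "pkt3")) // [pkt1]: a was the oldest resolution
}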
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) var fspathBuilderPool = sync.Pool{ New: func() interface{} { return &fspath.Builder{} }, } func getFSPathBuilder() *fspath.Builder { return fspathBuilderPool.Get().(*fspath.Builder) } func putFSPathBuilder(b *fspath.Builder) { // No methods can be called on b after b.String(), so reset it to its zero // value (as returned by fspathBuilderPool.New) instead. *b = fspath.Builder{} fspathBuilderPool.Put(b) } // PathnameWithDeleted returns an absolute pathname to vd, consistent with // Linux's d_path(). In particular, if vd.Dentry() has been disowned, // PathnameWithDeleted appends " (deleted)" to the returned pathname. func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { b := getFSPathBuilder() defer putFSPathBuilder(b) haveRef := false defer func() { if haveRef { vd.DecRef(ctx) } }() origD := vd.dentry loop: for { err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b) switch err.(type) { case nil: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { // genericfstree.PrependPath() will have returned // PrependPathAtVFSRootError in this case since it checks // against vfsroot before mnt.root, but other implementations // of FilesystemImpl.PrependPath() may return nil instead. break loop } nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { break loop } if haveRef { vd.DecRef(ctx) } vd = nextVD haveRef = true // continue loop case PrependPathSyntheticError: // Skip prepending "/" and appending " (deleted)". return b.String(), nil case PrependPathAtVFSRootError, PrependPathAtNonMountRootError: break loop default: return "", err } } b.PrependByte('/') if origD.IsDead() { b.AppendString(" (deleted)") } return b.String(), nil } // PathnameReachable returns an absolute pathname to vd, consistent with // Linux's __d_path() (as used by seq_path_root()).
If vfsroot.Ok() and vd is // not reachable from vfsroot, such that seq_path_root() would return SEQ_SKIP // (causing the entire containing entry to be skipped), PathnameReachable // returns ("", nil). func (vfs *VirtualFilesystem) PathnameReachable(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { b := getFSPathBuilder() defer putFSPathBuilder(b) haveRef := false defer func() { if haveRef { vd.DecRef(ctx) } }() loop: for { err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b) switch err.(type) { case nil: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { break loop } nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { return "", nil } if haveRef { vd.DecRef(ctx) } vd = nextVD haveRef = true case PrependPathAtVFSRootError: break loop case PrependPathAtNonMountRootError, PrependPathSyntheticError: return "", nil default: return "", err } } b.PrependByte('/') return b.String(), nil } // PathnameForGetcwd returns an absolute pathname to vd, consistent with // Linux's sys_getcwd(). func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) { if vd.dentry.IsDead() { return "", syserror.ENOENT } b := getFSPathBuilder() defer putFSPathBuilder(b) haveRef := false defer func() { if haveRef { vd.DecRef(ctx) } }() unreachable := false loop: for { err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b) switch err.(type) { case nil: if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry { break loop } nextVD := vfs.getMountpointAt(ctx, vd.mount, vfsroot) if !nextVD.Ok() { unreachable = true break loop } if haveRef { vd.DecRef(ctx) } vd = nextVD haveRef = true case PrependPathAtVFSRootError: break loop case PrependPathAtNonMountRootError, PrependPathSyntheticError: unreachable = true break loop default: return "", err } } b.PrependByte('/') if unreachable { b.PrependString("(unreachable)") } return b.String(), nil } // As of this writing, we do not have equivalents to: // // - d_absolute_path(), which returns EINVAL if (effectively) any call to // FilesystemImpl.PrependPath() would return PrependPathAtNonMountRootError. // // - dentry_path(), which does not walk up mounts (and only returns the path // relative to Filesystem root), but also appends "//deleted" for disowned // Dentries. // // These should be added as necessary.
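// Illustrative sketch (not part of gVisor) of why the Pathname* functions
// above build paths by *prepending*: the walk runs from the dentry up towards
// the root, so components are discovered in reverse order. Prepending each
// component and finally prepending '/' yields the absolute path without a
// reversal pass, which is what fspath.Builder's Prepend* methods enable.
package main

import "fmt"

func main() {
	// Components as discovered walking up: leaf first, root last.
	components := []string{"file.txt", "docs", "home"}
	path := ""
	for _, c := range components {
		path = "/" + c + path // prepend each component
	}
	path += " (deleted)" // suffix used when the dentry was disowned
	fmt.Println(path)    // /home/docs/file.txt (deleted)
}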
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" ) // contextID is the fs package's type for context.Context.Value keys. type contextID int const ( // CtxRoot is a Context.Value key for a Dirent. CtxRoot contextID = iota // CtxDirentCacheLimiter is a Context.Value key for DirentCacheLimiter. CtxDirentCacheLimiter ) // ContextCanAccessFile determines whether the file represented by `inode` can be accessed in the requested way // (for reading, writing, or execution) using the caller's credentials and user // namespace, as does Linux's fs/namei.c:generic_permission. func ContextCanAccessFile(ctx context.Context, inode *Inode, reqPerms PermMask) bool { creds := auth.CredentialsFromContext(ctx) uattr, err := inode.UnstableAttr(ctx) if err != nil { return false } p := uattr.Perms.Other // Are we owner or in group? if uattr.Owner.UID == creds.EffectiveKUID { p = uattr.Perms.User } else if creds.InGroup(uattr.Owner.GID) { p = uattr.Perms.Group } // Do not allow programs to be executed if MS_NOEXEC is set. if IsFile(inode.StableAttr) && reqPerms.Execute && inode.MountSource.Flags.NoExec { return false } // Are permissions satisfied without capability checks? if p.SupersetOf(reqPerms) { return true } if IsDir(inode.StableAttr) { // CAP_DAC_OVERRIDE can override any perms on directories. if inode.CheckCapability(ctx, linux.CAP_DAC_OVERRIDE) { return true } // CAP_DAC_READ_SEARCH can normally only override Read perms, // but for directories it can also override execution. if !reqPerms.Write && inode.CheckCapability(ctx, linux.CAP_DAC_READ_SEARCH) { return true } } // CAP_DAC_OVERRIDE can always override Read/Write. // Can override executable only when at least one execute bit is set. if !reqPerms.Execute || uattr.Perms.AnyExecute() { if inode.CheckCapability(ctx, linux.CAP_DAC_OVERRIDE) { return true } } // Read perms can be overridden by CAP_DAC_READ_SEARCH. if reqPerms.OnlyRead() && inode.CheckCapability(ctx, linux.CAP_DAC_READ_SEARCH) { return true } return false } // FileOwnerFromContext returns a FileOwner using the effective user and group // IDs used by ctx. func FileOwnerFromContext(ctx context.Context) FileOwner { creds := auth.CredentialsFromContext(ctx) return FileOwner{creds.EffectiveKUID, creds.EffectiveKGID} } // RootFromContext returns the root of the virtual filesystem observed by ctx, // or nil if ctx is not associated with a virtual filesystem.
If // RootFromContext returns a non-nil fs.Dirent, a reference is taken on it. func RootFromContext(ctx context.Context) *Dirent { if v := ctx.Value(CtxRoot); v != nil { return v.(*Dirent) } return nil } // DirentCacheLimiterFromContext returns the DirentCacheLimiter used by ctx, or // nil if ctx does not have a dirent cache limiter. func DirentCacheLimiterFromContext(ctx context.Context) *DirentCacheLimiter { if v := ctx.Value(CtxDirentCacheLimiter); v != nil { return v.(*DirentCacheLimiter) } return nil } type rootContext struct { context.Context root *Dirent } // WithRoot returns a copy of ctx with the given root. func WithRoot(ctx context.Context, root *Dirent) context.Context { return &rootContext{ Context: ctx, root: root, } } // Value implements Context.Value. func (rc rootContext) Value(key interface{}) interface{} { switch key { case CtxRoot: rc.root.IncRef() return rc.root default: return rc.Context.Value(key) } }
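// Illustrative sketch (not part of gVisor) of the context pattern used above:
// a package-private key type prevents collisions between packages, a wrapper
// context overrides Value for its own key, and every other lookup falls
// through to the parent. The key and value here are simplified stand-ins.
package main

import (
	"context"
	"fmt"
)

type ctxKey int

const keyRoot ctxKey = iota

type rootCtx struct {
	context.Context
	root string
}

// Value returns the stored root for keyRoot and defers to the parent context
// for every other key, mirroring fs.rootContext.Value above.
func (c rootCtx) Value(key interface{}) interface{} {
	if key == keyRoot {
		return c.root
	}
	return c.Context.Value(key)
}

func main() {
	ctx := rootCtx{Context: context.Background(), root: "/newroot"}
	fmt.Println(ctx.Value(keyRoot)) // /newroot
}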
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package p9 import ( "encoding/binary" ) // encoder is used for messages and 9P primitives. type encoder interface { // decode decodes from the given buffer. decode may be called more than once // to reuse the instance. It must clear any previous state. // // This may not fail; exhaustion will be recorded in the buffer. decode(b *buffer) // encode encodes to the given buffer. // // This may not fail. encode(b *buffer) } // order is the byte order used for encoding. var order = binary.LittleEndian // buffer is a slice that is consumed. // // This is passed to the encoder methods. type buffer struct { // data is the underlying data. This may grow during encode. data []byte // overflow indicates whether an overflow has occurred. overflow bool } // append appends n bytes to the buffer and returns a slice pointing to the // newly appended bytes. func (b *buffer) append(n int) []byte { b.data = append(b.data, make([]byte, n)...) return b.data[len(b.data)-n:] } // consume consumes n bytes from the buffer. func (b *buffer) consume(n int) ([]byte, bool) { if !b.has(n) { b.markOverrun() return nil, false } rval := b.data[:n] b.data = b.data[n:] return rval, true } // has returns true if n bytes are available. func (b *buffer) has(n int) bool { return len(b.data) >= n } // markOverrun immediately marks this buffer as overrun. // // This is used by ReadString, since some invalid data implies the rest of the // buffer is no longer valid either. func (b *buffer) markOverrun() { b.overflow = true } // isOverrun returns true if this buffer has run past the end. func (b *buffer) isOverrun() bool { return b.overflow } // Read8 reads a byte from the buffer. func (b *buffer) Read8() uint8 { v, ok := b.consume(1) if !ok { return 0 } return uint8(v[0]) } // Read16 reads a 16-bit value from the buffer.
func (b *buffer) Read16() uint16 { v, ok := b.consume(2) if !ok { return 0 } return order.Uint16(v) } // Read32 reads a 32-bit value from the buffer. func (b *buffer) Read32() uint32 { v, ok := b.consume(4) if !ok { return 0 } return order.Uint32(v) } // Read64 reads a 64-bit value from the buffer. func (b *buffer) Read64() uint64 { v, ok := b.consume(8) if !ok { return 0 } return order.Uint64(v) } // ReadQIDType reads a QIDType value. func (b *buffer) ReadQIDType() QIDType { return QIDType(b.Read8()) } // ReadTag reads a Tag value. func (b *buffer) ReadTag() Tag { return Tag(b.Read16()) } // ReadFID reads a FID value. func (b *buffer) ReadFID() FID { return FID(b.Read32()) } // ReadUID reads a UID value. func (b *buffer) ReadUID() UID { return UID(b.Read32()) } // ReadGID reads a GID value. func (b *buffer) ReadGID() GID { return GID(b.Read32()) } // ReadPermissions reads a file mode value and applies the mask for permissions. func (b *buffer) ReadPermissions() FileMode { return b.ReadFileMode() & permissionsMask } // ReadFileMode reads a file mode value. func (b *buffer) ReadFileMode() FileMode { return FileMode(b.Read32()) } // ReadOpenFlags reads an OpenFlags. func (b *buffer) ReadOpenFlags() OpenFlags { return OpenFlags(b.Read32()) } // ReadConnectFlags reads a ConnectFlags. func (b *buffer) ReadConnectFlags() ConnectFlags { return ConnectFlags(b.Read32()) } // ReadMsgType reads a MsgType. func (b *buffer) ReadMsgType() MsgType { return MsgType(b.Read8()) } // ReadString deserializes a string. func (b *buffer) ReadString() string { l := b.Read16() if !b.has(int(l)) { // Mark the buffer as corrupted. b.markOverrun() return "" } bs := make([]byte, l) for i := 0; i < int(l); i++ { bs[i] = byte(b.Read8()) } return string(bs) } // Write8 writes a byte to the buffer. func (b *buffer) Write8(v uint8) { b.append(1)[0] = byte(v) } // Write16 writes a 16-bit value to the buffer. func (b *buffer) Write16(v uint16) { order.PutUint16(b.append(2), v) } // Write32 writes a 32-bit value to the buffer. func (b *buffer) Write32(v uint32) { order.PutUint32(b.append(4), v) } // Write64 writes a 64-bit value to the buffer. func (b *buffer) Write64(v uint64) { order.PutUint64(b.append(8), v) } // WriteQIDType writes a QIDType value. func (b *buffer) WriteQIDType(qidType QIDType) { b.Write8(uint8(qidType)) } // WriteTag writes a Tag value. func (b *buffer) WriteTag(tag Tag) { b.Write16(uint16(tag)) } // WriteFID writes a FID value. func (b *buffer) WriteFID(fid FID) { b.Write32(uint32(fid)) } // WriteUID writes a UID value. func (b *buffer) WriteUID(uid UID) { b.Write32(uint32(uid)) } // WriteGID writes a GID value. func (b *buffer) WriteGID(gid GID) { b.Write32(uint32(gid)) } // WritePermissions applies a permissions mask and writes the FileMode. func (b *buffer) WritePermissions(perm FileMode) { b.WriteFileMode(perm & permissionsMask) } // WriteFileMode writes a FileMode. func (b *buffer) WriteFileMode(mode FileMode) { b.Write32(uint32(mode)) } // WriteOpenFlags writes an OpenFlags. func (b *buffer) WriteOpenFlags(flags OpenFlags) { b.Write32(uint32(flags)) } // WriteConnectFlags writes a ConnectFlags. func (b *buffer) WriteConnectFlags(flags ConnectFlags) { b.Write32(uint32(flags)) } // WriteMsgType writes a MsgType. func (b *buffer) WriteMsgType(t MsgType) { b.Write8(uint8(t)) } // WriteString serializes the given string. func (b *buffer) WriteString(s string) { b.Write16(uint16(len(s))) for i := 0; i < len(s); i++ { b.Write8(byte(s[i])) } }
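// Illustrative sketch (not part of gVisor) of the buffer discipline above: a
// little-endian append-on-write, consume-on-read byte slice whose reads return
// zero values after an overrun instead of failing, so a decoder checks the
// overrun flag once at the end instead of on every read. Names are local
// stand-ins for the p9 buffer type.
package main

import (
	"encoding/binary"
	"fmt"
)

type buf struct {
	data    []byte
	overrun bool
}

func (b *buf) write16(v uint16) {
	var tmp [2]byte
	binary.LittleEndian.PutUint16(tmp[:], v)
	b.data = append(b.data, tmp[:]...)
}

func (b *buf) read16() uint16 {
	if len(b.data) < 2 {
		b.overrun = true // remember the failure; keep returning zeros
		return 0
	}
	v := binary.LittleEndian.Uint16(b.data[:2])
	b.data = b.data[2:]
	return v
}

func main() {
	var b buf
	b.write16(0xBEEF)
	fmt.Printf("%#x overrun=%v\n", b.read16(), b.overrun) // 0xbeef overrun=false
	fmt.Printf("%#x overrun=%v\n", b.read16(), b.overrun) // 0x0 overrun=true
}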
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "fmt" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/syserror" ) // The most significant 29 bits hold either a pid or a file descriptor. func pidOfClockID(c int32) kernel.ThreadID { return kernel.ThreadID(^(c >> 3)) } // whichCPUClock returns one of CPUCLOCK_PROF, CPUCLOCK_VIRT, CPUCLOCK_SCHED or // CLOCK_FD. func whichCPUClock(c int32) int32 { return c & linux.CPUCLOCK_CLOCK_MASK } // isCPUClockPerThread returns true if the CPUCLOCK_PERTHREAD bit is set in the // clock id. func isCPUClockPerThread(c int32) bool { return c&linux.CPUCLOCK_PERTHREAD_MASK != 0 } // isValidCPUClock checks that the cpu clock id is valid. func isValidCPUClock(c int32) bool { // Bits 0, 1, and 2 cannot all be set. if c&7 == 7 { return false } if whichCPUClock(c) >= linux.CPUCLOCK_MAX { return false } return true } // targetTask returns the kernel.Task for the given clock id. func targetTask(t *kernel.Task, c int32) *kernel.Task { pid := pidOfClockID(c) if pid == 0 { return t } return t.PIDNamespace().TaskWithID(pid) } // ClockGetres implements linux syscall clock_getres(2).
func ClockGetres(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { clockID := int32(args[0].Int()) addr := args[1].Pointer() r := linux.Timespec{ Sec: 0, Nsec: 1, } if _, err := getClock(t, clockID); err != nil { return 0, nil, linuxerr.EINVAL } if addr == 0 { // Don't need to copy out. return 0, nil, nil } return 0, nil, copyTimespecOut(t, addr, &r) } type cpuClocker interface { UserCPUClock() ktime.Clock CPUClock() ktime.Clock } func getClock(t *kernel.Task, clockID int32) (ktime.Clock, error) { if clockID < 0 { if !isValidCPUClock(clockID) { return nil, linuxerr.EINVAL } targetTask := targetTask(t, clockID) if targetTask == nil { return nil, linuxerr.EINVAL } var target cpuClocker if isCPUClockPerThread(clockID) { target = targetTask } else { target = targetTask.ThreadGroup() } switch whichCPUClock(clockID) { case linux.CPUCLOCK_VIRT: return target.UserCPUClock(), nil case linux.CPUCLOCK_PROF, linux.CPUCLOCK_SCHED: // CPUCLOCK_SCHED is approximated by CPUCLOCK_PROF. return target.CPUClock(), nil default: return nil, linuxerr.EINVAL } } switch clockID { case linux.CLOCK_REALTIME, linux.CLOCK_REALTIME_COARSE: return t.Kernel().RealtimeClock(), nil case linux.CLOCK_MONOTONIC, linux.CLOCK_MONOTONIC_COARSE, linux.CLOCK_MONOTONIC_RAW, linux.CLOCK_BOOTTIME: // CLOCK_MONOTONIC approximates CLOCK_MONOTONIC_RAW. // CLOCK_BOOTTIME is internally mapped to CLOCK_MONOTONIC, as: // - CLOCK_BOOTTIME should behave as CLOCK_MONOTONIC while also // including suspend time. // - gVisor has no concept of suspend/resume. // - CLOCK_MONOTONIC already includes save/restore time, which is // the closest to suspend time. return t.Kernel().MonotonicClock(), nil case linux.CLOCK_PROCESS_CPUTIME_ID: return t.ThreadGroup().CPUClock(), nil case linux.CLOCK_THREAD_CPUTIME_ID: return t.CPUClock(), nil default: return nil, linuxerr.EINVAL } } // ClockGettime implements linux syscall clock_gettime(2). func ClockGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { clockID := int32(args[0].Int()) addr := args[1].Pointer() c, err := getClock(t, clockID) if err != nil { return 0, nil, err } ts := c.Now().Timespec() return 0, nil, copyTimespecOut(t, addr, &ts) } // ClockSettime implements linux syscall clock_settime(2). func ClockSettime(*kernel.Task, arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, linuxerr.EPERM } // Time implements linux syscall time(2). func Time(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() r := t.Kernel().RealtimeClock().Now().TimeT() if addr == hostarch.Addr(0) { return uintptr(r), nil, nil } if _, err := r.CopyOut(t, addr); err != nil { return 0, nil, err } return uintptr(r), nil, nil } // clockNanosleepRestartBlock encapsulates the state required to restart // clock_nanosleep(2) via restart_syscall(2). // // +stateify savable type clockNanosleepRestartBlock struct { c ktime.Clock end ktime.Time rem hostarch.Addr } // Restart implements kernel.SyscallRestartBlock.Restart. func (n *clockNanosleepRestartBlock) Restart(t *kernel.Task) (uintptr, error) { return 0, clockNanosleepUntil(t, n.c, n.end, n.rem, true) } // clockNanosleepUntil blocks until a specified time. // // If blocking is interrupted, the syscall is restarted with the original // arguments. 
func clockNanosleepUntil(t *kernel.Task, c ktime.Clock, end ktime.Time, rem hostarch.Addr, needRestartBlock bool) error { notifier, tchan := ktime.NewChannelNotifier() timer := ktime.NewTimer(c, notifier) // Turn on the timer. timer.Swap(ktime.Setting{ Period: 0, Enabled: true, Next: end, }) err := t.BlockWithTimer(nil, tchan) timer.Destroy() switch { case linuxerr.Equals(linuxerr.ETIMEDOUT, err): // Slept for entire timeout. return nil case err == syserror.ErrInterrupted: // Interrupted. remaining := end.Sub(c.Now()) if remaining <= 0 { return nil } // Copy out remaining time. if rem != 0 { timeleft := linux.NsecToTimespec(remaining.Nanoseconds()) if err := copyTimespecOut(t, rem, &timeleft); err != nil { return err } } if needRestartBlock { // Arrange for a restart with the remaining duration. t.SetSyscallRestartBlock(&clockNanosleepRestartBlock{ c: c, end: end, rem: rem, }) return syserror.ERESTART_RESTARTBLOCK } return syserror.ERESTARTNOHAND default: panic(fmt.Sprintf("Impossible BlockWithTimer error %v", err)) } } // Nanosleep implements linux syscall nanosleep(2). func Nanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() rem := args[1].Pointer() ts, err := copyTimespecIn(t, addr) if err != nil { return 0, nil, err } if !ts.Valid() { return 0, nil, linuxerr.EINVAL } // Just like linux, we cap the timeout with the max number that int64 can // represent, which is roughly 292 years. dur := time.Duration(ts.ToNsecCapped()) * time.Nanosecond c := t.Kernel().MonotonicClock() return 0, nil, clockNanosleepUntil(t, c, c.Now().Add(dur), rem, true) } // ClockNanosleep implements linux syscall clock_nanosleep(2). func ClockNanosleep(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { clockID := int32(args[0].Int()) flags := args[1].Int() addr := args[2].Pointer() rem := args[3].Pointer() req, err := copyTimespecIn(t, addr) if err != nil { return 0, nil, err } if !req.Valid() { return 0, nil, linuxerr.EINVAL } // Only allow clock constants also allowed by Linux. if clockID > 0 { if clockID != linux.CLOCK_REALTIME && clockID != linux.CLOCK_MONOTONIC && clockID != linux.CLOCK_PROCESS_CPUTIME_ID { return 0, nil, linuxerr.EINVAL } } c, err := getClock(t, clockID) if err != nil { return 0, nil, err } if flags&linux.TIMER_ABSTIME != 0 { return 0, nil, clockNanosleepUntil(t, c, ktime.FromTimespec(req), 0, false) } dur := time.Duration(req.ToNsecCapped()) * time.Nanosecond return 0, nil, clockNanosleepUntil(t, c, c.Now().Add(dur), rem, true) } // Gettimeofday implements linux syscall gettimeofday(2). func Gettimeofday(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tv := args[0].Pointer() tz := args[1].Pointer() if tv != hostarch.Addr(0) { nowTv := t.Kernel().RealtimeClock().Now().Timeval() if err := copyTimevalOut(t, tv, &nowTv); err != nil { return 0, nil, err } } if tz != hostarch.Addr(0) { // Ask the time package for the timezone. _, offset := time.Now().Zone() // This int32 array mimics linux's struct timezone. timezone := []int32{-int32(offset) / 60, 0} _, err := primitive.CopyInt32SliceOut(t, tz, timezone) return 0, nil, err } return 0, nil, nil }
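// Illustrative sketch (not part of gVisor) of the POSIX CPU-clock ID decoding
// performed by pidOfClockID/whichCPUClock/isCPUClockPerThread above. Negative
// clock IDs pack a pid and a clock type into one int32: the pid in the upper
// 29 bits (stored as ^pid shifted left by 3, hence ^(c >> 3) to recover it),
// the clock type in bits 0-1, and the per-thread flag in bit 2. The mask
// values below mirror Linux's CPUCLOCK_* constants.
package main

import "fmt"

const (
	clockMask     = 3 // CPUCLOCK_CLOCK_MASK
	perThreadMask = 4 // CPUCLOCK_PERTHREAD_MASK
)

func decode(c int32) (pid int32, clock int32, perThread bool) {
	return ^(c >> 3), c & clockMask, c&perThreadMask != 0
}

func main() {
	// Encode a clock ID for pid 1234, clock type 1 (CPUCLOCK_VIRT), per-process.
	c := (^int32(1234) << 3) | 1
	pid, clock, perThread := decode(c)
	fmt.Println(pid, clock, perThread) // 1234 1 false
}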
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // Membarrier implements syscall membarrier(2). func Membarrier(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { cmd := args[0].Int() flags := args[1].Uint() switch cmd { case linux.MEMBARRIER_CMD_QUERY: if flags != 0 { return 0, nil, linuxerr.EINVAL } var supportedCommands uintptr if t.Kernel().Platform.HaveGlobalMemoryBarrier() { supportedCommands |= linux.MEMBARRIER_CMD_GLOBAL | linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED | linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED | linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED | linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED } if t.RSeqAvailable() { supportedCommands |= linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ | linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ } return supportedCommands, nil, nil case linux.MEMBARRIER_CMD_GLOBAL, linux.MEMBARRIER_CMD_GLOBAL_EXPEDITED, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED: if flags != 0 { return 0, nil, linuxerr.EINVAL } if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { return 0, nil, linuxerr.EINVAL } if cmd == linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED && !t.MemoryManager().IsMembarrierPrivateEnabled() { return 0, nil, linuxerr.EPERM } return 0, nil, t.Kernel().Platform.GlobalMemoryBarrier() case linux.MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: if flags != 0 { return 0, nil, linuxerr.EINVAL } if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { return 0, nil, linuxerr.EINVAL } // no-op return 0, nil, nil case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: if flags != 0 { return 0, nil, linuxerr.EINVAL } if !t.Kernel().Platform.HaveGlobalMemoryBarrier() { return 0, nil, linuxerr.EINVAL } t.MemoryManager().EnableMembarrierPrivate() return 0, nil, nil case linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: if flags&^linux.MEMBARRIER_CMD_FLAG_CPU != 0 { return 0, nil, linuxerr.EINVAL } if !t.RSeqAvailable() { return 0, nil, linuxerr.EINVAL } if !t.MemoryManager().IsMembarrierRSeqEnabled() { return 0, nil, linuxerr.EPERM } // MEMBARRIER_CMD_FLAG_CPU and cpu_id are ignored since we don't have // the ability to preempt specific CPUs. return 0, nil, t.Kernel().Platform.PreemptAllCPUs() case linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: if flags != 0 { return 0, nil, linuxerr.EINVAL } if !t.RSeqAvailable() { return 0, nil, linuxerr.EINVAL } t.MemoryManager().EnableMembarrierRSeq() return 0, nil, nil default: // Probably a command we don't implement.
t.Kernel().EmitUnimplementedEvent(t) return 0, nil, linuxerr.EINVAL } }
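// Illustrative sketch (not part of gVisor) of the membarrier dispatch shape
// above: MEMBARRIER_CMD_QUERY returns a bitmask of supported commands, and a
// command not advertised in that mask fails with EINVAL. The constants and
// the membarrier function below are simplified local stand-ins, not the real
// ABI surface.
package main

import (
	"errors"
	"fmt"
)

const (
	cmdQuery            = 0
	cmdGlobal           = 1 << 0
	cmdPrivateExpedited = 1 << 3
)

var errInval = errors.New("EINVAL")

// membarrier mimics the dispatch: QUERY reports support; anything the kernel
// did not advertise is rejected.
func membarrier(cmd, supported int) (int, error) {
	switch cmd {
	case cmdQuery:
		return supported, nil
	default:
		if cmd&supported == 0 {
			return 0, errInval
		}
		return 0, nil // the barrier itself would run here
	}
}

func main() {
	supported, _ := membarrier(cmdQuery, cmdGlobal|cmdPrivateExpedited)
	fmt.Printf("supported bitmask: %#x\n", supported)
	_, err := membarrier(1<<5, supported)
	fmt.Println(err) // EINVAL: command not advertised by QUERY
}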
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package uevent provides a NETLINK_KOBJECT_UEVENT socket protocol. // // NETLINK_KOBJECT_UEVENT sockets send udev-style device events. gVisor does // not support any device events, so these sockets never send any messages. package uevent import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netlink" "gvisor.dev/gvisor/pkg/syserr" ) // Protocol implements netlink.Protocol. // // +stateify savable type Protocol struct{} var _ netlink.Protocol = (*Protocol)(nil) // NewProtocol creates a NETLINK_KOBJECT_UEVENT netlink.Protocol. func NewProtocol(t *kernel.Task) (netlink.Protocol, *syserr.Error) { return &Protocol{}, nil } // Protocol implements netlink.Protocol.Protocol. func (p *Protocol) Protocol() int { return linux.NETLINK_KOBJECT_UEVENT } // CanSend implements netlink.Protocol.CanSend. func (p *Protocol) CanSend() bool { return false } // ProcessMessage implements netlink.Protocol.ProcessMessage. func (p *Protocol) ProcessMessage(ctx context.Context, msg *netlink.Message, ms *netlink.MessageSet) *syserr.Error { // Silently ignore all messages. return nil } // init registers the NETLINK_KOBJECT_UEVENT provider. func init() { netlink.RegisterProvider(linux.NETLINK_KOBJECT_UEVENT, NewProtocol) }
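// Illustrative sketch (not part of gVisor) of the provider-registration
// pattern used by init() above: protocol modules register a constructor in a
// package-level table at init time, and the socket layer later looks the
// constructor up by protocol number. The types here are simplified stand-ins;
// 15 is Linux's NETLINK_KOBJECT_UEVENT value.
package main

import "fmt"

type protocol interface{ Protocol() int }

type provider func() protocol

var providers = map[int]provider{}

func registerProvider(proto int, p provider) { providers[proto] = p }

type ueventProtocol struct{}

func (ueventProtocol) Protocol() int { return 15 } // NETLINK_KOBJECT_UEVENT

func init() {
	registerProvider(15, func() protocol { return ueventProtocol{} })
}

func main() {
	p := providers[15]()
	fmt.Println(p.Protocol()) // 15
}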
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netfilter import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // emptyIPv6Filter is for comparison with a rule's filters to determine whether // it is also empty. It is immutable. var emptyIPv6Filter = stack.IPHeaderFilter{ Dst: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", DstMask: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", Src: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", SrcMask: "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", } // convertNetstackToBinary6 converts the ip6tables as stored in netstack to the // format expected by the iptables tool. Linux stores each table as a binary // blob that can only be traversed by parsing a little data, reading some // offsets, jumping to those offsets, parsing again, etc. func convertNetstackToBinary6(stk *stack.Stack, tablename linux.TableName) (linux.KernelIP6TGetEntries, linux.IPTGetinfo, error) { // The table name has to fit in the struct. if linux.XT_TABLE_MAXNAMELEN < len(tablename) { return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("table name %q too long", tablename) } id, ok := nameToID[tablename.String()] if !ok { return linux.KernelIP6TGetEntries{}, linux.IPTGetinfo{}, fmt.Errorf("couldn't find table %q", tablename) } // Set up the info struct, which is the same in IPv4 and IPv6.
entries, info := getEntries6(stk.IPTables().GetTable(id, true), tablename) return entries, info, nil } func getEntries6(table stack.Table, tablename linux.TableName) (linux.KernelIP6TGetEntries, linux.IPTGetinfo) { var info linux.IPTGetinfo var entries linux.KernelIP6TGetEntries copy(info.Name[:], tablename[:]) copy(entries.Name[:], info.Name[:]) info.ValidHooks = table.ValidHooks() for ruleIdx, rule := range table.Rules { nflog("convert to binary: current offset: %d", entries.Size) setHooksAndUnderflow(&info, table, entries.Size, ruleIdx) // Each rule corresponds to an entry. entry := linux.KernelIP6TEntry{ Entry: linux.IP6TEntry{ IPv6: linux.IP6TIP{ Protocol: uint16(rule.Filter.Protocol), }, NextOffset: linux.SizeOfIP6TEntry, TargetOffset: linux.SizeOfIP6TEntry, }, } copy(entry.Entry.IPv6.Dst[:], rule.Filter.Dst) copy(entry.Entry.IPv6.DstMask[:], rule.Filter.DstMask) copy(entry.Entry.IPv6.Src[:], rule.Filter.Src) copy(entry.Entry.IPv6.SrcMask[:], rule.Filter.SrcMask) copy(entry.Entry.IPv6.OutputInterface[:], rule.Filter.OutputInterface) copy(entry.Entry.IPv6.OutputInterfaceMask[:], rule.Filter.OutputInterfaceMask) copy(entry.Entry.IPv6.InputInterface[:], rule.Filter.InputInterface) copy(entry.Entry.IPv6.InputInterfaceMask[:], rule.Filter.InputInterfaceMask) if rule.Filter.DstInvert { entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_DSTIP } if rule.Filter.SrcInvert { entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_SRCIP } if rule.Filter.OutputInterfaceInvert { entry.Entry.IPv6.InverseFlags |= linux.IP6T_INV_VIA_OUT } if rule.Filter.CheckProtocol { entry.Entry.IPv6.Flags |= linux.IP6T_F_PROTO } for _, matcher := range rule.Matchers { // Serialize the matcher and add it to the // entry. serialized := marshalMatcher(matcher) nflog("convert to binary: matcher serialized as: %v", serialized) if len(serialized)%8 != 0 { panic(fmt.Sprintf("matcher %T is not 64-bit aligned", matcher)) } entry.Elems = append(entry.Elems, serialized...) entry.Entry.NextOffset += uint16(len(serialized)) entry.Entry.TargetOffset += uint16(len(serialized)) } // Serialize and append the target. serialized := marshalTarget(rule.Target) if len(serialized)%8 != 0 { panic(fmt.Sprintf("target %T is not 64-bit aligned", rule.Target)) } entry.Elems = append(entry.Elems, serialized...) entry.Entry.NextOffset += uint16(len(serialized)) nflog("convert to binary: adding entry: %+v", entry) entries.Size += uint32(entry.Entry.NextOffset) entries.Entrytable = append(entries.Entrytable, entry) info.NumEntries++ } info.Size = entries.Size nflog("convert to binary: finished with a marshalled size of %d", info.Size) return entries, info } func modifyEntries6(task *kernel.Task, stk *stack.Stack, optVal []byte, replace *linux.IPTReplace, table *stack.Table) (map[uint32]int, *syserr.Error) { nflog("set entries: setting entries in table %q", replace.Name.String()) // Convert input into a list of rules and their offsets. var offset uint32 // offsets maps rule byte offsets to their position in table.Rules. offsets := map[uint32]int{} for entryIdx := uint32(0); entryIdx < replace.NumEntries; entryIdx++ { nflog("set entries: processing entry at offset %d", offset) // Get the struct ipt_entry.
if len(optVal) < linux.SizeOfIP6TEntry { nflog("optVal has insufficient size for entry: only %d bytes remain", len(optVal)) return nil, syserr.ErrInvalidArgument } var entry linux.IP6TEntry entry.UnmarshalUnsafe(optVal[:entry.SizeBytes()]) initialOptValLen := len(optVal) optVal = optVal[entry.SizeBytes():] if entry.TargetOffset < linux.SizeOfIP6TEntry { nflog("entry has too-small target offset %d", entry.TargetOffset) return nil, syserr.ErrInvalidArgument } filter, err := filterFromIP6TIP(entry.IPv6) if err != nil { nflog("bad iptip: %v", err) return nil, syserr.ErrInvalidArgument } // Get matchers. matchersSize := entry.TargetOffset - linux.SizeOfIP6TEntry if len(optVal) < int(matchersSize) { nflog("entry doesn't have enough room for its matchers (only %d bytes remain)", len(optVal)) return nil, syserr.ErrInvalidArgument } matchers, err := parseMatchers(task, filter, optVal[:matchersSize]) if err != nil { nflog("failed to parse matchers: %v", err) return nil, syserr.ErrInvalidArgument } optVal = optVal[matchersSize:] // Get the target of the rule. targetSize := entry.NextOffset - entry.TargetOffset if len(optVal) < int(targetSize) { nflog("entry doesn't have enough room for its target (only %d bytes remain)", len(optVal)) return nil, syserr.ErrInvalidArgument } rule := stack.Rule{ Filter: filter, Matchers: matchers, } { target, err := parseTarget(filter, optVal[:targetSize], true /* ipv6 */) if err != nil { nflog("failed to parse target: %v", err) return nil, err } rule.Target = target } optVal = optVal[targetSize:] table.Rules = append(table.Rules, rule) offsets[offset] = int(entryIdx) offset += uint32(entry.NextOffset) if initialOptValLen-len(optVal) != int(entry.NextOffset) { nflog("entry NextOffset is %d, but entry took up %d bytes", entry.NextOffset, initialOptValLen-len(optVal)) return nil, syserr.ErrInvalidArgument } } return offsets, nil } func filterFromIP6TIP(iptip linux.IP6TIP) (stack.IPHeaderFilter, error) { if containsUnsupportedFields6(iptip) { return stack.IPHeaderFilter{}, fmt.Errorf("unsupported fields in struct iptip: %+v", iptip) } if len(iptip.Dst) != header.IPv6AddressSize || len(iptip.DstMask) != header.IPv6AddressSize { return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of destination (%d) and/or destination mask (%d) fields", len(iptip.Dst), len(iptip.DstMask)) } if len(iptip.Src) != header.IPv6AddressSize || len(iptip.SrcMask) != header.IPv6AddressSize { return stack.IPHeaderFilter{}, fmt.Errorf("incorrect length of source (%d) and/or source mask (%d) fields", len(iptip.Src), len(iptip.SrcMask)) } return stack.IPHeaderFilter{ Protocol: tcpip.TransportProtocolNumber(iptip.Protocol), // In ip6tables a flag controls whether to check the protocol.
CheckProtocol: iptip.Flags&linux.IP6T_F_PROTO != 0, Dst: tcpip.Address(iptip.Dst[:]), DstMask: tcpip.Address(iptip.DstMask[:]), DstInvert: iptip.InverseFlags&linux.IP6T_INV_DSTIP != 0, Src: tcpip.Address(iptip.Src[:]), SrcMask: tcpip.Address(iptip.SrcMask[:]), SrcInvert: iptip.InverseFlags&linux.IP6T_INV_SRCIP != 0, InputInterface: string(trimNullBytes(iptip.InputInterface[:])), InputInterfaceMask: string(trimNullBytes(iptip.InputInterfaceMask[:])), InputInterfaceInvert: iptip.InverseFlags&linux.IP6T_INV_VIA_IN != 0, OutputInterface: string(trimNullBytes(iptip.OutputInterface[:])), OutputInterfaceMask: string(trimNullBytes(iptip.OutputInterfaceMask[:])), OutputInterfaceInvert: iptip.InverseFlags&linux.IP6T_INV_VIA_OUT != 0, }, nil } func containsUnsupportedFields6(iptip linux.IP6TIP) bool { // The following features are supported: // - Protocol // - Dst and DstMask // - Src and SrcMask // - The inverse destination IP check flag // - InputInterface, InputInterfaceMask and its inverse. // - OutputInterface, OutputInterfaceMask and its inverse. const flagMask = linux.IP6T_F_PROTO // Disable any supported inverse flags. const inverseMask = linux.IP6T_INV_DSTIP | linux.IP6T_INV_SRCIP | linux.IP6T_INV_VIA_IN | linux.IP6T_INV_VIA_OUT return iptip.Flags&^flagMask != 0 || iptip.InverseFlags&^inverseMask != 0 || iptip.TOS != 0 }
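// Illustrative sketch (not part of gVisor) of the ip6tables entry layout
// bookkeeping above: every entry starts with a fixed-size header, matchers
// follow immediately (TargetOffset = header + matchers), the target comes
// last (NextOffset = TargetOffset + target), and each variable-length blob
// must stay 8-byte aligned so the table can be walked by offsets alone.
// entryHeaderSize is an assumed stand-in for linux.SizeOfIP6TEntry's role.
package main

import "fmt"

const entryHeaderSize = 56 // stand-in value, not the real struct size

func offsets(matcherSizes []int, targetSize int) (targetOff, nextOff int, err error) {
	targetOff = entryHeaderSize
	for _, s := range matcherSizes {
		if s%8 != 0 {
			return 0, 0, fmt.Errorf("matcher size %d is not 64-bit aligned", s)
		}
		targetOff += s
	}
	if targetSize%8 != 0 {
		return 0, 0, fmt.Errorf("target size %d is not 64-bit aligned", targetSize)
	}
	return targetOff, targetOff + targetSize, nil
}

func main() {
	to, no, err := offsets([]int{48, 32}, 40)
	fmt.Println(to, no, err) // 136 176 <nil>
}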
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64

package arch

const restartSyscallNr = uintptr(219)

// SyscallSaveOrig saves the value of the register which is clobbered in the
// syscall handler (doSyscall()).
//
// Noop on x86.
func (c *context64) SyscallSaveOrig() {
}

// SyscallNo returns the syscall number according to the 64-bit convention.
func (c *context64) SyscallNo() uintptr {
	return uintptr(c.Regs.Orig_rax)
}

// SyscallArgs provides syscall arguments according to the 64-bit convention.
//
// Due to the way addresses are mapped for the sentry this binary *must* be
// built in 64-bit mode. So we can just assume the syscall numbers that come
// back match the expected host system call numbers.
func (c *context64) SyscallArgs() SyscallArguments {
	return SyscallArguments{
		SyscallArgument{Value: uintptr(c.Regs.Rdi)},
		SyscallArgument{Value: uintptr(c.Regs.Rsi)},
		SyscallArgument{Value: uintptr(c.Regs.Rdx)},
		SyscallArgument{Value: uintptr(c.Regs.R10)},
		SyscallArgument{Value: uintptr(c.Regs.R8)},
		SyscallArgument{Value: uintptr(c.Regs.R9)},
	}
}

// RestartSyscall implements Context.RestartSyscall.
func (c *context64) RestartSyscall() {
	c.Regs.Rip -= SyscallWidth
	c.Regs.Rax = c.Regs.Orig_rax
}

// RestartSyscallWithRestartBlock implements Context.RestartSyscallWithRestartBlock.
func (c *context64) RestartSyscallWithRestartBlock() {
	c.Regs.Rip -= SyscallWidth
	c.Regs.Rax = uint64(restartSyscallNr)
}
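// Illustrative note (not part of the original source): the mapping above is
// the standard x86-64 Linux convention of rdi, rsi, rdx, r10, r8, r9 for
// arguments one through six (r10 stands in for the userspace rcx, which the
// syscall instruction clobbers). For a hypothetical write(2), i.e.
// Orig_rax == 1:
//
//	args := c.SyscallArgs()
//	fd, buf, count := args[0].Value, args[1].Value, args[2].Value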
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs2

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
)

const (
	memfdPrefix     = "memfd:"
	memfdMaxNameLen = linux.NAME_MAX - len(memfdPrefix)
	memfdAllFlags   = uint32(linux.MFD_CLOEXEC | linux.MFD_ALLOW_SEALING)
)

// MemfdCreate implements the linux syscall memfd_create(2).
func MemfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	flags := args[1].Uint()

	if flags&^memfdAllFlags != 0 {
		// Unknown bits in flags.
		return 0, nil, linuxerr.EINVAL
	}

	allowSeals := flags&linux.MFD_ALLOW_SEALING != 0
	cloExec := flags&linux.MFD_CLOEXEC != 0

	name, err := t.CopyInString(addr, memfdMaxNameLen)
	if err != nil {
		return 0, nil, err
	}

	shmMount := t.Kernel().ShmMount()
	file, err := tmpfs.NewMemfd(t, t.Credentials(), shmMount, allowSeals, memfdPrefix+name)
	if err != nil {
		return 0, nil, err
	}
	defer file.DecRef(t)

	fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{
		CloseOnExec: cloExec,
	})
	if err != nil {
		return 0, nil, err
	}

	return uintptr(fd), nil, nil
}
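// Illustrative usage note (not part of the original source): an application
// calling memfd_create("scratch", MFD_CLOEXEC) reaches this handler with
// args[1].Uint() == MFD_CLOEXEC; the resulting tmpfs-backed file appears in
// /proc/self/fd with the name "memfd:scratch", and since MFD_ALLOW_SEALING
// was not requested, later attempts to add seals with fcntl(F_ADD_SEALS)
// should fail.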
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package ipv6 contains the implementation of the ipv6 network protocol.
package ipv6

import (
	"encoding/binary"
	"fmt"
	"hash/fnv"
	"math"
	"reflect"
	"sort"
	"sync/atomic"
	"time"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/header/parse"
	"gvisor.dev/gvisor/pkg/tcpip/network/hash"
	"gvisor.dev/gvisor/pkg/tcpip/network/internal/fragmentation"
	"gvisor.dev/gvisor/pkg/tcpip/network/internal/ip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

const (
	// ReassembleTimeout controls how long a fragment will be held.
	// As per RFC 8200 section 4.5:
	//
	// If insufficient fragments are received to complete reassembly of a packet
	// within 60 seconds of the reception of the first-arriving fragment of that
	// packet, reassembly of that packet must be abandoned.
	//
	// Linux also uses 60 seconds for reassembly timeout:
	// https://github.com/torvalds/linux/blob/47ec5303d73ea344e84f46660fff693c57641386/include/net/ipv6.h#L456
	ReassembleTimeout = 60 * time.Second

	// ProtocolNumber is the ipv6 protocol number.
	ProtocolNumber = header.IPv6ProtocolNumber

	// maxPayloadSize is the maximum size that can be encoded in the 16-bit
	// PayloadLength field of the ipv6 header.
	maxPayloadSize = 0xffff

	// DefaultTTL is the default hop limit for IPv6 Packets egressed by
	// Netstack.
	DefaultTTL = 64

	// buckets is the number of buckets for fragment identifiers.
	buckets = 2048
)

const (
	forwardingDisabled = 0
	forwardingEnabled  = 1
)

// policyTable is the default policy table defined in RFC 6724 section 2.1.
//
// A more human-readable version:
//
//	Prefix        Precedence Label
//	::1/128       50         0
//	::/0          40         1
//	::ffff:0:0/96 35         4
//	2002::/16     30         2
//	2001::/32     5          5
//	fc00::/7      3          13
//	::/96         1          3
//	fec0::/10     1          11
//	3ffe::/16     1          12
//
// The table is sorted by prefix length so longest-prefix match can be easily
// achieved.
//
// We deliberately left out ::/96, fec0::/10 and 3ffe::/16 since those prefix
// assignments are deprecated.
//
// As per RFC 4291 section 2.5.5.1 (for ::/96),
//
// The "IPv4-Compatible IPv6 address" is now deprecated because the
// current IPv6 transition mechanisms no longer use these addresses.
// New or updated implementations are not required to support this
// address type.
//
// As per RFC 3879 section 4 (for fec0::/10),
//
// This document formally deprecates the IPv6 site-local unicast prefix
// defined in [RFC3513], i.e., 1111111011 binary or FEC0::/10.
// // As per RFC 3701 section 1 (for 3ffe::/16), // // As clearly stated in [TEST-NEW], the addresses for the 6bone are // temporary and will be reclaimed in the future. It further states // that all users of these addresses (within the 3FFE::/16 prefix) will // be required to renumber at some time in the future. // // and section 2, // // Thus after the pTLA allocation cutoff date January 1, 2004, it is // REQUIRED that no new 6bone 3FFE pTLAs be allocated. // // MUST NOT BE MODIFIED. var policyTable = [...]struct { subnet tcpip.Subnet label uint8 }{ // ::1/128 { subnet: header.IPv6Loopback.WithPrefix().Subnet(), label: 0, }, // ::ffff:0:0/96 { subnet: header.IPv4MappedIPv6Subnet, label: 4, }, // 2001::/32 (Teredo prefix as per RFC 4380 section 2.6). { subnet: tcpip.AddressWithPrefix{ Address: "\x20\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", PrefixLen: 32, }.Subnet(), label: 5, }, // 2002::/16 (6to4 prefix as per RFC 3056 section 2). { subnet: tcpip.AddressWithPrefix{ Address: "\x20\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", PrefixLen: 16, }.Subnet(), label: 2, }, // fc00::/7 (Unique local addresses as per RFC 4193 section 3.1). { subnet: tcpip.AddressWithPrefix{ Address: "\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", PrefixLen: 7, }.Subnet(), label: 13, }, // ::/0 { subnet: header.IPv6EmptySubnet, label: 1, }, } func getLabel(addr tcpip.Address) uint8 { for _, p := range policyTable { if p.subnet.Contains(addr) { return p.label } } panic(fmt.Sprintf("should have a label for address = %s", addr)) } var _ stack.DuplicateAddressDetector = (*endpoint)(nil) var _ stack.LinkAddressResolver = (*endpoint)(nil) var _ stack.LinkResolvableNetworkEndpoint = (*endpoint)(nil) var _ stack.ForwardingNetworkEndpoint = (*endpoint)(nil) var _ stack.GroupAddressableEndpoint = (*endpoint)(nil) var _ stack.AddressableEndpoint = (*endpoint)(nil) var _ stack.NetworkEndpoint = (*endpoint)(nil) var _ stack.NDPEndpoint = (*endpoint)(nil) var _ NDPEndpoint = (*endpoint)(nil) type endpoint struct { nic stack.NetworkInterface dispatcher stack.TransportDispatcher protocol *protocol stats sharedStats // enabled is set to 1 when the endpoint is enabled and 0 when it is // disabled. // // Must be accessed using atomic operations. enabled uint32 // forwarding is set to forwardingEnabled when the endpoint has forwarding // enabled and forwardingDisabled when it is disabled. // // Must be accessed using atomic operations. forwarding uint32 mu struct { sync.RWMutex addressableEndpointState stack.AddressableEndpointState ndp ndpState mld mldState } // dad is used to check if an arbitrary address is already assigned to some // neighbor. // // Note: this is different from mu.ndp.dad which is used to perform DAD for // addresses that are assigned to the interface. Removing an address aborts // DAD; if we had used the same state, handlers for a removed address would // not be called with the actual DAD result. // // LOCK ORDERING: mu > dad.mu. dad struct { mu struct { sync.Mutex dad ip.DAD } } } // NICNameFromID is a function that returns a stable name for the specified NIC, // even if different NIC IDs are used to refer to the same NIC in different // program runs. It is used when generating opaque interface identifiers (IIDs). // If the NIC was created with a name, it is passed to NICNameFromID. // // NICNameFromID SHOULD return unique NIC names so unique opaque IIDs are // generated for the same prefix on different NICs. 
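// For example (an illustrative sketch, not from the original source), a stack
// whose NIC names are already stable across runs could use the name directly:
//
//	var nameFromID NICNameFromID = func(_ tcpip.NICID, name string) string {
//		return name
//	}
//
// whereas keying on the NICID alone is only safe if IDs never change between
// runs.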
type NICNameFromID func(tcpip.NICID, string) string // OpaqueInterfaceIdentifierOptions holds the options related to the generation // of opaque interface identifiers (IIDs) as defined by RFC 7217. type OpaqueInterfaceIdentifierOptions struct { // NICNameFromID is a function that returns a stable name for a specified NIC, // even if the NIC ID changes over time. // // Must be specified to generate the opaque IID. NICNameFromID NICNameFromID // SecretKey is a pseudo-random number used as the secret key when generating // opaque IIDs as defined by RFC 7217. The key SHOULD be at least // header.OpaqueIIDSecretKeyMinBytes bytes and MUST follow minimum randomness // requirements for security as outlined by RFC 4086. SecretKey MUST NOT // change between program runs, unless explicitly changed. // // OpaqueInterfaceIdentifierOptions takes ownership of SecretKey. SecretKey // MUST NOT be modified after Stack is created. // // May be nil, but a nil value is highly discouraged to maintain // some level of randomness between nodes. SecretKey []byte } // CheckDuplicateAddress implements stack.DuplicateAddressDetector. func (e *endpoint) CheckDuplicateAddress(addr tcpip.Address, h stack.DADCompletionHandler) stack.DADCheckAddressDisposition { e.dad.mu.Lock() defer e.dad.mu.Unlock() return e.dad.mu.dad.CheckDuplicateAddressLocked(addr, h) } // SetDADConfigurations implements stack.DuplicateAddressDetector. func (e *endpoint) SetDADConfigurations(c stack.DADConfigurations) { e.mu.Lock() defer e.mu.Unlock() e.dad.mu.Lock() defer e.dad.mu.Unlock() e.mu.ndp.dad.SetConfigsLocked(c) e.dad.mu.dad.SetConfigsLocked(c) } // DuplicateAddressProtocol implements stack.DuplicateAddressDetector. func (*endpoint) DuplicateAddressProtocol() tcpip.NetworkProtocolNumber { return ProtocolNumber } // HandleLinkResolutionFailure implements stack.LinkResolvableNetworkEndpoint. func (e *endpoint) HandleLinkResolutionFailure(pkt *stack.PacketBuffer) { // If we are operating as a router, we should return an ICMP error to the // original packet's sender. if pkt.NetworkPacketInfo.IsForwardedPacket { // TODO(gvisor.dev/issue/6005): Propagate asynchronously generated ICMP // errors to local endpoints. e.protocol.returnError(&icmpReasonHostUnreachable{}, pkt) e.stats.ip.Forwarding.Errors.Increment() e.stats.ip.Forwarding.HostUnreachable.Increment() return } // handleControl expects the entire offending packet to be in the packet // buffer's data field. pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()), }) pkt.NICID = e.nic.ID() pkt.NetworkProtocolNumber = ProtocolNumber e.handleControl(&icmpv6DestinationAddressUnreachableSockError{}, pkt) } // onAddressAssignedLocked handles an address being assigned. // // Precondition: e.mu must be exclusively locked. func (e *endpoint) onAddressAssignedLocked(addr tcpip.Address) { // As per RFC 2710 section 3, // // All MLD messages described in this document are sent with a link-local // IPv6 Source Address, ... // // If we just completed DAD for a link-local address, then attempt to send any // queued MLD reports. Note, we may have sent reports already for some of the // groups before we had a valid link-local address to use as the source for // the MLD messages, but that was only so that MLD snooping switches are aware // of our membership to groups - routers would not have handled those reports. 
// // As per RFC 3590 section 4, // // MLD Report and Done messages are sent with a link-local address as // the IPv6 source address, if a valid address is available on the // interface. If a valid link-local address is not available (e.g., one // has not been configured), the message is sent with the unspecified // address (::) as the IPv6 source address. // // Once a valid link-local address is available, a node SHOULD generate // new MLD Report messages for all multicast addresses joined on the // interface. // // Routers receiving an MLD Report or Done message with the unspecified // address as the IPv6 source address MUST silently discard the packet // without taking any action on the packets contents. // // Snooping switches MUST manage multicast forwarding state based on MLD // Report and Done messages sent with the unspecified address as the // IPv6 source address. if header.IsV6LinkLocalUnicastAddress(addr) { e.mu.mld.sendQueuedReports() } } // InvalidateDefaultRouter implements stack.NDPEndpoint. func (e *endpoint) InvalidateDefaultRouter(rtr tcpip.Address) { e.mu.Lock() defer e.mu.Unlock() // We represent default routers with a default (off-link) route through the // router. e.mu.ndp.invalidateOffLinkRoute(offLinkRoute{dest: header.IPv6EmptySubnet, router: rtr}) } // SetNDPConfigurations implements NDPEndpoint. func (e *endpoint) SetNDPConfigurations(c NDPConfigurations) { c.validate() e.mu.Lock() defer e.mu.Unlock() e.mu.ndp.configs = c } // hasTentativeAddr returns true if addr is tentative on e. func (e *endpoint) hasTentativeAddr(addr tcpip.Address) bool { e.mu.RLock() addressEndpoint := e.getAddressRLocked(addr) e.mu.RUnlock() return addressEndpoint != nil && addressEndpoint.GetKind() == stack.PermanentTentative } // dupTentativeAddrDetected attempts to inform e that a tentative addr is a // duplicate on a link. // // dupTentativeAddrDetected removes the tentative address if it exists. If the // address was generated via SLAAC, an attempt is made to generate a new // address. func (e *endpoint) dupTentativeAddrDetected(addr tcpip.Address, holderLinkAddr tcpip.LinkAddress, nonce []byte) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() addressEndpoint := e.getAddressRLocked(addr) if addressEndpoint == nil { return &tcpip.ErrBadAddress{} } if addressEndpoint.GetKind() != stack.PermanentTentative { return &tcpip.ErrInvalidEndpointState{} } switch result := e.mu.ndp.dad.ExtendIfNonceEqualLocked(addr, nonce); result { case ip.Extended: // The nonce we got back was the same we sent so we know the message // indicating a duplicate address was likely ours so do not consider // the address duplicate here. return nil case ip.AlreadyExtended: // See Extended. // // Our DAD message was looped back already. return nil case ip.NoDADStateFound: panic(fmt.Sprintf("expected DAD state for tentative address %s", addr)) case ip.NonceDisabled: // If nonce is disabled then we have no way to know if the packet was // looped-back so we have to assume it indicates a duplicate address. fallthrough case ip.NonceNotEqual: // If the address is a SLAAC address, do not invalidate its SLAAC prefix as an // attempt will be made to generate a new address for it. 
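		// This is why allowSLAACInvalidation is false in the call below: the
		// prefix must remain valid so that regenerateSLAACAddr can derive a
		// replacement address from it.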
if err := e.removePermanentEndpointLocked(addressEndpoint, false /* allowSLAACInvalidation */, &stack.DADDupAddrDetected{HolderLinkAddress: holderLinkAddr}); err != nil { return err } prefix := addressEndpoint.Subnet() switch t := addressEndpoint.ConfigType(); t { case stack.AddressConfigStatic: case stack.AddressConfigSlaac: e.mu.ndp.regenerateSLAACAddr(prefix) case stack.AddressConfigSlaacTemp: // Do not reset the generation attempts counter for the prefix as the // temporary address is being regenerated in response to a DAD conflict. e.mu.ndp.regenerateTempSLAACAddr(prefix, false /* resetGenAttempts */) default: panic(fmt.Sprintf("unrecognized address config type = %d", t)) } return nil default: panic(fmt.Sprintf("unhandled result = %d", result)) } } // Forwarding implements stack.ForwardingNetworkEndpoint. func (e *endpoint) Forwarding() bool { return atomic.LoadUint32(&e.forwarding) == forwardingEnabled } // setForwarding sets the forwarding status for the endpoint. // // Returns true if the forwarding status was updated. func (e *endpoint) setForwarding(v bool) bool { forwarding := uint32(forwardingDisabled) if v { forwarding = forwardingEnabled } return atomic.SwapUint32(&e.forwarding, forwarding) != forwarding } // SetForwarding implements stack.ForwardingNetworkEndpoint. func (e *endpoint) SetForwarding(forwarding bool) { e.mu.Lock() defer e.mu.Unlock() if !e.setForwarding(forwarding) { return } allRoutersGroups := [...]tcpip.Address{ header.IPv6AllRoutersInterfaceLocalMulticastAddress, header.IPv6AllRoutersLinkLocalMulticastAddress, header.IPv6AllRoutersSiteLocalMulticastAddress, } if forwarding { // As per RFC 4291 section 2.8: // // A router is required to recognize all addresses that a host is // required to recognize, plus the following addresses as identifying // itself: // // o The All-Routers multicast addresses defined in Section 2.7.1. // // As per RFC 4291 section 2.7.1, // // All Routers Addresses: FF01:0:0:0:0:0:0:2 // FF02:0:0:0:0:0:0:2 // FF05:0:0:0:0:0:0:2 // // The above multicast addresses identify the group of all IPv6 routers, // within scope 1 (interface-local), 2 (link-local), or 5 (site-local). for _, g := range allRoutersGroups { if err := e.joinGroupLocked(g); err != nil { // joinGroupLocked only returns an error if the group address is not a // valid IPv6 multicast address. panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", g, err)) } } } else { for _, g := range allRoutersGroups { switch err := e.leaveGroupLocked(g).(type) { case nil: case *tcpip.ErrBadLocalAddress: // The endpoint may have already left the multicast group. default: panic(fmt.Sprintf("e.leaveGroupLocked(%s): %s", g, err)) } } } e.mu.ndp.forwardingChanged(forwarding) } // Enable implements stack.NetworkEndpoint. func (e *endpoint) Enable() tcpip.Error { e.mu.Lock() defer e.mu.Unlock() // If the NIC is not enabled, the endpoint can't do anything meaningful so // don't enable the endpoint. if !e.nic.Enabled() { return &tcpip.ErrNotPermitted{} } // If the endpoint is already enabled, there is nothing for it to do. if !e.setEnabled(true) { return nil } // Groups may have been joined when the endpoint was disabled, or the // endpoint may have left groups from the perspective of MLD when the // endpoint was disabled. Either way, we need to let routers know to // send us multicast traffic. e.mu.mld.initializeAll() // Join the IPv6 All-Nodes Multicast group if the stack is configured to // use IPv6. 
// This is required to ensure that this node properly receives and responds
// to the various NDP messages that are destined to the all-nodes multicast
// address. An example is the Neighbor Advertisement when we perform
// Duplicate Address Detection, or Router Advertisement when we do Router
// Discovery. See RFC 4862, section 5.4.2 and RFC 4861 section 4.2 for more
// information.
//
// Also auto-generate an IPv6 link-local address based on the endpoint's
// link address if it is configured to do so. Note, each interface is
// required to have an IPv6 link-local unicast address, as per RFC 4291
// section 2.1.

	// Join the All-Nodes multicast group before starting DAD as responses to DAD
	// (NDP NS) messages may be sent to the All-Nodes multicast group if the
	// source address of the NDP NS is the unspecified address, as per RFC 4861
	// section 7.2.4.
	if err := e.joinGroupLocked(header.IPv6AllNodesMulticastAddress); err != nil {
		// joinGroupLocked only returns an error if the group address is not a valid
		// IPv6 multicast address.
		panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", header.IPv6AllNodesMulticastAddress, err))
	}

	// Perform DAD on all the unicast IPv6 endpoints that are in the permanent
	// state.
	//
	// Addresses may have already completed DAD but in the time since the endpoint
	// was last enabled, other devices may have acquired the same addresses.
	var err tcpip.Error
	e.mu.addressableEndpointState.ForEachEndpoint(func(addressEndpoint stack.AddressEndpoint) bool {
		addr := addressEndpoint.AddressWithPrefix().Address
		if !header.IsV6UnicastAddress(addr) {
			return true
		}

		switch addressEndpoint.GetKind() {
		case stack.Permanent:
			addressEndpoint.SetKind(stack.PermanentTentative)
			fallthrough
		case stack.PermanentTentative:
			err = e.mu.ndp.startDuplicateAddressDetection(addr, addressEndpoint)
			return err == nil
		default:
			return true
		}
	})
	if err != nil {
		return err
	}

	// Do not auto-generate an IPv6 link-local address for loopback devices.
	if e.protocol.options.AutoGenLinkLocal && !e.nic.IsLoopback() {
		// The valid and preferred lifetime is infinite for the auto-generated
		// link-local address.
		e.mu.ndp.doSLAAC(header.IPv6LinkLocalPrefix.Subnet(), header.NDPInfiniteLifetime, header.NDPInfiniteLifetime)
	}

	e.mu.ndp.startSolicitingRouters()
	return nil
}

// Enabled implements stack.NetworkEndpoint.
func (e *endpoint) Enabled() bool {
	return e.nic.Enabled() && e.isEnabled()
}

// isEnabled returns true if the endpoint is enabled, regardless of the
// enabled status of the NIC.
func (e *endpoint) isEnabled() bool {
	return atomic.LoadUint32(&e.enabled) == 1
}

// setEnabled sets the enabled status for the endpoint.
//
// Returns true if the enabled status was updated.
func (e *endpoint) setEnabled(v bool) bool {
	if v {
		return atomic.SwapUint32(&e.enabled, 1) == 0
	}
	return atomic.SwapUint32(&e.enabled, 0) == 1
}

// Disable implements stack.NetworkEndpoint.
func (e *endpoint) Disable() {
	e.mu.Lock()
	defer e.mu.Unlock()
	e.disableLocked()
}

func (e *endpoint) disableLocked() {
	if !e.Enabled() {
		return
	}

	e.mu.ndp.stopSolicitingRouters()
	// Stop DAD for all the tentative unicast addresses.
	e.mu.addressableEndpointState.ForEachEndpoint(func(addressEndpoint stack.AddressEndpoint) bool {
		if addressEndpoint.GetKind() != stack.PermanentTentative {
			return true
		}

		addr := addressEndpoint.AddressWithPrefix().Address
		if header.IsV6UnicastAddress(addr) {
			e.mu.ndp.stopDuplicateAddressDetection(addr, &stack.DADAborted{})
		}
		return true
	})
	e.mu.ndp.cleanupState()

	// The endpoint may have already left the multicast group.
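	// Leaving a group we are no longer a member of yields
	// *tcpip.ErrBadLocalAddress, which the switch below deliberately treats
	// the same as success.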
switch err := e.leaveGroupLocked(header.IPv6AllNodesMulticastAddress).(type) { case nil, *tcpip.ErrBadLocalAddress: default: panic(fmt.Sprintf("unexpected error when leaving group = %s: %s", header.IPv6AllNodesMulticastAddress, err)) } // Leave groups from the perspective of MLD so that routers know that // we are no longer interested in the group. e.mu.mld.softLeaveAll() if !e.setEnabled(false) { panic("should have only done work to disable the endpoint if it was enabled") } } // DefaultTTL is the default hop limit for this endpoint. func (e *endpoint) DefaultTTL() uint8 { return e.protocol.DefaultTTL() } // MTU implements stack.NetworkEndpoint. It returns the link-layer MTU minus the // network layer max header length. func (e *endpoint) MTU() uint32 { networkMTU, err := calculateNetworkMTU(e.nic.MTU(), header.IPv6MinimumSize) if err != nil { return 0 } return networkMTU } // MaxHeaderLength returns the maximum length needed by ipv6 headers (and // underlying protocols). func (e *endpoint) MaxHeaderLength() uint16 { // TODO(gvisor.dev/issues/5035): The maximum header length returned here does // not open the possibility for the caller to know about size required for // extension headers. return e.nic.MaxHeaderLength() + header.IPv6MinimumSize } func addIPHeader(srcAddr, dstAddr tcpip.Address, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams, extensionHeaders header.IPv6ExtHdrSerializer) tcpip.Error { extHdrsLen := extensionHeaders.Length() length := pkt.Size() + extensionHeaders.Length() if length > math.MaxUint16 { return &tcpip.ErrMessageTooLong{} } header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize + extHdrsLen)).Encode(&header.IPv6Fields{ PayloadLength: uint16(length), TransportProtocol: params.Protocol, HopLimit: params.TTL, TrafficClass: params.TOS, SrcAddr: srcAddr, DstAddr: dstAddr, ExtensionHeaders: extensionHeaders, }) pkt.NetworkProtocolNumber = ProtocolNumber return nil } func packetMustBeFragmented(pkt *stack.PacketBuffer, networkMTU uint32) bool { payload := pkt.TransportHeader().View().Size() + pkt.Data().Size() return pkt.GSOOptions.Type == stack.GSONone && uint32(payload) > networkMTU } // handleFragments fragments pkt and calls the handler function on each // fragment. It returns the number of fragments handled and the number of // fragments left to be processed. The IP header must already be present in the // original packet. The transport header protocol number is required to avoid // parsing the IPv6 extension headers. func (e *endpoint) handleFragments(r *stack.Route, networkMTU uint32, pkt *stack.PacketBuffer, transProto tcpip.TransportProtocolNumber, handler func(*stack.PacketBuffer) tcpip.Error) (int, int, tcpip.Error) { networkHeader := header.IPv6(pkt.NetworkHeader().View()) // TODO(gvisor.dev/issue/3912): Once the Authentication or ESP Headers are // supported for outbound packets, their length should not affect the fragment // maximum payload length because they should only be transmitted once. fragmentPayloadLen := (networkMTU - header.IPv6FragmentHeaderSize) &^ 7 if fragmentPayloadLen < header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit { // We need at least 8 bytes of space left for the fragmentable part because // the fragment payload must obviously be non-zero and must be a multiple // of 8 as per RFC 8200 section 4.5: // Each complete fragment, except possibly the last ("rightmost") one, is // an integer multiple of 8 octets long. 
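	// Concretely (illustrative numbers): with a 1280-byte link MTU the IPv6
	// payload capacity (networkMTU) is 1280 - 40 = 1240 bytes, so
	// fragmentPayloadLen is (1240 - 8) &^ 7 = 1232 bytes per fragment; only a
	// networkMTU leaving fewer than 8 bytes for the fragmentable part reaches
	// the error below.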
return 0, 1, &tcpip.ErrMessageTooLong{} } if fragmentPayloadLen < uint32(pkt.TransportHeader().View().Size()) { // As per RFC 8200 Section 4.5, the Transport Header is expected to be small // enough to fit in the first fragment. return 0, 1, &tcpip.ErrMessageTooLong{} } pf := fragmentation.MakePacketFragmenter(pkt, fragmentPayloadLen, calculateFragmentReserve(pkt)) id := atomic.AddUint32(&e.protocol.ids[hashRoute(r, e.protocol.hashIV)%buckets], 1) var n int for { fragPkt, more := buildNextFragment(&pf, networkHeader, transProto, id) if err := handler(fragPkt); err != nil { return n, pf.RemainingFragmentCount() + 1, err } n++ if !more { return n, pf.RemainingFragmentCount(), nil } } } // WritePacket writes a packet to the given destination address and protocol. func (e *endpoint) WritePacket(r *stack.Route, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) tcpip.Error { if err := addIPHeader(r.LocalAddress(), r.RemoteAddress(), pkt, params, nil /* extensionHeaders */); err != nil { return err } // iptables filtering. All packets that reach here are locally // generated. outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) if ok := e.protocol.stack.IPTables().Check(stack.Output, pkt, r, "" /* preroutingAddr */, "" /* inNicName */, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesOutputDropped.Increment() return nil } // If the packet is manipulated as per NAT Output rules, handle packet // based on destination address and do not send the packet to link // layer. // // We should do this for every packet, rather than only NATted packets, but // removing this check short circuits broadcasts before they are sent out to // other hosts. if pkt.NatDone { netHeader := header.IPv6(pkt.NetworkHeader().View()) if ep := e.protocol.findEndpointWithAddress(netHeader.DestinationAddress()); ep != nil { // Since we rewrote the packet but it is being routed back to us, we // can safely assume the checksum is valid. ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */) return nil } } return e.writePacket(r, pkt, params.Protocol, false /* headerIncluded */) } func (e *endpoint) writePacket(r *stack.Route, pkt *stack.PacketBuffer, protocol tcpip.TransportProtocolNumber, headerIncluded bool) tcpip.Error { if r.Loop()&stack.PacketLoop != 0 { // If the packet was generated by the stack (not a raw/packet endpoint // where a packet may be written with the header included), then we can // safely assume the checksum is valid. e.handleLocalPacket(pkt, !headerIncluded /* canSkipRXChecksum */) } if r.Loop()&stack.PacketOut == 0 { return nil } // Postrouting NAT can only change the source address, and does not alter the // route or outgoing interface of the packet. outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) if ok := e.protocol.stack.IPTables().Check(stack.Postrouting, pkt, r, "" /* preroutingAddr */, "" /* inNicName */, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesPostroutingDropped.Increment() return nil } stats := e.stats.ip networkMTU, err := calculateNetworkMTU(e.nic.MTU(), uint32(pkt.NetworkHeader().View().Size())) if err != nil { stats.OutgoingPacketErrors.Increment() return err } if packetMustBeFragmented(pkt, networkMTU) { if pkt.NetworkPacketInfo.IsForwardedPacket { // As per RFC 2460, section 4.5: // Unlike IPv4, fragmentation in IPv6 is performed only by source nodes, // not by routers along a packet's delivery path. 
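	// Returning ErrMessageTooLong here lets the forwarding path translate the
	// failure into an ICMPv6 Packet Too Big message to the source (see
	// forwardPacket), rather than fragmenting on the source's behalf.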
return &tcpip.ErrMessageTooLong{} } sent, remain, err := e.handleFragments(r, networkMTU, pkt, protocol, func(fragPkt *stack.PacketBuffer) tcpip.Error { // TODO(gvisor.dev/issue/3884): Evaluate whether we want to send each // fragment one by one using WritePacket() (current strategy) or if we // want to create a PacketBufferList from the fragments and feed it to // WritePackets(). It'll be faster but cost more memory. return e.nic.WritePacket(r, ProtocolNumber, fragPkt) }) stats.PacketsSent.IncrementBy(uint64(sent)) stats.OutgoingPacketErrors.IncrementBy(uint64(remain)) return err } if err := e.nic.WritePacket(r, ProtocolNumber, pkt); err != nil { stats.OutgoingPacketErrors.Increment() return err } stats.PacketsSent.Increment() return nil } // WritePackets implements stack.NetworkEndpoint. func (e *endpoint) WritePackets(r *stack.Route, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, tcpip.Error) { if r.Loop()&stack.PacketLoop != 0 { panic("not implemented") } if r.Loop()&stack.PacketOut == 0 { return pkts.Len(), nil } stats := e.stats.ip linkMTU := e.nic.MTU() for pb := pkts.Front(); pb != nil; pb = pb.Next() { if err := addIPHeader(r.LocalAddress(), r.RemoteAddress(), pb, params, nil /* extensionHeaders */); err != nil { return 0, err } networkMTU, err := calculateNetworkMTU(linkMTU, uint32(pb.NetworkHeader().View().Size())) if err != nil { stats.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len())) return 0, err } if packetMustBeFragmented(pb, networkMTU) { // Keep track of the packet that is about to be fragmented so it can be // removed once the fragmentation is done. originalPkt := pb if _, _, err := e.handleFragments(r, networkMTU, pb, params.Protocol, func(fragPkt *stack.PacketBuffer) tcpip.Error { // Modify the packet list in place with the new fragments. pkts.InsertAfter(pb, fragPkt) pb = fragPkt return nil }); err != nil { stats.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len())) return 0, err } // Remove the packet that was just fragmented and process the rest. pkts.Remove(originalPkt) } } // iptables filtering. All packets that reach here are locally // generated. outNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) outputDropped, natPkts := e.protocol.stack.IPTables().CheckPackets(stack.Output, pkts, r, "" /* inNicName */, outNicName) stats.IPTablesOutputDropped.IncrementBy(uint64(len(outputDropped))) for pkt := range outputDropped { pkts.Remove(pkt) } // The NAT-ed packets may now be destined for us. locallyDelivered := 0 for pkt := range natPkts { ep := e.protocol.findEndpointWithAddress(header.IPv6(pkt.NetworkHeader().View()).DestinationAddress()) if ep == nil { // The NAT-ed packet is still destined for some remote node. continue } // Do not send the locally destined packet out the NIC. pkts.Remove(pkt) // Deliver the packet locally. ep.handleLocalPacket(pkt, true /* canSkipRXChecksum */) locallyDelivered++ } // We ignore the list of NAT-ed packets here because Postrouting NAT can only // change the source address, and does not alter the route or outgoing // interface of the packet. postroutingDropped, _ := e.protocol.stack.IPTables().CheckPackets(stack.Postrouting, pkts, r, "" /* inNicName */, outNicName) stats.IPTablesPostroutingDropped.IncrementBy(uint64(len(postroutingDropped))) for pkt := range postroutingDropped { pkts.Remove(pkt) } // The rest of the packets can be delivered to the NIC as a batch. 
pktsLen := pkts.Len() written, err := e.nic.WritePackets(r, pkts, ProtocolNumber) stats.PacketsSent.IncrementBy(uint64(written)) stats.OutgoingPacketErrors.IncrementBy(uint64(pktsLen - written)) // Dropped packets aren't errors, so include them in the return value. return locallyDelivered + written + len(outputDropped) + len(postroutingDropped), err } // WriteHeaderIncludedPacket implements stack.NetworkEndpoint. func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) tcpip.Error { // The packet already has an IP header, but there are a few required checks. h, ok := pkt.Data().PullUp(header.IPv6MinimumSize) if !ok { return &tcpip.ErrMalformedHeader{} } ipH := header.IPv6(h) // Always set the payload length. pktSize := pkt.Data().Size() ipH.SetPayloadLength(uint16(pktSize - header.IPv6MinimumSize)) // Set the source address when zero. if ipH.SourceAddress() == header.IPv6Any { ipH.SetSourceAddress(r.LocalAddress()) } // Populate the packet buffer's network header and don't allow an invalid // packet to be sent. // // Note that parsing only makes sure that the packet is well formed as per the // wire format. We also want to check if the header's fields are valid before // sending the packet. proto, _, _, _, ok := parse.IPv6(pkt) if !ok || !header.IPv6(pkt.NetworkHeader().View()).IsValid(pktSize) { return &tcpip.ErrMalformedHeader{} } return e.writePacket(r, pkt, proto, true /* headerIncluded */) } // forwardPacket attempts to forward a packet to its final destination. func (e *endpoint) forwardPacket(pkt *stack.PacketBuffer) ip.ForwardingError { h := header.IPv6(pkt.NetworkHeader().View()) dstAddr := h.DestinationAddress() // As per RFC 4291 section 2.5.6, // // Routers must not forward any packets with Link-Local source or // destination addresses to other links. if header.IsV6LinkLocalUnicastAddress(h.SourceAddress()) { return &ip.ErrLinkLocalSourceAddress{} } if header.IsV6LinkLocalUnicastAddress(dstAddr) || header.IsV6LinkLocalMulticastAddress(dstAddr) { return &ip.ErrLinkLocalDestinationAddress{} } hopLimit := h.HopLimit() if hopLimit <= 1 { // As per RFC 4443 section 3.3, // // If a router receives a packet with a Hop Limit of zero, or if a // router decrements a packet's Hop Limit to zero, it MUST discard the // packet and originate an ICMPv6 Time Exceeded message with Code 0 to // the source of the packet. This indicates either a routing loop or // too small an initial Hop Limit value. // // We return the original error rather than the result of returning // the ICMP packet because the original error is more relevant to // the caller. _ = e.protocol.returnError(&icmpReasonHopLimitExceeded{}, pkt) return &ip.ErrTTLExceeded{} } stk := e.protocol.stack // Check if the destination is owned by the stack. if ep := e.protocol.findEndpointWithAddress(dstAddr); ep != nil { inNicName := stk.FindNICNameFromID(e.nic.ID()) outNicName := stk.FindNICNameFromID(ep.nic.ID()) if ok := stk.IPTables().Check(stack.Forward, pkt, nil, "" /* preroutingAddr */, inNicName, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesForwardDropped.Increment() return nil } // The packet originally arrived on e so provide its NIC as the input NIC. ep.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */) return nil } // Check extension headers for any errors requiring action during forwarding. 
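	// (For example, a Hop-by-Hop option whose unknown-action bits demand an
	// ICMP Parameter Problem; processExtensionHeaders sends the ICMP error
	// itself, so here the failure only needs to be surfaced as
	// ErrParameterProblem.)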
if err := e.processExtensionHeaders(h, pkt, true /* forwarding */); err != nil { return &ip.ErrParameterProblem{} } r, err := stk.FindRoute(0, "", dstAddr, ProtocolNumber, false /* multicastLoop */) switch err.(type) { case nil: case *tcpip.ErrNoRoute, *tcpip.ErrNetworkUnreachable: // We return the original error rather than the result of returning the // ICMP packet because the original error is more relevant to the caller. _ = e.protocol.returnError(&icmpReasonNetUnreachable{}, pkt) return &ip.ErrNoRoute{} default: return &ip.ErrOther{Err: err} } defer r.Release() inNicName := stk.FindNICNameFromID(e.nic.ID()) outNicName := stk.FindNICNameFromID(r.NICID()) if ok := stk.IPTables().Check(stack.Forward, pkt, nil, "" /* preroutingAddr */, inNicName, outNicName); !ok { // iptables is telling us to drop the packet. e.stats.ip.IPTablesForwardDropped.Increment() return nil } // We need to do a deep copy of the IP packet because // WriteHeaderIncludedPacket takes ownership of the packet buffer, but we do // not own it. newHdr := header.IPv6(stack.PayloadSince(pkt.NetworkHeader())) // As per RFC 8200 section 3, // // Hop Limit 8-bit unsigned integer. Decremented by 1 by // each node that forwards the packet. newHdr.SetHopLimit(hopLimit - 1) switch err := r.WriteHeaderIncludedPacket(stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(r.MaxHeaderLength()), Data: buffer.View(newHdr).ToVectorisedView(), IsForwardedPacket: true, })); err.(type) { case nil: return nil case *tcpip.ErrMessageTooLong: // As per RFC 4443, section 3.2: // A Packet Too Big MUST be sent by a router in response to a packet that // it cannot forward because the packet is larger than the MTU of the // outgoing link. _ = e.protocol.returnError(&icmpReasonPacketTooBig{}, pkt) return &ip.ErrMessageTooLong{} default: return &ip.ErrOther{Err: err} } } // HandlePacket is called by the link layer when new ipv6 packets arrive for // this endpoint. func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { stats := e.stats.ip stats.PacketsReceived.Increment() if !e.isEnabled() { stats.DisabledPacketsReceived.Increment() return } h, ok := e.protocol.parseAndValidate(pkt) if !ok { stats.MalformedPacketsReceived.Increment() return } if !e.nic.IsLoopback() { if !e.protocol.options.AllowExternalLoopbackTraffic { if header.IsV6LoopbackAddress(h.SourceAddress()) { stats.InvalidSourceAddressesReceived.Increment() return } if header.IsV6LoopbackAddress(h.DestinationAddress()) { stats.InvalidDestinationAddressesReceived.Increment() return } } if e.protocol.stack.HandleLocal() { addressEndpoint := e.AcquireAssignedAddress(header.IPv6(pkt.NetworkHeader().View()).SourceAddress(), e.nic.Promiscuous(), stack.CanBePrimaryEndpoint) if addressEndpoint != nil { addressEndpoint.DecRef() // The source address is one of our own, so we never should have gotten // a packet like this unless HandleLocal is false or our NIC is the // loopback interface. stats.InvalidSourceAddressesReceived.Increment() return } } // Loopback traffic skips the prerouting chain. inNicName := e.protocol.stack.FindNICNameFromID(e.nic.ID()) if ok := e.protocol.stack.IPTables().Check(stack.Prerouting, pkt, nil, e.MainAddress().Address, inNicName, "" /* outNicName */); !ok { // iptables is telling us to drop the packet. 
stats.IPTablesPreroutingDropped.Increment() return } } e.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */) } // handleLocalPacket is like HandlePacket except it does not perform the // prerouting iptables hook or check for loopback traffic that originated from // outside of the netstack (i.e. martian loopback packets). func (e *endpoint) handleLocalPacket(pkt *stack.PacketBuffer, canSkipRXChecksum bool) { stats := e.stats.ip stats.PacketsReceived.Increment() pkt = pkt.CloneToInbound() pkt.RXTransportChecksumValidated = canSkipRXChecksum h, ok := e.protocol.parseAndValidate(pkt) if !ok { stats.MalformedPacketsReceived.Increment() return } e.handleValidatedPacket(h, pkt, e.nic.Name() /* inNICName */) } func (e *endpoint) handleValidatedPacket(h header.IPv6, pkt *stack.PacketBuffer, inNICName string) { // Raw socket packets are delivered based solely on the transport protocol // number. We only require that the packet be valid IPv6. e.dispatcher.DeliverRawPacket(h.TransportProtocol(), pkt) pkt.NICID = e.nic.ID() stats := e.stats.ip stats.ValidPacketsReceived.Increment() srcAddr := h.SourceAddress() dstAddr := h.DestinationAddress() // As per RFC 4291 section 2.7: // Multicast addresses must not be used as source addresses in IPv6 // packets or appear in any Routing header. if header.IsV6MulticastAddress(srcAddr) { stats.InvalidSourceAddressesReceived.Increment() return } // The destination address should be an address we own or a group we joined // for us to receive the packet. Otherwise, attempt to forward the packet. if addressEndpoint := e.AcquireAssignedAddress(dstAddr, e.nic.Promiscuous(), stack.CanBePrimaryEndpoint); addressEndpoint != nil { addressEndpoint.DecRef() } else if !e.IsInGroup(dstAddr) { if !e.Forwarding() { stats.InvalidDestinationAddressesReceived.Increment() return } switch err := e.forwardPacket(pkt); err.(type) { case nil: return case *ip.ErrLinkLocalSourceAddress: e.stats.ip.Forwarding.LinkLocalSource.Increment() case *ip.ErrLinkLocalDestinationAddress: e.stats.ip.Forwarding.LinkLocalDestination.Increment() case *ip.ErrTTLExceeded: e.stats.ip.Forwarding.ExhaustedTTL.Increment() case *ip.ErrNoRoute: e.stats.ip.Forwarding.Unrouteable.Increment() case *ip.ErrParameterProblem: e.stats.ip.Forwarding.ExtensionHeaderProblem.Increment() case *ip.ErrMessageTooLong: e.stats.ip.Forwarding.PacketTooBig.Increment() default: panic(fmt.Sprintf("unexpected error %s while trying to forward packet: %#v", err, pkt)) } e.stats.ip.Forwarding.Errors.Increment() return } // iptables filtering. All packets that reach here are intended for // this machine and need not be forwarded. if ok := e.protocol.stack.IPTables().Check(stack.Input, pkt, nil, "" /* preroutingAddr */, inNICName, "" /* outNicName */); !ok { // iptables is telling us to drop the packet. stats.IPTablesInputDropped.Increment() return } // Any returned error is only useful for terminating execution early, but // we have nothing left to do, so we can drop it. _ = e.processExtensionHeaders(h, pkt, false /* forwarding */) } // processExtensionHeaders processes the extension headers in the given packet. // Returns an error if the processing of a header failed or if the packet should // be discarded. func (e *endpoint) processExtensionHeaders(h header.IPv6, pkt *stack.PacketBuffer, forwarding bool) error { stats := e.stats.ip srcAddr := h.SourceAddress() dstAddr := h.DestinationAddress() // Create a VV to parse the packet. We don't plan to modify anything here. 
// vv consists of: // - Any IPv6 header bytes after the first 40 (i.e. extensions). // - The transport header, if present. // - Any other payload data. vv := pkt.NetworkHeader().View()[header.IPv6MinimumSize:].ToVectorisedView() vv.AppendView(pkt.TransportHeader().View()) vv.AppendViews(pkt.Data().Views()) it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), vv) var ( hasFragmentHeader bool routerAlert *header.IPv6RouterAlertOption ) for { // Keep track of the start of the previous header so we can report the // special case of a Hop by Hop at a location other than at the start. previousHeaderStart := it.HeaderOffset() extHdr, done, err := it.Next() if err != nil { stats.MalformedPacketsReceived.Increment() return err } if done { break } // As per RFC 8200, section 4: // // Extension headers (except for the Hop-by-Hop Options header) are // not processed, inserted, or deleted by any node along a packet's // delivery path until the packet reaches the node identified in the // Destination Address field of the IPv6 header. // // Furthermore, as per RFC 8200 section 4.1, the Hop By Hop extension // header is restricted to appear first in the list of extension headers. // // Therefore, we can immediately return once we hit any header other // than the Hop-by-Hop header while forwarding a packet. if forwarding { if _, ok := extHdr.(header.IPv6HopByHopOptionsExtHdr); !ok { return nil } } switch extHdr := extHdr.(type) { case header.IPv6HopByHopOptionsExtHdr: // As per RFC 8200 section 4.1, the Hop By Hop extension header is // restricted to appear immediately after an IPv6 fixed header. if previousHeaderStart != 0 { _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6UnknownHeader, pointer: previousHeaderStart, forwarding: forwarding, }, pkt) return fmt.Errorf("found Hop-by-Hop header = %#v with non-zero previous header offset = %d", extHdr, previousHeaderStart) } optsIt := extHdr.Iter() for { opt, done, err := optsIt.Next() if err != nil { stats.MalformedPacketsReceived.Increment() return err } if done { break } switch opt := opt.(type) { case *header.IPv6RouterAlertOption: if routerAlert != nil { // As per RFC 2711 section 3, there should be at most one Router // Alert option per packet. // // There MUST only be one option of this type, regardless of // value, per Hop-by-Hop header. stats.MalformedPacketsReceived.Increment() return fmt.Errorf("found multiple Router Alert options (%#v, %#v)", opt, routerAlert) } routerAlert = opt stats.OptionRouterAlertReceived.Increment() default: switch opt.UnknownAction() { case header.IPv6OptionUnknownActionSkip: case header.IPv6OptionUnknownActionDiscard: return fmt.Errorf("found unknown Hop-by-Hop header option = %#v with discard action", opt) case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest: if header.IsV6MulticastAddress(dstAddr) { return fmt.Errorf("found unknown hop-by-hop header option = %#v with discard action", opt) } fallthrough case header.IPv6OptionUnknownActionDiscardSendICMP: // This case satisfies a requirement of RFC 8200 section 4.2 which // states that an unknown option starting with bits [10] should: // // discard the packet and, regardless of whether or not the // packet's Destination Address was a multicast address, send an // ICMP Parameter Problem, Code 2, message to the packet's // Source Address, pointing to the unrecognized Option Type. 
_ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6UnknownOption, pointer: it.ParseOffset() + optsIt.OptionOffset(), respondToMulticast: true, forwarding: forwarding, }, pkt) return fmt.Errorf("found unknown hop-by-hop header option = %#v with discard action", opt) default: panic(fmt.Sprintf("unrecognized action for an unrecognized Hop By Hop extension header option = %#v", opt)) } } } case header.IPv6RoutingExtHdr: // As per RFC 8200 section 4.4, if a node encounters a routing header with // an unrecognized routing type value, with a non-zero Segments Left // value, the node must discard the packet and send an ICMP Parameter // Problem, Code 0 to the packet's Source Address, pointing to the // unrecognized Routing Type. // // If the Segments Left is 0, the node must ignore the Routing extension // header and process the next header in the packet. // // Note, the stack does not yet handle any type of routing extension // header, so we just make sure Segments Left is zero before processing // the next extension header. if extHdr.SegmentsLeft() != 0 { _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6ErroneousHeader, pointer: it.ParseOffset(), // For the sake of consistency, we're using the value of `forwarding` // here, even though it should always be false if we've reached this // point. If `forwarding` is true here, we're executing undefined // behavior no matter what. forwarding: forwarding, }, pkt) return fmt.Errorf("found unrecognized routing type with non-zero segments left in header = %#v", extHdr) } case header.IPv6FragmentExtHdr: hasFragmentHeader = true if extHdr.IsAtomic() { // This fragment extension header indicates that this packet is an // atomic fragment. An atomic fragment is a fragment that contains // all the data required to reassemble a full packet. As per RFC 6946, // atomic fragments must not interfere with "normal" fragmented traffic // so we skip processing the fragment instead of feeding it through the // reassembly process below. continue } fragmentFieldOffset := it.ParseOffset() // Don't consume the iterator if we have the first fragment because we // will use it to validate that the first fragment holds the upper layer // header. rawPayload := it.AsRawHeader(extHdr.FragmentOffset() != 0 /* consume */) if extHdr.FragmentOffset() == 0 { // Check that the iterator ends with a raw payload as the first fragment // should include all headers up to and including any upper layer // headers, as per RFC 8200 section 4.5; only upper layer data // (non-headers) should follow the fragment extension header. var lastHdr header.IPv6PayloadHeader for { it, done, err := it.Next() if err != nil { stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() return err } if done { break } lastHdr = it } // If the last header is a raw header, then the last portion of the IPv6 // payload is not a known IPv6 extension header. Note, this does not // mean that the last portion is an upper layer header or not an // extension header because: // 1) we do not yet support all extension headers // 2) we do not validate the upper layer header before reassembling. // // This check makes sure that a known IPv6 extension header is not // present after the Fragment extension header in a non-initial // fragment. // // TODO(#2196): Support IPv6 Authentication and Encapsulated // Security Payload extension headers. // TODO(#2333): Validate that the upper layer header is valid. 
switch lastHdr.(type) { case header.IPv6RawPayloadHeader: default: stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() return fmt.Errorf("known extension header = %#v present after fragment header in a non-initial fragment", lastHdr) } } fragmentPayloadLen := rawPayload.Buf.Size() if fragmentPayloadLen == 0 { // Drop the packet as it's marked as a fragment but has no payload. stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() return fmt.Errorf("fragment has no payload") } // As per RFC 2460 Section 4.5: // // If the length of a fragment, as derived from the fragment packet's // Payload Length field, is not a multiple of 8 octets and the M flag // of that fragment is 1, then that fragment must be discarded and an // ICMP Parameter Problem, Code 0, message should be sent to the source // of the fragment, pointing to the Payload Length field of the // fragment packet. if extHdr.More() && fragmentPayloadLen%header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit != 0 { stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6ErroneousHeader, pointer: header.IPv6PayloadLenOffset, }, pkt) return fmt.Errorf("found fragment length = %d that is not a multiple of 8 octets", fragmentPayloadLen) } // The packet is a fragment, let's try to reassemble it. start := extHdr.FragmentOffset() * header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit // As per RFC 2460 Section 4.5: // // If the length and offset of a fragment are such that the Payload // Length of the packet reassembled from that fragment would exceed // 65,535 octets, then that fragment must be discarded and an ICMP // Parameter Problem, Code 0, message should be sent to the source of // the fragment, pointing to the Fragment Offset field of the fragment // packet. lengthAfterReassembly := int(start) + fragmentPayloadLen if lengthAfterReassembly > header.IPv6MaximumPayloadSize { stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6ErroneousHeader, pointer: fragmentFieldOffset, }, pkt) return fmt.Errorf("determined that reassembled packet length = %d would exceed allowed length = %d", lengthAfterReassembly, header.IPv6MaximumPayloadSize) } // Note that pkt doesn't have its transport header set after reassembly, // and won't until DeliverNetworkPacket sets it. resPkt, proto, ready, err := e.protocol.fragmentation.Process( // IPv6 ignores the Protocol field since the ID only needs to be unique // across source-destination pairs, as per RFC 8200 section 4.5. fragmentation.FragmentID{ Source: srcAddr, Destination: dstAddr, ID: extHdr.ID(), }, start, start+uint16(fragmentPayloadLen)-1, extHdr.More(), uint8(rawPayload.Identifier), pkt, ) if err != nil { stats.MalformedPacketsReceived.Increment() stats.MalformedFragmentsReceived.Increment() return err } if ready { pkt = resPkt // We create a new iterator with the reassembled packet because we could // have more extension headers in the reassembled payload, as per RFC // 8200 section 4.5. We also use the NextHeader value from the first // fragment. 
data := pkt.Data() dataVV := buffer.NewVectorisedView(data.Size(), data.Views()) it = header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(proto), dataVV) } case header.IPv6DestinationOptionsExtHdr: optsIt := extHdr.Iter() for { opt, done, err := optsIt.Next() if err != nil { stats.MalformedPacketsReceived.Increment() return err } if done { break } // We currently do not support any IPv6 Destination extension header // options. switch opt.UnknownAction() { case header.IPv6OptionUnknownActionSkip: case header.IPv6OptionUnknownActionDiscard: return fmt.Errorf("found unknown destination header option = %#v with discard action", opt) case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest: if header.IsV6MulticastAddress(dstAddr) { return fmt.Errorf("found unknown destination header option %#v with discard action", opt) } fallthrough case header.IPv6OptionUnknownActionDiscardSendICMP: // This case satisfies a requirement of RFC 8200 section 4.2 // which states that an unknown option starting with bits [10] should: // // discard the packet and, regardless of whether or not the // packet's Destination Address was a multicast address, send an // ICMP Parameter Problem, Code 2, message to the packet's // Source Address, pointing to the unrecognized Option Type. // _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6UnknownOption, pointer: it.ParseOffset() + optsIt.OptionOffset(), respondToMulticast: true, }, pkt) return fmt.Errorf("found unknown destination header option %#v with discard action", opt) default: panic(fmt.Sprintf("unrecognized action for an unrecognized Destination extension header option = %#v", opt)) } } case header.IPv6RawPayloadHeader: // If the last header in the payload isn't a known IPv6 extension header, // handle it as if it is transport layer data. // Calculate the number of octets parsed from data. We want to remove all // the data except the unparsed portion located at the end, whose size // is extHdr.Buf.Size(). trim := pkt.Data().Size() - extHdr.Buf.Size() // For unfragmented packets, extHdr still contains the transport header. // Get rid of it. // // For reassembled fragments, pkt.TransportHeader is unset, so this is a // no-op and pkt.Data begins with the transport header. trim += pkt.TransportHeader().View().Size() pkt.Data().DeleteFront(trim) stats.PacketsDelivered.Increment() if p := tcpip.TransportProtocolNumber(extHdr.Identifier); p == header.ICMPv6ProtocolNumber { pkt.TransportProtocolNumber = p e.handleICMP(pkt, hasFragmentHeader, routerAlert) } else { switch res := e.dispatcher.DeliverTransportPacket(p, pkt); res { case stack.TransportPacketHandled: case stack.TransportPacketDestinationPortUnreachable: // As per RFC 4443 section 3.1: // A destination node SHOULD originate a Destination Unreachable // message with Code 4 in response to a packet for which the // transport protocol (e.g., UDP) has no listener, if that transport // protocol has no alternative means to inform the sender. _ = e.protocol.returnError(&icmpReasonPortUnreachable{}, pkt) return fmt.Errorf("destination port unreachable") case stack.TransportPacketProtocolUnreachable: // As per RFC 8200 section 4 (page 7): // Extension headers are numbered from IANA IP Protocol Numbers // [IANA-PN], the same values used for IPv4 and IPv6.
When // processing a sequence of Next Header values in a packet, the // first one that is not an extension header [IANA-EH] indicates // that the next item in the packet is the corresponding upper-layer // header. // With more related information on page 8: // If, as a result of processing a header, the destination node is // required to proceed to the next header but the Next Header value // in the current header is unrecognized by the node, it should // discard the packet and send an ICMP Parameter Problem message to // the source of the packet, with an ICMP Code value of 1 // ("unrecognized Next Header type encountered") and the ICMP // Pointer field containing the offset of the unrecognized value // within the original packet. // // Taken together, these indicate that an unknown protocol should // be treated as an unrecognized next header value. // The location of the Next Header field is in a different place in // the initial IPv6 header than it is in the extension headers, so // treat it specially. prevHdrIDOffset := uint32(header.IPv6NextHeaderOffset) if previousHeaderStart != 0 { prevHdrIDOffset = previousHeaderStart } _ = e.protocol.returnError(&icmpReasonParameterProblem{ code: header.ICMPv6UnknownHeader, pointer: prevHdrIDOffset, }, pkt) return fmt.Errorf("transport protocol unreachable") default: panic(fmt.Sprintf("unrecognized result from DeliverTransportPacket = %d", res)) } } default: // Since the iterator returns IPv6RawPayloadHeader for unknown Extension // Header IDs this should never happen unless we missed a supported type // here. panic(fmt.Sprintf("unrecognized type from it.Next() = %T", extHdr)) } } return nil } // Close cleans up resources associated with the endpoint. func (e *endpoint) Close() { e.mu.Lock() e.disableLocked() e.mu.addressableEndpointState.Cleanup() e.mu.Unlock() e.protocol.forgetEndpoint(e.nic.ID()) } // NetworkProtocolNumber implements stack.NetworkEndpoint. func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { return e.protocol.Number() } // AddAndAcquirePermanentAddress implements stack.AddressableEndpoint. func (e *endpoint) AddAndAcquirePermanentAddress(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, tcpip.Error) { // TODO(b/169350103): add checks here after making sure we no longer receive // an empty address. e.mu.Lock() defer e.mu.Unlock() return e.addAndAcquirePermanentAddressLocked(addr, peb, configType, deprecated) } // addAndAcquirePermanentAddressLocked is like AddAndAcquirePermanentAddress but // with locking requirements. // // addAndAcquirePermanentAddressLocked also joins the passed address's // solicited-node multicast group and starts duplicate address detection. // // Precondition: e.mu must be write locked.
func (e *endpoint) addAndAcquirePermanentAddressLocked(addr tcpip.AddressWithPrefix, peb stack.PrimaryEndpointBehavior, configType stack.AddressConfigType, deprecated bool) (stack.AddressEndpoint, tcpip.Error) { addressEndpoint, err := e.mu.addressableEndpointState.AddAndAcquirePermanentAddress(addr, peb, configType, deprecated) if err != nil { return nil, err } if !header.IsV6UnicastAddress(addr.Address) { return addressEndpoint, nil } addressEndpoint.SetKind(stack.PermanentTentative) if e.Enabled() { if err := e.mu.ndp.startDuplicateAddressDetection(addr.Address, addressEndpoint); err != nil { return nil, err } } snmc := header.SolicitedNodeAddr(addr.Address) if err := e.joinGroupLocked(snmc); err != nil { // joinGroupLocked only returns an error if the group address is not a valid // IPv6 multicast address. panic(fmt.Sprintf("e.joinGroupLocked(%s): %s", snmc, err)) } return addressEndpoint, nil } // RemovePermanentAddress implements stack.AddressableEndpoint. func (e *endpoint) RemovePermanentAddress(addr tcpip.Address) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() addressEndpoint := e.getAddressRLocked(addr) if addressEndpoint == nil || !addressEndpoint.GetKind().IsPermanent() { return &tcpip.ErrBadLocalAddress{} } return e.removePermanentEndpointLocked(addressEndpoint, true /* allowSLAACInvalidation */, &stack.DADAborted{}) } // removePermanentEndpointLocked is like removePermanentAddressLocked except // it works with a stack.AddressEndpoint. // // Precondition: e.mu must be write locked. func (e *endpoint) removePermanentEndpointLocked(addressEndpoint stack.AddressEndpoint, allowSLAACInvalidation bool, dadResult stack.DADResult) tcpip.Error { addr := addressEndpoint.AddressWithPrefix() // If we are removing an address generated via SLAAC, clean up // its SLAAC resources and notify the integrator. switch addressEndpoint.ConfigType() { case stack.AddressConfigSlaac: e.mu.ndp.cleanupSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation) case stack.AddressConfigSlaacTemp: e.mu.ndp.cleanupTempSLAACAddrResourcesAndNotify(addr) } return e.removePermanentEndpointInnerLocked(addressEndpoint, dadResult) } // removePermanentEndpointInnerLocked is like removePermanentEndpointLocked // except it does not clean up SLAAC address state. // // Precondition: e.mu must be write locked. func (e *endpoint) removePermanentEndpointInnerLocked(addressEndpoint stack.AddressEndpoint, dadResult stack.DADResult) tcpip.Error { addr := addressEndpoint.AddressWithPrefix() e.mu.ndp.stopDuplicateAddressDetection(addr.Address, dadResult) if err := e.mu.addressableEndpointState.RemovePermanentEndpoint(addressEndpoint); err != nil { return err } snmc := header.SolicitedNodeAddr(addr.Address) err := e.leaveGroupLocked(snmc) // The endpoint may have already left the multicast group. if _, ok := err.(*tcpip.ErrBadLocalAddress); ok { err = nil } return err } // hasPermanentAddressRLocked returns true if the endpoint has a permanent // address equal to the passed address. // // Precondition: e.mu must be read or write locked. func (e *endpoint) hasPermanentAddressRLocked(addr tcpip.Address) bool { addressEndpoint := e.getAddressRLocked(addr) if addressEndpoint == nil { return false } return addressEndpoint.GetKind().IsPermanent() } // getAddressRLocked returns the endpoint for the passed address. // // Precondition: e.mu must be read or write locked.
func (e *endpoint) getAddressRLocked(localAddr tcpip.Address) stack.AddressEndpoint { return e.mu.addressableEndpointState.GetAddress(localAddr) } // MainAddress implements stack.AddressableEndpoint. func (e *endpoint) MainAddress() tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.MainAddress() } // AcquireAssignedAddress implements stack.AddressableEndpoint. func (e *endpoint) AcquireAssignedAddress(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint { e.mu.RLock() defer e.mu.RUnlock() return e.acquireAddressOrCreateTempLocked(localAddr, allowTemp, tempPEB) } // acquireAddressOrCreateTempLocked is like AcquireAssignedAddress but with // locking requirements. // // Precondition: e.mu must be write locked. func (e *endpoint) acquireAddressOrCreateTempLocked(localAddr tcpip.Address, allowTemp bool, tempPEB stack.PrimaryEndpointBehavior) stack.AddressEndpoint { return e.mu.addressableEndpointState.AcquireAssignedAddress(localAddr, allowTemp, tempPEB) } // AcquireOutgoingPrimaryAddress implements stack.AddressableEndpoint. func (e *endpoint) AcquireOutgoingPrimaryAddress(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint { e.mu.RLock() defer e.mu.RUnlock() return e.acquireOutgoingPrimaryAddressRLocked(remoteAddr, allowExpired) } // getLinkLocalAddressRLocked returns a link-local address from the primary list // of addresses, if one is available. // // See stack.PrimaryEndpointBehavior for more details about the primary list. // // Precondition: e.mu must be read locked. func (e *endpoint) getLinkLocalAddressRLocked() tcpip.Address { var linkLocalAddr tcpip.Address e.mu.addressableEndpointState.ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { if addressEndpoint.IsAssigned(false /* allowExpired */) { if addr := addressEndpoint.AddressWithPrefix().Address; header.IsV6LinkLocalUnicastAddress(addr) { linkLocalAddr = addr return false } } return true }) return linkLocalAddr } // acquireOutgoingPrimaryAddressRLocked is like AcquireOutgoingPrimaryAddress // but with locking requirements. // // Precondition: e.mu must be read locked. func (e *endpoint) acquireOutgoingPrimaryAddressRLocked(remoteAddr tcpip.Address, allowExpired bool) stack.AddressEndpoint { // addrCandidate is a candidate for Source Address Selection, as per // RFC 6724 section 5. type addrCandidate struct { addressEndpoint stack.AddressEndpoint addr tcpip.Address scope header.IPv6AddressScope label uint8 matchingPrefix uint8 } if len(remoteAddr) == 0 { return e.mu.addressableEndpointState.AcquireOutgoingPrimaryAddress(remoteAddr, allowExpired) } // Create a candidate set of available addresses we can potentially use as a // source address. var cs []addrCandidate e.mu.addressableEndpointState.ForEachPrimaryEndpoint(func(addressEndpoint stack.AddressEndpoint) bool { // If r is not valid for outgoing connections, it is not a valid endpoint. if !addressEndpoint.IsAssigned(allowExpired) { return true } addr := addressEndpoint.AddressWithPrefix().Address scope, err := header.ScopeForIPv6Address(addr) if err != nil { // Should never happen as we got r from the primary IPv6 endpoint list and // ScopeForIPv6Address only returns an error if addr is not an IPv6 // address. 
panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", addr, err)) } cs = append(cs, addrCandidate{ addressEndpoint: addressEndpoint, addr: addr, scope: scope, label: getLabel(addr), matchingPrefix: remoteAddr.MatchingPrefix(addr), }) return true }) remoteScope, err := header.ScopeForIPv6Address(remoteAddr) if err != nil { // primaryIPv6Endpoint should never be called with an invalid IPv6 address. panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", remoteAddr, err)) } remoteLabel := getLabel(remoteAddr) // Sort the addresses as per RFC 6724 section 5 rules 1-3. // // TODO(b/146021396): Implement rules 4, 5 of RFC 6724 section 5. sort.Slice(cs, func(i, j int) bool { sa := cs[i] sb := cs[j] // Prefer same address as per RFC 6724 section 5 rule 1. if sa.addr == remoteAddr { return true } if sb.addr == remoteAddr { return false } // Prefer appropriate scope as per RFC 6724 section 5 rule 2. if sa.scope < sb.scope { return sa.scope >= remoteScope } else if sb.scope < sa.scope { return sb.scope < remoteScope } // Avoid deprecated addresses as per RFC 6724 section 5 rule 3. if saDep, sbDep := sa.addressEndpoint.Deprecated(), sb.addressEndpoint.Deprecated(); saDep != sbDep { // If sa is not deprecated, it is preferred over sb. return sbDep } // Prefer matching label as per RFC 6724 section 5 rule 6. if sa, sb := sa.label == remoteLabel, sb.label == remoteLabel; sa != sb { if sa { return true } if sb { return false } } // Prefer temporary addresses as per RFC 6724 section 5 rule 7. if saTemp, sbTemp := sa.addressEndpoint.ConfigType() == stack.AddressConfigSlaacTemp, sb.addressEndpoint.ConfigType() == stack.AddressConfigSlaacTemp; saTemp != sbTemp { return saTemp } // Use longest matching prefix as per RFC 6724 section 5 rule 8. if sa.matchingPrefix > sb.matchingPrefix { return true } if sb.matchingPrefix > sa.matchingPrefix { return false } // sa and sb are equal, return the endpoint that is closest to the front of // the primary endpoint list. return i < j }) // Return the most preferred address that can have its reference count // incremented. for _, c := range cs { if c.addressEndpoint.IncRef() { return c.addressEndpoint } } return nil } // PrimaryAddresses implements stack.AddressableEndpoint. func (e *endpoint) PrimaryAddresses() []tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.PrimaryAddresses() } // PermanentAddresses implements stack.AddressableEndpoint. func (e *endpoint) PermanentAddresses() []tcpip.AddressWithPrefix { e.mu.RLock() defer e.mu.RUnlock() return e.mu.addressableEndpointState.PermanentAddresses() } // JoinGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) JoinGroup(addr tcpip.Address) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() return e.joinGroupLocked(addr) } // joinGroupLocked is like JoinGroup but with locking requirements. // // Precondition: e.mu must be locked. func (e *endpoint) joinGroupLocked(addr tcpip.Address) tcpip.Error { if !header.IsV6MulticastAddress(addr) { return &tcpip.ErrBadAddress{} } e.mu.mld.joinGroup(addr) return nil } // LeaveGroup implements stack.GroupAddressableEndpoint. func (e *endpoint) LeaveGroup(addr tcpip.Address) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() return e.leaveGroupLocked(addr) } // leaveGroupLocked is like LeaveGroup but with locking requirements. // // Precondition: e.mu must be locked. func (e *endpoint) leaveGroupLocked(addr tcpip.Address) tcpip.Error { return e.mu.mld.leaveGroup(addr) } // IsInGroup implements stack.GroupAddressableEndpoint. 
func (e *endpoint) IsInGroup(addr tcpip.Address) bool { e.mu.RLock() defer e.mu.RUnlock() return e.mu.mld.isInGroup(addr) } // Stats implements stack.NetworkEndpoint. func (e *endpoint) Stats() stack.NetworkEndpointStats { return &e.stats.localStats } var _ stack.NetworkProtocol = (*protocol)(nil) var _ fragmentation.TimeoutHandler = (*protocol)(nil) type protocol struct { stack *stack.Stack options Options mu struct { sync.RWMutex // eps is keyed by NICID to allow protocol methods to retrieve an endpoint // when handling a packet, by looking at which NIC handled the packet. eps map[tcpip.NICID]*endpoint } ids []uint32 hashIV uint32 // defaultTTL is the current default TTL for the protocol. Only the // uint8 portion of it is meaningful. // // Must be accessed using atomic operations. defaultTTL uint32 fragmentation *fragmentation.Fragmentation } // Number returns the ipv6 protocol number. func (p *protocol) Number() tcpip.NetworkProtocolNumber { return ProtocolNumber } // MinimumPacketSize returns the minimum valid ipv6 packet size. func (p *protocol) MinimumPacketSize() int { return header.IPv6MinimumSize } // DefaultPrefixLen returns the IPv6 default prefix length. func (p *protocol) DefaultPrefixLen() int { return header.IPv6AddressSize * 8 } // ParseAddresses implements stack.NetworkProtocol. func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { h := header.IPv6(v) return h.SourceAddress(), h.DestinationAddress() } // NewEndpoint creates a new ipv6 endpoint. func (p *protocol) NewEndpoint(nic stack.NetworkInterface, dispatcher stack.TransportDispatcher) stack.NetworkEndpoint { e := &endpoint{ nic: nic, dispatcher: dispatcher, protocol: p, } // NDP options must be 8 octet aligned and the first 2 bytes are used for // the type and length fields leaving 6 octets as the minimum size for a // nonce option without padding. const nonceSize = 6 // As per RFC 7527 section 4.1, // // If any probe is looped back within RetransTimer milliseconds after // having sent DupAddrDetectTransmits NS(DAD) messages, the interface // continues with another MAX_MULTICAST_SOLICIT number of NS(DAD) // messages transmitted RetransTimer milliseconds apart. // // Value taken from RFC 4861 section 10. const maxMulticastSolicit = 3 dadOptions := ip.DADOptions{ Clock: p.stack.Clock(), SecureRNG: p.stack.SecureRNG(), NonceSize: nonceSize, ExtendDADTransmits: maxMulticastSolicit, Protocol: &e.mu.ndp, NICID: nic.ID(), } e.mu.Lock() e.mu.addressableEndpointState.Init(e) e.mu.ndp.init(e, dadOptions) e.mu.mld.init(e) e.dad.mu.Lock() e.dad.mu.dad.Init(&e.dad.mu, p.options.DADConfigs, dadOptions) e.dad.mu.Unlock() e.mu.Unlock() stackStats := p.stack.Stats() tcpip.InitStatCounters(reflect.ValueOf(&e.stats.localStats).Elem()) e.stats.ip.Init(&e.stats.localStats.IP, &stackStats.IP) e.stats.icmp.init(&e.stats.localStats.ICMP, &stackStats.ICMP.V6) p.mu.Lock() defer p.mu.Unlock() p.mu.eps[nic.ID()] = e return e } func (p *protocol) findEndpointWithAddress(addr tcpip.Address) *endpoint { p.mu.RLock() defer p.mu.RUnlock() for _, e := range p.mu.eps { if addressEndpoint := e.AcquireAssignedAddress(addr, false /* allowTemp */, stack.NeverPrimaryEndpoint); addressEndpoint != nil { addressEndpoint.DecRef() return e } } return nil } func (p *protocol) forgetEndpoint(nicID tcpip.NICID) { p.mu.Lock() defer p.mu.Unlock() delete(p.mu.eps, nicID) } // SetOption implements stack.NetworkProtocol. 
func (p *protocol) SetOption(option tcpip.SettableNetworkProtocolOption) tcpip.Error { switch v := option.(type) { case *tcpip.DefaultTTLOption: p.SetDefaultTTL(uint8(*v)) return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // Option implements stack.NetworkProtocol. func (p *protocol) Option(option tcpip.GettableNetworkProtocolOption) tcpip.Error { switch v := option.(type) { case *tcpip.DefaultTTLOption: *v = tcpip.DefaultTTLOption(p.DefaultTTL()) return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // SetDefaultTTL sets the default TTL for endpoints created with this protocol. func (p *protocol) SetDefaultTTL(ttl uint8) { atomic.StoreUint32(&p.defaultTTL, uint32(ttl)) } // DefaultTTL returns the default TTL for endpoints created with this protocol. func (p *protocol) DefaultTTL() uint8 { return uint8(atomic.LoadUint32(&p.defaultTTL)) } // Close implements stack.NetworkProtocol. func (*protocol) Close() {} // Wait implements stack.NetworkProtocol. func (*protocol) Wait() {} // parseAndValidate parses the packet (including its transport layer header) and // returns the parsed IP header. // // Returns true if the IP header was successfully parsed. func (p *protocol) parseAndValidate(pkt *stack.PacketBuffer) (header.IPv6, bool) { transProtoNum, hasTransportHdr, ok := p.Parse(pkt) if !ok { return nil, false } h := header.IPv6(pkt.NetworkHeader().View()) // Do not include the link header's size when calculating the size of the IP // packet. if !h.IsValid(pkt.Size() - pkt.LinkHeader().View().Size()) { return nil, false } if hasTransportHdr { switch err := p.stack.ParsePacketBufferTransport(transProtoNum, pkt); err { case stack.ParsedOK: case stack.UnknownTransportProtocol, stack.TransportLayerParseError: // The transport layer will handle unknown protocols and transport layer // parsing errors. default: panic(fmt.Sprintf("unexpected error parsing transport header = %d", err)) } } return h, true } // Parse implements stack.NetworkProtocol. func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { proto, _, fragOffset, fragMore, ok := parse.IPv6(pkt) if !ok { return 0, false, false } return proto, !fragMore && fragOffset == 0, true } // calculateNetworkMTU calculates the network-layer payload MTU based on the // link-layer payload MTU and the length of every IPv6 header. // Note that this is different from the Payload Length field of the IPv6 header, // which includes the length of the extension headers. func calculateNetworkMTU(linkMTU, networkHeadersLen uint32) (uint32, tcpip.Error) { if linkMTU < header.IPv6MinimumMTU { return 0, &tcpip.ErrInvalidEndpointState{} } // As per RFC 7112 section 5, we should discard packets if their IPv6 header // is bigger than 1280 bytes (i.e., the minimum link MTU) since we do not // support PMTU discovery: // Hosts that do not discover the Path MTU MUST limit the IPv6 Header Chain // length to 1280 bytes. Limiting the IPv6 Header Chain length to 1280 // bytes ensures that the header chain length does not exceed the IPv6 // minimum MTU. if networkHeadersLen > header.IPv6MinimumMTU { return 0, &tcpip.ErrMalformedHeader{} } networkMTU := linkMTU - networkHeadersLen if networkMTU > maxPayloadSize { networkMTU = maxPayloadSize } return networkMTU, nil } // Options holds options to configure a new protocol. type Options struct { // NDPConfigs holds the default NDP configurations used by interfaces.
NDPConfigs NDPConfigurations // AutoGenLinkLocal determines whether or not the stack attempts to // auto-generate a link-local address for newly enabled non-loopback // NICs. // // Note, setting this to true does not mean that a link-local address is // assigned right away, or at all. If Duplicate Address Detection is enabled, // an address is only assigned if it successfully resolves. If it fails, no // further attempts are made to auto-generate a link-local address. // // The generated link-local address follows RFC 4291 Appendix A guidelines. AutoGenLinkLocal bool // NDPDisp is the NDP event dispatcher that an integrator can provide to // receive NDP related events. NDPDisp NDPDispatcher // OpaqueIIDOpts hold the options for generating opaque interface // identifiers (IIDs) as outlined by RFC 7217. OpaqueIIDOpts OpaqueInterfaceIdentifierOptions // TempIIDSeed is used to seed the initial temporary interface identifier // history value used to generate IIDs for temporary SLAAC addresses. // // Temporary SLAAC addresses are short-lived addresses which are unpredictable // and random from the perspective of other nodes on the network. It is // recommended that the seed be a random byte buffer of at least // header.IIDSize bytes to make sure that temporary SLAAC addresses are // sufficiently random. It should follow minimum randomness requirements for // security as outlined by RFC 4086. // // Note: using a nil value, the same seed across netstack program runs, or a // seed that is too small would reduce randomness and increase predictability, // defeating the purpose of temporary SLAAC addresses. TempIIDSeed []byte // MLD holds options for MLD. MLD MLDOptions // DADConfigs holds the default DAD configurations used by IPv6 endpoints. DADConfigs stack.DADConfigurations // AllowExternalLoopbackTraffic indicates that inbound loopback packets (i.e. // martian loopback packets) should be accepted. AllowExternalLoopbackTraffic bool } // NewProtocolWithOptions returns an IPv6 network protocol. func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory { opts.NDPConfigs.validate() ids := hash.RandN32(buckets) hashIV := hash.RandN32(1)[0] return func(s *stack.Stack) stack.NetworkProtocol { p := &protocol{ stack: s, options: opts, ids: ids, hashIV: hashIV, } p.fragmentation = fragmentation.NewFragmentation(header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, ReassembleTimeout, s.Clock(), p) p.mu.eps = make(map[tcpip.NICID]*endpoint) p.SetDefaultTTL(DefaultTTL) return p } } // NewProtocol is equivalent to NewProtocolWithOptions with an empty Options. func NewProtocol(s *stack.Stack) stack.NetworkProtocol { return NewProtocolWithOptions(Options{})(s) } func calculateFragmentReserve(pkt *stack.PacketBuffer) int { return pkt.AvailableHeaderBytes() + pkt.NetworkHeader().View().Size() + header.IPv6FragmentHeaderSize } // hashRoute calculates a hash value for the given route. It uses the source & // destination address and 32-bit number to generate the hash. func hashRoute(r *stack.Route, hashIV uint32) uint32 { // The FNV-1a was chosen because it is a fast hashing algorithm, and // cryptographic properties are not needed here. 
h := fnv.New32a() if _, err := h.Write([]byte(r.LocalAddress())); err != nil { panic(fmt.Sprintf("Hash.Write: %s, but Hash's implementation of Write is not expected to ever return an error", err)) } if _, err := h.Write([]byte(r.RemoteAddress())); err != nil { panic(fmt.Sprintf("Hash.Write: %s, but Hash's implementation of Write is not expected to ever return an error", err)) } s := make([]byte, 4) binary.LittleEndian.PutUint32(s, hashIV) if _, err := h.Write(s); err != nil { panic(fmt.Sprintf("Hash.Write: %s, but Hash's implementation of Write is not expected to ever return an error", err)) } return h.Sum32() } func buildNextFragment(pf *fragmentation.PacketFragmenter, originalIPHeaders header.IPv6, transportProto tcpip.TransportProtocolNumber, id uint32) (*stack.PacketBuffer, bool) { fragPkt, offset, copied, more := pf.BuildNextFragment() fragPkt.NetworkProtocolNumber = ProtocolNumber originalIPHeadersLength := len(originalIPHeaders) s := header.IPv6ExtHdrSerializer{&header.IPv6SerializableFragmentExtHdr{ FragmentOffset: uint16(offset / header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit), M: more, Identification: id, }} fragmentIPHeadersLength := originalIPHeadersLength + s.Length() fragmentIPHeaders := header.IPv6(fragPkt.NetworkHeader().Push(fragmentIPHeadersLength)) // Copy the IPv6 header and any extension headers already populated. if copied := copy(fragmentIPHeaders, originalIPHeaders); copied != originalIPHeadersLength { panic(fmt.Sprintf("wrong number of bytes copied into fragmentIPHeaders: got %d, want %d", copied, originalIPHeadersLength)) } nextHeader, _ := s.Serialize(transportProto, fragmentIPHeaders[originalIPHeadersLength:]) fragmentIPHeaders.SetNextHeader(nextHeader) fragmentIPHeaders.SetPayloadLength(uint16(copied + fragmentIPHeadersLength - header.IPv6MinimumSize)) return fragPkt, more }
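// Example: a minimal sketch of registering this protocol with a stack.
// NewProtocolWithOptions above returns a stack.NetworkProtocolFactory, which
// is handed to stack.New; the option values here are illustrative assumptions,
// not recommended settings.
//
//	s := stack.New(stack.Options{
//		NetworkProtocols: []stack.NetworkProtocolFactory{
//			ipv6.NewProtocolWithOptions(ipv6.Options{
//				// Attempt link-local address generation on newly
//				// enabled non-loopback NICs.
//				AutoGenLinkLocal: true,
//			}),
//		},
//	})
//	_ = s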
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"math"
	"time"
)

const (
	// ClockTick is the length of time represented by a single clock tick, as
	// used by times(2) and /proc/[pid]/stat.
	ClockTick = time.Second / CLOCKS_PER_SEC

	// CLOCKS_PER_SEC is the number of ClockTicks per second.
	//
	// Linux defines this to be 100 on most architectures, irrespective of
	// CONFIG_HZ. Userspace obtains the value through sysconf(_SC_CLK_TCK),
	// which uses the AT_CLKTCK entry in the auxiliary vector if one is
	// provided, and assumes 100 otherwise (glibc:
	// sysdeps/posix/sysconf.c:__sysconf() =>
	// sysdeps/unix/sysv/linux/getclktck.c, elf/dl-support.c:_dl_aux_init()).
	//
	// Not to be confused with POSIX CLOCKS_PER_SEC, as used by clock(3); "XSI
	// requires that [POSIX] CLOCKS_PER_SEC equals 1000000 independent of the
	// actual resolution" - clock(3).
	CLOCKS_PER_SEC = 100
)

// CPU clock types for use with clock_gettime(2) et al.
//
// The 29 most significant bits of a 32 bit clock ID are either a PID or a FD.
//
// Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3.
//
// Bit 2 indicates whether a cpu clock refers to a thread or a process.
const (
	CPUCLOCK_PROF  = 0
	CPUCLOCK_VIRT  = 1
	CPUCLOCK_SCHED = 2
	CPUCLOCK_MAX   = 3
	CLOCKFD        = CPUCLOCK_MAX

	CPUCLOCK_CLOCK_MASK     = 3
	CPUCLOCK_PERTHREAD_MASK = 4
)

// Clock identifiers for use with clock_gettime(2), clock_getres(2),
// clock_nanosleep(2).
const (
	CLOCK_REALTIME           = 0
	CLOCK_MONOTONIC          = 1
	CLOCK_PROCESS_CPUTIME_ID = 2
	CLOCK_THREAD_CPUTIME_ID  = 3
	CLOCK_MONOTONIC_RAW      = 4
	CLOCK_REALTIME_COARSE    = 5
	CLOCK_MONOTONIC_COARSE   = 6
	CLOCK_BOOTTIME           = 7
	CLOCK_REALTIME_ALARM     = 8
	CLOCK_BOOTTIME_ALARM     = 9
)

// Flags for clock_nanosleep(2).
const (
	TIMER_ABSTIME = 1
)

// Flags for timerfd syscalls (timerfd_create(2), timerfd_settime(2)).
const (
	// TFD_CLOEXEC is a timerfd_create flag.
	TFD_CLOEXEC = O_CLOEXEC

	// TFD_NONBLOCK is a timerfd_create flag.
	TFD_NONBLOCK = O_NONBLOCK

	// TFD_TIMER_ABSTIME is a timerfd_settime flag.
	TFD_TIMER_ABSTIME = 1
)

// maxSecInDuration is the maximum number of seconds that can be converted to
// nanoseconds without overflowing int64.
const maxSecInDuration = math.MaxInt64 / int64(time.Second)

// TimeT represents time_t in <time.h>. It represents time in seconds.
//
// +marshal
type TimeT int64

// NsecToTimeT translates nanoseconds to TimeT (seconds).
func NsecToTimeT(nsec int64) TimeT {
	return TimeT(nsec / 1e9)
}

// Timespec represents struct timespec in <time.h>.
//
// +marshal slice:TimespecSlice
type Timespec struct {
	Sec  int64
	Nsec int64
}

// Unix returns the seconds and nanoseconds.
func (ts Timespec) Unix() (sec int64, nsec int64) {
	return int64(ts.Sec), int64(ts.Nsec)
}

// ToTime returns the Go time.Time representation.
func (ts Timespec) ToTime() time.Time {
	return time.Unix(ts.Sec, ts.Nsec)
}

// ToNsec returns the nanosecond representation.
func (ts Timespec) ToNsec() int64 {
	return int64(ts.Sec)*1e9 + int64(ts.Nsec)
}

// ToNsecCapped returns the safe nanosecond representation.
func (ts Timespec) ToNsecCapped() int64 {
	if ts.Sec > maxSecInDuration {
		return math.MaxInt64
	}
	return ts.ToNsec()
}

// ToDuration returns the safe nanosecond representation as time.Duration.
func (ts Timespec) ToDuration() time.Duration {
	return time.Duration(ts.ToNsecCapped())
}

// Valid returns whether the timespec contains valid values.
func (ts Timespec) Valid() bool {
	return !(ts.Sec < 0 || ts.Nsec < 0 || ts.Nsec >= int64(time.Second))
}

// NsecToTimespec translates nanoseconds to Timespec.
func NsecToTimespec(nsec int64) (ts Timespec) {
	ts.Sec = nsec / 1e9
	ts.Nsec = nsec % 1e9
	return
}

// DurationToTimespec translates time.Duration to Timespec.
func DurationToTimespec(dur time.Duration) Timespec {
	return NsecToTimespec(dur.Nanoseconds())
}

// SizeOfTimeval is the size of a Timeval struct in bytes.
const SizeOfTimeval = 16

// Timeval represents struct timeval in <time.h>.
//
// +marshal slice:TimevalSlice
type Timeval struct {
	Sec  int64
	Usec int64
}

// ToNsecCapped returns the safe nanosecond representation.
func (tv Timeval) ToNsecCapped() int64 {
	if tv.Sec > maxSecInDuration {
		return math.MaxInt64
	}
	return int64(tv.Sec)*1e9 + int64(tv.Usec)*1e3
}

// ToDuration returns the safe nanosecond representation as a time.Duration.
func (tv Timeval) ToDuration() time.Duration {
	return time.Duration(tv.ToNsecCapped())
}

// ToTime returns the Go time.Time representation.
func (tv Timeval) ToTime() time.Time {
	return time.Unix(tv.Sec, tv.Usec*1e3)
}

// NsecToTimeval translates nanoseconds to Timeval.
func NsecToTimeval(nsec int64) (tv Timeval) {
	nsec += 999 // round up to microsecond
	tv.Sec = nsec / 1e9
	tv.Usec = nsec % 1e9 / 1e3
	return
}

// DurationToTimeval translates time.Duration to Timeval.
func DurationToTimeval(dur time.Duration) Timeval {
	return NsecToTimeval(dur.Nanoseconds())
}

// Itimerspec represents struct itimerspec in <time.h>.
//
// +marshal
type Itimerspec struct {
	Interval Timespec
	Value    Timespec
}

// ItimerVal mimics the following struct in <sys/time.h>
//	struct itimerval {
//		struct timeval it_interval; /* next value */
//		struct timeval it_value;    /* current value */
//	};
//
// +marshal
type ItimerVal struct {
	Interval Timeval
	Value    Timeval
}

// ClockT represents type clock_t.
//
// +marshal
type ClockT int64

// ClockTFromDuration converts time.Duration to clock_t.
func ClockTFromDuration(d time.Duration) ClockT {
	return ClockT(d / ClockTick)
}

// Tms represents struct tms, used by times(2).
//
// +marshal
type Tms struct {
	UTime  ClockT
	STime  ClockT
	CUTime ClockT
	CSTime ClockT
}

// TimerID represents type timer_t, which identifies a POSIX per-process
// interval timer.
//
// +marshal
type TimerID int32

// StatxTimestamp represents struct statx_timestamp.
//
// +marshal
type StatxTimestamp struct {
	Sec  int64
	Nsec uint32
	_    int32
}

// ToNsec returns the nanosecond representation.
func (sxts StatxTimestamp) ToNsec() int64 {
	return int64(sxts.Sec)*1e9 + int64(sxts.Nsec)
}

// ToNsecCapped returns the safe nanosecond representation.
func (sxts StatxTimestamp) ToNsecCapped() int64 {
	if sxts.Sec > maxSecInDuration {
		return math.MaxInt64
	}
	return sxts.ToNsec()
}

// NsecToStatxTimestamp translates nanoseconds to StatxTimestamp.
func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) {
	return StatxTimestamp{
		Sec:  nsec / 1e9,
		Nsec: uint32(nsec % 1e9),
	}
}

// Utime represents struct utimbuf used by utime(2).
//
// +marshal
type Utime struct {
	Actime  int64
	Modtime int64
}
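// A short worked example of the conversions above (a sketch; the results
// follow directly from the definitions of NsecToTimespec, NsecToTimeval, and
// ClockTFromDuration):
//
//	linux.NsecToTimespec(1500000001)      // Timespec{Sec: 1, Nsec: 500000001}
//	linux.NsecToTimeval(1)                // Timeval{Sec: 0, Usec: 1}: 1ns rounds up to 1us
//	linux.NsecToTimeval(1e9)              // Timeval{Sec: 1, Usec: 0}: no rounding needed
//	linux.ClockTFromDuration(time.Second) // 100 ticks, since CLOCKS_PER_SEC = 100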
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package pool provides a trivial integer pool.
package pool

import (
	"gvisor.dev/gvisor/pkg/sync"
)

// Pool is a simple allocator.
type Pool struct {
	mu sync.Mutex

	// cache is the set of returned values.
	cache []uint64

	// Start is the starting value (if needed).
	Start uint64

	// max is the current maximum issued.
	max uint64

	// Limit is the upper limit.
	Limit uint64
}

// Get gets a value from the pool.
func (p *Pool) Get() (uint64, bool) {
	p.mu.Lock()
	defer p.mu.Unlock()

	// Anything cached?
	if len(p.cache) > 0 {
		v := p.cache[len(p.cache)-1]
		p.cache = p.cache[:len(p.cache)-1]
		return v, true
	}

	// Over the limit?
	if p.Start == p.Limit {
		return 0, false
	}

	// Generate a new value.
	v := p.Start
	p.Start++
	return v, true
}

// Put returns a value to the pool.
func (p *Pool) Put(v uint64) {
	p.mu.Lock()
	p.cache = append(p.cache, v)
	p.mu.Unlock()
}
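// Example usage (a minimal sketch): values are issued sequentially from Start
// until Limit is reached, and Put recycles them through the cache. The bounds
// below are illustrative.
//
//	p := pool.Pool{Start: 1, Limit: 3}
//	a, _ := p.Get()  // a == 1
//	b, _ := p.Get()  // b == 2
//	_, ok := p.Get() // ok == false: Start has reached Limit
//	p.Put(a)         // return a to the cache
//	c, _ := p.Get()  // c == 1 again, served from the cache
//	_, _ = b, c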
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs2

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/signalfd"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
)

// sharedSignalfd is shared between the two calls.
func sharedSignalfd(t *kernel.Task, fd int32, sigset hostarch.Addr, sigsetsize uint, flags int32) (uintptr, *kernel.SyscallControl, error) {
	// Copy in the signal mask.
	mask, err := slinux.CopyInSigSet(t, sigset, sigsetsize)
	if err != nil {
		return 0, nil, err
	}

	// Always check for valid flags, even if not creating.
	if flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	// Is this a change to an existing signalfd?
	//
	// The spec indicates that this should adjust the mask.
	if fd != -1 {
		file := t.GetFileVFS2(fd)
		if file == nil {
			return 0, nil, linuxerr.EBADF
		}
		defer file.DecRef(t)

		// Is this a signalfd?
		if sfd, ok := file.Impl().(*signalfd.SignalFileDescription); ok {
			sfd.SetMask(mask)
			return 0, nil, nil
		}

		// Not a signalfd.
		return 0, nil, linuxerr.EINVAL
	}

	fileFlags := uint32(linux.O_RDWR)
	if flags&linux.SFD_NONBLOCK != 0 {
		fileFlags |= linux.O_NONBLOCK
	}

	// Create a new file.
	vfsObj := t.Kernel().VFS()
	file, err := signalfd.New(vfsObj, t, mask, fileFlags)
	if err != nil {
		return 0, nil, err
	}
	defer file.DecRef(t)

	// Create a new descriptor.
	fd, err = t.NewFDFromVFS2(0, file, kernel.FDFlags{
		CloseOnExec: flags&linux.SFD_CLOEXEC != 0,
	})
	if err != nil {
		return 0, nil, err
	}

	// Done.
	return uintptr(fd), nil, nil
}

// Signalfd implements the linux syscall signalfd(2).
func Signalfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	sigset := args[1].Pointer()
	sigsetsize := args[2].SizeT()
	return sharedSignalfd(t, fd, sigset, sigsetsize, 0)
}

// Signalfd4 implements the linux syscall signalfd4(2).
func Signalfd4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	sigset := args[1].Pointer()
	sigsetsize := args[2].SizeT()
	flags := args[3].Int()
	return sharedSignalfd(t, fd, sigset, sigsetsize, flags)
}
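// The mask test in sharedSignalfd is the whole of the flag validation. A
// standalone sketch of the same check (the helper name is hypothetical, not
// part of this package):
//
//	func signalfdFlagsValid(flags int32) bool {
//		// Reject any bit outside the two supported flags.
//		return flags&^(linux.SFD_NONBLOCK|linux.SFD_CLOEXEC) == 0
//	}
//
//	signalfdFlagsValid(linux.SFD_CLOEXEC|linux.SFD_NONBLOCK) // true
//	signalfdFlagsValid(linux.SFD_CLOEXEC | 0x1)              // false: unknown bit set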
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package bpf import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" ) // Possible values for Error.Code. const ( // DivisionByZero indicates that a program contains, or executed, a // division or modulo by zero. DivisionByZero = iota // InvalidEndOfProgram indicates that the last instruction of a program is // not a return. InvalidEndOfProgram // InvalidInstructionCount indicates that a program has zero instructions // or more than MaxInstructions instructions. InvalidInstructionCount // InvalidJumpTarget indicates that a program contains a jump whose target // is outside of the program's bounds. InvalidJumpTarget // InvalidLoad indicates that a program executed an invalid load of input // data. InvalidLoad // InvalidOpcode indicates that a program contains an instruction with an // invalid opcode. InvalidOpcode // InvalidRegister indicates that a program contains a load from, or store // to, a non-existent M register (index >= ScratchMemRegisters). InvalidRegister ) // Error is an error encountered while compiling or executing a BPF program. type Error struct { // Code indicates the kind of error that occurred. Code int // PC is the program counter (index into the list of instructions) at which // the error occurred.
PC int } func (e Error) codeString() string { switch e.Code { case DivisionByZero: return "division by zero" case InvalidEndOfProgram: return "last instruction must be a return" case InvalidInstructionCount: return "invalid number of instructions" case InvalidJumpTarget: return "jump target out of bounds" case InvalidLoad: return "load out of bounds or violates input alignment requirements" case InvalidOpcode: return "invalid instruction opcode" case InvalidRegister: return "invalid M register" default: return "unknown error" } } // Error implements error.Error. func (e Error) Error() string { return fmt.Sprintf("at l%d: %s", e.PC, e.codeString()) } // Program is a BPF program that has been validated for consistency. // // +stateify savable type Program struct { instructions []linux.BPFInstruction } // Length returns the number of instructions in the program. func (p Program) Length() int { return len(p.instructions) } // Compile performs validation on a sequence of BPF instructions before // wrapping them in a Program. func Compile(insns []linux.BPFInstruction) (Program, error) { if len(insns) == 0 || len(insns) > MaxInstructions { return Program{}, Error{InvalidInstructionCount, len(insns)} } // The last instruction must be a return. if last := insns[len(insns)-1]; last.OpCode != (Ret|K) && last.OpCode != (Ret|A) { return Program{}, Error{InvalidEndOfProgram, len(insns) - 1} } // Validate each instruction. Note that we skip a validation Linux does: // Linux additionally verifies that every load from an M register is // preceded, in every path, by a store to the same M register, in order to // avoid having to clear M between programs // (net/core/filter.c:check_load_and_stores). We always start with a zeroed // M array. for pc, i := range insns { if i.OpCode&unusedBitsMask != 0 { return Program{}, Error{InvalidOpcode, pc} } switch i.OpCode & instructionClassMask { case Ld: mode := i.OpCode & loadModeMask switch i.OpCode & loadSizeMask { case W: if mode != Imm && mode != Abs && mode != Ind && mode != Mem && mode != Len { return Program{}, Error{InvalidOpcode, pc} } if mode == Mem && i.K >= ScratchMemRegisters { return Program{}, Error{InvalidRegister, pc} } case H, B: if mode != Abs && mode != Ind { return Program{}, Error{InvalidOpcode, pc} } default: return Program{}, Error{InvalidOpcode, pc} } case Ldx: mode := i.OpCode & loadModeMask switch i.OpCode & loadSizeMask { case W: if mode != Imm && mode != Mem && mode != Len { return Program{}, Error{InvalidOpcode, pc} } if mode == Mem && i.K >= ScratchMemRegisters { return Program{}, Error{InvalidRegister, pc} } case B: if mode != Msh { return Program{}, Error{InvalidOpcode, pc} } default: return Program{}, Error{InvalidOpcode, pc} } case St, Stx: if i.OpCode&storeUnusedBitsMask != 0 { return Program{}, Error{InvalidOpcode, pc} } if i.K >= ScratchMemRegisters { return Program{}, Error{InvalidRegister, pc} } case Alu: switch i.OpCode & aluMask { case Add, Sub, Mul, Or, And, Lsh, Rsh, Xor: break case Div, Mod: if src := i.OpCode & srcAluJmpMask; src == K && i.K == 0 { return Program{}, Error{DivisionByZero, pc} } case Neg: // Negation doesn't take a source operand. if i.OpCode&srcAluJmpMask != 0 { return Program{}, Error{InvalidOpcode, pc} } default: return Program{}, Error{InvalidOpcode, pc} } case Jmp: switch i.OpCode & jmpMask { case Ja: // Unconditional jump doesn't take a source operand. 
if i.OpCode&srcAluJmpMask != 0 { return Program{}, Error{InvalidOpcode, pc} } // Do the comparison in 64 bits to avoid the possibility of // overflow from a very large i.K. if uint64(pc)+uint64(i.K)+1 >= uint64(len(insns)) { return Program{}, Error{InvalidJumpTarget, pc} } case Jeq, Jgt, Jge, Jset: // jt and jf are uint16s, so there's no threat of overflow. if pc+int(i.JumpIfTrue)+1 >= len(insns) { return Program{}, Error{InvalidJumpTarget, pc} } if pc+int(i.JumpIfFalse)+1 >= len(insns) { return Program{}, Error{InvalidJumpTarget, pc} } default: return Program{}, Error{InvalidOpcode, pc} } case Ret: if i.OpCode&retUnusedBitsMask != 0 { return Program{}, Error{InvalidOpcode, pc} } if src := i.OpCode & srcRetMask; src != K && src != A { return Program{}, Error{InvalidOpcode, pc} } case Misc: if misc := i.OpCode & miscMask; misc != Tax && misc != Txa { return Program{}, Error{InvalidOpcode, pc} } } } return Program{insns}, nil } // Input represents a source of input data for a BPF program. (BPF // documentation sometimes refers to the input data as the "packet" due to its // origins as a packet processing DSL.) // // For all of Input's Load methods: // // - The second (bool) return value is true if the load succeeded and false // otherwise. // // - Inputs should not assume that the loaded range falls within the input // data's length. Inputs should return false if the load falls outside of the // input data. // // - Inputs should not assume that the offset is correctly aligned. Inputs may // choose to service or reject loads to unaligned addresses. type Input interface { // Load32 reads 32 bits from the input starting at the given byte offset. Load32(off uint32) (uint32, bool) // Load16 reads 16 bits from the input starting at the given byte offset. Load16(off uint32) (uint16, bool) // Load8 reads 8 bits from the input starting at the given byte offset. Load8(off uint32) (uint8, bool) // Length returns the length of the input in bytes. Length() uint32 } // machine represents the state of a BPF virtual machine. type machine struct { A uint32 X uint32 M [ScratchMemRegisters]uint32 } func conditionalJumpOffset(insn linux.BPFInstruction, cond bool) int { if cond { return int(insn.JumpIfTrue) } return int(insn.JumpIfFalse) } // Exec executes a BPF program over the given input and returns its return // value. 
func Exec(p Program, in Input) (uint32, error) { var m machine var pc int for ; pc < len(p.instructions); pc++ { i := p.instructions[pc] switch i.OpCode { case Ld | Imm | W: m.A = i.K case Ld | Abs | W: val, ok := in.Load32(i.K) if !ok { return 0, Error{InvalidLoad, pc} } m.A = val case Ld | Abs | H: val, ok := in.Load16(i.K) if !ok { return 0, Error{InvalidLoad, pc} } m.A = uint32(val) case Ld | Abs | B: val, ok := in.Load8(i.K) if !ok { return 0, Error{InvalidLoad, pc} } m.A = uint32(val) case Ld | Ind | W: val, ok := in.Load32(m.X + i.K) if !ok { return 0, Error{InvalidLoad, pc} } m.A = val case Ld | Ind | H: val, ok := in.Load16(m.X + i.K) if !ok { return 0, Error{InvalidLoad, pc} } m.A = uint32(val) case Ld | Ind | B: val, ok := in.Load8(m.X + i.K) if !ok { return 0, Error{InvalidLoad, pc} } m.A = uint32(val) case Ld | Mem | W: m.A = m.M[int(i.K)] case Ld | Len | W: m.A = in.Length() case Ldx | Imm | W: m.X = i.K case Ldx | Mem | W: m.X = m.M[int(i.K)] case Ldx | Len | W: m.X = in.Length() case Ldx | Msh | B: val, ok := in.Load8(i.K) if !ok { return 0, Error{InvalidLoad, pc} } m.X = 4 * uint32(val&0xf) case St: m.M[int(i.K)] = m.A case Stx: m.M[int(i.K)] = m.X case Alu | Add | K: m.A += i.K case Alu | Add | X: m.A += m.X case Alu | Sub | K: m.A -= i.K case Alu | Sub | X: m.A -= m.X case Alu | Mul | K: m.A *= i.K case Alu | Mul | X: m.A *= m.X case Alu | Div | K: // K != 0 already checked by Compile. m.A /= i.K case Alu | Div | X: if m.X == 0 { return 0, Error{DivisionByZero, pc} } m.A /= m.X case Alu | Or | K: m.A |= i.K case Alu | Or | X: m.A |= m.X case Alu | And | K: m.A &= i.K case Alu | And | X: m.A &= m.X case Alu | Lsh | K: m.A <<= i.K case Alu | Lsh | X: m.A <<= m.X case Alu | Rsh | K: m.A >>= i.K case Alu | Rsh | X: m.A >>= m.X case Alu | Neg: m.A = uint32(-int32(m.A)) case Alu | Mod | K: // K != 0 already checked by Compile. m.A %= i.K case Alu | Mod | X: if m.X == 0 { return 0, Error{DivisionByZero, pc} } m.A %= m.X case Alu | Xor | K: m.A ^= i.K case Alu | Xor | X: m.A ^= m.X case Jmp | Ja: pc += int(i.K) case Jmp | Jeq | K: pc += conditionalJumpOffset(i, m.A == i.K) case Jmp | Jeq | X: pc += conditionalJumpOffset(i, m.A == m.X) case Jmp | Jgt | K: pc += conditionalJumpOffset(i, m.A > i.K) case Jmp | Jgt | X: pc += conditionalJumpOffset(i, m.A > m.X) case Jmp | Jge | K: pc += conditionalJumpOffset(i, m.A >= i.K) case Jmp | Jge | X: pc += conditionalJumpOffset(i, m.A >= m.X) case Jmp | Jset | K: pc += conditionalJumpOffset(i, (m.A&i.K) != 0) case Jmp | Jset | X: pc += conditionalJumpOffset(i, (m.A&m.X) != 0) case Ret | K: return i.K, nil case Ret | A: return m.A, nil case Misc | Tax: m.A = m.X case Misc | Txa: m.X = m.A default: return 0, Error{InvalidOpcode, pc} } } return 0, Error{InvalidEndOfProgram, pc} }
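// Illustrative sketch (not part of the package): driving Compile and Exec end
// to end. The bytesInput type and its big-endian load convention are
// assumptions made for this example; classic BPF conventionally loads
// multi-byte packet fields in network byte order. Assumes this package is
// importable as gvisor.dev/gvisor/pkg/bpf.
package main

import (
	"encoding/binary"
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/bpf"
)

// bytesInput implements bpf.Input over a plain byte slice.
type bytesInput []byte

func (b bytesInput) Load32(off uint32) (uint32, bool) {
	if uint64(off)+4 > uint64(len(b)) {
		return 0, false
	}
	return binary.BigEndian.Uint32(b[off:]), true
}

func (b bytesInput) Load16(off uint32) (uint16, bool) {
	if uint64(off)+2 > uint64(len(b)) {
		return 0, false
	}
	return binary.BigEndian.Uint16(b[off:]), true
}

func (b bytesInput) Load8(off uint32) (uint8, bool) {
	if uint64(off) >= uint64(len(b)) {
		return 0, false
	}
	return b[off], true
}

func (b bytesInput) Length() uint32 { return uint32(len(b)) }

func main() {
	// "Return the first 32-bit word of the input": Ld|Abs|W, then Ret|A.
	// The last instruction must be a return, or Compile rejects the program.
	prog, err := bpf.Compile([]linux.BPFInstruction{
		{OpCode: bpf.Ld | bpf.Abs | bpf.W, K: 0},
		{OpCode: bpf.Ret | bpf.A},
	})
	if err != nil {
		panic(err)
	}
	ret, err := bpf.Exec(prog, bytesInput{0xde, 0xad, 0xbe, 0xef})
	fmt.Printf("ret=%#x err=%v\n", ret, err) // ret=0xdeadbeef err=<nil>
}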
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/log" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) // Timekeeper manages all of the kernel clocks. // // +stateify savable type Timekeeper struct { // clocks are the clock sources. // // These are not saved directly, as the new machine's clock may behave // differently. // // It is set only once, by SetClocks. clocks sentrytime.Clocks `state:"nosave"` // realtimeClock is a ktime.Clock based on timekeeper's Realtime. realtimeClock *timekeeperClock // monotonicClock is a ktime.Clock based on timekeeper's Monotonic. monotonicClock *timekeeperClock // bootTime is the realtime when the system "booted". i.e., when // SetClocks was called in the initial (not restored) run. bootTime ktime.Time // monotonicOffset is the offset to apply to the monotonic clock output // from clocks. // // It is set only once, by SetClocks. monotonicOffset int64 `state:"nosave"` // monotonicLowerBound is the lowerBound for monotonic time. monotonicLowerBound int64 `state:"nosave"` // restored, if non-nil, indicates that this Timekeeper was restored // from a state file. The clocks are not set until restored is closed. restored chan struct{} `state:"nosave"` // saveMonotonic is the (offset) value of the monotonic clock at the // time of save. // // It is only valid if restored is non-nil.
// // It is only used in SetClocks after restore to compute the new // monotonicOffset. saveMonotonic int64 // saveRealtime is the value of the realtime clock at the time of save. // // It is only valid if restored is non-nil. // // It is only used in SetClocks after restore to compute the new // monotonicOffset. saveRealtime int64 // params manages the parameter page. params *VDSOParamPage // mu protects destruction with stop and wg. mu sync.Mutex `state:"nosave"` // stop is used to tell the update goroutine to exit. stop chan struct{} `state:"nosave"` // wg is used to indicate that the update goroutine has exited. wg sync.WaitGroup `state:"nosave"` } // NewTimekeeper returns a Timekeeper that is automatically kept up-to-date. // NewTimekeeper does not take ownership of paramPage. // // SetClocks must be called on the returned Timekeeper before it is usable. func NewTimekeeper(mfp pgalloc.MemoryFileProvider, paramPage memmap.FileRange) *Timekeeper { t := Timekeeper{ params: NewVDSOParamPage(mfp, paramPage), } t.realtimeClock = &timekeeperClock{tk: &t, c: sentrytime.Realtime} t.monotonicClock = &timekeeperClock{tk: &t, c: sentrytime.Monotonic} return &t } // SetClocks sets the backing clock source. // // SetClocks must be called before the Timekeeper is used, and it may not be // called more than once, as changing the clock source without extra correction // could cause time discontinuities. // // It must also be called after Load. func (t *Timekeeper) SetClocks(c sentrytime.Clocks) { // Update the params, marking them "not ready", as we may need to // restart calibration on this new machine. if t.restored != nil { if err := t.params.Write(func() vdsoParams { return vdsoParams{} }); err != nil { panic("unable to reset VDSO params: " + err.Error()) } } if t.clocks != nil { panic("SetClocks called on previously-initialized Timekeeper") } t.clocks = c // Compute the offset of the monotonic clock from the base Clocks. // // In a fresh (not restored) sentry, monotonic time starts at zero. // // In a restored sentry, monotonic time jumps forward by approximately // the same amount as real time. There are no guarantees here, we are // just making a best-effort attempt to make it appear that the app // was simply not scheduled for a long period, rather than that the // real time clock was changed. // // If real time went backwards, the monotonic time remains the same. wantMonotonic := int64(0) nowMonotonic, err := t.clocks.GetTime(sentrytime.Monotonic) if err != nil { panic("Unable to get current monotonic time: " + err.Error()) } nowRealtime, err := t.clocks.GetTime(sentrytime.Realtime) if err != nil { panic("Unable to get current realtime: " + err.Error()) } if t.restored != nil { wantMonotonic = t.saveMonotonic elapsed := nowRealtime - t.saveRealtime if elapsed > 0 { wantMonotonic += elapsed } } t.monotonicOffset = wantMonotonic - nowMonotonic if t.restored == nil { // Hold on to the initial "boot" time. t.bootTime = ktime.FromNanoseconds(nowRealtime) } t.mu.Lock() defer t.mu.Unlock() t.startUpdater() if t.restored != nil { close(t.restored) } } var _ tcpip.Clock = (*Timekeeper)(nil) // Now implements tcpip.Clock. func (t *Timekeeper) Now() time.Time { nsec, err := t.GetTime(sentrytime.Realtime) if err != nil { panic("timekeeper.GetTime(sentrytime.Realtime): " + err.Error()) } return time.Unix(0, nsec) } // NowMonotonic implements tcpip.Clock.
func (t *Timekeeper) NowMonotonic() tcpip.MonotonicTime { nsec, err := t.GetTime(sentrytime.Monotonic) if err != nil { panic("timekeeper.GetTime(sentrytime.Monotonic): " + err.Error()) } var mt tcpip.MonotonicTime return mt.Add(time.Duration(nsec) * time.Nanosecond) } // AfterFunc implements tcpip.Clock. func (t *Timekeeper) AfterFunc(d time.Duration, f func()) tcpip.Timer { return ktime.TcpipAfterFunc(t.realtimeClock, d, f) } // startUpdater starts an update goroutine that keeps the clocks updated. // // mu must be held. func (t *Timekeeper) startUpdater() { if t.stop != nil { // Timekeeper already started return } t.stop = make(chan struct{}) // Keep the clocks up to date. // // Note that the Go runtime uses host CLOCK_MONOTONIC to service the // timer, so it may run at a *slightly* different rate from the // application CLOCK_MONOTONIC. That is fine, as we only need to update // at approximately this rate. timer := time.NewTicker(sentrytime.ApproxUpdateInterval) t.wg.Add(1) go func() { // S/R-SAFE: stopped during save. defer t.wg.Done() for { // Start with an update immediately, so the clocks are // ready ASAP. // Call Update within a Write block to prevent the VDSO // from using the old params between Update and // Write. if err := t.params.Write(func() vdsoParams { monotonicParams, monotonicOk, realtimeParams, realtimeOk := t.clocks.Update() var p vdsoParams if monotonicOk { p.monotonicReady = 1 p.monotonicBaseCycles = int64(monotonicParams.BaseCycles) p.monotonicBaseRef = int64(monotonicParams.BaseRef) + t.monotonicOffset p.monotonicFrequency = monotonicParams.Frequency } if realtimeOk { p.realtimeReady = 1 p.realtimeBaseCycles = int64(realtimeParams.BaseCycles) p.realtimeBaseRef = int64(realtimeParams.BaseRef) p.realtimeFrequency = realtimeParams.Frequency } return p }); err != nil { log.Warningf("Unable to update VDSO parameter page: %v", err) } select { case <-timer.C: case <-t.stop: return } } }() } // stopUpdater stops the update goroutine, blocking until it exits. // // mu must be held. func (t *Timekeeper) stopUpdater() { if t.stop == nil { // Updater not running. return } close(t.stop) t.wg.Wait() t.stop = nil } // Destroy destroys the Timekeeper, freeing all associated resources. func (t *Timekeeper) Destroy() { t.mu.Lock() defer t.mu.Unlock() t.stopUpdater() } // PauseUpdates stops clock parameter updates. This should only be used when // Tasks are not running and thus cannot access the clock. func (t *Timekeeper) PauseUpdates() { t.mu.Lock() defer t.mu.Unlock() t.stopUpdater() } // ResumeUpdates restarts clock parameter updates stopped by PauseUpdates. func (t *Timekeeper) ResumeUpdates() { t.mu.Lock() defer t.mu.Unlock() t.startUpdater() } // GetTime returns the current time in nanoseconds. func (t *Timekeeper) GetTime(c sentrytime.ClockID) (int64, error) { if t.clocks == nil { if t.restored == nil { panic("Timekeeper used before initialized with SetClocks") } <-t.restored } now, err := t.clocks.GetTime(c) if err == nil && c == sentrytime.Monotonic { now += t.monotonicOffset for { // It's possible that the clock is shaky. This may be due to // platform issues, e.g. the KVM platform relies on the guest // TSC and host TSC, which may not be perfectly in sync. To // work around this issue, ensure that the monotonic time is // always bounded by the last time read. 
oldLowerBound := atomic.LoadInt64(&t.monotonicLowerBound) if now < oldLowerBound { now = oldLowerBound break } if atomic.CompareAndSwapInt64(&t.monotonicLowerBound, oldLowerBound, now) { break } } } return now, err } // BootTime returns the system boot real time. func (t *Timekeeper) BootTime() ktime.Time { return t.bootTime } // timekeeperClock is a ktime.Clock that reads time from a // kernel.Timekeeper-managed clock. // // +stateify savable type timekeeperClock struct { tk *Timekeeper c sentrytime.ClockID // Implements ktime.Clock.WallTimeUntil. ktime.WallRateClock `state:"nosave"` // Implements waiter.Waitable. (We have no ability to detect // discontinuities from external changes to CLOCK_REALTIME). ktime.NoClockEvents `state:"nosave"` } // Now implements ktime.Clock.Now. func (tc *timekeeperClock) Now() ktime.Time { now, err := tc.tk.GetTime(tc.c) if err != nil { panic(fmt.Sprintf("timekeeperClock(ClockID=%v)).Now: %v", tc.c, err)) } return ktime.FromNanoseconds(now) }
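// Illustrative sketch (not part of the package): the essence of the
// monotonicLowerBound scheme in Timekeeper.GetTime, isolated. Readings from a
// possibly-shaky clock are clamped with an atomic compare-and-swap loop so
// that successive readings never move backwards. Names here are hypothetical.
package clocksketch

import "sync/atomic"

var lowerBound int64

// monotonize clamps now against the highest value returned so far.
func monotonize(now int64) int64 {
	for {
		old := atomic.LoadInt64(&lowerBound)
		if now < old {
			// The raw clock moved backwards; report the previous
			// reading instead.
			return old
		}
		// Publish the new lower bound; retry if a concurrent reader
		// won the race with a reading of its own.
		if atomic.CompareAndSwapInt64(&lowerBound, old, now) {
			return now
		}
	}
}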
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package safemem import ( "errors" "io" "math" ) // ErrEndOfBlockSeq is returned by BlockSeqWriter when attempting to write // beyond the end of the BlockSeq. var ErrEndOfBlockSeq = errors.New("write beyond end of BlockSeq") // Reader represents a streaming byte source like io.Reader. type Reader interface { // ReadToBlocks reads up to dsts.NumBytes() bytes into dsts and returns the // number of bytes read. It may return a partial read without an error // (i.e. (n, nil) where 0 < n < dsts.NumBytes()). It should not return a // full read with an error (i.e. (dsts.NumBytes(), err) where err != nil); // note that this differs from io.Reader.Read (in particular, io.EOF should // not be returned if ReadToBlocks successfully reads dsts.NumBytes() // bytes.) ReadToBlocks(dsts BlockSeq) (uint64, error) } // Writer represents a streaming byte sink like io.Writer. type Writer interface { // WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns // the number of bytes written. It may return a partial write without an // error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not // return a full write with an error (i.e. (srcs.NumBytes(), err) where err // != nil).
WriteFromBlocks(srcs BlockSeq) (uint64, error) } // ReadFullToBlocks repeatedly invokes r.ReadToBlocks until dsts.NumBytes() // bytes have been read or ReadToBlocks returns an error. func ReadFullToBlocks(r Reader, dsts BlockSeq) (uint64, error) { var done uint64 for !dsts.IsEmpty() { n, err := r.ReadToBlocks(dsts) done += n if err != nil { return done, err } dsts = dsts.DropFirst64(n) } return done, nil } // WriteFullFromBlocks repeatedly invokes w.WriteFromBlocks until // srcs.NumBytes() bytes have been written or WriteFromBlocks returns an error. func WriteFullFromBlocks(w Writer, srcs BlockSeq) (uint64, error) { var done uint64 for !srcs.IsEmpty() { n, err := w.WriteFromBlocks(srcs) done += n if err != nil { return done, err } srcs = srcs.DropFirst64(n) } return done, nil } // BlockSeqReader implements Reader by reading from a BlockSeq. type BlockSeqReader struct { Blocks BlockSeq } // ReadToBlocks implements Reader.ReadToBlocks. func (r *BlockSeqReader) ReadToBlocks(dsts BlockSeq) (uint64, error) { n, err := CopySeq(dsts, r.Blocks) r.Blocks = r.Blocks.DropFirst64(n) if err != nil { return n, err } if n < dsts.NumBytes() { return n, io.EOF } return n, nil } // BlockSeqWriter implements Writer by writing to a BlockSeq. type BlockSeqWriter struct { Blocks BlockSeq } // WriteFromBlocks implements Writer.WriteFromBlocks. func (w *BlockSeqWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) { n, err := CopySeq(w.Blocks, srcs) w.Blocks = w.Blocks.DropFirst64(n) if err != nil { return n, err } if n < srcs.NumBytes() { return n, ErrEndOfBlockSeq } return n, nil } // ReaderFunc implements Reader for a function with the semantics of // Reader.ReadToBlocks. type ReaderFunc func(dsts BlockSeq) (uint64, error) // ReadToBlocks implements Reader.ReadToBlocks. func (f ReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) { return f(dsts) } // WriterFunc implements Writer for a function with the semantics of // Writer.WriteFromBlocks. type WriterFunc func(srcs BlockSeq) (uint64, error) // WriteFromBlocks implements Writer.WriteFromBlocks. func (f WriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) { return f(srcs) } // ToIOReader implements io.Reader for a (safemem.)Reader. // // ToIOReader will return a successful partial read iff Reader.ReadToBlocks does // so. type ToIOReader struct { Reader Reader } // Read implements io.Reader.Read. func (r ToIOReader) Read(dst []byte) (int, error) { n, err := r.Reader.ReadToBlocks(BlockSeqOf(BlockFromSafeSlice(dst))) return int(n), err } // ToIOWriter implements io.Writer for a (safemem.)Writer. type ToIOWriter struct { Writer Writer } // Write implements io.Writer.Write. func (w ToIOWriter) Write(src []byte) (int, error) { // io.Writer does not permit partial writes. n, err := WriteFullFromBlocks(w.Writer, BlockSeqOf(BlockFromSafeSlice(src))) return int(n), err } // FromIOReader implements Reader for an io.Reader by repeatedly invoking // io.Reader.Read until it returns an error or partial read. This is not // thread-safe. // // FromIOReader will return a successful partial read iff Reader.Read does so. type FromIOReader struct { Reader io.Reader } // ReadToBlocks implements Reader.ReadToBlocks. 
func (r FromIOReader) ReadToBlocks(dsts BlockSeq) (uint64, error) { var buf []byte var done uint64 for !dsts.IsEmpty() { dst := dsts.Head() var n int var err error n, buf, err = r.readToBlock(dst, buf) done += uint64(n) if n != dst.Len() { return done, err } dsts = dsts.Tail() if err != nil { if dsts.IsEmpty() && err == io.EOF { return done, nil } return done, err } } return done, nil } func (r FromIOReader) readToBlock(dst Block, buf []byte) (int, []byte, error) { // io.Reader isn't safecopy-aware, so we have to buffer Blocks that require // safecopy. if !dst.NeedSafecopy() { n, err := r.Reader.Read(dst.ToSlice()) return n, buf, err } if len(buf) < dst.Len() { buf = make([]byte, dst.Len()) } rn, rerr := r.Reader.Read(buf[:dst.Len()]) wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) if wberr != nil { return wbn, buf, wberr } return wbn, buf, rerr } // FromIOReaderAt implements Reader for an io.ReaderAt. Does not repeatedly // invoke io.ReaderAt.ReadAt because ReadAt is more strict than Read. A partial // read indicates an error. This is not thread-safe. type FromIOReaderAt struct { ReaderAt io.ReaderAt Offset int64 } // ReadToBlocks implements Reader.ReadToBlocks. func (r FromIOReaderAt) ReadToBlocks(dsts BlockSeq) (uint64, error) { var buf []byte var done uint64 for !dsts.IsEmpty() { dst := dsts.Head() var n int var err error n, buf, err = r.readToBlock(dst, buf) done += uint64(n) if n != dst.Len() { return done, err } dsts = dsts.Tail() if err != nil { if dsts.IsEmpty() && err == io.EOF { return done, nil } return done, err } } return done, nil } func (r FromIOReaderAt) readToBlock(dst Block, buf []byte) (int, []byte, error) { // io.Reader isn't safecopy-aware, so we have to buffer Blocks that require // safecopy. if !dst.NeedSafecopy() { n, err := r.ReaderAt.ReadAt(dst.ToSlice(), r.Offset) r.Offset += int64(n) return n, buf, err } if len(buf) < dst.Len() { buf = make([]byte, dst.Len()) } rn, rerr := r.ReaderAt.ReadAt(buf[:dst.Len()], r.Offset) r.Offset += int64(rn) wbn, wberr := Copy(dst, BlockFromSafeSlice(buf[:rn])) if wberr != nil { return wbn, buf, wberr } return wbn, buf, rerr } // FromIOWriter implements Writer for an io.Writer by repeatedly invoking // io.Writer.Write until it returns an error or partial write. // // FromIOWriter will tolerate implementations of io.Writer.Write that return // partial writes with a nil error in contravention of io.Writer's // requirements, since Writer is permitted to do so. FromIOWriter will return a // successful partial write iff Writer.Write does so. type FromIOWriter struct { Writer io.Writer } // WriteFromBlocks implements Writer.WriteFromBlocks. func (w FromIOWriter) WriteFromBlocks(srcs BlockSeq) (uint64, error) { var buf []byte var done uint64 for !srcs.IsEmpty() { src := srcs.Head() var n int var err error n, buf, err = w.writeFromBlock(src, buf) done += uint64(n) if n != src.Len() || err != nil { return done, err } srcs = srcs.Tail() } return done, nil } func (w FromIOWriter) writeFromBlock(src Block, buf []byte) (int, []byte, error) { // io.Writer isn't safecopy-aware, so we have to buffer Blocks that require // safecopy. 
if !src.NeedSafecopy() { n, err := w.Writer.Write(src.ToSlice()) return n, buf, err } if len(buf) < src.Len() { buf = make([]byte, src.Len()) } bufn, buferr := Copy(BlockFromSafeSlice(buf[:src.Len()]), src) wn, werr := w.Writer.Write(buf[:bufn]) if werr != nil { return wn, buf, werr } return wn, buf, buferr } // FromVecReaderFunc implements Reader for a function that reads data into a // [][]byte and returns the number of bytes read as an int64. type FromVecReaderFunc struct { ReadVec func(dsts [][]byte) (int64, error) } // ReadToBlocks implements Reader.ReadToBlocks. // // ReadToBlocks calls r.ReadVec at most once. func (r FromVecReaderFunc) ReadToBlocks(dsts BlockSeq) (uint64, error) { if dsts.IsEmpty() { return 0, nil } // Ensure that we don't pass a [][]byte with a total length > MaxInt64. dsts = dsts.TakeFirst64(uint64(math.MaxInt64)) dstSlices := make([][]byte, 0, dsts.NumBlocks()) // Buffer Blocks that require safecopy. for tmp := dsts; !tmp.IsEmpty(); tmp = tmp.Tail() { dst := tmp.Head() if dst.NeedSafecopy() { dstSlices = append(dstSlices, make([]byte, dst.Len())) } else { dstSlices = append(dstSlices, dst.ToSlice()) } } rn, rerr := r.ReadVec(dstSlices) dsts = dsts.TakeFirst64(uint64(rn)) var done uint64 var i int for !dsts.IsEmpty() { dst := dsts.Head() if dst.NeedSafecopy() { n, err := Copy(dst, BlockFromSafeSlice(dstSlices[i])) done += uint64(n) if err != nil { return done, err } } else { done += uint64(dst.Len()) } dsts = dsts.Tail() i++ } return done, rerr } // FromVecWriterFunc implements Writer for a function that writes data from a // [][]byte and returns the number of bytes written. type FromVecWriterFunc struct { WriteVec func(srcs [][]byte) (int64, error) } // WriteFromBlocks implements Writer.WriteFromBlocks. // // WriteFromBlocks calls w.WriteVec at most once. func (w FromVecWriterFunc) WriteFromBlocks(srcs BlockSeq) (uint64, error) { if srcs.IsEmpty() { return 0, nil } // Ensure that we don't pass a [][]byte with a total length > MaxInt64. srcs = srcs.TakeFirst64(uint64(math.MaxInt64)) srcSlices := make([][]byte, 0, srcs.NumBlocks()) // Buffer Blocks that require safecopy. var buferr error for tmp := srcs; !tmp.IsEmpty(); tmp = tmp.Tail() { src := tmp.Head() if src.NeedSafecopy() { slice := make([]byte, src.Len()) n, err := Copy(BlockFromSafeSlice(slice), src) srcSlices = append(srcSlices, slice[:n]) if err != nil { buferr = err break } } else { srcSlices = append(srcSlices, src.ToSlice()) } } n, err := w.WriteVec(srcSlices) if err != nil { return uint64(n), err } return uint64(n), buferr }
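// Illustrative sketch (not part of the package): adapting an io.Reader into a
// safemem.Reader and filling a destination slice. Assumes this package is
// importable as gvisor.dev/gvisor/pkg/safemem; all identifiers used are
// defined above.
package main

import (
	"fmt"
	"strings"

	"gvisor.dev/gvisor/pkg/safemem"
)

func main() {
	dst := make([]byte, 11)
	dsts := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst))

	// FromIOReader retries short reads from the underlying io.Reader;
	// ReadFullToBlocks then loops until dsts is exhausted or an error
	// (possibly io.EOF) is returned.
	r := safemem.FromIOReader{Reader: strings.NewReader("hello world")}
	n, err := safemem.ReadFullToBlocks(r, dsts)
	fmt.Printf("n=%d err=%v dst=%q\n", n, err, dst) // n=11 err=<nil> dst="hello world"
}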
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
package p9 import ( "fmt" "io" "sync/atomic" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" ) // Attach attaches to a server. // // Note that authentication is not currently supported. func (c *Client) Attach(name string) (File, error) { fid, ok := c.fidPool.Get() if !ok { return nil, ErrOutOfFIDs } rattach := Rattach{} if err := c.sendRecv(&Tattach{FID: FID(fid), Auth: Tauth{AttachName: name, AuthenticationFID: NoFID, UID: NoUID}}, &rattach); err != nil { c.fidPool.Put(fid) return nil, err } return c.newFile(FID(fid)), nil } // newFile returns a new client file. func (c *Client) newFile(fid FID) *clientFile { return &clientFile{ client: c, fid: fid, } } // clientFile is provided to clients. // // This proxies all of the interfaces found in file.go. type clientFile struct { DisallowServerCalls // client is the originating client. client *Client // fid is the FID for this file. fid FID // closed indicates whether this file has been closed. closed uint32 } // Walk implements File.Walk. func (c *clientFile) Walk(names []string) ([]QID, File, error) { if atomic.LoadUint32(&c.closed) != 0 { return nil, nil, unix.EBADF } fid, ok := c.client.fidPool.Get() if !ok { return nil, nil, ErrOutOfFIDs } rwalk := Rwalk{} if err := c.client.sendRecv(&Twalk{FID: c.fid, NewFID: FID(fid), Names: names}, &rwalk); err != nil { c.client.fidPool.Put(fid) return nil, nil, err } // Return a new client file. return rwalk.QIDs, c.client.newFile(FID(fid)), nil } // WalkGetAttr implements File.WalkGetAttr. func (c *clientFile) WalkGetAttr(components []string) ([]QID, File, AttrMask, Attr, error) { if atomic.LoadUint32(&c.closed) != 0 { return nil, nil, AttrMask{}, Attr{}, unix.EBADF } if !versionSupportsTwalkgetattr(c.client.version) { qids, file, err := c.Walk(components) if err != nil { return nil, nil, AttrMask{}, Attr{}, err } _, valid, attr, err := file.GetAttr(AttrMaskAll()) if err != nil { file.Close() return nil, nil, AttrMask{}, Attr{}, err } return qids, file, valid, attr, nil } fid, ok := c.client.fidPool.Get() if !ok { return nil, nil, AttrMask{}, Attr{}, ErrOutOfFIDs } rwalkgetattr := Rwalkgetattr{} if err := c.client.sendRecv(&Twalkgetattr{FID: c.fid, NewFID: FID(fid), Names: components}, &rwalkgetattr); err != nil { c.client.fidPool.Put(fid) return nil, nil, AttrMask{}, Attr{}, err } // Return a new client file. return rwalkgetattr.QIDs, c.client.newFile(FID(fid)), rwalkgetattr.Valid, rwalkgetattr.Attr, nil } func (c *clientFile) MultiGetAttr(names []string) ([]FullStat, error) { if atomic.LoadUint32(&c.closed) != 0 { return nil, unix.EBADF } if !versionSupportsTmultiGetAttr(c.client.version) { return DefaultMultiGetAttr(c, names) } rmultigetattr := Rmultigetattr{} if err := c.client.sendRecv(&Tmultigetattr{FID: c.fid, Names: names}, &rmultigetattr); err != nil { return nil, err } return rmultigetattr.Stats, nil } // StatFS implements File.StatFS. func (c *clientFile) StatFS() (FSStat, error) { if atomic.LoadUint32(&c.closed) != 0 { return FSStat{}, unix.EBADF } rstatfs := Rstatfs{} if err := c.client.sendRecv(&Tstatfs{FID: c.fid}, &rstatfs); err != nil { return FSStat{}, err } return rstatfs.FSStat, nil } // FSync implements File.FSync. func (c *clientFile) FSync() error { if atomic.LoadUint32(&c.closed) != 0 { return unix.EBADF } return c.client.sendRecv(&Tfsync{FID: c.fid}, &Rfsync{}) } // GetAttr implements File.GetAttr. 
func (c *clientFile) GetAttr(req AttrMask) (QID, AttrMask, Attr, error) { if atomic.LoadUint32(&c.closed) != 0 { return QID{}, AttrMask{}, Attr{}, unix.EBADF } rgetattr := Rgetattr{} if err := c.client.sendRecv(&Tgetattr{FID: c.fid, AttrMask: req}, &rgetattr); err != nil { return QID{}, AttrMask{}, Attr{}, err } return rgetattr.QID, rgetattr.Valid, rgetattr.Attr, nil } // SetAttr implements File.SetAttr. func (c *clientFile) SetAttr(valid SetAttrMask, attr SetAttr) error { if atomic.LoadUint32(&c.closed) != 0 { return unix.EBADF } return c.client.sendRecv(&Tsetattr{FID: c.fid, Valid: valid, SetAttr: attr}, &Rsetattr{}) } // GetXattr implements File.GetXattr. func (c *clientFile) GetXattr(name string, size uint64) (string, error) { if atomic.LoadUint32(&c.closed) != 0 { return "", unix.EBADF } if !versionSupportsGetSetXattr(c.client.version) { return "", unix.EOPNOTSUPP } rgetxattr := Rgetxattr{} if err := c.client.sendRecv(&Tgetxattr{FID: c.fid, Name: name, Size: size}, &rgetxattr); err != nil { return "", err } return rgetxattr.Value, nil } // SetXattr implements File.SetXattr. func (c *clientFile) SetXattr(name, value string, flags uint32) error { if atomic.LoadUint32(&c.closed) != 0 { return unix.EBADF } if !versionSupportsGetSetXattr(c.client.version) { return unix.EOPNOTSUPP } return c.client.sendRecv(&Tsetxattr{FID: c.fid, Name: name, Value: value, Flags: flags}, &Rsetxattr{}) } // ListXattr implements File.ListXattr. func (c *clientFile) ListXattr(size uint64) (map[string]struct{}, error) { if atomic.LoadUint32(&c.closed) != 0 { return nil, unix.EBADF } if !versionSupportsListRemoveXattr(c.client.version) { return nil, unix.EOPNOTSUPP } rlistxattr := Rlistxattr{} if err := c.client.sendRecv(&Tlistxattr{FID: c.fid, Size: size}, &rlistxattr); err != nil { return nil, err } xattrs := make(map[string]struct{}, len(rlistxattr.Xattrs)) for _, x := range rlistxattr.Xattrs { xattrs[x] = struct{}{} } return xattrs, nil } // RemoveXattr implements File.RemoveXattr. func (c *clientFile) RemoveXattr(name string) error { if atomic.LoadUint32(&c.closed) != 0 { return unix.EBADF } if !versionSupportsListRemoveXattr(c.client.version) { return unix.EOPNOTSUPP } return c.client.sendRecv(&Tremovexattr{FID: c.fid, Name: name}, &Rremovexattr{}) } // Allocate implements File.Allocate. func (c *clientFile) Allocate(mode AllocateMode, offset, length uint64) error { if atomic.LoadUint32(&c.closed) != 0 { return unix.EBADF } if !versionSupportsTallocate(c.client.version) { return unix.EOPNOTSUPP } return c.client.sendRecv(&Tallocate{FID: c.fid, Mode: mode, Offset: offset, Length: length}, &Rallocate{}) } // Remove implements File.Remove. // // N.B. This method is no longer part of the file interface and should be // considered deprecated. func (c *clientFile) Remove() error { // Avoid double close. if !atomic.CompareAndSwapUint32(&c.closed, 0, 1) { return unix.EBADF } // Send the remove message. if err := c.client.sendRecv(&Tremove{FID: c.fid}, &Rremove{}); err != nil { return err } // "It is correct to consider remove to be a clunk with the side effect // of removing the file if permissions allow." // https://swtch.com/plan9port/man/man9/remove.html // Return the FID to the pool. c.client.fidPool.Put(uint64(c.fid)) return nil } // Close implements File.Close. func (c *clientFile) Close() error { // Avoid double close. if !atomic.CompareAndSwapUint32(&c.closed, 0, 1) { return unix.EBADF } // Send the close message. 
if err := c.client.sendRecv(&Tclunk{FID: c.fid}, &Rclunk{}); err != nil { // If an error occurred, we toss away the FID. This isn't ideal, // but I'm not sure what else makes sense in this context. log.Warningf("Tclunk failed, losing FID %v: %v", c.fid, err) return err } // Return the FID to the pool. c.client.fidPool.Put(uint64(c.fid)) return nil } // SetAttrClose implements File.SetAttrClose. func (c *clientFile) SetAttrClose(valid SetAttrMask, attr SetAttr) error { if !versionSupportsTsetattrclunk(c.client.version) { setAttrErr := c.SetAttr(valid, attr) // Try to close file even in case of failure above. Since the state of the // file is unknown to the caller, it will not attempt to close the file // again. if err := c.Close(); err != nil { return err } return setAttrErr } // Avoid double close. if !atomic.CompareAndSwapUint32(&c.closed, 0, 1) { return unix.EBADF } // Send the message. if err := c.client.sendRecv(&Tsetattrclunk{FID: c.fid, Valid: valid, SetAttr: attr}, &Rsetattrclunk{}); err != nil { // If an error occurred, we toss away the FID. This isn't ideal, // but I'm not sure what else makes sense in this context. log.Warningf("Tsetattrclunk failed, losing FID %v: %v", c.fid, err) return err } // Return the FID to the pool. c.client.fidPool.Put(uint64(c.fid)) return nil } // Open implements File.Open. func (c *clientFile) Open(flags OpenFlags) (*fd.FD, QID, uint32, error) { if atomic.LoadUint32(&c.closed) != 0 { return nil, QID{}, 0, unix.EBADF } rlopen := Rlopen{} if err := c.client.sendRecv(&Tlopen{FID: c.fid, Flags: flags}, &rlopen); err != nil { return nil, QID{}, 0, err } return rlopen.File, rlopen.QID, rlopen.IoUnit, nil } // Connect implements File.Connect. func (c *clientFile) Connect(flags ConnectFlags) (*fd.FD, error) { if atomic.LoadUint32(&c.closed) != 0 { return nil, unix.EBADF } if !VersionSupportsConnect(c.client.version) { return nil, unix.ECONNREFUSED } rlconnect := Rlconnect{} if err := c.client.sendRecv(&Tlconnect{FID: c.fid, Flags: flags}, &rlconnect); err != nil { return nil, err } return rlconnect.File, nil } // chunk applies fn to p in chunkSize-sized chunks until fn returns a partial result, p is // exhausted, or an error is encountered (which may be io.EOF). func chunk(chunkSize uint32, fn func([]byte, uint64) (int, error), p []byte, offset uint64) (int, error) { // Some p9.Clients depend on executing fn on zero-byte buffers. Handle this // as a special case (normally it is fine to short-circuit and return (0, nil)). if len(p) == 0 { return fn(p, offset) } // total is the cumulative bytes processed. var total int for { var n int var err error // We're done, don't bother trying to do anything more. if total == len(p) { return total, nil } // Apply fn to a chunkSize-sized (or less) chunk of p. if len(p) < total+int(chunkSize) { n, err = fn(p[total:], offset) } else { n, err = fn(p[total:total+int(chunkSize)], offset) } total += n offset += uint64(n) // Return whatever we have processed if we encounter an error. This error // could be io.EOF. if err != nil { return total, err } // Did we get a partial result? If so, return it immediately. if n < int(chunkSize) { return total, nil } // If we received more bytes than we ever requested, this is a problem. if total > len(p) { panic(fmt.Sprintf("bytes completed (%d) > requested (%d)", total, len(p))) } } } // ReadAt proxies File.ReadAt.
func (c *clientFile) ReadAt(p []byte, offset uint64) (int, error) { return chunk(c.client.payloadSize, c.readAt, p, offset) } func (c *clientFile) readAt(p []byte, offset uint64) (int, error) { if atomic.LoadUint32(&c.closed) != 0 { return 0, unix.EBADF } rread := Rread{Data: p} if err := c.client.sendRecv(&Tread{FID: c.fid, Offset: offset, Count: uint32(len(p))}, &rread); err != nil { return 0, err } // The message may have been truncated, or for some reason a new buffer // allocated. This isn't the common path, but we make sure that if the // payload has changed we copy it. See transport.go for more information. if len(p) > 0 && len(rread.Data) > 0 && &rread.Data[0] != &p[0] { copy(p, rread.Data) } // io.EOF is not an error that a p9 server can return. Use POSIX semantics to // return io.EOF manually: zero bytes were returned and a non-zero buffer was used. if len(rread.Data) == 0 && len(p) > 0 { return 0, io.EOF } return len(rread.Data), nil } // WriteAt proxies File.WriteAt. func (c *clientFile) WriteAt(p []byte, offset uint64) (int, error) { return chunk(c.client.payloadSize, c.writeAt, p, offset) } func (c *clientFile) writeAt(p []byte, offset uint64) (int, error) { if atomic.LoadUint32(&c.closed) != 0 { return 0, unix.EBADF } rwrite := Rwrite{} if err := c.client.sendRecv(&Twrite{FID: c.fid, Offset: offset, Data: p}, &rwrite); err != nil { return 0, err } return int(rwrite.Count), nil } // ReadWriterFile wraps a File and implements io.ReadWriter, io.ReaderAt, and io.WriterAt. type ReadWriterFile struct { File File Offset uint64 } // Read implements part of the io.ReadWriter interface. func (r *ReadWriterFile) Read(p []byte) (int, error) { n, err := r.File.ReadAt(p, r.Offset) r.Offset += uint64(n) if err != nil { return n, err } if n == 0 && len(p) > 0 { return n, io.EOF } return n, nil } // ReadAt implements the io.ReaderAt interface. func (r *ReadWriterFile) ReadAt(p []byte, offset int64) (int, error) { n, err := r.File.ReadAt(p, uint64(offset)) if err != nil { return 0, err } if n == 0 && len(p) > 0 { return n, io.EOF } return n, nil } // Write implements part of the io.ReadWriter interface. // // Note that this may return a short write with a nil error. This violates the // contract of io.Writer, but is more consistent with gVisor's pattern of // returning errors that correspond to Linux errnos. Since short writes without // error are common in Linux, returning a nil error is appropriate. func (r *ReadWriterFile) Write(p []byte) (int, error) { n, err := r.File.WriteAt(p, r.Offset) r.Offset += uint64(n) return n, err } // WriteAt implements the io.WriterAt interface. // // Note that this may return a short write with a nil error. This violates the // contract of io.WriterAt. See comment on Write for justification. func (r *ReadWriterFile) WriteAt(p []byte, offset int64) (int, error) { return r.File.WriteAt(p, uint64(offset)) } // Rename implements File.Rename. func (c *clientFile) Rename(dir File, name string) error { if atomic.LoadUint32(&c.closed) != 0 { return unix.EBADF } clientDir, ok := dir.(*clientFile) if !ok { return unix.EBADF } return c.client.sendRecv(&Trename{FID: c.fid, Directory: clientDir.fid, Name: name}, &Rrename{}) } // Create implements File.Create.
func (c *clientFile) Create(name string, openFlags OpenFlags, permissions FileMode, uid UID, gid GID) (*fd.FD, File, QID, uint32, error) { if atomic.LoadUint32(&c.closed) != 0 { return nil, nil, QID{}, 0, unix.EBADF } msg := Tlcreate{ FID: c.fid, Name: name, OpenFlags: openFlags, Permissions: permissions, GID: NoGID, } if versionSupportsTucreation(c.client.version) { msg.GID = gid rucreate := Rucreate{} if err := c.client.sendRecv(&Tucreate{Tlcreate: msg, UID: uid}, &rucreate); err != nil { return nil, nil, QID{}, 0, err } return rucreate.File, c, rucreate.QID, rucreate.IoUnit, nil } rlcreate := Rlcreate{} if err := c.client.sendRecv(&msg, &rlcreate); err != nil { return nil, nil, QID{}, 0, err } return rlcreate.File, c, rlcreate.QID, rlcreate.IoUnit, nil } // Mkdir implements File.Mkdir. func (c *clientFile) Mkdir(name string, permissions FileMode, uid UID, gid GID) (QID, error) { if atomic.LoadUint32(&c.closed) != 0 { return QID{}, unix.EBADF } msg := Tmkdir{ Directory: c.fid, Name: name, Permissions: permissions, GID: NoGID, } if versionSupportsTucreation(c.client.version) { msg.GID = gid rumkdir := Rumkdir{} if err := c.client.sendRecv(&Tumkdir{Tmkdir: msg, UID: uid}, &rumkdir); err != nil { return QID{}, err } return rumkdir.QID, nil } rmkdir := Rmkdir{} if err := c.client.sendRecv(&msg, &rmkdir); err != nil { return QID{}, err } return rmkdir.QID, nil } // Symlink implements File.Symlink. func (c *clientFile) Symlink(oldname string, newname string, uid UID, gid GID) (QID, error) { if atomic.LoadUint32(&c.closed) != 0 { return QID{}, unix.EBADF } msg := Tsymlink{ Directory: c.fid, Name: newname, Target: oldname, GID: NoGID, } if versionSupportsTucreation(c.client.version) { msg.GID = gid rusymlink := Rusymlink{} if err := c.client.sendRecv(&Tusymlink{Tsymlink: msg, UID: uid}, &rusymlink); err != nil { return QID{}, err } return rusymlink.QID, nil } rsymlink := Rsymlink{} if err := c.client.sendRecv(&msg, &rsymlink); err != nil { return QID{}, err } return rsymlink.QID, nil } // Link implements File.Link. func (c *clientFile) Link(target File, newname string) error { if atomic.LoadUint32(&c.closed) != 0 { return unix.EBADF } targetFile, ok := target.(*clientFile) if !ok { return unix.EBADF } return c.client.sendRecv(&Tlink{Directory: c.fid, Name: newname, Target: targetFile.fid}, &Rlink{}) } // Mknod implements File.Mknod. func (c *clientFile) Mknod(name string, mode FileMode, major uint32, minor uint32, uid UID, gid GID) (QID, error) { if atomic.LoadUint32(&c.closed) != 0 { return QID{}, unix.EBADF } msg := Tmknod{ Directory: c.fid, Name: name, Mode: mode, Major: major, Minor: minor, GID: NoGID, } if versionSupportsTucreation(c.client.version) { msg.GID = gid rumknod := Rumknod{} if err := c.client.sendRecv(&Tumknod{Tmknod: msg, UID: uid}, &rumknod); err != nil { return QID{}, err } return rumknod.QID, nil } rmknod := Rmknod{} if err := c.client.sendRecv(&msg, &rmknod); err != nil { return QID{}, err } return rmknod.QID, nil } // RenameAt implements File.RenameAt. func (c *clientFile) RenameAt(oldname string, newdir File, newname string) error { if atomic.LoadUint32(&c.closed) != 0 { return unix.EBADF } clientNewDir, ok := newdir.(*clientFile) if !ok { return unix.EBADF } return c.client.sendRecv(&Trenameat{OldDirectory: c.fid, OldName: oldname, NewDirectory: clientNewDir.fid, NewName: newname}, &Rrenameat{}) } // UnlinkAt implements File.UnlinkAt. 
func (c *clientFile) UnlinkAt(name string, flags uint32) error { if atomic.LoadUint32(&c.closed) != 0 { return unix.EBADF } return c.client.sendRecv(&Tunlinkat{Directory: c.fid, Name: name, Flags: flags}, &Runlinkat{}) } // Readdir implements File.Readdir. func (c *clientFile) Readdir(offset uint64, count uint32) ([]Dirent, error) { if atomic.LoadUint32(&c.closed) != 0 { return nil, unix.EBADF } rreaddir := Rreaddir{} if err := c.client.sendRecv(&Treaddir{Directory: c.fid, Offset: offset, Count: count}, &rreaddir); err != nil { return nil, err } return rreaddir.Entries, nil } // Readlink implements File.Readlink. func (c *clientFile) Readlink() (string, error) { if atomic.LoadUint32(&c.closed) != 0 { return "", unix.EBADF } rreadlink := Rreadlink{} if err := c.client.sendRecv(&Treadlink{FID: c.fid}, &rreadlink); err != nil { return "", err } return rreadlink.Target, nil } // Flush implements File.Flush. func (c *clientFile) Flush() error { if atomic.LoadUint32(&c.closed) != 0 { return unix.EBADF } if !VersionSupportsTflushf(c.client.version) { return nil } return c.client.sendRecv(&Tflushf{FID: c.fid}, &Rflushf{}) }
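// Illustrative sketch (not part of the package): a typical client-side flow
// against an already-connected *p9.Client (client construction is outside
// this file's scope). p9.ReadOnly is assumed to be the read-only OpenFlags
// value; readAll is a hypothetical helper.
package p9sketch

import (
	"io"

	"gvisor.dev/gvisor/pkg/p9"
)

func readAll(c *p9.Client, name string) ([]byte, error) {
	root, err := c.Attach("/")
	if err != nil {
		return nil, err
	}
	defer root.Close()

	// Walk allocates a fresh FID for the child; root remains usable.
	_, f, err := root.Walk([]string{name})
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// Open also returns an optional donated host FD, the QID, and the I/O
	// unit; a real caller should close the host FD if one is returned.
	if _, _, _, err := f.Open(p9.ReadOnly); err != nil {
		return nil, err
	}

	// ReadWriterFile adapts the File's ReadAt to io.Reader, returning
	// io.EOF on a zero-byte read as shown above.
	return io.ReadAll(&p9.ReadWriterFile{File: f})
}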
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package eventchannel contains functionality for sending any protobuf message // on a socketpair. // // The wire format is a uvarint length followed by a binary protobuf.Any // message. package eventchannel import ( "encoding/binary" "fmt" "google.golang.org/protobuf/encoding/prototext" "google.golang.org/protobuf/proto" "gvisor.dev/gvisor/pkg/errors/linuxerr" pb "gvisor.dev/gvisor/pkg/eventchannel/eventchannel_go_proto" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/unet" ) // Emitter emits a proto message. type Emitter interface { // Emit writes a single eventchannel message to an emitter. Emit should // return hangup = true to indicate an emitter has "hung up" and no further // messages should be directed to it. Emit(msg proto.Message) (hangup bool, err error) // Close closes this emitter. Emit cannot be used after Close is called. Close() error } // DefaultEmitter is the default emitter. Calls to Emit and AddEmitter are sent // to this Emitter. var DefaultEmitter = &multiEmitter{} // Emit is a helper method that calls DefaultEmitter.Emit. func Emit(msg proto.Message) error { _, err := DefaultEmitter.Emit(msg) return err } // AddEmitter is a helper method that calls DefaultEmitter.AddEmitter. func AddEmitter(e Emitter) { DefaultEmitter.AddEmitter(e) } // multiEmitter is an Emitter that forwards messages to multiple Emitters. type multiEmitter struct { // mu protects emitters. mu sync.Mutex // emitters is initialized lazily in AddEmitter. emitters map[Emitter]struct{} } // Emit emits a message using all added emitters. func (me *multiEmitter) Emit(msg proto.Message) (bool, error) { me.mu.Lock() defer me.mu.Unlock() var err error for e := range me.emitters { hangup, eerr := e.Emit(msg) if eerr != nil { if err == nil { err = fmt.Errorf("error emitting %v on %v: %v", msg, e, eerr) } else { err = fmt.Errorf("%v; on %v: %v", err, e, eerr) } // Log as well, since most callers ignore the error. log.Warningf("Error emitting %v on %v: %v", msg, e, eerr) } if hangup { log.Infof("Hangup on eventchannel emitter %v.", e) delete(me.emitters, e) } } return false, err } // AddEmitter adds a new emitter.
func (me *multiEmitter) AddEmitter(e Emitter) { me.mu.Lock() defer me.mu.Unlock() if me.emitters == nil { me.emitters = make(map[Emitter]struct{}) } me.emitters[e] = struct{}{} } // Close closes all emitters. If any Close call errors, it returns the first // one encountered. func (me *multiEmitter) Close() error { me.mu.Lock() defer me.mu.Unlock() var err error for e := range me.emitters { if eerr := e.Close(); err == nil && eerr != nil { err = eerr } delete(me.emitters, e) } return err } // socketEmitter emits proto messages on a socket. type socketEmitter struct { socket *unet.Socket } // SocketEmitter creates a new event channel based on the given fd. // // SocketEmitter takes ownership of fd. func SocketEmitter(fd int) (Emitter, error) { s, err := unet.NewSocket(fd) if err != nil { return nil, err } return &socketEmitter{ socket: s, }, nil } // Emit implements Emitter.Emit. func (s *socketEmitter) Emit(msg proto.Message) (bool, error) { any, err := newAny(msg) if err != nil { return false, err } bufMsg, err := proto.Marshal(any) if err != nil { return false, err } // Wire format is uvarint message length followed by binary proto. p := make([]byte, binary.MaxVarintLen64) n := binary.PutUvarint(p, uint64(len(bufMsg))) p = append(p[:n], bufMsg...) for done := 0; done < len(p); { n, err := s.socket.Write(p[done:]) if err != nil { return linuxerr.Equals(linuxerr.EPIPE, err), err } done += n } return false, nil } // Close implements Emitter.Close. func (s *socketEmitter) Close() error { return s.socket.Close() } // debugEmitter wraps an emitter to emit stringified event messages. This is // useful for debugging -- when the messages are intended for humans. type debugEmitter struct { inner Emitter } // DebugEmitterFrom creates a new event channel emitter by wrapping an existing // raw emitter. func DebugEmitterFrom(inner Emitter) Emitter { return &debugEmitter{ inner: inner, } } func (d *debugEmitter) Emit(msg proto.Message) (bool, error) { text, err := prototext.Marshal(msg) if err != nil { return false, err } ev := &pb.DebugEvent{ Name: string(msg.ProtoReflect().Descriptor().FullName()), Text: string(text), } return d.inner.Emit(ev) } func (d *debugEmitter) Close() error { return d.inner.Close() }
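// Illustrative sketch (not part of the package): the receiving side of the
// wire format described above -- a uvarint length prefix followed by a
// marshaled protobuf Any. The package implements only the emitting side, so
// this decoder and the readEvent name are assumptions for the example.
package eventsketch

import (
	"bufio"
	"encoding/binary"
	"io"

	"google.golang.org/protobuf/proto"
	"google.golang.org/protobuf/types/known/anypb"
)

// readEvent reads a single length-prefixed Any message from r.
func readEvent(r *bufio.Reader) (*anypb.Any, error) {
	// The uvarint length prefix mirrors binary.PutUvarint on the emit side.
	n, err := binary.ReadUvarint(r)
	if err != nil {
		return nil, err
	}
	// A real consumer would bound n before allocating.
	buf := make([]byte, n)
	if _, err := io.ReadFull(r, buf); err != nil {
		return nil, err
	}
	av := &anypb.Any{}
	if err := proto.Unmarshal(buf, av); err != nil {
		return nil, err
	}
	return av, nil
}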
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernfs import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) // DynamicBytesFile implements kernfs.Inode and represents a read-only file // whose contents are backed by a vfs.DynamicBytesSource. If data additionally // implements vfs.WritableDynamicBytesSource, the file also supports dispatching // writes to the implementer, but note that this will not update the source data. // // Must be instantiated with NewDynamicBytesFile or initialized with Init // before first use. // // +stateify savable type DynamicBytesFile struct { InodeAttrs InodeNoStatFS InodeNoopRefCount InodeNotDirectory InodeNotSymlink locks vfs.FileLocks // data can additionally implement vfs.WritableDynamicBytesSource to support // writes. data vfs.DynamicBytesSource } var _ Inode = (*DynamicBytesFile)(nil) // Init initializes a dynamic bytes file. func (f *DynamicBytesFile) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) { if perm&^linux.PermissionsMask != 0 { panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask)) } f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm) f.data = data } // Open implements Inode.Open. func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &DynamicBytesFD{} if err := fd.Init(rp.Mount(), d, f.data, &f.locks, opts.Flags); err != nil { return nil, err } return &fd.vfsfd, nil } // SetStat implements Inode.SetStat. By default DynamicBytesFile doesn't allow // inode attributes to be changed. Override SetStat() and make it call // f.InodeAttrs to allow it. func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a // DynamicBytesFile. // // Must be initialized with Init before first use. // // +stateify savable type DynamicBytesFD struct { vfs.FileDescriptionDefaultImpl vfs.DynamicBytesFileDescriptionImpl vfs.LockFD vfsfd vfs.FileDescription inode Inode } // Init initializes a DynamicBytesFD.
func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error { fd.LockFD.Init(locks) if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return err } fd.inode = d.inode fd.SetDataSource(data) return nil } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence) } // Read implements vfs.FileDescriptionImpl.Read. func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts) } // PRead implements vfs.FileDescriptionImpl.PRead. func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts) } // Write implements vfs.FileDescriptionImpl.Write. func (fd *DynamicBytesFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.Write(ctx, src, opts) } // PWrite implements vfs.FileDescriptionImpl.PWrite. func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { return fd.DynamicBytesFileDescriptionImpl.PWrite(ctx, src, offset, opts) } // Release implements vfs.FileDescriptionImpl.Release. func (fd *DynamicBytesFD) Release(context.Context) {} // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.vfsfd.VirtualDentry().Mount().Filesystem() return fd.inode.Stat(ctx, fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error { // DynamicBytesFiles are immutable. return linuxerr.EPERM }
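// A minimal data-source sketch (editor's addition): messageData is
// hypothetical, and the sketch assumes vfs.DynamicBytesSource is the
// single-method interface Generate(ctx context.Context, buf *bytes.Buffer)
// error, with "bytes" as an extra import. Contents are regenerated on each
// open, so every reader sees the value current at open time.
//
// +stateify savable
type messageData struct {
	msg string
}

var _ vfs.DynamicBytesSource = (*messageData)(nil)

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *messageData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	_, err := fmt.Fprintf(buf, "%s\n", d.msg)
	return err
}

// A DynamicBytesFile would then be initialized with, e.g.:
//
//	f.Init(ctx, creds, devMajor, devMinor, ino, &messageData{msg: "hello"}, 0444)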
// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package ip holds IPv4/IPv6 common utilities. package ip import ( "bytes" "fmt" "io" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) type extendRequest int const ( notRequested extendRequest = iota requested extended ) type dadState struct { nonce []byte extendRequest extendRequest done *bool timer tcpip.Timer completionHandlers []stack.DADCompletionHandler } // DADProtocol is a protocol whose core state machine can be represented by DAD. type DADProtocol interface { // SendDADMessage attempts to send a DAD probe message. SendDADMessage(tcpip.Address, []byte) tcpip.Error } // DADOptions holds options for DAD. type DADOptions struct { Clock tcpip.Clock SecureRNG io.Reader NonceSize uint8 ExtendDADTransmits uint8 Protocol DADProtocol NICID tcpip.NICID } // DAD performs duplicate address detection for addresses. type DAD struct { opts DADOptions configs stack.DADConfigurations protocolMU sync.Locker addresses map[tcpip.Address]dadState } // Init initializes the DAD state. // // Must only be called once for the lifetime of d; Init will panic if it is // called twice. // // The lock will only be taken when timers fire. func (d *DAD) Init(protocolMU sync.Locker, configs stack.DADConfigurations, opts DADOptions) { if d.addresses != nil { panic("attempted to initialize DAD state twice") } if opts.NonceSize != 0 && opts.ExtendDADTransmits == 0 { panic(fmt.Sprintf("given a non-zero value for NonceSize (%d) but zero for ExtendDADTransmits", opts.NonceSize)) } configs.Validate() *d = DAD{ opts: opts, configs: configs, protocolMU: protocolMU, addresses: make(map[tcpip.Address]dadState), } } // CheckDuplicateAddressLocked performs DAD for an address, calling the // completion handler once DAD resolves.
// // If DAD is already performing for the provided address, h will be called when // the currently running process completes. // // Precondition: d.protocolMU must be locked. func (d *DAD) CheckDuplicateAddressLocked(addr tcpip.Address, h stack.DADCompletionHandler) stack.DADCheckAddressDisposition { if d.configs.DupAddrDetectTransmits == 0 { return stack.DADDisabled } ret := stack.DADAlreadyRunning s, ok := d.addresses[addr] if !ok { ret = stack.DADStarting remaining := d.configs.DupAddrDetectTransmits // Protected by d.protocolMU. done := false s = dadState{ done: &done, timer: d.opts.Clock.AfterFunc(0, func() { dadDone := remaining == 0 nonce, earlyReturn := func() ([]byte, bool) { d.protocolMU.Lock() defer d.protocolMU.Unlock() if done { return nil, true } s, ok := d.addresses[addr] if !ok { panic(fmt.Sprintf("dad: timer fired but missing state for %s on NIC(%d)", addr, d.opts.NICID)) } // As per RFC 7527 section 4 // // If any probe is looped back within RetransTimer milliseconds // after having sent DupAddrDetectTransmits NS(DAD) messages, the // interface continues with another MAX_MULTICAST_SOLICIT number of // NS(DAD) messages transmitted RetransTimer milliseconds apart. if dadDone && s.extendRequest == requested { dadDone = false remaining = d.opts.ExtendDADTransmits s.extendRequest = extended } if !dadDone && d.opts.NonceSize != 0 { if s.nonce == nil { s.nonce = make([]byte, d.opts.NonceSize) } if n, err := io.ReadFull(d.opts.SecureRNG, s.nonce); err != nil { panic(fmt.Sprintf("io.ReadFull(SecureRNG, nonce): %s", err)) } else if n != len(s.nonce) { panic(fmt.Sprintf("expected to read %d bytes from secure RNG, only read %d bytes", len(s.nonce), n)) } } d.addresses[addr] = s return s.nonce, false }() if earlyReturn { return } var err tcpip.Error if !dadDone { err = d.opts.Protocol.SendDADMessage(addr, nonce) } d.protocolMU.Lock() defer d.protocolMU.Unlock() if done { return } s, ok := d.addresses[addr] if !ok { panic(fmt.Sprintf("dad: timer fired but missing state for %s on NIC(%d)", addr, d.opts.NICID)) } if !dadDone && err == nil { remaining-- s.timer.Reset(d.configs.RetransmitTimer) return } // At this point we know that either DAD has resolved or we hit an error // sending the last DAD message. Either way, clear the DAD state. done = false s.timer.Stop() delete(d.addresses, addr) var res stack.DADResult = &stack.DADSucceeded{} if err != nil { res = &stack.DADError{Err: err} } for _, h := range s.completionHandlers { h(res) } }), } } s.completionHandlers = append(s.completionHandlers, h) d.addresses[addr] = s return ret } // ExtendIfNonceEqualLockedDisposition enumerates the possible results from // ExtendIfNonceEqualLocked. type ExtendIfNonceEqualLockedDisposition int const ( // Extended indicates that the DAD process was extended. Extended ExtendIfNonceEqualLockedDisposition = iota // AlreadyExtended indicates that the DAD process was already extended. AlreadyExtended // NoDADStateFound indicates that DAD state was not found for the address. NoDADStateFound // NonceDisabled indicates that nonce values are not sent with DAD messages. NonceDisabled // NonceNotEqual indicates that the nonce value passed and the nonce in the // last sent DAD message are not equal. NonceNotEqual ) // ExtendIfNonceEqualLocked extends the DAD process if the provided nonce is the // same as the nonce sent in the last DAD message. // // Precondition: d.protocolMU must be locked.
func (d *DAD) ExtendIfNonceEqualLocked(addr tcpip.Address, nonce []byte) ExtendIfNonceEqualLockedDisposition { s, ok := d.addresses[addr] if !ok { return NoDADStateFound } if d.opts.NonceSize == 0 { return NonceDisabled } if s.extendRequest != notRequested { return AlreadyExtended } // As per RFC 7527 section 4 // // If any probe is looped back within RetransTimer milliseconds after having // sent DupAddrDetectTransmits NS(DAD) messages, the interface continues // with another MAX_MULTICAST_SOLICIT number of NS(DAD) messages transmitted // RetransTimer milliseconds apart. // // If a DAD message has already been sent and the nonce value we observed is // the same as the nonce value we last sent, then we assume our probe was // looped back and request an extension to the DAD process. // // Note, the first DAD message is sent asynchronously so we need to make sure // that we sent a DAD message by checking if we have a nonce value set. if s.nonce != nil && bytes.Equal(s.nonce, nonce) { s.extendRequest = requested d.addresses[addr] = s return Extended } return NonceNotEqual } // StopLocked stops a currently running DAD process. // // Precondition: d.protocolMU must be locked. func (d *DAD) StopLocked(addr tcpip.Address, reason stack.DADResult) { s, ok := d.addresses[addr] if !ok { return } *s.done = true s.timer.Stop() delete(d.addresses, addr) for _, h := range s.completionHandlers { h(reason) } } // SetConfigsLocked sets the DAD configurations. // // Precondition: d.protocolMU must be locked. func (d *DAD) SetConfigsLocked(c stack.DADConfigurations) { c.Validate() d.configs = c }
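// A minimal wiring sketch (editor's addition): stubProtocol and exampleDAD
// are hypothetical, "time" is an assumed extra import, and the other types
// come from this file and the stack package. It shows the intended call
// pattern: Init once, then start DAD for an address while holding the
// protocol's lock.
type stubProtocol struct{}

// SendDADMessage implements DADProtocol. A real protocol would transmit a
// DAD probe carrying the nonce here.
func (stubProtocol) SendDADMessage(tcpip.Address, []byte) tcpip.Error { return nil }

func exampleDAD(mu sync.Locker, clock tcpip.Clock, rng io.Reader, addr tcpip.Address) {
	var d DAD
	d.Init(mu, stack.DADConfigurations{
		DupAddrDetectTransmits: 1,
		RetransmitTimer:        time.Second,
	}, DADOptions{
		Clock:     clock,
		SecureRNG: rng,
		Protocol:  stubProtocol{},
	})

	mu.Lock()
	defer mu.Unlock()
	// Returns stack.DADStarting on the first call for addr; the completion
	// handler runs once DAD resolves or is stopped.
	_ = d.CheckDuplicateAddressLocked(addr, func(stack.DADResult) {})
}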
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" ) // Shmget implements shmget(2). func Shmget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { key := shm.Key(args[0].Int()) size := uint64(args[1].SizeT()) flag := args[2].Int() private := key == linux.IPC_PRIVATE create := flag&linux.IPC_CREAT == linux.IPC_CREAT exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL mode := linux.FileMode(flag & 0777) pid := int32(t.ThreadGroup().ID()) r := t.IPCNamespace().ShmRegistry() segment, err := r.FindOrCreate(t, pid, key, size, mode, private, create, exclusive) if err != nil { return 0, nil, err } defer segment.DecRef(t) return uintptr(segment.ID), nil, nil } // findSegment retrieves a shm segment by the given id. // // findSegment returns a reference on Shm. func findSegment(t *kernel.Task, id shm.ID) (*shm.Shm, error) { r := t.IPCNamespace().ShmRegistry() segment := r.FindByID(id) if segment == nil { // No segment with provided id. return nil, linuxerr.EINVAL } return segment, nil } // Shmat implements shmat(2). func Shmat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := shm.ID(args[0].Int()) addr := args[1].Pointer() flag := args[2].Int() segment, err := findSegment(t, id) if err != nil { return 0, nil, linuxerr.EINVAL } defer segment.DecRef(t) opts, err := segment.ConfigureAttach(t, addr, shm.AttachOpts{ Execute: flag&linux.SHM_EXEC == linux.SHM_EXEC, Readonly: flag&linux.SHM_RDONLY == linux.SHM_RDONLY, Remap: flag&linux.SHM_REMAP == linux.SHM_REMAP, }) if err != nil { return 0, nil, err } addr, err = t.MemoryManager().MMap(t, opts) return uintptr(addr), nil, err } // Shmdt implements shmdt(2). func Shmdt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() err := t.MemoryManager().DetachShm(t, addr) return 0, nil, err } // Shmctl implements shmctl(2).
func Shmctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := shm.ID(args[0].Int()) cmd := args[1].Int() buf := args[2].Pointer() r := t.IPCNamespace().ShmRegistry() switch cmd { case linux.SHM_STAT: // Technically, we should be treating id as "an index into the kernel's // internal array that maintains information about all shared memory // segments on the system". Since we don't track segments in an array, // we'll just pretend the shmid is the index and do the same thing as // IPC_STAT. Linux also uses the index as the shmid. fallthrough case linux.IPC_STAT: segment, err := findSegment(t, id) if err != nil { return 0, nil, linuxerr.EINVAL } defer segment.DecRef(t) stat, err := segment.IPCStat(t) if err == nil { _, err = stat.CopyOut(t, buf) } return 0, nil, err case linux.IPC_INFO: params := r.IPCInfo() _, err := params.CopyOut(t, buf) return 0, nil, err case linux.SHM_INFO: info := r.ShmInfo() _, err := info.CopyOut(t, buf) return 0, nil, err } // Remaining commands refer to a specific segment. segment, err := findSegment(t, id) if err != nil { return 0, nil, linuxerr.EINVAL } defer segment.DecRef(t) switch cmd { case linux.IPC_SET: var ds linux.ShmidDS if _, err = ds.CopyIn(t, buf); err != nil { return 0, nil, err } err := segment.Set(t, &ds) return 0, nil, err case linux.IPC_RMID: segment.MarkDestroyed(t) return 0, nil, nil case linux.SHM_LOCK, linux.SHM_UNLOCK: // We currently do not support memory locking anywhere. // mlock(2)/munlock(2) are currently stubbed out as no-ops so do the // same here. t.Kernel().EmitUnimplementedEvent(t) return 0, nil, nil default: return 0, nil, linuxerr.EINVAL } }
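// A worked illustration (editor's addition; decodeShmgetFlags is
// hypothetical) of how Shmget above decodes its flag argument for a typical
// create-exclusive call.
func decodeShmgetFlags() {
	flag := int32(linux.IPC_CREAT | linux.IPC_EXCL | 0600)
	create := flag&linux.IPC_CREAT == linux.IPC_CREAT  // true: create the segment if absent.
	exclusive := flag&linux.IPC_EXCL == linux.IPC_EXCL // true: fail if the key already exists.
	mode := linux.FileMode(flag & 0777)                // 0600: owner read/write.
	_, _, _ = create, exclusive, mode
}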
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "fmt" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) // Route represents a route through the networking stack to a given destination. // // It is safe to call Route's methods from multiple goroutines. type Route struct { routeInfo routeInfo // localAddressNIC is the interface the address is associated with. // TODO(gvisor.dev/issue/4548): Remove this field once we can query the // address's assigned status without the NIC.
localAddressNIC *nic mu struct { sync.RWMutex // localAddressEndpoint is the local address this route is associated with. localAddressEndpoint AssignableAddressEndpoint // remoteLinkAddress is the link-layer (MAC) address of the next hop in the // route. remoteLinkAddress tcpip.LinkAddress } // outgoingNIC is the interface this route uses to write packets. outgoingNIC *nic // linkRes is set if link address resolution is enabled for this protocol on // the route's NIC. linkRes *linkResolver } type routeInfo struct { RemoteAddress tcpip.Address LocalAddress tcpip.Address LocalLinkAddress tcpip.LinkAddress NextHop tcpip.Address NetProto tcpip.NetworkProtocolNumber Loop PacketLooping } // RemoteAddress returns the route's destination. func (r *Route) RemoteAddress() tcpip.Address { return r.routeInfo.RemoteAddress } // LocalAddress returns the route's local address. func (r *Route) LocalAddress() tcpip.Address { return r.routeInfo.LocalAddress } // LocalLinkAddress returns the route's local link-layer address. func (r *Route) LocalLinkAddress() tcpip.LinkAddress { return r.routeInfo.LocalLinkAddress } // NextHop returns the next node in the route's path to the destination. func (r *Route) NextHop() tcpip.Address { return r.routeInfo.NextHop } // NetProto returns the route's network-layer protocol number. func (r *Route) NetProto() tcpip.NetworkProtocolNumber { return r.routeInfo.NetProto } // Loop returns the route's required packet looping. func (r *Route) Loop() PacketLooping { return r.routeInfo.Loop } // RouteInfo contains all of Route's exported fields. type RouteInfo struct { routeInfo // RemoteLinkAddress is the link-layer (MAC) address of the next hop in the // route. RemoteLinkAddress tcpip.LinkAddress } // Fields returns a RouteInfo with all of the known values for the route's // fields. // // If any fields are unknown (e.g. remote link address when it is waiting for // link address resolution), they will be unset. func (r *Route) Fields() RouteInfo { r.mu.RLock() defer r.mu.RUnlock() return r.fieldsLocked() } func (r *Route) fieldsLocked() RouteInfo { return RouteInfo{ routeInfo: r.routeInfo, RemoteLinkAddress: r.mu.remoteLinkAddress, } } // constructAndValidateRoute validates and initializes a route. It takes // ownership of the provided local address. // // Returns an empty route if validation fails. func constructAndValidateRoute(netProto tcpip.NetworkProtocolNumber, addressEndpoint AssignableAddressEndpoint, localAddressNIC, outgoingNIC *nic, gateway, localAddr, remoteAddr tcpip.Address, handleLocal, multicastLoop bool) *Route { if len(localAddr) == 0 { localAddr = addressEndpoint.AddressWithPrefix().Address } if localAddressNIC != outgoingNIC && header.IsV6LinkLocalUnicastAddress(localAddr) { addressEndpoint.DecRef() return nil } // If no remote address is provided, use the local address. if len(remoteAddr) == 0 { remoteAddr = localAddr } r := makeRoute( netProto, gateway, localAddr, remoteAddr, outgoingNIC, localAddressNIC, addressEndpoint, handleLocal, multicastLoop, ) return r } // makeRoute initializes a new route. It takes ownership of the provided // AssignableAddressEndpoint. 
func makeRoute(netProto tcpip.NetworkProtocolNumber, gateway, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *nic, localAddressEndpoint AssignableAddressEndpoint, handleLocal, multicastLoop bool) *Route { if localAddressNIC.stack != outgoingNIC.stack { panic("cannot create a route with NICs from different stacks") } if len(localAddr) == 0 { localAddr = localAddressEndpoint.AddressWithPrefix().Address } loop := PacketOut // TODO(gvisor.dev/issue/4689): Loopback interface loops back packets at the // link endpoint level. We can remove this check once loopback interfaces // loop back packets at the network layer. if !outgoingNIC.IsLoopback() { if handleLocal && localAddr != "" && remoteAddr == localAddr { loop = PacketLoop } else if multicastLoop && (header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)) { loop |= PacketLoop } else if remoteAddr == header.IPv4Broadcast { loop |= PacketLoop } else if subnet := localAddressEndpoint.AddressWithPrefix().Subnet(); subnet.IsBroadcast(remoteAddr) { loop |= PacketLoop } } r := makeRouteInner(netProto, localAddr, remoteAddr, outgoingNIC, localAddressNIC, localAddressEndpoint, loop) if r.Loop()&PacketOut == 0 { // Packet will not leave the stack, no need for a gateway or a remote link // address. return r } if r.outgoingNIC.LinkEndpoint.Capabilities()&CapabilityResolutionRequired != 0 { if linkRes, ok := r.outgoingNIC.linkAddrResolvers[r.NetProto()]; ok { r.linkRes = linkRes } } if len(gateway) > 0 { r.routeInfo.NextHop = gateway return r } if r.linkRes == nil { return r } if linkAddr, ok := r.linkRes.resolver.ResolveStaticAddress(r.RemoteAddress()); ok { r.ResolveWith(linkAddr) return r } if subnet := localAddressEndpoint.Subnet(); subnet.IsBroadcast(remoteAddr) { r.ResolveWith(header.EthernetBroadcastAddress) return r } if r.RemoteAddress() == r.LocalAddress() { // Local link address is already known. r.ResolveWith(r.LocalLinkAddress()) } return r } func makeRouteInner(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *nic, localAddressEndpoint AssignableAddressEndpoint, loop PacketLooping) *Route { r := &Route{ routeInfo: routeInfo{ NetProto: netProto, LocalAddress: localAddr, LocalLinkAddress: outgoingNIC.LinkEndpoint.LinkAddress(), RemoteAddress: remoteAddr, Loop: loop, }, localAddressNIC: localAddressNIC, outgoingNIC: outgoingNIC, } r.mu.Lock() r.mu.localAddressEndpoint = localAddressEndpoint r.mu.Unlock() return r } // makeLocalRoute initializes a new local route. It takes ownership of the // provided AssignableAddressEndpoint. // // A local route is a route to a destination that is local to the stack. func makeLocalRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, outgoingNIC, localAddressNIC *nic, localAddressEndpoint AssignableAddressEndpoint) *Route { loop := PacketLoop // TODO(gvisor.dev/issue/4689): Loopback interface loops back packets at the // link endpoint level. We can remove this check once loopback interfaces // loop back packets at the network layer. if outgoingNIC.IsLoopback() { loop = PacketOut } return makeRouteInner(netProto, localAddr, remoteAddr, outgoingNIC, localAddressNIC, localAddressEndpoint, loop) } // RemoteLinkAddress returns the link-layer (MAC) address of the next hop in // the route. func (r *Route) RemoteLinkAddress() tcpip.LinkAddress { r.mu.RLock() defer r.mu.RUnlock() return r.mu.remoteLinkAddress } // NICID returns the id of the NIC from which this route originates.
func (r *Route) NICID() tcpip.NICID { return r.outgoingNIC.ID() } // MaxHeaderLength forwards the call to the network endpoint's implementation. func (r *Route) MaxHeaderLength() uint16 { return r.outgoingNIC.getNetworkEndpoint(r.NetProto()).MaxHeaderLength() } // Stats returns a mutable copy of current stats. func (r *Route) Stats() tcpip.Stats { return r.outgoingNIC.stack.Stats() } // PseudoHeaderChecksum forwards the call to the network endpoint's // implementation. func (r *Route) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, totalLen uint16) uint16 { return header.PseudoHeaderChecksum(protocol, r.LocalAddress(), r.RemoteAddress(), totalLen) } // RequiresTXTransportChecksum returns false if the route does not require // transport checksums to be populated. func (r *Route) RequiresTXTransportChecksum() bool { if r.local() { return false } return r.outgoingNIC.LinkEndpoint.Capabilities()&CapabilityTXChecksumOffload == 0 } // HasSoftwareGSOCapability returns true if the route supports software GSO. func (r *Route) HasSoftwareGSOCapability() bool { if gso, ok := r.outgoingNIC.LinkEndpoint.(GSOEndpoint); ok { return gso.SupportedGSO() == SWGSOSupported } return false } // HasHardwareGSOCapability returns true if the route supports hardware GSO. func (r *Route) HasHardwareGSOCapability() bool { if gso, ok := r.outgoingNIC.LinkEndpoint.(GSOEndpoint); ok { return gso.SupportedGSO() == HWGSOSupported } return false } // HasSaveRestoreCapability returns true if the route supports save/restore. func (r *Route) HasSaveRestoreCapability() bool { return r.outgoingNIC.LinkEndpoint.Capabilities()&CapabilitySaveRestore != 0 } // HasDisconnectOkCapability returns true if the route supports disconnecting. func (r *Route) HasDisconnectOkCapability() bool { return r.outgoingNIC.LinkEndpoint.Capabilities()&CapabilityDisconnectOk != 0 } // GSOMaxSize returns the maximum GSO packet size. func (r *Route) GSOMaxSize() uint32 { if gso, ok := r.outgoingNIC.LinkEndpoint.(GSOEndpoint); ok { return gso.GSOMaxSize() } return 0 } // ResolveWith immediately resolves a route with the specified remote link // address. func (r *Route) ResolveWith(addr tcpip.LinkAddress) { r.mu.Lock() defer r.mu.Unlock() r.mu.remoteLinkAddress = addr } // ResolvedFieldsResult is the result of a route resolution attempt. type ResolvedFieldsResult struct { RouteInfo RouteInfo Err tcpip.Error } // ResolvedFields attempts to resolve the remote link address if it is not // known. // // If a callback is provided, it will be called before ResolvedFields returns // when address resolution is not required. If address resolution is required, // the callback will be called once address resolution is complete, regardless // of success or failure. // // Note, the route will not cache the remote link address when address // resolution completes. func (r *Route) ResolvedFields(afterResolve func(ResolvedFieldsResult)) tcpip.Error { _, _, err := r.resolvedFields(afterResolve) return err } // resolvedFields is like ResolvedFields but also returns a notification channel // when address resolution is required. This channel will become readable once // address resolution is complete. // // The route's fields will also be returned, regardless of whether address // resolution is required or not.
func (r *Route) resolvedFields(afterResolve func(ResolvedFieldsResult)) (RouteInfo, <-chan struct{}, tcpip.Error) { r.mu.RLock() fields := r.fieldsLocked() resolutionRequired := r.isResolutionRequiredRLocked() r.mu.RUnlock() if !resolutionRequired { if afterResolve != nil { afterResolve(ResolvedFieldsResult{RouteInfo: fields, Err: nil}) } return fields, nil, nil } // If specified, the local address used for link address resolution must be an // address on the outgoing interface. var linkAddressResolutionRequestLocalAddr tcpip.Address if r.localAddressNIC == r.outgoingNIC { linkAddressResolutionRequestLocalAddr = r.LocalAddress() } afterResolveFields := fields linkAddr, ch, err := r.linkRes.getNeighborLinkAddress(r.nextHop(), linkAddressResolutionRequestLocalAddr, func(r LinkResolutionResult) { if afterResolve != nil { if r.Err == nil { afterResolveFields.RemoteLinkAddress = r.LinkAddress } afterResolve(ResolvedFieldsResult{RouteInfo: afterResolveFields, Err: r.Err}) } }) if err == nil { fields.RemoteLinkAddress = linkAddr } return fields, ch, err } func (r *Route) nextHop() tcpip.Address { if len(r.NextHop()) == 0 { return r.RemoteAddress() } return r.NextHop() } // local returns true if the route is a local route. func (r *Route) local() bool { return r.Loop() == PacketLoop || r.outgoingNIC.IsLoopback() } // IsResolutionRequired returns true if Resolve() must be called to resolve // the link address before the route can be written to. // // The NICs the route is associated with must not be locked. func (r *Route) IsResolutionRequired() bool { r.mu.RLock() defer r.mu.RUnlock() return r.isResolutionRequiredRLocked() } func (r *Route) isResolutionRequiredRLocked() bool { return len(r.mu.remoteLinkAddress) == 0 && r.linkRes != nil && r.isValidForOutgoingRLocked() && !r.local() } func (r *Route) isValidForOutgoing() bool { r.mu.RLock() defer r.mu.RUnlock() return r.isValidForOutgoingRLocked() } func (r *Route) isValidForOutgoingRLocked() bool { if !r.outgoingNIC.Enabled() { return false } localAddressEndpoint := r.mu.localAddressEndpoint if localAddressEndpoint == nil || !r.localAddressNIC.isValidForOutgoing(localAddressEndpoint) { return false } // If the source NIC and outgoing NIC are different, make sure the stack has // forwarding enabled, or the packet will be handled locally. if r.outgoingNIC != r.localAddressNIC && !isNICForwarding(r.localAddressNIC, r.NetProto()) && (!r.outgoingNIC.stack.handleLocal || !r.outgoingNIC.hasAddress(r.NetProto(), r.RemoteAddress())) { return false } return true } // WritePacket writes the packet through the given route. func (r *Route) WritePacket(params NetworkHeaderParams, pkt *PacketBuffer) tcpip.Error { if !r.isValidForOutgoing() { return &tcpip.ErrInvalidEndpointState{} } return r.outgoingNIC.getNetworkEndpoint(r.NetProto()).WritePacket(r, params, pkt) } // WritePackets writes a list of n packets through the given route and returns // the number of packets written. func (r *Route) WritePackets(pkts PacketBufferList, params NetworkHeaderParams) (int, tcpip.Error) { if !r.isValidForOutgoing() { return 0, &tcpip.ErrInvalidEndpointState{} } return r.outgoingNIC.getNetworkEndpoint(r.NetProto()).WritePackets(r, pkts, params) } // WriteHeaderIncludedPacket writes a packet already containing a network // header through the given route. 
func (r *Route) WriteHeaderIncludedPacket(pkt *PacketBuffer) tcpip.Error { if !r.isValidForOutgoing() { return &tcpip.ErrInvalidEndpointState{} } return r.outgoingNIC.getNetworkEndpoint(r.NetProto()).WriteHeaderIncludedPacket(r, pkt) } // DefaultTTL returns the default TTL of the underlying network endpoint. func (r *Route) DefaultTTL() uint8 { return r.outgoingNIC.getNetworkEndpoint(r.NetProto()).DefaultTTL() } // MTU returns the MTU of the underlying network endpoint. func (r *Route) MTU() uint32 { return r.outgoingNIC.getNetworkEndpoint(r.NetProto()).MTU() } // Release decrements the reference counter of the resources associated with the // route. func (r *Route) Release() { r.mu.Lock() defer r.mu.Unlock() if ep := r.mu.localAddressEndpoint; ep != nil { ep.DecRef() } } // Acquire increments the reference counter of the resources associated with the // route. func (r *Route) Acquire() { r.mu.RLock() defer r.mu.RUnlock() r.acquireLocked() } func (r *Route) acquireLocked() { if ep := r.mu.localAddressEndpoint; ep != nil { if !ep.IncRef() { panic(fmt.Sprintf("failed to increment reference count for local address endpoint = %s", r.LocalAddress())) } } } // Stack returns the instance of the Stack that owns this route. func (r *Route) Stack() *Stack { return r.outgoingNIC.stack } func (r *Route) isV4Broadcast(addr tcpip.Address) bool { if addr == header.IPv4Broadcast { return true } r.mu.RLock() localAddressEndpoint := r.mu.localAddressEndpoint r.mu.RUnlock() if localAddressEndpoint == nil { return false } subnet := localAddressEndpoint.Subnet() return subnet.IsBroadcast(addr) } // IsOutboundBroadcast returns true if the route is for an outbound broadcast // packet. func (r *Route) IsOutboundBroadcast() bool { // Only IPv4 has a notion of broadcast. return r.isV4Broadcast(r.RemoteAddress()) } // ConfirmReachable informs the network/link layer that the neighbour used for // the route is reachable. // // "Reachable" is defined as having full-duplex communication between the // local and remote ends of the route. func (r *Route) ConfirmReachable() { if r.linkRes != nil { r.linkRes.confirmReachable(r.nextHop()) } }
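// A minimal sketch of the intended resolution flow (editor's addition;
// writeWhenResolved is hypothetical, and route/params/packet construction is
// assumed to happen elsewhere): ask for the route's resolved fields and write
// from the completion callback, which runs synchronously when no resolution
// is needed and asynchronously once resolution completes otherwise.
func writeWhenResolved(r *Route, params NetworkHeaderParams, pkt *PacketBuffer) tcpip.Error {
	return r.ResolvedFields(func(res ResolvedFieldsResult) {
		if res.Err != nil {
			// Link address resolution failed; give up on this packet.
			return
		}
		// res.RouteInfo now carries the remote link address.
		_ = r.WritePacket(params, pkt)
	})
}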
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "bufio" "bytes" "encoding/binary" "errors" "fmt" "io" "math" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" ) // IPv6ExtensionHeaderIdentifier is an IPv6 extension header identifier. type IPv6ExtensionHeaderIdentifier uint8 const ( // IPv6HopByHopOptionsExtHdrIdentifier is the header identifier of a Hop by // Hop Options extension header, as per RFC 8200 section 4.3. IPv6HopByHopOptionsExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 0 // IPv6RoutingExtHdrIdentifier is the header identifier of a Routing extension // header, as per RFC 8200 section 4.4. IPv6RoutingExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 43 // IPv6FragmentExtHdrIdentifier is the header identifier of a Fragment // extension header, as per RFC 8200 section 4.5. IPv6FragmentExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 44 // IPv6DestinationOptionsExtHdrIdentifier is the header identifier of a // Destination Options extension header, as per RFC 8200 section 4.6. IPv6DestinationOptionsExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 60 // IPv6NoNextHeaderIdentifier is the header identifier used to signify the end // of an IPv6 payload, as per RFC 8200 section 4.7. IPv6NoNextHeaderIdentifier IPv6ExtensionHeaderIdentifier = 59 // IPv6UnknownExtHdrIdentifier is reserved by IANA. // https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml#extension-header // "254 Use for experimentation and testing [RFC3692][RFC4727]" IPv6UnknownExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 254 ) const ( // ipv6UnknownExtHdrOptionActionMask is the mask of the action to take when // a node encounters an unrecognized option. ipv6UnknownExtHdrOptionActionMask = 192 // ipv6UnknownExtHdrOptionActionShift is the number of least significant bits // to discard from the action value for an unrecognized option identifier. ipv6UnknownExtHdrOptionActionShift = 6 // ipv6RoutingExtHdrSegmentsLeftIdx is the index to the Segments Left field // within an IPv6RoutingExtHdr. ipv6RoutingExtHdrSegmentsLeftIdx = 1 // IPv6FragmentExtHdrLength is the length of an IPv6 Fragment extension // header, in bytes. IPv6FragmentExtHdrLength = 8 // ipv6FragmentExtHdrFragmentOffsetOffset is the offset to the start of the // Fragment Offset field within an IPv6FragmentExtHdr. ipv6FragmentExtHdrFragmentOffsetOffset = 0 // ipv6FragmentExtHdrFragmentOffsetShift is the bit offset of the Fragment // Offset field within an IPv6FragmentExtHdr. ipv6FragmentExtHdrFragmentOffsetShift = 3 // ipv6FragmentExtHdrFlagsIdx is the index to the flags field within an // IPv6FragmentExtHdr. ipv6FragmentExtHdrFlagsIdx = 1 // ipv6FragmentExtHdrMFlagMask is the mask of the More (M) flag within the // flags field of an IPv6FragmentExtHdr.
ipv6FragmentExtHdrMFlagMask = 1 // ipv6FragmentExtHdrIdentificationOffset is the offset to the Identification // field within an IPv6FragmentExtHdr. ipv6FragmentExtHdrIdentificationOffset = 2 // ipv6ExtHdrLenBytesPerUnit is the unit size of an extension header's length // field. That is, given a Length field of 2, the extension header expects // 16 bytes following the first 8 bytes (see ipv6ExtHdrLenBytesExcluded for // details about the first 8 bytes' exclusion from the Length field). ipv6ExtHdrLenBytesPerUnit = 8 // ipv6ExtHdrLenBytesExcluded is the number of bytes excluded from an // extension header's Length field following the Length field. // // The Length field excludes the first 8 bytes, but the Next Header and Length // field take up the first 2 of the 8 bytes so we expect (at minimum) 6 bytes // after the Length field. // // This ensures that every extension header is at least 8 bytes. ipv6ExtHdrLenBytesExcluded = 6 // IPv6FragmentExtHdrFragmentOffsetBytesPerUnit is the unit size of a Fragment // extension header's Fragment Offset field. That is, given a Fragment Offset // of 2, the extension header is indicating that the fragment's payload // starts at the 16th byte in the reassembled packet. IPv6FragmentExtHdrFragmentOffsetBytesPerUnit = 8 ) // padIPv6OptionsLength returns the total length for IPv6 options of the given // length, considering the 8-octet alignment as stated in RFC 8200 Section 4.2. func padIPv6OptionsLength(length int) int { return (length + ipv6ExtHdrLenBytesPerUnit - 1) & ^(ipv6ExtHdrLenBytesPerUnit - 1) } // padIPv6Option fills b with the appropriate padding options depending on its // length. func padIPv6Option(b []byte) { switch len(b) { case 0: // No padding needed. case 1: // Pad with Pad1. b[ipv6ExtHdrOptionTypeOffset] = uint8(ipv6Pad1ExtHdrOptionIdentifier) default: // Pad with PadN. s := b[ipv6ExtHdrOptionPayloadOffset:] for i := range s { s[i] = 0 } b[ipv6ExtHdrOptionTypeOffset] = uint8(ipv6PadNExtHdrOptionIdentifier) b[ipv6ExtHdrOptionLengthOffset] = uint8(len(s)) } } // ipv6OptionsAlignmentPadding returns the number of padding bytes needed to // serialize an option at headerOffset with alignment requirements // [align]n + alignOffset. func ipv6OptionsAlignmentPadding(headerOffset int, align int, alignOffset int) int { padLen := headerOffset - alignOffset return ((padLen + align - 1) & ^(align - 1)) - padLen } // IPv6PayloadHeader is implemented by the various headers that can be found // in an IPv6 payload. // // These headers include IPv6 extension headers or upper layer data. type IPv6PayloadHeader interface { isIPv6PayloadHeader() } // IPv6RawPayloadHeader is the remainder of an IPv6 payload after an iterator // encounters a Next Header field it does not recognize as an IPv6 extension // header. type IPv6RawPayloadHeader struct { Identifier IPv6ExtensionHeaderIdentifier Buf buffer.VectorisedView } // isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader. func (IPv6RawPayloadHeader) isIPv6PayloadHeader() {} // ipv6OptionsExtHdr is an IPv6 extension header that holds options. type ipv6OptionsExtHdr []byte // Iter returns an iterator over the IPv6 extension header options held in b. func (b ipv6OptionsExtHdr) Iter() IPv6OptionsExtHdrOptionsIterator { it := IPv6OptionsExtHdrOptionsIterator{} it.reader.Reset(b) return it } // IPv6OptionsExtHdrOptionsIterator is an iterator over IPv6 extension header // options. // // Note, between when an IPv6OptionsExtHdrOptionsIterator is obtained and last // used, no changes to the underlying buffer may happen.
Doing so may cause // undefined and unexpected behaviour. It is fine to obtain an // IPv6OptionsExtHdrOptionsIterator, iterate over the first few options then // modify the backing payload so long as the IPv6OptionsExtHdrOptionsIterator // obtained before modification is no longer used. type IPv6OptionsExtHdrOptionsIterator struct { reader bytes.Reader // optionOffset is the number of bytes from the first byte of the // options field to the beginning of the current option. optionOffset uint32 // nextOptionOffset is the offset of the next option. nextOptionOffset uint32 } // OptionOffset returns the offset of the current option from the first byte // of the options field of the current Extension Header. func (i *IPv6OptionsExtHdrOptionsIterator) OptionOffset() uint32 { return i.optionOffset } // IPv6OptionUnknownAction is the action that must be taken if the processing // IPv6 node does not recognize the option, as outlined in RFC 8200 section 4.2. type IPv6OptionUnknownAction int const ( // IPv6OptionUnknownActionSkip indicates that the unrecognized option must // be skipped and the node should continue processing the header. IPv6OptionUnknownActionSkip IPv6OptionUnknownAction = 0 // IPv6OptionUnknownActionDiscard indicates that the packet must be silently // discarded. IPv6OptionUnknownActionDiscard IPv6OptionUnknownAction = 1 // IPv6OptionUnknownActionDiscardSendICMP indicates that the packet must be // discarded and the node must send an ICMP Parameter Problem, Code 2, message // to the packet's source, regardless of whether or not the packet's // Destination was a multicast address. IPv6OptionUnknownActionDiscardSendICMP IPv6OptionUnknownAction = 2 // IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest indicates that the // packet must be discarded and the node must send an ICMP Parameter Problem, // Code 2, message to the packet's source only if the packet's Destination was // not a multicast address. IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest IPv6OptionUnknownAction = 3 ) // IPv6ExtHdrOption is implemented by the various IPv6 extension header options. type IPv6ExtHdrOption interface { // UnknownAction returns the action to take in response to an unrecognized // option. UnknownAction() IPv6OptionUnknownAction // isIPv6ExtHdrOption is used to "lock" this interface so it is not // implemented by other packages. isIPv6ExtHdrOption() } // IPv6ExtHdrOptionIdentifier is an IPv6 extension header option identifier. type IPv6ExtHdrOptionIdentifier uint8 const ( // ipv6Pad1ExtHdrOptionIdentifier is the identifier for a padding option that // provides 1 byte padding, as outlined in RFC 8200 section 4.2. ipv6Pad1ExtHdrOptionIdentifier IPv6ExtHdrOptionIdentifier = 0 // ipv6PadNExtHdrOptionIdentifier is the identifier for a padding option that // provides variable length byte padding, as outlined in RFC 8200 section 4.2. ipv6PadNExtHdrOptionIdentifier IPv6ExtHdrOptionIdentifier = 1 // ipv6RouterAlertHopByHopOptionIdentifier is the identifier for the Router // Alert Hop by Hop option as defined in RFC 2711 section 2.1. ipv6RouterAlertHopByHopOptionIdentifier IPv6ExtHdrOptionIdentifier = 5 // ipv6ExtHdrOptionTypeOffset is the option type offset in an extension header // option as defined in RFC 8200 section 4.2. ipv6ExtHdrOptionTypeOffset = 0 // ipv6ExtHdrOptionLengthOffset is the option length offset in an extension // header option as defined in RFC 8200 section 4.2.
ipv6ExtHdrOptionLengthOffset = 1 // ipv6ExtHdrOptionPayloadOffset is the option payload offset in an extension // header option as defined in RFC 8200 section 4.2. ipv6ExtHdrOptionPayloadOffset = 2 ) // ipv6UnknownActionFromIdentifier maps an extension header option's // identifier's high bits to the action to take when the identifier is unknown. func ipv6UnknownActionFromIdentifier(id IPv6ExtHdrOptionIdentifier) IPv6OptionUnknownAction { return IPv6OptionUnknownAction((id & ipv6UnknownExtHdrOptionActionMask) >> ipv6UnknownExtHdrOptionActionShift) } // ErrMalformedIPv6ExtHdrOption indicates that an IPv6 extension header option // is malformed. var ErrMalformedIPv6ExtHdrOption = errors.New("malformed IPv6 extension header option") // IPv6UnknownExtHdrOption holds the identifier and data for an IPv6 extension // header option that is unknown by the parsing utilities. type IPv6UnknownExtHdrOption struct { Identifier IPv6ExtHdrOptionIdentifier Data []byte } // UnknownAction implements IPv6ExtHdrOption.UnknownAction. func (o *IPv6UnknownExtHdrOption) UnknownAction() IPv6OptionUnknownAction { return ipv6UnknownActionFromIdentifier(o.Identifier) } // isIPv6ExtHdrOption implements IPv6ExtHdrOption.isIPv6ExtHdrOption. func (*IPv6UnknownExtHdrOption) isIPv6ExtHdrOption() {} // Next returns the next option in the options data. // // If the next item is not a known extension header option, // IPv6UnknownExtHdrOption will be returned with the option identifier and data. // // The return is of the format (option, done, error). done will be true when // Next is unable to return anything because the iterator has reached the end of // the options data, or an error occurred. func (i *IPv6OptionsExtHdrOptionsIterator) Next() (IPv6ExtHdrOption, bool, error) { for { i.optionOffset = i.nextOptionOffset temp, err := i.reader.ReadByte() if err != nil { // If we can't read the first byte of a new option, then we know the // options buffer has been exhausted and we are done iterating. return nil, true, nil } id := IPv6ExtHdrOptionIdentifier(temp) // If the option identifier indicates the option is a Pad1 option, then we // know the option does not have Length and Data fields. End processing of // the Pad1 option and continue processing the buffer as a new option. if id == ipv6Pad1ExtHdrOptionIdentifier { i.nextOptionOffset = i.optionOffset + 1 continue } length, err := i.reader.ReadByte() if err != nil { if err != io.EOF { // ReadByte should only ever return nil or io.EOF. panic(fmt.Sprintf("unexpected error when reading the option's Length field for option with id = %d: %s", id, err)) } // We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected once // we start parsing an option; we expect the reader to contain enough // bytes for the whole option. return nil, true, fmt.Errorf("error when reading the option's Length field for option with id = %d: %w", id, io.ErrUnexpectedEOF) } // Do we have enough bytes in the reader for the next option? if n := i.reader.Len(); n < int(length) { // Reset the reader to effectively consume the remaining buffer. i.reader.Reset(nil) // We return the same error as if we failed to read a non-padding option // so consumers of this iterator don't need to differentiate between // padding and non-padding options.
return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, io.ErrUnexpectedEOF) } i.nextOptionOffset = i.optionOffset + uint32(length) + 1 /* option ID */ + 1 /* length byte */ switch id { case ipv6PadNExtHdrOptionIdentifier: // Special-case the variable length padding option to avoid a copy. if _, err := i.reader.Seek(int64(length), io.SeekCurrent); err != nil { panic(fmt.Sprintf("error when skipping PadN (N = %d) option's data bytes: %s", length, err)) } continue case ipv6RouterAlertHopByHopOptionIdentifier: var routerAlertValue [ipv6RouterAlertPayloadLength]byte if n, err := io.ReadFull(&i.reader, routerAlertValue[:]); err != nil { switch err { case io.EOF, io.ErrUnexpectedEOF: return nil, true, fmt.Errorf("got invalid length (%d) for router alert option (want = %d): %w", length, ipv6RouterAlertPayloadLength, ErrMalformedIPv6ExtHdrOption) default: return nil, true, fmt.Errorf("read %d out of %d option data bytes for router alert option: %w", n, ipv6RouterAlertPayloadLength, err) } } else if n != int(length) { return nil, true, fmt.Errorf("got invalid length (%d) for router alert option (want = %d): %w", length, ipv6RouterAlertPayloadLength, ErrMalformedIPv6ExtHdrOption) } return &IPv6RouterAlertOption{Value: IPv6RouterAlertValue(binary.BigEndian.Uint16(routerAlertValue[:]))}, false, nil default: bytes := make([]byte, length) if n, err := io.ReadFull(&i.reader, bytes); err != nil { // io.ReadFull may return io.EOF if i.reader has been exhausted. We use // io.ErrUnexpectedEOF instead as the io.EOF is unexpected given the // Length field found in the option. if err == io.EOF { err = io.ErrUnexpectedEOF } return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, err) } return &IPv6UnknownExtHdrOption{Identifier: id, Data: bytes}, false, nil } } } // IPv6HopByHopOptionsExtHdr is a buffer holding the Hop By Hop Options // extension header. type IPv6HopByHopOptionsExtHdr struct { ipv6OptionsExtHdr } // isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader. func (IPv6HopByHopOptionsExtHdr) isIPv6PayloadHeader() {} // IPv6DestinationOptionsExtHdr is a buffer holding the Destination Options // extension header. type IPv6DestinationOptionsExtHdr struct { ipv6OptionsExtHdr } // isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader. func (IPv6DestinationOptionsExtHdr) isIPv6PayloadHeader() {} // IPv6RoutingExtHdr is a buffer holding the Routing extension header specific // data as outlined in RFC 8200 section 4.4. type IPv6RoutingExtHdr []byte // isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader. func (IPv6RoutingExtHdr) isIPv6PayloadHeader() {} // SegmentsLeft returns the Segments Left field. func (b IPv6RoutingExtHdr) SegmentsLeft() uint8 { return b[ipv6RoutingExtHdrSegmentsLeftIdx] } // IPv6FragmentExtHdr is a buffer holding the Fragment extension header specific // data as outlined in RFC 8200 section 4.5. // // Note, the buffer does not include the Next Header and Reserved fields. type IPv6FragmentExtHdr [6]byte // isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader. func (IPv6FragmentExtHdr) isIPv6PayloadHeader() {} // FragmentOffset returns the Fragment Offset field. // // This value indicates where the buffer following the Fragment extension header // starts in the target (reassembled) packet. 
func (b IPv6FragmentExtHdr) FragmentOffset() uint16 { return binary.BigEndian.Uint16(b[ipv6FragmentExtHdrFragmentOffsetOffset:]) >> ipv6FragmentExtHdrFragmentOffsetShift } // More returns the More (M) flag. // // This indicates whether any fragments are expected to follow b. func (b IPv6FragmentExtHdr) More() bool { return b[ipv6FragmentExtHdrFlagsIdx]&ipv6FragmentExtHdrMFlagMask != 0 } // ID returns the Identification field. // // This value is used to uniquely identify the packet between a // source and destination. func (b IPv6FragmentExtHdr) ID() uint32 { return binary.BigEndian.Uint32(b[ipv6FragmentExtHdrIdentificationOffset:]) } // IsAtomic returns whether the fragment header indicates an atomic fragment. An // atomic fragment is a fragment that contains all the data required to // reassemble a full packet. func (b IPv6FragmentExtHdr) IsAtomic() bool { return !b.More() && b.FragmentOffset() == 0 } // IPv6PayloadIterator is an iterator over the contents of an IPv6 payload. // // The IPv6 payload may contain IPv6 extension headers before any upper layer // data. // // Note, between when an IPv6PayloadIterator is obtained and last used, no // changes to the payload may happen. Doing so may cause undefined and // unexpected behaviour. It is fine to obtain an IPv6PayloadIterator, iterate // over the first few headers then modify the backing payload so long as the // IPv6PayloadIterator obtained before modification is no longer used. type IPv6PayloadIterator struct { // The identifier of the next header to parse. nextHdrIdentifier IPv6ExtensionHeaderIdentifier // reader is an io.Reader over payload. reader bufio.Reader payload buffer.VectorisedView // Indicates to the iterator that it should return the remaining payload as a // raw payload on the next call to Next. forceRaw bool // headerOffset is the offset of the beginning of the current extension // header starting from the beginning of the fixed header. headerOffset uint32 // parseOffset is the byte offset into the current extension header of the // field we are currently examining. It can be added to the header offset // if the absolute offset within the packet is required. parseOffset uint32 // nextOffset is the offset of the next header. nextOffset uint32 } // HeaderOffset returns the offset to the start of the extension // header most recently processed. func (i IPv6PayloadIterator) HeaderOffset() uint32 { return i.headerOffset } // ParseOffset returns the number of bytes successfully parsed. func (i IPv6PayloadIterator) ParseOffset() uint32 { return i.headerOffset + i.parseOffset } // MakeIPv6PayloadIterator returns an iterator over the IPv6 payload containing // extension headers, or a raw payload if the payload cannot be parsed. func MakeIPv6PayloadIterator(nextHdrIdentifier IPv6ExtensionHeaderIdentifier, payload buffer.VectorisedView) IPv6PayloadIterator { readers := payload.Readers() readerPs := make([]io.Reader, 0, len(readers)) for i := range readers { readerPs = append(readerPs, &readers[i]) } return IPv6PayloadIterator{ nextHdrIdentifier: nextHdrIdentifier, payload: payload.Clone(nil), // We need a buffer of size 1 for calls to bufio.Reader.ReadByte. reader: *bufio.NewReaderSize(io.MultiReader(readerPs...), 1), nextOffset: IPv6FixedHeaderSize, } } // AsRawHeader returns the remaining payload of i as a raw header and // optionally consumes the iterator. // // If consume is true, calls to Next after calling AsRawHeader on i will // indicate that the iterator is done.
func (i *IPv6PayloadIterator) AsRawHeader(consume bool) IPv6RawPayloadHeader { identifier := i.nextHdrIdentifier var buf buffer.VectorisedView if consume { // Since we consume the iterator, we return the payload as is. buf = i.payload // Mark i as done, but keep track of where we were for error reporting. *i = IPv6PayloadIterator{ nextHdrIdentifier: IPv6NoNextHeaderIdentifier, headerOffset: i.headerOffset, nextOffset: i.nextOffset, } } else { buf = i.payload.Clone(nil) } return IPv6RawPayloadHeader{Identifier: identifier, Buf: buf} } // Next returns the next item in the payload. // // If the next item is not a known IPv6 extension header, IPv6RawPayloadHeader // will be returned with the remaining bytes and next header identifier. // // The return is of the format (header, done, error). done will be true when // Next is unable to return anything because the iterator has reached the end of // the payload, or an error occurred. func (i *IPv6PayloadIterator) Next() (IPv6PayloadHeader, bool, error) { i.headerOffset = i.nextOffset i.parseOffset = 0 // We could be forced to return i as a raw header when the previous header was // a fragment extension header as the data following the fragment extension // header may not be complete. if i.forceRaw { return i.AsRawHeader(true /* consume */), false, nil } // Is the header we are parsing a known extension header? switch i.nextHdrIdentifier { case IPv6HopByHopOptionsExtHdrIdentifier: nextHdrIdentifier, bytes, err := i.nextHeaderData(false /* fragmentHdr */, nil) if err != nil { return nil, true, err } i.nextHdrIdentifier = nextHdrIdentifier return IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: bytes}, false, nil case IPv6RoutingExtHdrIdentifier: nextHdrIdentifier, bytes, err := i.nextHeaderData(false /* fragmentHdr */, nil) if err != nil { return nil, true, err } i.nextHdrIdentifier = nextHdrIdentifier return IPv6RoutingExtHdr(bytes), false, nil case IPv6FragmentExtHdrIdentifier: var data [6]byte // We ignore the returned bytes because we know the fragment extension // header specific data will fit in data. nextHdrIdentifier, _, err := i.nextHeaderData(true /* fragmentHdr */, data[:]) if err != nil { return nil, true, err } fragmentExtHdr := IPv6FragmentExtHdr(data) // If the packet is not the first fragment, do not attempt to parse anything // after the fragment extension header as the payload following the fragment // extension header should not contain any headers; the first fragment must // hold all the headers up to and including any upper layer headers, as per // RFC 8200 section 4.5. if fragmentExtHdr.FragmentOffset() != 0 { i.forceRaw = true } i.nextHdrIdentifier = nextHdrIdentifier return fragmentExtHdr, false, nil case IPv6DestinationOptionsExtHdrIdentifier: nextHdrIdentifier, bytes, err := i.nextHeaderData(false /* fragmentHdr */, nil) if err != nil { return nil, true, err } i.nextHdrIdentifier = nextHdrIdentifier return IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: bytes}, false, nil case IPv6NoNextHeaderIdentifier: // This indicates the end of the IPv6 payload. return nil, true, nil default: // The header we are parsing is not a known extension header. Return the // raw payload. return i.AsRawHeader(true /* consume */), false, nil } } // nextHeaderData returns the extension header's Next Header field and raw data. // // fragmentHdr indicates that the extension header being parsed is the Fragment // extension header so the Length field should be ignored as it is Reserved // for the Fragment extension header.
// // If bytes is not nil, extension header specific data will be read into bytes // if it has enough capacity. If bytes is provided but does not have enough // capacity for the data, nextHeaderData will panic. func (i *IPv6PayloadIterator) nextHeaderData(fragmentHdr bool, bytes []byte) (IPv6ExtensionHeaderIdentifier, []byte, error) { // We ignore the number of bytes read because ReadByte reads at most 1 byte; // if it reads 0 bytes, it returns io.EOF to indicate that the reader has // reached the end of the payload. nextHdrIdentifier, err := i.reader.ReadByte() i.payload.TrimFront(1) if err != nil { return 0, nil, fmt.Errorf("error when reading the Next Header field for extension header with id = %d: %w", i.nextHdrIdentifier, err) } i.parseOffset++ var length uint8 length, err = i.reader.ReadByte() i.payload.TrimFront(1) if err != nil { if fragmentHdr { return 0, nil, fmt.Errorf("error when reading the Length field for extension header with id = %d: %w", i.nextHdrIdentifier, err) } return 0, nil, fmt.Errorf("error when reading the Reserved field for extension header with id = %d: %w", i.nextHdrIdentifier, err) } if fragmentHdr { length = 0 } // Make parseOffset point to the first byte of the Extension Header // specific data. i.parseOffset++ // length is in 8 byte chunks but doesn't include the first one. // See RFC 8200 for each header type, sections 4.3-4.6 and the requirement // in section 4.8 for new extension headers at the top of page 24. // [ Hdr Ext Len ] ... Length of the Destination Options header in 8-octet // units, not including the first 8 octets. i.nextOffset += uint32((length + 1) * ipv6ExtHdrLenBytesPerUnit) bytesLen := int(length)*ipv6ExtHdrLenBytesPerUnit + ipv6ExtHdrLenBytesExcluded if bytes == nil { bytes = make([]byte, bytesLen) } else if n := len(bytes); n < bytesLen { panic(fmt.Sprintf("bytes only has space for %d bytes but need space for %d bytes (length = %d) for extension header with id = %d", n, bytesLen, length, i.nextHdrIdentifier)) } n, err := io.ReadFull(&i.reader, bytes) i.payload.TrimFront(n) if err != nil { return 0, nil, fmt.Errorf("read %d out of %d extension header data bytes (length = %d) for header with id = %d: %w", n, bytesLen, length, i.nextHdrIdentifier, err) } return IPv6ExtensionHeaderIdentifier(nextHdrIdentifier), bytes, nil } // IPv6SerializableExtHdr provides serialization for IPv6 extension // headers. type IPv6SerializableExtHdr interface { // identifier returns the assigned IPv6 header identifier for this extension // header. identifier() IPv6ExtensionHeaderIdentifier // length returns the total serialized length in bytes of this extension // header, including the common next header and length fields. length() int // serializeInto serializes the receiver into the provided byte // buffer with the provided nextHeader value. // // Note, the caller MUST provide a byte buffer with size of at least // length. Implementers of this function may assume that the byte buffer // is of sufficient size. serializeInto MAY panic if the provided byte // buffer is not of sufficient size. // // serializeInto returns the number of bytes that were used to serialize the // receiver. Implementers must only use the number of bytes required to // serialize the receiver. Callers MAY provide a larger buffer than required // to serialize into.
serializeInto(nextHeader uint8, b []byte) int } var _ IPv6SerializableExtHdr = (*IPv6SerializableHopByHopExtHdr)(nil) // IPv6SerializableHopByHopExtHdr implements serialization of the Hop by Hop // options extension header. type IPv6SerializableHopByHopExtHdr []IPv6SerializableHopByHopOption const ( // ipv6HopByHopExtHdrNextHeaderOffset is the offset of the next header field // in a hop by hop extension header as defined in RFC 8200 section 4.3. ipv6HopByHopExtHdrNextHeaderOffset = 0 // ipv6HopByHopExtHdrLengthOffset is the offset of the length field in a hop // by hop extension header as defined in RFC 8200 section 4.3. ipv6HopByHopExtHdrLengthOffset = 1 // ipv6HopByHopExtHdrOptionsOffset is the offset of the options in a hop by // hop extension header as defined in RFC 8200 section 4.3. ipv6HopByHopExtHdrOptionsOffset = 2 // ipv6HopByHopExtHdrUnaccountedLenWords is the implicit number of 8-octet // words in a hop by hop extension header's length field, as stated in RFC // 8200 section 4.3: // Length of the Hop-by-Hop Options header in 8-octet units, // not including the first 8 octets. ipv6HopByHopExtHdrUnaccountedLenWords = 1 ) // identifier implements IPv6SerializableExtHdr. func (IPv6SerializableHopByHopExtHdr) identifier() IPv6ExtensionHeaderIdentifier { return IPv6HopByHopOptionsExtHdrIdentifier } // length implements IPv6SerializableExtHdr. func (h IPv6SerializableHopByHopExtHdr) length() int { var total int for _, opt := range h { align, alignOffset := opt.alignment() total += ipv6OptionsAlignmentPadding(total, align, alignOffset) total += ipv6ExtHdrOptionPayloadOffset + int(opt.length()) } // Account for next header and total length fields and add padding. return padIPv6OptionsLength(ipv6HopByHopExtHdrOptionsOffset + total) } // serializeInto implements IPv6SerializableExtHdr. func (h IPv6SerializableHopByHopExtHdr) serializeInto(nextHeader uint8, b []byte) int { optBuffer := b[ipv6HopByHopExtHdrOptionsOffset:] totalLength := ipv6HopByHopExtHdrOptionsOffset for _, opt := range h { // Calculate alignment requirements and pad buffer if necessary. align, alignOffset := opt.alignment() padLen := ipv6OptionsAlignmentPadding(totalLength, align, alignOffset) if padLen != 0 { padIPv6Option(optBuffer[:padLen]) totalLength += padLen optBuffer = optBuffer[padLen:] } l := opt.serializeInto(optBuffer[ipv6ExtHdrOptionPayloadOffset:]) optBuffer[ipv6ExtHdrOptionTypeOffset] = uint8(opt.identifier()) optBuffer[ipv6ExtHdrOptionLengthOffset] = l l += ipv6ExtHdrOptionPayloadOffset totalLength += int(l) optBuffer = optBuffer[l:] } padded := padIPv6OptionsLength(totalLength) if padded != totalLength { padIPv6Option(optBuffer[:padded-totalLength]) totalLength = padded } wordsLen := totalLength/ipv6ExtHdrLenBytesPerUnit - ipv6HopByHopExtHdrUnaccountedLenWords if wordsLen > math.MaxUint8 { panic(fmt.Sprintf("IPv6 hop by hop options too large: %d+1 64-bit words", wordsLen)) } b[ipv6HopByHopExtHdrNextHeaderOffset] = nextHeader b[ipv6HopByHopExtHdrLengthOffset] = uint8(wordsLen) return totalLength } // IPv6SerializableHopByHopOption provides serialization for hop by hop options. type IPv6SerializableHopByHopOption interface { // identifier returns the option identifier of this Hop by Hop option. identifier() IPv6ExtHdrOptionIdentifier // length returns the *payload* size of the option (not considering the type // and length fields). length() uint8 // alignment returns the alignment requirements from this option.
// // Alignment requirements take the form [align]n + offset as specified in // RFC 8200 section 4.2. The alignment requirement is on the offset between // the option type byte and the start of the hop by hop header. // // align must be a power of 2. alignment() (align int, offset int) // serializeInto serializes the receiver into the provided byte // buffer. // // Note, the caller MUST provide a byte buffer with size of at least // length. Implementers of this function may assume that the byte buffer // is of sufficient size. serializeInto MAY panic if the provided byte // buffer is not of sufficient size. // // serializeInto will return the number of bytes that were used to // serialize the receiver. Implementers must only use the number of // bytes required to serialize the receiver. Callers MAY provide a // larger buffer than required to serialize into. serializeInto([]byte) uint8 } var _ IPv6SerializableHopByHopOption = (*IPv6RouterAlertOption)(nil) // IPv6RouterAlertOption is the IPv6 Router Alert Hop by Hop option defined in // RFC 2711 section 2.1. type IPv6RouterAlertOption struct { Value IPv6RouterAlertValue } // IPv6RouterAlertValue is the payload of an IPv6 Router Alert option. type IPv6RouterAlertValue uint16 const ( // IPv6RouterAlertMLD indicates a datagram containing a Multicast Listener // Discovery message as defined in RFC 2711 section 2.1. IPv6RouterAlertMLD IPv6RouterAlertValue = 0 // IPv6RouterAlertRSVP indicates a datagram containing an RSVP message as // defined in RFC 2711 section 2.1. IPv6RouterAlertRSVP IPv6RouterAlertValue = 1 // IPv6RouterAlertActiveNetworks indicates a datagram containing an Active // Networks message as defined in RFC 2711 section 2.1. IPv6RouterAlertActiveNetworks IPv6RouterAlertValue = 2 // ipv6RouterAlertPayloadLength is the length of the Router Alert payload // as defined in RFC 2711. ipv6RouterAlertPayloadLength = 2 // ipv6RouterAlertAlignmentRequirement is the alignment requirement for the // Router Alert option defined as 2n+0 in RFC 2711. ipv6RouterAlertAlignmentRequirement = 2 // ipv6RouterAlertAlignmentOffsetRequirement is the alignment offset // requirement for the Router Alert option defined as 2n+0 in RFC 2711 section // 2.1. ipv6RouterAlertAlignmentOffsetRequirement = 0 ) // UnknownAction implements IPv6ExtHdrOption. func (*IPv6RouterAlertOption) UnknownAction() IPv6OptionUnknownAction { return ipv6UnknownActionFromIdentifier(ipv6RouterAlertHopByHopOptionIdentifier) } // isIPv6ExtHdrOption implements IPv6ExtHdrOption. func (*IPv6RouterAlertOption) isIPv6ExtHdrOption() {} // identifier implements IPv6SerializableHopByHopOption. func (*IPv6RouterAlertOption) identifier() IPv6ExtHdrOptionIdentifier { return ipv6RouterAlertHopByHopOptionIdentifier } // length implements IPv6SerializableHopByHopOption. func (*IPv6RouterAlertOption) length() uint8 { return ipv6RouterAlertPayloadLength } // alignment implements IPv6SerializableHopByHopOption. func (*IPv6RouterAlertOption) alignment() (int, int) { // From RFC 2711 section 2.1: // Alignment requirement: 2n+0. return ipv6RouterAlertAlignmentRequirement, ipv6RouterAlertAlignmentOffsetRequirement } // serializeInto implements IPv6SerializableHopByHopOption. func (o *IPv6RouterAlertOption) serializeInto(b []byte) uint8 { binary.BigEndian.PutUint16(b, uint16(o.Value)) return ipv6RouterAlertPayloadLength } // IPv6ExtHdrSerializer provides serialization of IPv6 extension headers.
type IPv6ExtHdrSerializer []IPv6SerializableExtHdr // Serialize serializes the provided list of IPv6 extension headers into b. // // Note, b must be of sufficient size to hold all the headers in s. See // IPv6ExtHdrSerializer.Length for details on getting the total size of a // serialized IPv6ExtHdrSerializer. // // Serialize may panic if b is not of sufficient size to hold all the options // in s. // // Serialize takes the transportProtocol value to be used as the last extension // header's Next Header value and returns the header identifier of the first // serialized extension header and the total serialized length. func (s IPv6ExtHdrSerializer) Serialize(transportProtocol tcpip.TransportProtocolNumber, b []byte) (uint8, int) { nextHeader := uint8(transportProtocol) if len(s) == 0 { return nextHeader, 0 } var totalLength int for i, h := range s[:len(s)-1] { length := h.serializeInto(uint8(s[i+1].identifier()), b) b = b[length:] totalLength += length } totalLength += s[len(s)-1].serializeInto(nextHeader, b) return uint8(s[0].identifier()), totalLength } // Length returns the total number of bytes required to serialize the extension // headers. func (s IPv6ExtHdrSerializer) Length() int { var totalLength int for _, h := range s { totalLength += h.length() } return totalLength }
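// The following is an illustrative sketch, not part of the original file: it
// shows how IPv6ExtHdrSerializer could be used to serialize a Hop-by-Hop
// extension header carrying a Router Alert option. It assumes it is compiled
// within this package; the function name exampleSerializeRouterAlert is
// hypothetical.
func exampleSerializeRouterAlert() {
	s := IPv6ExtHdrSerializer{
		IPv6SerializableHopByHopExtHdr{
			&IPv6RouterAlertOption{Value: IPv6RouterAlertMLD},
		},
	}
	// Length reports the exact buffer size Serialize needs: next header (1
	// byte) + length (1) + option type (1) + option length (1) + the 2-byte
	// Router Alert value, padded up to a multiple of 8 octets.
	b := make([]byte, s.Length())
	// 58 is the ICMPv6 protocol number; it becomes the last serialized
	// header's Next Header value.
	firstHdr, n := s.Serialize(58, b)
	_ = firstHdr // 0, the Hop-by-Hop extension header identifier.
	_ = n        // 8, the number of bytes written.
}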
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package ip import ( "fmt" "math/rand" "time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) // hostState is the state a host may be in for a multicast group. type hostState int // The states below are generic across IGMPv2 (RFC 2236 section 6) and MLDv1 // (RFC 2710 section 5). Even though the states are generic across both IGMPv2 // and MLDv1, IGMPv2 terminology will be used.
// // ______________receive query______________ // | | // | _____send or receive report_____ | // | | | | // V | V | // +-------+ +-----------+ +------------+ +-------------------+ +--------+ | // | Non-M | | Pending-M | | Delaying-M | | Queued Delaying-M | | Idle-M | - // +-------+ +-----------+ +------------+ +-------------------+ +--------+ // | ^ | ^ | ^ | ^ // | | | | | | | | // ---------- ------- ---------- ------------- // initialize new send initial fail to send send or receive // group membership report delayed report report // // Not shown in the diagram above, but any state may transition into the // non-member state when a group is left. const ( // nonMember is the "'Non-Member' state, when the host does not belong to the // group on the interface. This is the initial state for all memberships on // all network interfaces; it requires no storage in the host." // // 'Non-Listener' is the MLDv1 term used to describe this state. // // This state is used to keep track of groups that have been joined locally, // but without advertising the membership to the network. nonMember hostState = iota // pendingMember is a newly joined member that is waiting to successfully send // the initial set of reports. // // This is not an RFC defined state; it is an implementation specific state to // track that the initial report needs to be sent. // // MAY NOT transition to the idle member state from this state. pendingMember // delayingMember is the "'Delaying Member' state, when the host belongs to // the group on the interface and has a report delay timer running for that // membership." // // 'Delaying Listener' is the MLDv1 term used to describe this state. delayingMember // queuedDelayingMember is a delayingMember that failed to send a report after // its delayed report timer fired. Hosts in this state are waiting to attempt // retransmission of the delayed report. // // This is not an RFC defined state; it is an implementation specific state to // track that the delayed report needs to be sent. // // May transition to idle member if a report is received for a group. queuedDelayingMember // idleMember is the "Idle Member" state, when the host belongs to the group // on the interface and does not have a report delay timer running for that // membership. // // 'Idle Listener' is the MLDv1 term used to describe this state. idleMember ) func (s hostState) isDelayingMember() bool { switch s { case nonMember, pendingMember, idleMember: return false case delayingMember, queuedDelayingMember: return true default: panic(fmt.Sprintf("unrecognized host state = %d", s)) } } // multicastGroupState holds the Generic Multicast Protocol state for a // multicast group. type multicastGroupState struct { // joins is the number of times the group has been joined. joins uint64 // state holds the host's state for the group. state hostState // lastToSendReport is true if we sent the last report for the group. It is // used to track whether there are other hosts on the subnet that are also // members of the group. // // Defined in RFC 2236 section 6 page 9 for IGMPv2 and RFC 2710 section 5 page // 8 for MLDv1. lastToSendReport bool // delayedReportJob is used to delay sending responses to membership report // messages in order to reduce duplicate reports from multiple hosts on the // interface. // // Must not be nil. delayedReportJob *tcpip.Job // delayedReportJobFiresAt is the time when the delayed report job will fire. // // A zero value indicates that the job is not scheduled.
delayedReportJobFiresAt time.Time } func (m *multicastGroupState) cancelDelayedReportJob() { m.delayedReportJob.Cancel() m.delayedReportJobFiresAt = time.Time{} } // GenericMulticastProtocolOptions holds options for the generic multicast // protocol. type GenericMulticastProtocolOptions struct { // Rand is the source of random numbers. Rand *rand.Rand // Clock is the clock used to create timers. Clock tcpip.Clock // Protocol is the implementation of the variant of multicast group protocol // in use. Protocol MulticastGroupProtocol // MaxUnsolicitedReportDelay is the maximum amount of time to wait between // transmitting unsolicited reports. // // Unsolicited reports are transmitted when a group is newly joined. MaxUnsolicitedReportDelay time.Duration } // MulticastGroupProtocol is a multicast group protocol whose core state machine // can be represented by GenericMulticastProtocolState. type MulticastGroupProtocol interface { // Enabled indicates whether the generic multicast protocol will be // performed. // // When enabled, the protocol may transmit report and leave messages when // joining and leaving multicast groups respectively, and handle incoming // packets. // // When disabled, the protocol will still keep track of locally joined groups, // it just won't transmit and handle packets, or update groups' state. Enabled() bool // SendReport sends a multicast report for the specified group address. // // Returns false if the caller should queue the report to be sent later. Note, // returning false does not mean that the receiver hit an error. SendReport(groupAddress tcpip.Address) (sent bool, err tcpip.Error) // SendLeave sends a multicast leave for the specified group address. SendLeave(groupAddress tcpip.Address) tcpip.Error // ShouldPerformProtocol returns true iff the protocol should be performed for // the specified group. ShouldPerformProtocol(tcpip.Address) bool } // GenericMulticastProtocolState is the per-interface generic multicast protocol // state. // // There is actually no protocol named "Generic Multicast Protocol". Instead, // the term is used to refer to a generic multicast protocol that applies to // both IPv4 and IPv6. Specifically, Generic Multicast Protocol is the core // state machine of IGMPv2 as defined by RFC 2236 and MLDv1 as defined by RFC // 2710. // // Callers must synchronize accesses to the generic multicast protocol state; // GenericMulticastProtocolState obtains no locks in any of its methods. The // only exception to this is GenericMulticastProtocolState's timer/job callbacks // which will obtain the lock provided to the GenericMulticastProtocolState when // it is initialized. // // GenericMulticastProtocolState.Init MUST be called before calling any of // the methods on GenericMulticastProtocolState. // // GenericMulticastProtocolState.MakeAllNonMemberLocked MUST be called when the // multicast group protocol is disabled so that leave messages may be sent. type GenericMulticastProtocolState struct { // Do not allow overwriting this state. _ sync.NoCopy opts GenericMulticastProtocolOptions // memberships holds group addresses and their associated state. memberships map[tcpip.Address]multicastGroupState // protocolMU is the mutex used to protect the protocol. protocolMU *sync.RWMutex } // Init initializes the Generic Multicast Protocol state. // // Must only be called once for the lifetime of g; Init will panic if it is // called twice. // // The GenericMulticastProtocolState will only grab the lock when timers/jobs // fire.
// // Note: the methods on opts.Protocol will always be called while protocolMU is // held. func (g *GenericMulticastProtocolState) Init(protocolMU *sync.RWMutex, opts GenericMulticastProtocolOptions) { if g.memberships != nil { panic("attempted to initialize generic membership protocol state twice") } *g = GenericMulticastProtocolState{ opts: opts, memberships: make(map[tcpip.Address]multicastGroupState), protocolMU: protocolMU, } } // MakeAllNonMemberLocked transitions all groups to the non-member state. // // The groups will still be considered joined locally. // // MUST be called when the multicast group protocol is disabled. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) MakeAllNonMemberLocked() { if !g.opts.Protocol.Enabled() { return } for groupAddress, info := range g.memberships { g.transitionToNonMemberLocked(groupAddress, &info) g.memberships[groupAddress] = info } } // InitializeGroupsLocked initializes each group, as if they were newly joined // but without affecting the groups' join count. // // Must only be called after calling MakeAllNonMemberLocked, as a group should // not be initialized while it is not in the non-member state. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) InitializeGroupsLocked() { if !g.opts.Protocol.Enabled() { return } for groupAddress, info := range g.memberships { g.initializeNewMemberLocked(groupAddress, &info) g.memberships[groupAddress] = info } } // SendQueuedReportsLocked attempts to send reports for groups that failed to // send reports during their last attempt. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) SendQueuedReportsLocked() { for groupAddress, info := range g.memberships { switch info.state { case nonMember, delayingMember, idleMember: case pendingMember: // pendingMembers failed to send their initial unsolicited report so try // to send the report and queue the extra unsolicited reports. g.maybeSendInitialReportLocked(groupAddress, &info) case queuedDelayingMember: // queuedDelayingMembers failed to send their delayed reports so try to // send the report and transition them to the idle state. g.maybeSendDelayedReportLocked(groupAddress, &info) default: panic(fmt.Sprintf("unrecognized host state = %d", info.state)) } g.memberships[groupAddress] = info } } // JoinGroupLocked handles joining a new group. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) JoinGroupLocked(groupAddress tcpip.Address) { if info, ok := g.memberships[groupAddress]; ok { // The group has already been joined. info.joins++ g.memberships[groupAddress] = info return } info := multicastGroupState{ // Since we just joined the group, its count is 1. joins: 1, // The state will be updated below, if required. state: nonMember, lastToSendReport: false, delayedReportJob: tcpip.NewJob(g.opts.Clock, g.protocolMU, func() { if !g.opts.Protocol.Enabled() { panic(fmt.Sprintf("delayed report job fired for group %s while the multicast group protocol is disabled", groupAddress)) } info, ok := g.memberships[groupAddress] if !ok { panic(fmt.Sprintf("expected to find group state for group = %s", groupAddress)) } g.maybeSendDelayedReportLocked(groupAddress, &info) g.memberships[groupAddress] = info }), } if g.opts.Protocol.Enabled() { g.initializeNewMemberLocked(groupAddress, &info) } g.memberships[groupAddress] = info } // IsLocallyJoinedRLocked returns true if the group is locally joined.
// // Precondition: g.protocolMU must be read locked. func (g *GenericMulticastProtocolState) IsLocallyJoinedRLocked(groupAddress tcpip.Address) bool { _, ok := g.memberships[groupAddress] return ok } // LeaveGroupLocked handles leaving the group. // // Returns false if the group is not currently joined. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) LeaveGroupLocked(groupAddress tcpip.Address) bool { info, ok := g.memberships[groupAddress] if !ok { return false } if info.joins == 0 { panic(fmt.Sprintf("tried to leave group %s with a join count of 0", groupAddress)) } info.joins-- if info.joins != 0 { // If we still have outstanding joins, then do nothing further. g.memberships[groupAddress] = info return true } g.transitionToNonMemberLocked(groupAddress, &info) delete(g.memberships, groupAddress) return true } // HandleQueryLocked handles a query message with the specified maximum response // time. // // If the group address is unspecified, then reports will be scheduled for all // joined groups. // // Report(s) will be scheduled to be sent after a random duration between 0 and // the maximum response time. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) HandleQueryLocked(groupAddress tcpip.Address, maxResponseTime time.Duration) { if !g.opts.Protocol.Enabled() { return } // As per RFC 2236 section 2.4 (for IGMPv2), // // In a Membership Query message, the group address field is set to zero // when sending a General Query, and set to the group address being // queried when sending a Group-Specific Query. // // As per RFC 2710 section 3.6 (for MLDv1), // // In a Query message, the Multicast Address field is set to zero when // sending a General Query, and set to a specific IPv6 multicast address // when sending a Multicast-Address-Specific Query. if groupAddress.Unspecified() { // This is a general query as the group address is unspecified. for groupAddress, info := range g.memberships { g.setDelayTimerForAddressRLocked(groupAddress, &info, maxResponseTime) g.memberships[groupAddress] = info } } else if info, ok := g.memberships[groupAddress]; ok { g.setDelayTimerForAddressRLocked(groupAddress, &info, maxResponseTime) g.memberships[groupAddress] = info } } // HandleReportLocked handles a report message. // // If the report is for a joined group, any active delayed report will be // cancelled and the host state for the group transitions to idle. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) HandleReportLocked(groupAddress tcpip.Address) { if !g.opts.Protocol.Enabled() { return } // As per RFC 2236 section 3 pages 3-4 (for IGMPv2), // // If the host receives another host's Report (version 1 or 2) while it has // a timer running, it stops its timer for the specified group and does not // send a Report // // As per RFC 2710 section 4 page 6 (for MLDv1), // // If a node receives another node's Report from an interface for a // multicast address while it has a timer running for that same address // on that interface, it stops its timer and does not send a Report for // that address, thus suppressing duplicate reports on the link. if info, ok := g.memberships[groupAddress]; ok && info.state.isDelayingMember() { info.cancelDelayedReportJob() info.lastToSendReport = false info.state = idleMember g.memberships[groupAddress] = info } } // initializeNewMemberLocked initializes a new group membership. // // Precondition: g.protocolMU must be locked. 
func (g *GenericMulticastProtocolState) initializeNewMemberLocked(groupAddress tcpip.Address, info *multicastGroupState) { if info.state != nonMember { panic(fmt.Sprintf("host must be in non-member state to be initialized; group = %s, state = %d", groupAddress, info.state)) } info.lastToSendReport = false if !g.opts.Protocol.ShouldPerformProtocol(groupAddress) { info.state = idleMember return } info.state = pendingMember g.maybeSendInitialReportLocked(groupAddress, info) } // maybeSendInitialReportLocked attempts to start transmission of the initial // set of reports after newly joining a group. // // Host must be in pending member state. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) maybeSendInitialReportLocked(groupAddress tcpip.Address, info *multicastGroupState) { if info.state != pendingMember { panic(fmt.Sprintf("host must be in pending member state to send initial reports; group = %s, state = %d", groupAddress, info.state)) } // As per RFC 2236 section 3 page 5 (for IGMPv2), // // When a host joins a multicast group, it should immediately transmit an // unsolicited Version 2 Membership Report for that group" ... "it is // recommended that it be repeated". // // As per RFC 2710 section 4 page 6 (for MLDv1), // // When a node starts listening to a multicast address on an interface, // it should immediately transmit an unsolicited Report for that address // on that interface, in case it is the first listener on the link. To // cover the possibility of the initial Report being lost or damaged, it // is recommended that it be repeated once or twice after short delays // [Unsolicited Report Interval]. // // TODO(gvisor.dev/issue/4901): Support a configurable number of initial // unsolicited reports. sent, err := g.opts.Protocol.SendReport(groupAddress) if err == nil && sent { info.lastToSendReport = true g.setDelayTimerForAddressRLocked(groupAddress, info, g.opts.MaxUnsolicitedReportDelay) } } // maybeSendDelayedReportLocked attempts to send the delayed report. // // Host must be in pending, delaying or queued delaying member state. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) maybeSendDelayedReportLocked(groupAddress tcpip.Address, info *multicastGroupState) { if !info.state.isDelayingMember() { panic(fmt.Sprintf("host must be in delaying or queued delaying member state to send delayed reports; group = %s, state = %d", groupAddress, info.state)) } sent, err := g.opts.Protocol.SendReport(groupAddress) if err == nil && sent { info.lastToSendReport = true info.state = idleMember } else { info.state = queuedDelayingMember } } // maybeSendLeave attempts to send a leave message. func (g *GenericMulticastProtocolState) maybeSendLeave(groupAddress tcpip.Address, lastToSendReport bool) { if !g.opts.Protocol.Enabled() || !lastToSendReport { return } if !g.opts.Protocol.ShouldPerformProtocol(groupAddress) { return } // It is okay to ignore the error here: if the packet write failed, the // multicast routers will eventually drop our membership anyway. If the // interface is being disabled or removed, the generic multicast protocol's // state should be cleared eventually. // // As per RFC 2236 section 3 page 5 (for IGMPv2), // // When a router receives a Report, it adds the group being reported to // the list of multicast group memberships on the network on which it // received the Report and sets the timer for the membership to the // [Group Membership Interval]. Repeated Reports refresh the timer.
If // no Reports are received for a particular group before this timer has // expired, the router assumes that the group has no local members and // that it need not forward remotely-originated multicasts for that // group onto the attached network. // // As per RFC 2710 section 4 page 5 (for MLDv1), // // When a router receives a Report from a link, if the reported address // is not already present in the router's list of multicast address // having listeners on that link, the reported address is added to the // list, its timer is set to [Multicast Listener Interval], and its // appearance is made known to the router's multicast routing component. // If a Report is received for a multicast address that is already // present in the router's list, the timer for that address is reset to // [Multicast Listener Interval]. If an address's timer expires, it is // assumed that there are no longer any listeners for that address // present on the link, so it is deleted from the list and its // disappearance is made known to the multicast routing component. // // The requirement to send a leave message is also optional (it MAY be // skipped): // // As per RFC 2236 section 6 page 8 (for IGMPv2), // // "send leave" for the group on the interface. If the interface // state says the Querier is running IGMPv1, this action SHOULD be // skipped. If the flag saying we were the last host to report is // cleared, this action MAY be skipped. The Leave Message is sent to // the ALL-ROUTERS group (224.0.0.2). // // As per RFC 2710 section 5 page 8 (for MLDv1), // // "send done" for the address on the interface. If the flag saying // we were the last node to report is cleared, this action MAY be // skipped. The Done message is sent to the link-scope all-routers // address (FF02::2). _ = g.opts.Protocol.SendLeave(groupAddress) } // transitionToNonMemberLocked transitions the given multicast group to the // non-member/listener state. // // Precondition: g.protocolMU must be locked. func (g *GenericMulticastProtocolState) transitionToNonMemberLocked(groupAddress tcpip.Address, info *multicastGroupState) { if info.state == nonMember { return } info.cancelDelayedReportJob() g.maybeSendLeave(groupAddress, info.lastToSendReport) info.lastToSendReport = false info.state = nonMember } // setDelayTimerForAddressRLocked sets a timer to send a delayed report. // // Precondition: g.protocolMU MUST be read locked. func (g *GenericMulticastProtocolState) setDelayTimerForAddressRLocked(groupAddress tcpip.Address, info *multicastGroupState, maxResponseTime time.Duration) { if info.state == nonMember { return } if !g.opts.Protocol.ShouldPerformProtocol(groupAddress) { return } // As per RFC 2236 section 3 page 3 (for IGMPv2), // // If a timer for the group is already running, it is reset to the random // value only if the requested Max Response Time is less than the remaining // value of the running timer. // // As per RFC 2710 section 4 page 5 (for MLDv1), // // If a timer for any address is already running, it is reset to the new // random value only if the requested Maximum Response Delay is less than // the remaining value of the running timer. now := g.opts.Clock.Now() if info.state == delayingMember { if info.delayedReportJobFiresAt.IsZero() { panic(fmt.Sprintf("delayed report unscheduled while in the delaying member state; group = %s", groupAddress)) } if info.delayedReportJobFiresAt.Sub(now) <= maxResponseTime { // The timer is scheduled to fire before the maximum response time so we // leave our timer as is.
return } } info.state = delayingMember info.cancelDelayedReportJob() maxResponseTime = g.calculateDelayTimerDuration(maxResponseTime) info.delayedReportJob.Schedule(maxResponseTime) info.delayedReportJobFiresAt = now.Add(maxResponseTime) } // calculateDelayTimerDuration returns a random duration in the range // [0, maxRespTime). func (g *GenericMulticastProtocolState) calculateDelayTimerDuration(maxRespTime time.Duration) time.Duration { // As per RFC 2236 section 3 page 3 (for IGMPv2), // // When a host receives a Group-Specific Query, it sets a delay timer to a // random value selected from the range (0, Max Response Time]... // // As per RFC 2710 section 4 page 6 (for MLDv1), // // When a node receives a Multicast-Address-Specific Query, if it is // listening to the queried Multicast Address on the interface from // which the Query was received, it sets a delay timer for that address // to a random value selected from the range [0, Maximum Response Delay], // as above. if maxRespTime == 0 { return 0 } return time.Duration(g.opts.Rand.Int63n(int64(maxRespTime))) }
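// The following is an illustrative sketch, not part of the original file: a
// minimal MulticastGroupProtocol stub showing how an endpoint drives the
// generic state machine. The stub type and the example function name are
// hypothetical, and the sketch assumes it is compiled within this package.
type stubMulticastGroupProtocol struct{}

func (stubMulticastGroupProtocol) Enabled() bool { return true }

// SendReport pretends the report was transmitted successfully.
func (stubMulticastGroupProtocol) SendReport(tcpip.Address) (bool, tcpip.Error) { return true, nil }

func (stubMulticastGroupProtocol) SendLeave(tcpip.Address) tcpip.Error { return nil }

func (stubMulticastGroupProtocol) ShouldPerformProtocol(tcpip.Address) bool { return true }

func exampleJoinAndLeave(clock tcpip.Clock, rng *rand.Rand) {
	var mu sync.RWMutex
	var g GenericMulticastProtocolState
	g.Init(&mu, GenericMulticastProtocolOptions{
		Rand:                      rng,
		Clock:                     clock,
		Protocol:                  stubMulticastGroupProtocol{},
		MaxUnsolicitedReportDelay: time.Second,
	})
	mu.Lock()
	defer mu.Unlock()
	// Joining transmits the initial unsolicited report and schedules the
	// delayed duplicate via the group's delayedReportJob.
	g.JoinGroupLocked(tcpip.Address("\xe0\x00\x01\x01"))
	// Leaving transitions the group to the non-member state and sends a
	// leave message since we were the last to send a report.
	_ = g.LeaveGroupLocked(tcpip.Address("\xe0\x00\x01\x01"))
}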
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs2 import ( "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/control" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/hostarch" ) // maxAddrLen is the maximum socket address length we're willing to accept. const maxAddrLen = 200 // maxOptLen is the maximum sockopt parameter length we're willing to accept. const maxOptLen = 1024 * 8 // maxControlLen is the maximum length of the msghdr.msg_control buffer we're // willing to accept. Note that this limit is smaller than Linux, which allows // buffers up to INT_MAX. const maxControlLen = 10 * 1024 * 1024 // maxListenBacklog is the maximum listen backlog supported.
const maxListenBacklog = 1024 // nameLenOffset is the offset from the start of the MessageHeader64 struct to // the NameLen field. const nameLenOffset = 8 // controlLenOffset is the offset from the start of the MessageHeader64 struct // to the ControlLen field. const controlLenOffset = 40 // flagsOffset is the offset from the start of the MessageHeader64 struct // to the Flags field. const flagsOffset = 48 const sizeOfInt32 = 4 // messageHeader64Len is the length of a MessageHeader64 struct. var messageHeader64Len = uint64((*MessageHeader64)(nil).SizeBytes()) // multipleMessageHeader64Len is the length of a multipleMessageHeader64 struct. var multipleMessageHeader64Len = uint64((*multipleMessageHeader64)(nil).SizeBytes()) // baseRecvFlags are the flags that are accepted across recvmsg(2), // recvmmsg(2), and recvfrom(2). const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC // MessageHeader64 is the 64-bit representation of the msghdr struct used in // the recvmsg and sendmsg syscalls. // // +marshal type MessageHeader64 struct { // Name is the optional pointer to a network address buffer. Name uint64 // NameLen is the length of the buffer pointed to by Name. NameLen uint32 _ uint32 // Iov is a pointer to an array of io vectors that describe the memory // locations involved in the io operation. Iov uint64 // IovLen is the length of the array pointed to by Iov. IovLen uint64 // Control is the optional pointer to ancillary control data. Control uint64 // ControlLen is the length of the data pointed to by Control. ControlLen uint64 // Flags on the sent/received message. Flags int32 _ int32 } // multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in // the recvmmsg and sendmmsg syscalls. // // +marshal type multipleMessageHeader64 struct { msgHdr MessageHeader64 msgLen uint32 _ int32 } // CaptureAddress allocates memory for and copies a socket address structure // from the untrusted address space range. func CaptureAddress(t *kernel.Task, addr hostarch.Addr, addrlen uint32) ([]byte, error) { if addrlen > maxAddrLen { return nil, linuxerr.EINVAL } addrBuf := make([]byte, addrlen) if _, err := t.CopyInBytes(addr, addrBuf); err != nil { return nil, err } return addrBuf, nil } // writeAddress writes a sockaddr structure and its length to an output buffer // in the untrusted address space range. If the address is bigger than the // buffer, it is truncated. func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr hostarch.Addr, addrLenPtr hostarch.Addr) error { // Get the buffer length. var bufLen uint32 if _, err := primitive.CopyUint32In(t, addrLenPtr, &bufLen); err != nil { return err } if int32(bufLen) < 0 { return linuxerr.EINVAL } // Write the length unconditionally. if _, err := primitive.CopyUint32Out(t, addrLenPtr, addrLen); err != nil { return err } if addr == nil { return nil } if bufLen > addrLen { bufLen = addrLen } // Copy as much of the address as will fit in the buffer. encodedAddr := t.CopyScratchBuffer(addr.SizeBytes()) addr.MarshalUnsafe(encodedAddr) if bufLen > uint32(len(encodedAddr)) { bufLen = uint32(len(encodedAddr)) } _, err := t.CopyOutBytes(addrPtr, encodedAddr[:int(bufLen)]) return err } // Socket implements the linux syscall socket(2).
func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { domain := int(args[0].Int()) stype := args[1].Int() protocol := int(args[2].Int()) // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { return 0, nil, linuxerr.EINVAL } // Create the new socket. s, e := socket.NewVFS2(t, domain, linux.SockType(stype&0xf), protocol) if e != nil { return 0, nil, e.ToError() } defer s.DecRef(t) if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil { return 0, nil, err } fd, err := t.NewFDFromVFS2(0, s, kernel.FDFlags{ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, }) if err != nil { return 0, nil, err } return uintptr(fd), nil, nil } // SocketPair implements the linux syscall socketpair(2). func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { domain := int(args[0].Int()) stype := args[1].Int() protocol := int(args[2].Int()) addr := args[3].Pointer() // Check and initialize the flags. if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { return 0, nil, linuxerr.EINVAL } // Create the socket pair. s1, s2, e := socket.PairVFS2(t, domain, linux.SockType(stype&0xf), protocol) if e != nil { return 0, nil, e.ToError() } // Adding to the FD table will cause an extra reference to be acquired. defer s1.DecRef(t) defer s2.DecRef(t) nonblocking := uint32(stype & linux.SOCK_NONBLOCK) if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil { return 0, nil, err } if err := s2.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil { return 0, nil, err } // Create the FDs for the sockets. flags := kernel.FDFlags{ CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, } fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{s1, s2}, flags) if err != nil { return 0, nil, err } if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil { for _, fd := range fds { if _, file := t.FDTable().Remove(t, fd); file != nil { file.DecRef(t) } } return 0, nil, err } return 0, nil, nil } // Connect implements the linux syscall connect(2). func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Uint() // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, nil, syserror.ENOTSOCK } // Capture address and call syscall implementation. a, err := CaptureAddress(t, addr, addrlen) if err != nil { return 0, nil, err } blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0 return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), syserror.ERESTARTSYS) } // accept is the implementation of the accept syscall. It is called by accept // and accept4 syscall handlers. func accept(t *kernel.Task, fd int32, addr hostarch.Addr, addrLen hostarch.Addr, flags int) (uintptr, error) { // Check that no unsupported flags are passed in. if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, syserror.ENOTSOCK } // Call the syscall implementation for this socket, then copy the // output address if one is specified. 
blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0 peerRequested := addrLen != 0 nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking) if e != nil { return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) } if peerRequested { // NOTE(magi): Linux does not give you an error if it can't // write the data back out so neither do we. if err := writeAddress(t, peer, peerLen, addr, addrLen); linuxerr.Equals(linuxerr.EINVAL, err) { return 0, err } } return uintptr(nfd), nil } // Accept4 implements the linux syscall accept4(2). func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() flags := int(args[3].Int()) n, err := accept(t, fd, addr, addrlen, flags) return n, nil, err } // Accept implements the linux syscall accept(2). func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() n, err := accept(t, fd, addr, addrlen, 0) return n, nil, err } // Bind implements the linux syscall bind(2). func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Uint() // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, nil, syserror.ENOTSOCK } // Capture address and call syscall implementation. a, err := CaptureAddress(t, addr, addrlen) if err != nil { return 0, nil, err } return 0, nil, s.Bind(t, a).ToError() } // Listen implements the linux syscall listen(2). func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() backlog := args[1].Uint() // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, nil, syserror.ENOTSOCK } if backlog > maxListenBacklog { // Linux treats incoming backlog as uint with a limit defined by // sysctl_somaxconn. // https://github.com/torvalds/linux/blob/7acac4b3196/net/socket.c#L1666 backlog = maxListenBacklog } // Accept one more than the configured listen backlog to keep in parity with // Linux. Ref, because of missing equality check here: // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/sock.h#L937 // // In case of unix domain sockets, the following check // https://github.com/torvalds/linux/blob/7d6beb71da3/net/unix/af_unix.c#L1293 // will allow 1 connect through since it checks for a receive queue len > // backlog and not >=. backlog++ return 0, nil, s.Listen(t, int(backlog)).ToError() } // Shutdown implements the linux syscall shutdown(2). func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() how := args[1].Int() // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, nil, syserror.ENOTSOCK } // Validate how, then call syscall implementation. 
switch how { case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR: default: return 0, nil, linuxerr.EINVAL } return 0, nil, s.Shutdown(t, int(how)).ToError() } // GetSockOpt implements the linux syscall getsockopt(2). func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() level := args[1].Int() name := args[2].Int() optValAddr := args[3].Pointer() optLenAddr := args[4].Pointer() // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, nil, syserror.ENOTSOCK } // Read the length. Reject negative values. var optLen int32 if _, err := primitive.CopyInt32In(t, optLenAddr, &optLen); err != nil { return 0, nil, err } if optLen < 0 { return 0, nil, linuxerr.EINVAL } // Call syscall implementation then copy both value and value len out. v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen)) if e != nil { return 0, nil, e.ToError() } if _, err := primitive.CopyInt32Out(t, optLenAddr, int32(v.SizeBytes())); err != nil { return 0, nil, err } if v != nil { if _, err := v.CopyOut(t, optValAddr); err != nil { return 0, nil, err } } return 0, nil, nil } // getSockOpt tries to handle common socket options, or dispatches to a specific // socket implementation. func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr hostarch.Addr, len int) (marshal.Marshallable, *syserr.Error) { if level == linux.SOL_SOCKET { switch name { case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL: if len < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } } switch name { case linux.SO_TYPE: _, skType, _ := s.Type() v := primitive.Int32(skType) return &v, nil case linux.SO_DOMAIN: family, _, _ := s.Type() v := primitive.Int32(family) return &v, nil case linux.SO_PROTOCOL: _, _, protocol := s.Type() v := primitive.Int32(protocol) return &v, nil } } return s.GetSockOpt(t, level, name, optValAddr, len) } // SetSockOpt implements the linux syscall setsockopt(2). // // Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket. func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() level := args[1].Int() name := args[2].Int() optValAddr := args[3].Pointer() optLen := args[4].Int() // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, nil, syserror.ENOTSOCK } if optLen < 0 { return 0, nil, linuxerr.EINVAL } if optLen > maxOptLen { return 0, nil, linuxerr.EINVAL } buf := t.CopyScratchBuffer(int(optLen)) if _, err := t.CopyInBytes(optValAddr, buf); err != nil { return 0, nil, err } // Call syscall implementation. if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil { return 0, nil, err.ToError() } return 0, nil, nil } // GetSockName implements the linux syscall getsockname(2). func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. 
s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, nil, syserror.ENOTSOCK } // Get the socket name and copy it to the caller. v, vl, err := s.GetSockName(t) if err != nil { return 0, nil, err.ToError() } return 0, nil, writeAddress(t, v, vl, addr, addrlen) } // GetPeerName implements the linux syscall getpeername(2). func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() addrlen := args[2].Pointer() // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, nil, syserror.ENOTSOCK } // Get the socket peer name and copy it to the caller. v, vl, err := s.GetPeerName(t) if err != nil { return 0, nil, err.ToError() } return 0, nil, writeAddress(t, v, vl, addr, addrlen) } // RecvMsg implements the linux syscall recvmsg(2). func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() msgPtr := args[1].Pointer() flags := args[2].Int() if t.Arch().Width() != 8 { // We only handle 64-bit for now. return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, nil, syserror.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { return 0, nil, linuxerr.EINVAL } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { flags |= linux.MSG_DONTWAIT } var haveDeadline bool var deadline ktime.Time if dl := s.RecvTimeout(); dl > 0 { deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) haveDeadline = true } else if dl < 0 { flags |= linux.MSG_DONTWAIT } n, err := recvSingleMsg(t, s, msgPtr, flags, haveDeadline, deadline) return n, nil, err } // RecvMMsg implements the linux syscall recvmmsg(2). func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() msgPtr := args[1].Pointer() vlen := args[2].Uint() flags := args[3].Int() toPtr := args[4].Pointer() if t.Arch().Width() != 8 { // We only handle 64-bit for now. return 0, nil, linuxerr.EINVAL } if vlen > linux.UIO_MAXIOV { vlen = linux.UIO_MAXIOV } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. 
s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, nil, syserror.ENOTSOCK } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { flags |= linux.MSG_DONTWAIT } var haveDeadline bool var deadline ktime.Time if toPtr != 0 { var ts linux.Timespec if _, err := ts.CopyIn(t, toPtr); err != nil { return 0, nil, err } if !ts.Valid() { return 0, nil, linuxerr.EINVAL } deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration()) haveDeadline = true } if !haveDeadline { if dl := s.RecvTimeout(); dl > 0 { deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) haveDeadline = true } else if dl < 0 { flags |= linux.MSG_DONTWAIT } } var count uint32 var err error for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { return 0, nil, linuxerr.EFAULT } var n uintptr if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil { break } // Copy the received length to the caller. lp, ok := mp.AddLength(messageHeader64Len) if !ok { return 0, nil, linuxerr.EFAULT } if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break } count++ } if count == 0 { return 0, nil, err } return uintptr(count), nil, nil } func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr hostarch.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) { // Capture the message header and io vectors. var msg MessageHeader64 if _, err := msg.CopyIn(t, msgPtr); err != nil { return 0, err } if msg.IovLen > linux.UIO_MAXIOV { return 0, linuxerr.EMSGSIZE } dst, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, err } // Fast path when no control message nor name buffers are provided. if msg.ControlLen == 0 && msg.NameLen == 0 { n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) if err != nil { return 0, syserror.ConvertIntr(err.ToError(), syserror.ERESTARTSYS) } if !cms.Unix.Empty() { mflags |= linux.MSG_CTRUNC cms.Release(t) } if int(msg.Flags) != mflags { // Copy out the flags to the caller. if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil { return 0, err } } return uintptr(n), nil } if msg.ControlLen > maxControlLen { return 0, linuxerr.ENOBUFS } n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) if e != nil { return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) } defer cms.Release(t) controlData := make([]byte, 0, msg.ControlLen) controlData = control.PackControlMessages(t, cms, controlData) if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() { creds, _ := cms.Unix.Credentials.(control.SCMCredentials) controlData, mflags = control.PackCredentials(t, creds, controlData, mflags) } if cms.Unix.Rights != nil { controlData, mflags = control.PackRightsVFS2(t, cms.Unix.Rights.(control.SCMRightsVFS2), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags) } // Copy the address to the caller. if msg.NameLen != 0 { if err := writeAddress(t, sender, senderLen, hostarch.Addr(msg.Name), hostarch.Addr(msgPtr+nameLenOffset)); err != nil { return 0, err } } // Copy the control data to the caller. 
if _, err := primitive.CopyUint64Out(t, msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { return 0, err } if len(controlData) > 0 { if _, err := t.CopyOutBytes(hostarch.Addr(msg.Control), controlData); err != nil { return 0, err } } // Copy out the flags to the caller. if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil { return 0, err } return uintptr(n), nil } // recvFrom is the implementation of the recvfrom syscall. It is called by // recvfrom and recv syscall handlers. func recvFrom(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLenPtr hostarch.Addr) (uintptr, error) { if int(bufLen) < 0 { return 0, linuxerr.EINVAL } // Reject flags that we don't handle yet. if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 { return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, syserror.ENOTSOCK } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { flags |= linux.MSG_DONTWAIT } dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, err } var haveDeadline bool var deadline ktime.Time if dl := s.RecvTimeout(); dl > 0 { deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) haveDeadline = true } else if dl < 0 { flags |= linux.MSG_DONTWAIT } n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) cm.Release(t) if e != nil { return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS) } // Copy the address to the caller. if nameLenPtr != 0 { if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil { return 0, err } } return uintptr(n), nil } // RecvFrom implements the linux syscall recvfrom(2). func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() bufPtr := args[1].Pointer() bufLen := args[2].Uint64() flags := args[3].Int() namePtr := args[4].Pointer() nameLenPtr := args[5].Pointer() n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr) return n, nil, err } // SendMsg implements the linux syscall sendmsg(2). func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() msgPtr := args[1].Pointer() flags := args[2].Int() if t.Arch().Width() != 8 { // We only handle 64-bit for now. return 0, nil, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, nil, syserror.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { return 0, nil, linuxerr.EINVAL } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { flags |= linux.MSG_DONTWAIT } n, err := sendSingleMsg(t, s, file, msgPtr, flags) return n, nil, err } // SendMMsg implements the linux syscall sendmmsg(2). func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() msgPtr := args[1].Pointer() vlen := args[2].Uint() flags := args[3].Int() if t.Arch().Width() != 8 { // We only handle 64-bit for now. 
return 0, nil, linuxerr.EINVAL } if vlen > linux.UIO_MAXIOV { vlen = linux.UIO_MAXIOV } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket. s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, nil, syserror.ENOTSOCK } // Reject flags that we don't handle yet. if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { return 0, nil, linuxerr.EINVAL } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { flags |= linux.MSG_DONTWAIT } var count uint32 var err error for i := uint64(0); i < uint64(vlen); i++ { mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) if !ok { return 0, nil, linuxerr.EFAULT } var n uintptr if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil { break } // Copy the sent length to the caller. lp, ok := mp.AddLength(messageHeader64Len) if !ok { return 0, nil, linuxerr.EFAULT } if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { break } count++ } if count == 0 { return 0, nil, err } return uintptr(count), nil, nil } func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescription, msgPtr hostarch.Addr, flags int32) (uintptr, error) { // Capture the message header. var msg MessageHeader64 if _, err := msg.CopyIn(t, msgPtr); err != nil { return 0, err } var controlData []byte if msg.ControlLen > 0 { // Put an upper bound to prevent large allocations. if msg.ControlLen > maxControlLen { return 0, linuxerr.ENOBUFS } controlData = make([]byte, msg.ControlLen) if _, err := t.CopyInBytes(hostarch.Addr(msg.Control), controlData); err != nil { return 0, err } } // Read the destination address if one is specified. var to []byte if msg.NameLen != 0 { var err error to, err = CaptureAddress(t, hostarch.Addr(msg.Name), msg.NameLen) if err != nil { return 0, err } } // Read data then call the sendmsg implementation. if msg.IovLen > linux.UIO_MAXIOV { return 0, linuxerr.EMSGSIZE } src, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, err } controlMessages, err := control.Parse(t, s, controlData, t.Arch().Width()) if err != nil { return 0, err } var haveDeadline bool var deadline ktime.Time if dl := s.SendTimeout(); dl > 0 { deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) haveDeadline = true } else if dl < 0 { flags |= linux.MSG_DONTWAIT } // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file) // Control messages should be released on error as well as for zero-length // messages, which are discarded by the receiver. if n == 0 || err != nil { controlMessages.Release(t) } return uintptr(n), err } // sendTo is the implementation of the sendto syscall. It is called by sendto // and send syscall handlers. func sendTo(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLen uint32) (uintptr, error) { bl := int(bufLen) if bl < 0 { return 0, linuxerr.EINVAL } // Get socket from the file descriptor. file := t.GetFileVFS2(fd) if file == nil { return 0, linuxerr.EBADF } defer file.DecRef(t) // Extract the socket.
s, ok := file.Impl().(socket.SocketVFS2) if !ok { return 0, syserror.ENOTSOCK } if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { flags |= linux.MSG_DONTWAIT } // Read the destination address if one is specified. var to []byte var err error if namePtr != 0 { to, err = CaptureAddress(t, namePtr, nameLen) if err != nil { return 0, err } } src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{ AddressSpaceActive: true, }) if err != nil { return 0, err } var haveDeadline bool var deadline ktime.Time if dl := s.SendTimeout(); dl > 0 { deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) haveDeadline = true } else if dl < 0 { flags |= linux.MSG_DONTWAIT } // Call the syscall implementation. n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)}) return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendto", file) } // SendTo implements the linux syscall sendto(2). func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() bufPtr := args[1].Pointer() bufLen := args[2].Uint64() flags := args[3].Int() namePtr := args[4].Pointer() nameLen := args[5].Uint() n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen) return n, nil, err }
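// The handlers above all share one timeout convention: RecvTimeout and
// SendTimeout return a value in nanoseconds, where a positive value becomes
// an absolute deadline, a negative value means "never block" (MSG_DONTWAIT),
// and zero means block indefinitely. timeoutToDeadline is an illustrative
// helper (not part of the original file) that makes the mapping explicit;
// the handlers inline this logic rather than calling a helper.
func timeoutToDeadline(now ktime.Time, dl int64, flags int32) (bool, ktime.Time, int32) {
	switch {
	case dl > 0:
		// Convert the relative timeout to an absolute deadline.
		return true, now.Add(time.Duration(dl) * time.Nanosecond), flags
	case dl < 0:
		// Negative timeout: fail immediately rather than blocking.
		return false, ktime.Time{}, flags | linux.MSG_DONTWAIT
	default:
		// Zero timeout: block until the operation completes.
		return false, ktime.Time{}, flags
	}
}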
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package hostarch import ( "bytes" "fmt" "unsafe" "gvisor.dev/gvisor/pkg/gohacks" ) // An AddrRangeSeq represents a sequence of AddrRanges. // // AddrRangeSeqs are immutable and may be copied by value. The zero value of // AddrRangeSeq represents an empty sequence. // // An AddrRangeSeq may contain AddrRanges with a length of 0. This is necessary // since zero-length AddrRanges are significant to MM bounds checks. type AddrRangeSeq struct { // If length is 0, then the AddrRangeSeq represents no AddrRanges. // Invariants: data == 0; offset == 0; limit == 0. // // If length is 1, then the AddrRangeSeq represents the single // AddrRange{offset, offset+limit}. Invariants: data == 0. // // Otherwise, length >= 2, and the AddrRangeSeq represents the `length` // AddrRanges in the array of AddrRanges starting at address `data`, // starting at `offset` bytes into the first AddrRange and limited to the // following `limit` bytes. (AddrRanges after `limit` are still iterated, // but are truncated to a length of 0.) Invariants: data != 0; offset <= // data[0].Length(); limit > 0; offset+limit <= the combined length of all // AddrRanges in the array. data unsafe.Pointer length int offset Addr limit Addr } // AddrRangeSeqOf returns an AddrRangeSeq representing the single AddrRange ar. func AddrRangeSeqOf(ar AddrRange) AddrRangeSeq { return AddrRangeSeq{ length: 1, offset: ar.Start, limit: ar.Length(), } } // AddrRangeSeqFromSlice returns an AddrRangeSeq representing all AddrRanges in // slice. // // Whether the returned AddrRangeSeq shares memory with slice is unspecified; // clients should avoid mutating slices passed to AddrRangeSeqFromSlice. // // Preconditions: The combined length of all AddrRanges in slice <= // math.MaxInt64.
func AddrRangeSeqFromSlice(slice []AddrRange) AddrRangeSeq { var limit int64 for _, ar := range slice { len64 := int64(ar.Length()) if len64 < 0 { panic(fmt.Sprintf("Length of AddrRange %v overflows int64", ar)) } sum := limit + len64 if sum < limit { panic(fmt.Sprintf("Total length of AddrRanges %v overflows int64", slice)) } limit = sum } return addrRangeSeqFromSliceLimited(slice, limit) } // Preconditions: // * The combined length of all AddrRanges in slice <= limit. // * limit >= 0. // * If len(slice) != 0, then limit > 0. func addrRangeSeqFromSliceLimited(slice []AddrRange, limit int64) AddrRangeSeq { switch len(slice) { case 0: return AddrRangeSeq{} case 1: return AddrRangeSeq{ length: 1, offset: slice[0].Start, limit: Addr(limit), } default: return AddrRangeSeq{ data: unsafe.Pointer(&slice[0]), length: len(slice), limit: Addr(limit), } } } // IsEmpty returns true if ars.NumRanges() == 0. // // Note that since AddrRangeSeq may contain AddrRanges with a length of zero, // an AddrRangeSeq representing 0 bytes (AddrRangeSeq.NumBytes() == 0) is not // necessarily empty. func (ars AddrRangeSeq) IsEmpty() bool { return ars.length == 0 } // NumRanges returns the number of AddrRanges in ars. func (ars AddrRangeSeq) NumRanges() int { return ars.length } // NumBytes returns the number of bytes represented by ars. func (ars AddrRangeSeq) NumBytes() int64 { return int64(ars.limit) } // Head returns the first AddrRange in ars. // // Preconditions: !ars.IsEmpty(). func (ars AddrRangeSeq) Head() AddrRange { if ars.length == 0 { panic("empty AddrRangeSeq") } if ars.length == 1 { return AddrRange{ars.offset, ars.offset + ars.limit} } ar := *(*AddrRange)(ars.data) ar.Start += ars.offset if ar.Length() > ars.limit { ar.End = ar.Start + ars.limit } return ar } // Tail returns an AddrRangeSeq consisting of all AddrRanges in ars after the // first. // // Preconditions: !ars.IsEmpty(). func (ars AddrRangeSeq) Tail() AddrRangeSeq { if ars.length == 0 { panic("empty AddrRangeSeq") } if ars.length == 1 { return AddrRangeSeq{} } return ars.externalTail() } // Preconditions: ars.length >= 2. func (ars AddrRangeSeq) externalTail() AddrRangeSeq { headLen := (*AddrRange)(ars.data).Length() - ars.offset var tailLimit int64 if ars.limit > headLen { tailLimit = int64(ars.limit - headLen) } var extSlice []AddrRange extSliceHdr := (*gohacks.SliceHeader)(unsafe.Pointer(&extSlice)) extSliceHdr.Data = ars.data extSliceHdr.Len = ars.length extSliceHdr.Cap = ars.length return addrRangeSeqFromSliceLimited(extSlice[1:], tailLimit) } // DropFirst returns an AddrRangeSeq equivalent to ars, but with the first n // bytes omitted. If n > ars.NumBytes(), DropFirst returns an empty // AddrRangeSeq. // // If !ars.IsEmpty() and ars.Head().Length() == 0, DropFirst will always omit // at least ars.Head(), even if n == 0. This guarantees that the basic pattern // of: // // for !ars.IsEmpty() { // n, err = doIOWith(ars.Head()) // if err != nil { // return err // } // ars = ars.DropFirst(n) // } // // works even in the presence of zero-length AddrRanges. // // Preconditions: n >= 0. func (ars AddrRangeSeq) DropFirst(n int) AddrRangeSeq { if n < 0 { panic(fmt.Sprintf("invalid n: %d", n)) } return ars.DropFirst64(int64(n)) } // DropFirst64 is equivalent to DropFirst but takes an int64. func (ars AddrRangeSeq) DropFirst64(n int64) AddrRangeSeq { if n < 0 { panic(fmt.Sprintf("invalid n: %d", n)) } if Addr(n) > ars.limit { return AddrRangeSeq{} } // Handle initial empty AddrRange.
switch ars.length { case 0: return AddrRangeSeq{} case 1: if ars.limit == 0 { return AddrRangeSeq{} } default: if rawHeadLen := (*AddrRange)(ars.data).Length(); ars.offset == rawHeadLen { ars = ars.externalTail() } } for n != 0 { // Calling ars.Head() here is surprisingly expensive, so inline getting // the head's length. var headLen Addr if ars.length == 1 { headLen = ars.limit } else { headLen = (*AddrRange)(ars.data).Length() - ars.offset } if Addr(n) < headLen { // Dropping ends partway through the head AddrRange. ars.offset += Addr(n) ars.limit -= Addr(n) return ars } n -= int64(headLen) ars = ars.Tail() } return ars } // TakeFirst returns an AddrRangeSeq equivalent to ars, but iterating at most n // bytes. TakeFirst never removes AddrRanges from ars; AddrRanges beyond the // first n bytes are reduced to a length of zero, but will still be iterated. // // Preconditions: n >= 0. func (ars AddrRangeSeq) TakeFirst(n int) AddrRangeSeq { if n < 0 { panic(fmt.Sprintf("invalid n: %d", n)) } return ars.TakeFirst64(int64(n)) } // TakeFirst64 is equivalent to TakeFirst but takes an int64. func (ars AddrRangeSeq) TakeFirst64(n int64) AddrRangeSeq { if n < 0 { panic(fmt.Sprintf("invalid n: %d", n)) } if ars.limit > Addr(n) { ars.limit = Addr(n) } return ars } // String implements fmt.Stringer.String. func (ars AddrRangeSeq) String() string { // This is deliberately chosen to be the same as fmt's automatic stringer // for []AddrRange. var buf bytes.Buffer buf.WriteByte('[') var sep string for !ars.IsEmpty() { buf.WriteString(sep) sep = " " buf.WriteString(ars.Head().String()) ars = ars.Tail() } buf.WriteByte(']') return buf.String() }
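// totalBytes is an illustrative example (not part of the original file)
// showing the Head/DropFirst consumption pattern documented on DropFirst
// above. Because DropFirst64 always drops a zero-length head even when
// n == 0, the loop terminates for any sequence; the sum it computes equals
// ars.NumBytes().
func totalBytes(ars AddrRangeSeq) int64 {
	var n int64
	for !ars.IsEmpty() {
		// In real I/O code, ars.Head() would be passed to the operation and
		// the number of bytes actually handled would be dropped.
		headLen := int64(ars.Head().Length())
		n += headLen
		ars = ars.DropFirst64(headLen)
	}
	return n
}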
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package tcp import ( "fmt" "sync/atomic" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // queueFlags are used to indicate which queue of an endpoint a particular segment // belongs to. This is used to track memory accounting correctly. type queueFlags uint8 const ( recvQ queueFlags = 1 << iota sendQ ) // segment represents a TCP segment. It holds the payload and parsed TCP segment // information, and can be added to intrusive lists. // segment is mostly immutable; the only field allowed to change is data. // // +stateify savable type segment struct { segmentEntry refCnt int32 ep *endpoint qFlags queueFlags id stack.TransportEndpointID `state:"manual"` // TODO(gvisor.dev/issue/4417): Hold a stack.PacketBuffer instead of // individual members for link/network packet info. srcAddr tcpip.Address dstAddr tcpip.Address netProto tcpip.NetworkProtocolNumber nicID tcpip.NICID data buffer.VectorisedView `state:".(buffer.VectorisedView)"` hdr header.TCP // views is used as a buffer for data when its length is large // enough to store a VectorisedView. views [8]buffer.View `state:"nosave"` sequenceNumber seqnum.Value ackNumber seqnum.Value flags header.TCPFlags window seqnum.Size // csum is only populated for received segments. csum uint16 // csumValid is true if the csum in the received segment is valid. csumValid bool // parsedOptions stores the parsed values from the options in the segment. parsedOptions header.TCPOptions options []byte `state:".([]byte)"` hasNewSACKInfo bool rcvdTime tcpip.MonotonicTime // xmitTime is the last transmit time of this segment. xmitTime tcpip.MonotonicTime xmitCount uint32 // acked indicates if the segment has already been SACKed. acked bool // dataMemSize is the memory used by data initially. dataMemSize int // lost indicates if the segment is marked as lost by RACK.
lost bool } func newIncomingSegment(id stack.TransportEndpointID, clock tcpip.Clock, pkt *stack.PacketBuffer) *segment { netHdr := pkt.Network() s := &segment{ refCnt: 1, id: id, srcAddr: netHdr.SourceAddress(), dstAddr: netHdr.DestinationAddress(), netProto: pkt.NetworkProtocolNumber, nicID: pkt.NICID, } s.data = pkt.Data().ExtractVV().Clone(s.views[:]) s.hdr = header.TCP(pkt.TransportHeader().View()) s.rcvdTime = clock.NowMonotonic() s.dataMemSize = s.data.Size() return s } func newOutgoingSegment(id stack.TransportEndpointID, clock tcpip.Clock, v buffer.View) *segment { s := &segment{ refCnt: 1, id: id, } s.rcvdTime = clock.NowMonotonic() if len(v) != 0 { s.views[0] = v s.data = buffer.NewVectorisedView(len(v), s.views[:1]) } s.dataMemSize = s.data.Size() return s } func (s *segment) clone() *segment { t := &segment{ refCnt: 1, id: s.id, sequenceNumber: s.sequenceNumber, ackNumber: s.ackNumber, flags: s.flags, window: s.window, netProto: s.netProto, nicID: s.nicID, rcvdTime: s.rcvdTime, xmitTime: s.xmitTime, xmitCount: s.xmitCount, ep: s.ep, qFlags: s.qFlags, dataMemSize: s.dataMemSize, } t.data = s.data.Clone(t.views[:]) return t } // merge merges data in oth and clears oth. func (s *segment) merge(oth *segment) { s.data.Append(oth.data) s.dataMemSize = s.data.Size() oth.data = buffer.VectorisedView{} oth.dataMemSize = oth.data.Size() } // setOwner sets the owning endpoint for this segment. It is required // to be called to ensure memory accounting for receive/send buffer // queues is done properly. func (s *segment) setOwner(ep *endpoint, qFlags queueFlags) { switch qFlags { case recvQ: ep.updateReceiveMemUsed(s.segMemSize()) case sendQ: // no memory accounting for sendQ yet. default: panic(fmt.Sprintf("unexpected queue flag %b", qFlags)) } s.ep = ep s.qFlags = qFlags } func (s *segment) decRef() { if atomic.AddInt32(&s.refCnt, -1) == 0 { if s.ep != nil { switch s.qFlags { case recvQ: s.ep.updateReceiveMemUsed(-s.segMemSize()) case sendQ: // no memory accounting for sendQ yet. default: panic(fmt.Sprintf("unexpected queue flag %b set for segment", s.qFlags)) } } } } func (s *segment) incRef() { atomic.AddInt32(&s.refCnt, 1) } // logicalLen is the segment length in the sequence number space. It's defined // as the data length plus one for each of the SYN and FIN bits set. func (s *segment) logicalLen() seqnum.Size { l := seqnum.Size(s.data.Size()) if s.flags.Contains(header.TCPFlagSyn) { l++ } if s.flags.Contains(header.TCPFlagFin) { l++ } return l } // payloadSize is the size of s.data. func (s *segment) payloadSize() int { return s.data.Size() } // segMemSize is the amount of memory used to hold the segment data and // the associated metadata. func (s *segment) segMemSize() int { return SegSize + s.dataMemSize } // parse populates the sequence & ack numbers, flags, and window fields of the // segment from the TCP header stored in the data. It then updates the view to // skip the header. // // Returns a boolean indicating if the parsing was successful. // // If checksum verification is not skipped, parse also verifies the // TCP checksum and stores the checksum and result of checksum verification in // the csum and csumValid fields of the segment. func (s *segment) parse(skipChecksumValidation bool) bool { // h is the header followed by the payload. We check that the offset to // the data respects the following constraints: // 1. That it's at least the minimum header size; if we don't do this // then part of the header would be delivered to the user. // 2.
That the header fits within the buffer; if we don't do this, we // would panic when we tried to access data beyond the buffer. // // N.B. The segment has already been validated as having at least the // minimum TCP size before reaching here, so it's safe to read the // fields. offset := int(s.hdr.DataOffset()) if offset < header.TCPMinimumSize || offset > len(s.hdr) { return false } s.options = s.hdr[header.TCPMinimumSize:] s.parsedOptions = header.ParseTCPOptions(s.options) if skipChecksumValidation { s.csumValid = true } else { s.csum = s.hdr.Checksum() payloadChecksum := header.ChecksumVV(s.data, 0) payloadLength := uint16(s.data.Size()) s.csumValid = s.hdr.IsChecksumValid(s.srcAddr, s.dstAddr, payloadChecksum, payloadLength) } s.sequenceNumber = seqnum.Value(s.hdr.SequenceNumber()) s.ackNumber = seqnum.Value(s.hdr.AckNumber()) s.flags = s.hdr.Flags() s.window = seqnum.Size(s.hdr.WindowSize()) return true } // sackBlock returns a header.SACKBlock that represents this segment. func (s *segment) sackBlock() header.SACKBlock { return header.SACKBlock{Start: s.sequenceNumber, End: s.sequenceNumber.Add(s.logicalLen())} }
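// exampleSACKRange is an illustrative sketch (not part of the original
// file) of how logicalLen and sackBlock relate: a segment at sequence
// number 1000 carrying 100 bytes of payload with the FIN flag set has a
// logical length of 101, since the FIN consumes one sequence number after
// the data, so its SACK block covers [1000, 1101).
func exampleSACKRange() header.SACKBlock {
	start := seqnum.Value(1000)
	logical := seqnum.Size(100 + 1) // 100 payload bytes + 1 for FIN.
	return header.SACKBlock{Start: start, End: start.Add(logical)}
}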
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package netstack provides an implementation of the socket.Socket interface // that is backed by a tcpip.Endpoint.
// // It does not depend on any particular endpoint implementation, and thus can // be used to expose certain endpoints to the sentry while leaving others out, // for example, TCP endpoints and Unix-domain endpoints. // // Lock ordering: netstack => mm: ioSequenceReadWriter copies user memory inside // tcpip.Endpoint.Write(). Netstack is allowed to (and does) hold locks during // this operation. package netstack import ( "bytes" "encoding/binary" "fmt" "io" "io/ioutil" "math" "reflect" "time" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/metric" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fs/fsutil" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/socket" "gvisor.dev/gvisor/pkg/sentry/socket/netfilter" "gvisor.dev/gvisor/pkg/sentry/unimpl" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/tcp" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) func mustCreateMetric(name, description string) *tcpip.StatCounter { var cm tcpip.StatCounter metric.MustRegisterCustomUint64Metric(name, true /* cumulative */, false /* sync */, description, cm.Value) return &cm } func mustCreateGauge(name, description string) *tcpip.StatCounter { var cm tcpip.StatCounter metric.MustRegisterCustomUint64Metric(name, false /* cumulative */, false /* sync */, description, cm.Value) return &cm } // Metrics contains metrics exported by netstack. 
var Metrics = tcpip.Stats{ DroppedPackets: mustCreateMetric("/netstack/dropped_packets", "Number of packets dropped at the transport layer."), NICs: tcpip.NICStats{ UnknownL3ProtocolRcvdPackets: mustCreateMetric("/netstack/nic/unknown_l3_protocol_received_packets", "Number of packets received that were for an unknown or unsupported L3 protocol."), UnknownL4ProtocolRcvdPackets: mustCreateMetric("/netstack/nic/unknown_l4_protocol_received_packets", "Number of packets received that were for an unknown or unsupported L4 protocol."), MalformedL4RcvdPackets: mustCreateMetric("/netstack/nic/malformed_l4_received_packets", "Number of packets received that failed L4 header parsing."), Tx: tcpip.NICPacketStats{ Packets: mustCreateMetric("/netstack/nic/tx/packets", "Number of packets transmitted."), Bytes: mustCreateMetric("/netstack/nic/tx/bytes", "Number of bytes transmitted."), }, Rx: tcpip.NICPacketStats{ Packets: mustCreateMetric("/netstack/nic/rx/packets", "Number of packets received."), Bytes: mustCreateMetric("/netstack/nic/rx/bytes", "Number of bytes received."), }, DisabledRx: tcpip.NICPacketStats{ Packets: mustCreateMetric("/netstack/nic/disabled_rx/packets", "Number of packets received on disabled NICs."), Bytes: mustCreateMetric("/netstack/nic/disabled_rx/bytes", "Number of bytes received on disabled NICs."), }, Neighbor: tcpip.NICNeighborStats{ UnreachableEntryLookups: mustCreateMetric("/netstack/nic/neighbor/unreachable_entry_loopups", "Number of lookups performed on a neighbor entry in Unreachable state."), }, }, ICMP: tcpip.ICMPStats{ V4: tcpip.ICMPv4Stats{ PacketsSent: tcpip.ICMPv4SentPacketStats{ ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ EchoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_request", "Number of ICMPv4 echo request packets sent."), EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/echo_reply", "Number of ICMPv4 echo reply packets sent."), DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_sent/dst_unreachable", "Number of ICMPv4 destination unreachable packets sent."), SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_sent/src_quench", "Number of ICMPv4 source quench packets sent."), Redirect: mustCreateMetric("/netstack/icmp/v4/packets_sent/redirect", "Number of ICMPv4 redirect packets sent."), TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_sent/time_exceeded", "Number of ICMPv4 time exceeded packets sent."), ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_sent/param_problem", "Number of ICMPv4 parameter problem packets sent."), Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp", "Number of ICMPv4 timestamp packets sent."), TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/timestamp_reply", "Number of ICMPv4 timestamp reply packets sent."), InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_request", "Number of ICMPv4 information request packets sent."), InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_sent/info_reply", "Number of ICMPv4 information reply packets sent."), }, Dropped: mustCreateMetric("/netstack/icmp/v4/packets_sent/dropped", "Number of ICMPv4 packets dropped due to link layer errors."), RateLimited: mustCreateMetric("/netstack/icmp/v4/packets_sent/rate_limited", "Number of ICMPv4 packets dropped due to rate limit being exceeded."), }, PacketsReceived: tcpip.ICMPv4ReceivedPacketStats{ ICMPv4PacketStats: tcpip.ICMPv4PacketStats{ EchoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_request", "Number of ICMPv4 echo 
request packets received."), EchoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/echo_reply", "Number of ICMPv4 echo reply packets received."), DstUnreachable: mustCreateMetric("/netstack/icmp/v4/packets_received/dst_unreachable", "Number of ICMPv4 destination unreachable packets received."), SrcQuench: mustCreateMetric("/netstack/icmp/v4/packets_received/src_quench", "Number of ICMPv4 source quench packets received."), Redirect: mustCreateMetric("/netstack/icmp/v4/packets_received/redirect", "Number of ICMPv4 redirect packets received."), TimeExceeded: mustCreateMetric("/netstack/icmp/v4/packets_received/time_exceeded", "Number of ICMPv4 time exceeded packets received."), ParamProblem: mustCreateMetric("/netstack/icmp/v4/packets_received/param_problem", "Number of ICMPv4 parameter problem packets received."), Timestamp: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp", "Number of ICMPv4 timestamp packets received."), TimestampReply: mustCreateMetric("/netstack/icmp/v4/packets_received/timestamp_reply", "Number of ICMPv4 timestamp reply packets received."), InfoRequest: mustCreateMetric("/netstack/icmp/v4/packets_received/info_request", "Number of ICMPv4 information request packets received."), InfoReply: mustCreateMetric("/netstack/icmp/v4/packets_received/info_reply", "Number of ICMPv4 information reply packets received."), }, Invalid: mustCreateMetric("/netstack/icmp/v4/packets_received/invalid", "Number of ICMPv4 packets received that the transport layer could not parse."), }, }, V6: tcpip.ICMPv6Stats{ PacketsSent: tcpip.ICMPv6SentPacketStats{ ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_request", "Number of ICMPv6 echo request packets sent."), EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_sent/echo_reply", "Number of ICMPv6 echo reply packets sent."), DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_sent/dst_unreachable", "Number of ICMPv6 destination unreachable packets sent."), PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_sent/packet_too_big", "Number of ICMPv6 packet too big packets sent."), TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_sent/time_exceeded", "Number of ICMPv6 time exceeded packets sent."), ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_sent/param_problem", "Number of ICMPv6 parameter problem packets sent."), RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_solicit", "Number of ICMPv6 router solicit packets sent."), RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/router_advert", "Number of ICMPv6 router advert packets sent."), NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets sent."), NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_sent/neighbor_advert", "Number of ICMPv6 neighbor advert packets sent."), RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_sent/redirect_msg", "Number of ICMPv6 redirect message packets sent."), MulticastListenerQuery: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_query", "Number of ICMPv6 multicast listener query packets sent."), MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."), MulticastListenerDone: mustCreateMetric("/netstack/icmp/v6/packets_sent/multicast_listener_done", "Number of ICMPv6 multicast listener done packets 
sent."), }, Dropped: mustCreateMetric("/netstack/icmp/v6/packets_sent/dropped", "Number of ICMPv6 packets dropped due to link layer errors."), RateLimited: mustCreateMetric("/netstack/icmp/v6/packets_sent/rate_limited", "Number of ICMPv6 packets dropped due to rate limit being exceeded."), }, PacketsReceived: tcpip.ICMPv6ReceivedPacketStats{ ICMPv6PacketStats: tcpip.ICMPv6PacketStats{ EchoRequest: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_request", "Number of ICMPv6 echo request packets received."), EchoReply: mustCreateMetric("/netstack/icmp/v6/packets_received/echo_reply", "Number of ICMPv6 echo reply packets received."), DstUnreachable: mustCreateMetric("/netstack/icmp/v6/packets_received/dst_unreachable", "Number of ICMPv6 destination unreachable packets received."), PacketTooBig: mustCreateMetric("/netstack/icmp/v6/packets_received/packet_too_big", "Number of ICMPv6 packet too big packets received."), TimeExceeded: mustCreateMetric("/netstack/icmp/v6/packets_received/time_exceeded", "Number of ICMPv6 time exceeded packets received."), ParamProblem: mustCreateMetric("/netstack/icmp/v6/packets_received/param_problem", "Number of ICMPv6 parameter problem packets received."), RouterSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/router_solicit", "Number of ICMPv6 router solicit packets received."), RouterAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/router_advert", "Number of ICMPv6 router advert packets received."), NeighborSolicit: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_solicit", "Number of ICMPv6 neighbor solicit packets received."), NeighborAdvert: mustCreateMetric("/netstack/icmp/v6/packets_received/neighbor_advert", "Number of ICMPv6 neighbor advert packets received."), RedirectMsg: mustCreateMetric("/netstack/icmp/v6/packets_received/redirect_msg", "Number of ICMPv6 redirect message packets received."), MulticastListenerQuery: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_query", "Number of ICMPv6 multicast listener query packets received."), MulticastListenerReport: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_report", "Number of ICMPv6 multicast listener report packets sent."), MulticastListenerDone: mustCreateMetric("/netstack/icmp/v6/packets_received/multicast_listener_done", "Number of ICMPv6 multicast listener done packets sent."), }, Unrecognized: mustCreateMetric("/netstack/icmp/v6/packets_received/unrecognized", "Number of ICMPv6 packets received that the transport layer does not know how to parse."), Invalid: mustCreateMetric("/netstack/icmp/v6/packets_received/invalid", "Number of ICMPv6 packets received that the transport layer could not parse."), RouterOnlyPacketsDroppedByHost: mustCreateMetric("/netstack/icmp/v6/packets_received/router_only_packets_dropped_by_host", "Number of ICMPv6 packets dropped due to being router-specific packets."), }, }, }, IGMP: tcpip.IGMPStats{ PacketsSent: tcpip.IGMPSentPacketStats{ IGMPPacketStats: tcpip.IGMPPacketStats{ MembershipQuery: mustCreateMetric("/netstack/igmp/packets_sent/membership_query", "Number of IGMP Membership Query messages sent."), V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v1_membership_report", "Number of IGMPv1 Membership Report messages sent."), V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_sent/v2_membership_report", "Number of IGMPv2 Membership Report messages sent."), LeaveGroup: mustCreateMetric("/netstack/igmp/packets_sent/leave_group", "Number of IGMP 
Leave Group messages sent."), }, Dropped: mustCreateMetric("/netstack/igmp/packets_sent/dropped", "Number of IGMP packets dropped due to link layer errors."), }, PacketsReceived: tcpip.IGMPReceivedPacketStats{ IGMPPacketStats: tcpip.IGMPPacketStats{ MembershipQuery: mustCreateMetric("/netstack/igmp/packets_received/membership_query", "Number of IGMP Membership Query messages received."), V1MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v1_membership_report", "Number of IGMPv1 Membership Report messages received."), V2MembershipReport: mustCreateMetric("/netstack/igmp/packets_received/v2_membership_report", "Number of IGMPv2 Membership Report messages received."), LeaveGroup: mustCreateMetric("/netstack/igmp/packets_received/leave_group", "Number of IGMP Leave Group messages received."), }, Invalid: mustCreateMetric("/netstack/igmp/packets_received/invalid", "Number of IGMP packets received that could not be parsed."), ChecksumErrors: mustCreateMetric("/netstack/igmp/packets_received/checksum_errors", "Number of received IGMP packets with bad checksums."), Unrecognized: mustCreateMetric("/netstack/igmp/packets_received/unrecognized", "Number of unrecognized IGMP packets received."), }, }, IP: tcpip.IPStats{ PacketsReceived: mustCreateMetric("/netstack/ip/packets_received", "Number of IP packets received from the link layer in nic.DeliverNetworkPacket."), DisabledPacketsReceived: mustCreateMetric("/netstack/ip/disabled_packets_received", "Number of IP packets received from the link layer when the IP layer is disabled."), InvalidDestinationAddressesReceived: mustCreateMetric("/netstack/ip/invalid_addresses_received", "Number of IP packets received with an unknown or invalid destination address."), InvalidSourceAddressesReceived: mustCreateMetric("/netstack/ip/invalid_source_addresses_received", "Number of IP packets received with an unknown or invalid source address."), PacketsDelivered: mustCreateMetric("/netstack/ip/packets_delivered", "Number of incoming IP packets that are successfully delivered to the transport layer via HandlePacket."), PacketsSent: mustCreateMetric("/netstack/ip/packets_sent", "Number of IP packets sent via WritePacket."), OutgoingPacketErrors: mustCreateMetric("/netstack/ip/outgoing_packet_errors", "Number of IP packets which failed to write to a link-layer endpoint."), MalformedPacketsReceived: mustCreateMetric("/netstack/ip/malformed_packets_received", "Number of IP packets which failed IP header validation checks."), MalformedFragmentsReceived: mustCreateMetric("/netstack/ip/malformed_fragments_received", "Number of IP fragments which failed IP fragment validation checks."), IPTablesPreroutingDropped: mustCreateMetric("/netstack/ip/iptables/prerouting_dropped", "Number of IP packets dropped in the Prerouting chain."), IPTablesInputDropped: mustCreateMetric("/netstack/ip/iptables/input_dropped", "Number of IP packets dropped in the Input chain."), IPTablesOutputDropped: mustCreateMetric("/netstack/ip/iptables/output_dropped", "Number of IP packets dropped in the Output chain."), OptionTimestampReceived: mustCreateMetric("/netstack/ip/options/timestamp_received", "Number of timestamp options found in received IP packets."), OptionRecordRouteReceived: mustCreateMetric("/netstack/ip/options/record_route_received", "Number of record route options found in received IP packets."), OptionRouterAlertReceived: mustCreateMetric("/netstack/ip/options/router_alert_received", "Number of router alert options found in received IP packets."), 
OptionUnknownReceived: mustCreateMetric("/netstack/ip/options/unknown_received", "Number of unknown options found in received IP packets."), Forwarding: tcpip.IPForwardingStats{ Unrouteable: mustCreateMetric("/netstack/ip/forwarding/unrouteable", "Number of IP packets received which couldn't be routed and thus were not forwarded."), ExhaustedTTL: mustCreateMetric("/netstack/ip/forwarding/exhausted_ttl", "Number of IP packets received which could not be forwarded due to an exhausted TTL."), LinkLocalSource: mustCreateMetric("/netstack/ip/forwarding/link_local_source_address", "Number of IP packets received which could not be forwarded due to a link-local source address."), LinkLocalDestination: mustCreateMetric("/netstack/ip/forwarding/link_local_destination_address", "Number of IP packets received which could not be forwarded due to a link-local destination address."), ExtensionHeaderProblem: mustCreateMetric("/netstack/ip/forwarding/extension_header_problem", "Number of IP packets received which could not be forwarded due to a problem processing their IPv6 extension headers."), PacketTooBig: mustCreateMetric("/netstack/ip/forwarding/packet_too_big", "Number of IP packets received which could not be forwarded because they could not fit within the outgoing MTU."), HostUnreachable: mustCreateMetric("/netstack/ip/forwarding/host_unreachable", "Number of IP packets received which could not be forwarded due to an unresolvable next hop."), Errors: mustCreateMetric("/netstack/ip/forwarding/errors", "Number of IP packets which couldn't be forwarded."), }, }, ARP: tcpip.ARPStats{ PacketsReceived: mustCreateMetric("/netstack/arp/packets_received", "Number of ARP packets received from the link layer."), DisabledPacketsReceived: mustCreateMetric("/netstack/arp/disabled_packets_received", "Number of ARP packets received from the link layer when the ARP layer is disabled."), MalformedPacketsReceived: mustCreateMetric("/netstack/arp/malformed_packets_received", "Number of ARP packets which failed ARP header validation checks."), RequestsReceived: mustCreateMetric("/netstack/arp/requests_received", "Number of ARP requests received."), RequestsReceivedUnknownTargetAddress: mustCreateMetric("/netstack/arp/requests_received_unknown_addr", "Number of ARP requests received with an unknown target address."), OutgoingRequestInterfaceHasNoLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_iface_has_no_addr", "Number of failed attempts to send an ARP request with an interface that has no network address."), OutgoingRequestBadLocalAddressErrors: mustCreateMetric("/netstack/arp/outgoing_requests_invalid_local_addr", "Number of failed attempts to send an ARP request with a provided local address that is invalid."), OutgoingRequestsDropped: mustCreateMetric("/netstack/arp/outgoing_requests_dropped", "Number of ARP requests which failed to write to a link-layer endpoint."), OutgoingRequestsSent: mustCreateMetric("/netstack/arp/outgoing_requests_sent", "Number of ARP requests sent."), RepliesReceived: mustCreateMetric("/netstack/arp/replies_received", "Number of ARP replies received."), OutgoingRepliesDropped: mustCreateMetric("/netstack/arp/outgoing_replies_dropped", "Number of ARP replies which failed to write to a link-layer endpoint."), OutgoingRepliesSent: mustCreateMetric("/netstack/arp/outgoing_replies_sent", "Number of ARP replies sent."), }, TCP: tcpip.TCPStats{ ActiveConnectionOpenings: mustCreateMetric("/netstack/tcp/active_connection_openings", "Number of connections opened successfully via
Connect."), PassiveConnectionOpenings: mustCreateMetric("/netstack/tcp/passive_connection_openings", "Number of connections opened successfully via Listen."), CurrentEstablished: mustCreateGauge("/netstack/tcp/current_established", "Number of connections in ESTABLISHED state now."), CurrentConnected: mustCreateGauge("/netstack/tcp/current_open", "Number of connections that are in connected state."), EstablishedResets: mustCreateMetric("/netstack/tcp/established_resets", "Number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state"), EstablishedClosed: mustCreateMetric("/netstack/tcp/established_closed", "Number of times established TCP connections made a transition to CLOSED state."), EstablishedTimedout: mustCreateMetric("/netstack/tcp/established_timedout", "Number of times an established connection was reset because of keep-alive time out."), ListenOverflowSynDrop: mustCreateMetric("/netstack/tcp/listen_overflow_syn_drop", "Number of times the listen queue overflowed and a SYN was dropped."), ListenOverflowAckDrop: mustCreateMetric("/netstack/tcp/listen_overflow_ack_drop", "Number of times the listen queue overflowed and the final ACK in the handshake was dropped."), ListenOverflowSynCookieSent: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_sent", "Number of times a SYN cookie was sent."), ListenOverflowSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_syn_cookie_rcvd", "Number of times a SYN cookie was received."), ListenOverflowInvalidSynCookieRcvd: mustCreateMetric("/netstack/tcp/listen_overflow_invalid_syn_cookie_rcvd", "Number of times an invalid SYN cookie was received."), FailedConnectionAttempts: mustCreateMetric("/netstack/tcp/failed_connection_attempts", "Number of calls to Connect or Listen (active and passive openings, respectively) that end in an error."), ValidSegmentsReceived: mustCreateMetric("/netstack/tcp/valid_segments_received", "Number of TCP segments received that the transport layer successfully parsed."), InvalidSegmentsReceived: mustCreateMetric("/netstack/tcp/invalid_segments_received", "Number of TCP segments received that the transport layer could not parse."), SegmentsSent: mustCreateMetric("/netstack/tcp/segments_sent", "Number of TCP segments sent."), SegmentSendErrors: mustCreateMetric("/netstack/tcp/segment_send_errors", "Number of TCP segments failed to be sent."), ResetsSent: mustCreateMetric("/netstack/tcp/resets_sent", "Number of TCP resets sent."), ResetsReceived: mustCreateMetric("/netstack/tcp/resets_received", "Number of TCP resets received."), Retransmits: mustCreateMetric("/netstack/tcp/retransmits", "Number of TCP segments retransmitted."), FastRecovery: mustCreateMetric("/netstack/tcp/fast_recovery", "Number of times fast recovery was used to recover from packet loss."), SACKRecovery: mustCreateMetric("/netstack/tcp/sack_recovery", "Number of times SACK recovery was used to recover from packet loss."), TLPRecovery: mustCreateMetric("/netstack/tcp/tlp_recovery", "Number of times tail loss probe triggers recovery from tail loss."), SlowStartRetransmits: mustCreateMetric("/netstack/tcp/slow_start_retransmits", "Number of segments retransmitted in slow start mode."), FastRetransmit: mustCreateMetric("/netstack/tcp/fast_retransmit", "Number of TCP segments which were fast retransmitted."), Timeouts: mustCreateMetric("/netstack/tcp/timeouts", "Number of times RTO expired."), ChecksumErrors: mustCreateMetric("/netstack/tcp/checksum_errors", 
"Number of segments dropped due to bad checksums."), FailedPortReservations: mustCreateMetric("/netstack/tcp/failed_port_reservations", "Number of time TCP failed to reserve a port."), }, UDP: tcpip.UDPStats{ PacketsReceived: mustCreateMetric("/netstack/udp/packets_received", "Number of UDP datagrams received via HandlePacket."), UnknownPortErrors: mustCreateMetric("/netstack/udp/unknown_port_errors", "Number of incoming UDP datagrams dropped because they did not have a known destination port."), ReceiveBufferErrors: mustCreateMetric("/netstack/udp/receive_buffer_errors", "Number of incoming UDP datagrams dropped due to the receiving buffer being in an invalid state."), MalformedPacketsReceived: mustCreateMetric("/netstack/udp/malformed_packets_received", "Number of incoming UDP datagrams dropped due to the UDP header being in a malformed state."), PacketsSent: mustCreateMetric("/netstack/udp/packets_sent", "Number of UDP datagrams sent."), PacketSendErrors: mustCreateMetric("/netstack/udp/packet_send_errors", "Number of UDP datagrams failed to be sent."), ChecksumErrors: mustCreateMetric("/netstack/udp/checksum_errors", "Number of UDP datagrams dropped due to bad checksums."), }, } // DefaultTTL is linux's default TTL. All network protocols in all stacks used // with this package must have this value set as their default TTL. const DefaultTTL = 64 const sizeOfInt32 int = 4 var errStackType = syserr.New("expected but did not receive a netstack.Stack", errno.EINVAL) // commonEndpoint represents the intersection of a tcpip.Endpoint and a // transport.Endpoint. type commonEndpoint interface { // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress and // transport.Endpoint.GetLocalAddress. GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress and // transport.Endpoint.GetRemoteAddress. GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) // Readiness implements tcpip.Endpoint.Readiness and // transport.Endpoint.Readiness. Readiness(mask waiter.EventMask) waiter.EventMask // SetSockOpt implements tcpip.Endpoint.SetSockOpt and // transport.Endpoint.SetSockOpt. SetSockOpt(tcpip.SettableSocketOption) tcpip.Error // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt and // transport.Endpoint.SetSockOptInt. SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error // GetSockOpt implements tcpip.Endpoint.GetSockOpt and // transport.Endpoint.GetSockOpt. GetSockOpt(tcpip.GettableSocketOption) tcpip.Error // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt and // transport.Endpoint.GetSockOpt. GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) // State returns a socket's lifecycle state. The returned value is // protocol-specific and is primarily used for diagnostics. State() uint32 // LastError implements tcpip.Endpoint.LastError and // transport.Endpoint.LastError. LastError() tcpip.Error // SocketOptions implements tcpip.Endpoint.SocketOptions and // transport.Endpoint.SocketOptions. SocketOptions() *tcpip.SocketOptions } // LINT.IfChange // SocketOperations encapsulates all the state needed to represent a network stack // endpoint in the kernel context. 
// // +stateify savable type SocketOperations struct { fsutil.FilePipeSeek `state:"nosave"` fsutil.FileNotDirReaddir `state:"nosave"` fsutil.FileNoopFlush `state:"nosave"` fsutil.FileNoFsync `state:"nosave"` fsutil.FileNoMMap `state:"nosave"` fsutil.FileUseInodeUnstableAttr `state:"nosave"` socketOpsCommon } // socketOpsCommon contains the socket operations common to VFS1 and VFS2. // // +stateify savable type socketOpsCommon struct { socket.SendReceiveTimeout *waiter.Queue family int Endpoint tcpip.Endpoint skType linux.SockType protocol int // readMu protects access to the below fields. readMu sync.Mutex `state:"nosave"` // sockOptTimestamp corresponds to SO_TIMESTAMP. When true, timestamps // of returned messages can be returned via control messages. When // false, the same timestamp is instead stored and can be read via the // SIOCGSTAMP ioctl. It is protected by readMu. See socket(7). sockOptTimestamp bool // timestampValid indicates whether timestamp for SIOCGSTAMP has been // set. It is protected by readMu. timestampValid bool // timestampNS holds the timestamp to use with SIOCGSTAMP. It is only // valid when timestampValid is true. It is protected by readMu. timestampNS int64 // TODO(b/153685824): Move this to SocketOptions. // sockOptInq corresponds to TCP_INQ. sockOptInq bool } // New creates a new endpoint socket. func New(t *kernel.Task, family int, skType linux.SockType, protocol int, queue *waiter.Queue, endpoint tcpip.Endpoint) (*fs.File, *syserr.Error) { if skType == linux.SOCK_STREAM { endpoint.SocketOptions().SetDelayOption(true) } dirent := socket.NewDirent(t, netstackDevice) defer dirent.DecRef(t) return fs.NewFile(t, dirent, fs.FileFlags{Read: true, Write: true, NonSeekable: true}, &SocketOperations{ socketOpsCommon: socketOpsCommon{ Queue: queue, family: family, Endpoint: endpoint, skType: skType, protocol: protocol, }, }), nil } var sockAddrInetSize = (*linux.SockAddrInet)(nil).SizeBytes() var sockAddrInet6Size = (*linux.SockAddrInet6)(nil).SizeBytes() var sockAddrLinkSize = (*linux.SockAddrLink)(nil).SizeBytes() // bytesToIPAddress converts an IPv4 or IPv6 address from the user to the // netstack representation, taking the unspecified ("any") address into // account: an all-zeros address is mapped to the empty string. func bytesToIPAddress(addr []byte) tcpip.Address { if bytes.Equal(addr, make([]byte, 4)) || bytes.Equal(addr, make([]byte, 16)) { return "" } return tcpip.Address(addr) } func (s *socketOpsCommon) isPacketBased() bool { return s.skType == linux.SOCK_DGRAM || s.skType == linux.SOCK_SEQPACKET || s.skType == linux.SOCK_RDM || s.skType == linux.SOCK_RAW } // Release implements fs.FileOperations.Release. func (s *socketOpsCommon) Release(ctx context.Context) { e, ch := waiter.NewChannelEntry(nil) s.EventRegister(&e, waiter.EventHUp|waiter.EventErr) defer s.EventUnregister(&e) s.Endpoint.Close() // SO_LINGER option is valid only for TCP. For other socket types // return after endpoint close. if family, skType, _ := s.Type(); skType != linux.SOCK_STREAM || (family != linux.AF_INET && family != linux.AF_INET6) { return } v := s.Endpoint.SocketOptions().GetLinger() // The zero-timeout case is handled in the TCP endpoint's close function. // Close is blocked until either: // 1. The endpoint state is not in any of the states: FIN-WAIT1, // CLOSING and LAST_ACK. // 2. Timeout is reached. if v.Enabled && v.Timeout != 0 { t := kernel.TaskFromContext(ctx) start := t.Kernel().MonotonicClock().Now() deadline := start.Add(v.Timeout) t.BlockWithDeadline(ch, true, deadline) } } // Read implements fs.FileOperations.Read.
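// Editor's note on the flow below: the read delegates to nonBlockingRead and
// translates netstack's syserr.ErrWouldBlock into syserror.ErrWouldBlock,
// which the sentry's blocking-read layer uses to decide whether to wait on
// the wait queue and retry.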
func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { if dst.NumBytes() == 0 { return 0, nil } n, _, _, _, _, err := s.nonBlockingRead(ctx, dst, false, false, false) if err == syserr.ErrWouldBlock { return int64(n), syserror.ErrWouldBlock } if err != nil { return 0, err.ToError() } return int64(n), nil } // WriteTo implements fs.FileOperations.WriteTo. func (s *SocketOperations) WriteTo(ctx context.Context, _ *fs.File, dst io.Writer, count int64, dup bool) (int64, error) { s.readMu.Lock() defer s.readMu.Unlock() w := tcpip.LimitedWriter{ W: dst, N: count, } // This may return a blocking error. res, err := s.Endpoint.Read(&w, tcpip.ReadOptions{ Peek: dup, }) if err != nil { return 0, syserr.TranslateNetstackError(err).ToError() } return int64(res.Count), nil } // Write implements fs.FileOperations.Write. func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { r := src.Reader(ctx) n, err := s.Endpoint.Write(r, tcpip.WriteOptions{}) if _, ok := err.(*tcpip.ErrWouldBlock); ok { return 0, syserror.ErrWouldBlock } if err != nil { return 0, syserr.TranslateNetstackError(err).ToError() } if n < src.NumBytes() { return n, syserror.ErrWouldBlock } return n, nil } var _ tcpip.Payloader = (*limitedPayloader)(nil) type limitedPayloader struct { inner io.LimitedReader err error } func (l *limitedPayloader) Read(p []byte) (int, error) { n, err := l.inner.Read(p) l.err = err return n, err } func (l *limitedPayloader) Len() int { return int(l.inner.N) } // ReadFrom implements fs.FileOperations.ReadFrom. func (s *SocketOperations) ReadFrom(ctx context.Context, _ *fs.File, r io.Reader, count int64) (int64, error) { f := limitedPayloader{ inner: io.LimitedReader{ R: r, N: count, }, } n, err := s.Endpoint.Write(&f, tcpip.WriteOptions{ // Reads may be destructive but should be very fast, // so we can't release the lock while copying data. Atomic: true, }) if _, ok := err.(*tcpip.ErrBadBuffer); ok { return n, f.err } return n, syserr.TranslateNetstackError(err).ToError() } // Readiness returns a mask of ready events for socket s. func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { return s.Endpoint.Readiness(mask) } func (s *socketOpsCommon) checkFamily(family uint16, exact bool) *syserr.Error { if family == uint16(s.family) { return nil } if !exact && family == linux.AF_INET && s.family == linux.AF_INET6 { if !s.Endpoint.SocketOptions().GetV6Only() { return nil } } return syserr.ErrInvalidArgument } // mapFamily maps the AF_INET ANY address to the IPv4-mapped IPv6 ANY if the // receiver's family is AF_INET6. // // This is a hack to work around the fact that both IPv4 and IPv6 ANY are // represented by the empty string. // // TODO(gvisor.dev/issue/1556): remove this function. func (s *socketOpsCommon) mapFamily(addr tcpip.FullAddress, family uint16) tcpip.FullAddress { if len(addr.Addr) == 0 && s.family == linux.AF_INET6 && family == linux.AF_INET { addr.Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00" } return addr } // Connect implements the linux syscall connect(2) for sockets backed by // tcpip.Endpoint.
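// For a non-blocking socket, the guest-visible flow implemented below is the
// usual Linux one (editor's sketch, not part of the original source):
//
//	connect(fd, ...)                     // -> EINPROGRESS (ErrConnectStarted)
//	poll(fd, POLLOUT, ...)               // wait for the handshake to resolve
//	getsockopt(fd, SOL_SOCKET, SO_ERROR) // fetch the final result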
func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { addr, family, err := socket.AddressAndFamily(sockaddr) if err != nil { return err } if family == linux.AF_UNSPEC { err := s.Endpoint.Disconnect() if _, ok := err.(*tcpip.ErrNotSupported); ok { return syserr.ErrAddressFamilyNotSupported } return syserr.TranslateNetstackError(err) } if err := s.checkFamily(family, false /* exact */); err != nil { return err } addr = s.mapFamily(addr, family) // Always return right away in the non-blocking case. if !blocking { return syserr.TranslateNetstackError(s.Endpoint.Connect(addr)) } // Register for notification when the endpoint becomes writable, then // initiate the connection. e, ch := waiter.NewChannelEntry(nil) s.EventRegister(&e, waiter.WritableEvents) defer s.EventUnregister(&e) switch err := s.Endpoint.Connect(addr); err.(type) { case *tcpip.ErrConnectStarted, *tcpip.ErrAlreadyConnecting: case *tcpip.ErrNoPortAvailable: if (s.family == unix.AF_INET || s.family == unix.AF_INET6) && s.skType == linux.SOCK_STREAM { // TCP, unlike UDP, returns EADDRNOTAVAIL when it can't // find an available local ephemeral port. return syserr.ErrAddressNotAvailable } return syserr.TranslateNetstackError(err) default: return syserr.TranslateNetstackError(err) } // It's pending, so we have to wait for a notification, and fetch the // result once the wait completes. if err := t.Block(ch); err != nil { return syserr.FromError(err) } // Call Connect() again after blocking to find connect's result. return syserr.TranslateNetstackError(s.Endpoint.Connect(addr)) } // Bind implements the linux syscall bind(2) for sockets backed by // tcpip.Endpoint. func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { if len(sockaddr) < 2 { return syserr.ErrInvalidArgument } family := hostarch.ByteOrder.Uint16(sockaddr) var addr tcpip.FullAddress // Bind for AF_PACKET requires only family, protocol and ifindex. // AddressAndFamily checks the address length, which is not needed for // an AF_PACKET bind. if family == linux.AF_PACKET { var a linux.SockAddrLink if len(sockaddr) < sockAddrLinkSize { return syserr.ErrInvalidArgument } a.UnmarshalBytes(sockaddr[:sockAddrLinkSize]) if a.Protocol != uint16(s.protocol) { return syserr.ErrInvalidArgument } addr = tcpip.FullAddress{ NIC: tcpip.NICID(a.InterfaceIndex), Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), } } else { var err *syserr.Error addr, family, err = socket.AddressAndFamily(sockaddr) if err != nil { return err } if err = s.checkFamily(family, true /* exact */); err != nil { return err } addr = s.mapFamily(addr, family) } // Issue the bind request to the endpoint. err := s.Endpoint.Bind(addr) if _, ok := err.(*tcpip.ErrNoPortAvailable); ok { // Bind always returns EADDRINUSE irrespective of whether the specified // port was already bound or an ephemeral port was requested but none // were available. // // *tcpip.ErrNoPortAvailable is mapped to EAGAIN in the syserr package // because UDP connect returns EAGAIN on ephemeral port exhaustion. // // TCP connect returns EADDRNOTAVAIL on ephemeral port exhaustion. err = &tcpip.ErrPortInUse{} } return syserr.TranslateNetstackError(err) } // Listen implements the linux syscall listen(2) for sockets backed by // tcpip.Endpoint.
func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error { return syserr.TranslateNetstackError(s.Endpoint.Listen(backlog)) } // blockingAccept implements a blocking version of accept(2), that is, if no // connections are ready to be accepted, it will block until one becomes ready. func (s *socketOpsCommon) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, *syserr.Error) { // Register for notifications. e, ch := waiter.NewChannelEntry(nil) s.EventRegister(&e, waiter.ReadableEvents) defer s.EventUnregister(&e) // Try to accept the connection again; if it fails, then wait until we // get a notification. for { ep, wq, err := s.Endpoint.Accept(peerAddr) if _, ok := err.(*tcpip.ErrWouldBlock); !ok { return ep, wq, syserr.TranslateNetstackError(err) } if err := t.Block(ch); err != nil { return nil, nil, syserr.FromError(err) } } } // Accept implements the linux syscall accept(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { var peerAddr *tcpip.FullAddress if peerRequested { peerAddr = &tcpip.FullAddress{} } ep, wq, terr := s.Endpoint.Accept(peerAddr) if terr != nil { if _, ok := terr.(*tcpip.ErrWouldBlock); !ok || !blocking { return 0, nil, 0, syserr.TranslateNetstackError(terr) } var err *syserr.Error ep, wq, err = s.blockingAccept(t, peerAddr) if err != nil { return 0, nil, 0, err } } ns, err := New(t, s.family, s.skType, s.protocol, wq, ep) if err != nil { return 0, nil, 0, err } defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { flags := ns.Flags() flags.NonBlocking = true ns.SetFlags(flags.Settable()) } var addr linux.SockAddr var addrLen uint32 if peerAddr != nil { addr, addrLen = socket.ConvertAddress(s.family, *peerAddr) } fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, }) t.Kernel().RecordSocket(ns) return fd, addr, addrLen, syserr.FromError(e) } // ConvertShutdown converts Linux shutdown flags into tcpip shutdown flags. func ConvertShutdown(how int) (tcpip.ShutdownFlags, *syserr.Error) { var f tcpip.ShutdownFlags switch how { case linux.SHUT_RD: f = tcpip.ShutdownRead case linux.SHUT_WR: f = tcpip.ShutdownWrite case linux.SHUT_RDWR: f = tcpip.ShutdownRead | tcpip.ShutdownWrite default: return 0, syserr.ErrInvalidArgument } return f, nil } // Shutdown implements the linux syscall shutdown(2) for sockets backed by // tcpip.Endpoint. func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { f, err := ConvertShutdown(how) if err != nil { return err } // Issue shutdown request. return syserr.TranslateNetstackError(s.Endpoint.Shutdown(f)) } // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is // implemented specifically for netstack.SocketOperations rather than // commonEndpoint. commonEndpoint should be extended to support socket // options where the implementation is not shared, as unix sockets need // their own support for SO_TIMESTAMP.
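// Editor's note: per socket(7), with SO_TIMESTAMP enabled timestamps arrive
// as SCM_TIMESTAMP control messages on recvmsg(2); with it disabled, the last
// packet's timestamp remains readable via the SIOCGSTAMP ioctl.
// sockOptTimestamp below tracks which mode is active; a guest would typically
// enable it with something like:
//
//	unix.SetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_TIMESTAMP, 1)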
if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } val := primitive.Int32(0) s.readMu.Lock() defer s.readMu.Unlock() if s.sockOptTimestamp { val = 1 } return &val, nil } if level == linux.SOL_TCP && name == linux.TCP_INQ { if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } val := primitive.Int32(0) s.readMu.Lock() defer s.readMu.Unlock() if s.sockOptInq { val = 1 } return &val, nil } return GetSockOpt(t, s, s.Endpoint, s.family, s.skType, level, name, outPtr, outLen) } // GetSockOpt can be used to implement the linux syscall getsockopt(2) for // sockets backed by a commonEndpoint. func GetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { switch level { case linux.SOL_SOCKET: return getSockOptSocket(t, s, ep, family, skType, name, outLen) case linux.SOL_TCP: return getSockOptTCP(t, s, ep, name, outLen) case linux.SOL_IPV6: return getSockOptIPv6(t, s, ep, name, outPtr, outLen) case linux.SOL_IP: return getSockOptIP(t, s, ep, name, outPtr, outLen, family) case linux.SOL_UDP, linux.SOL_ICMPV6, linux.SOL_RAW, linux.SOL_PACKET: t.Kernel().EmitUnimplementedEvent(t) } return nil, syserr.ErrProtocolNotAvailable } func boolToInt32(v bool) int32 { if v { return 1 } return 0 } // getSockOptSocket implements GetSockOpt when level is SOL_SOCKET. func getSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, family int, skType linux.SockType, name, outLen int) (marshal.Marshallable, *syserr.Error) { // TODO(b/124056281): Stop rejecting short optLen values in getsockopt. switch name { case linux.SO_ERROR: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } // Get the last error and convert it. 
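// Editor's note: for TCP-style endpoints GetLastError is read-and-clear,
// which is what lets the classic non-blocking connect pattern (a hedged
// sketch)
//
//	v, err := unix.GetsockoptInt(fd, unix.SOL_SOCKET, unix.SO_ERROR)
//
// observe each pending error exactly once.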
err := ep.SocketOptions().GetLastError() if err == nil { optP := primitive.Int32(0) return &optP, nil } optP := primitive.Int32(syserr.TranslateNetstackError(err).ToLinux()) return &optP, nil case linux.SO_PEERCRED: if family != linux.AF_UNIX || outLen < unix.SizeofUcred { return nil, syserr.ErrInvalidArgument } tcred := t.Credentials() creds := linux.ControlMessageCredentials{ PID: int32(t.ThreadGroup().ID()), UID: uint32(tcred.EffectiveKUID.In(tcred.UserNamespace).OrOverflow()), GID: uint32(tcred.EffectiveKGID.In(tcred.UserNamespace).OrOverflow()), } return &creds, nil case linux.SO_PASSCRED: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetPassCred())) return &v, nil case linux.SO_SNDBUF: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } size := ep.SocketOptions().GetSendBufferSize() if size > math.MaxInt32 { size = math.MaxInt32 } sizeP := primitive.Int32(size) return &sizeP, nil case linux.SO_RCVBUF: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } size := ep.SocketOptions().GetReceiveBufferSize() if size > math.MaxInt32 { size = math.MaxInt32 } sizeP := primitive.Int32(size) return &sizeP, nil case linux.SO_REUSEADDR: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReuseAddress())) return &v, nil case linux.SO_REUSEPORT: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReusePort())) return &v, nil case linux.SO_BINDTODEVICE: v := ep.SocketOptions().GetBindToDevice() if v == 0 { var b primitive.ByteSlice return &b, nil } if outLen < linux.IFNAMSIZ { return nil, syserr.ErrInvalidArgument } s := t.NetworkContext() if s == nil { return nil, syserr.ErrNoDevice } nic, ok := s.Interfaces()[int32(v)] if !ok { // The NICID no longer indicates a valid interface, probably because that // interface was removed. return nil, syserr.ErrUnknownDevice } name := primitive.ByteSlice(append([]byte(nic.Name), 0)) return &name, nil case linux.SO_BROADCAST: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetBroadcast())) return &v, nil case linux.SO_KEEPALIVE: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetKeepAlive())) return &v, nil case linux.SO_LINGER: if outLen < linux.SizeOfLinger { return nil, syserr.ErrInvalidArgument } var linger linux.Linger v := ep.SocketOptions().GetLinger() if v.Enabled { linger.OnOff = 1 } linger.Linger = int32(v.Timeout.Seconds()) return &linger, nil case linux.SO_SNDTIMEO: // TODO(igudger): Linux allows shorter lengths for partial results. if outLen < linux.SizeOfTimeval { return nil, syserr.ErrInvalidArgument } sendTimeout := linux.NsecToTimeval(s.SendTimeout()) return &sendTimeout, nil case linux.SO_RCVTIMEO: // TODO(igudger): Linux allows shorter lengths for partial results. 
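// Editor's sketch: the guest exchanges this option as a struct timeval, e.g.
//
//	tv := unix.Timeval{Sec: 1} // 1s receive timeout
//	unix.SetsockoptTimeval(fd, unix.SOL_SOCKET, unix.SO_RCVTIMEO, &tv)
//
// and the getter below converts the stored nanosecond value back with
// linux.NsecToTimeval.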
if outLen < linux.SizeOfTimeval { return nil, syserr.ErrInvalidArgument } recvTimeout := linux.NsecToTimeval(s.RecvTimeout()) return &recvTimeout, nil case linux.SO_OOBINLINE: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetOutOfBandInline())) return &v, nil case linux.SO_NO_CHECK: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetNoChecksum())) return &v, nil case linux.SO_ACCEPTCONN: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } // This option is only viable for TCP endpoints. var v bool if _, skType, skProto := s.Type(); isTCPSocket(skType, skProto) { v = tcp.EndpointState(ep.State()) == tcp.StateListen } vP := primitive.Int32(boolToInt32(v)) return &vP, nil default: socket.GetSockOptEmitUnimplementedEvent(t, name) } return nil, syserr.ErrProtocolNotAvailable } // getSockOptTCP implements GetSockOpt when level is SOL_TCP. func getSockOptTCP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name, outLen int) (marshal.Marshallable, *syserr.Error) { if _, skType, skProto := s.Type(); !isTCPSocket(skType, skProto) { log.Warningf("SOL_TCP options are only supported on TCP sockets: skType, skProto = %v, %d", skType, skProto) return nil, syserr.ErrUnknownProtocolOption } switch name { case linux.TCP_NODELAY: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(!ep.SocketOptions().GetDelayOption())) return &v, nil case linux.TCP_CORK: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetCorkOption())) return &v, nil case linux.TCP_QUICKACK: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetQuickAck())) return &v, nil case linux.TCP_MAXSEG: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.MaxSegOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } vP := primitive.Int32(v) return &vP, nil case linux.TCP_KEEPIDLE: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } var v tcpip.KeepaliveIdleOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } keepAliveIdle := primitive.Int32(time.Duration(v) / time.Second) return &keepAliveIdle, nil case linux.TCP_KEEPINTVL: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } var v tcpip.KeepaliveIntervalOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } keepAliveInterval := primitive.Int32(time.Duration(v) / time.Second) return &keepAliveInterval, nil case linux.TCP_KEEPCNT: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.KeepaliveCountOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } vP := primitive.Int32(v) return &vP, nil case linux.TCP_USER_TIMEOUT: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } var v tcpip.TCPUserTimeoutOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } tcpUserTimeout := primitive.Int32(time.Duration(v) / time.Millisecond) return &tcpUserTimeout, nil case linux.TCP_INFO: var v tcpip.TCPInfoOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } // TODO(b/64800844): Translate fields once they are added to // tcpip.TCPInfoOption. 
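// Editor's sketch: this is the path behind a guest query such as
//
//	ti, err := unix.GetsockoptTCPInfo(fd, unix.SOL_TCP, unix.TCP_INFO)
//
// Only the linux.TCPInfo fields populated below carry real data; the
// remaining fields are marshalled as zero.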
info := linux.TCPInfo{ State: uint8(v.State), RTO: uint32(v.RTO / time.Microsecond), RTT: uint32(v.RTT / time.Microsecond), RTTVar: uint32(v.RTTVar / time.Microsecond), SndSsthresh: v.SndSsthresh, SndCwnd: v.SndCwnd, } switch v.CcState { case tcpip.RTORecovery: info.CaState = linux.TCP_CA_Loss case tcpip.FastRecovery, tcpip.SACKRecovery: info.CaState = linux.TCP_CA_Recovery case tcpip.Disorder: info.CaState = linux.TCP_CA_Disorder case tcpip.Open: info.CaState = linux.TCP_CA_Open } // In netstack, reorderSeen is updated only when RACK is enabled, and we // only track whether reordering was seen at all. This differs from Linux, // where reorderSeen is not specific to RACK and is incremented on every // reordering event. if v.ReorderSeen { info.ReordSeen = 1 } // Linux truncates the output binary to outLen. buf := t.CopyScratchBuffer(info.SizeBytes()) info.MarshalUnsafe(buf) if len(buf) > outLen { buf = buf[:outLen] } bufP := primitive.ByteSlice(buf) return &bufP, nil case linux.TCP_CC_INFO, linux.TCP_NOTSENT_LOWAT, linux.TCP_ZEROCOPY_RECEIVE: t.Kernel().EmitUnimplementedEvent(t) case linux.TCP_CONGESTION: if outLen <= 0 { return nil, syserr.ErrInvalidArgument } var v tcpip.CongestionControlOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } // We match Linux behaviour here: return the lower of TCP_CA_NAME_MAX // bytes and the option length. // // This is Linux's net/tcp.h TCP_CA_NAME_MAX. const tcpCANameMax = 16 toCopy := tcpCANameMax if outLen < tcpCANameMax { toCopy = outLen } b := make([]byte, toCopy) copy(b, v) bP := primitive.ByteSlice(b) return &bP, nil case linux.TCP_LINGER2: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } var v tcpip.TCPLingerTimeoutOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } var lingerTimeout primitive.Int32 if v >= 0 { lingerTimeout = primitive.Int32(time.Duration(v) / time.Second) } else { lingerTimeout = -1 } return &lingerTimeout, nil case linux.TCP_DEFER_ACCEPT: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } var v tcpip.TCPDeferAcceptOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } tcpDeferAccept := primitive.Int32(time.Duration(v) / time.Second) return &tcpDeferAccept, nil case linux.TCP_SYNCNT: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.TCPSynCountOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } vP := primitive.Int32(v) return &vP, nil case linux.TCP_WINDOW_CLAMP: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.TCPWindowClampOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } vP := primitive.Int32(v) return &vP, nil default: emitUnimplementedEventTCP(t, name) } return nil, syserr.ErrProtocolNotAvailable } // getSockOptIPv6 implements GetSockOpt when level is SOL_IPV6.
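// Editor's note: options here are only honoured for AF_INET6 endpoints
// (anything else gets ErrUnknownProtocolOption), and the IP6T_* netfilter
// queries additionally require a raw socket.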
func getSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) { if _, ok := ep.(tcpip.Endpoint); !ok { log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) return nil, syserr.ErrUnknownProtocolOption } family, skType, _ := s.Type() if family != linux.AF_INET6 { return nil, syserr.ErrUnknownProtocolOption } switch name { case linux.IPV6_V6ONLY: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetV6Only())) return &v, nil case linux.IPV6_PATHMTU: t.Kernel().EmitUnimplementedEvent(t) case linux.IPV6_TCLASS: // Length handling for parity with Linux. if outLen == 0 { var b primitive.ByteSlice return &b, nil } v, err := ep.GetSockOptInt(tcpip.IPv6TrafficClassOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } uintv := primitive.Uint32(v) // Linux truncates the output binary to outLen. ib := t.CopyScratchBuffer(uintv.SizeBytes()) uintv.MarshalUnsafe(ib) // Handle cases where outLen is less than sizeOfInt32. if len(ib) > outLen { ib = ib[:outLen] } ibP := primitive.ByteSlice(ib) return &ibP, nil case linux.IPV6_RECVTCLASS: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTClass())) return &v, nil case linux.IPV6_RECVERR: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetRecvError())) return &v, nil case linux.IPV6_RECVORIGDSTADDR: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress())) return &v, nil case linux.IP6T_ORIGINAL_DST: if outLen < sockAddrInet6Size { return nil, syserr.ErrInvalidArgument } var v tcpip.OriginalDestinationOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } a, _ := socket.ConvertAddress(linux.AF_INET6, tcpip.FullAddress(v)) return a.(*linux.SockAddrInet6), nil case linux.IP6T_SO_GET_INFO: if outLen < linux.SizeOfIPTGetinfo { return nil, syserr.ErrInvalidArgument } // Only valid for raw IPv6 sockets. if skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } stack := inet.StackFromContext(t) if stack == nil { return nil, syserr.ErrNoDevice } info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, true) if err != nil { return nil, err } return &info, nil case linux.IP6T_SO_GET_ENTRIES: // IPTGetEntries is reused for IPv6. if outLen < linux.SizeOfIPTGetEntries { return nil, syserr.ErrInvalidArgument } // Only valid for raw IPv6 sockets. if skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } stack := inet.StackFromContext(t) if stack == nil { return nil, syserr.ErrNoDevice } entries, err := netfilter.GetEntries6(t, stack.(*Stack).Stack, outPtr, outLen) if err != nil { return nil, err } return &entries, nil case linux.IP6T_SO_GET_REVISION_TARGET: if outLen < linux.SizeOfXTGetRevision { return nil, syserr.ErrInvalidArgument } // Only valid for raw IPv6 sockets.
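// Editor's note: like the IP6T_SO_GET_INFO and IP6T_SO_GET_ENTRIES cases
// above, this services ip6tables, which opens a raw IPv6 socket purely as a
// handle for these getsockopt(2) queries, roughly (a hypothetical sketch):
//
//	fd, _ := unix.Socket(unix.AF_INET6, unix.SOCK_RAW, unix.IPPROTO_RAW)
//	// ...then getsockopt(fd, SOL_IPV6, IP6T_SO_GET_REVISION_TARGET, ...)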
if skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } stack := inet.StackFromContext(t) if stack == nil { return nil, syserr.ErrNoDevice } ret, err := netfilter.TargetRevision(t, outPtr, header.IPv6ProtocolNumber) if err != nil { return nil, err } return &ret, nil default: emitUnimplementedEventIPv6(t, name) } return nil, syserr.ErrProtocolNotAvailable } // getSockOptIP implements GetSockOpt when level is SOL_IP. func getSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, outPtr hostarch.Addr, outLen int, family int) (marshal.Marshallable, *syserr.Error) { if _, ok := ep.(tcpip.Endpoint); !ok { log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name) return nil, syserr.ErrUnknownProtocolOption } switch name { case linux.IP_TTL: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.TTLOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } // Fill in the default value, if needed. vP := primitive.Int32(v) if vP == 0 { vP = DefaultTTL } return &vP, nil case linux.IP_MULTICAST_TTL: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v, err := ep.GetSockOptInt(tcpip.MulticastTTLOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } vP := primitive.Int32(v) return &vP, nil case linux.IP_MULTICAST_IF: if outLen < len(linux.InetAddr{}) { return nil, syserr.ErrInvalidArgument } var v tcpip.MulticastInterfaceOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress{Addr: v.InterfaceAddr}) return &a.(*linux.SockAddrInet).Addr, nil case linux.IP_MULTICAST_LOOP: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetMulticastLoop())) return &v, nil case linux.IP_TOS: // Length handling for parity with Linux. 
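// Editor's summary of the parity quirks implemented below: outLen == 0
// yields an empty result, 0 < outLen < sizeOfInt32 yields a single TOS byte,
// and outLen >= sizeOfInt32 yields a full int32.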
if outLen == 0 { var b primitive.ByteSlice return &b, nil } v, err := ep.GetSockOptInt(tcpip.IPv4TOSOption) if err != nil { return nil, syserr.TranslateNetstackError(err) } if outLen < sizeOfInt32 { vP := primitive.Uint8(v) return &vP, nil } vP := primitive.Int32(v) return &vP, nil case linux.IP_RECVTOS: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveTOS())) return &v, nil case linux.IP_RECVERR: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetRecvError())) return &v, nil case linux.IP_PKTINFO: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceivePacketInfo())) return &v, nil case linux.IP_HDRINCL: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetHeaderIncluded())) return &v, nil case linux.IP_RECVORIGDSTADDR: if outLen < sizeOfInt32 { return nil, syserr.ErrInvalidArgument } v := primitive.Int32(boolToInt32(ep.SocketOptions().GetReceiveOriginalDstAddress())) return &v, nil case linux.SO_ORIGINAL_DST: if outLen < sockAddrInetSize { return nil, syserr.ErrInvalidArgument } var v tcpip.OriginalDestinationOption if err := ep.GetSockOpt(&v); err != nil { return nil, syserr.TranslateNetstackError(err) } a, _ := socket.ConvertAddress(linux.AF_INET, tcpip.FullAddress(v)) return a.(*linux.SockAddrInet), nil case linux.IPT_SO_GET_INFO: if outLen < linux.SizeOfIPTGetinfo { return nil, syserr.ErrInvalidArgument } // Only valid for raw IPv4 sockets. if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } stack := inet.StackFromContext(t) if stack == nil { return nil, syserr.ErrNoDevice } info, err := netfilter.GetInfo(t, stack.(*Stack).Stack, outPtr, false) if err != nil { return nil, err } return &info, nil case linux.IPT_SO_GET_ENTRIES: if outLen < linux.SizeOfIPTGetEntries { return nil, syserr.ErrInvalidArgument } // Only valid for raw IPv4 sockets. if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } stack := inet.StackFromContext(t) if stack == nil { return nil, syserr.ErrNoDevice } entries, err := netfilter.GetEntries4(t, stack.(*Stack).Stack, outPtr, outLen) if err != nil { return nil, err } return &entries, nil case linux.IPT_SO_GET_REVISION_TARGET: if outLen < linux.SizeOfXTGetRevision { return nil, syserr.ErrInvalidArgument } // Only valid for raw IPv4 sockets. if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { return nil, syserr.ErrProtocolNotAvailable } stack := inet.StackFromContext(t) if stack == nil { return nil, syserr.ErrNoDevice } ret, err := netfilter.TargetRevision(t, outPtr, header.IPv4ProtocolNumber) if err != nil { return nil, err } return &ret, nil default: emitUnimplementedEventIP(t, name) } return nil, syserr.ErrProtocolNotAvailable } // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by // tcpip.Endpoint. func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { // TODO(b/78348848): Unlike other socket options, SO_TIMESTAMP is // implemented specifically for netstack.SocketOperations rather than // commonEndpoint. 
commonEndpoint should be extended to support socket // options where the implementation is not shared, as unix sockets need // their own support for SO_TIMESTAMP. if level == linux.SOL_SOCKET && name == linux.SO_TIMESTAMP { if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } s.readMu.Lock() defer s.readMu.Unlock() s.sockOptTimestamp = hostarch.ByteOrder.Uint32(optVal) != 0 return nil } if level == linux.SOL_TCP && name == linux.TCP_INQ { if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } s.readMu.Lock() defer s.readMu.Unlock() s.sockOptInq = hostarch.ByteOrder.Uint32(optVal) != 0 return nil } return SetSockOpt(t, s, s.Endpoint, level, name, optVal) } // SetSockOpt can be used to implement the linux syscall setsockopt(2) for // sockets backed by a commonEndpoint. func SetSockOpt(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, level int, name int, optVal []byte) *syserr.Error { switch level { case linux.SOL_SOCKET: return setSockOptSocket(t, s, ep, name, optVal) case linux.SOL_TCP: return setSockOptTCP(t, s, ep, name, optVal) case linux.SOL_IPV6: return setSockOptIPv6(t, s, ep, name, optVal) case linux.SOL_IP: return setSockOptIP(t, s, ep, name, optVal) case linux.SOL_PACKET: // gVisor doesn't support any SOL_PACKET options, so just return not // supported. Returning nil here would result in tcpdump thinking // AF_PACKET features are supported, and it would proceed to use them // and break. t.Kernel().EmitUnimplementedEvent(t) return syserr.ErrProtocolNotAvailable case linux.SOL_UDP, linux.SOL_ICMPV6, linux.SOL_RAW: t.Kernel().EmitUnimplementedEvent(t) } return nil } func clampBufSize(newSz, min, max int64) int64 { // packetOverheadFactor is used to multiply the value provided by the user on // a setsockopt(2) for setting the send/receive buffer sizes of sockets. const packetOverheadFactor = 2 if newSz > max { newSz = max } if newSz < math.MaxInt32/packetOverheadFactor { newSz *= packetOverheadFactor if newSz < min { newSz = min } } else { newSz = math.MaxInt32 } return newSz } // setSockOptSocket implements SetSockOpt when level is SOL_SOCKET.
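// Editor's note: the SO_SNDBUF and SO_RCVBUF cases below both funnel through
// clampBufSize, which mirrors Linux in doubling the user-supplied value to
// account for bookkeeping overhead. A sketch of the resulting arithmetic
// (assuming min = 4096 and max = 1<<20):
//
//	clampBufSize(100, 4096, 1<<20)   // == 4096: doubled to 200, raised to min
//	clampBufSize(8192, 4096, 1<<20)  // == 16384: simply doubled
//	clampBufSize(1<<30, 4096, 1<<20) // == 1<<21: clamped to max, then doubled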
func setSockOptSocket(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { switch name { case linux.SO_SNDBUF: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) min, max := ep.SocketOptions().SendBufferLimits() clamped := clampBufSize(int64(v), min, max) ep.SocketOptions().SetSendBufferSize(clamped, true /* notify */) return nil case linux.SO_RCVBUF: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) min, max := ep.SocketOptions().ReceiveBufferLimits() clamped := clampBufSize(int64(v), min, max) ep.SocketOptions().SetReceiveBufferSize(clamped, true /* notify */) return nil case linux.SO_REUSEADDR: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetReuseAddress(v != 0) return nil case linux.SO_REUSEPORT: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetReusePort(v != 0) return nil case linux.SO_BINDTODEVICE: n := bytes.IndexByte(optVal, 0) if n == -1 { n = len(optVal) } name := string(optVal[:n]) if name == "" { return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(0)) } s := t.NetworkContext() if s == nil { return syserr.ErrNoDevice } for nicID, nic := range s.Interfaces() { if nic.Name == name { return syserr.TranslateNetstackError(ep.SocketOptions().SetBindToDevice(nicID)) } } return syserr.ErrUnknownDevice case linux.SO_BROADCAST: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetBroadcast(v != 0) return nil case linux.SO_PASSCRED: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetPassCred(v != 0) return nil case linux.SO_KEEPALIVE: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetKeepAlive(v != 0) return nil case linux.SO_SNDTIMEO: if len(optVal) < linux.SizeOfTimeval { return syserr.ErrInvalidArgument } var v linux.Timeval v.UnmarshalBytes(optVal[:linux.SizeOfTimeval]) if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { return syserr.ErrDomain } s.SetSendTimeout(v.ToNsecCapped()) return nil case linux.SO_RCVTIMEO: if len(optVal) < linux.SizeOfTimeval { return syserr.ErrInvalidArgument } var v linux.Timeval v.UnmarshalBytes(optVal[:linux.SizeOfTimeval]) if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) { return syserr.ErrDomain } s.SetRecvTimeout(v.ToNsecCapped()) return nil case linux.SO_OOBINLINE: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetOutOfBandInline(v != 0) return nil case linux.SO_NO_CHECK: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetNoChecksum(v != 0) return nil case linux.SO_LINGER: if len(optVal) < linux.SizeOfLinger { return syserr.ErrInvalidArgument } var v linux.Linger v.UnmarshalBytes(optVal[:linux.SizeOfLinger]) if v != (linux.Linger{}) { socket.SetSockOptEmitUnimplementedEvent(t, name) } ep.SocketOptions().SetLinger(tcpip.LingerOption{ Enabled: v.OnOff != 0, Timeout: time.Second * time.Duration(v.Linger), }) return nil case linux.SO_DETACH_FILTER: // optval is ignored. 
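// Editor's note (hedged): on Linux, detaching when no filter is attached
// fails with ENOENT; whether netstack mirrors that depends entirely on how
// the endpoint handles the SetSockOpt below.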
var v tcpip.SocketDetachFilterOption return syserr.TranslateNetstackError(ep.SetSockOpt(&v)) default: socket.SetSockOptEmitUnimplementedEvent(t, name) } return nil } // setSockOptTCP implements SetSockOpt when level is SOL_TCP. func setSockOptTCP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { if _, skType, skProto := s.Type(); !isTCPSocket(skType, skProto) { log.Warningf("SOL_TCP options are only supported on TCP sockets: skType, skProto = %v, %d", skType, skProto) return syserr.ErrUnknownProtocolOption } switch name { case linux.TCP_NODELAY: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetDelayOption(v == 0) return nil case linux.TCP_CORK: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetCorkOption(v != 0) return nil case linux.TCP_QUICKACK: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetQuickAck(v != 0) return nil case linux.TCP_MAXSEG: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MaxSegOption, int(v))) case linux.TCP_KEEPIDLE: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) if v < 1 || v > linux.MAX_TCP_KEEPIDLE { return syserr.ErrInvalidArgument } opt := tcpip.KeepaliveIdleOption(time.Second * time.Duration(v)) return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_KEEPINTVL: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) if v < 1 || v > linux.MAX_TCP_KEEPINTVL { return syserr.ErrInvalidArgument } opt := tcpip.KeepaliveIntervalOption(time.Second * time.Duration(v)) return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_KEEPCNT: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) if v < 1 || v > linux.MAX_TCP_KEEPCNT { return syserr.ErrInvalidArgument } return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.KeepaliveCountOption, int(v))) case linux.TCP_USER_TIMEOUT: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := int32(hostarch.ByteOrder.Uint32(optVal)) if v < 0 { return syserr.ErrInvalidArgument } opt := tcpip.TCPUserTimeoutOption(time.Millisecond * time.Duration(v)) return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_CONGESTION: v := tcpip.CongestionControlOption(optVal) if err := ep.SetSockOpt(&v); err != nil { return syserr.TranslateNetstackError(err) } return nil case linux.TCP_LINGER2: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := int32(hostarch.ByteOrder.Uint32(optVal)) opt := tcpip.TCPLingerTimeoutOption(time.Second * time.Duration(v)) return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_DEFER_ACCEPT: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := int32(hostarch.ByteOrder.Uint32(optVal)) if v < 0 { v = 0 } opt := tcpip.TCPDeferAcceptOption(time.Second * time.Duration(v)) return syserr.TranslateNetstackError(ep.SetSockOpt(&opt)) case linux.TCP_SYNCNT: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPSynCountOption, int(v))) case 
linux.TCP_WINDOW_CLAMP: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := hostarch.ByteOrder.Uint32(optVal) return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TCPWindowClampOption, int(v))) case linux.TCP_REPAIR_OPTIONS: t.Kernel().EmitUnimplementedEvent(t) default: emitUnimplementedEventTCP(t, name) } return nil } // setSockOptIPv6 implements SetSockOpt when level is SOL_IPV6. func setSockOptIPv6(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { if _, ok := ep.(tcpip.Endpoint); !ok { log.Warningf("SOL_IPV6 options not supported on endpoints other than tcpip.Endpoint: option = %d", name) return syserr.ErrUnknownProtocolOption } family, skType, skProto := s.Type() if family != linux.AF_INET6 { return syserr.ErrUnknownProtocolOption } switch name { case linux.IPV6_V6ONLY: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } if isTCPSocket(skType, skProto) && tcp.EndpointState(ep.State()) != tcp.StateInitial { return syserr.ErrInvalidEndpointState } else if isUDPSocket(skType, skProto) && udp.EndpointState(ep.State()) != udp.StateInitial { return syserr.ErrInvalidEndpointState } v := hostarch.ByteOrder.Uint32(optVal) ep.SocketOptions().SetV6Only(v != 0) return nil case linux.IPV6_ADD_MEMBERSHIP: req, err := copyInMulticastV6Request(optVal) if err != nil { return err } return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), MulticastAddr: tcpip.Address(req.MulticastAddr[:]), })) case linux.IPV6_DROP_MEMBERSHIP: req, err := copyInMulticastV6Request(optVal) if err != nil { return err } return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), MulticastAddr: tcpip.Address(req.MulticastAddr[:]), })) case linux.IPV6_IPSEC_POLICY, linux.IPV6_JOIN_ANYCAST, linux.IPV6_LEAVE_ANYCAST, // TODO(b/148887420): Add support for IPV6_PKTINFO. linux.IPV6_PKTINFO, linux.IPV6_ROUTER_ALERT, linux.IPV6_XFRM_POLICY, linux.MCAST_BLOCK_SOURCE, linux.MCAST_JOIN_GROUP, linux.MCAST_JOIN_SOURCE_GROUP, linux.MCAST_LEAVE_GROUP, linux.MCAST_LEAVE_SOURCE_GROUP, linux.MCAST_UNBLOCK_SOURCE: t.Kernel().EmitUnimplementedEvent(t) case linux.IPV6_RECVORIGDSTADDR: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := int32(hostarch.ByteOrder.Uint32(optVal)) ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0) return nil case linux.IPV6_TCLASS: if len(optVal) < sizeOfInt32 { return syserr.ErrInvalidArgument } v := int32(hostarch.ByteOrder.Uint32(optVal)) if v < -1 || v > 255 { return syserr.ErrInvalidArgument } if v == -1 { v = 0 } return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, int(v))) case linux.IPV6_RECVTCLASS: v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetReceiveTClass(v != 0) return nil case linux.IPV6_RECVERR: if len(optVal) == 0 { return nil } v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetRecvError(v != 0) return nil case linux.IP6T_SO_SET_REPLACE: if len(optVal) < linux.SizeOfIP6TReplace { return syserr.ErrInvalidArgument } // Only valid for raw IPv6 sockets. if skType != linux.SOCK_RAW { return syserr.ErrProtocolNotAvailable } stack := inet.StackFromContext(t) if stack == nil { return syserr.ErrNoDevice } // Stack must be a netstack stack. 
return netfilter.SetEntries(t, stack.(*Stack).Stack, optVal, true) case linux.IP6T_SO_SET_ADD_COUNTERS: log.Infof("IP6T_SO_SET_ADD_COUNTERS is not supported") return nil default: emitUnimplementedEventIPv6(t, name) } return nil } var ( inetMulticastRequestSize = (*linux.InetMulticastRequest)(nil).SizeBytes() inetMulticastRequestWithNICSize = (*linux.InetMulticastRequestWithNIC)(nil).SizeBytes() inet6MulticastRequestSize = (*linux.Inet6MulticastRequest)(nil).SizeBytes() ) // copyInMulticastRequest copies in a variable-size multicast request. The // kernel determines which structure was passed by its length. IP_MULTICAST_IF // supports ip_mreqn, ip_mreq and in_addr, while IP_ADD_MEMBERSHIP and // IP_DROP_MEMBERSHIP only support ip_mreqn and ip_mreq. To handle this, // allowAddr controls whether in_addr is accepted or rejected. func copyInMulticastRequest(optVal []byte, allowAddr bool) (linux.InetMulticastRequestWithNIC, *syserr.Error) { if len(optVal) < len(linux.InetAddr{}) { return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument } if len(optVal) < inetMulticastRequestSize { if !allowAddr { return linux.InetMulticastRequestWithNIC{}, syserr.ErrInvalidArgument } var req linux.InetMulticastRequestWithNIC copy(req.InterfaceAddr[:], optVal) return req, nil } if len(optVal) >= inetMulticastRequestWithNICSize { var req linux.InetMulticastRequestWithNIC req.UnmarshalUnsafe(optVal[:inetMulticastRequestWithNICSize]) return req, nil } var req linux.InetMulticastRequestWithNIC req.InetMulticastRequest.UnmarshalUnsafe(optVal[:inetMulticastRequestSize]) return req, nil } func copyInMulticastV6Request(optVal []byte) (linux.Inet6MulticastRequest, *syserr.Error) { if len(optVal) < inet6MulticastRequestSize { return linux.Inet6MulticastRequest{}, syserr.ErrInvalidArgument } var req linux.Inet6MulticastRequest req.UnmarshalUnsafe(optVal[:inet6MulticastRequestSize]) return req, nil } // parseIntOrChar copies either a 32-bit int or an 8-bit uint out of buf. // // net/ipv4/ip_sockglue.c:do_ip_setsockopt does this for its socket options. func parseIntOrChar(buf []byte) (int32, *syserr.Error) { if len(buf) == 0 { return 0, syserr.ErrInvalidArgument } if len(buf) >= sizeOfInt32 { return int32(hostarch.ByteOrder.Uint32(buf)), nil } return int32(buf[0]), nil } // setSockOptIP implements SetSockOpt when level is SOL_IP. func setSockOptIP(t *kernel.Task, s socket.SocketOps, ep commonEndpoint, name int, optVal []byte) *syserr.Error { if _, ok := ep.(tcpip.Endpoint); !ok { log.Warningf("SOL_IP options not supported on endpoints other than tcpip.Endpoint: option = %d", name) return syserr.ErrUnknownProtocolOption } switch name { case linux.IP_MULTICAST_TTL: v, err := parseIntOrChar(optVal) if err != nil { return err } if v == -1 { // Linux translates -1 to 1. v = 1 } if v < 0 || v > 255 { return syserr.ErrInvalidArgument } return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.MulticastTTLOption, int(v))) case linux.IP_ADD_MEMBERSHIP: req, err := copyInMulticastRequest(optVal, false /* allowAddr */) if err != nil { return err } return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.AddMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), // TODO(igudger): Change AddMembership to use the standard // any address representation. 
InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]), MulticastAddr: tcpip.Address(req.MulticastAddr[:]), })) case linux.IP_DROP_MEMBERSHIP: req, err := copyInMulticastRequest(optVal, false /* allowAddr */) if err != nil { return err } return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.RemoveMembershipOption{ NIC: tcpip.NICID(req.InterfaceIndex), // TODO(igudger): Change DropMembership to use the standard // any address representation. InterfaceAddr: tcpip.Address(req.InterfaceAddr[:]), MulticastAddr: tcpip.Address(req.MulticastAddr[:]), })) case linux.IP_MULTICAST_IF: req, err := copyInMulticastRequest(optVal, true /* allowAddr */) if err != nil { return err } return syserr.TranslateNetstackError(ep.SetSockOpt(&tcpip.MulticastInterfaceOption{ NIC: tcpip.NICID(req.InterfaceIndex), InterfaceAddr: socket.BytesToIPAddress(req.InterfaceAddr[:]), })) case linux.IP_MULTICAST_LOOP: v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetMulticastLoop(v != 0) return nil case linux.MCAST_JOIN_GROUP: // FIXME(b/124219304): Implement MCAST_JOIN_GROUP. t.Kernel().EmitUnimplementedEvent(t) return syserr.ErrInvalidArgument case linux.IP_TTL: v, err := parseIntOrChar(optVal) if err != nil { return err } // -1 means default TTL. if v == -1 { v = 0 } else if v < 1 || v > 255 { return syserr.ErrInvalidArgument } return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.TTLOption, int(v))) case linux.IP_TOS: if len(optVal) == 0 { return nil } v, err := parseIntOrChar(optVal) if err != nil { return err } return syserr.TranslateNetstackError(ep.SetSockOptInt(tcpip.IPv4TOSOption, int(v))) case linux.IP_RECVTOS: v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetReceiveTOS(v != 0) return nil case linux.IP_RECVERR: if len(optVal) == 0 { return nil } v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetRecvError(v != 0) return nil case linux.IP_PKTINFO: if len(optVal) == 0 { return nil } v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetReceivePacketInfo(v != 0) return nil case linux.IP_HDRINCL: if len(optVal) == 0 { return nil } v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetHeaderIncluded(v != 0) return nil case linux.IP_RECVORIGDSTADDR: if len(optVal) == 0 { return nil } v, err := parseIntOrChar(optVal) if err != nil { return err } ep.SocketOptions().SetReceiveOriginalDstAddress(v != 0) return nil case linux.IPT_SO_SET_REPLACE: if len(optVal) < linux.SizeOfIPTReplace { return syserr.ErrInvalidArgument } // Only valid for raw IPv4 sockets. if family, skType, _ := s.Type(); family != linux.AF_INET || skType != linux.SOCK_RAW { return syserr.ErrProtocolNotAvailable } stack := inet.StackFromContext(t) if stack == nil { return syserr.ErrNoDevice } // Stack must be a netstack stack. 
return netfilter.SetEntries(t, stack.(*Stack).Stack, optVal, false) case linux.IPT_SO_SET_ADD_COUNTERS: log.Infof("IPT_SO_SET_ADD_COUNTERS is not supported") return nil case linux.IP_ADD_SOURCE_MEMBERSHIP, linux.IP_BIND_ADDRESS_NO_PORT, linux.IP_BLOCK_SOURCE, linux.IP_CHECKSUM, linux.IP_DROP_SOURCE_MEMBERSHIP, linux.IP_FREEBIND, linux.IP_IPSEC_POLICY, linux.IP_MINTTL, linux.IP_MSFILTER, linux.IP_MTU_DISCOVER, linux.IP_MULTICAST_ALL, linux.IP_NODEFRAG, linux.IP_OPTIONS, linux.IP_PASSSEC, linux.IP_RECVFRAGSIZE, linux.IP_RECVOPTS, linux.IP_RECVTTL, linux.IP_RETOPTS, linux.IP_TRANSPARENT, linux.IP_UNBLOCK_SOURCE, linux.IP_UNICAST_IF, linux.IP_XFRM_POLICY, linux.MCAST_BLOCK_SOURCE, linux.MCAST_JOIN_SOURCE_GROUP, linux.MCAST_LEAVE_GROUP, linux.MCAST_LEAVE_SOURCE_GROUP, linux.MCAST_MSFILTER, linux.MCAST_UNBLOCK_SOURCE: t.Kernel().EmitUnimplementedEvent(t) } return nil } // emitUnimplementedEventTCP emits unimplemented event if name is valid. This // function contains names that are common between Get and SetSockOpt when // level is SOL_TCP. func emitUnimplementedEventTCP(t *kernel.Task, name int) { switch name { case linux.TCP_CONGESTION, linux.TCP_CORK, linux.TCP_FASTOPEN, linux.TCP_FASTOPEN_CONNECT, linux.TCP_FASTOPEN_KEY, linux.TCP_FASTOPEN_NO_COOKIE, linux.TCP_QUEUE_SEQ, linux.TCP_REPAIR, linux.TCP_REPAIR_QUEUE, linux.TCP_REPAIR_WINDOW, linux.TCP_SAVED_SYN, linux.TCP_SAVE_SYN, linux.TCP_THIN_DUPACK, linux.TCP_THIN_LINEAR_TIMEOUTS, linux.TCP_TIMESTAMP, linux.TCP_ULP: t.Kernel().EmitUnimplementedEvent(t) } } // emitUnimplementedEventIPv6 emits unimplemented event if name is valid. It // contains names that are common between Get and SetSockOpt when level is // SOL_IPV6. func emitUnimplementedEventIPv6(t *kernel.Task, name int) { switch name { case linux.IPV6_2292DSTOPTS, linux.IPV6_2292HOPLIMIT, linux.IPV6_2292HOPOPTS, linux.IPV6_2292PKTINFO, linux.IPV6_2292PKTOPTIONS, linux.IPV6_2292RTHDR, linux.IPV6_ADDR_PREFERENCES, linux.IPV6_AUTOFLOWLABEL, linux.IPV6_DONTFRAG, linux.IPV6_DSTOPTS, linux.IPV6_FLOWINFO, linux.IPV6_FLOWINFO_SEND, linux.IPV6_FLOWLABEL_MGR, linux.IPV6_FREEBIND, linux.IPV6_HOPOPTS, linux.IPV6_MINHOPCOUNT, linux.IPV6_MTU, linux.IPV6_MTU_DISCOVER, linux.IPV6_MULTICAST_ALL, linux.IPV6_MULTICAST_HOPS, linux.IPV6_MULTICAST_IF, linux.IPV6_MULTICAST_LOOP, linux.IPV6_RECVDSTOPTS, linux.IPV6_RECVFRAGSIZE, linux.IPV6_RECVHOPLIMIT, linux.IPV6_RECVHOPOPTS, linux.IPV6_RECVPATHMTU, linux.IPV6_RECVPKTINFO, linux.IPV6_RECVRTHDR, linux.IPV6_RTHDR, linux.IPV6_RTHDRDSTOPTS, linux.IPV6_TCLASS, linux.IPV6_TRANSPARENT, linux.IPV6_UNICAST_HOPS, linux.IPV6_UNICAST_IF, linux.MCAST_MSFILTER, linux.IPV6_ADDRFORM: t.Kernel().EmitUnimplementedEvent(t) } } // emitUnimplementedEventIP emits unimplemented event if name is valid. It // contains names that are common between Get and SetSockOpt when level is // SOL_IP. 
func emitUnimplementedEventIP(t *kernel.Task, name int) { switch name { case linux.IP_TOS, linux.IP_TTL, linux.IP_OPTIONS, linux.IP_ROUTER_ALERT, linux.IP_RECVOPTS, linux.IP_RETOPTS, linux.IP_PKTINFO, linux.IP_PKTOPTIONS, linux.IP_MTU_DISCOVER, linux.IP_RECVTTL, linux.IP_RECVTOS, linux.IP_MTU, linux.IP_FREEBIND, linux.IP_IPSEC_POLICY, linux.IP_XFRM_POLICY, linux.IP_PASSSEC, linux.IP_TRANSPARENT, linux.IP_ORIGDSTADDR, linux.IP_MINTTL, linux.IP_NODEFRAG, linux.IP_CHECKSUM, linux.IP_BIND_ADDRESS_NO_PORT, linux.IP_RECVFRAGSIZE, linux.IP_MULTICAST_IF, linux.IP_MULTICAST_TTL, linux.IP_MULTICAST_LOOP, linux.IP_ADD_MEMBERSHIP, linux.IP_DROP_MEMBERSHIP, linux.IP_UNBLOCK_SOURCE, linux.IP_BLOCK_SOURCE, linux.IP_ADD_SOURCE_MEMBERSHIP, linux.IP_DROP_SOURCE_MEMBERSHIP, linux.IP_MSFILTER, linux.MCAST_JOIN_GROUP, linux.MCAST_BLOCK_SOURCE, linux.MCAST_UNBLOCK_SOURCE, linux.MCAST_LEAVE_GROUP, linux.MCAST_JOIN_SOURCE_GROUP, linux.MCAST_LEAVE_SOURCE_GROUP, linux.MCAST_MSFILTER, linux.IP_MULTICAST_ALL, linux.IP_UNICAST_IF: t.Kernel().EmitUnimplementedEvent(t) } } // GetSockName implements the linux syscall getsockname(2) for sockets backed by // tcpip.Endpoint. func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.Endpoint.GetLocalAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) } a, l := socket.ConvertAddress(s.family, addr) return a, l, nil } // GetPeerName implements the linux syscall getpeername(2) for sockets backed by // tcpip.Endpoint. func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) { addr, err := s.Endpoint.GetRemoteAddress() if err != nil { return nil, 0, syserr.TranslateNetstackError(err) } a, l := socket.ConvertAddress(s.family, addr) return a, l, nil } func (s *socketOpsCommon) fillCmsgInq(cmsg *socket.ControlMessages) { if !s.sockOptInq { return } rcvBufUsed, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) if err != nil { return } cmsg.IP.HasInq = true cmsg.IP.Inq = int32(rcvBufUsed) } func toLinuxPacketType(pktType tcpip.PacketType) uint8 { switch pktType { case tcpip.PacketHost: return linux.PACKET_HOST case tcpip.PacketOtherHost: return linux.PACKET_OTHERHOST case tcpip.PacketOutgoing: return linux.PACKET_OUTGOING case tcpip.PacketBroadcast: return linux.PACKET_BROADCAST case tcpip.PacketMulticast: return linux.PACKET_MULTICAST default: panic(fmt.Sprintf("unknown packet type: %d", pktType)) } } // nonBlockingRead issues a non-blocking read. // // TODO(b/78348848): Support timestamps for stream sockets. func (s *socketOpsCommon) nonBlockingRead(ctx context.Context, dst usermem.IOSequence, peek, trunc, senderRequested bool) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { isPacket := s.isPacketBased() readOptions := tcpip.ReadOptions{ Peek: peek, NeedRemoteAddr: senderRequested, NeedLinkPacketInfo: isPacket, } // TCP sockets discard the data if MSG_TRUNC is set. // // This behavior is documented in man 7 tcp: // Since version 2.4, Linux supports the use of MSG_TRUNC in the flags // argument of recv(2) (and recvmsg(2)). This flag causes the received // bytes of data to be discarded, rather than passed back in a // caller-supplied buffer. 
var w io.Writer if !isPacket && trunc { w = &tcpip.LimitedWriter{ W: ioutil.Discard, N: dst.NumBytes(), } } else { w = dst.Writer(ctx) } s.readMu.Lock() defer s.readMu.Unlock() res, err := s.Endpoint.Read(w, readOptions) if _, ok := err.(*tcpip.ErrBadBuffer); ok && dst.NumBytes() == 0 { err = nil } if err != nil { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err) } // Set the control message, even if 0 bytes were read. s.updateTimestamp(res.ControlMessages) if isPacket { var addr linux.SockAddr var addrLen uint32 if senderRequested { addr, addrLen = socket.ConvertAddress(s.family, res.RemoteAddr) switch v := addr.(type) { case *linux.SockAddrLink: v.Protocol = socket.Htons(uint16(res.LinkPacketInfo.Protocol)) v.PacketType = toLinuxPacketType(res.LinkPacketInfo.PktType) } } msgLen := res.Count if trunc { msgLen = res.Total } var flags int if res.Total > res.Count { flags |= linux.MSG_TRUNC } return msgLen, flags, addr, addrLen, s.controlMessages(res.ControlMessages), nil } if peek { // MSG_TRUNC with MSG_PEEK on a TCP socket returns the // amount that could be read, and does not write to buffer. if trunc { // TCP endpoint does not return the total bytes in buffer as numTotal. // We need to query it from socket option. rql, err := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) if err != nil { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.TranslateNetstackError(err) } msgLen := int(dst.NumBytes()) if msgLen > rql { msgLen = rql } return msgLen, 0, nil, 0, socket.ControlMessages{}, nil } } else if n := res.Count; n != 0 { s.Endpoint.ModerateRecvBuf(n) } cmsg := s.controlMessages(res.ControlMessages) s.fillCmsgInq(&cmsg) return res.Count, 0, nil, 0, cmsg, syserr.TranslateNetstackError(err) } func (s *socketOpsCommon) controlMessages(cm tcpip.ControlMessages) socket.ControlMessages { readCM := socket.NewIPControlMessages(s.family, cm) return socket.ControlMessages{ IP: socket.IPControlMessages{ HasTimestamp: readCM.HasTimestamp && s.sockOptTimestamp, Timestamp: readCM.Timestamp, HasInq: readCM.HasInq, Inq: readCM.Inq, HasTOS: readCM.HasTOS, TOS: readCM.TOS, HasTClass: readCM.HasTClass, TClass: readCM.TClass, HasIPPacketInfo: readCM.HasIPPacketInfo, PacketInfo: readCM.PacketInfo, OriginalDstAddress: readCM.OriginalDstAddress, SockErr: readCM.SockErr, }, } } // updateTimestamp sets the timestamp for SIOCGSTAMP. It should be called after // successfully writing packet data out to userspace. // // Precondition: s.readMu must be locked. func (s *socketOpsCommon) updateTimestamp(cm tcpip.ControlMessages) { // Save the SIOCGSTAMP timestamp only if SO_TIMESTAMP is disabled. if !s.sockOptTimestamp { s.timestampValid = true s.timestampNS = cm.Timestamp } } // dequeueErr is analogous to net/core/skbuff.c:sock_dequeue_err_skb(). func (s *socketOpsCommon) dequeueErr() *tcpip.SockError { so := s.Endpoint.SocketOptions() err := so.DequeueErr() if err == nil { return nil } // Update socket error to reflect ICMP errors in queue. if nextErr := so.PeekErr(); nextErr != nil && nextErr.Cause.Origin().IsICMPErr() { so.SetLastError(nextErr.Err) } else if err.Cause.Origin().IsICMPErr() { so.SetLastError(nil) } return err } // addrFamilyFromNetProto returns the address family identifier for the given // network protocol. 
func addrFamilyFromNetProto(net tcpip.NetworkProtocolNumber) int { switch net { case header.IPv4ProtocolNumber: return linux.AF_INET case header.IPv6ProtocolNumber: return linux.AF_INET6 default: panic(fmt.Sprintf("invalid net proto for addr family inference: %d", net)) } } // recvErr handles MSG_ERRQUEUE for recvmsg(2). // This is analogous to net/ipv4/ip_sockglue.c:ip_recv_error(). func (s *socketOpsCommon) recvErr(t *kernel.Task, dst usermem.IOSequence) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) { sockErr := s.dequeueErr() if sockErr == nil { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } // The payload of the original packet that caused the error is passed as // normal data via msg_iovec. -- recvmsg(2) msgFlags := linux.MSG_ERRQUEUE if int(dst.NumBytes()) < len(sockErr.Payload) { msgFlags |= linux.MSG_TRUNC } n, err := dst.CopyOut(t, sockErr.Payload) // The original destination address of the datagram that caused the error is // supplied via msg_name. -- recvmsg(2) dstAddr, dstAddrLen := socket.ConvertAddress(addrFamilyFromNetProto(sockErr.NetProto), sockErr.Dst) cmgs := socket.ControlMessages{IP: socket.NewIPControlMessages(s.family, tcpip.ControlMessages{SockErr: sockErr})} return n, msgFlags, dstAddr, dstAddrLen, cmgs, syserr.FromError(err) } // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // tcpip.Endpoint. func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { if flags&linux.MSG_ERRQUEUE != 0 { return s.recvErr(t, dst) } trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 dontWait := flags&linux.MSG_DONTWAIT != 0 waitAll := flags&linux.MSG_WAITALL != 0 if senderRequested && !s.isPacketBased() { // Stream sockets ignore the sender address. senderRequested = false } n, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) if s.isPacketBased() && err == syserr.ErrClosedForReceive && flags&linux.MSG_DONTWAIT != 0 { // In this situation we should return EAGAIN. return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } if err != nil && (err != syserr.ErrWouldBlock || dontWait) { // Read failed and we should not retry. return 0, 0, nil, 0, socket.ControlMessages{}, err } if err == nil && (dontWait || !waitAll || s.isPacketBased() || int64(n) >= dst.NumBytes()) { // We got all the data we need. return } // Don't overwrite any data we received. dst = dst.DropFirst(n) // We'll have to block. Register for notifications and keep trying to // receive all the data. e, ch := waiter.NewChannelEntry(nil) s.EventRegister(&e, waiter.ReadableEvents) defer s.EventUnregister(&e) for { var rn int rn, msgFlags, senderAddr, senderAddrLen, controlMessages, err = s.nonBlockingRead(t, dst, peek, trunc, senderRequested) n += rn if err != nil && err != syserr.ErrWouldBlock { // Always stop on errors other than would block as we generally // won't be able to get any more data. Eat the error if we got // any data. if n > 0 { err = nil } return } if err == nil && (s.isPacketBased() || !waitAll || int64(rn) >= dst.NumBytes()) { // We got all the data we need.
return } dst = dst.DropFirst(rn) if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if n > 0 { return n, msgFlags, senderAddr, senderAddrLen, controlMessages, nil } if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain } return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err) } } } // SendMsg implements the linux syscall sendmsg(2) for sockets backed by // tcpip.Endpoint. func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { // Reject Unix control messages. if !controlMessages.Unix.Empty() { return 0, syserr.ErrInvalidArgument } var addr *tcpip.FullAddress if len(to) > 0 { addrBuf, family, err := socket.AddressAndFamily(to) if err != nil { return 0, err } if err := s.checkFamily(family, false /* exact */); err != nil { return 0, err } addrBuf = s.mapFamily(addrBuf, family) addr = &addrBuf } opts := tcpip.WriteOptions{ To: addr, More: flags&linux.MSG_MORE != 0, EndOfRecord: flags&linux.MSG_EOR != 0, } r := src.Reader(t) var ( total int64 entry waiter.Entry ch <-chan struct{} ) for { n, err := s.Endpoint.Write(r, opts) total += n if flags&linux.MSG_DONTWAIT != 0 { return int(total), syserr.TranslateNetstackError(err) } block := true switch err.(type) { case nil: block = total != src.NumBytes() case *tcpip.ErrWouldBlock: default: block = false } if block { if ch == nil { // We'll have to block. Register for notification and keep trying to // send all the data. entry, ch = waiter.NewChannelEntry(nil) s.EventRegister(&entry, waiter.WritableEvents) defer s.EventUnregister(&entry) } else { // Don't wait immediately after registration in case more data // became available between when we last checked and when we set up // the notification. if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { return int(total), syserr.ErrTryAgain } // handleIOError will consume errors from t.Block if needed. return int(total), syserr.FromError(err) } } continue } return int(total), syserr.TranslateNetstackError(err) } } // Ioctl implements fs.FileOperations.Ioctl. func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { return s.socketOpsCommon.ioctl(ctx, io, args) } func (s *socketOpsCommon) ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { t := kernel.TaskFromContext(ctx) if t == nil { panic("ioctl(2) may only be called from a task goroutine") } // SIOCGSTAMP is implemented by netstack rather than all commonEndpoint // sockets. // TODO(b/78348848): Add a commonEndpoint method to support SIOCGSTAMP. switch args[1].Int() { case linux.SIOCGSTAMP: s.readMu.Lock() defer s.readMu.Unlock() if !s.timestampValid { return 0, syserror.ENOENT } tv := linux.NsecToTimeval(s.timestampNS) _, err := tv.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCINQ: v, terr := s.Endpoint.GetSockOptInt(tcpip.ReceiveQueueSizeOption) if terr != nil { return 0, syserr.TranslateNetstackError(terr).ToError() } if v > math.MaxInt32 { v = math.MaxInt32 } // Copy result to userspace. vP := primitive.Int32(v) _, err := vP.CopyOut(t, args[2].Pointer()) return 0, err } return Ioctl(ctx, s.Endpoint, io, args) } // Ioctl performs a socket ioctl.
func Ioctl(ctx context.Context, ep commonEndpoint, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { t := kernel.TaskFromContext(ctx) if t == nil { panic("ioctl(2) may only be called from a task goroutine") } switch arg := int(args[1].Int()); arg { case linux.SIOCGIFFLAGS, linux.SIOCGIFADDR, linux.SIOCGIFBRDADDR, linux.SIOCGIFDSTADDR, linux.SIOCGIFHWADDR, linux.SIOCGIFINDEX, linux.SIOCGIFMAP, linux.SIOCGIFMETRIC, linux.SIOCGIFMTU, linux.SIOCGIFNAME, linux.SIOCGIFNETMASK, linux.SIOCGIFTXQLEN, linux.SIOCETHTOOL: var ifr linux.IFReq if _, err := ifr.CopyIn(t, args[2].Pointer()); err != nil { return 0, err } if err := interfaceIoctl(ctx, io, arg, &ifr); err != nil { return 0, err.ToError() } _, err := ifr.CopyOut(t, args[2].Pointer()) return 0, err case linux.SIOCGIFCONF: // Return a list of interface addresses or the buffer size // necessary to hold the list. var ifc linux.IFConf if _, err := ifc.CopyIn(t, args[2].Pointer()); err != nil { return 0, err } if err := ifconfIoctl(ctx, t, io, &ifc); err != nil { return 0, err } _, err := ifc.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCINQ: v, terr := ep.GetSockOptInt(tcpip.ReceiveQueueSizeOption) if terr != nil { return 0, syserr.TranslateNetstackError(terr).ToError() } if v > math.MaxInt32 { v = math.MaxInt32 } // Copy result to userspace. vP := primitive.Int32(v) _, err := vP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCOUTQ: v, terr := ep.GetSockOptInt(tcpip.SendQueueSizeOption) if terr != nil { return 0, syserr.TranslateNetstackError(terr).ToError() } if v > math.MaxInt32 { v = math.MaxInt32 } // Copy result to userspace. vP := primitive.Int32(v) _, err := vP.CopyOut(t, args[2].Pointer()) return 0, err case linux.SIOCGIFMEM, linux.SIOCGIFPFLAGS, linux.SIOCGMIIPHY, linux.SIOCGMIIREG: unimpl.EmitUnimplementedEvent(ctx) } return 0, linuxerr.ENOTTY } // interfaceIoctl implements interface requests. func interfaceIoctl(ctx context.Context, io usermem.IO, arg int, ifr *linux.IFReq) *syserr.Error { var ( iface inet.Interface index int32 found bool ) // Find the relevant device. stack := inet.StackFromContext(ctx) if stack == nil { return syserr.ErrNoDevice } // SIOCGIFNAME uses ifr.ifr_ifindex rather than ifr.ifr_name to // identify a device. if arg == linux.SIOCGIFNAME { // Gets the name of the interface given the interface index // stored in ifr_ifindex. index = int32(hostarch.ByteOrder.Uint32(ifr.Data[:4])) if iface, ok := stack.Interfaces()[index]; ok { ifr.SetName(iface.Name) return nil } return syserr.ErrNoDevice } // Find the relevant device. for index, iface = range stack.Interfaces() { if iface.Name == ifr.Name() { found = true break } } if !found { return syserr.ErrNoDevice } switch arg { case linux.SIOCGIFINDEX: // Copy out the index to the data. hostarch.ByteOrder.PutUint32(ifr.Data[:], uint32(index)) case linux.SIOCGIFHWADDR: // Copy the hardware address out. // // Refer: https://linux.die.net/man/7/netdevice // SIOCGIFHWADDR, SIOCSIFHWADDR // // Get or set the hardware address of a device using // ifr_hwaddr. The hardware address is specified in a struct // sockaddr. sa_family contains the ARPHRD_* device type, // sa_data the L2 hardware address starting from byte 0. Setting // the hardware address is a privileged operation. hostarch.ByteOrder.PutUint16(ifr.Data[:], iface.DeviceType) n := copy(ifr.Data[2:], iface.Addr) for i := 2 + n; i < len(ifr.Data); i++ { ifr.Data[i] = 0 // Clear padding. 
} case linux.SIOCGIFFLAGS: f, err := interfaceStatusFlags(stack, iface.Name) if err != nil { return err } // Drop the flags that don't fit in the size that we need to return. This // matches Linux behavior. hostarch.ByteOrder.PutUint16(ifr.Data[:2], uint16(f)) case linux.SIOCGIFADDR: // Copy the IPv4 address out. for _, addr := range stack.InterfaceAddrs()[index] { // This ioctl is only compatible with AF_INET addresses. if addr.Family != linux.AF_INET { continue } copy(ifr.Data[4:8], addr.Addr) break } case linux.SIOCGIFMETRIC: // Gets the metric of the device. As per netdevice(7), this // always just sets ifr_metric to 0. hostarch.ByteOrder.PutUint32(ifr.Data[:4], 0) case linux.SIOCGIFMTU: // Gets the MTU of the device. hostarch.ByteOrder.PutUint32(ifr.Data[:4], iface.MTU) case linux.SIOCGIFMAP: // Gets the hardware parameters of the device. // TODO(gvisor.dev/issue/505): Implement. case linux.SIOCGIFTXQLEN: // Gets the transmit queue length of the device. // TODO(gvisor.dev/issue/505): Implement. case linux.SIOCGIFDSTADDR: // Gets the destination address of a point-to-point device. // TODO(gvisor.dev/issue/505): Implement. case linux.SIOCGIFBRDADDR: // Gets the broadcast address of a device. // TODO(gvisor.dev/issue/505): Implement. case linux.SIOCGIFNETMASK: // Gets the network mask of a device. for _, addr := range stack.InterfaceAddrs()[index] { // This ioctl is only compatible with AF_INET addresses. if addr.Family != linux.AF_INET { continue } // Populate ifr.ifr_netmask (type sockaddr). hostarch.ByteOrder.PutUint16(ifr.Data[0:], uint16(linux.AF_INET)) hostarch.ByteOrder.PutUint16(ifr.Data[2:], 0) var mask uint32 = 0xffffffff << (32 - addr.PrefixLen) // Netmask is expected to be returned as a big endian // value. binary.BigEndian.PutUint32(ifr.Data[4:8], mask) break } case linux.SIOCETHTOOL: // Stubbed out for now. Ideally, we should implement the required // sub-commands for ETHTOOL. // // See: // https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/net/core/dev_ioctl.c return syserr.ErrEndpointOperation default: // Not a valid call. return syserr.ErrInvalidArgument } return nil } // ifconfIoctl populates a struct ifconf for the SIOCGIFCONF ioctl. func ifconfIoctl(ctx context.Context, t *kernel.Task, io usermem.IO, ifc *linux.IFConf) error { // If Ptr is NULL, return the necessary buffer size via Len. // Otherwise, write up to Len bytes starting at Ptr containing ifreq // structs. stack := inet.StackFromContext(ctx) if stack == nil { return syserr.ErrNoDevice.ToError() } if ifc.Ptr == 0 { ifc.Len = int32(len(stack.Interfaces())) * int32(linux.SizeOfIFReq) return nil } max := ifc.Len ifc.Len = 0 for key, ifaceAddrs := range stack.InterfaceAddrs() { iface := stack.Interfaces()[key] for _, ifaceAddr := range ifaceAddrs { // Don't write past the end of the buffer. if ifc.Len+int32(linux.SizeOfIFReq) > max { break } if ifaceAddr.Family != linux.AF_INET { continue } // Populate ifr.ifr_addr. ifr := linux.IFReq{} ifr.SetName(iface.Name) hostarch.ByteOrder.PutUint16(ifr.Data[0:2], uint16(ifaceAddr.Family)) hostarch.ByteOrder.PutUint16(ifr.Data[2:4], 0) copy(ifr.Data[4:8], ifaceAddr.Addr[:4]) // Copy the ifr to userspace. dst := uintptr(ifc.Ptr) + uintptr(ifc.Len) ifc.Len += int32(linux.SizeOfIFReq) if _, err := ifr.CopyOut(t, hostarch.Addr(dst)); err != nil { return err } } } return nil } // interfaceStatusFlags returns status flags for an interface in the stack.
// Flag values and meanings are described in greater detail in netdevice(7) in // the SIOCGIFFLAGS section. func interfaceStatusFlags(stack inet.Stack, name string) (uint32, *syserr.Error) { // We should only ever be passed a netstack.Stack. epstack, ok := stack.(*Stack) if !ok { return 0, errStackType } // Find the NIC corresponding to this interface. for _, info := range epstack.Stack.NICInfo() { if info.Name == name { return nicStateFlagsToLinux(info.Flags), nil } } return 0, syserr.ErrNoDevice } func nicStateFlagsToLinux(f stack.NICStateFlags) uint32 { var rv uint32 if f.Up { rv |= linux.IFF_UP | linux.IFF_LOWER_UP } if f.Running { rv |= linux.IFF_RUNNING } if f.Promiscuous { rv |= linux.IFF_PROMISC } if f.Loopback { rv |= linux.IFF_LOOPBACK } return rv } func isTCPSocket(skType linux.SockType, skProto int) bool { return skType == linux.SOCK_STREAM && (skProto == 0 || skProto == unix.IPPROTO_TCP) } func isUDPSocket(skType linux.SockType, skProto int) bool { return skType == linux.SOCK_DGRAM && (skProto == 0 || skProto == unix.IPPROTO_UDP) } func isICMPSocket(skType linux.SockType, skProto int) bool { return skType == linux.SOCK_DGRAM && (skProto == unix.IPPROTO_ICMP || skProto == unix.IPPROTO_ICMPV6) } // State implements socket.Socket.State. State translates the internal state // returned by netstack to values defined by Linux. func (s *socketOpsCommon) State() uint32 { if s.family != linux.AF_INET && s.family != linux.AF_INET6 { // States not implemented for this socket's family. return 0 } switch { case isTCPSocket(s.skType, s.protocol): // TCP socket. switch tcp.EndpointState(s.Endpoint.State()) { case tcp.StateEstablished: return linux.TCP_ESTABLISHED case tcp.StateSynSent: return linux.TCP_SYN_SENT case tcp.StateSynRecv: return linux.TCP_SYN_RECV case tcp.StateFinWait1: return linux.TCP_FIN_WAIT1 case tcp.StateFinWait2: return linux.TCP_FIN_WAIT2 case tcp.StateTimeWait: return linux.TCP_TIME_WAIT case tcp.StateClose, tcp.StateInitial, tcp.StateBound, tcp.StateConnecting, tcp.StateError: return linux.TCP_CLOSE case tcp.StateCloseWait: return linux.TCP_CLOSE_WAIT case tcp.StateLastAck: return linux.TCP_LAST_ACK case tcp.StateListen: return linux.TCP_LISTEN case tcp.StateClosing: return linux.TCP_CLOSING default: // Internal or unknown state. return 0 } case isUDPSocket(s.skType, s.protocol): // UDP socket. switch udp.EndpointState(s.Endpoint.State()) { case udp.StateInitial, udp.StateBound, udp.StateClosed: return linux.TCP_CLOSE case udp.StateConnected: return linux.TCP_ESTABLISHED default: return 0 } case isICMPSocket(s.skType, s.protocol): // TODO(b/112063468): Export states for ICMP sockets. case s.skType == linux.SOCK_RAW: // TODO(b/112063468): Export states for raw sockets. default: // Unknown transport protocol, how did we make this socket? log.Warningf("Unknown transport protocol for an existing socket: family=%v, type=%v, protocol=%v, internal type %v", s.family, s.skType, s.protocol, reflect.TypeOf(s.Endpoint).Elem()) return 0 } return 0 } // Type implements socket.Socket.Type. func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { return s.family, s.skType, s.protocol } // LINT.ThenChange(./netstack_vfs2.go)
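// Illustrative sketch (not part of the gVisor sources): the buffer-size
// handling in clampBufSize above mirrors Linux, which doubles the value a
// user passes to setsockopt(2) with SO_SNDBUF/SO_RCVBUF to account for
// bookkeeping overhead, clamping the raw value to the endpoint's [min, max]
// limits before doubling. The standalone program below repeats that
// arithmetic so the behavior can be checked in isolation; note that because
// the max clamp happens before doubling, the stored size can reach 2*max,
// just as in Linux.
package main

import (
	"fmt"
	"math"
)

const packetOverheadFactor = 2

func clampBufSize(newSz, min, max int64) int64 {
	if newSz > max {
		newSz = max
	}
	if newSz < math.MaxInt32/packetOverheadFactor {
		newSz *= packetOverheadFactor
		if newSz < min {
			newSz = min
		}
	} else {
		newSz = math.MaxInt32
	}
	return newSz
}

func main() {
	// A 64 KiB request within limits is doubled to 128 KiB.
	fmt.Println(clampBufSize(64<<10, 4<<10, 4<<20)) // 131072
	// A request above max is clamped to max first, then doubled.
	fmt.Println(clampBufSize(8<<20, 4<<10, 4<<20)) // 8388608 (2*max)
	// Huge requests saturate at MaxInt32 instead of overflowing.
	fmt.Println(clampBufSize(math.MaxInt64, 4<<10, math.MaxInt64)) // 2147483647
}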
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // +build linux // Package fdnotifier contains an adapter that translates IO events (e.g., a // file became readable/writable) from native FDs to the notifications in the // waiter package. It uses epoll in edge-triggered mode to receive notifications // for registered FDs. package fdnotifier import ( "fmt" "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/waiter" ) type fdInfo struct { queue *waiter.Queue waiting bool } // notifier holds all the state necessary to issue notifications when IO events // occur in the observed FDs. type notifier struct { // epFD is the epoll file descriptor used to register for io // notifications. epFD int // mu protects fdMap. mu sync.Mutex // fdMap maps file descriptors to their notification queues and waiting // status. fdMap map[int32]*fdInfo } // newNotifier creates a new notifier object. func newNotifier() (*notifier, error) { epfd, err := unix.EpollCreate1(0) if err != nil { return nil, err } w := &notifier{ epFD: epfd, fdMap: make(map[int32]*fdInfo), } go w.waitAndNotify() // S/R-SAFE: no waiter exists during save / load. return w, nil } // waitFD waits on mask for fd. The fdMap mutex must be held. func (n *notifier) waitFD(fd int32, fi *fdInfo, mask waiter.EventMask) error { if !fi.waiting && mask == 0 { return nil } e := unix.EpollEvent{ Events: mask.ToLinux() | unix.EPOLLET, Fd: fd, } switch { case !fi.waiting && mask != 0: if err := unix.EpollCtl(n.epFD, unix.EPOLL_CTL_ADD, int(fd), &e); err != nil { return err } fi.waiting = true case fi.waiting && mask == 0: unix.EpollCtl(n.epFD, unix.EPOLL_CTL_DEL, int(fd), nil) fi.waiting = false case fi.waiting && mask != 0: if err := unix.EpollCtl(n.epFD, unix.EPOLL_CTL_MOD, int(fd), &e); err != nil { return err } } return nil } // addFD adds an FD to the list of FDs observed by n. func (n *notifier) addFD(fd int32, queue *waiter.Queue) { n.mu.Lock() defer n.mu.Unlock() // Panic if we're already notifying on this FD. if _, ok := n.fdMap[fd]; ok { panic(fmt.Sprintf("File descriptor %v added twice", fd)) } // We have nothing to wait for at the moment. Just add it to the map. n.fdMap[fd] = &fdInfo{queue: queue} } // updateFD updates the set of events the fd needs to be notified on.
func (n *notifier) updateFD(fd int32) error { n.mu.Lock() defer n.mu.Unlock() if fi, ok := n.fdMap[fd]; ok { return n.waitFD(fd, fi, fi.queue.Events()) } return nil } // removeFD removes an FD from the list of FDs observed by n. func (n *notifier) removeFD(fd int32) { n.mu.Lock() defer n.mu.Unlock() // Remove from map, then from epoll object. n.waitFD(fd, n.fdMap[fd], 0) delete(n.fdMap, fd) } // hasFD returns true if the fd is in the list of observed FDs. func (n *notifier) hasFD(fd int32) bool { n.mu.Lock() defer n.mu.Unlock() _, ok := n.fdMap[fd] return ok } // waitAndNotify runs in its own goroutine and loops waiting for io event // notifications from the epoll object. Once notifications arrive, they are // dispatched to the registered queue. func (n *notifier) waitAndNotify() error { e := make([]unix.EpollEvent, 100) for { v, err := epollWait(n.epFD, e, -1) if err == unix.EINTR { continue } if err != nil { return err } n.mu.Lock() for i := 0; i < v; i++ { if fi, ok := n.fdMap[e[i].Fd]; ok { fi.queue.Notify(waiter.EventMaskFromLinux(e[i].Events)) } } n.mu.Unlock() } } var shared struct { notifier *notifier once sync.Once initErr error } // AddFD adds an FD to the list of observed FDs. func AddFD(fd int32, queue *waiter.Queue) error { shared.once.Do(func() { shared.notifier, shared.initErr = newNotifier() }) if shared.initErr != nil { return shared.initErr } shared.notifier.addFD(fd, queue) return nil } // UpdateFD updates the set of events the fd needs to be notified on. func UpdateFD(fd int32) error { return shared.notifier.updateFD(fd) } // RemoveFD removes an FD from the list of observed FDs. func RemoveFD(fd int32) { shared.notifier.removeFD(fd) } // HasFD returns true if the FD is in the list of observed FDs. // // This should only be used by tests to assert that FDs are correctly registered. func HasFD(fd int32) bool { return shared.notifier.hasFD(fd) }
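// Illustrative sketch (not part of the gVisor sources): typical use of the
// fdnotifier package above. A host FD is registered together with a
// waiter.Queue; once a waiter.Entry is subscribed to the queue, UpdateFD
// re-arms the epoll registration with the union of the subscribed event
// masks. The pipe below is just a stand-in host FD for demonstration.
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/fdnotifier"
	"gvisor.dev/gvisor/pkg/waiter"
)

func main() {
	// Create a pipe; the read end becomes readable once data is written.
	var fds [2]int
	if err := unix.Pipe(fds[:]); err != nil {
		panic(err)
	}
	rd := int32(fds[0])

	var q waiter.Queue
	if err := fdnotifier.AddFD(rd, &q); err != nil {
		panic(err)
	}
	defer fdnotifier.RemoveFD(rd)

	// Subscribe to readable events, then tell the notifier to start
	// waiting for them on the host epoll object.
	e, ch := waiter.NewChannelEntry(nil)
	q.EventRegister(&e, waiter.ReadableEvents)
	defer q.EventUnregister(&e)
	if err := fdnotifier.UpdateFD(rd); err != nil {
		panic(err)
	}

	// Writing to the other end triggers a notification on ch.
	if _, err := unix.Write(fds[1], []byte("x")); err != nil {
		panic(err)
	}
	<-ch
	fmt.Println("read end became readable")
}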
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package abi describes the interface between a kernel and userspace. package abi import ( "fmt" ) // OS describes the target operating system for an ABI. // // Note that OS is architecture-independent. The details of the OS ABI will // vary between architectures. type OS int const ( // Linux is the Linux ABI. Linux OS = iota ) // String implements fmt.Stringer. func (o OS) String() string { switch o { case Linux: return "linux" default: return fmt.Sprintf("OS(%d)", o) } } // ABI is an interface that defines OS-specific interactions. type ABI interface { }
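// Illustrative sketch (not part of the gVisor sources): the abi package is
// deliberately small. OS currently only distinguishes Linux, and String
// falls back to a numeric form for values it does not know about.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi"
)

func main() {
	fmt.Println(abi.Linux)  // prints "linux" via fmt.Stringer
	fmt.Println(abi.OS(42)) // prints "OS(42)" for an unknown value
}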
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" ) // Ioctl implements Linux syscall ioctl(2). func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) if file.StatusFlags()&linux.O_PATH != 0 { return 0, nil, linuxerr.EBADF } // Handle ioctls that apply to all FDs. switch args[1].Int() { case linux.FIONCLEX: t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{ CloseOnExec: false, }) return 0, nil, nil case linux.FIOCLEX: t.FDTable().SetFlagsVFS2(t, fd, kernel.FDFlags{ CloseOnExec: true, }) return 0, nil, nil case linux.FIONBIO: var set int32 if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { return 0, nil, err } flags := file.StatusFlags() if set != 0 { flags |= linux.O_NONBLOCK } else { flags &^= linux.O_NONBLOCK } return 0, nil, file.SetStatusFlags(t, t.Credentials(), flags) case linux.FIOASYNC: var set int32 if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &set); err != nil { return 0, nil, err } flags := file.StatusFlags() if set != 0 { flags |= linux.O_ASYNC } else { flags &^= linux.O_ASYNC } file.SetStatusFlags(t, t.Credentials(), flags) return 0, nil, nil case linux.FIOGETOWN, linux.SIOCGPGRP: var who int32 owner, hasOwner := getAsyncOwner(t, file) if hasOwner { if owner.Type == linux.F_OWNER_PGRP { who = -owner.PID } else { who = owner.PID } } _, err := primitive.CopyInt32Out(t, args[2].Pointer(), who) return 0, nil, err case linux.FIOSETOWN, linux.SIOCSPGRP: var who int32 if _, err := primitive.CopyInt32In(t, args[2].Pointer(), &who); err != nil { return 0, nil, err } ownerType := int32(linux.F_OWNER_PID) if who < 0 { // Check for overflow before flipping the sign. if who-1 > who { return 0, nil, linuxerr.EINVAL } ownerType = linux.F_OWNER_PGRP who = -who } return 0, nil, setAsyncOwner(t, int(fd), file, ownerType, who) } ret, err := file.Ioctl(t, t.MemoryManager(), args) return ret, nil, err }
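// Illustrative sketch (not part of the gVisor sources): the
// FIOSETOWN/SIOCSPGRP handler above flips the sign of a negative `who` to
// recover a process-group ID, guarding against int32 overflow first. For
// who == math.MinInt32, the subtraction who-1 wraps to math.MaxInt32, which
// is greater than who, so the request is rejected with EINVAL rather than
// negated (negation would overflow). ownerFromWho below is a hypothetical
// helper that mirrors just that sign handling.
package main

import (
	"fmt"
	"math"
)

// ownerFromWho reports the owner type and ID implied by who, or ok=false
// when who cannot be safely negated.
func ownerFromWho(who int32) (ownerType string, id int32, ok bool) {
	ownerType = "F_OWNER_PID"
	if who < 0 {
		if who-1 > who { // only true when who == math.MinInt32
			return "", 0, false
		}
		ownerType = "F_OWNER_PGRP"
		who = -who
	}
	return ownerType, who, true
}

func main() {
	fmt.Println(ownerFromWho(1234))          // F_OWNER_PID 1234 true
	fmt.Println(ownerFromWho(-1234))         // F_OWNER_PGRP 1234 true
	fmt.Println(ownerFromWho(math.MinInt32)) // "" 0 false (EINVAL case)
}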
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package sys implements sysfs. package sys import ( "bytes" "fmt" "strconv" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/coverage" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" ) const ( // Name is the default filesystem name. Name = "sysfs" defaultSysDirMode = linux.FileMode(0755) defaultMaxCachedDentries = uint64(1000) ) // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. // // +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 } // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } // Release implements vfs.FilesystemType.Release. func (FilesystemType) Release(ctx context.Context) {} // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } mopts := vfs.GenericParseMountOptions(opts.Data) maxCachedDentries := defaultMaxCachedDentries if str, ok := mopts["dentry_cache_limit"]; ok { delete(mopts, "dentry_cache_limit") maxCachedDentries, err = strconv.ParseUint(str, 10, 64) if err != nil { ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) return nil, nil, linuxerr.EINVAL } } fs := &filesystem{ devMinor: devMinor, } fs.MaxCachedDentries = maxCachedDentries fs.VFSFilesystem().Init(vfsObj, &fsType, fs) root := fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ "block": fs.newDir(ctx, creds, defaultSysDirMode, nil), "bus": fs.newDir(ctx, creds, defaultSysDirMode, nil), "class": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ "power_supply": fs.newDir(ctx, creds, defaultSysDirMode, nil), }), "dev": fs.newDir(ctx, creds, defaultSysDirMode, nil), "devices": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ "system": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ "cpu": cpuDir(ctx, fs, creds), }), }), "firmware": fs.newDir(ctx, creds, defaultSysDirMode, nil), "fs": fs.newDir(ctx, creds, defaultSysDirMode, nil), "kernel": kernelDir(ctx, fs, creds), "module": fs.newDir(ctx, creds, defaultSysDirMode, nil), "power": fs.newDir(ctx, creds, defaultSysDirMode, nil), }) var rootD kernfs.Dentry rootD.InitRoot(&fs.Filesystem, root) return fs.VFSFilesystem(), rootD.VFSDentry(), nil } func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode { k := kernel.KernelFromContext(ctx) maxCPUCores := k.ApplicationCores() children := map[string]kernfs.Inode{ "online": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), "possible": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), "present": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), } for i := uint(0); i < maxCPUCores; i++ { children[fmt.Sprintf("cpu%d", i)] = fs.newDir(ctx, creds, linux.FileMode(0555), nil) } return fs.newDir(ctx, creds, defaultSysDirMode, children) } func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode { // Set up /sys/kernel/debug/kcov. Technically, debugfs should be // mounted at debug/, but for our purposes, it is sufficient to keep it // in sys. var children map[string]kernfs.Inode if coverage.KcovSupported() { log.Debugf("Set up /sys/kernel/debug/kcov") children = map[string]kernfs.Inode{ "debug": fs.newDir(ctx, creds, linux.FileMode(0700), map[string]kernfs.Inode{ "kcov": fs.newKcovFile(ctx, creds), }), } } return fs.newDir(ctx, creds, defaultSysDirMode, children) } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { return fmt.Sprintf("dentry_cache_limit=%d", fs.MaxCachedDentries) } // dir implements kernfs.Inode. 
// // +stateify savable type dir struct { dirRefs kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeNotSymlink kernfs.InodeDirectoryNoNewChildren kernfs.InodeTemporary kernfs.OrderedChildren locks vfs.FileLocks } func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { d := &dir{} d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) d.InitRefs() d.IncLinks(d.OrderedChildren.Populate(contents)) return d } // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { return linuxerr.EPERM } // Open implements kernfs.Inode.Open. func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ SeekEnd: kernfs.SeekEndStaticEntries, }) if err != nil { return nil, err } return fd.VFSFileDescription(), nil } // DecRef implements kernfs.Inode.DecRef. func (d *dir) DecRef(ctx context.Context) { d.dirRefs.DecRef(func() { d.Destroy(ctx) }) } // StatFS implements kernfs.Inode.StatFS. func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil } // cpuFile implements kernfs.Inode. // // +stateify savable type cpuFile struct { implStatFS kernfs.DynamicBytesFile maxCores uint } // Generate implements vfs.DynamicBytesSource.Generate. func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error { fmt.Fprintf(buf, "0-%d\n", c.maxCores-1) return nil } func (fs *filesystem) newCPUFile(ctx context.Context, creds *auth.Credentials, maxCores uint, mode linux.FileMode) kernfs.Inode { c := &cpuFile{maxCores: maxCores} c.DynamicBytesFile.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode) return c } // +stateify savable type implStatFS struct{} // StatFS implements kernfs.Inode.StatFS. func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil }
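// Illustrative sketch (not part of the gVisor sources): GetFilesystem above
// accepts a "dentry_cache_limit" mount option, parsed with
// strconv.ParseUint; a non-numeric value fails the mount with EINVAL, and a
// valid value overrides the default of 1000 cached dentries.
// parseDentryCacheLimit below is a hypothetical helper reproducing just
// that parsing logic.
package main

import (
	"fmt"
	"strconv"
)

// parseDentryCacheLimit returns the dentry cache limit implied by mopts,
// falling back to the default when the option is absent.
func parseDentryCacheLimit(mopts map[string]string) (uint64, error) {
	limit := uint64(1000) // defaultMaxCachedDentries
	if str, ok := mopts["dentry_cache_limit"]; ok {
		v, err := strconv.ParseUint(str, 10, 64)
		if err != nil {
			return 0, fmt.Errorf("invalid dentry_cache_limit=%q: %v", str, err)
		}
		limit = v
	}
	return limit, nil
}

func main() {
	fmt.Println(parseDentryCacheLimit(map[string]string{"dentry_cache_limit": "500"})) // 500 <nil>
	fmt.Println(parseDentryCacheLimit(nil))                                            // 1000 <nil>
}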
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package usage import ( "sync/atomic" ) // IO contains I/O-related statistics. // // +stateify savable type IO struct { // CharsRead is the number of bytes read by read syscalls. CharsRead uint64 // CharsWritten is the number of bytes written by write syscalls. CharsWritten uint64 // ReadSyscalls is the number of read syscalls. ReadSyscalls uint64 // WriteSyscalls is the number of write syscalls. WriteSyscalls uint64 // The following counter is only meaningful when Sentry has internal // pagecache. // BytesRead is the number of bytes actually read into pagecache. BytesRead uint64 // BytesWritten is the number of bytes actually written from pagecache. BytesWritten uint64 // BytesWriteCancelled is the number of bytes not written out due to // truncation. BytesWriteCancelled uint64 } // AccountReadSyscall does the accounting for a read syscall. func (i *IO) AccountReadSyscall(bytes int64) { atomic.AddUint64(&i.ReadSyscalls, 1) if bytes > 0 { atomic.AddUint64(&i.CharsRead, uint64(bytes)) } } // AccountWriteSyscall does the accounting for a write syscall. func (i *IO) AccountWriteSyscall(bytes int64) { atomic.AddUint64(&i.WriteSyscalls, 1) if bytes > 0 { atomic.AddUint64(&i.CharsWritten, uint64(bytes)) } } // AccountReadIO does the accounting for a read IO into the file system. func (i *IO) AccountReadIO(bytes int64) { if bytes > 0 { atomic.AddUint64(&i.BytesRead, uint64(bytes)) } } // AccountWriteIO does the accounting for a write IO into the file system. func (i *IO) AccountWriteIO(bytes int64) { if bytes > 0 { atomic.AddUint64(&i.BytesWritten, uint64(bytes)) } } // Accumulate adds up io usages. func (i *IO) Accumulate(io *IO) { atomic.AddUint64(&i.CharsRead, atomic.LoadUint64(&io.CharsRead)) atomic.AddUint64(&i.CharsWritten, atomic.LoadUint64(&io.CharsWritten)) atomic.AddUint64(&i.ReadSyscalls, atomic.LoadUint64(&io.ReadSyscalls)) atomic.AddUint64(&i.WriteSyscalls, atomic.LoadUint64(&io.WriteSyscalls)) atomic.AddUint64(&i.BytesRead, atomic.LoadUint64(&io.BytesRead)) atomic.AddUint64(&i.BytesWritten, atomic.LoadUint64(&io.BytesWritten)) atomic.AddUint64(&i.BytesWriteCancelled, atomic.LoadUint64(&io.BytesWriteCancelled)) }
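The counters above are updated with sync/atomic, and Accumulate merges them with a load-then-add: each individual field snapshot is atomic, though the struct as a whole is not read atomically. A minimal standalone sketch of the same pattern follows; the counters type and accumulate function here are illustrative stand-ins, not gVisor's usage.IO.

package main

import (
	"fmt"
	"sync/atomic"
)

// counters mirrors the shape of usage.IO for illustration only.
type counters struct {
	charsRead uint64
}

// accumulate adds a field-by-field atomic snapshot of src into dst, the
// same load-then-add pattern used by usage.IO.Accumulate.
func accumulate(dst, src *counters) {
	atomic.AddUint64(&dst.charsRead, atomic.LoadUint64(&src.charsRead))
}

func main() {
	var total, perTask counters
	atomic.AddUint64(&perTask.charsRead, 4096) // e.g. AccountReadSyscall(4096)
	accumulate(&total, &perTask)
	fmt.Println(atomic.LoadUint64(&total.charsRead)) // 4096
}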
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package auth import ( "math" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sync" ) // A UserNamespace represents a user namespace. See user_namespaces(7) for // details. // // +stateify savable type UserNamespace struct { // parent is this namespace's parent. If this is the root namespace, parent // is nil. The parent pointer is immutable. parent *UserNamespace // owner is the effective UID of the namespace's creator in the root // namespace. owner is immutable. owner KUID // mu protects the following fields. // // If mu will be locked in multiple UserNamespaces, it must be locked in // descendant namespaces before ancestors. mu sync.Mutex `state:"nosave"` // Mappings of user/group IDs between this namespace and its parent. // // All ID maps, once set, cannot be changed. This means that successful // UID/GID translations cannot be racy. uidMapFromParent idMapSet uidMapToParent idMapSet gidMapFromParent idMapSet gidMapToParent idMapSet // TODO(b/27454212): Support disabling setgroups(2). } // NewRootUserNamespace returns a UserNamespace that is appropriate for a // system's root user namespace. func NewRootUserNamespace() *UserNamespace { var ns UserNamespace // """ // The initial user namespace has no parent namespace, but, for // consistency, the kernel provides dummy user and group ID mapping files // for this namespace. Looking at the uid_map file (gid_map is the same) // from a shell in the initial namespace shows: // // $ cat /proc/$$/uid_map // 0 0 4294967295 // """ - user_namespaces(7) for _, m := range []*idMapSet{ &ns.uidMapFromParent, &ns.uidMapToParent, &ns.gidMapFromParent, &ns.gidMapToParent, } { if !m.Add(idMapRange{0, math.MaxUint32}, 0) { panic("Failed to insert into empty ID map") } } return &ns } // Root returns the root of the user namespace tree containing ns. func (ns *UserNamespace) Root() *UserNamespace { for ns.parent != nil { ns = ns.parent } return ns } // "The kernel imposes (since version 3.11) a limit of 32 nested levels of user // namespaces." - user_namespaces(7) const maxUserNamespaceDepth = 32 func (ns *UserNamespace) depth() int { var i int for ns != nil { i++ ns = ns.parent } return i } // NewChildUserNamespace returns a new user namespace created by a caller with // credentials c. func (c *Credentials) NewChildUserNamespace() (*UserNamespace, error) { if c.UserNamespace.depth() >= maxUserNamespaceDepth { // "... Calls to unshare(2) or clone(2) that would cause this limit to // be exceeded fail with the error EUSERS."
- user_namespaces(7) return nil, linuxerr.EUSERS } // "EPERM: CLONE_NEWUSER was specified in flags, but either the effective // user ID or the effective group ID of the caller does not have a mapping // in the parent namespace (see user_namespaces(7))." - clone(2) // "CLONE_NEWUSER requires that the user ID and group ID of the calling // process are mapped to user IDs and group IDs in the user namespace of // the calling process at the time of the call." - unshare(2) if !c.EffectiveKUID.In(c.UserNamespace).Ok() { return nil, linuxerr.EPERM } if !c.EffectiveKGID.In(c.UserNamespace).Ok() { return nil, linuxerr.EPERM } return &UserNamespace{ parent: c.UserNamespace, owner: c.EffectiveKUID, // "When a user namespace is created, it starts without a mapping of // user IDs (group IDs) to the parent user namespace." - // user_namespaces(7) }, nil }
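A standalone sketch of the depth-limit walk used by NewChildUserNamespace: parent pointers are followed to the root, and creation fails with EUSERS once 32 levels are nested. The ns type, newChild function, and error value below are illustrative stand-ins for the real auth types, under the same user_namespaces(7) limit.

package main

import (
	"errors"
	"fmt"
)

// ns is a stripped-down stand-in for auth.UserNamespace.
type ns struct{ parent *ns }

// depth counts namespaces from n up to the root, as UserNamespace.depth does.
func depth(n *ns) int {
	d := 0
	for n != nil {
		d++
		n = n.parent
	}
	return d
}

const maxDepth = 32 // "a limit of 32 nested levels" - user_namespaces(7)

func newChild(n *ns) (*ns, error) {
	if depth(n) >= maxDepth {
		return nil, errors.New("EUSERS: namespace nesting limit exceeded")
	}
	return &ns{parent: n}, nil
}

func main() {
	cur := &ns{} // root namespace, depth 1
	for {
		child, err := newChild(cur)
		if err != nil {
			fmt.Println("stopped at depth", depth(cur), "-", err) // depth 32
			return
		}
		cur = child
	}
}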
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package usermem import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/safemem" ) const maxInt = int(^uint(0) >> 1) // BytesIO implements IO using a byte slice. Addresses are interpreted as // offsets into the slice. Reads and writes beyond the end of the slice return // EFAULT. type BytesIO struct { Bytes []byte } // CopyOut implements IO.CopyOut. func (b *BytesIO) CopyOut(ctx context.Context, addr hostarch.Addr, src []byte, opts IOOpts) (int, error) { rngN, rngErr := b.rangeCheck(addr, len(src)) if rngN == 0 { return 0, rngErr } return copy(b.Bytes[int(addr):], src[:rngN]), rngErr } // CopyIn implements IO.CopyIn. func (b *BytesIO) CopyIn(ctx context.Context, addr hostarch.Addr, dst []byte, opts IOOpts) (int, error) { rngN, rngErr := b.rangeCheck(addr, len(dst)) if rngN == 0 { return 0, rngErr } return copy(dst[:rngN], b.Bytes[int(addr):]), rngErr } // ZeroOut implements IO.ZeroOut. func (b *BytesIO) ZeroOut(ctx context.Context, addr hostarch.Addr, toZero int64, opts IOOpts) (int64, error) { if toZero > int64(maxInt) { return 0, linuxerr.EINVAL } rngN, rngErr := b.rangeCheck(addr, int(toZero)) if rngN == 0 { return 0, rngErr } zeroSlice := b.Bytes[int(addr) : int(addr)+rngN] for i := range zeroSlice { zeroSlice[i] = 0 } return int64(rngN), rngErr } // CopyOutFrom implements IO.CopyOutFrom. func (b *BytesIO) CopyOutFrom(ctx context.Context, ars hostarch.AddrRangeSeq, src safemem.Reader, opts IOOpts) (int64, error) { dsts, rngErr := b.blocksFromAddrRanges(ars) n, err := src.ReadToBlocks(dsts) if err != nil { return int64(n), err } return int64(n), rngErr } // CopyInTo implements IO.CopyInTo.
func (b *BytesIO) CopyInTo(ctx context.Context, ars hostarch.AddrRangeSeq, dst safemem.Writer, opts IOOpts) (int64, error) { srcs, rngErr := b.blocksFromAddrRanges(ars) n, err := dst.WriteFromBlocks(srcs) if err != nil { return int64(n), err } return int64(n), rngErr } func (b *BytesIO) rangeCheck(addr hostarch.Addr, length int) (int, error) { if length == 0 { return 0, nil } if length < 0 { return 0, linuxerr.EINVAL } max := hostarch.Addr(len(b.Bytes)) if addr >= max { return 0, linuxerr.EFAULT } end, ok := addr.AddLength(uint64(length)) if !ok || end > max { return int(max - addr), linuxerr.EFAULT } return length, nil } func (b *BytesIO) blocksFromAddrRanges(ars hostarch.AddrRangeSeq) (safemem.BlockSeq, error) { switch ars.NumRanges() { case 0: return safemem.BlockSeq{}, nil case 1: block, err := b.blockFromAddrRange(ars.Head()) return safemem.BlockSeqOf(block), err default: blocks := make([]safemem.Block, 0, ars.NumRanges()) for !ars.IsEmpty() { block, err := b.blockFromAddrRange(ars.Head()) if block.Len() != 0 { blocks = append(blocks, block) } if err != nil { return safemem.BlockSeqFromSlice(blocks), err } ars = ars.Tail() } return safemem.BlockSeqFromSlice(blocks), nil } } func (b *BytesIO) blockFromAddrRange(ar hostarch.AddrRange) (safemem.Block, error) { n, err := b.rangeCheck(ar.Start, int(ar.Length())) if n == 0 { return safemem.Block{}, err } return safemem.BlockFromSafeSlice(b.Bytes[int(ar.Start) : int(ar.Start)+n]), err } // BytesIOSequence returns an IOSequence representing the given byte slice. func BytesIOSequence(buf []byte) IOSequence { return IOSequence{ IO: &BytesIO{buf}, Addrs: hostarch.AddrRangeSeqOf(hostarch.AddrRange{0, hostarch.Addr(len(buf))}), } }
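The rangeCheck/copy split above means a partially in-range access copies whatever fits and still reports EFAULT alongside the byte count. A self-contained sketch of that truncate-then-fault behavior follows; the bio type and copyOut method are toy stand-ins for BytesIO, not the real usermem API.

package main

import (
	"errors"
	"fmt"
)

var errFault = errors.New("EFAULT")

// bio mimics usermem.BytesIO: addresses are offsets into a byte slice, and
// accesses past the end are truncated and reported as a fault.
type bio struct{ bytes []byte }

func (b *bio) copyOut(addr int, src []byte) (int, error) {
	if addr >= len(b.bytes) {
		return 0, errFault
	}
	n := copy(b.bytes[addr:], src)
	if n < len(src) {
		return n, errFault // partial copy past the end, like rangeCheck
	}
	return n, nil
}

func main() {
	b := &bio{bytes: make([]byte, 4)}
	n, err := b.copyOut(2, []byte("abcd"))
	fmt.Println(n, err) // 2 EFAULT: only "ab" fits at offset 2
}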
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package devpts import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // replicaInode is the inode for the replica end of the Terminal. // // +stateify savable type replicaInode struct { implStatFS kernfs.InodeAttrs kernfs.InodeNoopRefCount kernfs.InodeNotDirectory kernfs.InodeNotSymlink locks vfs.FileLocks // root is the devpts root inode. root *rootInode // t is the connected Terminal. t *Terminal } var _ kernfs.Inode = (*replicaInode)(nil) // Open implements kernfs.Inode.Open. func (ri *replicaInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &replicaFileDescription{ inode: ri, } fd.LockFD.Init(&ri.locks) if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil { return nil, err } if opts.Flags&linux.O_NOCTTY == 0 { // Opening a replica sets the process' controlling TTY when // possible. An error indicates it cannot be set, and is // ignored silently. _ = fd.inode.t.setControllingTTY(ctx, false /* steal */, false /* isMaster */, fd.vfsfd.IsReadable()) } return &fd.vfsfd, nil } // Valid implements kernfs.Inode.Valid. func (ri *replicaInode) Valid(context.Context) bool { // Return valid if the replica still exists. ri.root.mu.Lock() defer ri.root.mu.Unlock() _, ok := ri.root.replicas[ri.t.n] return ok } // Stat implements kernfs.Inode.Stat.
func (ri *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { statx, err := ri.InodeAttrs.Stat(ctx, vfsfs, opts) if err != nil { return linux.Statx{}, err } statx.Blksize = 1024 statx.RdevMajor = linux.UNIX98_PTY_REPLICA_MAJOR statx.RdevMinor = ri.t.n return statx, nil } // SetStat implements kernfs.Inode.SetStat. func (ri *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { if opts.Stat.Mask&linux.STATX_SIZE != 0 { return linuxerr.EINVAL } return ri.InodeAttrs.SetStat(ctx, vfsfs, creds, opts) } // +stateify savable type replicaFileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.LockFD inode *replicaInode } var _ vfs.FileDescriptionImpl = (*replicaFileDescription)(nil) // Release implements vfs.FileDescriptionImpl.Release. func (rfd *replicaFileDescription) Release(ctx context.Context) {} // EventRegister implements waiter.Waitable.EventRegister. func (rfd *replicaFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { rfd.inode.t.ld.replicaWaiter.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. func (rfd *replicaFileDescription) EventUnregister(e *waiter.Entry) { rfd.inode.t.ld.replicaWaiter.EventUnregister(e) } // Readiness implements waiter.Waitable.Readiness. func (rfd *replicaFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return rfd.inode.t.ld.replicaReadiness() } // Read implements vfs.FileDescriptionImpl.Read. func (rfd *replicaFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) { return rfd.inode.t.ld.inputQueueRead(ctx, dst) } // Write implements vfs.FileDescriptionImpl.Write. func (rfd *replicaFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) { return rfd.inode.t.ld.outputQueueWrite(ctx, src) } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (rfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { t := kernel.TaskFromContext(ctx) if t == nil { // ioctl(2) may only be called from a task goroutine. return 0, linuxerr.ENOTTY } switch cmd := args[1].Uint(); cmd { case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ // Get the number of bytes in the input queue read buffer. return 0, rfd.inode.t.ld.inputQueueReadSize(t, io, args) case linux.TCGETS: return rfd.inode.t.ld.getTermios(t, args) case linux.TCSETS: return rfd.inode.t.ld.setTermios(t, args) case linux.TCSETSW: // TODO(b/29356795): This should drain the output queue first. return rfd.inode.t.ld.setTermios(t, args) case linux.TIOCGPTN: nP := primitive.Uint32(rfd.inode.t.n) _, err := nP.CopyOut(t, args[2].Pointer()) return 0, err case linux.TIOCGWINSZ: return 0, rfd.inode.t.ld.windowSize(t, args) case linux.TIOCSWINSZ: return 0, rfd.inode.t.ld.setWindowSize(t, args) case linux.TIOCSCTTY: // Make the given terminal the controlling terminal of the // calling process. steal := args[2].Int() == 1 return 0, rfd.inode.t.setControllingTTY(ctx, steal, false /* isMaster */, rfd.vfsfd.IsReadable()) case linux.TIOCNOTTY: // Release this process's controlling terminal. return 0, rfd.inode.t.releaseControllingTTY(ctx, false /* isMaster */) case linux.TIOCGPGRP: // Get the foreground process group. return rfd.inode.t.foregroundProcessGroup(ctx, args, false /* isMaster */) case linux.TIOCSPGRP: // Set the foreground process group.
return rfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */) default: maybeEmitUnimplementedEvent(ctx, cmd) return 0, linuxerr.ENOTTY } } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (rfd *replicaFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem() return rfd.inode.SetStat(ctx, fs, creds, opts) } // Stat implements vfs.FileDescriptionImpl.Stat. func (rfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem() return rfd.inode.Stat(ctx, fs, opts) }
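For reference, the TIOCGPTN case above returns the terminal's index (t.n), which is how userspace discovers the replica's path. A sketch of the usual userspace side follows, assuming golang.org/x/sys/unix is available; on Linux this ioctl is normally issued on the master end obtained from /dev/ptmx, whereas the code above also serves it for the replica.

//go:build linux

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	// Open the pty master; the kernel allocates a fresh terminal.
	fd, err := unix.Open("/dev/ptmx", unix.O_RDWR|unix.O_NOCTTY, 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fd)

	// TIOCGPTN retrieves the replica index, like the case above.
	n, err := unix.IoctlGetInt(fd, unix.TIOCGPTN)
	if err != nil {
		panic(err)
	}
	fmt.Printf("replica is /dev/pts/%d\n", n)
}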
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) // aioManager creates and manages asynchronous I/O contexts. // // +stateify savable type aioManager struct { // mu protects below. mu sync.Mutex `state:"nosave"` // aioContexts is the set of asynchronous I/O contexts. contexts map[uint64]*AIOContext } func (mm *MemoryManager) destroyAIOManager(ctx context.Context) { mm.aioManager.mu.Lock() defer mm.aioManager.mu.Unlock() for id := range mm.aioManager.contexts { mm.destroyAIOContextLocked(ctx, id) } } // newAIOContext creates a new context for asynchronous I/O. // // Returns false if 'id' is currently in use.
func (a *aioManager) newAIOContext(events uint32, id uint64) bool { a.mu.Lock() defer a.mu.Unlock() if _, ok := a.contexts[id]; ok { return false } a.contexts[id] = &AIOContext{ requestReady: make(chan struct{}, 1), maxOutstanding: events, } return true } // destroyAIOContextLocked destroys an asynchronous I/O context. It doesn't // wait for pending requests to complete. Returns the destroyed AIOContext so // it can be drained. // // Nil is returned if the context does not exist. // // Precondition: mm.aioManager.mu is locked. func (mm *MemoryManager) destroyAIOContextLocked(ctx context.Context, id uint64) *AIOContext { aioCtx, ok := mm.aioManager.contexts[id] if !ok { return nil } // Only unmap after it is assured that the address is a valid aio context to // prevent random memory from being unmapped. // // Note: It's possible to unmap this address and map something else into // the same address. Then it would be unmapping memory that it doesn't own. // This is, however, the way Linux implements AIO. Keeps the same [weird] // semantics in case anyone relies on it. mm.MUnmap(ctx, hostarch.Addr(id), aioRingBufferSize) delete(mm.aioManager.contexts, id) aioCtx.destroy() return aioCtx } // lookupAIOContext looks up the given context. // // Returns false if context does not exist. func (a *aioManager) lookupAIOContext(id uint64) (*AIOContext, bool) { a.mu.Lock() defer a.mu.Unlock() ctx, ok := a.contexts[id] return ctx, ok } // ioResult is a completed I/O operation. // // +stateify savable type ioResult struct { data interface{} ioEntry } // AIOContext is a single asynchronous I/O context. // // +stateify savable type AIOContext struct { // requestReady is the notification channel used for all requests. requestReady chan struct{} `state:"nosave"` // mu protects below. mu sync.Mutex `state:"nosave"` // results is the set of completed requests. results ioList // maxOutstanding is the maximum number of outstanding entries; this value // is immutable. maxOutstanding uint32 // outstanding is the number of requests outstanding; this will effectively // be the number of entries in the result list or that are expected to be // added to the result list. outstanding uint32 // dead is set when the context is destroyed. dead bool `state:"zerovalue"` } // destroy marks the context dead. func (ctx *AIOContext) destroy() { ctx.mu.Lock() defer ctx.mu.Unlock() ctx.dead = true ctx.checkForDone() } // Preconditions: ctx.mu must be held by caller. func (ctx *AIOContext) checkForDone() { if ctx.dead && ctx.outstanding == 0 { close(ctx.requestReady) ctx.requestReady = nil } } // Prepare reserves space for a new request, returning nil if available. // Returns EAGAIN if the context is busy and EINVAL if the context is dead. func (ctx *AIOContext) Prepare() error { ctx.mu.Lock() defer ctx.mu.Unlock() if ctx.dead { // Context died after the caller looked it up. return linuxerr.EINVAL } if ctx.outstanding >= ctx.maxOutstanding { // Context is busy. return linuxerr.EAGAIN } ctx.outstanding++ return nil } // PopRequest pops a completed request if available; this function does not do // any blocking. Returns false if no request is available. func (ctx *AIOContext) PopRequest() (interface{}, bool) { ctx.mu.Lock() defer ctx.mu.Unlock() // Is there anything ready? if e := ctx.results.Front(); e != nil { if ctx.outstanding == 0 { panic("AIOContext outstanding is going negative") } ctx.outstanding-- ctx.results.Remove(e) ctx.checkForDone() return e.data, true } return nil, false } // FinishRequest finishes a pending request.
It queues up the data // and notifies listeners. func (ctx *AIOContext) FinishRequest(data interface{}) { ctx.mu.Lock() defer ctx.mu.Unlock() // Push to the list and notify opportunistically. The channel notify // here is guaranteed to be safe because outstanding must be non-zero. // The requestReady channel is only closed when outstanding reaches zero. ctx.results.PushBack(&ioResult{data: data}) select { case ctx.requestReady <- struct{}{}: default: } } // WaitChannel returns a channel that is notified when an AIO request is // completed. Returns nil if the context is destroyed and there are no more // outstanding requests. func (ctx *AIOContext) WaitChannel() chan struct{} { ctx.mu.Lock() defer ctx.mu.Unlock() return ctx.requestReady } // Dead returns true if the context has been destroyed. func (ctx *AIOContext) Dead() bool { ctx.mu.Lock() defer ctx.mu.Unlock() return ctx.dead } // CancelPendingRequest forgets about a request that hasn't yet completed. func (ctx *AIOContext) CancelPendingRequest() { ctx.mu.Lock() defer ctx.mu.Unlock() if ctx.outstanding == 0 { panic("AIOContext outstanding is going negative") } ctx.outstanding-- ctx.checkForDone() } // Drain drops all completed requests. Pending requests remain untouched. func (ctx *AIOContext) Drain() { ctx.mu.Lock() defer ctx.mu.Unlock() if ctx.outstanding == 0 { return } size := uint32(ctx.results.Len()) if ctx.outstanding < size { panic("AIOContext outstanding is going negative") } ctx.outstanding -= size ctx.results.Reset() ctx.checkForDone() } // aioMappable implements memmap.MappingIdentity and memmap.Mappable for AIO // ring buffers. // // +stateify savable type aioMappable struct { aioMappableRefs mfp pgalloc.MemoryFileProvider fr memmap.FileRange } var aioRingBufferSize = uint64(hostarch.Addr(linux.AIORingSize).MustRoundUp()) func newAIOMappable(mfp pgalloc.MemoryFileProvider) (*aioMappable, error) { fr, err := mfp.MemoryFile().Allocate(aioRingBufferSize, usage.Anonymous) if err != nil { return nil, err } m := aioMappable{mfp: mfp, fr: fr} m.InitRefs() return &m, nil } // DecRef implements refs.RefCounter.DecRef. func (m *aioMappable) DecRef(ctx context.Context) { m.aioMappableRefs.DecRef(func() { m.mfp.MemoryFile().DecRef(m.fr) }) } // MappedName implements memmap.MappingIdentity.MappedName. func (m *aioMappable) MappedName(ctx context.Context) string { return "[aio]" } // DeviceID implements memmap.MappingIdentity.DeviceID. func (m *aioMappable) DeviceID() uint64 { return 0 } // InodeID implements memmap.MappingIdentity.InodeID. func (m *aioMappable) InodeID() uint64 { return 0 } // Msync implements memmap.MappingIdentity.Msync. func (m *aioMappable) Msync(ctx context.Context, mr memmap.MappableRange) error { // Linux: aio_ring_fops.fsync == NULL return linuxerr.EINVAL } // AddMapping implements memmap.Mappable.AddMapping. func (m *aioMappable) AddMapping(_ context.Context, _ memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, _ bool) error { // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() // sets VM_DONTEXPAND). if offset != 0 || uint64(ar.Length()) != aioRingBufferSize { return linuxerr.EFAULT } return nil } // RemoveMapping implements memmap.Mappable.RemoveMapping. func (m *aioMappable) RemoveMapping(context.Context, memmap.MappingSpace, hostarch.AddrRange, uint64, bool) { } // CopyMapping implements memmap.Mappable.CopyMapping. 
func (m *aioMappable) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, _ bool) error { // Don't allow mappings to be expanded (in Linux, fs/aio.c:aio_ring_mmap() // sets VM_DONTEXPAND). if offset != 0 || uint64(dstAR.Length()) != aioRingBufferSize { return linuxerr.EFAULT } // Require that the mapping correspond to a live AIOContext. Compare // Linux's fs/aio.c:aio_ring_mremap(). mm, ok := ms.(*MemoryManager) if !ok { return linuxerr.EINVAL } am := &mm.aioManager am.mu.Lock() defer am.mu.Unlock() oldID := uint64(srcAR.Start) aioCtx, ok := am.contexts[oldID] if !ok { return linuxerr.EINVAL } aioCtx.mu.Lock() defer aioCtx.mu.Unlock() if aioCtx.dead { return linuxerr.EINVAL } // Use the new ID for the AIOContext. am.contexts[uint64(dstAR.Start)] = aioCtx delete(am.contexts, oldID) return nil } // Translate implements memmap.Mappable.Translate. func (m *aioMappable) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) { var err error if required.End > m.fr.Length() { err = &memmap.BusError{linuxerr.EFAULT} } if source := optional.Intersect(memmap.MappableRange{0, m.fr.Length()}); source.Length() != 0 { return []memmap.Translation{ { Source: source, File: m.mfp.MemoryFile(), Offset: m.fr.Start + source.Start, Perms: hostarch.AnyAccess, }, }, err } return nil, err } // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable. func (m *aioMappable) InvalidateUnsavable(ctx context.Context) error { return nil } // NewAIOContext creates a new context for asynchronous I/O. // // NewAIOContext is analogous to Linux's fs/aio.c:ioctx_alloc(). func (mm *MemoryManager) NewAIOContext(ctx context.Context, events uint32) (uint64, error) { // libaio get_ioevents() expects context "handle" to be a valid address. // libaio peeks inside looking for a magic number. This function allocates // a page per context and keeps it set to zeroes to ensure it will not // match AIO_RING_MAGIC and make libaio happy. m, err := newAIOMappable(mm.mfp) if err != nil { return 0, err } defer m.DecRef(ctx) addr, err := mm.MMap(ctx, memmap.MMapOpts{ Length: aioRingBufferSize, MappingIdentity: m, Mappable: m, // Linux uses "do_mmap_pgoff(..., PROT_READ | PROT_WRITE, ...)" in // fs/aio.c:aio_setup_ring(). Since we don't implement AIO_RING_MAGIC, // user mode should not write to this page. Perms: hostarch.Read, MaxPerms: hostarch.Read, }) if err != nil { return 0, err } id := uint64(addr) if !mm.aioManager.newAIOContext(events, id) { mm.MUnmap(ctx, addr, aioRingBufferSize) return 0, linuxerr.EINVAL } return id, nil } // DestroyAIOContext destroys an asynchronous I/O context. It returns the // destroyed context, or nil if the context does not exist. func (mm *MemoryManager) DestroyAIOContext(ctx context.Context, id uint64) *AIOContext { if !mm.isValidAddr(ctx, id) { return nil } mm.aioManager.mu.Lock() defer mm.aioManager.mu.Unlock() return mm.destroyAIOContextLocked(ctx, id) } // LookupAIOContext looks up the given context. It returns false if the context // does not exist. func (mm *MemoryManager) LookupAIOContext(ctx context.Context, id uint64) (*AIOContext, bool) { aioCtx, ok := mm.aioManager.lookupAIOContext(id) if !ok { return nil, false } // Protect against 'id' that is inaccessible. if !mm.isValidAddr(ctx, id) { return nil, false } return aioCtx, true } // isValidAddr determines if the address `id` is valid. (Linux also reads 4 // bytes from id).
func (mm *MemoryManager) isValidAddr(ctx context.Context, id uint64) bool { var buf [4]byte _, err := mm.CopyIn(ctx, hostarch.Addr(id), buf[:], usermem.IOOpts{}) return err == nil }
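NewAIOContext returns the address of the read-only ring mapping as the context id, which is what the io_setup(2) syscall hands back to userspace. A sketch of that round trip via raw syscalls follows; it is Linux-only, assumes golang.org/x/sys/unix, and uses an nr_events value of 8 purely for illustration.

//go:build linux

package main

import (
	"fmt"
	"unsafe"

	"golang.org/x/sys/unix"
)

func main() {
	var id uint64 // aio_context_t; filled in by the kernel
	// io_setup(nr_events, &id): in gVisor, id ends up being the address
	// of the AIO ring mapping created by NewAIOContext above.
	if _, _, errno := unix.Syscall(unix.SYS_IO_SETUP, 8, uintptr(unsafe.Pointer(&id)), 0); errno != 0 {
		panic(errno)
	}
	fmt.Printf("aio context id: %#x\n", id)
	// io_destroy(id) tears the context down, like DestroyAIOContext.
	if _, _, errno := unix.Syscall(unix.SYS_IO_DESTROY, uintptr(id), 0, 0); errno != 0 {
		panic(errno)
	}
}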
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package overlay provides an overlay filesystem implementation, which // synthesizes a filesystem by composing one or more immutable filesystems // ("lower layers") with an optional mutable filesystem ("upper layer"). // // Lock order: // // directoryFD.mu / regularFileFD.mu // filesystem.renameMu // dentry.dirMu // dentry.copyMu // filesystem.devMu // *** "memmap.Mappable locks" below this point // dentry.mapsMu // *** "memmap.Mappable locks taken by Translate" below this point // dentry.dataMu // // Locking dentry.dirMu in multiple dentries requires that parent dentries are // locked before child dentries, and that filesystem.renameMu is locked to // stabilize this relationship. package overlay import ( "fmt" "strings" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // Name is the default filesystem name. const Name = "overlay" // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct{} // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } // Release implements FilesystemType.Release. func (FilesystemType) Release(ctx context.Context) {} // FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to // FilesystemType.GetFilesystem. // // +stateify savable type FilesystemOptions struct { // Callers passing FilesystemOptions to // overlay.FilesystemType.GetFilesystem() are responsible for ensuring that // the vfs.Mounts comprising the layers of the overlay filesystem do not // contain submounts. // If UpperRoot.Ok(), it is the root of the writable upper layer of the // overlay. UpperRoot vfs.VirtualDentry // LowerRoots contains the roots of the immutable lower layers of the // overlay. LowerRoots is immutable. LowerRoots []vfs.VirtualDentry } // filesystem implements vfs.FilesystemImpl. // // +stateify savable type filesystem struct { vfsfs vfs.Filesystem // Immutable options. opts FilesystemOptions // creds is a copy of the filesystem's creator's credentials, which are // used for accesses to the filesystem's layers. creds is immutable. creds *auth.Credentials // privateDevMinors maps device numbers from layer filesystems to device // minor numbers assigned to files originating from that filesystem. // // For non-directory files, this remapping is necessary for lower layers // because a file on a lower layer, and that same file on an overlay, are // distinguishable because they will diverge after copy-up. (Once a // non-directory file has been copied up, its contents on the upper layer // completely determine its contents in the overlay, so this is no longer // true; but we still do the mapping for consistency.) 
// // For directories, this remapping may be necessary even if the directory // exists on the upper layer due to directory merging; rather than make the // mapping conditional on whether the directory is opaque, we again // apply the mapping unconditionally. // // privateDevMinors is protected by devMu. devMu sync.Mutex `state:"nosave"` privateDevMinors map[layerDevNumber]uint32 // renameMu synchronizes renaming with non-renaming operations in order to // ensure consistent lock ordering between dentry.dirMu in different // dentries. renameMu sync.RWMutex `state:"nosave"` } // +stateify savable type layerDevNumber struct { major uint32 minor uint32 } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { mopts := vfs.GenericParseMountOptions(opts.Data) fsoptsRaw := opts.InternalData fsopts, ok := fsoptsRaw.(FilesystemOptions) if fsoptsRaw != nil && !ok { ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw) return nil, nil, linuxerr.EINVAL } vfsroot := vfs.RootFromContext(ctx) if vfsroot.Ok() { defer vfsroot.DecRef(ctx) } if upperPathname, ok := mopts["upperdir"]; ok { if fsopts.UpperRoot.Ok() { ctx.Infof("overlay.FilesystemType.GetFilesystem: both upperdir and FilesystemOptions.UpperRoot are specified") return nil, nil, linuxerr.EINVAL } delete(mopts, "upperdir") // Linux overlayfs also requires a workdir when upperdir is // specified; we don't, so silently ignore this option. delete(mopts, "workdir") upperPath := fspath.Parse(upperPathname) if !upperPath.Absolute { ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname) return nil, nil, linuxerr.EINVAL } upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ Root: vfsroot, Start: vfsroot, Path: upperPath, FollowFinalSymlink: true, }, &vfs.GetDentryOptions{ CheckSearchable: true, }) if err != nil { ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err) return nil, nil, err } privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */) upperRoot.DecRef(ctx) if err != nil { ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err) return nil, nil, err } defer privateUpperRoot.DecRef(ctx) fsopts.UpperRoot = privateUpperRoot } if lowerPathnamesStr, ok := mopts["lowerdir"]; ok { if len(fsopts.LowerRoots) != 0 { ctx.Infof("overlay.FilesystemType.GetFilesystem: both lowerdir and FilesystemOptions.LowerRoots are specified") return nil, nil, linuxerr.EINVAL } delete(mopts, "lowerdir") lowerPathnames := strings.Split(lowerPathnamesStr, ":") for _, lowerPathname := range lowerPathnames { lowerPath := fspath.Parse(lowerPathname) if !lowerPath.Absolute { ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname) return nil, nil, linuxerr.EINVAL } lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{ Root: vfsroot, Start: vfsroot, Path: lowerPath, FollowFinalSymlink: true, }, &vfs.GetDentryOptions{ CheckSearchable: true, }) if err != nil { ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err) return nil, nil, err } privateLowerRoot, err :=
clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */) lowerRoot.DecRef(ctx) if err != nil { ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err) return nil, nil, err } defer privateLowerRoot.DecRef(ctx) fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot) } } if len(mopts) != 0 { ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts) return nil, nil, linuxerr.EINVAL } if len(fsopts.LowerRoots) == 0 { ctx.Infof("overlay.FilesystemType.GetFilesystem: at least one lower layer is required") return nil, nil, linuxerr.EINVAL } if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() { ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lower layers are required when no upper layer is present") return nil, nil, linuxerr.EINVAL } const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK if len(fsopts.LowerRoots) > maxLowerLayers { ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lower layers specified, maximum %d", len(fsopts.LowerRoots), maxLowerLayers) return nil, nil, linuxerr.EINVAL } // Take extra references held by the filesystem. if fsopts.UpperRoot.Ok() { fsopts.UpperRoot.IncRef() } for _, lowerRoot := range fsopts.LowerRoots { lowerRoot.IncRef() } fs := &filesystem{ opts: fsopts, creds: creds.Fork(), privateDevMinors: make(map[layerDevNumber]uint32), } fs.vfsfs.Init(vfsObj, &fstype, fs) // Construct the root dentry. root := fs.newDentry() root.refs = 1 if fs.opts.UpperRoot.Ok() { fs.opts.UpperRoot.IncRef() root.copiedUp = 1 root.upperVD = fs.opts.UpperRoot } for _, lowerRoot := range fs.opts.LowerRoots { lowerRoot.IncRef() root.lowerVDs = append(root.lowerVDs, lowerRoot) } rootTopVD := root.topLayer() // Get metadata from the topmost layer. See fs.lookupLocked(). const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{ Root: rootTopVD, Start: rootTopVD, }, &vfs.StatOptions{ Mask: rootStatMask, }) if err != nil { root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) return nil, nil, err } if rootStat.Mask&rootStatMask != rootStatMask { root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) return nil, nil, linuxerr.EREMOTE } if isWhiteout(&rootStat) { ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout") root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) return nil, nil, linuxerr.EINVAL } root.mode = uint32(rootStat.Mode) root.uid = rootStat.UID root.gid = rootStat.GID root.devMajor = linux.UNNAMED_MAJOR rootDevMinor, err := fs.getPrivateDevMinor(rootStat.DevMajor, rootStat.DevMinor) if err != nil { ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err) root.destroyLocked(ctx) fs.vfsfs.DecRef(ctx) return nil, nil, err } root.devMinor = rootDevMinor root.ino = rootStat.Ino return &fs.vfsfs, &root.vfsd, nil } // clonePrivateMount creates a non-recursive bind mount rooted at vd, not // associated with any MountNamespace, and returns the root of the new mount. // (This is required to ensure that each layer of an overlay comprises only a // single mount, and therefore can't cross into e.g. the overlay filesystem // itself, risking lock recursion.) A reference is held on the returned // VirtualDentry. 
func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) { oldmnt := vd.Mount() opts := oldmnt.Options() if forceReadOnly { opts.ReadOnly = true } newmnt, err := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts) if err != nil { return vfs.VirtualDentry{}, err } // Take a reference on the dentry which will be owned by the returned // VirtualDentry. d := vd.Dentry() d.IncRef() return vfs.MakeVirtualDentry(newmnt, d), nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { vfsObj := fs.vfsfs.VirtualFilesystem() for _, devMinor := range fs.privateDevMinors { vfsObj.PutAnonBlockDevMinor(devMinor) } if fs.opts.UpperRoot.Ok() { fs.opts.UpperRoot.DecRef(ctx) } for _, lowerRoot := range fs.opts.LowerRoots { lowerRoot.DecRef(ctx) } } func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) { // Always statfs the root of the topmost layer. Compare Linux's // fs/overlayfs/super.c:ovl_statfs(). var rootVD vfs.VirtualDentry if fs.opts.UpperRoot.Ok() { rootVD = fs.opts.UpperRoot } else { rootVD = fs.opts.LowerRoots[0] } fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{ Root: rootVD, Start: rootVD, }) if err != nil { return linux.Statfs{}, err } fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC return fsstat, nil } func (fs *filesystem) getPrivateDevMinor(layerMajor, layerMinor uint32) (uint32, error) { fs.devMu.Lock() defer fs.devMu.Unlock() orig := layerDevNumber{layerMajor, layerMinor} if minor, ok := fs.privateDevMinors[orig]; ok { return minor, nil } minor, err := fs.vfsfs.VirtualFilesystem().GetAnonBlockDevMinor() if err != nil { return 0, err } fs.privateDevMinors[orig] = minor return minor, nil } // dentry implements vfs.DentryImpl. // // +stateify savable type dentry struct { vfsd vfs.Dentry refs int64 // fs is the owning filesystem. fs is immutable. fs *filesystem // mode, uid, and gid are the file mode, owner, and group of the file in // the topmost layer (and therefore the overlay file as well), and are used // for permission checks on this dentry. These fields are protected by // copyMu and accessed using atomic memory operations. mode uint32 uid uint32 gid uint32 // copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and // 0 otherwise. copiedUp is accessed using atomic memory operations. copiedUp uint32 // parent is the dentry corresponding to this dentry's parent directory. // name is this dentry's name in parent. If this dentry is a filesystem // root, parent is nil and name is the empty string. parent and name are // protected by fs.renameMu. parent *dentry name string // If this dentry represents a directory, children maps the names of // children for which dentries have been instantiated to those dentries, // and dirents (if not nil) is a cache of dirents as returned by // directoryFDs representing this directory. children is protected by // dirMu. dirMu sync.Mutex `state:"nosave"` children map[string]*dentry dirents []vfs.Dirent // upperVD and lowerVDs are the files from the overlay filesystem's layers // that comprise the file on the overlay filesystem. // // If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e. // be copied up) with copyMu locked for writing; otherwise, it is // immutable. lowerVDs is always immutable. 
copyMu sync.RWMutex `state:"nosave"` upperVD vfs.VirtualDentry lowerVDs []vfs.VirtualDentry // inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <= // len(inlineLowerVDs). inlineLowerVDs [1]vfs.VirtualDentry // devMajor, devMinor, and ino are the device major/minor and inode numbers // used by this dentry. These fields are protected by copyMu and accessed // using atomic memory operations. devMajor uint32 devMinor uint32 ino uint64 // If this dentry represents a regular file, then: // // - mapsMu is used to synchronize between copy-up and memmap.Mappable // methods on dentry preceding mm.MemoryManager.activeMu in the lock order. // // - dataMu is used to synchronize between copy-up and // dentry.(memmap.Mappable).Translate. // // - lowerMappings tracks memory mappings of the file. lowerMappings is // used to invalidate mappings of the lower layer when the file is copied // up to ensure that they remain coherent with subsequent writes to the // file. (Note that, as of this writing, Linux overlayfs does not do this; // this feature is a gVisor extension.) lowerMappings is protected by // mapsMu. // // - If this dentry is copied-up, then wrappedMappable is the Mappable // obtained from a call to the current top layer's // FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil // (from a call to regularFileFD.ensureMappable()), it cannot become nil. // wrappedMappable is protected by mapsMu and dataMu. // // - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is // accessed using atomic memory operations. mapsMu sync.Mutex `state:"nosave"` lowerMappings memmap.MappingSet dataMu sync.RWMutex `state:"nosave"` wrappedMappable memmap.Mappable isMappable uint32 locks vfs.FileLocks // watches is the set of inotify watches on the file represented by this dentry. // // Note that hard links to the same file will not share the same set of // watches, due to the fact that we do not have inode structures in this // overlay implementation. watches vfs.Watches } // newDentry creates a new dentry. The dentry initially has no references; it // is the caller's responsibility to set the dentry's reference count and/or // call dentry.destroy() as appropriate. The dentry is initially invalid in // that it contains no layers; the caller is responsible for setting them. func (fs *filesystem) newDentry() *dentry { d := &dentry{ fs: fs, } d.lowerVDs = d.inlineLowerVDs[:0] d.vfsd.Init(d) refsvfs2.Register(d) return d } // IncRef implements vfs.DentryImpl.IncRef. func (d *dentry) IncRef() { // d.refs may be 0 if d.fs.renameMu is locked, which serializes against // d.checkDropLocked(). r := atomic.AddInt64(&d.refs, 1) if d.LogRefs() { refsvfs2.LogIncRef(d, r) } } // TryIncRef implements vfs.DentryImpl.TryIncRef. func (d *dentry) TryIncRef() bool { for { r := atomic.LoadInt64(&d.refs) if r <= 0 { return false } if atomic.CompareAndSwapInt64(&d.refs, r, r+1) { if d.LogRefs() { refsvfs2.LogTryIncRef(d, r+1) } return true } } } // DecRef implements vfs.DentryImpl.DecRef.
func (d *dentry) DecRef(ctx context.Context) { r := atomic.AddInt64(&d.refs, -1) if d.LogRefs() { refsvfs2.LogDecRef(d, r) } if r == 0 { d.fs.renameMu.Lock() d.checkDropLocked(ctx) d.fs.renameMu.Unlock() } else if r < 0 { panic("overlay.dentry.DecRef() called without holding a reference") } } func (d *dentry) decRefLocked(ctx context.Context) { r := atomic.AddInt64(&d.refs, -1) if d.LogRefs() { refsvfs2.LogDecRef(d, r) } if r == 0 { d.checkDropLocked(ctx) } else if r < 0 { panic("overlay.dentry.decRefLocked() called without holding a reference") } } // checkDropLocked should be called after d's reference count becomes 0 or it // becomes deleted. // // Preconditions: d.fs.renameMu must be locked for writing. func (d *dentry) checkDropLocked(ctx context.Context) { // Dentries with a positive reference count must be retained. (The only way // to obtain a reference on a dentry with zero references is via path // resolution, which requires renameMu, so if d.refs is zero then it will // remain zero while we hold renameMu for writing.) Dentries with a // negative reference count have already been destroyed. if atomic.LoadInt64(&d.refs) != 0 { return } // Make sure that we do not lose watches on dentries that have not been // deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so // d.vfsd.IsDead() indicates that d was deleted. if !d.vfsd.IsDead() && d.watches.Size() > 0 { return } // Refs is still zero; destroy it. d.destroyLocked(ctx) return } // destroyLocked destroys the dentry. // // Preconditions: // * d.fs.renameMu must be locked for writing. // * d.refs == 0. func (d *dentry) destroyLocked(ctx context.Context) { switch atomic.LoadInt64(&d.refs) { case 0: // Mark the dentry destroyed. atomic.StoreInt64(&d.refs, -1) case -1: panic("overlay.dentry.destroyLocked() called on already destroyed dentry") default: panic("overlay.dentry.destroyLocked() called with references on the dentry") } if d.upperVD.Ok() { d.upperVD.DecRef(ctx) } for _, lowerVD := range d.lowerVDs { lowerVD.DecRef(ctx) } d.watches.HandleDeletion(ctx) if d.parent != nil { d.parent.dirMu.Lock() if !d.vfsd.IsDead() { delete(d.parent.children, d.name) } d.parent.dirMu.Unlock() // Drop the reference held by d on its parent without recursively // locking d.fs.renameMu. d.parent.decRefLocked(ctx) } refsvfs2.Unregister(d) } // RefType implements refsvfs2.CheckedObject.Type. func (d *dentry) RefType() string { return "overlay.dentry" } // LeakMessage implements refsvfs2.CheckedObject.LeakMessage. func (d *dentry) LeakMessage() string { return fmt.Sprintf("[overlay.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs)) } // LogRefs implements refsvfs2.CheckedObject.LogRefs. // // This should only be set to true for debugging purposes, as it can generate an // extremely large amount of output and drastically degrade performance. func (d *dentry) LogRefs() bool { return false } // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) { if d.isDir() { events |= linux.IN_ISDIR } // overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates // that d was deleted. deleted := d.vfsd.IsDead() d.fs.renameMu.RLock() // The ordering below is important, Linux always notifies the parent first. 
if d.parent != nil { d.parent.watches.Notify(ctx, d.name, events, cookie, et, deleted) } d.watches.Notify(ctx, "", events, cookie, et, deleted) d.fs.renameMu.RUnlock() } // Watches implements vfs.DentryImpl.Watches. func (d *dentry) Watches() *vfs.Watches { return &d.watches } // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. func (d *dentry) OnZeroWatches(ctx context.Context) { if atomic.LoadInt64(&d.refs) == 0 { d.fs.renameMu.Lock() d.checkDropLocked(ctx) d.fs.renameMu.Unlock() } } // iterLayers invokes yield on each layer comprising d, from top to bottom. If // any call to yield returns false, iterLayers stops iteration. func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) { if d.isCopiedUp() { if !yield(d.upperVD, true) { return } } for _, lowerVD := range d.lowerVDs { if !yield(lowerVD, false) { return } } } func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) { if d.isCopiedUp() { return d.upperVD, true } return d.lowerVDs[0], false } func (d *dentry) topLayer() vfs.VirtualDentry { vd, _ := d.topLayerInfo() return vd } func (d *dentry) topLookupLayer() lookupLayer { if d.upperVD.Ok() { return lookupLayerUpper } return lookupLayerLower } func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))) } func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error { mode := linux.FileMode(atomic.LoadUint32(&d.mode)) kuid := auth.KUID(atomic.LoadUint32(&d.uid)) kgid := auth.KGID(atomic.LoadUint32(&d.gid)) if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil { return err } return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name) } // statInternalMask is the set of stat fields that is set by // dentry.statInternalTo(). const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO // statInternalTo writes fields to stat that are stored in d, and therefore do // not require invoking StatAt on the overlay's layers. func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) { stat.Mask |= statInternalMask if d.isDir() { // Linux sets nlink to 1 for merged directories // (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is // correct more often ("." and the directory's entry in its parent), // and some of our tests expect this. stat.Nlink = 2 } stat.UID = atomic.LoadUint32(&d.uid) stat.GID = atomic.LoadUint32(&d.gid) stat.Mode = uint16(atomic.LoadUint32(&d.mode)) stat.Ino = atomic.LoadUint64(&d.ino) stat.DevMajor = atomic.LoadUint32(&d.devMajor) stat.DevMinor = atomic.LoadUint32(&d.devMinor) } // Preconditions: d.copyMu must be locked for writing.
func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) {
	if opts.Stat.Mask&linux.STATX_MODE != 0 {
		atomic.StoreUint32(&d.mode, (d.mode&linux.S_IFMT)|uint32(opts.Stat.Mode&^linux.S_IFMT))
	}
	if opts.Stat.Mask&linux.STATX_UID != 0 {
		atomic.StoreUint32(&d.uid, opts.Stat.UID)
	}
	if opts.Stat.Mask&linux.STATX_GID != 0 {
		atomic.StoreUint32(&d.gid, opts.Stat.GID)
	}
}

func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
	return vfs.CheckDeleteSticky(
		creds,
		linux.FileMode(atomic.LoadUint32(&d.mode)),
		auth.KUID(atomic.LoadUint32(&d.uid)),
		auth.KUID(atomic.LoadUint32(&child.uid)),
		auth.KGID(atomic.LoadUint32(&child.gid)),
	)
}

// newChildOwnerStat returns a Statx for configuring the UID, GID, and mode of
// children.
func (d *dentry) newChildOwnerStat(mode linux.FileMode, creds *auth.Credentials) linux.Statx {
	stat := linux.Statx{
		Mask: uint32(linux.STATX_UID | linux.STATX_GID),
		UID:  uint32(creds.EffectiveKUID),
		GID:  uint32(creds.EffectiveKGID),
	}
	// Set GID and possibly the SGID bit if the parent is an SGID directory.
	d.copyMu.RLock()
	defer d.copyMu.RUnlock()
	if atomic.LoadUint32(&d.mode)&linux.ModeSetGID == linux.ModeSetGID {
		stat.GID = atomic.LoadUint32(&d.gid)
		if mode&linux.ModeDirectory == linux.ModeDirectory {
			stat.Mode = uint16(mode) | linux.ModeSetGID
			stat.Mask |= linux.STATX_MODE
		}
	}
	return stat
}

// fileDescription is embedded by overlay implementations of
// vfs.FileDescriptionImpl.
//
// +stateify savable
type fileDescription struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD
}

func (fd *fileDescription) filesystem() *filesystem {
	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
}

func (fd *fileDescription) dentry() *dentry {
	return fd.vfsfd.Dentry().Impl().(*dentry)
}

// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
	return fd.filesystem().listXattr(ctx, fd.dentry(), size)
}

// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
	return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts)
}

// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
	fs := fd.filesystem()
	d := fd.dentry()

	fs.renameMu.RLock()
	err := fs.setXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts)
	fs.renameMu.RUnlock()
	if err != nil {
		return err
	}

	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
	return nil
}

// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
	fs := fd.filesystem()
	d := fd.dentry()

	fs.renameMu.RLock()
	err := fs.removeXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name)
	fs.renameMu.RUnlock()
	if err != nil {
		return err
	}

	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
	return nil
}
// Copyright 2019 The gVisor Authors.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package sync

import (
	"sync/atomic"
)

// SeqCount is a synchronization primitive for optimistic reader/writer
// synchronization in cases where readers can work with stale data and
// therefore do not need to block writers.
//
// Compared to sync/atomic.Value:
//
// - Mutation of SeqCount-protected data does not require memory allocation,
// whereas atomic.Value generally does. This is a significant advantage when
// writes are common.
//
// - Atomic reads of SeqCount-protected data require copying. This is a
// disadvantage when atomic reads are common.
//
// - SeqCount may be more flexible: correct use of SeqCount.ReadOk allows other
// operations to be made atomic with reads of SeqCount-protected data.
//
// - SeqCount is more cumbersome to use; atomic reads of SeqCount-protected
// data require instantiating function templates using go_generics (see
// seqatomic.go).
type SeqCount struct {
	// epoch is incremented by BeginWrite and EndWrite, such that epoch is odd
	// if a writer critical section is active, and a read from data protected
	// by this SeqCount is atomic iff epoch is the same even value before and
	// after the read.
	epoch uint32
}

// SeqCountEpoch tracks writer critical sections in a SeqCount.
type SeqCountEpoch uint32

// We assume that:
//
// - All functions in sync/atomic that perform a memory read are at least a
// read fence: memory reads before calls to such functions cannot be reordered
// after the call, and memory reads after calls to such functions cannot be
// reordered before the call, even if those reads do not use sync/atomic.
//
// - All functions in sync/atomic that perform a memory write are at least a
// write fence: memory writes before calls to such functions cannot be
// reordered after the call, and memory writes after calls to such functions
// cannot be reordered before the call, even if those writes do not use
// sync/atomic.
//
// As of this writing, the Go memory model completely fails to describe
// sync/atomic, but these properties are implied by
// https://groups.google.com/forum/#!topic/golang-nuts/7EnEhM3U7B8.

// BeginRead indicates the beginning of a reader critical section. Reader
// critical sections DO NOT BLOCK writer critical sections, so operations in a
// reader critical section MAY RACE with writer critical sections. Races are
// detected by ReadOk at the end of the reader critical section. Thus, the
// low-level structure of readers is generally:
//
//	for {
//		epoch := seq.BeginRead()
//		// do something idempotent with seq-protected data
//		if seq.ReadOk(epoch) {
//			break
//		}
//	}
//
// However, since reader critical sections may race with writer critical
// sections, the Go race detector will (accurately) flag data races in readers
// using this pattern. Most users of SeqCount will need to use the
// SeqAtomicLoad function template in seqatomic.go.
func (s *SeqCount) BeginRead() SeqCountEpoch {
	if epoch := atomic.LoadUint32(&s.epoch); epoch&1 == 0 {
		return SeqCountEpoch(epoch)
	}
	return s.beginReadSlow()
}

func (s *SeqCount) beginReadSlow() SeqCountEpoch {
	i := 0
	for {
		if canSpin(i) {
			i++
			doSpin()
		} else {
			goyield()
		}
		if epoch := atomic.LoadUint32(&s.epoch); epoch&1 == 0 {
			return SeqCountEpoch(epoch)
		}
	}
}

// ReadOk returns true if the reader critical section initiated by a previous
// call to BeginRead() that returned epoch did not race with any writer
// critical sections.
//
// ReadOk may be called any number of times during a reader critical section.
// Reader critical sections do not need to be explicitly terminated; the last
// call to ReadOk is implicitly the end of the reader critical section.
func (s *SeqCount) ReadOk(epoch SeqCountEpoch) bool {
	return atomic.LoadUint32(&s.epoch) == uint32(epoch)
}

// BeginWrite indicates the beginning of a writer critical section.
//
// SeqCount does not support concurrent writer critical sections; clients with
// concurrent writers must synchronize them using e.g. sync.Mutex.
func (s *SeqCount) BeginWrite() {
	if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 == 0 {
		panic("SeqCount.BeginWrite during writer critical section")
	}
}

// EndWrite ends the effect of a preceding BeginWrite.
func (s *SeqCount) EndWrite() {
	if epoch := atomic.AddUint32(&s.epoch, 1); epoch&1 != 0 {
		panic("SeqCount.EndWrite outside writer critical section")
	}
}
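The reader/writer pairing described above is easiest to see end to end. The following is an illustrative sketch, not part of the original source: the point type, its fields, and the wrapper functions are hypothetical, and real readers would normally go through the SeqAtomicLoad template so the race detector stays quiet.

package main

import (
	"fmt"

	gsync "gvisor.dev/gvisor/pkg/sync"
)

// point is a hypothetical SeqCount-protected value. Writers are serialized by
// writerMu; readers retry until ReadOk reports a race-free snapshot.
type point struct {
	writerMu gsync.Mutex
	seq      gsync.SeqCount
	x, y     int64
}

func (p *point) set(x, y int64) {
	p.writerMu.Lock()
	p.seq.BeginWrite()
	p.x, p.y = x, y
	p.seq.EndWrite()
	p.writerMu.Unlock()
}

func (p *point) get() (x, y int64) {
	for {
		epoch := p.seq.BeginRead()
		x, y = p.x, p.y // may race with set; validated by ReadOk below
		if p.seq.ReadOk(epoch) {
			return x, y
		}
	}
}

func main() {
	var p point
	p.set(1, 2)
	fmt.Println(p.get())
}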
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fsutil

import (
	"math"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
)

// DirtySet maps offsets into a memmap.Mappable to DirtyInfo. It is used to
// implement Mappables that cache data from another source.
//
// type DirtySet <generated by go_generics>

// DirtyInfo is the value type of DirtySet, and represents information about a
// Mappable offset that is dirty (the cached data for that offset is newer than
// its source).
//
// +stateify savable
type DirtyInfo struct {
	// Keep is true if the represented offset is concurrently writable, such
	// that writing the data for that offset back to the source does not
	// guarantee that the offset is clean (since it may be concurrently
	// rewritten after the writeback).
	Keep bool
}

// dirtySetFunctions implements segment.Functions for DirtySet.
type dirtySetFunctions struct{}

// MinKey implements segment.Functions.MinKey.
func (dirtySetFunctions) MinKey() uint64 {
	return 0
}

// MaxKey implements segment.Functions.MaxKey.
func (dirtySetFunctions) MaxKey() uint64 {
	return math.MaxUint64
}

// ClearValue implements segment.Functions.ClearValue.
func (dirtySetFunctions) ClearValue(val *DirtyInfo) {
}

// Merge implements segment.Functions.Merge.
func (dirtySetFunctions) Merge(_ memmap.MappableRange, val1 DirtyInfo, _ memmap.MappableRange, val2 DirtyInfo) (DirtyInfo, bool) {
	if val1 != val2 {
		return DirtyInfo{}, false
	}
	return val1, true
}

// Split implements segment.Functions.Split.
func (dirtySetFunctions) Split(_ memmap.MappableRange, val DirtyInfo, _ uint64) (DirtyInfo, DirtyInfo) {
	return val, val
}

// MarkClean marks all offsets in mr as not dirty, except for those to which
// KeepDirty has been applied.
func (ds *DirtySet) MarkClean(mr memmap.MappableRange) { seg := ds.LowerBoundSegment(mr.Start) for seg.Ok() && seg.Start() < mr.End { if seg.Value().Keep { seg = seg.NextSegment() continue } seg = ds.Isolate(seg, mr) seg = ds.Remove(seg).NextSegment() } } // KeepClean marks all offsets in mr as not dirty, even those that were // previously kept dirty by KeepDirty. func (ds *DirtySet) KeepClean(mr memmap.MappableRange) { ds.RemoveRange(mr) } // MarkDirty marks all offsets in mr as dirty. func (ds *DirtySet) MarkDirty(mr memmap.MappableRange) { ds.setDirty(mr, false) } // KeepDirty marks all offsets in mr as dirty and prevents them from being // marked as clean by MarkClean. func (ds *DirtySet) KeepDirty(mr memmap.MappableRange) { ds.setDirty(mr, true) } func (ds *DirtySet) setDirty(mr memmap.MappableRange, keep bool) { var changedAny bool defer func() { if changedAny { // Merge segments split by Isolate to reduce cost of iteration. ds.MergeRange(mr) } }() seg, gap := ds.Find(mr.Start) for { switch { case seg.Ok() && seg.Start() < mr.End: if keep && !seg.Value().Keep { changedAny = true seg = ds.Isolate(seg, mr) seg.ValuePtr().Keep = true } seg, gap = seg.NextNonEmpty() case gap.Ok() && gap.Start() < mr.End: changedAny = true seg = ds.Insert(gap, gap.Range().Intersect(mr), DirtyInfo{keep}) seg, gap = seg.NextNonEmpty() default: return } } } // AllowClean allows MarkClean to mark offsets in mr as not dirty, ending the // effect of a previous call to KeepDirty. (It does not itself mark those // offsets as not dirty.) func (ds *DirtySet) AllowClean(mr memmap.MappableRange) { var changedAny bool defer func() { if changedAny { // Merge segments split by Isolate to reduce cost of iteration. ds.MergeRange(mr) } }() for seg := ds.LowerBoundSegment(mr.Start); seg.Ok() && seg.Start() < mr.End; seg = seg.NextSegment() { if seg.Value().Keep { changedAny = true seg = ds.Isolate(seg, mr) seg.ValuePtr().Keep = false } } } // SyncDirty passes pages in the range mr that are stored in cache and // identified as dirty to writeAt, updating dirty to reflect successful writes. // If writeAt returns a successful partial write, SyncDirty will call it // repeatedly until all bytes have been written. max is the true size of the // cached object; offsets beyond max will not be passed to writeAt, even if // they are marked dirty. func SyncDirty(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error { var changedDirty bool defer func() { if changedDirty { // Merge segments split by Isolate to reduce cost of iteration. dirty.MergeRange(mr) } }() dseg := dirty.LowerBoundSegment(mr.Start) for dseg.Ok() && dseg.Start() < mr.End { var dr memmap.MappableRange if dseg.Value().Keep { dr = dseg.Range().Intersect(mr) } else { changedDirty = true dseg = dirty.Isolate(dseg, mr) dr = dseg.Range() } if err := syncDirtyRange(ctx, dr, cache, max, mem, writeAt); err != nil { return err } if dseg.Value().Keep { dseg = dseg.NextSegment() } else { dseg = dirty.Remove(dseg).NextSegment() } } return nil } // SyncDirtyAll passes all pages stored in cache identified as dirty to // writeAt, updating dirty to reflect successful writes. If writeAt returns a // successful partial write, SyncDirtyAll will call it repeatedly until all // bytes have been written. max is the true size of the cached object; offsets // beyond max will not be passed to writeAt, even if they are marked dirty. 
func SyncDirtyAll(ctx context.Context, cache *FileRangeSet, dirty *DirtySet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
	dseg := dirty.FirstSegment()
	for dseg.Ok() {
		if err := syncDirtyRange(ctx, dseg.Range(), cache, max, mem, writeAt); err != nil {
			return err
		}
		if dseg.Value().Keep {
			dseg = dseg.NextSegment()
		} else {
			dseg = dirty.Remove(dseg).NextSegment()
		}
	}
	return nil
}

// Preconditions: mr must be page-aligned.
func syncDirtyRange(ctx context.Context, mr memmap.MappableRange, cache *FileRangeSet, max uint64, mem memmap.File, writeAt func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)) error {
	for cseg := cache.LowerBoundSegment(mr.Start); cseg.Ok() && cseg.Start() < mr.End; cseg = cseg.NextSegment() {
		wbr := cseg.Range().Intersect(mr)
		if max < wbr.Start {
			break
		}
		ims, err := mem.MapInternal(cseg.FileRangeOf(wbr), hostarch.Read)
		if err != nil {
			return err
		}
		if max < wbr.End {
			ims = ims.TakeFirst64(max - wbr.Start)
		}
		offset := wbr.Start
		for !ims.IsEmpty() {
			n, err := writeAt(ctx, ims, offset)
			if err != nil {
				return err
			}
			offset += n
			ims = ims.DropFirst64(n)
		}
	}
	return nil
}
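Since SyncDirty and SyncDirtyAll retry until all bytes are written, a writeAt callback only has to make forward progress; it may return a short count. A minimal sketch of such a callback, persisting dirty bytes into an in-memory backing slice; sliceWriteAt and the example package are hypothetical, not part of the original source:

package example

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/safemem"
)

// sliceWriteAt adapts a byte slice to the writeAt signature expected by
// SyncDirty/SyncDirtyAll.
func sliceWriteAt(backing []byte) func(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
	return func(_ context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
		// CopySeq copies min(dst length, srcs.NumBytes()) bytes and returns
		// the count; SyncDirty treats a short count as a partial write and
		// calls back with the remainder.
		dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(backing[offset:]))
		return safemem.CopySeq(dst, srcs)
	}
}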
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package time

import (
	"gvisor.dev/gvisor/pkg/context"
)

// contextID is the time package's type for context.Context.Value keys.
type contextID int

const (
	// CtxRealtimeClock is a Context.Value key for the current real time.
	CtxRealtimeClock contextID = iota
)

// RealtimeClockFromContext returns the real time clock associated with context
// ctx.
func RealtimeClockFromContext(ctx context.Context) Clock {
	if v := ctx.Value(CtxRealtimeClock); v != nil {
		return v.(Clock)
	}
	return nil
}

// NowFromContext returns the current real time associated with context ctx.
func NowFromContext(ctx context.Context) Time {
	if clk := RealtimeClockFromContext(ctx); clk != nil {
		return clk.Now()
	}
	panic("encountered context without RealtimeClock")
}
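Because NowFromContext panics when no realtime clock is attached, code that may run on bare contexts typically hedges through RealtimeClockFromContext. A minimal sketch; nowOrZero is a hypothetical helper, and using the zero Time as a sentinel is an assumption of this example, not part of the original source:

package example

import (
	"gvisor.dev/gvisor/pkg/context"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
)

// nowOrZero returns the context's real time, or the zero Time if no realtime
// clock is attached to ctx.
func nowOrZero(ctx context.Context) ktime.Time {
	if clk := ktime.RealtimeClockFromContext(ctx); clk != nil {
		return clk.Now()
	}
	return ktime.Time{}
}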
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs2

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/bits"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/gohacks"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
)

// Stat implements Linux syscall stat(2).
func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr := args[0].Pointer()
	statAddr := args[1].Pointer()
	return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, 0 /* flags */)
}

// Lstat implements Linux syscall lstat(2).
func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr := args[0].Pointer()
	statAddr := args[1].Pointer()
	return 0, nil, fstatat(t, linux.AT_FDCWD, pathAddr, statAddr, linux.AT_SYMLINK_NOFOLLOW)
}

// Newfstatat implements Linux syscall newfstatat, which backs fstatat(2).
func Newfstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() statAddr := args[2].Pointer() flags := args[3].Int() return 0, nil, fstatat(t, dirfd, pathAddr, statAddr, flags) } func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr hostarch.Addr, flags int32) error { if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { return linuxerr.EINVAL } opts := vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS, } path, err := copyInPath(t, pathAddr) if err != nil { return err } root := t.FSContext().RootDirectoryVFS2() defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { return syserror.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() defer start.DecRef(t) } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { return linuxerr.EBADF } if !path.HasComponents() { // Use FileDescription.Stat() instead of // VirtualFilesystem.StatAt() for fstatat(fd, ""), since the // former may be able to use opened file state to expedite the // Stat. statx, err := dirfile.Stat(t, opts) dirfile.DecRef(t) if err != nil { return err } var stat linux.Stat convertStatxToUserStat(t, &statx, &stat) _, err = stat.CopyOut(t, statAddr) return err } start = dirfile.VirtualDentry() start.IncRef() defer start.DecRef(t) dirfile.DecRef(t) } } statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{ Root: root, Start: start, Path: path, FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, }, &opts) if err != nil { return err } var stat linux.Stat convertStatxToUserStat(t, &statx, &stat) _, err = stat.CopyOut(t, statAddr) return err } func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec { return linux.Timespec{ Sec: sxts.Sec, Nsec: int64(sxts.Nsec), } } // Fstat implements Linux syscall fstat(2). func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() statAddr := args[1].Pointer() file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) statx, err := file.Stat(t, vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS, }) if err != nil { return 0, nil, err } var stat linux.Stat convertStatxToUserStat(t, &statx, &stat) _, err = stat.CopyOut(t, statAddr) return 0, nil, err } // Statx implements Linux syscall statx(2). func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() flags := args[2].Int() mask := args[3].Uint() statxAddr := args[4].Pointer() if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 { return 0, nil, linuxerr.EINVAL } // Make sure that only one sync type option is set. 
syncType := uint32(flags & linux.AT_STATX_SYNC_TYPE) if syncType != 0 && !bits.IsPowerOfTwo32(syncType) { return 0, nil, linuxerr.EINVAL } if mask&linux.STATX__RESERVED != 0 { return 0, nil, linuxerr.EINVAL } opts := vfs.StatOptions{ Mask: mask, Sync: uint32(flags & linux.AT_STATX_SYNC_TYPE), } path, err := copyInPath(t, pathAddr) if err != nil { return 0, nil, err } root := t.FSContext().RootDirectoryVFS2() defer root.DecRef(t) start := root if !path.Absolute { if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 { return 0, nil, syserror.ENOENT } if dirfd == linux.AT_FDCWD { start = t.FSContext().WorkingDirectoryVFS2() defer start.DecRef(t) } else { dirfile := t.GetFileVFS2(dirfd) if dirfile == nil { return 0, nil, linuxerr.EBADF } if !path.HasComponents() { // Use FileDescription.Stat() instead of // VirtualFilesystem.StatAt() for statx(fd, ""), since the // former may be able to use opened file state to expedite the // Stat. statx, err := dirfile.Stat(t, opts) dirfile.DecRef(t) if err != nil { return 0, nil, err } userifyStatx(t, &statx) _, err = statx.CopyOut(t, statxAddr) return 0, nil, err } start = dirfile.VirtualDentry() start.IncRef() defer start.DecRef(t) dirfile.DecRef(t) } } statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{ Root: root, Start: start, Path: path, FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0, }, &opts) if err != nil { return 0, nil, err } userifyStatx(t, &statx) _, err = statx.CopyOut(t, statxAddr) return 0, nil, err } func userifyStatx(t *kernel.Task, statx *linux.Statx) { userns := t.UserNamespace() statx.UID = uint32(auth.KUID(statx.UID).In(userns).OrOverflow()) statx.GID = uint32(auth.KGID(statx.GID).In(userns).OrOverflow()) } // Readlink implements Linux syscall readlink(2). func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() bufAddr := args[1].Pointer() size := args[2].SizeT() return readlinkat(t, linux.AT_FDCWD, pathAddr, bufAddr, size) } // Access implements Linux syscall access(2). func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() mode := args[1].ModeT() return 0, nil, accessAt(t, linux.AT_FDCWD, addr, mode) } // Faccessat implements Linux syscall faccessat(2). // // Note that the faccessat() system call does not take a flags argument: // "The raw faccessat() system call takes only the first three arguments. The // AT_EACCESS and AT_SYMLINK_NOFOLLOW flags are actually implemented within // the glibc wrapper function for faccessat(). If either of these flags is // specified, then the wrapper function employs fstatat(2) to determine access // permissions." - faccessat(2) func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() addr := args[1].Pointer() mode := args[2].ModeT() return 0, nil, accessAt(t, dirfd, addr, mode) } func accessAt(t *kernel.Task, dirfd int32, pathAddr hostarch.Addr, mode uint) error { const rOK = 4 const wOK = 2 const xOK = 1 // Sanity check the mode. if mode&^(rOK|wOK|xOK) != 0 { return linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) if err != nil { return err } tpop, err := getTaskPathOperation(t, dirfd, path, disallowEmptyPath, followFinalSymlink) if err != nil { return err } defer tpop.Release(t) // access(2) and faccessat(2) check permissions using real // UID/GID, not effective UID/GID. 
// // "access() needs to use the real uid/gid, not the effective // uid/gid. We do this by temporarily clearing all FS-related // capabilities and switching the fsuid/fsgid around to the // real ones." -fs/open.c:faccessat creds := t.Credentials().Fork() creds.EffectiveKUID = creds.RealKUID creds.EffectiveKGID = creds.RealKGID if creds.EffectiveKUID.In(creds.UserNamespace) == auth.RootUID { creds.EffectiveCaps = creds.PermittedCaps } else { creds.EffectiveCaps = 0 } return t.Kernel().VFS().AccessAt(t, creds, vfs.AccessTypes(mode), &tpop.pop) } // Readlinkat implements Linux syscall mknodat(2). func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirfd := args[0].Int() pathAddr := args[1].Pointer() bufAddr := args[2].Pointer() size := args[3].SizeT() return readlinkat(t, dirfd, pathAddr, bufAddr, size) } func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr hostarch.Addr, size uint) (uintptr, *kernel.SyscallControl, error) { if int(size) <= 0 { return 0, nil, linuxerr.EINVAL } path, err := copyInPath(t, pathAddr) if err != nil { return 0, nil, err } // "Since Linux 2.6.39, pathname can be an empty string, in which case the // call operates on the symbolic link referred to by dirfd ..." - // readlinkat(2) tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) target, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop) if err != nil { return 0, nil, err } if len(target) > int(size) { target = target[:size] } n, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(target)) if n == 0 { return 0, nil, err } return uintptr(n), nil, nil } // Statfs implements Linux syscall statfs(2). func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pathAddr := args[0].Pointer() bufAddr := args[1].Pointer() path, err := copyInPath(t, pathAddr) if err != nil { return 0, nil, err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) if err != nil { return 0, nil, err } _, err = statfs.CopyOut(t, bufAddr) return 0, nil, err } // Fstatfs implements Linux syscall fstatfs(2). func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() bufAddr := args[1].Pointer() tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) statfs, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop) if err != nil { return 0, nil, err } _, err = statfs.CopyOut(t, bufAddr) return 0, nil, err }
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/usage"
)

func getrusage(t *kernel.Task, which int32) linux.Rusage {
	var cs usage.CPUStats

	switch which {
	case linux.RUSAGE_SELF:
		cs = t.ThreadGroup().CPUStats()

	case linux.RUSAGE_CHILDREN:
		cs = t.ThreadGroup().JoinedChildCPUStats()

	case linux.RUSAGE_THREAD:
		cs = t.CPUStats()

	case linux.RUSAGE_BOTH:
		tg := t.ThreadGroup()
		cs = tg.CPUStats()
		cs.Accumulate(tg.JoinedChildCPUStats())
	}

	return linux.Rusage{
		UTime:  linux.NsecToTimeval(cs.UserTime.Nanoseconds()),
		STime:  linux.NsecToTimeval(cs.SysTime.Nanoseconds()),
		NVCSw:  int64(cs.VoluntarySwitches),
		MaxRSS: int64(t.MaxRSS(which) / 1024),
	}
}

// Getrusage implements Linux syscall getrusage(2).
//
// Fields marked "y" are supported now, fields marked "*" are not used on
// Linux, and fields marked "p" are pending for support:
//
//	y struct timeval ru_utime; /* user CPU time used */
//	y struct timeval ru_stime; /* system CPU time used */
//	p long ru_maxrss;          /* maximum resident set size */
//	* long ru_ixrss;           /* integral shared memory size */
//	* long ru_idrss;           /* integral unshared data size */
//	* long ru_isrss;           /* integral unshared stack size */
//	p long ru_minflt;          /* page reclaims (soft page faults) */
//	p long ru_majflt;          /* page faults (hard page faults) */
//	* long ru_nswap;           /* swaps */
//	p long ru_inblock;         /* block input operations */
//	p long ru_oublock;         /* block output operations */
//	* long ru_msgsnd;          /* IPC messages sent */
//	* long ru_msgrcv;          /* IPC messages received */
//	* long ru_nsignals;        /* signals received */
//	y long ru_nvcsw;           /* voluntary context switches */
//	y long ru_nivcsw;          /* involuntary context switches */
func Getrusage(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	which := args[0].Int()
	addr := args[1].Pointer()

	if which != linux.RUSAGE_SELF && which != linux.RUSAGE_CHILDREN && which != linux.RUSAGE_THREAD {
		return 0, nil, linuxerr.EINVAL
	}

	ru := getrusage(t, which)
	_, err := ru.CopyOut(t, addr)
	return 0, nil, err
}

// Times implements Linux syscall times(2).
func Times(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()

	// Calculate the ticks first, and figure out if any additional work is
	// necessary. Linux allows for a NULL addr, in which case only the
	// return value is meaningful. We don't need to do anything else.
	ticks := uintptr(ktime.NowFromContext(t).Nanoseconds() / linux.ClockTick.Nanoseconds())
	if addr == 0 {
		return ticks, nil, nil
	}

	cs1 := t.ThreadGroup().CPUStats()
	cs2 := t.ThreadGroup().JoinedChildCPUStats()
	r := linux.Tms{
		UTime:  linux.ClockTFromDuration(cs1.UserTime),
		STime:  linux.ClockTFromDuration(cs1.SysTime),
		CUTime: linux.ClockTFromDuration(cs2.UserTime),
		CSTime: linux.ClockTFromDuration(cs2.SysTime),
	}
	if _, err := r.CopyOut(t, addr); err != nil {
		return 0, nil, err
	}

	return ticks, nil, nil
}
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package eventchannel

import (
	"golang.org/x/time/rate"

	"google.golang.org/protobuf/proto"
)

// rateLimitedEmitter wraps an emitter and rate-limits the events it emits.
// Events that would exceed the limit are discarded.
type rateLimitedEmitter struct {
	inner   Emitter
	limiter *rate.Limiter
}

// RateLimitedEmitterFrom creates a new event channel emitter that wraps the
// existing emitter and enforces rate limits. The limits are imposed via a
// token bucket, with `maxRate` events per second, with burst size of `burst`
// events. See the golang.org/x/time/rate package and
// https://en.wikipedia.org/wiki/Token_bucket for more information about token
// buckets generally.
func RateLimitedEmitterFrom(inner Emitter, maxRate float64, burst int) Emitter {
	return &rateLimitedEmitter{
		inner:   inner,
		limiter: rate.NewLimiter(rate.Limit(maxRate), burst),
	}
}

// Emit implements EventEmitter.Emit.
func (rle *rateLimitedEmitter) Emit(msg proto.Message) (bool, error) {
	if !rle.limiter.Allow() {
		// Drop event.
		return false, nil
	}
	return rle.inner.Emit(msg)
}

// Close implements EventEmitter.Close.
func (rle *rateLimitedEmitter) Close() error {
	return rle.inner.Close()
}
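Usage is a one-liner. The sketch below is illustrative and newCappedEmitter is a hypothetical name: with maxRate 100 and burst 10, the wrapper forwards at most 100 events per second on average and absorbs spikes of up to 10 events before Emit starts reporting (false, nil) for dropped events.

package example

import "gvisor.dev/gvisor/pkg/eventchannel"

// newCappedEmitter wraps inner in a token-bucket rate limiter.
func newCappedEmitter(inner eventchannel.Emitter) eventchannel.Emitter {
	return eventchannel.RateLimitedEmitterFrom(inner, 100 /* maxRate */, 10 /* burst */)
}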
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build go1.12
// +build !go1.18

// Check type signatures when updating Go version.

// Package goid provides the Get function.
package goid

// Get returns the ID of the current goroutine.
func Get() int64 {
	return getg().goid
}

// Structs from Go runtime. These may change in the future and require
// updating. These structs are currently the same on both AMD64 and ARM64,
// but may diverge in the future.

type stack struct {
	lo uintptr
	hi uintptr
}

type gobuf struct {
	sp   uintptr
	pc   uintptr
	g    uintptr
	ctxt uintptr
	ret  uint64
	lr   uintptr
	bp   uintptr
}

type g struct {
	stack       stack
	stackguard0 uintptr
	stackguard1 uintptr

	_panic       uintptr
	_defer       uintptr
	m            uintptr
	sched        gobuf
	syscallsp    uintptr
	syscallpc    uintptr
	stktopsp     uintptr
	param        uintptr
	atomicstatus uint32
	stackLock    uint32
	goid         int64

	// More fields...
	//
	// We only use goid; the fields before it are only listed to
	// calculate the correct offset.
}

// Defined in assembly. This can't use go:linkname since runtime.getg() isn't a
// real function, it's a compiler intrinsic.
func getg() *g
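A minimal sketch of a caller, assuming the package is importable as gvisor.dev/gvisor/pkg/goid; the program itself is hypothetical and only demonstrates the intended use of tagging diagnostics with the calling goroutine's ID:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/goid"
)

func main() {
	// Get reads goid straight off the runtime g, so it is cheap enough to
	// call on hot logging paths.
	fmt.Printf("[goroutine %d] starting\n", goid.Get())
}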
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package unix

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/tcpip"
)

// EndpointWriter implements safemem.Writer that writes to a transport.Endpoint.
//
// EndpointWriter is not thread-safe.
type EndpointWriter struct {
	Ctx context.Context

	// Endpoint is the transport.Endpoint to write to.
	Endpoint transport.Endpoint

	// Control is the control messages to send.
	Control transport.ControlMessages

	// To is the endpoint to send to. May be nil.
	To transport.BoundEndpoint
}

// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
func (w *EndpointWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
	return safemem.FromVecWriterFunc{func(bufs [][]byte) (int64, error) {
		n, err := w.Endpoint.SendMsg(w.Ctx, bufs, w.Control, w.To)
		if err != nil {
			return int64(n), err.ToError()
		}
		return int64(n), nil
	}}.WriteFromBlocks(srcs)
}

// EndpointReader implements safemem.Reader that reads from a
// transport.Endpoint.
//
// EndpointReader is not thread-safe.
type EndpointReader struct {
	Ctx context.Context

	// Endpoint is the transport.Endpoint to read from.
	Endpoint transport.Endpoint

	// Creds indicates if credential control messages are requested.
	Creds bool

	// NumRights is the number of SCM_RIGHTS FDs requested.
	NumRights int

	// Peek indicates that the data should not be consumed from the
	// endpoint.
	Peek bool

	// MsgSize is the size of the message that was read from. For stream
	// sockets, it is the amount read.
	MsgSize int64

	// From, if not nil, will be set with the address read from.
	From *tcpip.FullAddress

	// Control contains the received control messages.
	Control transport.ControlMessages

	// ControlTrunc indicates that SCM_RIGHTS FDs were discarded based on
	// the value of NumRights.
	ControlTrunc bool
}

// Truncate calls RecvMsg on the endpoint without writing to a destination.
func (r *EndpointReader) Truncate() error {
	// Ignore bytes read since it will always be zero.
	_, ms, c, ct, err := r.Endpoint.RecvMsg(r.Ctx, [][]byte{}, r.Creds, r.NumRights, r.Peek, r.From)
	r.Control = c
	r.ControlTrunc = ct
	r.MsgSize = ms
	if err != nil {
		return err.ToError()
	}
	return nil
}

// ReadToBlocks implements safemem.Reader.ReadToBlocks.
func (r *EndpointReader) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
	return safemem.FromVecReaderFunc{func(bufs [][]byte) (int64, error) {
		n, ms, c, ct, err := r.Endpoint.RecvMsg(r.Ctx, bufs, r.Creds, r.NumRights, r.Peek, r.From)
		r.Control = c
		r.ControlTrunc = ct
		r.MsgSize = ms
		if err != nil {
			return int64(n), err.ToError()
		}
		return int64(n), nil
	}}.ReadToBlocks(dsts)
}
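To read into an ordinary byte slice, a caller can wrap the endpoint in an EndpointReader and hand ReadToBlocks a single-block BlockSeq. This is an illustrative sketch; readInto and the example package are hypothetical, and the unset fields (Creds, NumRights, Peek, From) simply take their zero values:

package example

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
)

// readInto drains up to len(buf) bytes from ep into buf, returning the count.
func readInto(ctx context.Context, ep transport.Endpoint, buf []byte) (int, error) {
	r := unix.EndpointReader{
		Ctx:      ctx,
		Endpoint: ep,
	}
	n, err := r.ReadToBlocks(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)))
	return int(n), err
}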
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package time

import (
	"sync"
	"time"
)

// TcpipAfterFunc waits for duration to elapse according to clock then runs fn.
// The timer is started immediately and will fire exactly once.
func TcpipAfterFunc(clock Clock, duration time.Duration, fn func()) *TcpipTimer {
	timer := &TcpipTimer{
		clock: clock,
	}
	timer.notifier = functionNotifier{
		fn: func() {
			// tcpip.Timer.Stop() explicitly states that the function is called in a
			// separate goroutine that Stop() does not synchronize with.
			// Timer.Destroy() synchronizes with calls to TimerListener.Notify().
			// This is semantically meaningful because, in the former case, it's
			// legal to call tcpip.Timer.Stop() while holding locks that may also be
			// taken by the function, but this isn't so in the latter case. Most
			// immediately, Timer calls TimerListener.Notify() while holding
			// Timer.mu. A deadlock occurs without spawning a goroutine:
			//	T1: (Timer expires)
			//	  => Timer.Tick() <- Timer.mu.Lock() called
			//	  => TimerListener.Notify()
			//	  => Timer.Stop()
			//	  => Timer.Destroy() <- Timer.mu.Lock() called, deadlock!
			//
			// Spawning a goroutine avoids the deadlock:
			//	T1: (Timer expires)
			//	  => Timer.Tick() <- Timer.mu.Lock() called
			//	  => TimerListener.Notify() <- Launches T2
			//	T2:
			//	  => Timer.Stop()
			//	  => Timer.Destroy() <- Timer.mu.Lock() called, blocks
			//	T1:
			//	  => (returns) <- Timer.mu.Unlock() called
			//	T2:
			//	  => (continues) <- No deadlock!
			go func() {
				timer.Stop()
				fn()
			}()
		},
	}
	timer.Reset(duration)
	return timer
}

// TcpipTimer is a resettable timer with variable duration expirations.
// Implements tcpip.Timer, which does not define a Destroy method; instead, all
// resources are released after timer expiration and calls to Timer.Stop.
//
// Must be created by TcpipAfterFunc.
type TcpipTimer struct {
	// clock is the time source. clock is immutable.
	clock Clock

	// notifier is called when the Timer expires. notifier is immutable.
	notifier functionNotifier

	// mu protects t.
	mu sync.Mutex

	// t stores the latest running Timer. This is replaced whenever Reset is
	// called since Timer cannot be restarted once it has been Destroyed by
	// Stop.
	//
	// This field is nil iff Stop has been called.
	t *Timer
}

// Stop implements tcpip.Timer.Stop.
func (r *TcpipTimer) Stop() bool {
	r.mu.Lock()
	defer r.mu.Unlock()

	if r.t == nil {
		return false
	}
	_, lastSetting := r.t.Swap(Setting{})
	r.t.Destroy()
	r.t = nil

	return lastSetting.Enabled
}

// Reset implements tcpip.Timer.Reset.
func (r *TcpipTimer) Reset(d time.Duration) {
	r.mu.Lock()
	defer r.mu.Unlock()

	if r.t == nil {
		r.t = NewTimer(r.clock, &r.notifier)
	}

	r.t.Swap(Setting{
		Enabled: true,
		Period:  0,
		Next:    r.clock.Now().Add(d),
	})
}

// functionNotifier is a TimerListener that runs a function.
//
// functionNotifier cannot be saved or loaded.
type functionNotifier struct {
	fn func()
}

// Notify implements ktime.TimerListener.Notify.
func (f *functionNotifier) Notify(uint64, Setting) (Setting, bool) {
	f.fn()
	return Setting{}, false
}

// Destroy implements ktime.TimerListener.Destroy.
func (f *functionNotifier) Destroy() {}
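A short sketch of typical use, with hypothetical names: TcpipAfterFunc schedules fn once on the supplied Clock, and Stop both cancels a pending timer and reports whether it was still pending (i.e. whether fn was prevented from firing).

package example

import (
	"time"

	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
)

// schedule arms a one-shot 100ms timer running fn and returns a cancel
// function; cancel reports true iff fn had not yet fired and now never will.
func schedule(clock ktime.Clock, fn func()) (cancel func() bool) {
	t := ktime.TcpipAfterFunc(clock, 100*time.Millisecond, fn)
	return t.Stop
}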
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package stack

import (
	"fmt"
	"strings"
	"sync"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
)

// A Hook specifies one of the hooks built into the network stack.
//
//	Userspace app          Userspace app
//	      ^                      |
//	      |                      v
//	   [Input]               [Output]
//	      ^                      |
//	      |                      v
//	      |                   routing
//	      |                      |
//	      |                      v
//	----->[Prerouting]----->routing----->[Forward]---------[Postrouting]----->
type Hook uint

const (
	// Prerouting happens before a packet is routed to applications or to
	// be forwarded.
	Prerouting Hook = iota

	// Input happens before a packet reaches an application.
	Input

	// Forward happens once it's decided that a packet should be forwarded
	// to another host.
	Forward

	// Output happens after a packet is written by an application to be
	// sent out.
	Output

	// Postrouting happens just before a packet goes out on the wire.
	Postrouting

	// NumHooks is the total number of hooks.
	NumHooks
)

// A RuleVerdict is what a rule decides should be done with a packet.
type RuleVerdict int

const (
	// RuleAccept indicates the packet should continue through netstack.
	RuleAccept RuleVerdict = iota

	// RuleDrop indicates the packet should be dropped.
	RuleDrop

	// RuleJump indicates the packet should jump to another chain.
	RuleJump

	// RuleReturn indicates the packet should return to the previous chain.
	RuleReturn
)

// IPTables holds all the tables for a netstack.
//
// +stateify savable
type IPTables struct {
	// mu protects v4Tables, v6Tables, and modified.
	mu sync.RWMutex

	// v4Tables and v6Tables map tableIDs to tables. They hold builtin
	// tables only, not user tables. mu must be locked for accessing.
v4Tables [NumTables]Table v6Tables [NumTables]Table // modified is whether tables have been modified at least once. It is // used to elide the iptables performance overhead for workloads that // don't utilize iptables. modified bool // priorities maps each hook to a list of table names. The order of the // list is the order in which each table should be visited for that // hook. It is immutable. priorities [NumHooks][]TableID connections ConnTrack // reaperDone can be signaled to stop the reaper goroutine. reaperDone chan struct{} } // VisitTargets traverses all the targets of all tables and replaces each with // transform(target). func (it *IPTables) VisitTargets(transform func(Target) Target) { it.mu.Lock() defer it.mu.Unlock() for tid := range it.v4Tables { for i, rule := range it.v4Tables[tid].Rules { it.v4Tables[tid].Rules[i].Target = transform(rule.Target) } } for tid := range it.v6Tables { for i, rule := range it.v6Tables[tid].Rules { it.v6Tables[tid].Rules[i].Target = transform(rule.Target) } } } // A Table defines a set of chains and hooks into the network stack. // // It is a list of Rules, entry points (BuiltinChains), and error handlers // (Underflows). As packets traverse netstack, they hit hooks. When a packet // hits a hook, iptables compares it to Rules starting from that hook's entry // point. So if a packet hits the Input hook, we look up the corresponding // entry point in BuiltinChains and jump to that point. // // If the Rule doesn't match the packet, iptables continues to the next Rule. // If a Rule does match, it can issue a verdict on the packet (e.g. RuleAccept // or RuleDrop) that causes the packet to stop traversing iptables. It can also // jump to other rules or perform custom actions based on Rule.Target. // // Underflow Rules are invoked when a chain returns without reaching a verdict. // // +stateify savable type Table struct { // Rules holds the rules that make up the table. Rules []Rule // BuiltinChains maps builtin chains to their entrypoint rule in Rules. BuiltinChains [NumHooks]int // Underflows maps builtin chains to their underflow rule in Rules // (i.e. the rule to execute if the chain returns without a verdict). Underflows [NumHooks]int } // ValidHooks returns a bitmap of the builtin hooks for the given table. func (table *Table) ValidHooks() uint32 { hooks := uint32(0) for hook, ruleIdx := range table.BuiltinChains { if ruleIdx != HookUnset { hooks |= 1 << hook } } return hooks } // A Rule is a packet processing rule. It consists of two pieces. First it // contains zero or more matchers, each of which is a specification of which // packets this rule applies to. If there are no matchers in the rule, it // applies to any packet. // // +stateify savable type Rule struct { // Filter holds basic IP filtering fields common to every rule. Filter IPHeaderFilter // Matchers is the list of matchers for this rule. Matchers []Matcher // Target is the action to invoke if all the matchers match the packet. Target Target } // IPHeaderFilter performs basic IP header matching common to every rule. // // +stateify savable type IPHeaderFilter struct { // Protocol matches the transport protocol. Protocol tcpip.TransportProtocolNumber // CheckProtocol determines whether the Protocol field should be // checked during matching. CheckProtocol bool // Dst matches the destination IP address. Dst tcpip.Address // DstMask masks bits of the destination IP address when comparing with // Dst. DstMask tcpip.Address // DstInvert inverts the meaning of the destination IP check, i.e. 
when // true the filter will match packets that fail the destination // comparison. DstInvert bool // Src matches the source IP address. Src tcpip.Address // SrcMask masks bits of the source IP address when comparing with Src. SrcMask tcpip.Address // SrcInvert inverts the meaning of the source IP check, i.e. when true the // filter will match packets that fail the source comparison. SrcInvert bool // InputInterface matches the name of the incoming interface for the packet. InputInterface string // InputInterfaceMask masks the characters of the interface name when // comparing with InputInterface. InputInterfaceMask string // InputInterfaceInvert inverts the meaning of incoming interface check, // i.e. when true the filter will match packets that fail the incoming // interface comparison. InputInterfaceInvert bool // OutputInterface matches the name of the outgoing interface for the packet. OutputInterface string // OutputInterfaceMask masks the characters of the interface name when // comparing with OutputInterface. OutputInterfaceMask string // OutputInterfaceInvert inverts the meaning of outgoing interface check, // i.e. when true the filter will match packets that fail the outgoing // interface comparison. OutputInterfaceInvert bool } // match returns whether pkt matches the filter. // // Preconditions: pkt.NetworkHeader is set and is at least of the minimal IPv4 // or IPv6 header length. func (fl IPHeaderFilter) match(pkt *PacketBuffer, hook Hook, inNicName, outNicName string) bool { // Extract header fields. var ( transProto tcpip.TransportProtocolNumber dstAddr tcpip.Address srcAddr tcpip.Address ) switch proto := pkt.NetworkProtocolNumber; proto { case header.IPv4ProtocolNumber: hdr := header.IPv4(pkt.NetworkHeader().View()) transProto = hdr.TransportProtocol() dstAddr = hdr.DestinationAddress() srcAddr = hdr.SourceAddress() case header.IPv6ProtocolNumber: hdr := header.IPv6(pkt.NetworkHeader().View()) transProto = hdr.TransportProtocol() dstAddr = hdr.DestinationAddress() srcAddr = hdr.SourceAddress() default: panic(fmt.Sprintf("unknown network protocol with EtherType: %d", proto)) } // Check the transport protocol. if fl.CheckProtocol && fl.Protocol != transProto { return false } // Check the addresses. if !filterAddress(dstAddr, fl.DstMask, fl.Dst, fl.DstInvert) || !filterAddress(srcAddr, fl.SrcMask, fl.Src, fl.SrcInvert) { return false } switch hook { case Prerouting, Input: return matchIfName(inNicName, fl.InputInterface, fl.InputInterfaceInvert) case Output: return matchIfName(outNicName, fl.OutputInterface, fl.OutputInterfaceInvert) case Forward: if !matchIfName(inNicName, fl.InputInterface, fl.InputInterfaceInvert) { return false } if !matchIfName(outNicName, fl.OutputInterface, fl.OutputInterfaceInvert) { return false } return true case Postrouting: return true default: panic(fmt.Sprintf("unknown hook: %d", hook)) } } func matchIfName(nicName string, ifName string, invert bool) bool { n := len(ifName) if n == 0 { // If the interface name is omitted in the filter, any interface will match. return true } // If the interface name ends with '+', any interface which begins with the // name should be matched. var matches bool if strings.HasSuffix(ifName, "+") { matches = strings.HasPrefix(nicName, ifName[:n-1]) } else { matches = nicName == ifName } return matches != invert } // NetworkProtocol returns the protocol (IPv4 or IPv6) on to which the header // applies. 
func (fl IPHeaderFilter) NetworkProtocol() tcpip.NetworkProtocolNumber {
	switch len(fl.Src) {
	case header.IPv4AddressSize:
		return header.IPv4ProtocolNumber
	case header.IPv6AddressSize:
		return header.IPv6ProtocolNumber
	}
	panic(fmt.Sprintf("invalid address in IPHeaderFilter: %s", fl.Src))
}

// filterAddress returns whether addr matches the filter.
func filterAddress(addr, mask, filterAddr tcpip.Address, invert bool) bool {
	matches := true
	for i := range filterAddr {
		if addr[i]&mask[i] != filterAddr[i] {
			matches = false
			break
		}
	}
	return matches != invert
}

// A Matcher is the interface for matching packets.
type Matcher interface {
	// Match returns whether the packet matches and whether the packet
	// should be "hotdropped", i.e. dropped immediately. This is usually
	// used for suspicious packets.
	//
	// Precondition: packet.NetworkHeader is set.
	Match(hook Hook, packet *PacketBuffer, inputInterfaceName, outputInterfaceName string) (matches bool, hotdrop bool)
}

// A Target is the interface for taking an action for a packet.
type Target interface {
	// Action takes an action on the packet and returns a verdict on how
	// traversal should (or should not) continue. If the return value is
	// Jump, it also returns the index of the rule to jump to.
	Action(*PacketBuffer, *ConnTrack, Hook, *Route, tcpip.Address) (RuleVerdict, int)
}
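The interface-matching rules above are compact but easy to misread. The following in-package sketch (exampleIfNameMatching is hypothetical, not part of the original source) spells out how matchIfName treats the trailing '+' wildcard, the empty filter, and inversion:

func exampleIfNameMatching() []bool {
	return []bool{
		matchIfName("eth0", "eth+", false), // true: trailing '+' makes "eth" a prefix match
		matchIfName("en0", "eth+", false),  // false: prefix differs
		matchIfName("eth0", "", false),     // true: an empty filter matches any interface
		matchIfName("eth0", "eth0", true),  // false: exact match, then inverted
	}
}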
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package kernel provides an emulation of the Linux kernel. // // See README.md for a detailed overview. // // Lock order (outermost locks must be taken first): // // Kernel.extMu // ThreadGroup.timerMu // ktime.Timer.mu (for kernelCPUClockTicker and IntervalTimer) // TaskSet.mu // SignalHandlers.mu // Task.mu // runningTasksMu // // Locking SignalHandlers.mu in multiple SignalHandlers requires locking // TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same // time requires locking all of their signal mutexes first.
package kernel import ( "errors" "fmt" "path/filepath" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/cpuid" "gvisor.dev/gvisor/pkg/eventchannel" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" oldtimerfd "gvisor.dev/gvisor/pkg/sentry/fs/timerfd" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs" "gvisor.dev/gvisor/pkg/sentry/fsimpl/timerfd" "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs" "gvisor.dev/gvisor/pkg/sentry/hostcpu" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/epoll" "gvisor.dev/gvisor/pkg/sentry/kernel/futex" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/sentry/mm" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/sentry/socket/netlink/port" sentrytime "gvisor.dev/gvisor/pkg/sentry/time" "gvisor.dev/gvisor/pkg/sentry/unimpl" uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto" "gvisor.dev/gvisor/pkg/sentry/uniqueid" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/state" "gvisor.dev/gvisor/pkg/state/wire" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" ) // VFS2Enabled is set to true when VFS2 is enabled. Added as a global to allow // easy access everywhere. To be removed once VFS2 becomes the default. var VFS2Enabled = false // FUSEEnabled is set to true when FUSE is enabled. Added as a global to allow // easy access everywhere. To be removed once FUSE is completed. var FUSEEnabled = false // Kernel represents an emulated Linux kernel. It must be initialized by calling // Init() or LoadFrom(). // // +stateify savable type Kernel struct { // extMu serializes external changes to the Kernel with calls to // Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel // remains frozen for the duration of the call; it requires that the Kernel // is paused as a precondition, which ensures that none of the tasks // running within the Kernel can affect its state, but extMu is required to // ensure that concurrent users of the Kernel *outside* the Kernel's // control cannot affect its state by calling e.g. // Kernel.SendExternalSignal.) extMu sync.Mutex `state:"nosave"` // started is true if Start has been called. Unless otherwise specified, // all Kernel fields become immutable once started becomes true. started bool `state:"nosave"` // All of the following fields are immutable unless otherwise specified. // Platform is the platform that is used to execute tasks in the created // Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is // embedded anonymously (the same issue applies). platform.Platform `state:"nosave"` // mf provides application memory. mf *pgalloc.MemoryFile `state:"nosave"` // See InitKernelArgs for the meaning of these fields.
featureSet *cpuid.FeatureSet timekeeper *Timekeeper tasks *TaskSet rootUserNamespace *auth.UserNamespace rootNetworkNamespace *inet.Namespace applicationCores uint useHostCores bool extraAuxv []arch.AuxEntry vdso *loader.VDSO rootUTSNamespace *UTSNamespace rootIPCNamespace *IPCNamespace rootAbstractSocketNamespace *AbstractSocketNamespace // futexes is the "root" futex.Manager, from which all others are forked. // This is necessary to ensure that shared futexes are coherent across all // tasks, including those created by CreateProcess. futexes *futex.Manager // globalInit is the thread group whose leader has ID 1 in the root PID // namespace. globalInit is stored separately so that it is accessible even // after all tasks in the thread group have exited, such that ID 1 is no // longer mapped. // // globalInit is mutable until it is assigned by the first successful call // to CreateProcess, and is protected by extMu. globalInit *ThreadGroup // syslog is the kernel log. syslog syslog // runningTasksMu synchronizes disable/enable of cpuClockTicker when // the kernel is idle (runningTasks == 0). // // runningTasksMu is used to exclude critical sections when the timer // disables itself and when the first active task enables the timer, // ensuring that tasks always see a valid cpuClock value. runningTasksMu sync.Mutex `state:"nosave"` // runningTasks is the total count of tasks currently in // TaskGoroutineRunningSys or TaskGoroutineRunningApp, i.e., they are // not blocked or stopped. // // runningTasks must be accessed atomically. Increments from 0 to 1 are // further protected by runningTasksMu (see incRunningTasks). runningTasks int64 // cpuClock is incremented every linux.ClockTick. cpuClock is used to // measure task CPU usage, since sampling monotonicClock twice on every // syscall turns out to be unreasonably expensive. This is similar to how // Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING), // although Linux also uses scheduler timing information to improve // resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do // since "preemptive" scheduling is managed by the Go runtime, which // doesn't provide this information. // // cpuClock is mutable, and is accessed using atomic memory operations. cpuClock uint64 // cpuClockTicker increments cpuClock. cpuClockTicker *ktime.Timer `state:"nosave"` // cpuClockTickerDisabled indicates that cpuClockTicker has been // disabled because no tasks are running. // // cpuClockTickerDisabled is protected by runningTasksMu. cpuClockTickerDisabled bool // cpuClockTickerSetting is the ktime.Setting of cpuClockTicker at the // point it was disabled. It is cached here to avoid a lock ordering // violation with cpuClockTicker.mu when runningTasksMu is held. // // cpuClockTickerSetting is only valid when cpuClockTickerDisabled is // true. // // cpuClockTickerSetting is protected by runningTasksMu. cpuClockTickerSetting ktime.Setting // uniqueID is used to generate unique identifiers. // // uniqueID is mutable, and is accessed using atomic memory operations. uniqueID uint64 // nextInotifyCookie is a monotonically increasing counter used for // generating unique inotify event cookies. // // nextInotifyCookie is mutable, and is accessed using atomic memory // operations. nextInotifyCookie uint32 // netlinkPorts manages allocation of netlink socket port IDs.
netlinkPorts *port.Manager // saveStatus is nil if the sandbox has not been saved, errSaved or // errAutoSaved if it has been saved successfully, or the error causing the // sandbox to exit during save. // It is protected by extMu. saveStatus error `state:"nosave"` // danglingEndpoints is used to save / restore tcpip.DanglingEndpoints. danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"` // sockets is the list of all network sockets in the system. // Protected by extMu. // TODO(gvisor.dev/issue/1624): Only used by VFS1. sockets socketList // socketsVFS2 records all network sockets in the system. Protected by // extMu. socketsVFS2 map[*vfs.FileDescription]*SocketRecord // nextSocketRecord is the next entry number to use in sockets. Protected // by extMu. nextSocketRecord uint64 // deviceRegistry is used to save/restore device.SimpleDevices. deviceRegistry struct{} `state:".(*device.Registry)"` // DirentCacheLimiter controls the total number of dirent entries that can // be in caches. Not all caches use it; only the caches that use host // resources use the limiter. It may be nil if disabled. DirentCacheLimiter *fs.DirentCacheLimiter // unimplementedSyscallEmitterOnce is used in the initialization of // unimplementedSyscallEmitter. unimplementedSyscallEmitterOnce sync.Once `state:"nosave"` // unimplementedSyscallEmitter is used to emit unimplemented syscall // events. This is initialized lazily on the first unimplemented // syscall. unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"` // SpecialOpts contains special kernel options. SpecialOpts // vfs keeps the filesystem state used across the kernel. vfs vfs.VirtualFilesystem // hostMount is the Mount used for file descriptors that were imported // from the host. hostMount *vfs.Mount // pipeMount is the Mount used for pipes created by the pipe() and pipe2() // syscalls (as opposed to named pipes created by mknod()). pipeMount *vfs.Mount // shmMount is the Mount used for anonymous files created by the // memfd_create() syscall. It is analogous to Linux's shm_mnt. shmMount *vfs.Mount // socketMount is the Mount used for sockets created by the socket() and // socketpair() syscalls. There are several cases where a socket dentry will // not be contained in socketMount: // 1. Socket files created by mknod() // 2. Socket fds imported from the host (Kernel.hostMount is used for these) // 3. Socket files created by binding Unix sockets to a file path socketMount *vfs.Mount // If set to true, report address space activation waits as if the task is in // external wait so that the watchdog doesn't report the task stuck. SleepForAddressSpaceActivation bool // Exceptions to YAMA ptrace restrictions. Each key-value pair represents a // tracee-tracer relationship. The key is a process (technically, the thread // group leader) that can be traced by any thread that is a descendant of the // value. If the value is nil, then anyone can trace the process represented by // the key. // // ptraceExceptions is protected by the TaskSet mutex. ptraceExceptions map[*Task]*Task // YAMAPtraceScope is the current level of YAMA ptrace restrictions. YAMAPtraceScope int32 // cgroupRegistry contains the set of active cgroup controllers on the // system. It is controlled by cgroupfs. Nil if cgroupfs is unavailable on // the system. cgroupRegistry *CgroupRegistry } // InitKernelArgs holds arguments to Init. type InitKernelArgs struct { // FeatureSet is the emulated CPU feature set. FeatureSet *cpuid.FeatureSet // Timekeeper manages time for all tasks in the system.
Timekeeper *Timekeeper // RootUserNamespace is the root user namespace. RootUserNamespace *auth.UserNamespace // RootNetworkNamespace is the root network namespace. If nil, no networking // will be available. RootNetworkNamespace *inet.Namespace // ApplicationCores is the number of logical CPUs visible to sandboxed // applications. The set of logical CPU IDs is [0, ApplicationCores); thus // ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the // most significant bit in cpu_possible_mask + 1. ApplicationCores uint // If UseHostCores is true, Task.CPU() returns the task goroutine's CPU // instead of a virtualized CPU number, and Task.CopyToCPUMask() is a // no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it // will be overridden. UseHostCores bool // ExtraAuxv contains additional auxiliary vector entries that are added to // each process by the ELF loader. ExtraAuxv []arch.AuxEntry // Vdso holds the VDSO and its parameter page. Vdso *loader.VDSO // RootUTSNamespace is the root UTS namespace. RootUTSNamespace *UTSNamespace // RootIPCNamespace is the root IPC namespace. RootIPCNamespace *IPCNamespace // RootAbstractSocketNamespace is the root Abstract Socket namespace. RootAbstractSocketNamespace *AbstractSocketNamespace // PIDNamespace is the root PID namespace. PIDNamespace *PIDNamespace } // Init initializes the Kernel with no tasks. // // Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile // before calling Init. func (k *Kernel) Init(args InitKernelArgs) error { if args.FeatureSet == nil { return fmt.Errorf("args.FeatureSet is nil") } if args.Timekeeper == nil { return fmt.Errorf("args.Timekeeper is nil") } if args.Timekeeper.clocks == nil { return fmt.Errorf("must call Timekeeper.SetClocks() before Kernel.Init()") } if args.RootUserNamespace == nil { return fmt.Errorf("args.RootUserNamespace is nil") } if args.ApplicationCores == 0 { return fmt.Errorf("args.ApplicationCores is 0") } k.featureSet = args.FeatureSet k.timekeeper = args.Timekeeper k.tasks = newTaskSet(args.PIDNamespace) k.rootUserNamespace = args.RootUserNamespace k.rootUTSNamespace = args.RootUTSNamespace k.rootIPCNamespace = args.RootIPCNamespace k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace k.rootNetworkNamespace = args.RootNetworkNamespace if k.rootNetworkNamespace == nil { k.rootNetworkNamespace = inet.NewRootNamespace(nil, nil) } k.applicationCores = args.ApplicationCores if args.UseHostCores { k.useHostCores = true maxCPU, err := hostcpu.MaxPossibleCPU() if err != nil { return fmt.Errorf("failed to get maximum CPU number: %v", err) } minAppCores := uint(maxCPU) + 1 if k.applicationCores < minAppCores { log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores) k.applicationCores = minAppCores } } k.extraAuxv = args.ExtraAuxv k.vdso = args.Vdso k.futexes = futex.NewManager() k.netlinkPorts = port.New() k.ptraceExceptions = make(map[*Task]*Task) k.YAMAPtraceScope = linux.YAMA_SCOPE_RELATIONAL if VFS2Enabled { ctx := k.SupervisorContext() if err := k.vfs.Init(ctx); err != nil { return fmt.Errorf("failed to initialize VFS: %v", err) } pipeFilesystem, err := pipefs.NewFilesystem(&k.vfs) if err != nil { return fmt.Errorf("failed to create pipefs filesystem: %v", err) } defer pipeFilesystem.DecRef(ctx) pipeMount, err := k.vfs.NewDisconnectedMount(pipeFilesystem, nil, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create pipefs mount: %v", err) } k.pipeMount = pipeMount
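// Note: the deferred pipeFilesystem.DecRef above (and the analogous DecRefs
// below) drop only Init's reference; the corresponding NewDisconnectedMount
// takes its own reference, so the mount remains valid after Init returns.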
tmpfsFilesystem, tmpfsRoot, err := tmpfs.NewFilesystem(ctx, &k.vfs, auth.NewRootCredentials(k.rootUserNamespace)) if err != nil { return fmt.Errorf("failed to create tmpfs filesystem: %v", err) } defer tmpfsFilesystem.DecRef(ctx) defer tmpfsRoot.DecRef(ctx) shmMount, err := k.vfs.NewDisconnectedMount(tmpfsFilesystem, tmpfsRoot, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create tmpfs mount: %v", err) } k.shmMount = shmMount socketFilesystem, err := sockfs.NewFilesystem(&k.vfs) if err != nil { return fmt.Errorf("failed to create sockfs filesystem: %v", err) } defer socketFilesystem.DecRef(ctx) socketMount, err := k.vfs.NewDisconnectedMount(socketFilesystem, nil, &vfs.MountOptions{}) if err != nil { return fmt.Errorf("failed to create sockfs mount: %v", err) } k.socketMount = socketMount k.socketsVFS2 = make(map[*vfs.FileDescription]*SocketRecord) k.cgroupRegistry = newCgroupRegistry() } return nil } // SaveTo saves the state of k to w. // // Preconditions: The kernel must be paused throughout the call to SaveTo. func (k *Kernel) SaveTo(ctx context.Context, w wire.Writer) error { saveStart := time.Now() // Do not allow other Kernel methods to affect it while it's being saved. k.extMu.Lock() defer k.extMu.Unlock() // Stop time. k.pauseTimeLocked(ctx) defer k.resumeTimeLocked(ctx) // Evict all evictable MemoryFile allocations. k.mf.StartEvictions() k.mf.WaitForEvictions() if VFS2Enabled { // Discard unsavable mappings, such as those for host file descriptors. if err := k.invalidateUnsavableMappings(ctx); err != nil { return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) } // Prepare filesystems for saving. This must be done after // invalidateUnsavableMappings(), since dropping memory mappings may // affect filesystem state (e.g. page cache reference counts). if err := k.vfs.PrepareSave(ctx); err != nil { return err } } else { // Flush cached file writes to backing storage. This must come after // MemoryFile eviction since eviction may cause file writes. if err := k.flushWritesToFiles(ctx); err != nil { return err } // Remove all epoll waiter objects from underlying wait queues. // NOTE: for programs to resume execution in future snapshot scenarios, // we will need to re-establish these waiter objects after saving. k.tasks.unregisterEpollWaiters(ctx) // Clear the dirent cache before saving because Dirents must be Loaded in a // particular order (parents before children), and Loading dirents from a cache // breaks that order. if err := k.flushMountSourceRefs(ctx); err != nil { return err } // Ensure that all inode and mount release operations have completed. fs.AsyncBarrier() // Once all fs work has completed (flushed references have all been released), // reset mount mappings. This allows individual mounts to save how inodes map // to filesystem resources. Without this, fs.Inodes cannot be restored. fs.SaveInodeMappings() // Discard unsavable mappings, such as those for host file descriptors. // This must be done after waiting for "asynchronous fs work", which // includes async I/O that may touch application memory. // // TODO(gvisor.dev/issue/1624): This rationale is believed to be // obsolete since AIO callbacks are now waited-for by Kernel.Pause(), // but this order is conservatively retained for VFS1. 
if err := k.invalidateUnsavableMappings(ctx); err != nil { return fmt.Errorf("failed to invalidate unsavable mappings: %v", err) } } // Save the CPUID FeatureSet before the rest of the kernel so we can // verify its compatibility on restore before attempting to restore the // entire kernel, which may fail on an incompatible machine. // // N.B. This will also be saved along with the full kernel save below. cpuidStart := time.Now() if _, err := state.Save(ctx, w, k.FeatureSet()); err != nil { return err } log.Infof("CPUID save took [%s].", time.Since(cpuidStart)) // Save the kernel state (which includes the timekeeper's state). kernelStart := time.Now() stats, err := state.Save(ctx, w, k) if err != nil { return err } log.Infof("Kernel save stats: %s", stats.String()) log.Infof("Kernel save took [%s].", time.Since(kernelStart)) // Save the memory file's state. memoryStart := time.Now() if err := k.mf.SaveTo(ctx, w); err != nil { return err } log.Infof("Memory save took [%s].", time.Since(memoryStart)) log.Infof("Overall save took [%s].", time.Since(saveStart)) return nil } // flushMountSourceRefs flushes the MountSources for all mounted filesystems // and open FDs. // // Preconditions: !VFS2Enabled. func (k *Kernel) flushMountSourceRefs(ctx context.Context) error { // Flush all mount sources for currently mounted filesystems in each task. flushed := make(map[*fs.MountNamespace]struct{}) k.tasks.mu.RLock() k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) { if _, ok := flushed[tg.mounts]; ok { // Already flushed. return } tg.mounts.FlushMountSourceRefs() flushed[tg.mounts] = struct{}{} }) k.tasks.mu.RUnlock() // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { file.Dirent.Inode.MountSource.FlushDirentRefs() return nil }) } // forEachFDPaused applies the given function to each open file descriptor in // each task. // // Precondition: Must be called with the kernel paused. func (ts *TaskSet) forEachFDPaused(ctx context.Context, f func(*fs.File, *vfs.FileDescription) error) (err error) { ts.mu.RLock() defer ts.mu.RUnlock() for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. if t.fdTable == nil { continue } t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fileVFS2 *vfs.FileDescription, _ FDFlags) { if lastErr := f(file, fileVFS2); lastErr != nil && err == nil { err = lastErr } }) } return err } // Preconditions: !VFS2Enabled. func (k *Kernel) flushWritesToFiles(ctx context.Context) error { return k.tasks.forEachFDPaused(ctx, func(file *fs.File, _ *vfs.FileDescription) error { if flags := file.Flags(); !flags.Write { return nil } if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) { return nil } // Here we need all metadata synced. syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll) if err := fs.SaveFileFsyncError(syncErr); err != nil { name, _ := file.Dirent.FullName(nil /* root */) // Wrap this error in ErrSaveRejection so that it will trigger a save // error, rather than a panic. This also allows us to distinguish Fsync // errors from state file errors in state.Save. return &fs.ErrSaveRejection{ Err: fmt.Errorf("%q was not sufficiently synced: %w", name, err), } } return nil }) } // Preconditions: !VFS2Enabled.
func (ts *TaskSet) unregisterEpollWaiters(ctx context.Context) { ts.mu.RLock() defer ts.mu.RUnlock() // Tasks that belong to the same process could potentially point to the // same FDTable. So we retain a map of processed ones to avoid // processing the same FDTable multiple times. processed := make(map[*FDTable]struct{}) for t := range ts.Root.tids { // We can skip locking Task.mu here since the kernel is paused. if t.fdTable == nil { continue } if _, ok := processed[t.fdTable]; ok { continue } t.fdTable.forEach(ctx, func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { if e, ok := file.FileOperations.(*epoll.EventPoll); ok { e.UnregisterEpollWaiters() } }) processed[t.fdTable] = struct{}{} } } // Preconditions: The kernel must be paused. func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error { invalidated := make(map[*mm.MemoryManager]struct{}) k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() for t := range k.tasks.Root.tids { // We can skip locking Task.mu here since the kernel is paused. if memMgr := t.image.MemoryManager; memMgr != nil { if _, ok := invalidated[memMgr]; !ok { if err := memMgr.InvalidateUnsavable(ctx); err != nil { return err } invalidated[memMgr] = struct{}{} } } // I really wish we just had a sync.Map of all MMs... if r, ok := t.runState.(*runSyscallAfterExecStop); ok { if err := r.image.MemoryManager.InvalidateUnsavable(ctx); err != nil { return err } } } return nil } // LoadFrom returns a new Kernel loaded from args. func (k *Kernel) LoadFrom(ctx context.Context, r wire.Reader, timeReady chan struct{}, net inet.Stack, clocks sentrytime.Clocks, vfsOpts *vfs.CompleteRestoreOptions) error { loadStart := time.Now() initAppCores := k.applicationCores // Load the pre-saved CPUID FeatureSet. // // N.B. This was also saved along with the full kernel below, so we // don't need to explicitly install it in the Kernel. cpuidStart := time.Now() var features cpuid.FeatureSet if _, err := state.Load(ctx, r, &features); err != nil { return err } log.Infof("CPUID load took [%s].", time.Since(cpuidStart)) // Verify that the FeatureSet is usable on this host. We do this before // Kernel load so that the explicit CPUID mismatch error has priority // over floating point state restore errors that may occur on load on // an incompatible machine. if err := features.CheckHostCompatible(); err != nil { return err } // Load the kernel state. kernelStart := time.Now() stats, err := state.Load(ctx, r, k) if err != nil { return err } log.Infof("Kernel load stats: %s", stats.String()) log.Infof("Kernel load took [%s].", time.Since(kernelStart)) // rootNetworkNamespace should be populated after loading the state file. // Restore the root network stack. k.rootNetworkNamespace.RestoreRootStack(net) // Load the memory file's state. 
memoryStart := time.Now() if err := k.mf.LoadFrom(ctx, r); err != nil { return err } log.Infof("Memory load took [%s].", time.Since(memoryStart)) log.Infof("Overall load took [%s]", time.Since(loadStart)) k.Timekeeper().SetClocks(clocks) if timeReady != nil { close(timeReady) } if net != nil { net.Resume() } if VFS2Enabled { if err := k.vfs.CompleteRestore(ctx, vfsOpts); err != nil { return err } } else { // Ensure that all pending asynchronous work is complete: // - namedpipe opening // - inode file opening if err := fs.AsyncErrorBarrier(); err != nil { return err } } tcpip.AsyncLoading.Wait() log.Infof("Overall load took [%s] after async work", time.Since(loadStart)) // Applications may size per-cpu structures based on k.applicationCores, so // it can't change across save/restore. When we are virtualizing CPU // numbers, this isn't a problem. However, when we are exposing host CPU // assignments, we can't tolerate an increase in the number of host CPUs, // which could result in getcpu(2) returning CPUs that applications expect // not to exist. if k.useHostCores && initAppCores > k.applicationCores { return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores) } return nil } // UniqueID returns a unique identifier. func (k *Kernel) UniqueID() uint64 { id := atomic.AddUint64(&k.uniqueID, 1) if id == 0 { panic("unique identifier generator wrapped around") } return id } // CreateProcessArgs holds arguments to kernel.CreateProcess. type CreateProcessArgs struct { // Filename is the filename to load as the init binary. // // If this is provided as "", File will be checked, then the file will be // guessed via Argv[0]. Filename string // File is a passed host FD pointing to a file to load as the init binary. // // This is checked if and only if Filename is "". File fsbridge.File // Argv is a list of arguments. Argv []string // Envv is a list of environment variables. Envv []string // WorkingDirectory is the initial working directory. // // This defaults to the root if empty. WorkingDirectory string // Credentials is the initial credentials. Credentials *auth.Credentials // FDTable is the initial set of file descriptors. If CreateProcess succeeds, // it takes a reference on FDTable. FDTable *FDTable // Umask is the initial umask. Umask uint // Limits is the initial resource limits. Limits *limits.LimitSet // MaxSymlinkTraversals is the maximum number of symlinks to follow // during resolution. MaxSymlinkTraversals uint // UTSNamespace is the initial UTS namespace. UTSNamespace *UTSNamespace // IPCNamespace is the initial IPC namespace. IPCNamespace *IPCNamespace // PIDNamespace is the initial PID Namespace. PIDNamespace *PIDNamespace // AbstractSocketNamespace is the initial Abstract Socket namespace. AbstractSocketNamespace *AbstractSocketNamespace // MountNamespace optionally contains the mount namespace for this // process. If nil, the init process's mount namespace is used. // // Anyone setting MountNamespace must donate a reference (i.e. // increment it). MountNamespace *fs.MountNamespace // MountNamespaceVFS2 optionally contains the mount namespace for this // process. If nil, the init process's mount namespace is used. // // Anyone setting MountNamespaceVFS2 must donate a reference (i.e. // increment it). MountNamespaceVFS2 *vfs.MountNamespace // ContainerID is the container that the process belongs to.
ContainerID string } // NewContext returns a context.Context that represents the task that will be // created by k.CreateProcess(args). func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext { return &createProcessContext{ Logger: log.Log(), k: k, args: args, } } // createProcessContext is a context.Context that represents the context // associated with a task that is being created. type createProcessContext struct { context.NoopSleeper log.Logger k *Kernel args *CreateProcessArgs } // Value implements context.Context.Value. func (ctx *createProcessContext) Value(key interface{}) interface{} { switch key { case CtxKernel: return ctx.k case CtxPIDNamespace: return ctx.args.PIDNamespace case CtxUTSNamespace: return ctx.args.UTSNamespace case CtxIPCNamespace: ipcns := ctx.args.IPCNamespace ipcns.IncRef() return ipcns case auth.CtxCredentials: return ctx.args.Credentials case fs.CtxRoot: if ctx.args.MountNamespace != nil { // MountNamespace.Root() will take a reference on the root dirent for us. return ctx.args.MountNamespace.Root() } return nil case vfs.CtxRoot: if ctx.args.MountNamespaceVFS2 == nil { return nil } root := ctx.args.MountNamespaceVFS2.Root() root.IncRef() return root case vfs.CtxMountNamespace: if ctx.k.globalInit == nil { return nil } mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() mntns.IncRef() return mntns case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case inet.CtxStack: return ctx.k.RootNetworkNamespace().Stack() case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: return ctx.args.Limits case pgalloc.CtxMemoryFile: return ctx.k.mf case pgalloc.CtxMemoryFileProvider: return ctx.k case platform.CtxPlatform: return ctx.k case uniqueid.CtxGlobalUniqueID: return ctx.k.UniqueID() case uniqueid.CtxGlobalUniqueIDProvider: return ctx.k case uniqueid.CtxInotifyCookie: return ctx.k.GenerateInotifyCookie() case unimpl.CtxEvents: return ctx.k default: return nil } } // CreateProcess creates a new task in a new thread group with the given // options. The new task has no parent and is in the root PID namespace. // // If k.Start() has already been called, then the created process must be // started by calling kernel.StartProcess(tg). // // If k.Start() has not yet been called, then the created task will begin // running when k.Start() is called. // // CreateProcess has no analogue in Linux; it is used to create the initial // application task, as well as processes started by the control server. func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) { k.extMu.Lock() defer k.extMu.Unlock() log.Infof("EXEC: %v", args.Argv) ctx := args.NewContext(k) var ( opener fsbridge.Lookup fsContext *FSContext mntns *fs.MountNamespace mntnsVFS2 *vfs.MountNamespace ) if VFS2Enabled { mntnsVFS2 = args.MountNamespaceVFS2 if mntnsVFS2 == nil { // Add a reference to the namespace, which is transferred to the new process. mntnsVFS2 = k.globalInit.Leader().MountNamespaceVFS2() mntnsVFS2.IncRef() } // Get the root directory from the MountNamespace. root := mntnsVFS2.Root() root.IncRef() defer root.DecRef(ctx) // Grab the working directory. wd := root // Default.
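// If a working directory was specified, it is resolved below relative to the
// mount namespace root; otherwise the root itself (assigned above) serves as
// the working directory.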
if args.WorkingDirectory != "" { pop := vfs.PathOperation{ Root: root, Start: wd, Path: fspath.Parse(args.WorkingDirectory), FollowFinalSymlink: true, } var err error wd, err = k.VFS().GetDentryAt(ctx, args.Credentials, &pop, &vfs.GetDentryOptions{ CheckSearchable: true, }) if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } defer wd.DecRef(ctx) } opener = fsbridge.NewVFSLookup(mntnsVFS2, root, wd) fsContext = NewFSContextVFS2(root, wd, args.Umask) } else { mntns = args.MountNamespace if mntns == nil { mntns = k.GlobalInit().Leader().MountNamespace() mntns.IncRef() } // Get the root directory from the MountNamespace. root := mntns.Root() // The call to newFSContext below will take a reference on root, so we // don't need to hold this one. defer root.DecRef(ctx) // Grab the working directory. remainingTraversals := args.MaxSymlinkTraversals wd := root // Default. if args.WorkingDirectory != "" { var err error wd, err = mntns.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals) if err != nil { return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err) } defer wd.DecRef(ctx) } opener = fsbridge.NewFSLookup(mntns, root, wd) fsContext = newFSContext(root, wd, args.Umask) } tg := k.NewThreadGroup(mntns, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits) cu := cleanup.Make(func() { tg.Release(ctx) }) defer cu.Clean() // Check which file to start from. switch { case args.Filename != "": // If a filename is given, take that. // Set File to nil so we resolve the path in LoadTaskImage. args.File = nil case args.File != nil: // If File is set, take the File provided directly. default: // Otherwise look at Argv and see if the first argument is a valid path. if len(args.Argv) == 0 { return nil, 0, fmt.Errorf("no filename or command provided") } if !filepath.IsAbs(args.Argv[0]) { return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0]) } args.Filename = args.Argv[0] } // Create a fresh task context. remainingTraversals := args.MaxSymlinkTraversals loadArgs := loader.LoadArgs{ Opener: opener, RemainingTraversals: &remainingTraversals, ResolveFinal: true, Filename: args.Filename, File: args.File, CloseOnExec: false, Argv: args.Argv, Envv: args.Envv, Features: k.featureSet, } image, se := k.LoadTaskImage(ctx, loadArgs) if se != nil { return nil, 0, errors.New(se.String()) } // Take a reference on the FDTable, which will be transferred to // TaskSet.NewTask(). args.FDTable.IncRef() // Create the task. config := &TaskConfig{ Kernel: k, ThreadGroup: tg, TaskImage: image, FSContext: fsContext, FDTable: args.FDTable, Credentials: args.Credentials, NetworkNamespace: k.RootNetworkNamespace(), AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores), UTSNamespace: args.UTSNamespace, IPCNamespace: args.IPCNamespace, AbstractSocketNamespace: args.AbstractSocketNamespace, MountNamespaceVFS2: mntnsVFS2, ContainerID: args.ContainerID, } t, err := k.tasks.NewTask(ctx, config) if err != nil { return nil, 0, err } t.traceExecEvent(image) // Simulate exec for tracing. // Success. cu.Release() tgid := k.tasks.Root.IDOfThreadGroup(tg) if k.globalInit == nil { k.globalInit = tg } return tg, tgid, nil } // StartProcess starts running a process that was created with CreateProcess. func (k *Kernel) StartProcess(tg *ThreadGroup) { t := tg.Leader() tid := k.tasks.Root.IDOfTask(t) t.Start(tid) } // Start starts execution of all tasks in k. 
// // Preconditions: Start may be called exactly once. func (k *Kernel) Start() error { k.extMu.Lock() defer k.extMu.Unlock() if k.globalInit == nil { return fmt.Errorf("kernel contains no tasks") } if k.started { return fmt.Errorf("kernel already started") } k.started = true k.cpuClockTicker = ktime.NewTimer(k.timekeeper.monotonicClock, newKernelCPUClockTicker(k)) k.cpuClockTicker.Swap(ktime.Setting{ Enabled: true, Period: linux.ClockTick, }) // If k was created by LoadKernelFrom, timers were stopped during // Kernel.SaveTo and need to be resumed. If k was created by NewKernel, // this is a no-op. k.resumeTimeLocked(k.SupervisorContext()) // Start task goroutines. k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() for t, tid := range k.tasks.Root.tids { t.Start(tid) } return nil } // pauseTimeLocked pauses all Timers and Timekeeper updates. // // Preconditions: // * Any task goroutines running in k must be stopped. // * k.extMu must be locked. func (k *Kernel) pauseTimeLocked(ctx context.Context) { // k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before // Kernel.Start(). if k.cpuClockTicker != nil { k.cpuClockTicker.Pause() } // By precondition, nothing else can be interacting with PIDNamespace.tids // or FDTable.files, so we can iterate them without synchronization. (We // can't hold the TaskSet mutex when pausing thread group timers because // thread group timers call ThreadGroup.SendSignal, which takes the TaskSet // mutex, while holding the Timer mutex.) for t := range k.tasks.Root.tids { if t == t.tg.leader { t.tg.itimerRealTimer.Pause() for _, it := range t.tg.timers { it.PauseTimer() } } // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.PauseTimer() } } else { if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { tfd.PauseTimer() } } }) } } k.timekeeper.PauseUpdates() } // resumeTimeLocked resumes all Timers and Timekeeper updates. If // pauseTimeLocked has not been previously called, resumeTimeLocked has no // effect. // // Preconditions: // * Any task goroutines running in k must be stopped. // * k.extMu must be locked. func (k *Kernel) resumeTimeLocked(ctx context.Context) { if k.cpuClockTicker != nil { k.cpuClockTicker.Resume() } k.timekeeper.ResumeUpdates() for t := range k.tasks.Root.tids { if t == t.tg.leader { t.tg.itimerRealTimer.Resume() for _, it := range t.tg.timers { it.ResumeTimer() } } if t.fdTable != nil { t.fdTable.forEach(ctx, func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { if VFS2Enabled { if tfd, ok := fd.Impl().(*timerfd.TimerFileDescription); ok { tfd.ResumeTimer() } } else { if tfd, ok := file.FileOperations.(*oldtimerfd.TimerOperations); ok { tfd.ResumeTimer() } } }) } } } func (k *Kernel) incRunningTasks() { for { tasks := atomic.LoadInt64(&k.runningTasks) if tasks != 0 { // Standard case. Simply increment. if !atomic.CompareAndSwapInt64(&k.runningTasks, tasks, tasks+1) { continue } return } // Transition from 0 -> 1. Synchronize with other transitions and timer. k.runningTasksMu.Lock() tasks = atomic.LoadInt64(&k.runningTasks) if tasks != 0 { // We're no longer the first task, no need to // re-enable. atomic.AddInt64(&k.runningTasks, 1) k.runningTasksMu.Unlock() return } if !k.cpuClockTickerDisabled { // Timer was never disabled. 
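// The ticker was never disabled, so cpuClock is already up to date; just
// publish the 0 -> 1 transition under runningTasksMu and return.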
atomic.StoreInt64(&k.runningTasks, 1) k.runningTasksMu.Unlock() return } // We need to update cpuClock for all of the ticks missed while we // slept, and then re-enable the timer. // // The Notify in Swap isn't sufficient. kernelCPUClockTicker.Notify // always increments cpuClock by 1 regardless of the number of // expirations as a heuristic to avoid over-accounting in cases of CPU // throttling. // // We want to cover the normal case, when all time should be accounted, // so we increment for all expirations. Throttling is less concerning // here because the ticker is only disabled from Notify. This means // that Notify must schedule and compensate for the throttled period // before the timer is disabled. Throttling while the timer is disabled // doesn't matter, as nothing is running or reading cpuClock anyway. // // S/R also adds complication, as there are two cases. Recall that // monotonicClock will jump forward on restore. // // 1. If the ticker is enabled during save, then on Restore Notify is // called with many expirations, covering the time jump, but cpuClock // is only incremented by 1. // // 2. If the ticker is disabled during save, then after Restore the // first wakeup will call this function and cpuClock will be // incremented by the number of expirations across the S/R. // // These cause very different values of cpuClock. But again, since // nothing was running while the ticker was disabled, those differences // don't matter. setting, exp := k.cpuClockTickerSetting.At(k.timekeeper.monotonicClock.Now()) if exp > 0 { atomic.AddUint64(&k.cpuClock, exp) } // Now that cpuClock is updated it is safe to allow other tasks to // transition to running. atomic.StoreInt64(&k.runningTasks, 1) // N.B. we must unlock before calling Swap to maintain lock ordering. // // cpuClockTickerDisabled need not wait until after Swap to become // false. It is sufficient that the timer *will* be enabled. k.cpuClockTickerDisabled = false k.runningTasksMu.Unlock() // This won't call Notify (unless it's been ClockTick since setting.At // above). This means we skip the thread group work in Notify. However, // since nothing was running while we were disabled, none of the timers // could have expired. k.cpuClockTicker.Swap(setting) return } } func (k *Kernel) decRunningTasks() { tasks := atomic.AddInt64(&k.runningTasks, -1) if tasks < 0 { panic(fmt.Sprintf("Invalid running count %d", tasks)) } // Nothing to do. The next CPU clock tick will disable the timer if // there is still nothing running. This provides approximately one tick // of slack in which we can switch back and forth between idle and // active without an expensive transition. } // WaitExited blocks until all tasks in k have exited. func (k *Kernel) WaitExited() { k.tasks.liveGoroutines.Wait() } // Kill requests that all tasks in k immediately exit as if group exiting with // status ws. Kill does not wait for tasks to exit. func (k *Kernel) Kill(ws linux.WaitStatus) { k.extMu.Lock() defer k.extMu.Unlock() k.tasks.Kill(ws) } // Pause requests that all tasks in k temporarily stop executing, and blocks // until all tasks and asynchronous I/O operations in k have stopped. Multiple // calls to Pause nest and require an equal number of calls to Unpause to // resume execution. func (k *Kernel) Pause() { k.extMu.Lock() k.tasks.BeginExternalStop() k.extMu.Unlock() k.tasks.runningGoroutines.Wait() k.tasks.aioGoroutines.Wait() } // ReceiveTaskStates receives full states for all tasks.
func (k *Kernel) ReceiveTaskStates() { k.extMu.Lock() k.tasks.PullFullState() k.extMu.Unlock() } // Unpause ends the effect of a previous call to Pause. If Unpause is called // without a matching preceding call to Pause, Unpause may panic. func (k *Kernel) Unpause() { k.extMu.Lock() defer k.extMu.Unlock() k.tasks.EndExternalStop() } // SendExternalSignal injects a signal into the kernel. // // context is used only for debugging to describe how the signal was received. // // Preconditions: Kernel must have an init process. func (k *Kernel) SendExternalSignal(info *linux.SignalInfo, context string) { k.extMu.Lock() defer k.extMu.Unlock() k.sendExternalSignal(info, context) } // SendExternalSignalThreadGroup injects a signal into a specific ThreadGroup. // This function doesn't skip signals like SendExternalSignal does. func (k *Kernel) SendExternalSignalThreadGroup(tg *ThreadGroup, info *linux.SignalInfo) error { k.extMu.Lock() defer k.extMu.Unlock() return tg.SendSignal(info) } // SendContainerSignal sends the given signal to all processes inside the // namespace that match the given container ID. func (k *Kernel) SendContainerSignal(cid string, info *linux.SignalInfo) error { k.extMu.Lock() defer k.extMu.Unlock() k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() var lastErr error for tg := range k.tasks.Root.tgids { if tg.leader.ContainerID() == cid { tg.signalHandlers.mu.Lock() infoCopy := *info if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil { lastErr = err } tg.signalHandlers.mu.Unlock() } } return lastErr } // RebuildTraceContexts rebuilds the trace context for all tasks. // // Unfortunately, if these are built while tracing is not enabled, then we will // not have meaningful trace data. Rebuilding here ensures that we can do so // after tracing has been enabled. func (k *Kernel) RebuildTraceContexts() { // We need to pause all task goroutines because Task.rebuildTraceContext() // replaces Task.traceContext and Task.traceTask, which are // task-goroutine-exclusive (i.e. the task goroutine assumes that it can // access them without synchronization) for performance. k.Pause() defer k.Unpause() k.extMu.Lock() defer k.extMu.Unlock() k.tasks.mu.RLock() defer k.tasks.mu.RUnlock() for t, tid := range k.tasks.Root.tids { t.rebuildTraceContext(tid) } } // FeatureSet returns the FeatureSet. func (k *Kernel) FeatureSet() *cpuid.FeatureSet { return k.featureSet } // Timekeeper returns the Timekeeper. func (k *Kernel) Timekeeper() *Timekeeper { return k.timekeeper } // TaskSet returns the TaskSet. func (k *Kernel) TaskSet() *TaskSet { return k.tasks } // RootUserNamespace returns the root UserNamespace. func (k *Kernel) RootUserNamespace() *auth.UserNamespace { return k.rootUserNamespace } // RootUTSNamespace returns the root UTSNamespace. func (k *Kernel) RootUTSNamespace() *UTSNamespace { return k.rootUTSNamespace } // RootIPCNamespace takes a reference and returns the root IPCNamespace. func (k *Kernel) RootIPCNamespace() *IPCNamespace { k.rootIPCNamespace.IncRef() return k.rootIPCNamespace } // RootPIDNamespace returns the root PIDNamespace. func (k *Kernel) RootPIDNamespace() *PIDNamespace { return k.tasks.Root } // RootAbstractSocketNamespace returns the root AbstractSocketNamespace. func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace { return k.rootAbstractSocketNamespace } // RootNetworkNamespace returns the root network namespace, always non-nil.
func (k *Kernel) RootNetworkNamespace() *inet.Namespace { return k.rootNetworkNamespace } // GlobalInit returns the thread group with ID 1 in the root PID namespace, or // nil if no such thread group exists. GlobalInit may return a thread group // containing no tasks if the thread group has already exited. func (k *Kernel) GlobalInit() *ThreadGroup { k.extMu.Lock() defer k.extMu.Unlock() return k.globalInit } // TestOnlySetGlobalInit sets the thread group with ID 1 in the root PID namespace. func (k *Kernel) TestOnlySetGlobalInit(tg *ThreadGroup) { k.globalInit = tg } // ApplicationCores returns the number of CPUs visible to sandboxed // applications. func (k *Kernel) ApplicationCores() uint { return k.applicationCores } // RealtimeClock returns the application CLOCK_REALTIME clock. func (k *Kernel) RealtimeClock() ktime.Clock { return k.timekeeper.realtimeClock } // MonotonicClock returns the application CLOCK_MONOTONIC clock. func (k *Kernel) MonotonicClock() ktime.Clock { return k.timekeeper.monotonicClock } // CPUClockNow returns the current value of k.cpuClock. func (k *Kernel) CPUClockNow() uint64 { return atomic.LoadUint64(&k.cpuClock) } // Syslog returns the syslog. func (k *Kernel) Syslog() *syslog { return &k.syslog } // GenerateInotifyCookie generates a unique inotify event cookie. // // Returned values may overlap with previously returned values if the value // space is exhausted. 0 is not a valid cookie value; all other values // representable in a uint32 are allowed. func (k *Kernel) GenerateInotifyCookie() uint32 { id := atomic.AddUint32(&k.nextInotifyCookie, 1) // Wrap-around is explicitly allowed for inotify event cookies. if id == 0 { id = atomic.AddUint32(&k.nextInotifyCookie, 1) } return id } // NetlinkPorts returns the netlink port manager. func (k *Kernel) NetlinkPorts() *port.Manager { return k.netlinkPorts } var ( errSaved = errors.New("sandbox has been successfully saved") errAutoSaved = errors.New("sandbox has been successfully auto-saved") ) // SaveStatus returns the sandbox save status. If it was saved successfully, // autosaved indicates whether save was triggered by autosave. If it was not // saved successfully, err indicates the sandbox error that caused the kernel to // exit during save. func (k *Kernel) SaveStatus() (saved, autosaved bool, err error) { k.extMu.Lock() defer k.extMu.Unlock() switch k.saveStatus { case nil: return false, false, nil case errSaved: return true, false, nil case errAutoSaved: return true, true, nil default: return false, false, k.saveStatus } } // SetSaveSuccess sets the flag indicating that save completed successfully, if // no status was already set. func (k *Kernel) SetSaveSuccess(autosave bool) { k.extMu.Lock() defer k.extMu.Unlock() if k.saveStatus == nil { if autosave { k.saveStatus = errAutoSaved } else { k.saveStatus = errSaved } } } // SetSaveError sets the sandbox error that caused the kernel to exit during // save, if one is not already set. func (k *Kernel) SetSaveError(err error) { k.extMu.Lock() defer k.extMu.Unlock() if k.saveStatus == nil { k.saveStatus = err } } // SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or // LoadFrom. func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) { k.mf = mf } // MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile. func (k *Kernel) MemoryFile() *pgalloc.MemoryFile { return k.mf } // SupervisorContext returns a Context with maximum privileges in k. It should // only be used by goroutines outside the control of the emulated kernel // defined by k.
// // Callers are responsible for ensuring that the returned Context is not used // concurrently with changes to the Kernel. func (k *Kernel) SupervisorContext() context.Context { return supervisorContext{ Logger: log.Log(), k: k, } } // SocketRecord represents a socket recorded in Kernel.socketsVFS2. // // +stateify savable type SocketRecord struct { k *Kernel Sock *refs.WeakRef // TODO(gvisor.dev/issue/1624): Only used by VFS1. SockVFS2 *vfs.FileDescription // Only used by VFS2. ID uint64 // Socket table entry number. } // SocketRecordVFS1 represents a socket recorded in Kernel.sockets. It implements // refs.WeakRefUser for sockets stored in the socket table. // // +stateify savable type SocketRecordVFS1 struct { socketEntry SocketRecord } // WeakRefGone implements refs.WeakRefUser.WeakRefGone. func (s *SocketRecordVFS1) WeakRefGone(context.Context) { s.k.extMu.Lock() s.k.sockets.Remove(s) s.k.extMu.Unlock() } // RecordSocket adds a socket to the system-wide socket table for tracking. // // Precondition: Caller must hold a reference to sock. func (k *Kernel) RecordSocket(sock *fs.File) { k.extMu.Lock() id := k.nextSocketRecord k.nextSocketRecord++ s := &SocketRecordVFS1{ SocketRecord: SocketRecord{ k: k, ID: id, }, } s.Sock = refs.NewWeakRef(sock, s) k.sockets.PushBack(s) k.extMu.Unlock() } // RecordSocketVFS2 adds a VFS2 socket to the system-wide socket table for // tracking. // // Precondition: Caller must hold a reference to sock. // // Note that the socket table will not hold a reference on the // vfs.FileDescription. func (k *Kernel) RecordSocketVFS2(sock *vfs.FileDescription) { k.extMu.Lock() if _, ok := k.socketsVFS2[sock]; ok { panic(fmt.Sprintf("Socket %p added twice", sock)) } id := k.nextSocketRecord k.nextSocketRecord++ s := &SocketRecord{ k: k, ID: id, SockVFS2: sock, } k.socketsVFS2[sock] = s k.extMu.Unlock() } // DeleteSocketVFS2 removes a VFS2 socket from the system-wide socket table. func (k *Kernel) DeleteSocketVFS2(sock *vfs.FileDescription) { k.extMu.Lock() delete(k.socketsVFS2, sock) k.extMu.Unlock() } // ListSockets returns a snapshot of all sockets. // // Callers of ListSockets() in VFS2 should use SocketRecord.SockVFS2.TryIncRef() // to get a reference on a socket in the table. func (k *Kernel) ListSockets() []*SocketRecord { k.extMu.Lock() var socks []*SocketRecord if VFS2Enabled { for _, s := range k.socketsVFS2 { socks = append(socks, s) } } else { for s := k.sockets.Front(); s != nil; s = s.Next() { socks = append(socks, &s.SocketRecord) } } k.extMu.Unlock() return socks } // supervisorContext is a privileged context. type supervisorContext struct { context.NoopSleeper log.Logger k *Kernel } // Value implements context.Context. func (ctx supervisorContext) Value(key interface{}) interface{} { switch key { case CtxCanTrace: // The supervisor context can trace anything. (None of // supervisorContext's users are expected to invoke ptrace, but ptrace // permissions are required for certain file accesses.) return func(*Task, bool) bool { return true } case CtxKernel: return ctx.k case CtxPIDNamespace: return ctx.k.tasks.Root case CtxUTSNamespace: return ctx.k.rootUTSNamespace case CtxIPCNamespace: ipcns := ctx.k.rootIPCNamespace ipcns.IncRef() return ipcns case auth.CtxCredentials: // The supervisor context is global root. 
return auth.NewRootCredentials(ctx.k.rootUserNamespace) case fs.CtxRoot: if ctx.k.globalInit != nil { return ctx.k.globalInit.mounts.Root() } return nil case vfs.CtxRoot: if ctx.k.globalInit == nil { return vfs.VirtualDentry{} } root := ctx.k.GlobalInit().Leader().MountNamespaceVFS2().Root() root.IncRef() return root case vfs.CtxMountNamespace: if ctx.k.globalInit == nil { return nil } mntns := ctx.k.GlobalInit().Leader().MountNamespaceVFS2() mntns.IncRef() return mntns case fs.CtxDirentCacheLimiter: return ctx.k.DirentCacheLimiter case inet.CtxStack: return ctx.k.RootNetworkNamespace().Stack() case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: // No limits apply. return limits.NewLimitSet() case pgalloc.CtxMemoryFile: return ctx.k.mf case pgalloc.CtxMemoryFileProvider: return ctx.k case platform.CtxPlatform: return ctx.k case uniqueid.CtxGlobalUniqueID: return ctx.k.UniqueID() case uniqueid.CtxGlobalUniqueIDProvider: return ctx.k case uniqueid.CtxInotifyCookie: return ctx.k.GenerateInotifyCookie() case unimpl.CtxEvents: return ctx.k default: return nil } } // Rate limits for the number of unimplemented syscall events. const ( unimplementedSyscallsMaxRate = 100 // events per second unimplementedSyscallBurst = 1000 // events ) // EmitUnimplementedEvent emits an UnimplementedSyscall event via the event // channel. func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) { k.unimplementedSyscallEmitterOnce.Do(func() { k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst) }) t := TaskFromContext(ctx) _, _ = k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{ Tid: int32(t.ThreadID()), Registers: t.Arch().StateData().Proto(), }) } // VFS returns the virtual filesystem for the kernel. func (k *Kernel) VFS() *vfs.VirtualFilesystem { return &k.vfs } // SetHostMount sets the hostfs mount. func (k *Kernel) SetHostMount(mnt *vfs.Mount) { if k.hostMount != nil { panic("Kernel.hostMount cannot be set more than once") } k.hostMount = mnt } // HostMount returns the hostfs mount. func (k *Kernel) HostMount() *vfs.Mount { return k.hostMount } // PipeMount returns the pipefs mount. func (k *Kernel) PipeMount() *vfs.Mount { return k.pipeMount } // ShmMount returns the tmpfs mount. func (k *Kernel) ShmMount() *vfs.Mount { return k.shmMount } // SocketMount returns the sockfs mount. func (k *Kernel) SocketMount() *vfs.Mount { return k.socketMount } // CgroupRegistry returns the cgroup registry. func (k *Kernel) CgroupRegistry() *CgroupRegistry { return k.cgroupRegistry } // Release releases resources owned by k. // // Precondition: This should only be called after the kernel is fully // initialized, e.g. after k.Start() has been called. func (k *Kernel) Release() { ctx := k.SupervisorContext() if VFS2Enabled { k.hostMount.DecRef(ctx) k.pipeMount.DecRef(ctx) k.shmMount.DecRef(ctx) k.socketMount.DecRef(ctx) k.vfs.Release(ctx) } k.timekeeper.Destroy() k.vdso.Release(ctx) } // PopulateNewCgroupHierarchy moves all tasks into a newly created cgroup // hierarchy. // // Precondition: root must be a new cgroup with no tasks. This implies the // controllers for root are also new and currently manage no task, which in turn // implies the new cgroup can be populated without migrating tasks between // cgroups. 
func (k *Kernel) PopulateNewCgroupHierarchy(root Cgroup) { k.tasks.mu.RLock() k.tasks.forEachTaskLocked(func(t *Task) { if t.exitState != TaskExitNone { return } t.mu.Lock() // A task can already be in the cgroup if it was created after the // cgroup hierarchy was registered. t.enterCgroupIfNotYetLocked(root) t.mu.Unlock() }) k.tasks.mu.RUnlock() } // ReleaseCgroupHierarchy moves all tasks out of all cgroups belonging to the // hierarchy with the provided id. This is intended for use during hierarchy // teardown, as otherwise the tasks would be orphaned w.r.t. some controllers. func (k *Kernel) ReleaseCgroupHierarchy(hid uint32) { k.tasks.mu.RLock() k.tasks.forEachTaskLocked(func(t *Task) { if t.exitState != TaskExitNone { return } t.mu.Lock() for cg := range t.cgroups { if cg.HierarchyID() == hid { t.leaveCgroupLocked(cg) } } t.mu.Unlock() }) k.tasks.mu.RUnlock() }
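// The two hooks above are driven by cgroupfs. The sketch below is
// illustrative only and is not part of the upstream file (the real wiring
// lives in fsimpl/cgroupfs); it shows the expected lifecycle of a hierarchy
// against this API: register, populate, and eventually release and
// unregister. All parameter names here are placeholders.
func exampleCgroupHierarchyLifecycle(k *Kernel, cs []CgroupController, fs cgroupFS, root Cgroup) error {
	// Publish the controllers as a new hierarchy; this allocates a
	// hierarchy ID and hands it to the filesystem via
	// InitializeHierarchyID.
	if err := k.CgroupRegistry().Register(cs, fs); err != nil {
		return err
	}
	// Attach every existing task to the root cgroup of the new hierarchy.
	k.PopulateNewCgroupHierarchy(root)
	// ... hierarchy is active; tasks may enter and leave its cgroups ...
	// On teardown, evict all tasks first so none are left orphaned, then
	// drop the hierarchy from the registry.
	hid := root.HierarchyID()
	k.ReleaseCgroupHierarchy(hid)
	k.CgroupRegistry().Unregister(hid)
	return nil
}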
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package transport contains the implementation of Unix endpoints. package transport import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/waiter" ) const ( // The minimum size of the send/receive buffers. minimumBufferSize = 4 << 10 // 4 KiB (match default in linux) // The default size of the send/receive buffers. defaultBufferSize = 208 << 10 // 208 KiB (default in linux for net.core.wmem_default) // The maximum permitted size for the send/receive buffers. maxBufferSize = 4 << 20 // 4 MiB (default in linux for net.core.wmem_max) ) // A RightsControlMessage is a control message containing FDs. // // +stateify savable type RightsControlMessage interface { // Clone returns a copy of the RightsControlMessage. Clone() RightsControlMessage // Release releases any resources owned by the RightsControlMessage. Release(ctx context.Context) } // A CredentialsControlMessage is a control message containing Unix credentials. type CredentialsControlMessage interface { // Equals returns true iff the two messages are equal. Equals(CredentialsControlMessage) bool } // A ControlMessages represents a collection of socket control messages. // // +stateify savable type ControlMessages struct { // Rights is a control message containing FDs. Rights RightsControlMessage // Credentials is a control message containing Unix credentials. Credentials CredentialsControlMessage } // Empty returns true iff the ControlMessages does not contain either // credentials or rights. func (c *ControlMessages) Empty() bool { return c.Rights == nil && c.Credentials == nil } // Clone clones both the credentials and the rights. func (c *ControlMessages) Clone() ControlMessages { cm := ControlMessages{} if c.Rights != nil { cm.Rights = c.Rights.Clone() } cm.Credentials = c.Credentials return cm } // Release releases both the credentials and the rights.
func (c *ControlMessages) Release(ctx context.Context) { if c.Rights != nil { c.Rights.Release(ctx) } *c = ControlMessages{} } // Endpoint is the interface implemented by Unix transport protocol // implementations that expose functionality like sendmsg, recvmsg, connect, // etc. to Unix socket implementations. type Endpoint interface { Credentialer waiter.Waitable // Close puts the endpoint in a closed state and frees all resources // associated with it. Close(ctx context.Context) // RecvMsg reads data and a control message from the endpoint. This method // does not block if there is no data pending. // // creds indicates if credential control messages are requested by the // caller. This is useful for determining if control messages can be // coalesced. creds is a hint and can be safely ignored by the // implementation if no coalescing is possible. It is fine to return // credential control messages when none were requested or to not return // credential control messages when they were requested. // // numRights is the number of SCM_RIGHTS FDs requested by the caller. This // is useful if one must allocate a buffer to receive a SCM_RIGHTS message // or determine if control messages can be coalesced. numRights is a hint // and can be safely ignored by the implementation if the number of // available SCM_RIGHTS FDs is known and no coalescing is possible. It is // fine for the returned number of SCM_RIGHTS FDs to be either higher or // lower than the requested number. // // If peek is true, no data should be consumed from the Endpoint. Any and // all data returned from a peek should be available in the next call to // RecvMsg. // // recvLen is the number of bytes copied into data. // // msgLen is the length of the read message consumed for datagram Endpoints. // msgLen is always the same as recvLen for stream Endpoints. // // CMTruncated indicates that the numRights hint was used to receive fewer // than the total available SCM_RIGHTS FDs. Additional truncation may be // required by the caller. RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *tcpip.FullAddress) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, err *syserr.Error) // SendMsg writes data and a control message to the endpoint's peer. // This method does not block if the data cannot be written. // // SendMsg does not take ownership of any of its arguments on error. SendMsg(context.Context, [][]byte, ControlMessages, BoundEndpoint) (int64, *syserr.Error) // Connect connects this endpoint directly to another. // // This should be called on the client endpoint, and the (bound) // endpoint passed in as a parameter. // // The error codes are the same as Connect. Connect(ctx context.Context, server BoundEndpoint) *syserr.Error // Shutdown closes the read and/or write end of the endpoint connection // to its peer. Shutdown(flags tcpip.ShutdownFlags) *syserr.Error // Listen puts the endpoint in "listen" mode, which allows it to accept // new connections. Listen(backlog int) *syserr.Error // Accept returns a new endpoint if a peer has established a connection // to an endpoint previously set to listen mode. This method does not // block if no new connections are available. // // The returned Queue is the wait queue for the newly created endpoint. // // peerAddr if not nil will be populated with the address of the connected // peer on a successful accept. Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error) // Bind binds the endpoint to a specific local address and port. 
// Specifying a NIC is optional. // // An optional commit function will be executed atomically with respect // to binding the endpoint. If this returns an error, the bind will not // occur and the error will be propagated back to the caller. Bind(address tcpip.FullAddress, commit func() *syserr.Error) *syserr.Error // Type returns the socket type, typically either SockStream, SockDgram // or SockSeqpacket. Type() linux.SockType // GetLocalAddress returns the address to which the endpoint is bound. GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // GetRemoteAddress returns the address to which the endpoint is // connected. GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) // SetSockOpt sets a socket option. SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error // SetSockOptInt sets a socket option for simple cases when a value has // the int type. SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error // GetSockOpt gets a socket option. GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error // GetSockOptInt gets a socket option for simple cases when a return // value has the int type. GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) // State returns the current state of the socket, as represented by Linux in // procfs. State() uint32 // LastError clears and returns the last error reported by the endpoint. LastError() tcpip.Error // SocketOptions returns the structure which contains all the socket // level options. SocketOptions() *tcpip.SocketOptions } // A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket // option. type Credentialer interface { // Passcred returns whether or not the SO_PASSCRED socket option is // enabled on this end. Passcred() bool // ConnectedPasscred returns whether or not the SO_PASSCRED socket option // is enabled on the connected end. ConnectedPasscred() bool } // A BoundEndpoint is a unix endpoint that can be connected to. type BoundEndpoint interface { // BidirectionalConnect establishes a bi-directional connection between two // unix endpoints in an all-or-nothing manner. If an error occurs during // connecting, the state of neither endpoint should be modified. // // In order for an endpoint to establish such a bidirectional connection // with a BoundEndpoint, the endpoint calls the BidirectionalConnect method // on the BoundEndpoint and sends a representation of itself (the // ConnectingEndpoint) and a callback (returnConnect) to receive the // connection information (Receiver and ConnectedEndpoint) upon a // successful connect. The callback should only be called on a successful // connect. // // For a connection attempt to be successful, the ConnectingEndpoint must // be unconnected and not listening and the BoundEndpoint whose // BidirectionalConnect method is being called must be listening. // // This method will return syserr.ErrConnectionRefused on endpoints with a // type that isn't SockStream or SockSeqpacket. BidirectionalConnect(ctx context.Context, ep ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error // UnidirectionalConnect establishes a write-only connection to a unix // endpoint. // // An endpoint which calls UnidirectionalConnect and supports it itself must // not hold its own lock when calling UnidirectionalConnect. // // This method will return syserr.ErrConnectionRefused on a non-SockDgram // endpoint. UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error) // Passcred returns whether or not the SO_PASSCRED socket option is // enabled on this end.
Passcred() bool // Release releases any resources held by the BoundEndpoint. It must be // called before dropping all references to a BoundEndpoint returned by a // function. Release(ctx context.Context) } // message represents a message passed over a Unix domain socket. // // +stateify savable type message struct { messageEntry // Data is the Message payload. Data buffer.View // Control is auxiliary control message data that goes along with the // data. Control ControlMessages // Address is the bound address of the endpoint that sent the message. // // If the endpoint that sent the message is not bound, the Address is // the empty string. Address tcpip.FullAddress } // Length returns number of bytes stored in the message. func (m *message) Length() int64 { return int64(len(m.Data)) } // Release releases any resources held by the message. func (m *message) Release(ctx context.Context) { m.Control.Release(ctx) } // Peek returns a copy of the message. func (m *message) Peek() *message { return &message{Data: m.Data, Control: m.Control.Clone(), Address: m.Address} } // Truncate reduces the length of the message payload to n bytes. // // Preconditions: n <= m.Length(). func (m *message) Truncate(n int64) { m.Data.CapLength(int(n)) } // A Receiver can be used to receive Messages. type Receiver interface { // Recv receives a single message. This method does not block. // // See Endpoint.RecvMsg for documentation on shared arguments. // // notify indicates if RecvNotify should be called. Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error) // RecvNotify notifies the Receiver of a successful Recv. This must not be // called while holding any endpoint locks. RecvNotify() // CloseRecv prevents the receiving of additional Messages. // // After CloseRecv is called, CloseNotify must also be called. CloseRecv() // CloseNotify notifies the Receiver of recv being closed. This must not be // called while holding any endpoint locks. CloseNotify() // Readable returns if messages should be attempted to be received. This // includes when read has been shutdown. Readable() bool // RecvQueuedSize returns the total amount of data currently receivable. // RecvQueuedSize should return -1 if the operation isn't supported. RecvQueuedSize() int64 // RecvMaxQueueSize returns maximum value for RecvQueuedSize. // RecvMaxQueueSize should return -1 if the operation isn't supported. RecvMaxQueueSize() int64 // Release releases any resources owned by the Receiver. It should be // called before dropping all references to a Receiver. Release(ctx context.Context) } // queueReceiver implements Receiver for datagram sockets. // // +stateify savable type queueReceiver struct { readQueue *queue } // Recv implements Receiver.Recv. 
func (q *queueReceiver) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { var m *message var notify bool var err *syserr.Error if peek { m, err = q.readQueue.Peek() } else { m, notify, err = q.readQueue.Dequeue() } if err != nil { return 0, 0, ControlMessages{}, false, tcpip.FullAddress{}, false, err } src := []byte(m.Data) var copied int64 for i := 0; i < len(data) && len(src) > 0; i++ { n := copy(data[i], src) copied += int64(n) src = src[n:] } return copied, int64(len(m.Data)), m.Control, false, m.Address, notify, nil } // RecvNotify implements Receiver.RecvNotify. func (q *queueReceiver) RecvNotify() { q.readQueue.WriterQueue.Notify(waiter.WritableEvents) } // CloseNotify implements Receiver.CloseNotify. func (q *queueReceiver) CloseNotify() { q.readQueue.ReaderQueue.Notify(waiter.ReadableEvents) q.readQueue.WriterQueue.Notify(waiter.WritableEvents) } // CloseRecv implements Receiver.CloseRecv. func (q *queueReceiver) CloseRecv() { q.readQueue.Close() } // Readable implements Receiver.Readable. func (q *queueReceiver) Readable() bool { return q.readQueue.IsReadable() } // RecvQueuedSize implements Receiver.RecvQueuedSize. func (q *queueReceiver) RecvQueuedSize() int64 { return q.readQueue.QueuedSize() } // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. func (q *queueReceiver) RecvMaxQueueSize() int64 { return q.readQueue.MaxQueueSize() } // Release implements Receiver.Release. func (q *queueReceiver) Release(ctx context.Context) { q.readQueue.DecRef(ctx) } // streamQueueReceiver implements Receiver for stream sockets. // // +stateify savable type streamQueueReceiver struct { queueReceiver mu sync.Mutex `state:"nosave"` buffer []byte control ControlMessages addr tcpip.FullAddress } func vecCopy(data [][]byte, buf []byte) (int64, [][]byte, []byte) { var copied int64 for len(data) > 0 && len(buf) > 0 { n := copy(data[0], buf) copied += int64(n) buf = buf[n:] data[0] = data[0][n:] if len(data[0]) == 0 { data = data[1:] } } return copied, data, buf } // Readable implements Receiver.Readable. func (q *streamQueueReceiver) Readable() bool { q.mu.Lock() bl := len(q.buffer) r := q.readQueue.IsReadable() q.mu.Unlock() // We're readable if we have data in our buffer or if the queue receiver is // readable. return bl > 0 || r } // RecvQueuedSize implements Receiver.RecvQueuedSize. func (q *streamQueueReceiver) RecvQueuedSize() int64 { q.mu.Lock() bl := len(q.buffer) qs := q.readQueue.QueuedSize() q.mu.Unlock() return int64(bl) + qs } // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. func (q *streamQueueReceiver) RecvMaxQueueSize() int64 { // The RecvMaxQueueSize() is the readQueue's MaxQueueSize() plus the largest // message we can buffer which is also the largest message we can receive. return 2 * q.readQueue.MaxQueueSize() } // Recv implements Receiver.Recv. func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) { q.mu.Lock() defer q.mu.Unlock() var notify bool // If we have no data in the endpoint, we need to get some. if len(q.buffer) == 0 { // Load the next message into a buffer, even if we are peeking. Peeking // won't consume the message, so it will be still available to be read // the next time Recv() is called. 
m, n, err := q.readQueue.Dequeue() if err != nil { return 0, 0, ControlMessages{}, false, tcpip.FullAddress{}, false, err } notify = n q.buffer = []byte(m.Data) q.control = m.Control q.addr = m.Address } var copied int64 if peek { // Don't consume control message if we are peeking. c := q.control.Clone() // Don't consume data since we are peeking. copied, _, _ = vecCopy(data, q.buffer) return copied, copied, c, false, q.addr, notify, nil } // Consume data and control message since we are not peeking. copied, data, q.buffer = vecCopy(data, q.buffer) // Save the original state of q.control. c := q.control // Remove rights from q.control and leave behind just the creds. q.control.Rights = nil if !wantCreds { c.Credentials = nil } var cmTruncated bool if c.Rights != nil && numRights == 0 { c.Rights.Release(ctx) c.Rights = nil cmTruncated = true } haveRights := c.Rights != nil // Keep reading while we have more capacity for data and haven't received // any usable rights. // // Linux never coalesces rights control messages. for !haveRights && len(data) > 0 { // Get a message from the readQueue. m, n, err := q.readQueue.Dequeue() if err != nil { // We already got some data, so ignore this error. This will // manifest as a short read to the user, which is what Linux // does. break } notify = notify || n q.buffer = []byte(m.Data) q.control = m.Control q.addr = m.Address if wantCreds { if (q.control.Credentials == nil) != (c.Credentials == nil) { // One message has credentials, the other does not. break } if q.control.Credentials != nil && c.Credentials != nil && !q.control.Credentials.Equals(c.Credentials) { // Both messages have credentials, but they don't match. break } } if numRights != 0 && c.Rights != nil && q.control.Rights != nil { // Both messages have rights. break } var cpd int64 cpd, data, q.buffer = vecCopy(data, q.buffer) copied += cpd if cpd == 0 { // data was actually full. break } if q.control.Rights != nil { // Consume rights. if numRights == 0 { cmTruncated = true q.control.Rights.Release(ctx) } else { c.Rights = q.control.Rights haveRights = true } q.control.Rights = nil } } return copied, copied, c, cmTruncated, q.addr, notify, nil } // Release implements Receiver.Release. func (q *streamQueueReceiver) Release(ctx context.Context) { q.queueReceiver.Release(ctx) q.control.Release(ctx) } // A ConnectedEndpoint is an Endpoint that can be used to send Messages. type ConnectedEndpoint interface { // Passcred implements Endpoint.Passcred. Passcred() bool // GetLocalAddress implements Endpoint.GetLocalAddress. GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // Send sends a single message. This method does not block. // // notify indicates if SendNotify should be called. // // syserr.ErrWouldBlock can be returned along with a partial write if // the caller should block to send the rest of the data. Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error) // SendNotify notifies the ConnectedEndpoint of a successful Send. This // must not be called while holding any endpoint locks. SendNotify() // CloseSend prevents the sending of additional Messages. // // After CloseSend is called, CloseNotify must also be called. CloseSend() // CloseNotify notifies the ConnectedEndpoint of send being closed. This // must not be called while holding any endpoint locks. CloseNotify() // Writable returns if messages should be attempted to be sent. This // includes when write has been shutdown.
Writable() bool // EventUpdate lets the ConnectedEndpoint know that event registrations // have changed. EventUpdate() // SendQueuedSize returns the total amount of data currently queued for // sending. SendQueuedSize should return -1 if the operation isn't // supported. SendQueuedSize() int64 // SendMaxQueueSize returns maximum value for SendQueuedSize. // SendMaxQueueSize should return -1 if the operation isn't supported. SendMaxQueueSize() int64 // Release releases any resources owned by the ConnectedEndpoint. It should // be called before dropping all references to a ConnectedEndpoint. Release(ctx context.Context) // CloseUnread sets the fact that this end is closed with unread data to // the peer socket. CloseUnread() // SetSendBufferSize is called when the endpoint's send buffer size is // changed. SetSendBufferSize(v int64) (newSz int64) } // +stateify savable type connectedEndpoint struct { // endpoint represents the subset of the Endpoint functionality needed by // the connectedEndpoint. It is implemented by both connectionedEndpoint // and connectionlessEndpoint and allows the use of types which don't // fully implement Endpoint. endpoint interface { // Passcred implements Endpoint.Passcred. Passcred() bool // GetLocalAddress implements Endpoint.GetLocalAddress. GetLocalAddress() (tcpip.FullAddress, tcpip.Error) // Type implements Endpoint.Type. Type() linux.SockType } writeQueue *queue } // Passcred implements ConnectedEndpoint.Passcred. func (e *connectedEndpoint) Passcred() bool { return e.endpoint.Passcred() } // GetLocalAddress implements ConnectedEndpoint.GetLocalAddress. func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { return e.endpoint.GetLocalAddress() } // Send implements ConnectedEndpoint.Send. func (e *connectedEndpoint) Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) { discardEmpty := false truncate := false if e.endpoint.Type() == linux.SOCK_STREAM { // Discard empty stream packets. Since stream sockets don't // preserve message boundaries, sending zero bytes is a no-op. // In Linux, the receiver actually uses a zero-length receive // as an indication that the stream was closed. discardEmpty = true // Since stream sockets don't preserve message boundaries, we // can write only as much of the message as fits in the queue. truncate = true } return e.writeQueue.Enqueue(ctx, data, c, from, discardEmpty, truncate) } // SendNotify implements ConnectedEndpoint.SendNotify. func (e *connectedEndpoint) SendNotify() { e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents) } // CloseNotify implements ConnectedEndpoint.CloseNotify. func (e *connectedEndpoint) CloseNotify() { e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents) e.writeQueue.WriterQueue.Notify(waiter.WritableEvents) } // CloseSend implements ConnectedEndpoint.CloseSend. func (e *connectedEndpoint) CloseSend() { e.writeQueue.Close() } // Writable implements ConnectedEndpoint.Writable. func (e *connectedEndpoint) Writable() bool { return e.writeQueue.IsWritable() } // EventUpdate implements ConnectedEndpoint.EventUpdate. func (*connectedEndpoint) EventUpdate() {} // SendQueuedSize implements ConnectedEndpoint.SendQueuedSize. func (e *connectedEndpoint) SendQueuedSize() int64 { return e.writeQueue.QueuedSize() } // SendMaxQueueSize implements ConnectedEndpoint.SendMaxQueueSize. func (e *connectedEndpoint) SendMaxQueueSize() int64 { return e.writeQueue.MaxQueueSize() } // Release implements ConnectedEndpoint.Release. 
func (e *connectedEndpoint) Release(ctx context.Context) { e.writeQueue.DecRef(ctx) } // CloseUnread implements ConnectedEndpoint.CloseUnread. func (e *connectedEndpoint) CloseUnread() { e.writeQueue.CloseUnread() } // SetSendBufferSize implements ConnectedEndpoint.SetSendBufferSize. // SetSendBufferSize sets the send buffer size for the write queue to the // specified value. func (e *connectedEndpoint) SetSendBufferSize(v int64) (newSz int64) { e.writeQueue.SetMaxQueueSize(v) return v } // baseEndpoint is an embeddable unix endpoint base used in both the connected // and connectionless unix domain socket Endpoint implementations. // // Not to be used on its own. // // +stateify savable type baseEndpoint struct { *waiter.Queue tcpip.DefaultSocketOptionsHandler // Mutex protects the below fields. // // See the lock ordering comment in package kernel/epoll regarding when // this lock can safely be held. sync.Mutex `state:"nosave"` // receiver allows Messages to be received. receiver Receiver // connected allows messages to be sent and state information about the // connected endpoint to be read. connected ConnectedEndpoint // path is not empty if the endpoint has been bound, // or may be used if the endpoint is connected. path string // ops is used to get socket level options. ops tcpip.SocketOptions } // EventRegister implements waiter.Waitable.EventRegister. func (e *baseEndpoint) EventRegister(we *waiter.Entry, mask waiter.EventMask) { e.Queue.EventRegister(we, mask) e.Lock() c := e.connected e.Unlock() if c != nil { c.EventUpdate() } } // EventUnregister implements waiter.Waitable.EventUnregister. func (e *baseEndpoint) EventUnregister(we *waiter.Entry) { e.Queue.EventUnregister(we) e.Lock() c := e.connected e.Unlock() if c != nil { c.EventUpdate() } } // Passcred implements Credentialer.Passcred. func (e *baseEndpoint) Passcred() bool { return e.SocketOptions().GetPassCred() } // ConnectedPasscred implements Credentialer.ConnectedPasscred. func (e *baseEndpoint) ConnectedPasscred() bool { e.Lock() defer e.Unlock() return e.connected != nil && e.connected.Passcred() } // Connected implements ConnectingEndpoint.Connected. func (e *baseEndpoint) Connected() bool { return e.receiver != nil && e.connected != nil } // RecvMsg reads data and a control message from the endpoint. func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *tcpip.FullAddress) (int64, int64, ControlMessages, bool, *syserr.Error) { e.Lock() receiver := e.receiver if receiver == nil { e.Unlock() return 0, 0, ControlMessages{}, false, syserr.ErrNotConnected } recvLen, msgLen, cms, cmt, a, notify, err := receiver.Recv(ctx, data, creds, numRights, peek) e.Unlock() if err != nil { return 0, 0, ControlMessages{}, false, err } if notify { receiver.RecvNotify() } if addr != nil { *addr = a } return recvLen, msgLen, cms, cmt, nil } // SendMsg writes data and a control message to the endpoint's peer. // This method does not block if the data cannot be written. func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, *syserr.Error) { e.Lock() if !e.Connected() { e.Unlock() return 0, syserr.ErrNotConnected } if to != nil { e.Unlock() return 0, syserr.ErrAlreadyConnected } connected := e.connected n, notify, err := connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) e.Unlock() if notify { connected.SendNotify() } return n, err } // SetSockOpt sets a socket option. 
func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { return nil } func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { log.Warningf("Unsupported socket option: %d", opt) return nil } func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.ReceiveQueueSizeOption: v := 0 e.Lock() if !e.Connected() { e.Unlock() return -1, &tcpip.ErrNotConnected{} } v = int(e.receiver.RecvQueuedSize()) e.Unlock() if v < 0 { return -1, &tcpip.ErrQueueSizeNotSupported{} } return v, nil case tcpip.SendQueueSizeOption: e.Lock() if !e.Connected() { e.Unlock() return -1, &tcpip.ErrNotConnected{} } v := e.connected.SendQueuedSize() e.Unlock() if v < 0 { return -1, &tcpip.ErrQueueSizeNotSupported{} } return int(v), nil default: log.Warningf("Unsupported socket option: %d", opt) return -1, &tcpip.ErrUnknownProtocolOption{} } } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { log.Warningf("Unsupported socket option: %T", opt) return &tcpip.ErrUnknownProtocolOption{} } // LastError implements Endpoint.LastError. func (*baseEndpoint) LastError() tcpip.Error { return nil } // SocketOptions implements Endpoint.SocketOptions. func (e *baseEndpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } // Shutdown closes the read and/or write end of the endpoint connection to its // peer. func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error { e.Lock() if !e.Connected() { e.Unlock() return syserr.ErrNotConnected } var ( r = e.receiver c = e.connected shutdownRead = flags&tcpip.ShutdownRead != 0 shutdownWrite = flags&tcpip.ShutdownWrite != 0 ) if shutdownRead { r.CloseRecv() } if shutdownWrite { c.CloseSend() } e.Unlock() // Don't hold e.Mutex while calling CloseNotify. if shutdownRead { r.CloseNotify() } if shutdownWrite { c.CloseNotify() } return nil } // GetLocalAddress returns the bound path. func (e *baseEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { e.Lock() defer e.Unlock() return tcpip.FullAddress{Addr: tcpip.Address(e.path)}, nil } // GetRemoteAddress returns the local address of the connected endpoint (if // available). func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { e.Lock() c := e.connected e.Unlock() if c != nil { return c.GetLocalAddress() } return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} } // Release implements BoundEndpoint.Release. func (*baseEndpoint) Release(context.Context) { // Binding a baseEndpoint doesn't take a reference. } // stackHandler is just a stub implementation of tcpip.StackHandler to provide // when initializing socketoptions. type stackHandler struct { } // Option implements tcpip.StackHandler. func (h *stackHandler) Option(option interface{}) tcpip.Error { panic("unimplemented") } // TransportProtocolOption implements tcpip.StackHandler. func (h *stackHandler) TransportProtocolOption(proto tcpip.TransportProtocolNumber, option tcpip.GettableTransportProtocolOption) tcpip.Error { panic("unimplemented") } // getSendBufferLimits implements tcpip.GetSendBufferLimits. // // AF_UNIX sockets buffer sizes are not tied to the networking stack/namespace // in linux but are bound by net.core.(wmem|rmem)_(max|default). // // In gVisor net.core sysctls today are not exposed or if exposed are currently // tied to the networking stack in use. This makes it complicated for AF_UNIX // when we are in a new namespace w/ no networking stack. 
As a result, for now we // define the default/max values here in the unix socket implementation itself. func getSendBufferLimits(tcpip.StackHandler) tcpip.SendBufferSizeOption { return tcpip.SendBufferSizeOption{ Min: minimumBufferSize, Default: defaultBufferSize, Max: maxBufferSize, } } // getReceiveBufferLimits implements tcpip.GetReceiveBufferLimits. // // We define min, max and default values for the unix socket implementation. // Unix sockets do not use a receive buffer. func getReceiveBufferLimits(tcpip.StackHandler) tcpip.ReceiveBufferSizeOption { return tcpip.ReceiveBufferSizeOption{ Min: minimumBufferSize, Default: defaultBufferSize, Max: maxBufferSize, } }
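// Illustrative sketch, not part of the upstream file: the limit getters above
// are meant to be plumbed into tcpip.SocketOptions during endpoint
// initialization so that SO_SNDBUF/SO_RCVBUF values are clamped to these
// unix-specific bounds. A minimal sanity check of the invariant they are
// expected to satisfy:
func exampleBufferLimits() {
	var h stackHandler
	snd := getSendBufferLimits(&h)
	rcv := getReceiveBufferLimits(&h)
	// Min <= Default <= Max must hold for clamping to behave sensibly.
	if snd.Min > snd.Default || snd.Default > snd.Max {
		panic("unix send buffer limits out of order")
	}
	if rcv.Min > rcv.Default || rcv.Default > rcv.Max {
		panic("unix receive buffer limits out of order")
	}
}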
// Copyright 2021 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "bytes" "fmt" "sort" "sync/atomic" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // InvalidCgroupHierarchyID indicates an uninitialized hierarchy ID. const InvalidCgroupHierarchyID uint32 = 0 // CgroupControllerType is the name of a cgroup controller. type CgroupControllerType string // CgroupController is the common interface to cgroup controllers available to // the entire sentry. The controllers themselves are defined by cgroupfs. // // Callers of this interface are often unable to access the synchronization // needed to ensure returned values remain valid. Some of the values returned // from this interface are thus snapshots in time, and may become stale. This // is ok for many callers like procfs. type CgroupController interface { // Type returns the type of this cgroup controller (e.g. "memory", "cpu"). // Returned value is valid for the lifetime of the controller. Type() CgroupControllerType // HierarchyID returns the ID of the hierarchy this cgroup controller is // attached to. Returned value is valid for the lifetime of the controller. HierarchyID() uint32 // RootCgroup returns the root cgroup for this controller. Returned value is // valid for the lifetime of the controller. RootCgroup() Cgroup // NumCgroups returns the number of cgroups managed by this controller. // Returned value is a snapshot in time. NumCgroups() uint64 // Enabled returns whether this controller is enabled. Returned value is a // snapshot in time. Enabled() bool } // Cgroup represents a named pointer to a cgroup in cgroupfs. When a task enters // a cgroup, it holds a reference on the underlying dentry pointing to the // cgroup.
// // +stateify savable type Cgroup struct { *kernfs.Dentry CgroupImpl } func (c *Cgroup) decRef() { c.Dentry.DecRef(context.Background()) } // Path returns the absolute path of c, relative to its hierarchy root. func (c *Cgroup) Path() string { return c.FSLocalPath() } // HierarchyID returns the id of the hierarchy that contains this cgroup. func (c *Cgroup) HierarchyID() uint32 { // Note: a cgroup is guaranteed to have at least one controller. return c.Controllers()[0].HierarchyID() } // CgroupImpl is the common interface to cgroups. type CgroupImpl interface { Controllers() []CgroupController Enter(t *Task) Leave(t *Task) } // hierarchy represents a cgroupfs filesystem instance, with a unique set of // controllers attached to it. Multiple cgroupfs mounts may reference the same // hierarchy. // // +stateify savable type hierarchy struct { id uint32 // These are a subset of the controllers in CgroupRegistry.controllers, // grouped here by hierarchy for convenient lookup. controllers map[CgroupControllerType]CgroupController // fs is not owned by hierarchy. The FS is responsible for unregistering the // hierarchy on destruction, which removes this association. fs *vfs.Filesystem } func (h *hierarchy) match(ctypes []CgroupControllerType) bool { if len(ctypes) != len(h.controllers) { return false } for _, ty := range ctypes { if _, ok := h.controllers[ty]; !ok { return false } } return true } // cgroupFS is the public interface to cgroupfs. This lets the kernel package // refer to cgroupfs.filesystem methods without directly depending on the // cgroupfs package, which would lead to a circular dependency. type cgroupFS interface { // VFSFilesystem returns the vfs.Filesystem for the cgroupfs. VFSFilesystem() *vfs.Filesystem // InitializeHierarchyID sets the hierarchy ID for this filesystem during // filesystem creation. May only be called before the filesystem is visible // to the vfs layer. InitializeHierarchyID(hid uint32) } // CgroupRegistry tracks the active set of cgroup controllers on the system. // // +stateify savable type CgroupRegistry struct { // lastHierarchyID is the id of the last allocated cgroup hierarchy. Valid // ids are from 1 to math.MaxUint32. Must be accessed through atomic ops. lastHierarchyID uint32 mu sync.Mutex `state:"nosave"` // controllers is the set of currently known cgroup controllers on the // system. Protected by mu. // // +checklocks:mu controllers map[CgroupControllerType]CgroupController // hierarchies is the active set of cgroup hierarchies. Protected by mu. // // +checklocks:mu hierarchies map[uint32]hierarchy } func newCgroupRegistry() *CgroupRegistry { return &CgroupRegistry{ controllers: make(map[CgroupControllerType]CgroupController), hierarchies: make(map[uint32]hierarchy), } } // nextHierarchyID returns a newly allocated, unique hierarchy ID. func (r *CgroupRegistry) nextHierarchyID() (uint32, error) { if hid := atomic.AddUint32(&r.lastHierarchyID, 1); hid != 0 { return hid, nil } return InvalidCgroupHierarchyID, fmt.Errorf("cgroup hierarchy ID overflow") } // FindHierarchy returns a cgroup filesystem containing exactly the set of // controllers named in ctypes. If no such FS is found, FindHierarchy returns // nil. FindHierarchy takes a reference on the returned FS, which is transferred // to the caller. func (r *CgroupRegistry) FindHierarchy(ctypes []CgroupControllerType) *vfs.Filesystem { r.mu.Lock() defer r.mu.Unlock() for _, h := range r.hierarchies { if h.match(ctypes) { if !h.fs.TryIncRef() { // Racing with filesystem destruction, namely h.fs.Release.
// Since we hold r.mu, we know the hierarchy hasn't been // unregistered yet, but its associated filesystem is tearing // down. // // If we simply indicate the hierarchy wasn't found without // cleaning up the registry, the caller can race with the // unregister and find itself temporarily unable to create a new // hierarchy with a subset of the relevant controllers. // // To keep the result of FindHierarchy consistent with the // uniqueness of controllers enforced by Register, drop the // dying hierarchy now. The eventual unregister by the FS // teardown will become a no-op. return nil } return h.fs } } return nil } // Register registers the provided set of controllers with the registry as a new // hierarchy. If any controller is already registered, the function returns an // error without modifying the registry. Register sets the hierarchy ID for the // filesystem on success. func (r *CgroupRegistry) Register(cs []CgroupController, fs cgroupFS) error { r.mu.Lock() defer r.mu.Unlock() if len(cs) == 0 { return fmt.Errorf("can't register hierarchy with no controllers") } for _, c := range cs { if _, ok := r.controllers[c.Type()]; ok { return fmt.Errorf("controllers may only be mounted on a single hierarchy") } } hid, err := r.nextHierarchyID() if err != nil { return err } // Must not fail below here, once we publish the hierarchy ID. fs.InitializeHierarchyID(hid) h := hierarchy{ id: hid, controllers: make(map[CgroupControllerType]CgroupController), fs: fs.VFSFilesystem(), } for _, c := range cs { n := c.Type() r.controllers[n] = c h.controllers[n] = c } r.hierarchies[hid] = h return nil } // Unregister removes a previously registered hierarchy from the registry. If no // such hierarchy is registered, Unregister is a no-op. func (r *CgroupRegistry) Unregister(hid uint32) { r.mu.Lock() r.unregisterLocked(hid) r.mu.Unlock() } // Precondition: Caller must hold r.mu. // +checklocks:r.mu func (r *CgroupRegistry) unregisterLocked(hid uint32) { if h, ok := r.hierarchies[hid]; ok { for name := range h.controllers { delete(r.controllers, name) } delete(r.hierarchies, hid) } } // computeInitialGroups takes a reference on each of the returned cgroups. The // caller takes ownership of this returned reference. func (r *CgroupRegistry) computeInitialGroups(inherit map[Cgroup]struct{}) map[Cgroup]struct{} { r.mu.Lock() defer r.mu.Unlock() ctlSet := make(map[CgroupControllerType]CgroupController) cgset := make(map[Cgroup]struct{}) // Remember controllers from the inherited cgroups set... for cg := range inherit { cg.IncRef() // Ref transferred to caller. for _, ctl := range cg.Controllers() { ctlSet[ctl.Type()] = ctl cgset[cg] = struct{}{} } } // ... and add the root cgroups of all the missing controllers. for name, ctl := range r.controllers { if _, ok := ctlSet[name]; !ok { cg := ctl.RootCgroup() // Multiple controllers may share the same hierarchy, so may have // the same root cgroup. Grab a single ref per hierarchy root. if _, ok := cgset[cg]; ok { continue } cg.IncRef() // Ref transferred to caller. cgset[cg] = struct{}{} } } return cgset } // GenerateProcCgroups writes the contents of /proc/cgroups to buf.
func (r *CgroupRegistry) GenerateProcCgroups(buf *bytes.Buffer) { r.mu.Lock() entries := make([]string, 0, len(r.controllers)) for _, c := range r.controllers { en := 0 if c.Enabled() { en = 1 } entries = append(entries, fmt.Sprintf("%s\t%d\t%d\t%d\n", c.Type(), c.HierarchyID(), c.NumCgroups(), en)) } r.mu.Unlock() sort.Strings(entries) fmt.Fprint(buf, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n") for _, e := range entries { fmt.Fprint(buf, e) } }
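// Illustrative sketch, not part of the upstream file: rendering the registry
// as /proc/cgroups. With, say, a "cpu" controller enabled on hierarchy 1
// managing one cgroup, the buffer would resemble:
//
//	#subsys_name	hierarchy	num_cgroups	enabled
//	cpu	1	1	1
func exampleProcCgroups(k *Kernel) string {
	var buf bytes.Buffer
	k.CgroupRegistry().GenerateProcCgroups(&buf)
	return buf.String()
}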
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package syserr import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux/errno" "gvisor.dev/gvisor/pkg/tcpip" ) // LINT.IfChange // Mapping for tcpip.Error types. var ( ErrUnknownProtocol = New((&tcpip.ErrUnknownProtocol{}).String(), errno.EINVAL) ErrUnknownNICID = New((&tcpip.ErrUnknownNICID{}).String(), errno.ENODEV) ErrUnknownDevice = New((&tcpip.ErrUnknownDevice{}).String(), errno.ENODEV) ErrUnknownProtocolOption = New((&tcpip.ErrUnknownProtocolOption{}).String(), errno.ENOPROTOOPT) ErrDuplicateNICID = New((&tcpip.ErrDuplicateNICID{}).String(), errno.EEXIST) ErrDuplicateAddress = New((&tcpip.ErrDuplicateAddress{}).String(), errno.EEXIST) ErrAlreadyBound = New((&tcpip.ErrAlreadyBound{}).String(), errno.EINVAL) ErrInvalidEndpointState = New((&tcpip.ErrInvalidEndpointState{}).String(), errno.EINVAL) ErrAlreadyConnecting = New((&tcpip.ErrAlreadyConnecting{}).String(), errno.EALREADY) ErrNoPortAvailable = New((&tcpip.ErrNoPortAvailable{}).String(), errno.EAGAIN) ErrPortInUse = New((&tcpip.ErrPortInUse{}).String(), errno.EADDRINUSE) ErrBadLocalAddress = New((&tcpip.ErrBadLocalAddress{}).String(), errno.EADDRNOTAVAIL) ErrClosedForSend = New((&tcpip.ErrClosedForSend{}).String(), errno.EPIPE) ErrClosedForReceive = New((&tcpip.ErrClosedForReceive{}).String(), errno.NOERRNO) ErrTimeout = New((&tcpip.ErrTimeout{}).String(), errno.ETIMEDOUT) ErrAborted = New((&tcpip.ErrAborted{}).String(), errno.EPIPE) ErrConnectStarted = New((&tcpip.ErrConnectStarted{}).String(), errno.EINPROGRESS) ErrDestinationRequired = New((&tcpip.ErrDestinationRequired{}).String(), errno.EDESTADDRREQ) ErrNotSupported = New((&tcpip.ErrNotSupported{}).String(), errno.EOPNOTSUPP) ErrQueueSizeNotSupported = New((&tcpip.ErrQueueSizeNotSupported{}).String(), errno.ENOTTY) ErrNoSuchFile = New((&tcpip.ErrNoSuchFile{}).String(), errno.ENOENT) ErrInvalidOptionValue = New((&tcpip.ErrInvalidOptionValue{}).String(), errno.EINVAL) ErrBroadcastDisabled = New((&tcpip.ErrBroadcastDisabled{}).String(), errno.EACCES) ErrNotPermittedNet = New((&tcpip.ErrNotPermitted{}).String(), errno.EPERM) ErrBadBuffer = New((&tcpip.ErrBadBuffer{}).String(), errno.EFAULT) ErrMalformedHeader = New((&tcpip.ErrMalformedHeader{}).String(), errno.EINVAL) ErrInvalidPortRange = New((&tcpip.ErrInvalidPortRange{}).String(), errno.EINVAL) ) // TranslateNetstackError converts an error from the tcpip package to a sentry // internal error.
func TranslateNetstackError(err tcpip.Error) *Error { switch err.(type) { case nil: return nil case *tcpip.ErrUnknownProtocol: return ErrUnknownProtocol case *tcpip.ErrUnknownNICID: return ErrUnknownNICID case *tcpip.ErrUnknownDevice: return ErrUnknownDevice case *tcpip.ErrUnknownProtocolOption: return ErrUnknownProtocolOption case *tcpip.ErrDuplicateNICID: return ErrDuplicateNICID case *tcpip.ErrDuplicateAddress: return ErrDuplicateAddress case *tcpip.ErrNoRoute: return ErrNoRoute case *tcpip.ErrAlreadyBound: return ErrAlreadyBound case *tcpip.ErrInvalidEndpointState: return ErrInvalidEndpointState case *tcpip.ErrAlreadyConnecting: return ErrAlreadyConnecting case *tcpip.ErrAlreadyConnected: return ErrAlreadyConnected case *tcpip.ErrNoPortAvailable: return ErrNoPortAvailable case *tcpip.ErrPortInUse: return ErrPortInUse case *tcpip.ErrBadLocalAddress: return ErrBadLocalAddress case *tcpip.ErrClosedForSend: return ErrClosedForSend case *tcpip.ErrClosedForReceive: return ErrClosedForReceive case *tcpip.ErrWouldBlock: return ErrWouldBlock case *tcpip.ErrConnectionRefused: return ErrConnectionRefused case *tcpip.ErrTimeout: return ErrTimeout case *tcpip.ErrAborted: return ErrAborted case *tcpip.ErrConnectStarted: return ErrConnectStarted case *tcpip.ErrDestinationRequired: return ErrDestinationRequired case *tcpip.ErrNotSupported: return ErrNotSupported case *tcpip.ErrQueueSizeNotSupported: return ErrQueueSizeNotSupported case *tcpip.ErrNotConnected: return ErrNotConnected case *tcpip.ErrConnectionReset: return ErrConnectionReset case *tcpip.ErrConnectionAborted: return ErrConnectionAborted case *tcpip.ErrNoSuchFile: return ErrNoSuchFile case *tcpip.ErrInvalidOptionValue: return ErrInvalidOptionValue case *tcpip.ErrBadAddress: return ErrBadAddress case *tcpip.ErrNetworkUnreachable: return ErrNetworkUnreachable case *tcpip.ErrMessageTooLong: return ErrMessageTooLong case *tcpip.ErrNoBufferSpace: return ErrNoBufferSpace case *tcpip.ErrBroadcastDisabled: return ErrBroadcastDisabled case *tcpip.ErrNotPermitted: return ErrNotPermittedNet case *tcpip.ErrAddressFamilyNotSupported: return ErrAddressFamilyNotSupported case *tcpip.ErrBadBuffer: return ErrBadBuffer case *tcpip.ErrMalformedHeader: return ErrMalformedHeader case *tcpip.ErrInvalidPortRange: return ErrInvalidPortRange default: panic(fmt.Sprintf("unknown error %T", err)) } } // LINT.ThenChange(../tcpip/errors.go)
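// Illustrative sketch, not part of the upstream file: a typical call site.
// Netstack operations report tcpip.Error values; socket code translates them
// into sentry errors at the syscall boundary. ep and addr are placeholders.
func exampleTranslate(ep tcpip.Endpoint, addr tcpip.FullAddress) *Error {
	if err := ep.Connect(addr); err != nil {
		// E.g. *tcpip.ErrConnectStarted maps to ErrConnectStarted
		// (EINPROGRESS) via the table above.
		return TranslateNetstackError(err)
	}
	return nil
}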
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "strings" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/hostarch" ) // GenericParseMountOptions parses a comma-separated list of options of the // form "key" or "key=value", where neither key nor value contain commas, and // returns it as a map. If str contains duplicate keys, then the last value // wins. For example: // // str = "key0=value0,key1,key2=value2,key0=value3" -> map{'key0':'value3','key1':'','key2':'value2'} // // GenericParseMountOptions is not appropriate if values may contain commas, // e.g. in the case of the mpol mount option for tmpfs(5). func GenericParseMountOptions(str string) map[string]string { m := make(map[string]string) for _, opt := range strings.Split(str, ",") { if len(opt) > 0 { res := strings.SplitN(opt, "=", 2) if len(res) == 2 { m[res[0]] = res[1] } else { m[opt] = "" } } } return m } // GenericStatFS returns a statfs struct filled with the common fields for a // general filesystem. This is analogous to Linux's fs/libfs.c:simple_statfs(). func GenericStatFS(fsMagic uint64) linux.Statfs { return linux.Statfs{ Type: fsMagic, BlockSize: hostarch.PageSize, NameLength: linux.NAME_MAX, } }
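// Illustrative sketch, not part of the upstream file: the duplicate-key and
// value-less-key semantics of GenericParseMountOptions, restating the doc
// comment's example as code.
func exampleParseMountOptions() map[string]string {
	m := GenericParseMountOptions("key0=value0,key1,key2=value2,key0=value3")
	// m is map[key0:value3 key1: key2:value2]: the last "key0" wins, and
	// the bare "key1" maps to the empty string.
	return m
}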
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package log implements a library for logging.
//
// This is separate from the standard logging package because logging may be a
// high-impact activity, and therefore we wanted to provide as much flexibility
// as possible in the underlying implementation.
//
// Note that logging should still be considered high-impact, and should not be
// done in the hot path. If necessary, logging statements should be protected
// with guards regarding the logging level. For example,
//
//	if log.IsLogging(log.Debug) {
//		log.Debugf(...)
//	}
//
// This is because the log.Debugf(...) statement alone will generate a
// significant amount of garbage and churn in many cases, even if no log
// message is ultimately emitted.
package log

import (
	"fmt"
	"io"
	stdlog "log"
	"os"
	"runtime"
	"sync/atomic"
	"time"

	"gvisor.dev/gvisor/pkg/linewriter"
	"gvisor.dev/gvisor/pkg/sync"
)

// Level is the log level.
type Level uint32

// The following levels are fixed, and can never be changed. Since some control
// RPCs allow for changing the level as an integer, it is only possible to add
// additional levels, and the existing ones cannot be removed.
const (
	// Warning indicates that output should always be emitted.
	Warning Level = iota

	// Info indicates that output should normally be emitted.
	Info

	// Debug indicates that output should not normally be emitted.
	Debug
)

func (l Level) String() string {
	switch l {
	case Warning:
		return "Warning"
	case Info:
		return "Info"
	case Debug:
		return "Debug"
	default:
		return fmt.Sprintf("Invalid level: %d", l)
	}
}

// Emitter is the final destination for logs.
type Emitter interface {
	// Emit emits the given log statement. This allows for control over the
	// timestamp used for logging.
	Emit(depth int, level Level, timestamp time.Time, format string, v ...interface{})
}

// Writer writes the output to the given writer.
type Writer struct {
	// Next is where output is written.
	Next io.Writer

	// mu protects fields below.
	mu sync.Mutex

	// errors counts failures to write log messages so they can be reported
	// when the writer starts to work again. Needs to be accessed using
	// atomics to make the race detector happy because it's read outside the
	// mutex.
	errors int32
}

// Write writes out the given bytes, handling non-blocking sockets.
func (l *Writer) Write(data []byte) (int, error) {
	n := 0

	for n < len(data) {
		w, err := l.Next.Write(data[n:])
		n += w

		// Is it a non-blocking socket?
		if pathErr, ok := err.(*os.PathError); ok && pathErr.Timeout() {
			runtime.Gosched()
			continue
		}

		// Some other error?
		if err != nil {
			l.mu.Lock()
			atomic.AddInt32(&l.errors, 1)
			l.mu.Unlock()
			return n, err
		}
	}

	// Do we need to end with a '\n'?
	if len(data) == 0 || data[len(data)-1] != '\n' {
		l.Write([]byte{'\n'})
	}

	// Dirty read in case there were errors (rare).
	if atomic.LoadInt32(&l.errors) > 0 {
		l.mu.Lock()
		defer l.mu.Unlock()

		// Recheck condition under lock.
		if e := atomic.LoadInt32(&l.errors); e > 0 {
			msg := fmt.Sprintf("\n*** Dropped %d log messages ***\n", e)
			if _, err := l.Next.Write([]byte(msg)); err == nil {
				atomic.StoreInt32(&l.errors, 0)
			}
		}
	}

	return n, nil
}

// Emit emits the message.
func (l *Writer) Emit(_ int, _ Level, _ time.Time, format string, args ...interface{}) {
	fmt.Fprintf(l, format, args...)
}

// MultiEmitter is an emitter that emits to multiple Emitters.
type MultiEmitter []Emitter

// Emit emits to all emitters.
func (m *MultiEmitter) Emit(depth int, level Level, timestamp time.Time, format string, v ...interface{}) {
	for _, e := range *m {
		e.Emit(1+depth, level, timestamp, format, v...)
	}
}

// TestLogger is implemented by testing.T and testing.B.
type TestLogger interface {
	Logf(format string, v ...interface{})
}

// TestEmitter may be used for wrapping tests.
type TestEmitter struct {
	TestLogger
}

// Emit emits to the TestLogger.
func (t *TestEmitter) Emit(_ int, level Level, timestamp time.Time, format string, v ...interface{}) {
	t.Logf(format, v...)
}

// Logger is a high-level logging interface. It is, in fact, not used within
// the log package. Rather, it is provided for others to implement contextual
// loggers that may append additional information to log statements.
// BasicLogger satisfies this interface, and may be passed around as a Logger.
type Logger interface {
	// Debugf logs a debug statement.
	Debugf(format string, v ...interface{})

	// Infof logs at an info level.
	Infof(format string, v ...interface{})

	// Warningf logs at a warning level.
	Warningf(format string, v ...interface{})

	// IsLogging returns true iff this level is being logged. This may be
	// used to short-circuit expensive operations for debugging calls.
	IsLogging(level Level) bool
}

// BasicLogger is the default implementation of Logger.
type BasicLogger struct {
	Level
	Emitter
}

// Debugf implements logger.Debugf.
func (l *BasicLogger) Debugf(format string, v ...interface{}) {
	l.DebugfAtDepth(1, format, v...)
}

// Infof implements logger.Infof.
func (l *BasicLogger) Infof(format string, v ...interface{}) {
	l.InfofAtDepth(1, format, v...)
}

// Warningf implements logger.Warningf.
func (l *BasicLogger) Warningf(format string, v ...interface{}) {
	l.WarningfAtDepth(1, format, v...)
}

// DebugfAtDepth logs at a specific depth.
func (l *BasicLogger) DebugfAtDepth(depth int, format string, v ...interface{}) {
	if l.IsLogging(Debug) {
		l.Emit(1+depth, Debug, time.Now(), format, v...)
	}
}

// InfofAtDepth logs at a specific depth.
func (l *BasicLogger) InfofAtDepth(depth int, format string, v ...interface{}) {
	if l.IsLogging(Info) {
		l.Emit(1+depth, Info, time.Now(), format, v...)
	}
}

// WarningfAtDepth logs at a specific depth.
func (l *BasicLogger) WarningfAtDepth(depth int, format string, v ...interface{}) {
	if l.IsLogging(Warning) {
		l.Emit(1+depth, Warning, time.Now(), format, v...)
	}
}

// IsLogging implements logger.IsLogging.
func (l *BasicLogger) IsLogging(level Level) bool {
	return atomic.LoadUint32((*uint32)(&l.Level)) >= uint32(level)
}

// SetLevel sets the logging level.
func (l *BasicLogger) SetLevel(level Level) {
	atomic.StoreUint32((*uint32)(&l.Level), uint32(level))
}

// logMu protects Log below. We use atomic operations to read the value, but
// updates require logMu to ensure consistency.
var logMu sync.Mutex

// log is the default logger.
var log atomic.Value

// Log retrieves the global logger.
func Log() *BasicLogger {
	return log.Load().(*BasicLogger)
}

// SetTarget sets the log target.
//
// This is not thread safe and shouldn't be called concurrently with any
// logging calls.
func SetTarget(target Emitter) {
	logMu.Lock()
	defer logMu.Unlock()
	oldLog := Log()
	log.Store(&BasicLogger{Level: oldLog.Level, Emitter: target})
}

// SetLevel sets the log level.
func SetLevel(newLevel Level) {
	Log().SetLevel(newLevel)
}

// Debugf logs to the global logger.
func Debugf(format string, v ...interface{}) {
	Log().DebugfAtDepth(1, format, v...)
}

// Infof logs to the global logger.
func Infof(format string, v ...interface{}) {
	Log().InfofAtDepth(1, format, v...)
}

// Warningf logs to the global logger.
func Warningf(format string, v ...interface{}) {
	Log().WarningfAtDepth(1, format, v...)
}

// DebugfAtDepth logs to the global logger.
func DebugfAtDepth(depth int, format string, v ...interface{}) {
	Log().DebugfAtDepth(1+depth, format, v...)
}

// InfofAtDepth logs to the global logger.
func InfofAtDepth(depth int, format string, v ...interface{}) {
	Log().InfofAtDepth(1+depth, format, v...)
}

// WarningfAtDepth logs to the global logger.
func WarningfAtDepth(depth int, format string, v ...interface{}) {
	Log().WarningfAtDepth(1+depth, format, v...)
}

// defaultStackSize is the default buffer size to allocate for stack traces.
const defaultStackSize = 1 << 16 // 64KB

// maxStackSize is the maximum buffer size to allocate for stack traces.
const maxStackSize = 1 << 26 // 64MB

// Stacks returns goroutine stacks, like panic.
func Stacks(all bool) []byte {
	var trace []byte
	for s := defaultStackSize; s <= maxStackSize; s *= 4 {
		trace = make([]byte, s)
		nbytes := runtime.Stack(trace, all)
		if nbytes == s {
			continue
		}
		return trace[:nbytes]
	}
	trace = append(trace, []byte("\n\n...<too large, truncated>")...)
	return trace
}

// Traceback logs the given message and dumps a stacktrace of the current
// goroutine.
//
// This will print a traceback, tb, as Warningf(format+":\n%s", v..., tb).
func Traceback(format string, v ...interface{}) {
	v = append(v, Stacks(false))
	Warningf(format+":\n%s", v...)
}

// TracebackAll logs the given message and dumps a stacktrace of all
// goroutines.
//
// This will print a traceback, tb, as Warningf(format+":\n%s", v..., tb).
func TracebackAll(format string, v ...interface{}) {
	v = append(v, Stacks(true))
	Warningf(format+":\n%s", v...)
}

// IsLogging returns whether the global logger is logging.
func IsLogging(level Level) bool {
	return Log().IsLogging(level)
}

// CopyStandardLogTo redirects the stdlib log package global output to the
// global logger for the specified level.
func CopyStandardLogTo(l Level) error {
	var f func(string, ...interface{})

	switch l {
	case Debug:
		f = Debugf
	case Info:
		f = Infof
	case Warning:
		f = Warningf
	default:
		return fmt.Errorf("unknown log level %v", l)
	}

	stdlog.SetOutput(linewriter.NewWriter(func(p []byte) {
		// We must not retain p, but log formatting is not required to
		// be synchronous (though the in-package implementations are),
		// so we must make a copy.
		b := make([]byte, len(p))
		copy(b, p)
		f("%s", b)
	}))

	return nil
}

func init() {
	// Store the initial value for the log.
	log.Store(&BasicLogger{Level: Info, Emitter: GoogleEmitter{&Writer{Next: os.Stderr}}})
}
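// Example (editor's sketch): fanning the global log out to both stderr and a
// test log using MultiEmitter. The *testing.T value t is a hypothetical
// stand-in supplied by the caller.
//
//	w := &log.Writer{Next: os.Stderr}
//	log.SetTarget(&log.MultiEmitter{w, &log.TestEmitter{TestLogger: t}})
//	log.SetLevel(log.Debug)
//	log.Debugf("emitted to stderr and to t.Logf")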
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

// MakeDeviceID encodes a major and minor device number into a single device ID.
//
// Format (see linux/kdev_t.h:new_encode_dev):
//
// Bits 7:0   - minor bits 7:0
// Bits 19:8  - major bits 11:0
// Bits 31:20 - minor bits 19:8
func MakeDeviceID(major uint16, minor uint32) uint32 {
	return (minor & 0xff) | ((uint32(major) & 0xfff) << 8) | ((minor >> 8) << 20)
}

// DecodeDeviceID decodes a device ID into major and minor device numbers.
func DecodeDeviceID(rdev uint32) (uint16, uint32) {
	major := uint16((rdev >> 8) & 0xfff)
	minor := (rdev & 0xff) | ((rdev >> 20) << 8)
	return major, minor
}

// Character device IDs.
//
// See Documentation/devices.txt and uapi/linux/major.h.
const (
	// UNNAMED_MAJOR is the major device number for "unnamed" devices, whose
	// minor numbers are dynamically allocated by the kernel.
	UNNAMED_MAJOR = 0

	// MEM_MAJOR is the major device number for "memory" character devices.
	MEM_MAJOR = 1

	// TTYAUX_MAJOR is the major device number for alternate TTY devices.
	TTYAUX_MAJOR = 5

	// MISC_MAJOR is the major device number for non-serial mice, misc feature
	// devices.
	MISC_MAJOR = 10

	// UNIX98_PTY_MASTER_MAJOR is the initial major device number for
	// Unix98 PTY masters.
	UNIX98_PTY_MASTER_MAJOR = 128

	// UNIX98_PTY_REPLICA_MAJOR is the initial major device number for
	// Unix98 PTY replicas.
	UNIX98_PTY_REPLICA_MAJOR = 136
)

// Minor device numbers for TTYAUX_MAJOR.
const (
	// PTMX_MINOR is the minor device number for /dev/ptmx.
	PTMX_MINOR = 2
)
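// Example (editor's sketch): round-tripping /dev/ptmx (major 5, minor 2)
// through the encode/decode helpers above.
//
//	id := MakeDeviceID(TTYAUX_MAJOR, PTMX_MINOR)
//	// id == 0x502: minor bits 7:0 hold 2, major bits 19:8 hold 5.
//	major, minor := DecodeDeviceID(id)
//	// major == 5, minor == 2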
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package proc

import (
	"bytes"
	"fmt"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/usermem"
)

func (fs *filesystem) newYAMAPtraceScopeFile(ctx context.Context, k *kernel.Kernel, creds *auth.Credentials) kernfs.Inode {
	s := &yamaPtraceScope{level: &k.YAMAPtraceScope}
	s.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), s, 0644)
	return s
}

// yamaPtraceScope implements vfs.WritableDynamicBytesSource for
// /proc/sys/kernel/yama/ptrace_scope.
//
// +stateify savable
type yamaPtraceScope struct {
	kernfs.DynamicBytesFile

	// level is the ptrace_scope level.
	level *int32
}

// Generate implements vfs.DynamicBytesSource.Generate.
func (s *yamaPtraceScope) Generate(ctx context.Context, buf *bytes.Buffer) error {
	_, err := fmt.Fprintf(buf, "%d\n", atomic.LoadInt32(s.level))
	return err
}

// Write implements vfs.WritableDynamicBytesSource.Write.
func (s *yamaPtraceScope) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
	if offset != 0 {
		// Ignore partial writes.
		return 0, linuxerr.EINVAL
	}
	if src.NumBytes() == 0 {
		return 0, nil
	}

	// Limit the amount of memory allocated.
	src = src.TakeFirst(hostarch.PageSize - 1)

	var v int32
	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
	if err != nil {
		return 0, err
	}

	// We do not support YAMA levels > YAMA_SCOPE_RELATIONAL.
	if v < linux.YAMA_SCOPE_DISABLED || v > linux.YAMA_SCOPE_RELATIONAL {
		return 0, linuxerr.EINVAL
	}

	atomic.StoreInt32(s.level, v)
	return n, nil
}
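// Example (editor's sketch): from inside the sandbox, the level exposed by
// yamaPtraceScope is adjusted with an ordinary procfs write, assuming the
// proc filesystem is mounted at /proc and the file is writable:
//
//	// Restrict ptrace to direct descendants (YAMA_SCOPE_RELATIONAL == 1).
//	err := os.WriteFile("/proc/sys/kernel/yama/ptrace_scope", []byte("1\n"), 0)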
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package pipe provides a pipe implementation.
package pipe

import (
	"fmt"
	"io"
	"sync/atomic"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/waiter"
)

const (
	// MinimumPipeSize is a hard limit of the minimum size of a pipe.
	// It corresponds to fs/pipe.c:pipe_min_size.
	MinimumPipeSize = hostarch.PageSize

	// MaximumPipeSize is a hard limit on the maximum size of a pipe.
	// It corresponds to fs/pipe.c:pipe_max_size.
	MaximumPipeSize = 1048576

	// DefaultPipeSize is the system-wide default size of a pipe in bytes.
	// It corresponds to pipe_fs_i.h:PIPE_DEF_BUFFERS.
	DefaultPipeSize = 16 * hostarch.PageSize

	// atomicIOBytes is the maximum number of bytes for which the pipe
	// guarantees that reads and writes are atomic.
	// It corresponds to limits.h:PIPE_BUF.
	atomicIOBytes = 4096
)

// Pipe is an encapsulation of a platform-independent pipe.
// It manages a buffered byte queue shared between a reader/writer pair.
//
// +stateify savable
type Pipe struct {
	waiter.Queue `state:"nosave"`

	// isNamed indicates whether this is a named pipe.
	//
	// This value is immutable.
	isNamed bool

	// The number of active readers for this pipe.
	//
	// Access atomically.
	readers int32

	// The number of active writers for this pipe.
	//
	// Access atomically.
	writers int32

	// mu protects all pipe internal state below.
	mu sync.Mutex `state:"nosave"`

	// buf holds the pipe's data. buf is a circular buffer; the first valid
	// byte in buf is at offset off, and the pipe contains size valid bytes.
	// bufBlocks contains two identical safemem.Blocks representing buf; this
	// avoids needing to heap-allocate a new safemem.Block slice when buf is
	// resized. bufBlockSeq is a safemem.BlockSeq representing bufBlocks.
	//
	// These fields are protected by mu.
	buf         []byte
	bufBlocks   [2]safemem.Block `state:"nosave"`
	bufBlockSeq safemem.BlockSeq `state:"nosave"`
	off         int64
	size        int64

	// max is the maximum size of the pipe in bytes. When this max has been
	// reached, writers will get EWOULDBLOCK.
	//
	// This is protected by mu.
	max int64

	// hadWriter indicates if this pipe ever had a writer. Note that this
	// does not necessarily indicate there is *currently* a writer, just
	// that there has been a writer at some point since the pipe was
	// created.
	//
	// This is protected by mu.
	hadWriter bool
}

// NewPipe initializes and returns a pipe.
//
// N.B. The size will be bounded.
func NewPipe(isNamed bool, sizeBytes int64) *Pipe {
	var p Pipe
	initPipe(&p, isNamed, sizeBytes)
	return &p
}

func initPipe(pipe *Pipe, isNamed bool, sizeBytes int64) {
	if sizeBytes < MinimumPipeSize {
		sizeBytes = MinimumPipeSize
	}
	if sizeBytes > MaximumPipeSize {
		sizeBytes = MaximumPipeSize
	}
	pipe.isNamed = isNamed
	pipe.max = sizeBytes
}

// NewConnectedPipe initializes a pipe and returns a pair of objects
// representing the read and write ends of the pipe.
func NewConnectedPipe(ctx context.Context, sizeBytes int64) (*fs.File, *fs.File) {
	p := NewPipe(false /* isNamed */, sizeBytes)

	// Build an fs.Dirent for the pipe which will be shared by both
	// returned files.
	perms := fs.FilePermissions{
		User: fs.PermMask{Read: true, Write: true},
	}
	iops := NewInodeOperations(ctx, perms, p)
	ino := pipeDevice.NextIno()
	sattr := fs.StableAttr{
		Type:      fs.Pipe,
		DeviceID:  pipeDevice.DeviceID(),
		InodeID:   ino,
		BlockSize: int64(atomicIOBytes),
	}
	ms := fs.NewPseudoMountSource(ctx)
	d := fs.NewDirent(ctx, fs.NewInode(ctx, iops, ms, sattr), fmt.Sprintf("pipe:[%d]", ino))
	// The p.Open calls below will each take a reference on the Dirent. We
	// must drop the one we already have.
	defer d.DecRef(ctx)
	return p.Open(ctx, d, fs.FileFlags{Read: true}), p.Open(ctx, d, fs.FileFlags{Write: true})
}

// Open opens the pipe and returns a new file.
//
// Precondition: at least one of flags.Read or flags.Write must be set.
func (p *Pipe) Open(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) *fs.File {
	flags.NonSeekable = true
	switch {
	case flags.Read && flags.Write:
		p.rOpen()
		p.wOpen()
		return fs.NewFile(ctx, d, flags, &ReaderWriter{
			Pipe: p,
		})
	case flags.Read:
		p.rOpen()
		return fs.NewFile(ctx, d, flags, &Reader{
			ReaderWriter: ReaderWriter{Pipe: p},
		})
	case flags.Write:
		p.wOpen()
		return fs.NewFile(ctx, d, flags, &Writer{
			ReaderWriter: ReaderWriter{Pipe: p},
		})
	default:
		// Precondition violated.
		panic("invalid pipe flags")
	}
}

// peekLocked passes the first count bytes in the pipe to f and returns its
// result.
// If fewer than count bytes are available, the safemem.BlockSeq passed to f
// will be less than count bytes in length.
//
// peekLocked does not mutate the pipe; if the read consumes bytes from the
// pipe, then the caller is responsible for calling p.consumeLocked() and
// p.Notify(waiter.WritableEvents). (The latter must be called with p.mu
// unlocked.)
//
// Preconditions:
// * p.mu must be locked.
// * This pipe must have readers.
func (p *Pipe) peekLocked(count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
	// Don't block for a zero-length read even if the pipe is empty.
	if count == 0 {
		return 0, nil
	}

	// Limit the amount of data read to the amount of data in the pipe.
	if count > p.size {
		if p.size == 0 {
			if !p.HasWriters() {
				return 0, io.EOF
			}
			return 0, syserror.ErrWouldBlock
		}
		count = p.size
	}

	// Prepare the view of the data to be read.
	bs := p.bufBlockSeq.DropFirst64(uint64(p.off)).TakeFirst64(uint64(count))

	// Perform the read.
	done, err := f(bs)
	return int64(done), err
}

// consumeLocked consumes the first n bytes in the pipe, such that they will no
// longer be visible to future reads.
//
// Preconditions:
// * p.mu must be locked.
// * The pipe must contain at least n bytes.
func (p *Pipe) consumeLocked(n int64) {
	p.off += n
	if max := int64(len(p.buf)); p.off >= max {
		p.off -= max
	}
	p.size -= n
}

// writeLocked passes a safemem.BlockSeq representing the first count bytes of
// unused space in the pipe to f and returns the result. If fewer than count
// bytes are free, the safemem.BlockSeq passed to f will be less than count
// bytes in length. If the pipe is full or otherwise cannot accommodate a write
// of any number of bytes up to count, writeLocked returns ErrWouldBlock
// without calling f.
//
// Unlike peekLocked, writeLocked assumes that f returns the number of bytes
// written to the pipe, and increases the number of bytes stored in the pipe
// accordingly. Callers are still responsible for calling
// p.Notify(waiter.ReadableEvents) with p.mu unlocked.
//
// Preconditions:
// * p.mu must be locked.
func (p *Pipe) writeLocked(count int64, f func(safemem.BlockSeq) (uint64, error)) (int64, error) {
	// Can't write to a pipe with no readers.
	if !p.HasReaders() {
		return 0, unix.EPIPE
	}

	avail := p.max - p.size
	if avail == 0 {
		return 0, syserror.ErrWouldBlock
	}
	short := false
	if count > avail {
		// POSIX requires that a write smaller than atomicIOBytes
		// (PIPE_BUF) be atomic, but requires no atomicity for writes
		// larger than this.
		if count <= atomicIOBytes {
			return 0, syserror.ErrWouldBlock
		}
		count = avail
		short = true
	}

	// Ensure that the buffer is big enough.
	if newLen, oldCap := p.size+count, int64(len(p.buf)); newLen > oldCap {
		// Allocate a new buffer.
		newCap := oldCap * 2
		if oldCap == 0 {
			newCap = 8 // arbitrary; sending individual integers across pipes is relatively common
		}
		for newLen > newCap {
			newCap *= 2
		}
		if newCap > p.max {
			newCap = p.max
		}
		newBuf := make([]byte, newCap)
		// Copy the old buffer's contents to the beginning of the new one.
		safemem.CopySeq(
			safemem.BlockSeqOf(safemem.BlockFromSafeSlice(newBuf)),
			p.bufBlockSeq.DropFirst64(uint64(p.off)).TakeFirst64(uint64(p.size)))
		// Switch to the new buffer.
		p.buf = newBuf
		p.bufBlocks[0] = safemem.BlockFromSafeSlice(newBuf)
		p.bufBlocks[1] = p.bufBlocks[0]
		p.bufBlockSeq = safemem.BlockSeqFromSlice(p.bufBlocks[:])
		p.off = 0
	}

	// Prepare the view of the space to be written.
	woff := p.off + p.size
	if woff >= int64(len(p.buf)) {
		woff -= int64(len(p.buf))
	}
	bs := p.bufBlockSeq.DropFirst64(uint64(woff)).TakeFirst64(uint64(count))

	// Perform the write.
	doneU64, err := f(bs)
	done := int64(doneU64)
	p.size += done
	if done < count || err != nil {
		return done, err
	}

	// If we shortened the write, adjust the returned error appropriately.
	if short {
		return done, syserror.ErrWouldBlock
	}

	return done, nil
}

// rOpen signals a new reader of the pipe.
func (p *Pipe) rOpen() {
	atomic.AddInt32(&p.readers, 1)
}

// wOpen signals a new writer of the pipe.
func (p *Pipe) wOpen() {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.hadWriter = true
	atomic.AddInt32(&p.writers, 1)
}

// rClose signals that a reader has closed their end of the pipe.
func (p *Pipe) rClose() {
	newReaders := atomic.AddInt32(&p.readers, -1)
	if newReaders < 0 {
		panic(fmt.Sprintf("Refcounting bug, pipe has negative readers: %v", newReaders))
	}
}

// wClose signals that a writer has closed their end of the pipe.
func (p *Pipe) wClose() {
	newWriters := atomic.AddInt32(&p.writers, -1)
	if newWriters < 0 {
		panic(fmt.Sprintf("Refcounting bug, pipe has negative writers: %v.", newWriters))
	}
}

// HasReaders returns whether the pipe has any active readers.
func (p *Pipe) HasReaders() bool {
	return atomic.LoadInt32(&p.readers) > 0
}

// HasWriters returns whether the pipe has any active writers.
func (p *Pipe) HasWriters() bool {
	return atomic.LoadInt32(&p.writers) > 0
}

// rReadinessLocked calculates the read readiness.
//
// Precondition: mu must be held.
func (p *Pipe) rReadinessLocked() waiter.EventMask {
	ready := waiter.EventMask(0)
	if p.HasReaders() && p.size != 0 {
		ready |= waiter.ReadableEvents
	}
	if !p.HasWriters() && p.hadWriter {
		// POLLHUP must be suppressed until the pipe has had at least one writer
		// at some point. Otherwise a reader thread may poll and immediately get
		// a POLLHUP before the writer ever opens the pipe, which the reader may
		// interpret as the writer opening then closing the pipe.
		ready |= waiter.EventHUp
	}
	return ready
}

// rReadiness returns a mask that states whether the read end of the pipe is
// ready for reading.
func (p *Pipe) rReadiness() waiter.EventMask {
	p.mu.Lock()
	defer p.mu.Unlock()
	return p.rReadinessLocked()
}

// wReadinessLocked calculates the write readiness.
//
// Precondition: mu must be held.
func (p *Pipe) wReadinessLocked() waiter.EventMask {
	ready := waiter.EventMask(0)
	if p.HasWriters() && p.size < p.max {
		ready |= waiter.WritableEvents
	}
	if !p.HasReaders() {
		ready |= waiter.EventErr
	}
	return ready
}

// wReadiness returns a mask that states whether the write end of the pipe
// is ready for writing.
func (p *Pipe) wReadiness() waiter.EventMask {
	p.mu.Lock()
	defer p.mu.Unlock()
	return p.wReadinessLocked()
}

// rwReadiness returns a mask that states whether a read-write handle to the
// pipe is ready for IO.
func (p *Pipe) rwReadiness() waiter.EventMask {
	p.mu.Lock()
	defer p.mu.Unlock()
	return p.rReadinessLocked() | p.wReadinessLocked()
}

// queued returns the amount of queued data.
func (p *Pipe) queued() int64 {
	p.mu.Lock()
	defer p.mu.Unlock()
	return p.queuedLocked()
}

func (p *Pipe) queuedLocked() int64 {
	return p.size
}

// FifoSize implements fs.FifoSizer.FifoSize.
func (p *Pipe) FifoSize(context.Context, *fs.File) (int64, error) {
	p.mu.Lock()
	defer p.mu.Unlock()
	return p.max, nil
}

// SetFifoSize implements fs.FifoSizer.SetFifoSize.
func (p *Pipe) SetFifoSize(size int64) (int64, error) {
	if size < 0 {
		return 0, linuxerr.EINVAL
	}
	if size < MinimumPipeSize {
		size = MinimumPipeSize // Per spec.
	}
	if size > MaximumPipeSize {
		return 0, linuxerr.EPERM
	}
	p.mu.Lock()
	defer p.mu.Unlock()
	if size < p.size {
		return 0, linuxerr.EBUSY
	}
	p.max = size
	return size, nil
}
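// Editor's sketch: a minimal standalone illustration of the circular-buffer
// index math used by consumeLocked and writeLocked above (plain Go values,
// not the pipe's actual fields).
//
//	buf := make([]byte, 8)          // analogous to p.buf
//	off, size := int64(6), int64(2) // two valid bytes, at offsets 6 and 7
//
//	// The next write position wraps past the end, as in writeLocked:
//	woff := off + size
//	if woff >= int64(len(buf)) {
//		woff -= int64(len(buf)) // woff == 0
//	}
//
//	// Consuming n bytes advances off with the same wrap, as in consumeLocked:
//	n := int64(2)
//	off += n
//	if max := int64(len(buf)); off >= max {
//		off -= max // off == 0
//	}
//	size -= n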
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package auth

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/bits"
)

// A CapabilitySet is a set of capabilities implemented as a bitset. The zero
// value of CapabilitySet is a set containing no capabilities.
type CapabilitySet uint64

// AllCapabilities is a CapabilitySet containing all valid capabilities.
var AllCapabilities = CapabilitySetOf(linux.CAP_LAST_CAP+1) - 1

// CapabilitySetOf returns a CapabilitySet containing only the given
// capability.
func CapabilitySetOf(cp linux.Capability) CapabilitySet {
	return CapabilitySet(bits.MaskOf64(int(cp)))
}

// CapabilitySetOfMany returns a CapabilitySet containing the given
// capabilities.
func CapabilitySetOfMany(cps []linux.Capability) CapabilitySet {
	var cs uint64
	for _, cp := range cps {
		cs |= bits.MaskOf64(int(cp))
	}
	return CapabilitySet(cs)
}

// TaskCapabilities represents all the capability sets for a task. Each of
// these sets is explained in greater detail in capabilities(7).
type TaskCapabilities struct {
	// Permitted is a limiting superset for the effective capabilities that
	// the thread may assume.
	PermittedCaps CapabilitySet
	// Inheritable is a set of capabilities preserved across an execve(2).
	InheritableCaps CapabilitySet
	// Effective is the set of capabilities used by the kernel to perform
	// permission checks for the thread.
	EffectiveCaps CapabilitySet
	// Bounding is a limiting superset for the capabilities that a thread
	// can add to its inheritable set using capset(2).
	BoundingCaps CapabilitySet
	// Ambient is a set of capabilities that are preserved across an
	// execve(2) of a program that is not privileged.
	AmbientCaps CapabilitySet
}
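// Example (editor's sketch): building a set and testing membership with a bit
// test. The two capabilities chosen are illustrative.
//
//	caps := CapabilitySetOfMany([]linux.Capability{linux.CAP_NET_ADMIN, linux.CAP_SYS_ADMIN})
//	hasNetAdmin := caps&CapabilitySetOf(linux.CAP_NET_ADMIN) != 0 // true
//	hasKill := caps&CapabilitySetOf(linux.CAP_KILL) != 0          // false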
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "path" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/fsbridge" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/sched" "gvisor.dev/gvisor/pkg/sentry/loader" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) const ( // exitSignalMask is the signal mask to be sent at exit. Same as CSIGNAL in linux. exitSignalMask = 0xff ) var ( // ExecMaxTotalSize is the maximum length of all argv and envv entries. // // N.B. The behavior here is different than Linux. Linux provides a limit on // individual arguments of 32 pages, and an aggregate limit of at least 32 pages // but otherwise bounded by min(stack size / 4, 8 MB * 3 / 4). We don't implement // any behavior based on the stack size, and instead provide a fixed hard-limit of // 2 MB (which should work well given that 8 MB stack limits are common). ExecMaxTotalSize = 2 * 1024 * 1024 // ExecMaxElemSize is the maximum length of a single argv or envv entry. ExecMaxElemSize = 32 * hostarch.PageSize ) // Getppid implements linux syscall getppid(2). func Getppid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { parent := t.Parent() if parent == nil { return 0, nil, nil } return uintptr(t.PIDNamespace().IDOfThreadGroup(parent.ThreadGroup())), nil, nil } // Getpid implements linux syscall getpid(2). func Getpid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return uintptr(t.ThreadGroup().ID()), nil, nil } // Gettid implements linux syscall gettid(2). func Gettid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return uintptr(t.ThreadID()), nil, nil } // Execve implements linux syscall execve(2). func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { filenameAddr := args[0].Pointer() argvAddr := args[1].Pointer() envvAddr := args[2].Pointer() return execveat(t, linux.AT_FDCWD, filenameAddr, argvAddr, envvAddr, 0) } // Execveat implements linux syscall execveat(2). 
func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { dirFD := args[0].Int() pathnameAddr := args[1].Pointer() argvAddr := args[2].Pointer() envvAddr := args[3].Pointer() flags := args[4].Int() return execveat(t, dirFD, pathnameAddr, argvAddr, envvAddr, flags) } func execveat(t *kernel.Task, dirFD int32, pathnameAddr, argvAddr, envvAddr hostarch.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) { pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX) if err != nil { return 0, nil, err } var argv, envv []string if argvAddr != 0 { var err error argv, err = t.CopyInVector(argvAddr, ExecMaxElemSize, ExecMaxTotalSize) if err != nil { return 0, nil, err } } if envvAddr != 0 { var err error envv, err = t.CopyInVector(envvAddr, ExecMaxElemSize, ExecMaxTotalSize) if err != nil { return 0, nil, err } } if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 { return 0, nil, linuxerr.EINVAL } atEmptyPath := flags&linux.AT_EMPTY_PATH != 0 if !atEmptyPath && len(pathname) == 0 { return 0, nil, syserror.ENOENT } resolveFinal := flags&linux.AT_SYMLINK_NOFOLLOW == 0 root := t.FSContext().RootDirectory() defer root.DecRef(t) var wd *fs.Dirent var executable fsbridge.File var closeOnExec bool if dirFD == linux.AT_FDCWD || path.IsAbs(pathname) { // Even if the pathname is absolute, we may still need the wd // for interpreter scripts if the path of the interpreter is // relative. wd = t.FSContext().WorkingDirectory() } else { // Need to extract the given FD. f, fdFlags := t.FDTable().Get(dirFD) if f == nil { return 0, nil, linuxerr.EBADF } defer f.DecRef(t) closeOnExec = fdFlags.CloseOnExec if atEmptyPath && len(pathname) == 0 { // TODO(gvisor.dev/issue/160): Linux requires only execute permission, // not read. However, our backing filesystems may prevent us from reading // the file without read permission. Additionally, a task with a // non-readable executable has additional constraints on access via // ptrace and procfs. if err := f.Dirent.Inode.CheckPermission(t, fs.PermMask{Read: true, Execute: true}); err != nil { return 0, nil, err } executable = fsbridge.NewFSFile(f) } else { wd = f.Dirent wd.IncRef() if !fs.IsDir(wd.Inode.StableAttr) { return 0, nil, linuxerr.ENOTDIR } } } if wd != nil { defer wd.DecRef(t) } // Load the new TaskImage. remainingTraversals := uint(linux.MaxSymlinkTraversals) loadArgs := loader.LoadArgs{ Opener: fsbridge.NewFSLookup(t.MountNamespace(), root, wd), RemainingTraversals: &remainingTraversals, ResolveFinal: resolveFinal, Filename: pathname, File: executable, CloseOnExec: closeOnExec, Argv: argv, Envv: envv, Features: t.Arch().FeatureSet(), } image, se := t.Kernel().LoadTaskImage(t, loadArgs) if se != nil { return 0, nil, se.ToError() } ctrl, err := t.Execve(image) return 0, ctrl, err } // Exit implements linux syscall exit(2). func Exit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { status := args[0].Int() t.PrepareExit(linux.WaitStatusExit(status & 0xff)) return 0, kernel.CtrlDoExit, nil } // ExitGroup implements linux syscall exit_group(2). func ExitGroup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { status := args[0].Int() t.PrepareGroupExit(linux.WaitStatusExit(status & 0xff)) return 0, kernel.CtrlDoExit, nil } // clone is used by Clone, Fork, and VFork. 
func clone(t *kernel.Task, flags int, stack hostarch.Addr, parentTID hostarch.Addr, childTID hostarch.Addr, tls hostarch.Addr) (uintptr, *kernel.SyscallControl, error) { opts := kernel.CloneOptions{ SharingOptions: kernel.SharingOptions{ NewAddressSpace: flags&linux.CLONE_VM == 0, NewSignalHandlers: flags&linux.CLONE_SIGHAND == 0, NewThreadGroup: flags&linux.CLONE_THREAD == 0, TerminationSignal: linux.Signal(flags & exitSignalMask), NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, NewFiles: flags&linux.CLONE_FILES == 0, NewFSContext: flags&linux.CLONE_FS == 0, NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, }, Stack: stack, SetTLS: flags&linux.CLONE_SETTLS == linux.CLONE_SETTLS, TLS: tls, ChildClearTID: flags&linux.CLONE_CHILD_CLEARTID == linux.CLONE_CHILD_CLEARTID, ChildSetTID: flags&linux.CLONE_CHILD_SETTID == linux.CLONE_CHILD_SETTID, ChildTID: childTID, ParentSetTID: flags&linux.CLONE_PARENT_SETTID == linux.CLONE_PARENT_SETTID, ParentTID: parentTID, Vfork: flags&linux.CLONE_VFORK == linux.CLONE_VFORK, Untraced: flags&linux.CLONE_UNTRACED == linux.CLONE_UNTRACED, InheritTracer: flags&linux.CLONE_PTRACE == linux.CLONE_PTRACE, } ntid, ctrl, err := t.Clone(&opts) return uintptr(ntid), ctrl, err } // Fork implements Linux syscall fork(2). func Fork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // "A call to fork() is equivalent to a call to clone(2) specifying flags // as just SIGCHLD." - fork(2) return clone(t, int(linux.SIGCHLD), 0, 0, 0, 0) } // Vfork implements Linux syscall vfork(2). func Vfork(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // """ // A call to vfork() is equivalent to calling clone(2) with flags specified as: // // CLONE_VM | CLONE_VFORK | SIGCHLD // """ - vfork(2) return clone(t, linux.CLONE_VM|linux.CLONE_VFORK|int(linux.SIGCHLD), 0, 0, 0, 0) } // parseCommonWaitOptions applies the options common to wait4 and waitid to // wopts. func parseCommonWaitOptions(wopts *kernel.WaitOptions, options int) error { switch options & (linux.WCLONE | linux.WALL) { case 0: wopts.NonCloneTasks = true case linux.WCLONE: wopts.CloneTasks = true case linux.WALL: wopts.NonCloneTasks = true wopts.CloneTasks = true default: return linuxerr.EINVAL } if options&linux.WCONTINUED != 0 { wopts.Events |= kernel.EventGroupContinue } if options&linux.WNOHANG == 0 { wopts.BlockInterruptErr = syserror.ERESTARTSYS } if options&linux.WNOTHREAD == 0 { wopts.SiblingChildren = true } return nil } // wait4 waits for the given child process to exit. 
func wait4(t *kernel.Task, pid int, statusAddr hostarch.Addr, options int, rusageAddr hostarch.Addr) (uintptr, error) { if options&^(linux.WNOHANG|linux.WUNTRACED|linux.WCONTINUED|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { return 0, linuxerr.EINVAL } wopts := kernel.WaitOptions{ Events: kernel.EventExit | kernel.EventTraceeStop, ConsumeEvent: true, } // There are four cases to consider: // // pid < -1 any child process whose process group ID is equal to the absolute value of pid // pid == -1 any child process // pid == 0 any child process whose process group ID is equal to that of the calling process // pid > 0 the child whose process ID is equal to the value of pid switch { case pid < -1: wopts.SpecificPGID = kernel.ProcessGroupID(-pid) case pid == -1: // Any process is the default. case pid == 0: wopts.SpecificPGID = t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup()) default: wopts.SpecificTID = kernel.ThreadID(pid) } if err := parseCommonWaitOptions(&wopts, options); err != nil { return 0, err } if options&linux.WUNTRACED != 0 { wopts.Events |= kernel.EventChildGroupStop } wr, err := t.Wait(&wopts) if err != nil { if err == kernel.ErrNoWaitableEvent { return 0, nil } return 0, err } if statusAddr != 0 { if _, err := primitive.CopyUint32Out(t, statusAddr, uint32(wr.Status)); err != nil { return 0, err } } if rusageAddr != 0 { ru := getrusage(wr.Task, linux.RUSAGE_BOTH) if _, err := ru.CopyOut(t, rusageAddr); err != nil { return 0, err } } return uintptr(wr.TID), nil } // Wait4 implements linux syscall wait4(2). func Wait4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := int(args[0].Int()) statusAddr := args[1].Pointer() options := int(args[2].Uint()) rusageAddr := args[3].Pointer() n, err := wait4(t, pid, statusAddr, options, rusageAddr) return n, nil, err } // WaitPid implements linux syscall waitpid(2). func WaitPid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { pid := int(args[0].Int()) statusAddr := args[1].Pointer() options := int(args[2].Uint()) n, err := wait4(t, pid, statusAddr, options, 0) return n, nil, err } // Waitid implements linux syscall waitid(2). func Waitid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { idtype := args[0].Int() id := args[1].Int() infop := args[2].Pointer() options := int(args[3].Uint()) rusageAddr := args[4].Pointer() if options&^(linux.WNOHANG|linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED|linux.WNOWAIT|linux.WNOTHREAD|linux.WALL|linux.WCLONE) != 0 { return 0, nil, linuxerr.EINVAL } if options&(linux.WEXITED|linux.WSTOPPED|linux.WCONTINUED) == 0 { return 0, nil, linuxerr.EINVAL } wopts := kernel.WaitOptions{ Events: kernel.EventTraceeStop, ConsumeEvent: options&linux.WNOWAIT == 0, } switch idtype { case linux.P_ALL: case linux.P_PID: wopts.SpecificTID = kernel.ThreadID(id) case linux.P_PGID: wopts.SpecificPGID = kernel.ProcessGroupID(id) default: return 0, nil, linuxerr.EINVAL } if err := parseCommonWaitOptions(&wopts, options); err != nil { return 0, nil, err } if options&linux.WEXITED != 0 { wopts.Events |= kernel.EventExit } if options&linux.WSTOPPED != 0 { wopts.Events |= kernel.EventChildGroupStop } wr, err := t.Wait(&wopts) if err != nil { if err == kernel.ErrNoWaitableEvent { err = nil // "If WNOHANG was specified in options and there were no children // in a waitable state, then waitid() returns 0 immediately and the // state of the siginfo_t structure pointed to by infop is // unspecified." 
- waitid(2). But Linux's waitid actually zeroes // out the fields it would set for a successful waitid in this case // as well. if infop != 0 { var si linux.SignalInfo _, err = si.CopyOut(t, infop) } } return 0, nil, err } if rusageAddr != 0 { ru := getrusage(wr.Task, linux.RUSAGE_BOTH) if _, err := ru.CopyOut(t, rusageAddr); err != nil { return 0, nil, err } } if infop == 0 { return 0, nil, nil } si := linux.SignalInfo{ Signo: int32(linux.SIGCHLD), } si.SetPID(int32(wr.TID)) si.SetUID(int32(wr.UID)) s := wr.Status switch { case s.Exited(): si.Code = linux.CLD_EXITED si.SetStatus(int32(s.ExitStatus())) case s.Signaled(): if s.CoreDumped() { si.Code = linux.CLD_DUMPED } else { si.Code = linux.CLD_KILLED } si.SetStatus(int32(s.TerminationSignal())) case s.Stopped(): if wr.Event == kernel.EventTraceeStop { si.Code = linux.CLD_TRAPPED si.SetStatus(int32(s.PtraceEvent())) } else { si.Code = linux.CLD_STOPPED si.SetStatus(int32(s.StopSignal())) } case s.Continued(): si.Code = linux.CLD_CONTINUED si.SetStatus(int32(linux.SIGCONT)) default: t.Warningf("waitid got incomprehensible wait status %d", s) } _, err = si.CopyOut(t, infop) return 0, nil, err } // SetTidAddress implements linux syscall set_tid_address(2). func SetTidAddress(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() // Always succeed, return caller's tid. t.SetClearTID(addr) return uintptr(t.ThreadID()), nil, nil } // Unshare implements linux syscall unshare(2). func Unshare(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { flags := args[0].Int() opts := kernel.SharingOptions{ NewAddressSpace: flags&linux.CLONE_VM == linux.CLONE_VM, NewSignalHandlers: flags&linux.CLONE_SIGHAND == linux.CLONE_SIGHAND, NewThreadGroup: flags&linux.CLONE_THREAD == linux.CLONE_THREAD, NewPIDNamespace: flags&linux.CLONE_NEWPID == linux.CLONE_NEWPID, NewUserNamespace: flags&linux.CLONE_NEWUSER == linux.CLONE_NEWUSER, NewNetworkNamespace: flags&linux.CLONE_NEWNET == linux.CLONE_NEWNET, NewFiles: flags&linux.CLONE_FILES == linux.CLONE_FILES, NewFSContext: flags&linux.CLONE_FS == linux.CLONE_FS, NewUTSNamespace: flags&linux.CLONE_NEWUTS == linux.CLONE_NEWUTS, NewIPCNamespace: flags&linux.CLONE_NEWIPC == linux.CLONE_NEWIPC, } // "CLONE_NEWPID automatically implies CLONE_THREAD as well." - unshare(2) if opts.NewPIDNamespace { opts.NewThreadGroup = true } // "... specifying CLONE_NEWUSER automatically implies CLONE_THREAD. Since // Linux 3.9, CLONE_NEWUSER also automatically implies CLONE_FS." if opts.NewUserNamespace { opts.NewThreadGroup = true opts.NewFSContext = true } return 0, nil, t.Unshare(&opts) } // SchedYield implements linux syscall sched_yield(2). func SchedYield(t *kernel.Task, _ arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { t.Yield() return 0, nil, nil } // SchedSetaffinity implements linux syscall sched_setaffinity(2). 
func SchedSetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tid := args[0].Int() size := args[1].SizeT() maskAddr := args[2].Pointer() var task *kernel.Task if tid == 0 { task = t } else { task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)) if task == nil { return 0, nil, linuxerr.ESRCH } } mask := sched.NewCPUSet(t.Kernel().ApplicationCores()) if size > mask.Size() { size = mask.Size() } if _, err := t.CopyInBytes(maskAddr, mask[:size]); err != nil { return 0, nil, err } return 0, nil, task.SetCPUMask(mask) } // SchedGetaffinity implements linux syscall sched_getaffinity(2). func SchedGetaffinity(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tid := args[0].Int() size := args[1].SizeT() maskAddr := args[2].Pointer() // This limitation is because linux stores the cpumask // in an array of "unsigned long" so the buffer needs to // be a multiple of the word size. if size&(t.Arch().Width()-1) > 0 { return 0, nil, linuxerr.EINVAL } var task *kernel.Task if tid == 0 { task = t } else { task = t.PIDNamespace().TaskWithID(kernel.ThreadID(tid)) if task == nil { return 0, nil, linuxerr.ESRCH } } mask := task.CPUMask() // The buffer needs to be big enough to hold a cpumask with // all possible cpus. if size < mask.Size() { return 0, nil, linuxerr.EINVAL } _, err := t.CopyOutBytes(maskAddr, mask) // NOTE: The syscall interface is slightly different than the glibc // interface. The raw sched_getaffinity syscall returns the number of // bytes used to represent a cpu mask. return uintptr(mask.Size()), nil, err } // Getcpu implements linux syscall getcpu(2). func Getcpu(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { cpu := args[0].Pointer() node := args[1].Pointer() // third argument to this system call is nowadays unused. if cpu != 0 { if _, err := primitive.CopyInt32Out(t, cpu, t.CPU()); err != nil { return 0, nil, err } } // We always return node 0. if node != 0 { if _, err := t.MemoryManager().ZeroOut(t, node, 4, usermem.IOOpts{ AddressSpaceActive: true, }); err != nil { return 0, nil, err } } return 0, nil, nil } // Setpgid implements the linux syscall setpgid(2). func Setpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { // Note that throughout this function, pgid is interpreted with respect // to t's namespace, not with respect to the selected ThreadGroup's // namespace (which may be different). pid := kernel.ThreadID(args[0].Int()) pgid := kernel.ProcessGroupID(args[1].Int()) // "If pid is zero, then the process ID of the calling process is used." tg := t.ThreadGroup() if pid != 0 { ot := t.PIDNamespace().TaskWithID(pid) if ot == nil { return 0, nil, linuxerr.ESRCH } tg = ot.ThreadGroup() if tg.Leader() != ot { return 0, nil, linuxerr.EINVAL } // Setpgid only operates on child threadgroups. if tg != t.ThreadGroup() && (tg.Leader().Parent() == nil || tg.Leader().Parent().ThreadGroup() != t.ThreadGroup()) { return 0, nil, linuxerr.ESRCH } } // "If pgid is zero, then the PGID of the process specified by pid is made // the same as its process ID." defaultPGID := kernel.ProcessGroupID(t.PIDNamespace().IDOfThreadGroup(tg)) if pgid == 0 { pgid = defaultPGID } else if pgid < 0 { return 0, nil, linuxerr.EINVAL } // If the pgid is the same as the group, then create a new one. Otherwise, // we attempt to join an existing process group. if pgid == defaultPGID { // For convenience, errors line up with Linux syscall API. 
if err := tg.CreateProcessGroup(); err != nil { // Is the process group already as expected? If so, // just return success. This is the same behavior as // Linux. if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == defaultPGID { return 0, nil, nil } return 0, nil, err } } else { // Same as CreateProcessGroup, above. if err := tg.JoinProcessGroup(t.PIDNamespace(), pgid, tg != t.ThreadGroup()); err != nil { // See above. if t.PIDNamespace().IDOfProcessGroup(tg.ProcessGroup()) == pgid { return 0, nil, nil } return 0, nil, err } } // Success. return 0, nil, nil } // Getpgrp implements the linux syscall getpgrp(2). func Getpgrp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return uintptr(t.PIDNamespace().IDOfProcessGroup(t.ThreadGroup().ProcessGroup())), nil, nil } // Getpgid implements the linux syscall getpgid(2). func Getpgid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tid := kernel.ThreadID(args[0].Int()) if tid == 0 { return Getpgrp(t, args) } target := t.PIDNamespace().TaskWithID(tid) if target == nil { return 0, nil, linuxerr.ESRCH } return uintptr(t.PIDNamespace().IDOfProcessGroup(target.ThreadGroup().ProcessGroup())), nil, nil } // Setsid implements the linux syscall setsid(2). func Setsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return 0, nil, t.ThreadGroup().CreateSession() } // Getsid implements the linux syscall getsid(2). func Getsid(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tid := kernel.ThreadID(args[0].Int()) if tid == 0 { return uintptr(t.PIDNamespace().IDOfSession(t.ThreadGroup().Session())), nil, nil } target := t.PIDNamespace().TaskWithID(tid) if target == nil { return 0, nil, linuxerr.ESRCH } return uintptr(t.PIDNamespace().IDOfSession(target.ThreadGroup().Session())), nil, nil } // Getpriority pretends to implement the linux syscall getpriority(2). // // This is a stub; real priorities require a full scheduler. func Getpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { which := args[0].Int() who := kernel.ThreadID(args[1].Int()) switch which { case linux.PRIO_PROCESS: // Look for who, return ESRCH if not found. var task *kernel.Task if who == 0 { task = t } else { task = t.PIDNamespace().TaskWithID(who) } if task == nil { return 0, nil, linuxerr.ESRCH } // From kernel/sys.c:getpriority: // "To avoid negative return values, 'getpriority()' // will not return the normal nice-value, but a negated // value that has been offset by 20" return uintptr(20 - task.Niceness()), nil, nil case linux.PRIO_USER: fallthrough case linux.PRIO_PGRP: // PRIO_USER and PRIO_PGRP have no further implementation yet. return 0, nil, nil default: return 0, nil, linuxerr.EINVAL } } // Setpriority pretends to implement the linux syscall setpriority(2). // // This is a stub; real priorities require a full scheduler. func Setpriority(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { which := args[0].Int() who := kernel.ThreadID(args[1].Int()) niceval := int(args[2].Int()) // In the kernel's implementation, values outside the range // of [-20, 19] are truncated to these minimum and maximum // values. if niceval < -20 /* min niceval */ { niceval = -20 } else if niceval > 19 /* max niceval */ { niceval = 19 } switch which { case linux.PRIO_PROCESS: // Look for who, return ESRCH if not found. 
var task *kernel.Task if who == 0 { task = t } else { task = t.PIDNamespace().TaskWithID(who) } if task == nil { return 0, nil, linuxerr.ESRCH } task.SetNiceness(niceval) case linux.PRIO_USER: fallthrough case linux.PRIO_PGRP: // PRIO_USER and PRIO_PGRP have no further implementation yet. return 0, nil, nil default: return 0, nil, linuxerr.EINVAL } return 0, nil, nil } // Ptrace implements linux system call ptrace(2). func Ptrace(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { req := args[0].Int64() pid := kernel.ThreadID(args[1].Int()) addr := args[2].Pointer() data := args[3].Pointer() return 0, nil, t.Ptrace(req, pid, addr, data) }
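// Illustrative sketch (editorial addition, not sentry code): the nice-value
// conventions implemented by Getpriority and Setpriority above. Setpriority
// clamps requested values to [-20, 19]; the raw getpriority syscall then
// returns 20 - nice so the result is never negative. All names below are
// hypothetical.
package main

import "fmt"

// clampNice mirrors the truncation performed in Setpriority.
func clampNice(niceval int) int {
	if niceval < -20 {
		return -20
	}
	if niceval > 19 {
		return 19
	}
	return niceval
}

func main() {
	for _, requested := range []int{-100, -20, 0, 19, 100} {
		nice := clampNice(requested)
		// What the raw getpriority syscall would report; glibc converts
		// this back to the conventional nice value.
		ret := 20 - nice
		fmt.Printf("requested=%4d stored nice=%3d raw getpriority=%2d\n", requested, nice, ret)
	}
}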
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/syserror" ) // rlimit describes an implementation of 'struct rlimit', which may vary from // system to system. type rlimit interface { marshal.Marshallable // toLimit converts an rlimit to a limits.Limit. toLimit() *limits.Limit // fromLimit converts a limits.Limit to an rlimit. fromLimit(lim limits.Limit) } // newRlimit returns the appropriate rlimit type for 'struct rlimit' on this system. func newRlimit(t *kernel.Task) (rlimit, error) { switch t.Arch().Width() { case 8: // On a 64-bit system, struct rlimit and struct rlimit64 are identical. return &rlimit64{}, nil default: return nil, syserror.ENOSYS } } // +marshal type rlimit64 struct { Cur uint64 Max uint64 } func (r *rlimit64) toLimit() *limits.Limit { return &limits.Limit{ Cur: limits.FromLinux(r.Cur), Max: limits.FromLinux(r.Max), } } func (r *rlimit64) fromLimit(lim limits.Limit) { *r = rlimit64{ Cur: limits.ToLinux(lim.Cur), Max: limits.ToLinux(lim.Max), } } func (r *rlimit64) copyIn(t *kernel.Task, addr hostarch.Addr) error { _, err := r.CopyIn(t, addr) return err } func (r *rlimit64) copyOut(t *kernel.Task, addr hostarch.Addr) error { _, err := r.CopyOut(t, addr) return err } func makeRlimit64(lim limits.Limit) *rlimit64 { return &rlimit64{Cur: lim.Cur, Max: lim.Max} } // setableLimits is the set of supported setable limits.
var setableLimits = map[limits.LimitType]struct{}{ limits.NumberOfFiles: {}, limits.AS: {}, limits.CPU: {}, limits.Data: {}, limits.FileSize: {}, limits.MemoryLocked: {}, limits.Stack: {}, // RSS can be set, but it's not enforced because Linux doesn't enforce it // either: "This limit has effect only in Linux 2.4.x, x < 30" limits.Rss: {}, // These are not enforced, but we include them here to avoid returning // EPERM, since some apps expect them to succeed. limits.Core: {}, limits.ProcessCount: {}, } func prlimit64(t *kernel.Task, resource limits.LimitType, newLim *limits.Limit) (limits.Limit, error) { if newLim == nil { return t.ThreadGroup().Limits().Get(resource), nil } if _, ok := setableLimits[resource]; !ok { return limits.Limit{}, linuxerr.EPERM } // "A privileged process (under Linux: one with the CAP_SYS_RESOURCE // capability in the initial user namespace) may make arbitrary changes // to either limit value." privileged := t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.Kernel().RootUserNamespace()) oldLim, err := t.ThreadGroup().Limits().Set(resource, *newLim, privileged) if err != nil { return limits.Limit{}, err } if resource == limits.CPU { t.NotifyRlimitCPUUpdated() } return oldLim, nil } // Getrlimit implements linux syscall getrlimit(2). func Getrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { resource, ok := limits.FromLinuxResource[int(args[0].Int())] if !ok { // Return err; unknown limit. return 0, nil, linuxerr.EINVAL } addr := args[1].Pointer() rlim, err := newRlimit(t) if err != nil { return 0, nil, err } lim, err := prlimit64(t, resource, nil) if err != nil { return 0, nil, err } rlim.fromLimit(lim) _, err = rlim.CopyOut(t, addr) return 0, nil, err } // Setrlimit implements linux syscall setrlimit(2). func Setrlimit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { resource, ok := limits.FromLinuxResource[int(args[0].Int())] if !ok { // Return err; unknown limit. return 0, nil, linuxerr.EINVAL } addr := args[1].Pointer() rlim, err := newRlimit(t) if err != nil { return 0, nil, err } if _, err := rlim.CopyIn(t, addr); err != nil { return 0, nil, linuxerr.EFAULT } _, err = prlimit64(t, resource, rlim.toLimit()) return 0, nil, err } // Prlimit64 implements linux syscall prlimit64(2). func Prlimit64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { tid := kernel.ThreadID(args[0].Int()) resource, ok := limits.FromLinuxResource[int(args[1].Int())] if !ok { // Return err; unknown limit. return 0, nil, linuxerr.EINVAL } newRlimAddr := args[2].Pointer() oldRlimAddr := args[3].Pointer() var newLim *limits.Limit if newRlimAddr != 0 { var nrl rlimit64 if err := nrl.copyIn(t, newRlimAddr); err != nil { return 0, nil, linuxerr.EFAULT } newLim = nrl.toLimit() } if tid < 0 { return 0, nil, linuxerr.EINVAL } ot := t if tid > 0 { if ot = t.PIDNamespace().TaskWithID(tid); ot == nil { return 0, nil, linuxerr.ESRCH } } // "To set or get the resources of a process other than itself, the caller // must have the CAP_SYS_RESOURCE capability, or the real, effective, and // saved set user IDs of the target process must match the real user ID of // the caller and the real, effective, and saved set group IDs of the // target process must match the real group ID of the caller." 
if ot != t && !t.HasCapabilityIn(linux.CAP_SYS_RESOURCE, t.PIDNamespace().UserNamespace()) { cred, tcred := t.Credentials(), ot.Credentials() if cred.RealKUID != tcred.RealKUID || cred.RealKUID != tcred.EffectiveKUID || cred.RealKUID != tcred.SavedKUID || cred.RealKGID != tcred.RealKGID || cred.RealKGID != tcred.EffectiveKGID || cred.RealKGID != tcred.SavedKGID { return 0, nil, linuxerr.EPERM } } oldLim, err := prlimit64(ot, resource, newLim) if err != nil { return 0, nil, err } if oldRlimAddr != 0 { if err := makeRlimit64(oldLim).copyOut(t, oldRlimAddr); err != nil { return 0, nil, linuxerr.EFAULT } } return 0, nil, nil }
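// Illustrative sketch (editorial addition, not sentry code): the 16-byte wire
// layout of the 64-bit "struct rlimit" that the rlimit64 type above marshals.
// The struct and constant below are local stand-ins; only the field layout is
// taken from the code above.
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
)

type rlimit64 struct {
	Cur uint64 // soft limit
	Max uint64 // hard limit (ceiling for the soft limit)
}

func main() {
	const rlimInfinity = ^uint64(0) // "no limit" is conventionally all ones
	lim := rlimit64{Cur: 1024, Max: rlimInfinity}
	var buf bytes.Buffer
	// binary.Write lays the struct out field by field with no padding,
	// Cur first, then Max: 16 bytes total.
	if err := binary.Write(&buf, binary.LittleEndian, lim); err != nil {
		panic(err)
	}
	fmt.Printf("struct rlimit64 is %d bytes: % x\n", buf.Len(), buf.Bytes())
}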
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/mm" ) // Credentials returns t's credentials. // // This value must be considered immutable. func (t *Task) Credentials() *auth.Credentials { return t.creds.Load() } // UserNamespace returns the user namespace associated with the task.
func (t *Task) UserNamespace() *auth.UserNamespace { return t.Credentials().UserNamespace } // HasCapabilityIn checks if the task has capability cp in user namespace ns. func (t *Task) HasCapabilityIn(cp linux.Capability, ns *auth.UserNamespace) bool { return t.Credentials().HasCapabilityIn(cp, ns) } // HasCapability checks if the task has capability cp in its user namespace. func (t *Task) HasCapability(cp linux.Capability) bool { return t.Credentials().HasCapability(cp) } // SetUID implements the semantics of setuid(2). func (t *Task) SetUID(uid auth.UID) error { // setuid considers -1 to be invalid. if !uid.Ok() { return linuxerr.EINVAL } t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() kuid := creds.UserNamespace.MapToKUID(uid) if !kuid.Ok() { return linuxerr.EINVAL } // "setuid() sets the effective user ID of the calling process. If the // effective UID of the caller is root (more precisely: if the caller has // the CAP_SETUID capability), the real UID and saved set-user-ID are also // set." - setuid(2) if creds.HasCapability(linux.CAP_SETUID) { t.setKUIDsUncheckedLocked(kuid, kuid, kuid) return nil } // "EPERM: The user is not privileged (Linux: does not have the CAP_SETUID // capability) and uid does not match the real UID or saved set-user-ID of // the calling process." if kuid != creds.RealKUID && kuid != creds.SavedKUID { return linuxerr.EPERM } t.setKUIDsUncheckedLocked(creds.RealKUID, kuid, creds.SavedKUID) return nil } // SetREUID implements the semantics of setreuid(2). func (t *Task) SetREUID(r, e auth.UID) error { t.mu.Lock() defer t.mu.Unlock() // "Supplying a value of -1 for either the real or effective user ID forces // the system to leave that ID unchanged." - setreuid(2) creds := t.Credentials() newR := creds.RealKUID if r.Ok() { newR = creds.UserNamespace.MapToKUID(r) if !newR.Ok() { return linuxerr.EINVAL } } newE := creds.EffectiveKUID if e.Ok() { newE = creds.UserNamespace.MapToKUID(e) if !newE.Ok() { return linuxerr.EINVAL } } if !creds.HasCapability(linux.CAP_SETUID) { // "Unprivileged processes may only set the effective user ID to the // real user ID, the effective user ID, or the saved set-user-ID." if newE != creds.RealKUID && newE != creds.EffectiveKUID && newE != creds.SavedKUID { return linuxerr.EPERM } // "Unprivileged users may only set the real user ID to the real user // ID or the effective user ID." if newR != creds.RealKUID && newR != creds.EffectiveKUID { return linuxerr.EPERM } } // "If the real user ID is set (i.e., ruid is not -1) or the effective user // ID is set to a value not equal to the previous real user ID, the saved // set-user-ID will be set to the new effective user ID." newS := creds.SavedKUID if r.Ok() || (e.Ok() && newE != creds.EffectiveKUID) { newS = newE } t.setKUIDsUncheckedLocked(newR, newE, newS) return nil } // SetRESUID implements the semantics of the setresuid(2) syscall. func (t *Task) SetRESUID(r, e, s auth.UID) error { t.mu.Lock() defer t.mu.Unlock() // "Unprivileged user processes may change the real UID, effective UID, and // saved set-user-ID, each to one of: the current real UID, the current // effective UID or the current saved set-user-ID. Privileged processes (on // Linux, those having the CAP_SETUID capability) may set the real UID, // effective UID, and saved set-user-ID to arbitrary values. If one of the // arguments equals -1, the corresponding value is not changed." 
- // setresuid(2) var err error creds := t.Credentials() newR := creds.RealKUID if r.Ok() { newR, err = creds.UseUID(r) if err != nil { return err } } newE := creds.EffectiveKUID if e.Ok() { newE, err = creds.UseUID(e) if err != nil { return err } } newS := creds.SavedKUID if s.Ok() { newS, err = creds.UseUID(s) if err != nil { return err } } t.setKUIDsUncheckedLocked(newR, newE, newS) return nil } // Preconditions: t.mu must be locked. func (t *Task) setKUIDsUncheckedLocked(newR, newE, newS auth.KUID) { creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. root := creds.UserNamespace.MapToKUID(auth.RootUID) oldR, oldE, oldS := creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID creds.RealKUID, creds.EffectiveKUID, creds.SavedKUID = newR, newE, newS // "1. If one or more of the real, effective or saved set user IDs was // previously 0, and as a result of the UID changes all of these IDs have a // nonzero value, then all capabilities are cleared from the permitted and // effective capability sets." - capabilities(7) if (oldR == root || oldE == root || oldS == root) && (newR != root && newE != root && newS != root) { // prctl(2): "PR_SET_KEEPCAPS: Set the state of the calling thread's // "keep capabilities" flag, which determines whether the thread's permitted // capability set is cleared when a change is made to the // thread's user IDs such that the thread's real UID, effective // UID, and saved set-user-ID all become nonzero when at least // one of them previously had the value 0. By default, the // permitted capability set is cleared when such a change is // made; setting the "keep capabilities" flag prevents it from // being cleared." (A thread's effective capability set is always // cleared when such a credential change is made, // regardless of the setting of the "keep capabilities" flag.) if !creds.KeepCaps { creds.PermittedCaps = 0 creds.EffectiveCaps = 0 } } // """ // 2. If the effective user ID is changed from 0 to nonzero, then all // capabilities are cleared from the effective set. // // 3. If the effective user ID is changed from nonzero to 0, then the // permitted set is copied to the effective set. // """ if oldE == root && newE != root { creds.EffectiveCaps = 0 } else if oldE != root && newE == root { creds.EffectiveCaps = creds.PermittedCaps } // "4. If the filesystem user ID is changed from 0 to nonzero (see // setfsuid(2)), then the following capabilities are cleared from the // effective set: ..." // (filesystem UIDs aren't implemented, nor are any of the capabilities in // question) if oldE != newE { // "[dumpability] is reset to the current value contained in // the file /proc/sys/fs/suid_dumpable (which by default has // the value 0), in the following circumstances: The process's // effective user or group ID is changed." - prctl(2) // // (suid_dumpable isn't implemented, so we just use the // default.) t.MemoryManager().SetDumpability(mm.NotDumpable) // Not documented, but compare Linux's kernel/cred.c:commit_creds(). t.parentDeathSignal = 0 } t.creds.Store(creds) } // SetGID implements the semantics of setgid(2).
func (t *Task) SetGID(gid auth.GID) error { if !gid.Ok() { return linuxerr.EINVAL } t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return linuxerr.EINVAL } if creds.HasCapability(linux.CAP_SETGID) { t.setKGIDsUncheckedLocked(kgid, kgid, kgid) return nil } if kgid != creds.RealKGID && kgid != creds.SavedKGID { return linuxerr.EPERM } t.setKGIDsUncheckedLocked(creds.RealKGID, kgid, creds.SavedKGID) return nil } // SetREGID implements the semantics of setregid(2). func (t *Task) SetREGID(r, e auth.GID) error { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() newR := creds.RealKGID if r.Ok() { newR = creds.UserNamespace.MapToKGID(r) if !newR.Ok() { return linuxerr.EINVAL } } newE := creds.EffectiveKGID if e.Ok() { newE = creds.UserNamespace.MapToKGID(e) if !newE.Ok() { return linuxerr.EINVAL } } if !creds.HasCapability(linux.CAP_SETGID) { if newE != creds.RealKGID && newE != creds.EffectiveKGID && newE != creds.SavedKGID { return linuxerr.EPERM } if newR != creds.RealKGID && newR != creds.EffectiveKGID { return linuxerr.EPERM } } newS := creds.SavedKGID if r.Ok() || (e.Ok() && newE != creds.EffectiveKGID) { newS = newE } t.setKGIDsUncheckedLocked(newR, newE, newS) return nil } // SetRESGID implements the semantics of the setresgid(2) syscall. func (t *Task) SetRESGID(r, e, s auth.GID) error { var err error t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() newR := creds.RealKGID if r.Ok() { newR, err = creds.UseGID(r) if err != nil { return err } } newE := creds.EffectiveKGID if e.Ok() { newE, err = creds.UseGID(e) if err != nil { return err } } newS := creds.SavedKGID if s.Ok() { newS, err = creds.UseGID(s) if err != nil { return err } } t.setKGIDsUncheckedLocked(newR, newE, newS) return nil } func (t *Task) setKGIDsUncheckedLocked(newR, newE, newS auth.KGID) { creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. oldE := creds.EffectiveKGID creds.RealKGID, creds.EffectiveKGID, creds.SavedKGID = newR, newE, newS if oldE != newE { // "[dumpability] is reset to the current value contained in // the file /proc/sys/fs/suid_dumpable (which by default has // the value 0), in the following circumstances: The process's // effective user or group ID is changed." - prctl(2) // // (suid_dumpable isn't implemented, so we just use the // default.) t.MemoryManager().SetDumpability(mm.NotDumpable) // Not documented, but compare Linux's // kernel/cred.c:commit_creds(). t.parentDeathSignal = 0 } t.creds.Store(creds) } // SetExtraGIDs attempts to change t's supplemental groups. All IDs are // interpreted as being in t's user namespace. func (t *Task) SetExtraGIDs(gids []auth.GID) error { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() if !creds.HasCapability(linux.CAP_SETGID) { return linuxerr.EPERM } kgids := make([]auth.KGID, len(gids)) for i, gid := range gids { kgid := creds.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return linuxerr.EINVAL } kgids[i] = kgid } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.ExtraKGIDs = kgids t.creds.Store(creds) return nil } // SetCapabilitySets attempts to change t's permitted, inheritable, and // effective capability sets. func (t *Task) SetCapabilitySets(permitted, inheritable, effective auth.CapabilitySet) error { t.mu.Lock() defer t.mu.Unlock() // "Permitted: This is a limiting superset for the effective capabilities // that the thread may assume."
- capabilities(7) if effective & ^permitted != 0 { return linuxerr.EPERM } creds := t.Credentials() // "It is also a limiting superset for the capabilities that may be added // to the inheritable set by a thread that does not have the CAP_SETPCAP // capability in its effective set." if !creds.HasCapability(linux.CAP_SETPCAP) && (inheritable & ^(creds.InheritableCaps|creds.PermittedCaps) != 0) { return linuxerr.EPERM } // "If a thread drops a capability from its permitted set, it can never // reacquire that capability (unless it execve(2)s ..." if permitted & ^creds.PermittedCaps != 0 { return linuxerr.EPERM } // "... if a capability is not in the bounding set, then a thread can't add // this capability to its inheritable set, even if it was in its permitted // capabilities ..." if inheritable & ^(creds.InheritableCaps|creds.BoundingCaps) != 0 { return linuxerr.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.PermittedCaps = permitted creds.InheritableCaps = inheritable creds.EffectiveCaps = effective t.creds.Store(creds) return nil } // DropBoundingCapability attempts to drop capability cp from t's capability // bounding set. func (t *Task) DropBoundingCapability(cp linux.Capability) error { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() if !creds.HasCapability(linux.CAP_SETPCAP) { return linuxerr.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.BoundingCaps &^= auth.CapabilitySetOf(cp) t.creds.Store(creds) return nil } // SetUserNamespace attempts to move t into ns. func (t *Task) SetUserNamespace(ns *auth.UserNamespace) error { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials() // "A process reassociating itself with a user namespace must have the // CAP_SYS_ADMIN capability in the target user namespace." - setns(2) // // If t just created ns, then t.creds is guaranteed to have CAP_SYS_ADMIN // in ns (by rule 3 in auth.Credentials.HasCapability). if !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, ns) { return linuxerr.EPERM } creds = creds.Fork() // The credentials object is immutable. See doc for creds. creds.UserNamespace = ns // "The child process created by clone(2) with the CLONE_NEWUSER flag // starts out with a complete set of capabilities in the new user // namespace. Likewise, a process that creates a new user namespace using // unshare(2) or joins an existing user namespace using setns(2) gains a // full set of capabilities in that namespace." creds.PermittedCaps = auth.AllCapabilities creds.InheritableCaps = 0 creds.EffectiveCaps = auth.AllCapabilities creds.BoundingCaps = auth.AllCapabilities // "A call to clone(2), unshare(2), or setns(2) using the CLONE_NEWUSER // flag sets the "securebits" flags (see capabilities(7)) to their default // values (all flags disabled) in the child (for clone(2)) or caller (for // unshare(2) or setns(2))." - user_namespaces(7) creds.KeepCaps = false t.creds.Store(creds) return nil } // SetKeepCaps will set the keep capabilities flag PR_SET_KEEPCAPS. func (t *Task) SetKeepCaps(k bool) { t.mu.Lock() defer t.mu.Unlock() creds := t.Credentials().Fork() // The credentials object is immutable. See doc for creds. creds.KeepCaps = k t.creds.Store(creds) } // updateCredsForExecLocked updates t.creds to reflect an execve(). // // NOTE(b/30815691): We currently do not implement privileged executables // (set-user/group-ID bits and file capabilities).
This allows us to make a lot // of simplifying assumptions: // // - We assume the no_new_privs bit (set by prctl(SET_NO_NEW_PRIVS)), which // disables the features we don't support anyway, is always set. This // drastically simplifies this function. // // - We don't set AT_SECURE = 1, because no_new_privs always being set means // that the conditions that require AT_SECURE = 1 never arise. (Compare Linux's // security/commoncap.c:cap_bprm_set_creds() and cap_bprm_secureexec().) // // - We don't check for CAP_SYS_ADMIN in prctl(PR_SET_SECCOMP), since // seccomp-bpf is also allowed if the task has no_new_privs set. // // - Task.ptraceAttach does not serialize with execve as it does in Linux, // since no_new_privs being set has the same effect as the presence of an // unprivileged tracer. // // Preconditions: t.mu must be locked. func (t *Task) updateCredsForExecLocked() { // """ // During an execve(2), the kernel calculates the new capabilities of // the process using the following algorithm: // // P'(permitted) = (P(inheritable) & F(inheritable)) | // (F(permitted) & cap_bset) // // P'(effective) = F(effective) ? P'(permitted) : 0 // // P'(inheritable) = P(inheritable) [i.e., unchanged] // // where: // // P denotes the value of a thread capability set before the // execve(2) // // P' denotes the value of a thread capability set after the // execve(2) // // F denotes a file capability set // // cap_bset is the value of the capability bounding set // // ... // // In order to provide an all-powerful root using capability sets, during // an execve(2): // // 1. If a set-user-ID-root program is being executed, or the real user ID // of the process is 0 (root) then the file inheritable and permitted sets // are defined to be all ones (i.e. all capabilities enabled). // // 2. If a set-user-ID-root program is being executed, then the file // effective bit is defined to be one (enabled). // // The upshot of the above rules, combined with the capabilities // transformations described above, is that when a process execve(2)s a // set-user-ID-root program, or when a process with an effective UID of 0 // execve(2)s a program, it gains all capabilities in its permitted and // effective capability sets, except those masked out by the capability // bounding set. // """ - capabilities(7) // (ambient capability sets omitted) // // As the last paragraph implies, the case of "a set-user-ID root program // is being executed" also includes the case where (namespace) root is // executing a non-set-user-ID program; the actual check is just based on // the effective user ID. var newPermitted auth.CapabilitySet // since F(inheritable) == F(permitted) == 0 fileEffective := false creds := t.Credentials() root := creds.UserNamespace.MapToKUID(auth.RootUID) if creds.EffectiveKUID == root || creds.RealKUID == root { newPermitted = creds.InheritableCaps | creds.BoundingCaps if creds.EffectiveKUID == root { fileEffective = true } } creds = creds.Fork() // The credentials object is immutable. See doc for creds. // Now we enter poorly-documented, somewhat confusing territory. (The // accompanying comment in Linux's security/commoncap.c:cap_bprm_set_creds // is not very helpful.) My reading of it is: // // If at least one of the following is true: // // A1. The execing task is ptraced, and the tracer did not have // CAP_SYS_PTRACE in the execing task's user namespace at the time of // PTRACE_ATTACH. // // A2. The execing task shares its FS context with at least one task in // another thread group. // // A3. 
The execing task has no_new_privs set. // // AND at least one of the following is true: // // B1. The new effective user ID (which may come from set-user-ID, or be the // execing task's existing effective user ID) is not equal to the task's // real UID. // // B2. The new effective group ID (which may come from set-group-ID, or be // the execing task's existing effective group ID) is not equal to the // task's real GID. // // B3. The new permitted capability set contains capabilities not in the // task's permitted capability set. // // Then: // // C1. Limit the new permitted capability set to the task's permitted // capability set. // // C2. If either the task does not have CAP_SETUID in its user namespace, or // the task has no_new_privs set, force the new effective UID and GID to // the task's real UID and GID. // // But since no_new_privs is always set (A3 is always true), this becomes // much simpler. If B1 and B2 are false, C2 is a no-op. If B3 is false, C1 // is a no-op. So we can just do C1 and C2 unconditionally. if creds.EffectiveKUID != creds.RealKUID || creds.EffectiveKGID != creds.RealKGID { creds.EffectiveKUID = creds.RealKUID creds.EffectiveKGID = creds.RealKGID t.parentDeathSignal = 0 } // (Saved set-user-ID is always set to the new effective user ID, and saved // set-group-ID is always set to the new effective group ID, regardless of // the above.) creds.SavedKUID = creds.RealKUID creds.SavedKGID = creds.RealKGID creds.PermittedCaps &= newPermitted if fileEffective { creds.EffectiveCaps = creds.PermittedCaps } else { creds.EffectiveCaps = 0 } // prctl(2): The "keep capabilities" value will be reset to 0 on subsequent // calls to execve(2). creds.KeepCaps = false // "The bounding set is inherited at fork(2) from the thread's parent, and // is preserved across an execve(2)". So we're done. t.creds.Store(creds) }
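// Illustrative sketch (editorial addition, not sentry code) of the
// capabilities(7) exec transform that updateCredsForExecLocked implements
// above, under the same assumption that file capability sets are empty except
// for the (namespace-)root special case. All names here are hypothetical.
package main

import "fmt"

type capSet uint64

// execCaps returns the post-exec permitted and effective sets. For root, the
// file inheritable/permitted sets are treated as all ones, so P'(permitted)
// collapses to inheritable|bounding; C1 then limits the result to the old
// permitted set.
func execCaps(inheritable, bounding, permitted capSet, realRoot, effectiveRoot bool) (capSet, capSet) {
	var newPermitted capSet // F(inheritable) == F(permitted) == 0 otherwise
	fileEffective := false
	if realRoot || effectiveRoot {
		newPermitted = inheritable | bounding
		fileEffective = effectiveRoot
	}
	newPermitted &= permitted // C1: never grow the permitted set
	if fileEffective {
		return newPermitted, newPermitted
	}
	return newPermitted, 0
}

func main() {
	const all = ^capSet(0)
	p, e := execCaps(0, all, all, false, true)
	fmt.Printf("effective-root exec: permitted=%#x effective=%#x\n", p, e)
	p, e = execCaps(0, all, all, false, false)
	fmt.Printf("non-root exec:       permitted=%#x effective=%#x\n", p, e)
}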
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package fsmetric defines filesystem metrics that are used by both VFS1 and // VFS2. // // TODO(gvisor.dev/issue/1624): Once VFS1 is deleted, inline these metrics into // VFS2. package fsmetric import ( "time" "gvisor.dev/gvisor/pkg/metric" ) // RecordWaitTime enables the ReadWait, GoferReadWait9P, GoferReadWaitHost, and // TmpfsReadWait metrics. Enabling this comes at a CPU cost due to performing // three clock reads per read call. // // Note that this is only performed in the direct read path, and may not be // consistently applied for other forms of reads, such as splice. var RecordWaitTime = false // Metrics that apply to all filesystems. var ( Opens = metric.MustCreateNewUint64Metric("/fs/opens", false /* sync */, "Number of file opens.") Reads = metric.MustCreateNewUint64Metric("/fs/reads", false /* sync */, "Number of file reads.") ReadWait = metric.MustCreateNewUint64NanosecondsMetric("/fs/read_wait", false /* sync */, "Time waiting on file reads, in nanoseconds.") ) // Metrics that only apply to fs/gofer and fsimpl/gofer. var ( GoferOpens9P = metric.MustCreateNewUint64Metric("/gofer/opens_9p", false /* sync */, "Number of times a file was opened from a gofer and did not have a host file descriptor.") GoferOpensHost = metric.MustCreateNewUint64Metric("/gofer/opens_host", false /* sync */, "Number of times a file was opened from a gofer and did have a host file descriptor.") GoferReads9P = metric.MustCreateNewUint64Metric("/gofer/reads_9p", false /* sync */, "Number of 9P file reads from a gofer.") GoferReadWait9P = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_9p", false /* sync */, "Time waiting on 9P file reads from a gofer, in nanoseconds.") GoferReadsHost = metric.MustCreateNewUint64Metric("/gofer/reads_host", false /* sync */, "Number of host file reads from a gofer.") GoferReadWaitHost = metric.MustCreateNewUint64NanosecondsMetric("/gofer/read_wait_host", false /* sync */, "Time waiting on host file reads from a gofer, in nanoseconds.") ) // Metrics that only apply to fs/tmpfs and fsimpl/tmpfs.
var ( TmpfsOpensRO = metric.MustCreateNewUint64Metric("/in_memory_file/opens_ro", false /* sync */, "Number of times an in-memory file was opened in read-only mode.") TmpfsOpensW = metric.MustCreateNewUint64Metric("/in_memory_file/opens_w", false /* sync */, "Number of times an in-memory file was opened in write mode.") TmpfsReads = metric.MustCreateNewUint64Metric("/in_memory_file/reads", false /* sync */, "Number of in-memory file reads.") TmpfsReadWait = metric.MustCreateNewUint64NanosecondsMetric("/in_memory_file/read_wait", false /* sync */, "Time waiting on in-memory file reads, in nanoseconds.") ) // StartReadWait indicates the beginning of a file read. func StartReadWait() time.Time { if !RecordWaitTime { return time.Time{} } return time.Now() } // FinishReadWait indicates the end of a file read whose time is accounted by // m. start must be the value returned by the corresponding call to // StartReadWait. // // FinishReadWait is marked nosplit for performance since it's often called // from defer statements, which prevents it from being inlined // (https://github.com/golang/go/issues/38471). //go:nosplit func FinishReadWait(m *metric.Uint64Metric, start time.Time) { if !RecordWaitTime { return } m.IncrementBy(uint64(time.Since(start).Nanoseconds())) }
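// Illustrative sketch (editorial addition): the bracketing pattern that
// callers of StartReadWait/FinishReadWait above use, with a local stand-in
// for metric.Uint64Metric so the example is self-contained. In the sentry the
// metric would be e.g. fsmetric.ReadWait.
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

var recordWaitTime = true // mirrors fsmetric.RecordWaitTime

// uint64Metric is a stand-in exposing the one method finishReadWait needs.
type uint64Metric struct{ ns uint64 }

func (m *uint64Metric) IncrementBy(v uint64) { atomic.AddUint64(&m.ns, v) }

func startReadWait() time.Time {
	if !recordWaitTime {
		return time.Time{}
	}
	return time.Now()
}

func finishReadWait(m *uint64Metric, start time.Time) {
	if !recordWaitTime {
		return
	}
	m.IncrementBy(uint64(time.Since(start).Nanoseconds()))
}

func main() {
	var readWait uint64Metric
	start := startReadWait()
	defer func() {
		fmt.Printf("accumulated read wait: %dns\n", atomic.LoadUint64(&readWait.ns))
	}()
	defer finishReadWait(&readWait, start)
	time.Sleep(time.Millisecond) // stand-in for the blocking part of a read
}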
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs2 import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // Splice implements Linux syscall splice(2).
func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { inFD := args[0].Int() inOffsetPtr := args[1].Pointer() outFD := args[2].Int() outOffsetPtr := args[3].Pointer() count := int64(args[4].SizeT()) flags := args[5].Int() if count == 0 { return 0, nil, nil } if count > int64(kernel.MAX_RW_COUNT) { count = int64(kernel.MAX_RW_COUNT) } if count < 0 { return 0, nil, linuxerr.EINVAL } // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { return 0, nil, linuxerr.EINVAL } // Get file descriptions. inFile := t.GetFileVFS2(inFD) if inFile == nil { return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) outFile := t.GetFileVFS2(outFD) if outFile == nil { return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { return 0, nil, linuxerr.EBADF } // The operation is non-blocking if anything is non-blocking. // // N.B. This is a rather simplistic heuristic that avoids some // poor edge case behavior since the exact semantics here are // underspecified and vary between versions of Linux itself. nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0) // At least one file description must represent a pipe. inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) if !inIsPipe && !outIsPipe { return 0, nil, linuxerr.EINVAL } // Copy in offsets. inOffset := int64(-1) if inOffsetPtr != 0 { if inIsPipe { return 0, nil, linuxerr.ESPIPE } if inFile.Options().DenyPRead { return 0, nil, linuxerr.EINVAL } if _, err := primitive.CopyInt64In(t, inOffsetPtr, &inOffset); err != nil { return 0, nil, err } if inOffset < 0 { return 0, nil, linuxerr.EINVAL } } outOffset := int64(-1) if outOffsetPtr != 0 { if outIsPipe { return 0, nil, linuxerr.ESPIPE } if outFile.Options().DenyPWrite { return 0, nil, linuxerr.EINVAL } if _, err := primitive.CopyInt64In(t, outOffsetPtr, &outOffset); err != nil { return 0, nil, err } if outOffset < 0 { return 0, nil, linuxerr.EINVAL } } // Move data. var ( n int64 err error ) dw := dualWaiter{ inFile: inFile, outFile: outFile, } defer dw.destroy() for { // If both input and output are pipes, delegate to the pipe // implementation. Otherwise, exactly one end is a pipe, which // we ensure is consistently ordered after the non-pipe FD's // locks by passing the pipe FD as usermem.IO to the non-pipe // end. switch { case inIsPipe && outIsPipe: n, err = pipe.Splice(t, outPipeFD, inPipeFD, count) case inIsPipe: n, err = inPipeFD.SpliceToNonPipe(t, outFile, outOffset, count) if outOffset != -1 { outOffset += n } case outIsPipe: n, err = outPipeFD.SpliceFromNonPipe(t, inFile, inOffset, count) if inOffset != -1 { inOffset += n } default: panic("at least one end of splice must be a pipe") } if n != 0 || err != syserror.ErrWouldBlock || nonBlock { break } if err = dw.waitForBoth(t); err != nil { break } } // Copy updated offsets out. if inOffsetPtr != 0 { if _, err := primitive.CopyInt64Out(t, inOffsetPtr, inOffset); err != nil { return 0, nil, err } } if outOffsetPtr != 0 { if _, err := primitive.CopyInt64Out(t, outOffsetPtr, outOffset); err != nil { return 0, nil, err } } // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. 
return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "splice", outFile) } // Tee implements Linux syscall tee(2). func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { inFD := args[0].Int() outFD := args[1].Int() count := int64(args[2].SizeT()) flags := args[3].Int() if count == 0 { return 0, nil, nil } if count > int64(kernel.MAX_RW_COUNT) { count = int64(kernel.MAX_RW_COUNT) } if count < 0 { return 0, nil, linuxerr.EINVAL } // Check for invalid flags. if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { return 0, nil, linuxerr.EINVAL } // Get file descriptions. inFile := t.GetFileVFS2(inFD) if inFile == nil { return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) outFile := t.GetFileVFS2(outFD) if outFile == nil { return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) // Check that both files support the required directionality. if !inFile.IsReadable() || !outFile.IsWritable() { return 0, nil, linuxerr.EBADF } // The operation is non-blocking if anything is non-blocking. // // N.B. This is a rather simplistic heuristic that avoids some // poor edge case behavior since the exact semantics here are // underspecified and vary between versions of Linux itself. nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0) // Both file descriptions must represent pipes. inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) if !inIsPipe || !outIsPipe { return 0, nil, linuxerr.EINVAL } // Copy data. var ( n int64 err error ) dw := dualWaiter{ inFile: inFile, outFile: outFile, } defer dw.destroy() for { n, err = pipe.Tee(t, outPipeFD, inPipeFD, count) if n != 0 || err != syserror.ErrWouldBlock || nonBlock { break } if err = dw.waitForBoth(t); err != nil { break } } if n != 0 { // If a partial write is completed, the error is dropped. Log it here. if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { log.Debugf("tee completed a partial write with error: %v", err) err = nil } } // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "tee", inFile) } // Sendfile implements linux system call sendfile(2). func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { outFD := args[0].Int() inFD := args[1].Int() offsetAddr := args[2].Pointer() count := int64(args[3].SizeT()) inFile := t.GetFileVFS2(inFD) if inFile == nil { return 0, nil, linuxerr.EBADF } defer inFile.DecRef(t) if !inFile.IsReadable() { return 0, nil, linuxerr.EBADF } outFile := t.GetFileVFS2(outFD) if outFile == nil { return 0, nil, linuxerr.EBADF } defer outFile.DecRef(t) if !outFile.IsWritable() { return 0, nil, linuxerr.EBADF } // Verify that the outFile Append flag is not set. if outFile.StatusFlags()&linux.O_APPEND != 0 { return 0, nil, linuxerr.EINVAL } // Verify that inFile is a regular file or block device. This is a // requirement; the same check appears in Linux // (fs/splice.c:splice_direct_to_actor). 
if stat, err := inFile.Stat(t, vfs.StatOptions{Mask: linux.STATX_TYPE}); err != nil { return 0, nil, err } else if stat.Mask&linux.STATX_TYPE == 0 || (stat.Mode&linux.S_IFMT != linux.S_IFREG && stat.Mode&linux.S_IFMT != linux.S_IFBLK) { return 0, nil, linuxerr.EINVAL } // Copy offset if it exists. offset := int64(-1) if offsetAddr != 0 { if inFile.Options().DenyPRead { return 0, nil, linuxerr.ESPIPE } var offsetP primitive.Int64 if _, err := offsetP.CopyIn(t, offsetAddr); err != nil { return 0, nil, err } offset = int64(offsetP) if offset < 0 { return 0, nil, linuxerr.EINVAL } if offset+count < 0 { return 0, nil, linuxerr.EINVAL } } // Validate count. This must come after offset checks. if count < 0 { return 0, nil, linuxerr.EINVAL } if count == 0 { return 0, nil, nil } if count > int64(kernel.MAX_RW_COUNT) { count = int64(kernel.MAX_RW_COUNT) } // Copy data. var ( total int64 err error ) dw := dualWaiter{ inFile: inFile, outFile: outFile, } defer dw.destroy() outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) // Reading from input file should never block, since it is regular or // block device. We only need to check if writing to the output file // can block. nonBlock := outFile.StatusFlags()&linux.O_NONBLOCK != 0 if outIsPipe { for { var n int64 n, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count-total) if offset != -1 { offset += n } total += n if total == count { break } if err == nil && t.Interrupted() { err = syserror.ErrInterrupted break } if err == syserror.ErrWouldBlock && !nonBlock { err = dw.waitForBoth(t) } if err != nil { break } } } else { // Read inFile to buffer, then write the contents to outFile. buf := make([]byte, count) for { var readN int64 if offset != -1 { readN, err = inFile.PRead(t, usermem.BytesIOSequence(buf), offset, vfs.ReadOptions{}) offset += readN } else { readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) } // Write all of the bytes that we read. This may need // multiple write calls to complete. wbuf := buf[:readN] for len(wbuf) > 0 { var writeN int64 writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{}) wbuf = wbuf[writeN:] if err == syserror.ErrWouldBlock && !nonBlock { err = dw.waitForOut(t) } if err != nil { // We didn't complete the write. Only report the bytes that were actually // written, and rewind offsets as needed. notWritten := int64(len(wbuf)) readN -= notWritten if offset == -1 { // We modified the offset of the input file itself during the read // operation. Rewind it. if _, seekErr := inFile.Seek(t, -notWritten, linux.SEEK_CUR); seekErr != nil { // Log the error but don't return it, since the write has already // completed successfully. log.Warningf("failed to roll back input file offset: %v", seekErr) } } else { // The sendfile call was provided an offset parameter that should be // adjusted to reflect the number of bytes sent. Rewind it. offset -= notWritten } break } } total += readN buf = buf[readN:] if total == count { break } if err == nil && t.Interrupted() { err = syserror.ErrInterrupted break } if err == syserror.ErrWouldBlock && !nonBlock { err = dw.waitForBoth(t) } if err != nil { break } } } if offsetAddr != 0 { // Copy out the new offset. offsetP := primitive.Uint64(offset) if _, err := offsetP.CopyOut(t, offsetAddr); err != nil { return 0, nil, err } } if total != 0 { if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { // If a partial write is completed, the error is dropped. Log it here. 
log.Debugf("sendfile completed a partial write with error: %v", err) err = nil } } // We can only pass a single file to handleIOError, so pick inFile arbitrarily. // This is used only for debugging purposes. return uintptr(total), nil, slinux.HandleIOErrorVFS2(t, total != 0, err, syserror.ERESTARTSYS, "sendfile", inFile) } // dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not // thread-safe, and does not take a reference on the vfs.FileDescriptions. // // Users must call destroy() when finished. type dualWaiter struct { inFile *vfs.FileDescription outFile *vfs.FileDescription inW waiter.Entry inCh chan struct{} outW waiter.Entry outCh chan struct{} } // waitForBoth waits for both dw.inFile and dw.outFile to be ready. func (dw *dualWaiter) waitForBoth(t *kernel.Task) error { if dw.inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { if dw.inCh == nil { dw.inW, dw.inCh = waiter.NewChannelEntry(nil) dw.inFile.EventRegister(&dw.inW, eventMaskRead) // We might be ready now. Try again before blocking. return nil } if err := t.Block(dw.inCh); err != nil { return err } } return dw.waitForOut(t) } // waitForOut waits for dw.outfile to be read. func (dw *dualWaiter) waitForOut(t *kernel.Task) error { // Don't bother checking readiness of the outFile, because it's not a // guarantee that it won't return EWOULDBLOCK. Both pipes and eventfds // can be "ready" but will reject writes of certain sizes with // EWOULDBLOCK. See b/172075629, b/170743336. if dw.outCh == nil { dw.outW, dw.outCh = waiter.NewChannelEntry(nil) dw.outFile.EventRegister(&dw.outW, eventMaskWrite) // We might be ready to write now. Try again before blocking. return nil } return t.Block(dw.outCh) } // destroy cleans up resources help by dw. No more calls to wait* can occur // after destroy is called. func (dw *dualWaiter) destroy() { if dw.inCh != nil { dw.inFile.EventUnregister(&dw.inW) dw.inCh = nil } if dw.outCh != nil { dw.outFile.EventUnregister(&dw.outW) dw.outCh = nil } dw.inFile = nil dw.outFile = nil }
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package sniffer provides the implementation of data-link layer endpoints that // wrap another endpoint and log inbound and outbound packets. // // Sniffer endpoints can be used in the networking stack by calling New(eID) to // create a new endpoint, where eID is the ID of the endpoint being wrapped, // and then passing it as an argument to Stack.CreateNIC(). package sniffer import ( "encoding/binary" "fmt" "io" "sync/atomic" "time" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/link/nested" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // LogPackets is a flag used to enable or disable packet logging via the log // package. Valid values are 0 or 1. // // LogPackets must be accessed atomically. var LogPackets uint32 = 1 // LogPacketsToPCAP is a flag used to enable or disable logging packets to a // pcap writer. Valid values are 0 or 1. A writer must have been specified when the // sniffer was created for this flag to have effect. // // LogPacketsToPCAP must be accessed atomically.
var LogPacketsToPCAP uint32 = 1 type endpoint struct { nested.Endpoint writer io.Writer maxPCAPLen uint32 logPrefix string } var _ stack.GSOEndpoint = (*endpoint)(nil) var _ stack.LinkEndpoint = (*endpoint)(nil) var _ stack.NetworkDispatcher = (*endpoint)(nil) type direction int const ( directionSend = iota directionRecv ) // New creates a new sniffer link-layer endpoint. It wraps around another // endpoint and logs packets as they traverse the endpoint. func New(lower stack.LinkEndpoint) stack.LinkEndpoint { return NewWithPrefix(lower, "") } // NewWithPrefix creates a new sniffer link-layer endpoint. It wraps around // another endpoint and logs packets prefixed with logPrefix as they traverse // the endpoint. // // logPrefix is prepended to the log line without any separators. // E.g. logPrefix = "NIC:en0/" will produce log lines like // "NIC:en0/send udp [...]". func NewWithPrefix(lower stack.LinkEndpoint, logPrefix string) stack.LinkEndpoint { sniffer := &endpoint{logPrefix: logPrefix} sniffer.Endpoint.Init(lower, sniffer) return sniffer } func zoneOffset() (int32, error) { date := time.Date(0, 0, 0, 0, 0, 0, 0, time.Local) _, offset := date.Zone() return int32(offset), nil } func writePCAPHeader(w io.Writer, maxLen uint32) error { offset, err := zoneOffset() if err != nil { return err } return binary.Write(w, binary.BigEndian, pcapHeader{ // From https://wiki.wireshark.org/Development/LibpcapFileFormat MagicNumber: 0xa1b2c3d4, VersionMajor: 2, VersionMinor: 4, Thiszone: offset, Sigfigs: 0, Snaplen: maxLen, Network: 101, // LINKTYPE_RAW }) } // NewWithWriter creates a new sniffer link-layer endpoint. It wraps around // another endpoint and logs packets as they traverse the endpoint. // // Each packet is written to writer in the pcap format in a single Write call // without synchronization. A sniffer created with this function will not emit // packets using the standard log package. // // snapLen is the maximum amount of a packet to be saved. Packets with a length // less than or equal to snapLen will be saved in their entirety. Longer // packets will be truncated to snapLen. func NewWithWriter(lower stack.LinkEndpoint, writer io.Writer, snapLen uint32) (stack.LinkEndpoint, error) { if err := writePCAPHeader(writer, snapLen); err != nil { return nil, err } sniffer := &endpoint{ writer: writer, maxPCAPLen: snapLen, } sniffer.Endpoint.Init(lower, sniffer) return sniffer, nil } // DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is // called by the link-layer endpoint being wrapped when a packet arrives, and // logs the packet before forwarding to the actual dispatcher. func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { e.dumpPacket(directionRecv, protocol, pkt) e.Endpoint.DeliverNetworkPacket(remote, local, protocol, pkt) } // DeliverOutboundPacket implements stack.NetworkDispatcher.DeliverOutboundPacket.
func (e *endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { e.Endpoint.DeliverOutboundPacket(remote, local, protocol, pkt) } func (e *endpoint) dumpPacket(dir direction, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { writer := e.writer if writer == nil && atomic.LoadUint32(&LogPackets) == 1 { logPacket(e.logPrefix, dir, protocol, pkt) } if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 { totalLength := pkt.Size() length := totalLength if max := int(e.maxPCAPLen); length > max { length = max } packetHeader := newPCAPPacketHeader(time.Now(), uint32(length), uint32(totalLength)) packet := make([]byte, binary.Size(packetHeader)+length) { writer := tcpip.SliceWriter(packet) if err := binary.Write(&writer, binary.BigEndian, packetHeader); err != nil { panic(err) } for _, b := range pkt.Views() { if length == 0 { break } if len(b) > length { b = b[:length] } n, err := writer.Write(b) if err != nil { panic(err) } length -= n } } if _, err := writer.Write(packet); err != nil { panic(err) } } } // WritePacket implements the stack.LinkEndpoint interface. It is called by // higher-level protocols to write packets; it just logs the packet and // forwards the request to the lower endpoint. func (e *endpoint) WritePacket(r stack.RouteInfo, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) tcpip.Error { e.dumpPacket(directionSend, protocol, pkt) return e.Endpoint.WritePacket(r, protocol, pkt) } // WritePackets implements the stack.LinkEndpoint interface. It is called by // higher-level protocols to write packets; it just logs the packet and // forwards the request to the lower endpoint. func (e *endpoint) WritePackets(r stack.RouteInfo, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, tcpip.Error) { for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { e.dumpPacket(directionSend, protocol, pkt) } return e.Endpoint.WritePackets(r, pkts, protocol) } func logPacket(prefix string, dir direction, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { // Figure out the network layer info. var transProto uint8 src := tcpip.Address("unknown") dst := tcpip.Address("unknown") var size uint16 var id uint32 var fragmentOffset uint16 var moreFragments bool var directionPrefix string switch dir { case directionSend: directionPrefix = "send" case directionRecv: directionPrefix = "recv" default: panic(fmt.Sprintf("unrecognized direction: %d", dir)) } // Build a fresh packet buffer from the original's views so that the // original is not modified. We deliberately avoid the packet buffer's // clone operation: a buffer reconstructed from the raw data views has // none of its headers set, which is what the parsing below expects. // // We also trim the link headers from the new buffer, as the sniffer // doesn't handle link headers.
vv := buffer.NewVectorisedView(pkt.Size(), pkt.Views()) vv.TrimFront(len(pkt.LinkHeader().View())) pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{Data: vv}) switch protocol { case header.IPv4ProtocolNumber: if ok := parse.IPv4(pkt); !ok { return } ipv4 := header.IPv4(pkt.NetworkHeader().View()) fragmentOffset = ipv4.FragmentOffset() moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments src = ipv4.SourceAddress() dst = ipv4.DestinationAddress() transProto = ipv4.Protocol() size = ipv4.TotalLength() - uint16(ipv4.HeaderLength()) id = uint32(ipv4.ID()) case header.IPv6ProtocolNumber: proto, fragID, fragOffset, fragMore, ok := parse.IPv6(pkt) if !ok { return } ipv6 := header.IPv6(pkt.NetworkHeader().View()) src = ipv6.SourceAddress() dst = ipv6.DestinationAddress() transProto = uint8(proto) size = ipv6.PayloadLength() id = fragID moreFragments = fragMore fragmentOffset = fragOffset case header.ARPProtocolNumber: if !parse.ARP(pkt) { return } arp := header.ARP(pkt.NetworkHeader().View()) log.Infof( "%s%s arp %s (%s) -> %s (%s) valid:%t", prefix, directionPrefix, tcpip.Address(arp.ProtocolAddressSender()), tcpip.LinkAddress(arp.HardwareAddressSender()), tcpip.Address(arp.ProtocolAddressTarget()), tcpip.LinkAddress(arp.HardwareAddressTarget()), arp.IsValid(), ) return default: log.Infof("%s%s unknown network protocol", prefix, directionPrefix) return } // Figure out the transport layer info. transName := "unknown" srcPort := uint16(0) dstPort := uint16(0) details := "" switch tcpip.TransportProtocolNumber(transProto) { case header.ICMPv4ProtocolNumber: transName = "icmp" hdr, ok := pkt.Data().PullUp(header.ICMPv4MinimumSize) if !ok { break } icmp := header.ICMPv4(hdr) icmpType := "unknown" if fragmentOffset == 0 { switch icmp.Type() { case header.ICMPv4EchoReply: icmpType = "echo reply" case header.ICMPv4DstUnreachable: icmpType = "destination unreachable" case header.ICMPv4SrcQuench: icmpType = "source quench" case header.ICMPv4Redirect: icmpType = "redirect" case header.ICMPv4Echo: icmpType = "echo" case header.ICMPv4TimeExceeded: icmpType = "time exceeded" case header.ICMPv4ParamProblem: icmpType = "param problem" case header.ICMPv4Timestamp: icmpType = "timestamp" case header.ICMPv4TimestampReply: icmpType = "timestamp reply" case header.ICMPv4InfoRequest: icmpType = "info request" case header.ICMPv4InfoReply: icmpType = "info reply" } } log.Infof("%s%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, directionPrefix, transName, src, dst, icmpType, size, id, icmp.Code()) return case header.ICMPv6ProtocolNumber: transName = "icmp" hdr, ok := pkt.Data().PullUp(header.ICMPv6MinimumSize) if !ok { break } icmp := header.ICMPv6(hdr) icmpType := "unknown" switch icmp.Type() { case header.ICMPv6DstUnreachable: icmpType = "destination unreachable" case header.ICMPv6PacketTooBig: icmpType = "packet too big" case header.ICMPv6TimeExceeded: icmpType = "time exceeded" case header.ICMPv6ParamProblem: icmpType = "param problem" case header.ICMPv6EchoRequest: icmpType = "echo request" case header.ICMPv6EchoReply: icmpType = "echo reply" case header.ICMPv6RouterSolicit: icmpType = "router solicit" case header.ICMPv6RouterAdvert: icmpType = "router advert" case header.ICMPv6NeighborSolicit: icmpType = "neighbor solicit" case header.ICMPv6NeighborAdvert: icmpType = "neighbor advert" case header.ICMPv6RedirectMsg: icmpType = "redirect message" } log.Infof("%s%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, directionPrefix, transName, src, dst, icmpType, size, id, 
icmp.Code()) return case header.UDPProtocolNumber: transName = "udp" if ok := parse.UDP(pkt); !ok { break } udp := header.UDP(pkt.TransportHeader().View()) if fragmentOffset == 0 { srcPort = udp.SourcePort() dstPort = udp.DestinationPort() details = fmt.Sprintf("xsum: 0x%x", udp.Checksum()) size -= header.UDPMinimumSize } case header.TCPProtocolNumber: transName = "tcp" if ok := parse.TCP(pkt); !ok { break } tcp := header.TCP(pkt.TransportHeader().View()) if fragmentOffset == 0 { offset := int(tcp.DataOffset()) if offset < header.TCPMinimumSize { details += fmt.Sprintf("invalid packet: tcp data offset too small %d", offset) break } if size := pkt.Data().Size() + len(tcp); offset > size && !moreFragments { details += fmt.Sprintf("invalid packet: tcp data offset %d larger than tcp packet length %d", offset, size) break } srcPort = tcp.SourcePort() dstPort = tcp.DestinationPort() size -= uint16(offset) // Initialize the TCP flags. flags := tcp.Flags() details = fmt.Sprintf("flags: %s seqnum: %d ack: %d win: %d xsum:0x%x", flags, tcp.SequenceNumber(), tcp.AckNumber(), tcp.WindowSize(), tcp.Checksum()) if flags&header.TCPFlagSyn != 0 { details += fmt.Sprintf(" options: %+v", header.ParseSynOptions(tcp.Options(), flags&header.TCPFlagAck != 0)) } else { details += fmt.Sprintf(" options: %+v", tcp.ParsedOptions()) } } default: log.Infof("%s%s %s -> %s unknown transport protocol: %d", prefix, directionPrefix, src, dst, transProto) return } if pkt.GSOOptions.Type != stack.GSONone { details += fmt.Sprintf(" gso: %#v", pkt.GSOOptions) } log.Infof("%s%s %s %s:%d -> %s:%d len:%d id:%04x %s", prefix, directionPrefix, transName, src, srcPort, dst, dstPort, size, id, details) }
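
// The following is an illustrative usage sketch, not part of the original
// file: it shows how a caller might wire a sniffer between a NIC's link
// endpoint and the rest of the stack. The lower endpoint and pcap writer are
// assumed to be supplied by the caller; the "NIC:en0/" prefix and 1500-byte
// snap length are arbitrary example values.
func exampleWrapWithSniffer(lower stack.LinkEndpoint, pcap io.Writer) (stack.LinkEndpoint, error) {
	// Log packets via the standard log package, prefixing each line so
	// multiple NICs can be told apart.
	logged := NewWithPrefix(lower, "NIC:en0/")
	_ = logged

	// Alternatively, stream packets to a pcap writer, truncating each
	// captured packet to 1500 bytes.
	return NewWithWriter(lower, pcap, 1500)
}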
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package bits includes all bit related types and operations.
package bits

// AlignUp rounds a length up to an alignment. align must be a power of 2.
func AlignUp(length int, align uint) int {
	return (length + int(align) - 1) & ^(int(align) - 1)
}

// AlignDown rounds a length down to an alignment. align must be a power of 2.
func AlignDown(length int, align uint) int {
	return length & ^(int(align) - 1)
}
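
// Illustrative sketch, not part of the original file: how the helpers above
// behave for a non-power-of-two length and an 8-byte alignment.
func exampleAlign() (int, int) {
	// AlignUp(10, 8) == 16: the smallest multiple of 8 that is >= 10.
	// AlignDown(10, 8) == 8: the largest multiple of 8 that is <= 10.
	return AlignUp(10, 8), AlignDown(10, 8)
}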
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package unix provides an implementation of the socket.Socket interface for
// the AF_UNIX protocol family.
package unix

import (
	"fmt"
	"strings"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/socket"
	"gvisor.dev/gvisor/pkg/sentry/socket/control"
	"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

// SocketOperations is a Unix socket. It is similar to a netstack socket,
// except it is backed by a transport.Endpoint instead of a tcpip.Endpoint.
//
// +stateify savable
type SocketOperations struct {
	fsutil.FilePipeSeek             `state:"nosave"`
	fsutil.FileNotDirReaddir        `state:"nosave"`
	fsutil.FileNoFsync              `state:"nosave"`
	fsutil.FileNoMMap               `state:"nosave"`
	fsutil.FileNoSplice             `state:"nosave"`
	fsutil.FileNoopFlush            `state:"nosave"`
	fsutil.FileUseInodeUnstableAttr `state:"nosave"`

	socketOperationsRefs
	socketOpsCommon
}

// New creates a new unix socket.
func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) *fs.File {
	dirent := socket.NewDirent(ctx, unixSocketDevice)
	defer dirent.DecRef(ctx)
	return NewWithDirent(ctx, dirent, endpoint, stype, fs.FileFlags{Read: true, Write: true, NonSeekable: true})
}

// NewWithDirent creates a new unix socket using an existing dirent.
func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, stype linux.SockType, flags fs.FileFlags) *fs.File {
	// You can create AF_UNIX, SOCK_RAW sockets. They're the same as
	// SOCK_DGRAM and don't require CAP_NET_RAW.
	if stype == linux.SOCK_RAW {
		stype = linux.SOCK_DGRAM
	}

	s := SocketOperations{
		socketOpsCommon: socketOpsCommon{
			ep:    ep,
			stype: stype,
		},
	}
	s.InitRefs()

	return fs.NewFile(ctx, d, flags, &s)
}

// DecRef implements RefCounter.DecRef.
func (s *SocketOperations) DecRef(ctx context.Context) {
	s.socketOperationsRefs.DecRef(func() {
		s.ep.Close(ctx)
		if s.abstractNamespace != nil {
			s.abstractNamespace.Remove(s.abstractName, s)
		}
	})
}

// Release implements fs.FileOperations.Release.
func (s *SocketOperations) Release(ctx context.Context) {
	// Release only decrements a reference on s because s may be referenced in
	// the abstract socket namespace.
	s.DecRef(ctx)
}

// socketOpsCommon contains the socket operations common to VFS1 and VFS2.
//
// +stateify savable
type socketOpsCommon struct {
	socket.SendReceiveTimeout

	ep    transport.Endpoint
	stype linux.SockType

	// abstractName and abstractNamespace indicate the name and namespace of the
	// socket if it is bound to an abstract socket namespace. Once the socket is
	// bound, they cannot be modified.
	abstractName      string
	abstractNamespace *kernel.AbstractSocketNamespace
}

func (s *socketOpsCommon) isPacket() bool {
	switch s.stype {
	case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET:
		return true
	case linux.SOCK_STREAM:
		return false
	default:
		// We shouldn't have allowed any other socket types during creation.
		panic(fmt.Sprintf("Invalid socket type %d", s.stype))
	}
}

// Endpoint extracts the transport.Endpoint.
func (s *socketOpsCommon) Endpoint() transport.Endpoint {
	return s.ep
}

// extractPath extracts and validates the address.
func extractPath(sockaddr []byte) (string, *syserr.Error) {
	addr, family, err := socket.AddressAndFamily(sockaddr)
	if err != nil {
		if err == syserr.ErrAddressFamilyNotSupported {
			err = syserr.ErrInvalidArgument
		}
		return "", err
	}
	if family != linux.AF_UNIX {
		return "", syserr.ErrInvalidArgument
	}

	// The address is trimmed by GetAddress.
	p := string(addr.Addr)
	if p == "" {
		// Not allowed.
		return "", syserr.ErrInvalidArgument
	}
	if p[len(p)-1] == '/' {
		// Weird, they tried to bind '/a/b/c/'?
		return "", syserr.ErrIsDir
	}

	return p, nil
}

// GetPeerName implements the linux syscall getpeername(2) for sockets backed by
// a transport.Endpoint.
func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
	addr, err := s.ep.GetRemoteAddress()
	if err != nil {
		return nil, 0, syserr.TranslateNetstackError(err)
	}

	a, l := socket.ConvertAddress(linux.AF_UNIX, addr)
	return a, l, nil
}

// GetSockName implements the linux syscall getsockname(2) for sockets backed by
// a transport.Endpoint.
func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
	addr, err := s.ep.GetLocalAddress()
	if err != nil {
		return nil, 0, syserr.TranslateNetstackError(err)
	}

	a, l := socket.ConvertAddress(linux.AF_UNIX, addr)
	return a, l, nil
}

// Ioctl implements fs.FileOperations.Ioctl.
func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	return netstack.Ioctl(ctx, s.ep, io, args)
}

// GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
// a transport.Endpoint.
func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outPtr, outLen)
}

// Listen implements the linux syscall listen(2) for sockets backed by
// a transport.Endpoint.
func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
	return s.ep.Listen(backlog)
}

// blockingAccept implements a blocking version of accept(2), that is, if no
// connections are ready to be accepted, it will block until one becomes ready.
func (s *SocketOperations) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) {
	// Register for notifications.
	e, ch := waiter.NewChannelEntry(nil)
	s.EventRegister(&e, waiter.ReadableEvents)
	defer s.EventUnregister(&e)

	// Try to accept the connection; if it fails, then wait until we get a
	// notification.
	for {
		if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock {
			return ep, err
		}

		if err := t.Block(ch); err != nil {
			return nil, syserr.FromError(err)
		}
	}
}

// Accept implements the linux syscall accept(2) for sockets backed by
// a transport.Endpoint.
func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) { var peerAddr *tcpip.FullAddress if peerRequested { peerAddr = &tcpip.FullAddress{} } ep, err := s.ep.Accept(peerAddr) if err != nil { if err != syserr.ErrWouldBlock || !blocking { return 0, nil, 0, err } var err *syserr.Error ep, err = s.blockingAccept(t, peerAddr) if err != nil { return 0, nil, 0, err } } ns := New(t, ep, s.stype) defer ns.DecRef(t) if flags&linux.SOCK_NONBLOCK != 0 { flags := ns.Flags() flags.NonBlocking = true ns.SetFlags(flags.Settable()) } var addr linux.SockAddr var addrLen uint32 if peerAddr != nil { addr, addrLen = socket.ConvertAddress(linux.AF_UNIX, *peerAddr) } fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{ CloseOnExec: flags&linux.SOCK_CLOEXEC != 0, }) if e != nil { return 0, nil, 0, syserr.FromError(e) } t.Kernel().RecordSocket(ns) return fd, addr, addrLen, nil } // Bind implements the linux syscall bind(2) for unix sockets. func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error { p, e := extractPath(sockaddr) if e != nil { return e } bep, ok := s.ep.(transport.BoundEndpoint) if !ok { // This socket can't be bound. return syserr.ErrInvalidArgument } return s.ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(p)}, func() *syserr.Error { // Is it abstract? if p[0] == 0 { if t.IsNetworkNamespaced() { return syserr.ErrInvalidEndpointState } asn := t.AbstractSockets() name := p[1:] if err := asn.Bind(t, name, bep, s); err != nil { // syserr.ErrPortInUse corresponds to EADDRINUSE. return syserr.ErrPortInUse } s.abstractName = name s.abstractNamespace = asn } else { // The parent and name. var d *fs.Dirent var name string cwd := t.FSContext().WorkingDirectory() defer cwd.DecRef(t) // Is there no slash at all? if !strings.Contains(p, "/") { d = cwd name = p } else { root := t.FSContext().RootDirectory() defer root.DecRef(t) // Find the last path component, we know that something follows // that final slash, otherwise extractPath() would have failed. lastSlash := strings.LastIndex(p, "/") subPath := p[:lastSlash] if subPath == "" { // Fix up subpath in case file is in root. subPath = "/" } var err error remainingTraversals := uint(fs.DefaultTraversalLimit) d, err = t.MountNamespace().FindInode(t, root, cwd, subPath, &remainingTraversals) if err != nil { // No path available. return syserr.ErrNoSuchFile } defer d.DecRef(t) name = p[lastSlash+1:] } // Create the socket. // // Note that the file permissions here are not set correctly (see // gvisor.dev/issue/2324). There is no convenient way to get permissions // on the socket referred to by s, so we will leave this discrepancy // unresolved until VFS2 replaces this code. childDir, err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}}) if err != nil { return syserr.ErrPortInUse } childDir.DecRef(t) } return nil }) } // extractEndpoint retrieves the transport.BoundEndpoint associated with a Unix // socket path. The Release must be called on the transport.BoundEndpoint when // the caller is done with it. func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, *syserr.Error) { path, err := extractPath(sockaddr) if err != nil { return nil, err } // Is it abstract? if path[0] == 0 { if t.IsNetworkNamespaced() { return nil, syserr.ErrInvalidArgument } ep := t.AbstractSockets().BoundEndpoint(path[1:]) if ep == nil { // No socket found. 
return nil, syserr.ErrConnectionRefused } return ep, nil } if kernel.VFS2Enabled { p := fspath.Parse(path) root := t.FSContext().RootDirectoryVFS2() start := root relPath := !p.Absolute if relPath { start = t.FSContext().WorkingDirectoryVFS2() } pop := vfs.PathOperation{ Root: root, Start: start, Path: p, FollowFinalSymlink: true, } ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop, &vfs.BoundEndpointOptions{path}) root.DecRef(t) if relPath { start.DecRef(t) } if e != nil { return nil, syserr.FromError(e) } return ep, nil } // Find the node in the filesystem. root := t.FSContext().RootDirectory() cwd := t.FSContext().WorkingDirectory() remainingTraversals := uint(fs.DefaultTraversalLimit) d, e := t.MountNamespace().FindInode(t, root, cwd, path, &remainingTraversals) cwd.DecRef(t) root.DecRef(t) if e != nil { return nil, syserr.FromError(e) } // Extract the endpoint if one is there. ep := d.Inode.BoundEndpoint(path) d.DecRef(t) if ep == nil { // No socket! return nil, syserr.ErrConnectionRefused } return ep, nil } // Connect implements the linux syscall connect(2) for unix sockets. func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error { ep, err := extractEndpoint(t, sockaddr) if err != nil { return err } defer ep.Release(t) // Connect the server endpoint. err = s.ep.Connect(t, ep) if err == syserr.ErrWrongProtocolForSocket { // Linux for abstract sockets returns ErrConnectionRefused // instead of ErrWrongProtocolForSocket. path, _ := extractPath(sockaddr) if len(path) > 0 && path[0] == 0 { err = syserr.ErrConnectionRefused } } return err } // Write implements fs.FileOperations.Write. func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { t := kernel.TaskFromContext(ctx) ctrl := control.New(t, s.ep, nil) if src.NumBytes() == 0 { nInt, err := s.ep.SendMsg(ctx, [][]byte{}, ctrl, nil) return int64(nInt), err.ToError() } return src.CopyInTo(ctx, &EndpointWriter{ Ctx: ctx, Endpoint: s.ep, Control: ctrl, To: nil, }) } // SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by // a transport.Endpoint. func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { w := EndpointWriter{ Ctx: t, Endpoint: s.ep, Control: controlMessages.Unix, To: nil, } if len(to) > 0 { switch s.stype { case linux.SOCK_SEQPACKET: // to is ignored. case linux.SOCK_STREAM: if s.State() == linux.SS_CONNECTED { return 0, syserr.ErrAlreadyConnected } return 0, syserr.ErrNotSupported default: ep, err := extractEndpoint(t, to) if err != nil { return 0, err } defer ep.Release(t) w.To = ep if ep.Passcred() && w.Control.Credentials == nil { w.Control.Credentials = control.MakeCreds(t) } } } n, err := src.CopyInTo(t, &w) if err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 { return int(n), syserr.FromError(err) } // Only send SCM Rights once (see net/unix/af_unix.c:unix_stream_sendmsg). w.Control.Rights = nil // We'll have to block. Register for notification and keep trying to // send all the data. e, ch := waiter.NewChannelEntry(nil) s.EventRegister(&e, waiter.WritableEvents) defer s.EventUnregister(&e) total := n for { // Shorten src to reflect bytes previously written. 
src = src.DropFirst64(n) n, err = src.CopyInTo(t, &w) total += n if err != syserror.ErrWouldBlock { break } if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil { if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { err = syserror.ErrWouldBlock } break } } return int(total), syserr.FromError(err) } // Passcred implements transport.Credentialer.Passcred. func (s *socketOpsCommon) Passcred() bool { return s.ep.Passcred() } // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred. func (s *socketOpsCommon) ConnectedPasscred() bool { return s.ep.ConnectedPasscred() } // Readiness implements waiter.Waitable.Readiness. func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask { return s.ep.Readiness(mask) } // EventRegister implements waiter.Waitable.EventRegister. func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) { s.ep.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) { s.ep.EventUnregister(e) } // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by // a transport.Endpoint. func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error { return netstack.SetSockOpt(t, s, s.ep, level, name, optVal) } // Shutdown implements the linux syscall shutdown(2) for sockets backed by // a transport.Endpoint. func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error { f, err := netstack.ConvertShutdown(how) if err != nil { return err } // Issue shutdown request. return s.ep.Shutdown(f) } // Read implements fs.FileOperations.Read. func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) { if dst.NumBytes() == 0 { return 0, nil } r := &EndpointReader{ Ctx: ctx, Endpoint: s.ep, NumRights: 0, Peek: false, From: nil, } n, err := dst.CopyOutFrom(ctx, r) // Drop control messages. r.Control.Release(ctx) return n, err } // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by // a transport.Endpoint. func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) { trunc := flags&linux.MSG_TRUNC != 0 peek := flags&linux.MSG_PEEK != 0 dontWait := flags&linux.MSG_DONTWAIT != 0 waitAll := flags&linux.MSG_WAITALL != 0 isPacket := s.isPacket() // Calculate the number of FDs for which we have space and if we are // requesting credentials. var wantCreds bool rightsLen := int(controlDataLen) - unix.SizeofCmsghdr if s.Passcred() { // Credentials take priority if they are enabled and there is space. wantCreds = rightsLen > 0 if !wantCreds { msgFlags |= linux.MSG_CTRUNC } credLen := unix.CmsgSpace(unix.SizeofUcred) rightsLen -= credLen } // FDs are 32 bit (4 byte) ints. numRights := rightsLen / 4 if numRights < 0 { numRights = 0 } r := EndpointReader{ Ctx: t, Endpoint: s.ep, Creds: wantCreds, NumRights: numRights, Peek: peek, } if senderRequested { r.From = &tcpip.FullAddress{} } doRead := func() (int64, error) { return dst.CopyOutFrom(t, &r) } // If MSG_TRUNC is set with a zero byte destination then we still need // to read the message and discard it, or in the case where MSG_PEEK is // set, leave it be. In both cases the full message length must be // returned. 
	if trunc && dst.Addrs.NumBytes() == 0 {
		doRead = func() (int64, error) {
			err := r.Truncate()
			// Always return zero for bytes read since the destination size is
			// zero.
			return 0, err
		}
	}

	var total int64
	if n, err := doRead(); err != syserror.ErrWouldBlock || dontWait {
		var from linux.SockAddr
		var fromLen uint32
		if r.From != nil && len([]byte(r.From.Addr)) != 0 {
			from, fromLen = socket.ConvertAddress(linux.AF_UNIX, *r.From)
		}

		if r.ControlTrunc {
			msgFlags |= linux.MSG_CTRUNC
		}

		if err != nil || dontWait || !waitAll || isPacket || n >= dst.NumBytes() {
			if isPacket && n < int64(r.MsgSize) {
				msgFlags |= linux.MSG_TRUNC
			}

			if trunc {
				n = int64(r.MsgSize)
			}

			return int(n), msgFlags, from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err)
		}

		// Don't overwrite any data we received.
		dst = dst.DropFirst64(n)
		total += n
	}

	// We'll have to block. Register for notification and keep trying to
	// receive all the data.
	e, ch := waiter.NewChannelEntry(nil)
	s.EventRegister(&e, waiter.ReadableEvents)
	defer s.EventUnregister(&e)

	for {
		if n, err := doRead(); err != syserror.ErrWouldBlock {
			var from linux.SockAddr
			var fromLen uint32
			if r.From != nil {
				from, fromLen = socket.ConvertAddress(linux.AF_UNIX, *r.From)
			}

			if r.ControlTrunc {
				msgFlags |= linux.MSG_CTRUNC
			}

			if trunc {
				// n and r.MsgSize are the same for streams.
				total += int64(r.MsgSize)
			} else {
				total += n
			}

			streamPeerClosed := s.stype == linux.SOCK_STREAM && n == 0 && err == nil
			if err != nil || !waitAll || isPacket || n >= dst.NumBytes() || streamPeerClosed {
				if total > 0 {
					err = nil
				}
				if isPacket && n < int64(r.MsgSize) {
					msgFlags |= linux.MSG_TRUNC
				}
				return int(total), msgFlags, from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err)
			}

			// Don't overwrite any data we received.
			dst = dst.DropFirst64(n)
		}

		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
			if total > 0 {
				err = nil
			}
			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
				return int(total), msgFlags, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
			}
			return int(total), msgFlags, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
		}
	}
}

// State implements socket.Socket.State.
func (s *socketOpsCommon) State() uint32 {
	return s.ep.State()
}

// Type implements socket.Socket.Type.
func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) {
	// Unix domain sockets always have a protocol of 0.
	return linux.AF_UNIX, s.stype, 0
}

// provider is a unix domain socket provider.
type provider struct{}

// Socket returns a new unix domain socket.
func (*provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
	// Check arguments.
	if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ {
		return nil, syserr.ErrProtocolNotSupported
	}

	// Create the endpoint and socket.
	var ep transport.Endpoint
	switch stype {
	case linux.SOCK_DGRAM, linux.SOCK_RAW:
		ep = transport.NewConnectionless(t)
	case linux.SOCK_SEQPACKET, linux.SOCK_STREAM:
		ep = transport.NewConnectioned(t, stype, t.Kernel())
	default:
		return nil, syserr.ErrInvalidArgument
	}

	return New(t, ep, stype), nil
}

// Pair creates a new pair of AF_UNIX connected sockets.
func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
	// Check arguments.
if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ { return nil, nil, syserr.ErrProtocolNotSupported } switch stype { case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET, linux.SOCK_RAW: // Ok default: return nil, nil, syserr.ErrInvalidArgument } // Create the endpoints and sockets. ep1, ep2 := transport.NewPair(t, stype, t.Kernel()) s1 := New(t, ep1, stype) s2 := New(t, ep2, stype) return s1, s2, nil } func init() { socket.RegisterProvider(linux.AF_UNIX, &provider{}) socket.RegisterProviderVFS2(linux.AF_UNIX, &providerVFS2{}) }
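
// Illustrative sketch, not part of the original file: how the provider above
// might be exercised to create a connected pair of endpoints, mirroring the
// socketpair(2) fast path. SOCK_STREAM with protocol 0 is the common case.
func exampleSocketPair(t *kernel.Task) (*fs.File, *fs.File, *syserr.Error) {
	var p provider
	return p.Pair(t, linux.SOCK_STREAM, 0)
}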
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package rand implements a cryptographically secure pseudorandom number
// generator.
package rand

import (
	"bufio"
	"crypto/rand"
	"io"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/sync"
)

// reader implements an io.Reader that returns pseudorandom bytes.
type reader struct {
	once         sync.Once
	useGetrandom bool
}

// Read implements io.Reader.Read.
func (r *reader) Read(p []byte) (int, error) {
	r.once.Do(func() {
		_, err := unix.Getrandom(p, 0)
		if err != unix.ENOSYS {
			r.useGetrandom = true
		}
	})

	if r.useGetrandom {
		return unix.Getrandom(p, 0)
	}
	return rand.Read(p)
}

// bufferedReader implements a threadsafe buffered io.Reader.
type bufferedReader struct {
	mu sync.Mutex
	r  *bufio.Reader
}

// Read implements io.Reader.Read.
func (b *bufferedReader) Read(p []byte) (int, error) {
	b.mu.Lock()
	n, err := b.r.Read(p)
	b.mu.Unlock()
	return n, err
}

// Reader is the default reader.
var Reader io.Reader = &bufferedReader{r: bufio.NewReader(&reader{})}

// Read reads from the default reader.
func Read(b []byte) (int, error) {
	return io.ReadFull(Reader, b)
}

// Init can be called to make sure /dev/urandom is pre-opened on kernels that
// do not support getrandom(2).
func Init() error {
	p := make([]byte, 1)
	_, err := Read(p)
	return err
}
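
// Illustrative sketch, not part of the original file: drawing key material
// from the package-level Reader. Because Read wraps io.ReadFull, it never
// returns short unless the underlying entropy source fails.
func exampleKey() ([]byte, error) {
	key := make([]byte, 32)
	if _, err := Read(key); err != nil {
		return nil, err
	}
	return key, nil
}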
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernfs

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// SeekEndConfig describes the SEEK_END behaviour for FDs.
//
// +stateify savable
type SeekEndConfig int

// Constants related to SEEK_END behaviour for FDs.
const (
	// Consider the end of the file to be after the final static entry. This is
	// the default option.
	SeekEndStaticEntries = iota
	// Consider the end of the file to be at offset 0.
	SeekEndZero
)

// GenericDirectoryFDOptions contains configuration for a GenericDirectoryFD.
//
// +stateify savable
type GenericDirectoryFDOptions struct {
	SeekEnd SeekEndConfig
}

// GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory
// inode that uses OrderedChildren to track child nodes.
//
// Note that GenericDirectoryFD holds a lock over OrderedChildren while calling
// the IterDirents callback. The IterDirents callback therefore cannot hash or
// unhash children, or recursively call IterDirents on the same underlying
// inode.
//
// Must be initialized with Init before first use.
//
// Lock ordering: mu => children.mu.
//
// +stateify savable
type GenericDirectoryFD struct {
	vfs.FileDescriptionDefaultImpl
	vfs.DirectoryFileDescriptionDefaultImpl
	vfs.LockFD

	// Immutable.
	seekEnd SeekEndConfig

	vfsfd    vfs.FileDescription
	children *OrderedChildren

	// mu protects the fields below.
	mu sync.Mutex `state:"nosave"`

	// off is the current directory offset. Protected by "mu".
	off int64
}

// NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its
// dentry.
func NewGenericDirectoryFD(m *vfs.Mount, d *Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) {
	fd := &GenericDirectoryFD{}
	if err := fd.Init(children, locks, opts, fdOpts); err != nil {
		return nil, err
	}
	if err := fd.vfsfd.Init(fd, opts.Flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
		return nil, err
	}
	return fd, nil
}

// Init initializes a GenericDirectoryFD. Use it when overriding
// GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the
// correct implementation.
func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error {
	if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
		// Can't open directories for writing.
		return syserror.EISDIR
	}
	fd.LockFD.Init(locks)
	fd.seekEnd = fdOpts.SeekEnd
	fd.children = children
	return nil
}

// VFSFileDescription returns a pointer to the vfs.FileDescription representing
// this object.
func (fd *GenericDirectoryFD) VFSFileDescription() *vfs.FileDescription {
	return &fd.vfsfd
}

// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *GenericDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
	return fd.FileDescriptionDefaultImpl.ConfigureMMap(ctx, opts)
}

// Read implements vfs.FileDescriptionImpl.Read.
func (fd *GenericDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	return fd.DirectoryFileDescriptionDefaultImpl.Read(ctx, dst, opts)
}

// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *GenericDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	return fd.DirectoryFileDescriptionDefaultImpl.PRead(ctx, dst, offset, opts)
}

// Write implements vfs.FileDescriptionImpl.Write.
func (fd *GenericDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	return fd.DirectoryFileDescriptionDefaultImpl.Write(ctx, src, opts)
}

// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts)
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *GenericDirectoryFD) Release(context.Context) {}

func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem {
	return fd.vfsfd.VirtualDentry().Mount().Filesystem()
}

func (fd *GenericDirectoryFD) dentry() *Dentry {
	return fd.vfsfd.Dentry().Impl().(*Dentry)
}

func (fd *GenericDirectoryFD) inode() Inode {
	return fd.dentry().inode
}

// IterDirents implements vfs.FileDescriptionImpl.IterDirents. IterDirents holds
// o.mu when calling cb.
func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
	fd.mu.Lock()
	defer fd.mu.Unlock()

	opts := vfs.StatOptions{Mask: linux.STATX_INO}
	// Handle ".".
	if fd.off == 0 {
		stat, err := fd.inode().Stat(ctx, fd.filesystem(), opts)
		if err != nil {
			return err
		}
		dirent := vfs.Dirent{
			Name:    ".",
			Type:    linux.DT_DIR,
			Ino:     stat.Ino,
			NextOff: 1,
		}
		if err := cb.Handle(dirent); err != nil {
			return err
		}
		fd.off++
	}

	// Handle "..".
if fd.off == 1 { parentInode := genericParentOrSelf(fd.dentry()).inode stat, err := parentInode.Stat(ctx, fd.filesystem(), opts) if err != nil { return err } dirent := vfs.Dirent{ Name: "..", Type: linux.FileMode(stat.Mode).DirentType(), Ino: stat.Ino, NextOff: 2, } if err := cb.Handle(dirent); err != nil { return err } fd.off++ } // Handle static children. fd.children.mu.RLock() defer fd.children.mu.RUnlock() // fd.off accounts for "." and "..", but fd.children do not track // these. childIdx := fd.off - 2 for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() { stat, err := it.inode.Stat(ctx, fd.filesystem(), opts) if err != nil { return err } dirent := vfs.Dirent{ Name: it.name, Type: linux.FileMode(stat.Mode).DirentType(), Ino: stat.Ino, NextOff: fd.off + 1, } if err := cb.Handle(dirent); err != nil { return err } fd.off++ } var err error relOffset := fd.off - int64(len(fd.children.set)) - 2 fd.off, err = fd.inode().IterDirents(ctx, fd.vfsfd.Mount(), cb, fd.off, relOffset) return err } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() switch whence { case linux.SEEK_SET: // Use offset as given. case linux.SEEK_CUR: offset += fd.off case linux.SEEK_END: switch fd.seekEnd { case SeekEndStaticEntries: fd.children.mu.RLock() offset += int64(len(fd.children.set)) offset += 2 // '.' and '..' aren't tracked in children. fd.children.mu.RUnlock() case SeekEndZero: // No-op: offset += 0. default: panic(fmt.Sprintf("Invalid GenericDirectoryFD.seekEnd = %v", fd.seekEnd)) } default: return 0, linuxerr.EINVAL } if offset < 0 { return 0, linuxerr.EINVAL } fd.off = offset return offset, nil } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { fs := fd.filesystem() inode := fd.inode() return inode.Stat(ctx, fs, opts) } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) return fd.inode().SetStat(ctx, fd.filesystem(), creds, opts) } // Allocate implements vfs.FileDescriptionImpl.Allocate. func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error { return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length) }
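
// Illustrative sketch, not part of the original file: rewinding a directory
// FD and re-reading its entries. Offsets follow the layout implemented above:
// 0 is ".", 1 is "..", and 2 onward are the ordered static children, after
// which iteration is delegated to the inode's own IterDirents.
func exampleRewind(ctx context.Context, fd *GenericDirectoryFD, cb vfs.IterDirentsCallback) error {
	if _, err := fd.Seek(ctx, 0, linux.SEEK_SET); err != nil {
		return err
	}
	return fd.IterDirents(ctx, cb)
}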
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package time

import (
	"unsafe"

	"golang.org/x/sys/unix"
)

// syscallTSCReferenceClocks is the standard referenceClocks, collecting
// samples using CLOCK_GETTIME and RDTSC.
type syscallTSCReferenceClocks struct {
	tscCycleClock
}

// Sample implements sampler.Sample.
func (syscallTSCReferenceClocks) Sample(c ClockID) (sample, error) {
	var s sample

	s.before = Rdtsc()

	// Don't call clockGettime to avoid a call which may call morestack.
	var ts unix.Timespec
	_, _, e := unix.RawSyscall(unix.SYS_CLOCK_GETTIME, uintptr(c), uintptr(unsafe.Pointer(&ts)), 0)
	if e != 0 {
		return sample{}, e
	}

	s.after = Rdtsc()
	s.ref = ReferenceNS(ts.Nano())

	return s, nil
}

// clockGettime calls SYS_CLOCK_GETTIME, returning time in nanoseconds.
func clockGettime(c ClockID) (ReferenceNS, error) {
	var ts unix.Timespec
	_, _, e := unix.RawSyscall(unix.SYS_CLOCK_GETTIME, uintptr(c), uintptr(unsafe.Pointer(&ts)), 0)
	if e != 0 {
		return 0, e
	}

	return ReferenceNS(ts.Nano()), nil
}
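
// Illustrative sketch, not part of the original file: bracketing a reference
// clock read with TSC samples. The difference s.after - s.before bounds the
// cost of the clock_gettime syscall in TSC cycles, which calibration code
// can use to reject noisy samples.
func exampleSample(c ClockID) (sample, error) {
	var clocks syscallTSCReferenceClocks
	s, err := clocks.Sample(c)
	if err != nil {
		return sample{}, err
	}
	return s, nil
}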
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package loader

import (
	"bytes"
	"debug/elf"
	"fmt"
	"io"

	"gvisor.dev/gvisor/pkg/abi"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/cpuid"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

const (
	// elfMagic identifies an ELF file.
	elfMagic = "\x7fELF"

	// maxTotalPhdrSize is the maximum combined size of all program
	// headers. Linux limits this to one page.
	maxTotalPhdrSize = hostarch.PageSize
)

var (
	// header64Size is the size of elf.Header64.
	header64Size = (*linux.ElfHeader64)(nil).SizeBytes()

	// prog64Size is the size of elf.Prog64.
	prog64Size = (*linux.ElfProg64)(nil).SizeBytes()
)

func progFlagsAsPerms(f elf.ProgFlag) hostarch.AccessType {
	var p hostarch.AccessType
	if f&elf.PF_R == elf.PF_R {
		p.Read = true
	}
	if f&elf.PF_W == elf.PF_W {
		p.Write = true
	}
	if f&elf.PF_X == elf.PF_X {
		p.Execute = true
	}
	return p
}

// elfInfo contains the metadata needed to load an ELF binary.
type elfInfo struct {
	// os is the target OS of the ELF.
	os abi.OS

	// arch is the target architecture of the ELF.
	arch arch.Arch

	// entry is the program entry point.
	entry hostarch.Addr

	// phdrs are the program headers.
	phdrs []elf.ProgHeader

	// phdrSize is the size of a single program header in the ELF.
	phdrSize int

	// phdrOff is the offset of the program headers in the file.
	phdrOff uint64

	// sharedObject is true if the ELF represents a shared object.
	sharedObject bool
}

// fullReader interface extracts the ReadFull method from fsbridge.File so that
// client code does not need to define an entire fsbridge.File when only read
// functionality is needed.
//
// TODO(gvisor.dev/issue/1035): Once VFS2 ships, rewrite this to wrap
// vfs.FileDescription's PRead/Read instead.
type fullReader interface {
	// ReadFull is the same as fsbridge.File.ReadFull.
	ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error)
}

// parseHeader parses the ELF header, verifying that this is a supported ELF
// file and returning the ELF program headers.
//
// This is similar to elf.NewFile, except that it is more strict about what it
// accepts from the ELF, and it doesn't parse unnecessary parts of the file.
func parseHeader(ctx context.Context, f fullReader) (elfInfo, error) {
	// Check ident first; it will tell us the endianness of the rest of the
	// structs.
	var ident [elf.EI_NIDENT]byte
	_, err := f.ReadFull(ctx, usermem.BytesIOSequence(ident[:]), 0)
	if err != nil {
		log.Infof("Error reading ELF ident: %v", err)
		// The entire ident array always exists.
		if err == io.EOF || err == io.ErrUnexpectedEOF {
			err = syserror.ENOEXEC
		}
		return elfInfo{}, err
	}

	// Only some callers pre-check the ELF magic.
if !bytes.Equal(ident[:len(elfMagic)], []byte(elfMagic)) { log.Infof("File is not an ELF") return elfInfo{}, syserror.ENOEXEC } // We only support 64-bit, little endian binaries if class := elf.Class(ident[elf.EI_CLASS]); class != elf.ELFCLASS64 { log.Infof("Unsupported ELF class: %v", class) return elfInfo{}, syserror.ENOEXEC } if endian := elf.Data(ident[elf.EI_DATA]); endian != elf.ELFDATA2LSB { log.Infof("Unsupported ELF endianness: %v", endian) return elfInfo{}, syserror.ENOEXEC } if version := elf.Version(ident[elf.EI_VERSION]); version != elf.EV_CURRENT { log.Infof("Unsupported ELF version: %v", version) return elfInfo{}, syserror.ENOEXEC } // EI_OSABI is ignored by Linux, which is the only OS supported. os := abi.Linux var hdr linux.ElfHeader64 hdrBuf := make([]byte, header64Size) _, err = f.ReadFull(ctx, usermem.BytesIOSequence(hdrBuf), 0) if err != nil { log.Infof("Error reading ELF header: %v", err) // The entire header always exists. if err == io.EOF || err == io.ErrUnexpectedEOF { err = syserror.ENOEXEC } return elfInfo{}, err } hdr.UnmarshalUnsafe(hdrBuf) // We support amd64 and arm64. var a arch.Arch switch machine := elf.Machine(hdr.Machine); machine { case elf.EM_X86_64: a = arch.AMD64 case elf.EM_AARCH64: a = arch.ARM64 default: log.Infof("Unsupported ELF machine %d", machine) return elfInfo{}, syserror.ENOEXEC } var sharedObject bool elfType := elf.Type(hdr.Type) switch elfType { case elf.ET_EXEC: sharedObject = false case elf.ET_DYN: sharedObject = true default: log.Infof("Unsupported ELF type %v", elfType) return elfInfo{}, syserror.ENOEXEC } if int(hdr.Phentsize) != prog64Size { log.Infof("Unsupported phdr size %d", hdr.Phentsize) return elfInfo{}, syserror.ENOEXEC } totalPhdrSize := prog64Size * int(hdr.Phnum) if totalPhdrSize < prog64Size { log.Warningf("No phdrs or total phdr size overflows: prog64Size: %d phnum: %d", prog64Size, int(hdr.Phnum)) return elfInfo{}, syserror.ENOEXEC } if totalPhdrSize > maxTotalPhdrSize { log.Infof("Too many phdrs (%d): total size %d > %d", hdr.Phnum, totalPhdrSize, maxTotalPhdrSize) return elfInfo{}, syserror.ENOEXEC } if int64(hdr.Phoff) < 0 || int64(hdr.Phoff+uint64(totalPhdrSize)) < 0 { ctx.Infof("Unsupported phdr offset %d", hdr.Phoff) return elfInfo{}, syserror.ENOEXEC } phdrBuf := make([]byte, totalPhdrSize) _, err = f.ReadFull(ctx, usermem.BytesIOSequence(phdrBuf), int64(hdr.Phoff)) if err != nil { log.Infof("Error reading ELF phdrs: %v", err) // If phdrs were specified, they should all exist. if err == io.EOF || err == io.ErrUnexpectedEOF { err = syserror.ENOEXEC } return elfInfo{}, err } phdrs := make([]elf.ProgHeader, hdr.Phnum) for i := range phdrs { var prog64 linux.ElfProg64 prog64.UnmarshalUnsafe(phdrBuf[:prog64Size]) phdrBuf = phdrBuf[prog64Size:] phdrs[i] = elf.ProgHeader{ Type: elf.ProgType(prog64.Type), Flags: elf.ProgFlag(prog64.Flags), Off: prog64.Off, Vaddr: prog64.Vaddr, Paddr: prog64.Paddr, Filesz: prog64.Filesz, Memsz: prog64.Memsz, Align: prog64.Align, } } return elfInfo{ os: os, arch: a, entry: hostarch.Addr(hdr.Entry), phdrs: phdrs, phdrOff: hdr.Phoff, phdrSize: prog64Size, sharedObject: sharedObject, }, nil } // mapSegment maps a phdr into the Task. offset is the offset to apply to // phdr.Vaddr. func mapSegment(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, phdr *elf.ProgHeader, offset hostarch.Addr) error { // We must make a page-aligned mapping. 
adjust := hostarch.Addr(phdr.Vaddr).PageOffset() addr, ok := offset.AddLength(phdr.Vaddr) if !ok { // If offset != 0 we should have ensured this would fit. ctx.Warningf("Computed segment load address overflows: %#x + %#x", phdr.Vaddr, offset) return syserror.ENOEXEC } addr -= hostarch.Addr(adjust) fileSize := phdr.Filesz + adjust if fileSize < phdr.Filesz { ctx.Infof("Computed segment file size overflows: %#x + %#x", phdr.Filesz, adjust) return syserror.ENOEXEC } ms, ok := hostarch.Addr(fileSize).RoundUp() if !ok { ctx.Infof("fileSize %#x too large", fileSize) return syserror.ENOEXEC } mapSize := uint64(ms) if mapSize > 0 { // This must result in a page-aligned offset. i.e., the original // phdr.Off must have the same alignment as phdr.Vaddr. If that is not // true, MMap will reject the mapping. fileOffset := phdr.Off - adjust prot := progFlagsAsPerms(phdr.Flags) mopts := memmap.MMapOpts{ Length: mapSize, Offset: fileOffset, Addr: addr, Fixed: true, // Linux will happily allow conflicting segments to map over // one another. Unmap: true, Private: true, Perms: prot, MaxPerms: hostarch.AnyAccess, } defer func() { if mopts.MappingIdentity != nil { mopts.MappingIdentity.DecRef(ctx) } }() if err := f.ConfigureMMap(ctx, &mopts); err != nil { ctx.Infof("File is not memory-mappable: %v", err) return err } if _, err := m.MMap(ctx, mopts); err != nil { ctx.Infof("Error mapping PT_LOAD segment %+v at %#x: %v", phdr, addr, err) return err } // We need to clear the end of the last page that exceeds fileSize so // we don't map part of the file beyond fileSize. // // Note that Linux *does not* clear the portion of the first page // before phdr.Off. if mapSize > fileSize { zeroAddr, ok := addr.AddLength(fileSize) if !ok { panic(fmt.Sprintf("successfully mmaped address overflows? %#x + %#x", addr, fileSize)) } zeroSize := int64(mapSize - fileSize) if zeroSize < 0 { panic(fmt.Sprintf("zeroSize too big? %#x", uint64(zeroSize))) } if _, err := m.ZeroOut(ctx, zeroAddr, zeroSize, usermem.IOOpts{IgnorePermissions: true}); err != nil { ctx.Warningf("Failed to zero end of page [%#x, %#x): %v", zeroAddr, zeroAddr+hostarch.Addr(zeroSize), err) return err } } } memSize := phdr.Memsz + adjust if memSize < phdr.Memsz { ctx.Infof("Computed segment mem size overflows: %#x + %#x", phdr.Memsz, adjust) return syserror.ENOEXEC } // Allocate more anonymous pages if necessary. if mapSize < memSize { anonAddr, ok := addr.AddLength(mapSize) if !ok { panic(fmt.Sprintf("anonymous memory doesn't fit in pre-sized range? %#x + %#x", addr, mapSize)) } anonSize, ok := hostarch.Addr(memSize - mapSize).RoundUp() if !ok { ctx.Infof("extra anon pages too large: %#x", memSize-mapSize) return syserror.ENOEXEC } // N.B. Linux uses vm_brk_flags to map these pages, which only // honors the X bit, always mapping at least RW. ignoring These // pages are not included in the final brk region. prot := hostarch.ReadWrite if phdr.Flags&elf.PF_X == elf.PF_X { prot.Execute = true } if _, err := m.MMap(ctx, memmap.MMapOpts{ Length: uint64(anonSize), Addr: anonAddr, // Fixed without Unmap will fail the mmap if something is // already at addr. Fixed: true, Private: true, Perms: prot, MaxPerms: hostarch.AnyAccess, }); err != nil { ctx.Infof("Error mapping PT_LOAD segment %v anonymous memory: %v", phdr, err) return err } } return nil } // loadedELF describes an ELF that has been successfully loaded. type loadedELF struct { // os is the target OS of the ELF. os abi.OS // arch is the target architecture of the ELF. 
	arch arch.Arch

	// entry is the entry point of the ELF.
	entry hostarch.Addr

	// start is the start of the ELF.
	start hostarch.Addr

	// end is the end of the ELF.
	end hostarch.Addr

	// interpreter is the path to the ELF interpreter.
	interpreter string

	// phdrAddr is the address of the ELF program headers.
	phdrAddr hostarch.Addr

	// phdrSize is the size of a single program header in the ELF.
	phdrSize int

	// phdrNum is the number of program headers.
	phdrNum int

	// auxv contains a subset of ELF-specific auxiliary vector entries:
	// * AT_PHDR
	// * AT_PHENT
	// * AT_PHNUM
	// * AT_BASE
	// * AT_ENTRY
	auxv arch.Auxv
}

// loadParsedELF loads f into mm.
//
// info is the parsed elfInfo from the header.
//
// It does not load the ELF interpreter, or return any auxv entries.
//
// Preconditions: f is an ELF file.
func loadParsedELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, info elfInfo, sharedLoadOffset hostarch.Addr) (loadedELF, error) {
	first := true
	var start, end hostarch.Addr
	var interpreter string
	for _, phdr := range info.phdrs {
		switch phdr.Type {
		case elf.PT_LOAD:
			vaddr := hostarch.Addr(phdr.Vaddr)
			if first {
				first = false
				start = vaddr
			}
			if vaddr < end {
				// NOTE(b/37474556): Linux allows out-of-order
				// segments, in violation of the spec.
				ctx.Infof("PT_LOAD headers out-of-order. %#x < %#x", vaddr, end)
				return loadedELF{}, syserror.ENOEXEC
			}
			var ok bool
			end, ok = vaddr.AddLength(phdr.Memsz)
			if !ok {
				ctx.Infof("PT_LOAD header size overflows. %#x + %#x", vaddr, phdr.Memsz)
				return loadedELF{}, syserror.ENOEXEC
			}

		case elf.PT_INTERP:
			if phdr.Filesz < 2 {
				ctx.Infof("PT_INTERP path too small: %v", phdr.Filesz)
				return loadedELF{}, syserror.ENOEXEC
			}
			if phdr.Filesz > linux.PATH_MAX {
				ctx.Infof("PT_INTERP path too big: %v", phdr.Filesz)
				return loadedELF{}, syserror.ENOEXEC
			}
			if int64(phdr.Off) < 0 || int64(phdr.Off+phdr.Filesz) < 0 {
				ctx.Infof("Unsupported PT_INTERP offset %d", phdr.Off)
				return loadedELF{}, syserror.ENOEXEC
			}

			path := make([]byte, phdr.Filesz)
			_, err := f.ReadFull(ctx, usermem.BytesIOSequence(path), int64(phdr.Off))
			if err != nil {
				// If an interpreter was specified, it should exist.
				ctx.Infof("Error reading PT_INTERP path: %v", err)
				return loadedELF{}, syserror.ENOEXEC
			}
			if path[len(path)-1] != 0 {
				ctx.Infof("PT_INTERP path not NUL-terminated: %v", path)
				return loadedELF{}, syserror.ENOEXEC
			}

			// Strip NUL-terminator and everything beyond from
			// string. Note that there may be a NUL-terminator
			// before len(path)-1.
			interpreter = string(path[:bytes.IndexByte(path, '\x00')])
			if interpreter == "" {
				// Linux actually attempts to open_exec("\0").
				// open_exec -> do_open_execat fails to check
				// that name != '\0' before calling
				// do_filp_open, which thus opens the working
				// directory. do_open_execat returns EACCES
				// because the directory is not a regular file.
				//
				// We bypass that nonsense and simply
				// short-circuit with EACCES. Though this does
				// mean that there may be some edge cases where
				// the open path would return a different
				// error.
				ctx.Infof("PT_INTERP path is empty: %v", path)
				return loadedELF{}, linuxerr.EACCES
			}
		}
	}

	// Shared objects don't have fixed load addresses. We need to pick a
	// base address big enough to fit all segments, so we first create a
	// mapping for the total size just to find a region that is big enough.
	//
	// It is safe to unmap it immediately without racing with another mapping
	// because we are the only one in control of the MemoryManager.
	//
	// Note that the vaddr of the first PT_LOAD segment is ignored when
	// choosing the load address (even if it is non-zero). The vaddr does
	// become an offset from that load address.
	var offset hostarch.Addr
	if info.sharedObject {
		totalSize := end - start
		totalSize, ok := totalSize.RoundUp()
		if !ok {
			ctx.Infof("ELF PT_LOAD segments too big")
			return loadedELF{}, syserror.ENOEXEC
		}

		var err error
		offset, err = m.MMap(ctx, memmap.MMapOpts{
			Length:  uint64(totalSize),
			Addr:    sharedLoadOffset,
			Private: true,
		})
		if err != nil {
			ctx.Infof("Error allocating address space for shared object: %v", err)
			return loadedELF{}, err
		}
		if err := m.MUnmap(ctx, offset, uint64(totalSize)); err != nil {
			panic(fmt.Sprintf("Failed to unmap base address: %v", err))
		}

		start, ok = start.AddLength(uint64(offset))
		if !ok {
			ctx.Infof("Start %#x + offset %#x overflows?", start, offset)
			return loadedELF{}, linuxerr.EINVAL
		}

		end, ok = end.AddLength(uint64(offset))
		if !ok {
			ctx.Infof("End %#x + offset %#x overflows?", end, offset)
			return loadedELF{}, linuxerr.EINVAL
		}

		info.entry, ok = info.entry.AddLength(uint64(offset))
		if !ok {
			ctx.Infof("Entrypoint %#x + offset %#x overflows? Is the entrypoint within a segment?", info.entry, offset)
			// Note: err is necessarily nil here, so return a real error
			// rather than err.
			return loadedELF{}, linuxerr.EINVAL
		}
	}

	// Map PT_LOAD segments.
	for _, phdr := range info.phdrs {
		switch phdr.Type {
		case elf.PT_LOAD:
			if phdr.Memsz == 0 {
				// No need to load segments with size 0, but
				// they exist in some binaries.
				continue
			}

			if err := mapSegment(ctx, m, f, &phdr, offset); err != nil {
				ctx.Infof("Failed to map PT_LOAD segment: %+v", phdr)
				return loadedELF{}, err
			}
		}
	}

	// This assumes that the first segment contains the ELF headers. This
	// may not be true in a malformed ELF, but Linux makes the same
	// assumption.
	phdrAddr, ok := start.AddLength(info.phdrOff)
	if !ok {
		ctx.Warningf("ELF start address %#x + phdr offset %#x overflows", start, info.phdrOff)
		phdrAddr = 0
	}

	return loadedELF{
		os:          info.os,
		arch:        info.arch,
		entry:       info.entry,
		start:       start,
		end:         end,
		interpreter: interpreter,
		phdrAddr:    phdrAddr,
		phdrSize:    info.phdrSize,
		phdrNum:     len(info.phdrs),
	}, nil
}

// loadInitialELF loads f into mm.
//
// It creates an arch.Context for the ELF and prepares the mm for this arch.
//
// It does not load the ELF interpreter, or return any auxv entries.
//
// Preconditions:
// * f is an ELF file.
// * f is the first ELF loaded into m.
func loadInitialELF(ctx context.Context, m *mm.MemoryManager, fs *cpuid.FeatureSet, f fsbridge.File) (loadedELF, arch.Context, error) {
	info, err := parseHeader(ctx, f)
	if err != nil {
		ctx.Infof("Failed to parse initial ELF: %v", err)
		return loadedELF{}, nil, err
	}

	// Check Image Compatibility.
	if arch.Host != info.arch {
		ctx.Warningf("Found mismatch for platform %s with ELF type %s", arch.Host.String(), info.arch.String())
		return loadedELF{}, nil, syserror.ENOEXEC
	}

	// Create the arch.Context now so we can prepare the mmap layout before
	// mapping anything.
	ac := arch.New(info.arch, fs)

	l, err := m.SetMmapLayout(ac, limits.FromContext(ctx))
	if err != nil {
		ctx.Warningf("Failed to set mmap layout: %v", err)
		return loadedELF{}, nil, err
	}

	// PIELoadAddress tries to move the ELF out of the way of the default
	// mmap base to ensure that the initial brk has sufficient space to
	// grow.
	le, err := loadParsedELF(ctx, m, f, info, ac.PIELoadAddress(l))
	return le, ac, err
}

// loadInterpreterELF loads f into mm.
//
// The interpreter must be for the same OS/Arch as the initial ELF.
//
// It does not return any auxv entries.
// // Preconditions: f is an ELF file. func loadInterpreterELF(ctx context.Context, m *mm.MemoryManager, f fsbridge.File, initial loadedELF) (loadedELF, error) { info, err := parseHeader(ctx, f) if err != nil { if linuxerr.Equals(linuxerr.ENOEXEC, err) { // Bad interpreter. err = linuxerr.ELIBBAD } return loadedELF{}, err } if info.os != initial.os { ctx.Infof("Initial ELF OS %v and interpreter ELF OS %v differ", initial.os, info.os) return loadedELF{}, linuxerr.ELIBBAD } if info.arch != initial.arch { ctx.Infof("Initial ELF arch %v and interpreter ELF arch %v differ", initial.arch, info.arch) return loadedELF{}, linuxerr.ELIBBAD } // The interpreter is not given a load offset, as its location does not // affect brk. return loadParsedELF(ctx, m, f, info, 0) } // loadELF loads args.File into the Task address space. // // If loadELF returns ErrSwitchFile it should be called again with the returned // path and argv. // // Preconditions: args.File is an ELF file. func loadELF(ctx context.Context, args LoadArgs) (loadedELF, arch.Context, error) { bin, ac, err := loadInitialELF(ctx, args.MemoryManager, args.Features, args.File) if err != nil { ctx.Infof("Error loading binary: %v", err) return loadedELF{}, nil, err } var interp loadedELF if bin.interpreter != "" { // Even if we do not allow the final link of the script to be // resolved, the interpreter should still be resolved if it is // a symlink. args.ResolveFinal = true // Refresh the traversal limit. *args.RemainingTraversals = linux.MaxSymlinkTraversals args.Filename = bin.interpreter intFile, err := openPath(ctx, args) if err != nil { ctx.Infof("Error opening interpreter %s: %v", bin.interpreter, err) return loadedELF{}, nil, err } defer intFile.DecRef(ctx) interp, err = loadInterpreterELF(ctx, args.MemoryManager, intFile, bin) if err != nil { ctx.Infof("Error loading interpreter: %v", err) return loadedELF{}, nil, err } if interp.interpreter != "" { // No recursive interpreters! ctx.Infof("Interpreter requires an interpreter") return loadedELF{}, nil, syserror.ENOEXEC } } // ELF-specific auxv entries. bin.auxv = arch.Auxv{ arch.AuxEntry{linux.AT_PHDR, bin.phdrAddr}, arch.AuxEntry{linux.AT_PHENT, hostarch.Addr(bin.phdrSize)}, arch.AuxEntry{linux.AT_PHNUM, hostarch.Addr(bin.phdrNum)}, arch.AuxEntry{linux.AT_ENTRY, bin.entry}, } if bin.interpreter != "" { bin.auxv = append(bin.auxv, arch.AuxEntry{linux.AT_BASE, interp.start}) // Start in the interpreter. // N.B. AT_ENTRY above contains the *original* entry point. bin.entry = interp.entry } else { // Always add AT_BASE even if there is no interpreter. bin.auxv = append(bin.auxv, arch.AuxEntry{linux.AT_BASE, 0}) } return bin, ac, nil }
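// The acceptance checks in parseHeader mirror what the standard library's
// debug/elf package exposes. As a quick illustration (a standalone sketch,
// not part of this loader; the use of debug/elf here is an assumption for
// the example only), the same criteria can be applied to an ELF on disk:
//
//	package main
//
//	import (
//		"debug/elf"
//		"fmt"
//		"log"
//		"os"
//	)
//
//	func main() {
//		f, err := elf.Open(os.Args[1])
//		if err != nil {
//			log.Fatal(err) // not an ELF at all
//		}
//		defer f.Close()
//		// The same criteria the loader rejects with ENOEXEC above:
//		// 64-bit, little-endian, current version, amd64/arm64, EXEC or DYN.
//		ok := f.Class == elf.ELFCLASS64 &&
//			f.Data == elf.ELFDATA2LSB &&
//			f.Version == elf.EV_CURRENT &&
//			(f.Machine == elf.EM_X86_64 || f.Machine == elf.EM_AARCH64) &&
//			(f.Type == elf.ET_EXEC || f.Type == elf.ET_DYN)
//		fmt.Printf("loadable: %v (type=%v, machine=%v)\n", ok, f.Type, f.Machine)
//	}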
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"container/heap"
	"math"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// receiver holds the state necessary to receive TCP segments and turn them
// into a stream of bytes.
//
// +stateify savable
type receiver struct {
	stack.TCPReceiverState
	ep *endpoint

	// rcvWnd is the non-scaled receive window last advertised to the peer.
	rcvWnd seqnum.Size

	// rcvWUP is the RcvNxt value at the last window update sent.
	rcvWUP seqnum.Value

	// prevBufUsed is the snapshot of endpoint rcvBufUsed taken when we
	// advertise a receive window.
	prevBufUsed int

	closed bool

	// pendingRcvdSegments is bounded by the receive buffer size of the
	// endpoint.
	pendingRcvdSegments segmentHeap

	// lastRcvdAckTime is the time when the last ack was received.
	lastRcvdAckTime tcpip.MonotonicTime
}

func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8) *receiver {
	return &receiver{
		ep: ep,
		TCPReceiverState: stack.TCPReceiverState{
			RcvNxt:      irs + 1,
			RcvAcc:      irs.Add(rcvWnd + 1),
			RcvWndScale: rcvWndScale,
		},
		rcvWnd:          rcvWnd,
		rcvWUP:          irs + 1,
		lastRcvdAckTime: ep.stack.Clock().NowMonotonic(),
	}
}

// acceptable checks if the segment sequence number range is acceptable
// according to the table on page 26 of RFC 793.
func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
	// r.rcvWnd could be much larger than the window size we advertised in our
	// outgoing packets; we should use what we have advertised for the
	// acceptability test.
	scaledWindowSize := r.rcvWnd >> r.RcvWndScale
	if scaledWindowSize > math.MaxUint16 {
		// This is what we actually put in the Window field.
		scaledWindowSize = math.MaxUint16
	}
	advertisedWindowSize := scaledWindowSize << r.RcvWndScale
	return header.Acceptable(segSeq, segLen, r.RcvNxt, r.RcvNxt.Add(advertisedWindowSize))
}

// currentWindow returns the available space in the window that was advertised
// last to our peer.
func (r *receiver) currentWindow() (curWnd seqnum.Size) {
	endOfWnd := r.rcvWUP.Add(r.rcvWnd)
	if endOfWnd.LessThan(r.RcvNxt) {
		// Return 0 if r.RcvNxt is past the end of the previously advertised
		// window. This can happen because we accept a large segment completely
		// even if accepting it causes it to partially exceed the advertised
		// window.
		return 0
	}
	return r.RcvNxt.Size(endOfWnd)
}

// getSendParams returns the parameters needed by the sender when building
// segments to send.
func (r *receiver) getSendParams() (RcvNxt seqnum.Value, rcvWnd seqnum.Size) {
	newWnd := r.ep.selectWindow()
	curWnd := r.currentWindow()
	unackLen := int(r.ep.snd.MaxSentAck.Size(r.RcvNxt))
	bufUsed := r.ep.receiveBufferUsed()

	// Grow the right edge of the window only for payloads larger than the
	// segment overhead OR if the application is actively consuming data.
	//
	// Avoiding growing the right edge otherwise addresses the situation
	// below: an application has been slow in reading data and we have a burst
	// of incoming segments of length < segment overhead. Here, our available
	// free memory would reduce drastically when compared to the advertised
	// receive window.
	//
	// For example: with incoming 512 byte segments, a segment overhead of
	// 552 bytes (at the time of writing this comment), a receive window
	// starting from 1MB and rcvAdvWndScale being 1, the buffer would reach 0
	// when curWnd is still 19436 bytes, because for every incoming segment
	// newWnd would reduce by (552+512) >> rcvAdvWndScale (current value 1),
	// while curWnd would reduce by 512 bytes.
	// Such a situation causes us to keep tail dropping the incoming segments
	// and never advertise a zero receive window to the peer.
	//
	// Linux does a similar check for minimal sk_buff size (128):
	// https://github.com/torvalds/linux/blob/d5beb3140f91b1c8a3d41b14d729aefa4dcc58bc/net/ipv4/tcp_input.c#L783
	//
	// Also, if the application is reading the data, we keep growing the right
	// edge, as we are still advertising a window that we think can be serviced.
	toGrow := unackLen >= SegSize || bufUsed <= r.prevBufUsed

	// Update RcvAcc only if the new window is > the previously advertised
	// window. We should never shrink the acceptable sequence space once it
	// has been advertised to the peer. If we shrink the acceptable sequence
	// space then we would end up dropping bytes that might already be in
	// flight.
	//
	// ====================================================  sequence space.
	//    ^             ^                ^                ^
	//  rcvWUP        RcvNxt           RcvAcc        new RcvAcc
	//                  <=====curWnd ===>
	//                  <========= newWnd > curWnd ========= >
	if r.RcvNxt.Add(curWnd).LessThan(r.RcvNxt.Add(newWnd)) && toGrow {
		// If the new window moves the right edge, then update RcvAcc.
		r.RcvAcc = r.RcvNxt.Add(newWnd)
	} else {
		if newWnd == 0 {
			// newWnd is zero but we can't advertise a zero as it would
			// cause the window to shrink, so just increment a metric to
			// record this event.
			r.ep.stats.ReceiveErrors.WantZeroRcvWindow.Increment()
		}
		newWnd = curWnd
	}

	// Apply silly-window avoidance when recovering from a zero-window
	// situation. Keep advertising a zero receive window until the new window
	// reaches a threshold.
	if r.rcvWnd == 0 && newWnd != 0 {
		r.ep.rcvQueueInfo.rcvQueueMu.Lock()
		if crossed, above := r.ep.windowCrossedACKThresholdLocked(int(newWnd), int(r.ep.ops.GetReceiveBufferSize())); !crossed && !above {
			newWnd = 0
		}
		r.ep.rcvQueueInfo.rcvQueueMu.Unlock()
	}

	// Stash away the non-scaled receive window as we use it for measuring
	// the receiver's estimated RTT.
	r.rcvWnd = newWnd
	r.rcvWUP = r.RcvNxt
	r.prevBufUsed = bufUsed
	scaledWnd := r.rcvWnd >> r.RcvWndScale
	if scaledWnd == 0 {
		// Increment a metric if we are advertising an actual zero window.
		r.ep.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
	}

	// If we started off with a window larger than what can be held in
	// the 16-bit window field, we cap the value at the max value.
	if scaledWnd > math.MaxUint16 {
		scaledWnd = seqnum.Size(math.MaxUint16)

		// Ensure that the stashed receive window always reflects what
		// is being advertised.
		r.rcvWnd = scaledWnd << r.RcvWndScale
	}
	return r.RcvNxt, scaledWnd
}

// nonZeroWindow is called when the receive window grows from zero to nonzero;
// in such cases we may need to send an ack to indicate to our peer that it can
// resume sending data.
func (r *receiver) nonZeroWindow() {
	// Immediately send an ack.
	r.ep.snd.sendAck()
}

// consumeSegment attempts to consume a segment that was received by r. The
// segment may have just been received or may have been received earlier but
// wasn't ready to be consumed then.
//
// Returns true if the segment was consumed, false if it cannot be consumed
// yet because of a missing segment.
func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum.Size) bool {
	if segLen > 0 {
		// If the segment doesn't include the seqnum we're expecting to
		// consume now, we're missing a segment. We cannot proceed until
		// we receive that segment though.
		if !r.RcvNxt.InWindow(segSeq, segLen) {
			return false
		}

		// Trim segment to eliminate already acknowledged data.
		if segSeq.LessThan(r.RcvNxt) {
			diff := segSeq.Size(r.RcvNxt)
			segLen -= diff
			segSeq.UpdateForward(diff)
			s.sequenceNumber.UpdateForward(diff)
			s.data.TrimFront(int(diff))
		}

		// Move segment to ready-to-deliver list. Wakeup any waiters.
		r.ep.readyToRead(s)
	} else if segSeq != r.RcvNxt {
		return false
	}

	// Update the segment that we're expecting to consume.
r.RcvNxt = segSeq.Add(segLen) // In cases of a misbehaving sender which could send more than the // advertised window, we could end up in a situation where we get a // segment that exceeds the window advertised. Instead of partially // accepting the segment and discarding bytes beyond the advertised // window, we accept the whole segment and make sure r.RcvAcc is moved // forward to match r.RcvNxt to indicate that the window is now closed. // // In absence of this check the r.acceptable() check fails and accepts // segments that should be dropped because rcvWnd is calculated as // the size of the interval (RcvNxt, RcvAcc] which becomes extremely // large if RcvAcc is ever less than RcvNxt. if r.RcvAcc.LessThan(r.RcvNxt) { r.RcvAcc = r.RcvNxt } // Trim SACK Blocks to remove any SACK information that covers // sequence numbers that have been consumed. TrimSACKBlockList(&r.ep.sack, r.RcvNxt) // Handle FIN or FIN-ACK. if s.flags.Contains(header.TCPFlagFin) { r.RcvNxt++ // Send ACK immediately. r.ep.snd.sendAck() // Tell any readers that no more data will come. r.closed = true r.ep.readyToRead(nil) // We just received a FIN, our next state depends on whether we sent a // FIN already or not. switch r.ep.EndpointState() { case StateEstablished: r.ep.setEndpointState(StateCloseWait) case StateFinWait1: if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == r.ep.snd.SndNxt { // FIN-ACK, transition to TIME-WAIT. r.ep.setEndpointState(StateTimeWait) } else { // Simultaneous close, expecting a final ACK. r.ep.setEndpointState(StateClosing) } case StateFinWait2: r.ep.setEndpointState(StateTimeWait) } // Flush out any pending segments, except the very first one if // it happens to be the one we're handling now because the // caller is using it. first := 0 if len(r.pendingRcvdSegments) != 0 && r.pendingRcvdSegments[0] == s { first = 1 } for i := first; i < len(r.pendingRcvdSegments); i++ { r.PendingBufUsed -= r.pendingRcvdSegments[i].segMemSize() r.pendingRcvdSegments[i].decRef() // Note that slice truncation does not allow garbage collection of // truncated items, thus truncated items must be set to nil to avoid // memory leaks. r.pendingRcvdSegments[i] = nil } r.pendingRcvdSegments = r.pendingRcvdSegments[:first] return true } // Handle ACK (not FIN-ACK, which we handled above) during one of the // shutdown states. if s.flags.Contains(header.TCPFlagAck) && s.ackNumber == r.ep.snd.SndNxt { switch r.ep.EndpointState() { case StateFinWait1: r.ep.setEndpointState(StateFinWait2) // Notify protocol goroutine that we have received an // ACK to our FIN so that it can start the FIN_WAIT2 // timer to abort connection if the other side does // not close within 2MSL. r.ep.notifyProtocolGoroutine(notifyClose) case StateClosing: r.ep.setEndpointState(StateTimeWait) case StateLastAck: r.ep.transitionToStateCloseLocked() } } return true } // updateRTT updates the receiver RTT measurement based on the sequence number // of the received segment. func (r *receiver) updateRTT() { // From: https://public.lanl.gov/radiant/pubs/drs/sc2001-poster.pdf // // A system that is only transmitting acknowledgements can still // estimate the round-trip time by observing the time between when a byte // is first acknowledged and the receipt of data that is at least one // window beyond the sequence number that was acknowledged. r.ep.rcvQueueInfo.rcvQueueMu.Lock() if r.ep.rcvQueueInfo.RcvAutoParams.RTTMeasureTime == (tcpip.MonotonicTime{}) { // New measurement. 
		r.ep.rcvQueueInfo.RcvAutoParams.RTTMeasureTime = r.ep.stack.Clock().NowMonotonic()
		r.ep.rcvQueueInfo.RcvAutoParams.RTTMeasureSeqNumber = r.RcvNxt.Add(r.rcvWnd)
		r.ep.rcvQueueInfo.rcvQueueMu.Unlock()
		return
	}
	if r.RcvNxt.LessThan(r.ep.rcvQueueInfo.RcvAutoParams.RTTMeasureSeqNumber) {
		r.ep.rcvQueueInfo.rcvQueueMu.Unlock()
		return
	}
	rtt := r.ep.stack.Clock().NowMonotonic().Sub(r.ep.rcvQueueInfo.RcvAutoParams.RTTMeasureTime)
	// We only store the minimum observed RTT here as this is only used in
	// absence of a SRTT available from either timestamps or a sender
	// measurement of RTT.
	if r.ep.rcvQueueInfo.RcvAutoParams.RTT == 0 || rtt < r.ep.rcvQueueInfo.RcvAutoParams.RTT {
		r.ep.rcvQueueInfo.RcvAutoParams.RTT = rtt
	}
	r.ep.rcvQueueInfo.RcvAutoParams.RTTMeasureTime = r.ep.stack.Clock().NowMonotonic()
	r.ep.rcvQueueInfo.RcvAutoParams.RTTMeasureSeqNumber = r.RcvNxt.Add(r.rcvWnd)
	r.ep.rcvQueueInfo.rcvQueueMu.Unlock()
}

func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, closed bool) (drop bool, err tcpip.Error) {
	r.ep.rcvQueueInfo.rcvQueueMu.Lock()
	rcvClosed := r.ep.rcvQueueInfo.RcvClosed || r.closed
	r.ep.rcvQueueInfo.rcvQueueMu.Unlock()

	// If we are in one of the shutdown states then we need to do
	// additional checks before we try and process the segment.
	switch state {
	case StateCloseWait, StateClosing, StateLastAck:
		if !s.sequenceNumber.LessThanEq(r.RcvNxt) {
			// Just drop the segment as we have
			// already received a FIN and this
			// segment is after the sequence number
			// for the FIN.
			return true, nil
		}
		fallthrough
	case StateFinWait1, StateFinWait2:
		// If the ACK acks something not yet sent then we send an ACK.
		//
		// RFC793, page 37: If the connection is in a synchronized state,
		// (ESTABLISHED, FIN-WAIT-1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK,
		// TIME-WAIT), any unacceptable segment (out of window sequence number
		// or unacceptable acknowledgment number) must elicit only an empty
		// acknowledgment segment containing the current send-sequence number
		// and an acknowledgment indicating the next sequence number expected
		// to be received, and the connection remains in the same state.
		//
		// Just as on Linux, we do not apply this behavior when state is
		// ESTABLISHED.
		// Linux receive processing for all states except ESTABLISHED and
		// TIME_WAIT is here where if the ACK check fails, we attempt to
		// reply back with an ACK with correct seq/ack numbers.
		// https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L6186
		// The ESTABLISHED state processing is here where if the ACK check
		// fails, we ignore the packet:
		// https://github.com/torvalds/linux/blob/v5.8/net/ipv4/tcp_input.c#L5591
		if r.ep.snd.SndNxt.LessThan(s.ackNumber) {
			r.ep.snd.maybeSendOutOfWindowAck(s)
			return true, nil
		}

		// If we are closed for reads (either due to an incoming FIN or the
		// user calling shutdown(..., SHUT_RD)) then any data past the RcvNxt
		// should trigger a RST.
		endDataSeq := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
		if state != StateCloseWait && rcvClosed && r.RcvNxt.LessThan(endDataSeq) {
			return true, &tcpip.ErrConnectionAborted{}
		}
		if state == StateFinWait1 {
			break
		}

		// If it's a retransmission of an old data segment
		// or a pure ACK then allow it.
		if s.sequenceNumber.Add(s.logicalLen()).LessThanEq(r.RcvNxt) || s.logicalLen() == 0 {
			break
		}

		// In FIN-WAIT2 if the socket is fully closed (not owned by the
		// application on our end), then the only acceptable segment is a
		// FIN. Since a FIN can technically also carry data, we verify that
		// the segment carrying a FIN ends at exactly e.RcvNxt+1.
		//
		// From RFC793 page 25:
		//
		// For sequence number purposes, the SYN is
		// considered to occur before the first actual
		// data octet of the segment in which it occurs,
		// while the FIN is considered to occur after
		// the last actual data octet in a segment in
		// which it occurs.
		if closed && (!s.flags.Contains(header.TCPFlagFin) || s.sequenceNumber.Add(s.logicalLen()) != r.RcvNxt+1) {
			return true, &tcpip.ErrConnectionAborted{}
		}
	}

	// We don't care about receive processing anymore if the receive side
	// is closed.
	//
	// NOTE: We still want to permit a FIN as it's possible only our
	// end has closed and the peer is yet to send a FIN. Hence we
	// compare only the payload.
	segEnd := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
	if rcvClosed && !segEnd.LessThanEq(r.RcvNxt) {
		return true, nil
	}
	return false, nil
}

// handleRcvdSegment handles TCP segments directed at the connection managed by
// r as they arrive. It is called by the protocol main loop.
func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err tcpip.Error) {
	state := r.ep.EndpointState()
	closed := r.ep.closed

	segLen := seqnum.Size(s.data.Size())
	segSeq := s.sequenceNumber

	// If the sequence number range is outside the acceptable range, just
	// send an ACK and stop further processing of the segment.
	// This is according to RFC 793, page 68.
	if !r.acceptable(segSeq, segLen) {
		r.ep.snd.maybeSendOutOfWindowAck(s)
		return true, nil
	}

	if state != StateEstablished {
		drop, err := r.handleRcvdSegmentClosing(s, state, closed)
		if drop || err != nil {
			return drop, err
		}
	}

	// Store the time of the last ack.
	r.lastRcvdAckTime = r.ep.stack.Clock().NowMonotonic()

	// Defer segment processing if it can't be consumed now.
	if !r.consumeSegment(s, segSeq, segLen) {
		if segLen > 0 || s.flags.Contains(header.TCPFlagFin) {
			// We only store the segment if it's within our buffer size limit.
			//
			// Only use up to 25% (rcvBufSize >> 2) of the receive buffer
			// queue for out-of-order segments. This ensures that we always
			// leave space for the in-order segments to arrive, allowing
			// pending segments to be processed and delivered to the user.
			if rcvBufSize := r.ep.ops.GetReceiveBufferSize(); rcvBufSize > 0 && r.PendingBufUsed < int(rcvBufSize)>>2 {
				r.ep.rcvQueueInfo.rcvQueueMu.Lock()
				r.PendingBufUsed += s.segMemSize()
				r.ep.rcvQueueInfo.rcvQueueMu.Unlock()
				s.incRef()
				heap.Push(&r.pendingRcvdSegments, s)
				UpdateSACKBlocks(&r.ep.sack, segSeq, segSeq.Add(segLen), r.RcvNxt)
			}

			// Immediately send an ack so that the peer knows it may
			// have to retransmit.
			r.ep.snd.sendAck()
		}
		return false, nil
	}

	// Since we consumed a segment, update the receiver's RTT estimate
	// if required.
	if segLen > 0 {
		r.updateRTT()
	}

	// By consuming the current segment, we may have filled a gap in the
	// sequence number domain that allows pending segments to be consumed
	// now. So try to do it.
	for !r.closed && r.pendingRcvdSegments.Len() > 0 {
		s := r.pendingRcvdSegments[0]
		segLen := seqnum.Size(s.data.Size())
		segSeq := s.sequenceNumber

		// Skip segment altogether if it has already been acknowledged.
		if !segSeq.Add(segLen-1).LessThan(r.RcvNxt) &&
			!r.consumeSegment(s, segSeq, segLen) {
			break
		}

		heap.Pop(&r.pendingRcvdSegments)
		r.ep.rcvQueueInfo.rcvQueueMu.Lock()
		r.PendingBufUsed -= s.segMemSize()
		r.ep.rcvQueueInfo.rcvQueueMu.Unlock()
		s.decRef()
	}
	return false, nil
}

// handleTimeWaitSegment handles inbound segments received when the endpoint
// has entered the TIME_WAIT state.
func (r *receiver) handleTimeWaitSegment(s *segment) (resetTimeWait bool, newSyn bool) {
	segSeq := s.sequenceNumber
	segLen := seqnum.Size(s.data.Size())

	// Just silently drop any RST packets in TIME_WAIT. We do not support
	// TIME_WAIT assassination; as a result we conform to fix 1 as described
	// in https://tools.ietf.org/html/rfc1337#section-3.
	//
	// This behavior deviates from RFC793 page 70, where we would transition
	// to CLOSED on receiving a RST, which is also the default Linux behavior.
	// On Linux the RST can be ignored by setting sysctl net.ipv4.tcp_rfc1337.
	//
	// As we do not yet support PAWS, we are being conservative in ignoring
	// RSTs by default.
	if s.flags.Contains(header.TCPFlagRst) {
		return false, false
	}

	// If it's a SYN and the sequence number is higher than any seen before
	// for this connection then try to redirect it to a listening endpoint
	// if available.
	//
	// RFC 1122:
	//   "When a connection is [...] on TIME-WAIT state [...]
	//   [a TCP] MAY accept a new SYN from the remote TCP to
	//   reopen the connection directly, if it:
	//
	//   (1) assigns its initial sequence number for the new
	//   connection to be larger than the largest sequence
	//   number it used on the previous connection incarnation,
	//   and
	//
	//   (2) returns to TIME-WAIT state if the SYN turns out
	//   to be an old duplicate".
	if s.flags.Contains(header.TCPFlagSyn) && r.RcvNxt.LessThan(segSeq) {
		return false, true
	}

	// Drop the segment if it does not contain an ACK.
	if !s.flags.Contains(header.TCPFlagAck) {
		return false, false
	}

	// Update Timestamp if required. See RFC7323, section-4.3.
	if r.ep.SendTSOk && s.parsedOptions.TS {
		r.ep.updateRecentTimestamp(s.parsedOptions.TSVal, r.ep.snd.MaxSentAck, segSeq)
	}

	if segSeq.Add(1) == r.RcvNxt && s.flags.Contains(header.TCPFlagFin) {
		// If it's a FIN-ACK then resetTimeWait and send an ACK, as it
		// indicates our final ACK could have been lost.
		r.ep.snd.sendAck()
		return true, false
	}

	// If the sequence number range is outside the acceptable range or
	// carries data then just send an ACK. This is according to RFC 793,
	// page 37.
	//
	// NOTE: In TIME_WAIT the only acceptable sequence number is RcvNxt.
	if segSeq != r.RcvNxt || segLen != 0 {
		r.ep.snd.sendAck()
	}
	return false, false
}
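// A worked example of the advertisement math in getSendParams (illustrative
// numbers only, not from the original source): with rcvWnd = 1<<20 bytes and
// RcvWndScale = 7, the value placed in the 16-bit Window field is
// 1<<20 >> 7 = 8192, which fits. With RcvWndScale = 2, 1<<20 >> 2 = 262144
// exceeds 65535, so the field is clamped and the effective window becomes
// 65535 << 2 bytes. A minimal sketch of that clamping:
//
//	func advertisableWindow(rcvWnd seqnum.Size, scale uint8) seqnum.Size {
//		scaled := rcvWnd >> scale
//		if scaled > math.MaxUint16 {
//			// The TCP header's Window field is only 16 bits wide.
//			scaled = math.MaxUint16
//		}
//		return scaled
//	}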
// Copyright 2009 The Go Authors. All rights reserved.
// Copyright 2019 The gVisor Authors.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This is mostly copied from the standard library's sync/rwmutex.go.
//
// Happens-before relationships indicated to the race detector:
// - Unlock -> Lock (via writerSem)
// - Unlock -> RLock (via readerSem)
// - RUnlock -> Lock (via writerSem)
// - DowngradeLock -> RLock (via readerSem)

package sync

import (
	"sync/atomic"
	"unsafe"
)

// CrossGoroutineRWMutex is equivalent to RWMutex, but it need not be unlocked
// by the same goroutine that locked the mutex.
type CrossGoroutineRWMutex struct {
	// w is held if there are pending writers.
	//
	// We use CrossGoroutineMutex rather than Mutex because the lock
	// annotation instrumentation in Mutex will trigger false positives in
	// the race detector when called inside of RaceDisable.
	w           CrossGoroutineMutex
	writerSem   uint32 // semaphore for writers to wait for completing readers
	readerSem   uint32 // semaphore for readers to wait for completing writers
	readerCount int32  // number of pending readers
	readerWait  int32  // number of departing readers
}

const rwmutexMaxReaders = 1 << 30

// TryRLock locks rw for reading. It returns true if it succeeds and false
// otherwise. It does not block.
// +checklocksignore
func (rw *CrossGoroutineRWMutex) TryRLock() bool {
	if RaceEnabled {
		RaceDisable()
	}
	for {
		rc := atomic.LoadInt32(&rw.readerCount)
		if rc < 0 {
			if RaceEnabled {
				RaceEnable()
			}
			return false
		}
		if !atomic.CompareAndSwapInt32(&rw.readerCount, rc, rc+1) {
			continue
		}
		if RaceEnabled {
			RaceEnable()
			RaceAcquire(unsafe.Pointer(&rw.readerSem))
		}
		return true
	}
}

// RLock locks rw for reading.
//
// It should not be used for recursive read locking; a blocked Lock call
// excludes new readers from acquiring the lock. See the documentation on the
// RWMutex type.
// +checklocksignore
func (rw *CrossGoroutineRWMutex) RLock() {
	if RaceEnabled {
		RaceDisable()
	}
	if atomic.AddInt32(&rw.readerCount, 1) < 0 {
		// A writer is pending, wait for it.
		semacquire(&rw.readerSem)
	}
	if RaceEnabled {
		RaceEnable()
		RaceAcquire(unsafe.Pointer(&rw.readerSem))
	}
}

// RUnlock undoes a single RLock call.
// // Preconditions: // * rw is locked for reading. // +checklocksignore func (rw *CrossGoroutineRWMutex) RUnlock() { if RaceEnabled { RaceReleaseMerge(unsafe.Pointer(&rw.writerSem)) RaceDisable() } if r := atomic.AddInt32(&rw.readerCount, -1); r < 0 { if r+1 == 0 || r+1 == -rwmutexMaxReaders { panic("RUnlock of unlocked RWMutex") } // A writer is pending. if atomic.AddInt32(&rw.readerWait, -1) == 0 { // The last reader unblocks the writer. semrelease(&rw.writerSem, false, 0) } } if RaceEnabled { RaceEnable() } } // TryLock locks rw for writing. It returns true if it succeeds and false // otherwise. It does not block. // +checklocksignore func (rw *CrossGoroutineRWMutex) TryLock() bool { if RaceEnabled { RaceDisable() } // First, resolve competition with other writers. if !rw.w.TryLock() { if RaceEnabled { RaceEnable() } return false } // Only proceed if there are no readers. if !atomic.CompareAndSwapInt32(&rw.readerCount, 0, -rwmutexMaxReaders) { rw.w.Unlock() if RaceEnabled { RaceEnable() } return false } if RaceEnabled { RaceEnable() RaceAcquire(unsafe.Pointer(&rw.writerSem)) } return true } // Lock locks rw for writing. If the lock is already locked for reading or // writing, Lock blocks until the lock is available. // +checklocksignore func (rw *CrossGoroutineRWMutex) Lock() { if RaceEnabled { RaceDisable() } // First, resolve competition with other writers. rw.w.Lock() // Announce to readers there is a pending writer. r := atomic.AddInt32(&rw.readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders // Wait for active readers. if r != 0 && atomic.AddInt32(&rw.readerWait, r) != 0 { semacquire(&rw.writerSem) } if RaceEnabled { RaceEnable() RaceAcquire(unsafe.Pointer(&rw.writerSem)) } } // Unlock unlocks rw for writing. // // Preconditions: // * rw is locked for writing. // +checklocksignore func (rw *CrossGoroutineRWMutex) Unlock() { if RaceEnabled { RaceRelease(unsafe.Pointer(&rw.writerSem)) RaceRelease(unsafe.Pointer(&rw.readerSem)) RaceDisable() } // Announce to readers there is no active writer. r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders) if r >= rwmutexMaxReaders { panic("Unlock of unlocked RWMutex") } // Unblock blocked readers, if any. for i := 0; i < int(r); i++ { semrelease(&rw.readerSem, false, 0) } // Allow other writers to proceed. rw.w.Unlock() if RaceEnabled { RaceEnable() } } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // // Preconditions: // * rw is locked for writing. // +checklocksignore func (rw *CrossGoroutineRWMutex) DowngradeLock() { if RaceEnabled { RaceRelease(unsafe.Pointer(&rw.readerSem)) RaceDisable() } // Announce to readers there is no active writer and one additional reader. r := atomic.AddInt32(&rw.readerCount, rwmutexMaxReaders+1) if r >= rwmutexMaxReaders+1 { panic("DowngradeLock of unlocked RWMutex") } // Unblock blocked readers, if any. Note that this loop starts as 1 since r // includes this goroutine. for i := 1; i < int(r); i++ { semrelease(&rw.readerSem, false, 0) } // Allow other writers to proceed to rw.w.Lock(). Note that they will still // block on rw.writerSem since at least this reader exists, such that // DowngradeLock() is atomic with the previous write lock. rw.w.Unlock() if RaceEnabled { RaceEnable() } } // A RWMutex is a reader/writer mutual exclusion lock. The lock can be held by // an arbitrary number of readers or a single writer. The zero value for a // RWMutex is an unlocked mutex. // // A RWMutex must not be copied after first use. 
// // If a goroutine holds a RWMutex for reading and another goroutine might call // Lock, no goroutine should expect to be able to acquire a read lock until the // initial read lock is released. In particular, this prohibits recursive read // locking. This is to ensure that the lock eventually becomes available; a // blocked Lock call excludes new readers from acquiring the lock. // // A Mutex must be unlocked by the same goroutine that locked it. This // invariant is enforced with the 'checklocks' build tag. type RWMutex struct { m CrossGoroutineRWMutex } // TryRLock locks rw for reading. It returns true if it succeeds and false // otherwise. It does not block. // +checklocksignore func (rw *RWMutex) TryRLock() bool { // Note lock first to enforce proper locking even if unsuccessful. noteLock(unsafe.Pointer(rw)) locked := rw.m.TryRLock() if !locked { noteUnlock(unsafe.Pointer(rw)) } return locked } // RLock locks rw for reading. // // It should not be used for recursive read locking; a blocked Lock call // excludes new readers from acquiring the lock. See the documentation on the // RWMutex type. // +checklocksignore func (rw *RWMutex) RLock() { noteLock(unsafe.Pointer(rw)) rw.m.RLock() } // RUnlock undoes a single RLock call. // // Preconditions: // * rw is locked for reading. // * rw was locked by this goroutine. // +checklocksignore func (rw *RWMutex) RUnlock() { rw.m.RUnlock() noteUnlock(unsafe.Pointer(rw)) } // TryLock locks rw for writing. It returns true if it succeeds and false // otherwise. It does not block. // +checklocksignore func (rw *RWMutex) TryLock() bool { // Note lock first to enforce proper locking even if unsuccessful. noteLock(unsafe.Pointer(rw)) locked := rw.m.TryLock() if !locked { noteUnlock(unsafe.Pointer(rw)) } return locked } // Lock locks rw for writing. If the lock is already locked for reading or // writing, Lock blocks until the lock is available. // +checklocksignore func (rw *RWMutex) Lock() { noteLock(unsafe.Pointer(rw)) rw.m.Lock() } // Unlock unlocks rw for writing. // // Preconditions: // * rw is locked for writing. // * rw was locked by this goroutine. // +checklocksignore func (rw *RWMutex) Unlock() { rw.m.Unlock() noteUnlock(unsafe.Pointer(rw)) } // DowngradeLock atomically unlocks rw for writing and locks it for reading. // // Preconditions: // * rw is locked for writing. // +checklocksignore func (rw *RWMutex) DowngradeLock() { // No note change for DowngradeLock. rw.m.DowngradeLock() }
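// Illustrative use of DowngradeLock (a sketch, not part of the original
// file): a writer can publish an update and keep reading under a read lock
// without another writer sneaking in between an Unlock/RLock pair, because
// the downgrade is atomic with respect to pending writers.
//
//	var (
//		mu   sync.RWMutex // the RWMutex defined in this package
//		data = map[string]int{}
//	)
//
//	func updateThenRead(k string, v int) int {
//		mu.Lock()
//		data[k] = v
//		// Atomically trade the write lock for a read lock; writers
//		// queued on writerSem remain blocked until RUnlock.
//		mu.DowngradeLock()
//		defer mu.RUnlock()
//		return data[k]
//	}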
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package devpts

import (
	"bytes"
	"unicode/utf8"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

const (
	// canonMaxBytes is the number of bytes that fit into a single line of
	// terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE
	// in include/linux/tty.h.
	canonMaxBytes = 4096

	// nonCanonMaxBytes is the maximum number of bytes that can be read at
	// a time in noncanonical mode.
	nonCanonMaxBytes = canonMaxBytes - 1

	spacesPerTab = 8
)

// lineDiscipline dictates how input and output are handled between the
// pseudoterminal (pty) master and replica. It can be configured to alter I/O,
// modify control characters (e.g. Ctrl-C for SIGINT), etc.
// The following man pages are good resources for how to affect the line
// discipline:
//
//   * termios(3)
//   * tty_ioctl(4)
//
// This file corresponds most closely to drivers/tty/n_tty.c.
//
// lineDiscipline has a simple structure but supports a multitude of options
// (see the above man pages). It consists of two queues of bytes: one from the
// terminal master to replica (the input queue) and one from replica to master
// (the output queue). When bytes are written to one end of the pty, the line
// discipline reads the bytes, modifies them or takes special action if
// required, and enqueues them to be read by the other end of the pty:
//
//    input from terminal   +-------------+   input to process (e.g. bash)
//  +----------------------->| input queue |--------------------------+
//  |    (inputQueueWrite)   +-------------+     (inputQueueRead)     |
//  |                                                                 |
//  |                                                                 v
// masterFD                                                       replicaFD
//  ^                                                                 |
//  |                                                                 |
//  |   output to terminal  +--------------+   output from process   |
//  +-----------------------| output queue |<-------------------------+
//     (outputQueueRead)    +--------------+   (outputQueueWrite)
//
// There is special handling for the ECHO option, where bytes written to the
// input queue are also output back to the terminal by being written to
// l.outQueue by the input queue transformer.
//
// Lock order:
//  termiosMu
//    inQueue.mu
//      outQueue.mu
//
// +stateify savable
type lineDiscipline struct {
	// sizeMu protects size.
	sizeMu sync.Mutex `state:"nosave"`

	// size is the terminal size (width and height).
	size linux.WindowSize

	// inQueue is the input queue of the terminal.
	inQueue queue

	// outQueue is the output queue of the terminal.
	outQueue queue

	// termiosMu protects termios.
	termiosMu sync.RWMutex `state:"nosave"`

	// termios is the terminal configuration used by the lineDiscipline.
	termios linux.KernelTermios

	// column is the location in a row of the cursor. This is important for
	// handling certain special characters like backspace.
	column int

	// masterWaiter is used to wait on the master end of the TTY.
	masterWaiter waiter.Queue

	// replicaWaiter is used to wait on the replica end of the TTY.
	replicaWaiter waiter.Queue
}

func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
	ld := lineDiscipline{termios: termios}
	ld.inQueue.transformer = &inputQueueTransformer{}
	ld.outQueue.transformer = &outputQueueTransformer{}
	return &ld
}

// getTermios gets the linux.Termios for the tty.
func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
	l.termiosMu.RLock()
	defer l.termiosMu.RUnlock()
	// We must copy a Termios struct, not KernelTermios.
	t := l.termios.ToTermios()
	_, err := t.CopyOut(task, args[2].Pointer())
	return 0, err
}

// setTermios sets a linux.Termios for the tty.
func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
	l.termiosMu.Lock()
	oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
	// We must copy a Termios struct, not KernelTermios.
	var t linux.Termios
	_, err := t.CopyIn(task, args[2].Pointer())
	l.termios.FromTermios(t)

	// If canonical mode is turned off, move bytes from inQueue's wait
	// buffer to its read buffer. Anything already in the read buffer is
	// now readable.
if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) { l.inQueue.mu.Lock() l.inQueue.pushWaitBufLocked(l) l.inQueue.readable = true l.inQueue.mu.Unlock() l.termiosMu.Unlock() l.replicaWaiter.Notify(waiter.ReadableEvents) } else { l.termiosMu.Unlock() } return 0, err } func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error { l.sizeMu.Lock() defer l.sizeMu.Unlock() _, err := l.size.CopyOut(t, args[2].Pointer()) return err } func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error { l.sizeMu.Lock() defer l.sizeMu.Unlock() _, err := l.size.CopyIn(t, args[2].Pointer()) return err } func (l *lineDiscipline) masterReadiness() waiter.EventMask { // We don't have to lock a termios because the default master termios // is immutable. return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios) } func (l *lineDiscipline) replicaReadiness() waiter.EventMask { l.termiosMu.RLock() defer l.termiosMu.RUnlock() return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios) } func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { return l.inQueue.readableSize(t, io, args) } func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { l.termiosMu.RLock() n, pushed, notifyEcho, err := l.inQueue.read(ctx, dst, l) l.termiosMu.RUnlock() if err != nil { return 0, err } if n > 0 { if notifyEcho { l.masterWaiter.Notify(waiter.ReadableEvents | waiter.WritableEvents) } else { l.masterWaiter.Notify(waiter.WritableEvents) } if pushed { l.replicaWaiter.Notify(waiter.ReadableEvents) } return n, nil } else if notifyEcho { l.masterWaiter.Notify(waiter.ReadableEvents) } return 0, syserror.ErrWouldBlock } func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { l.termiosMu.RLock() n, notifyEcho, err := l.inQueue.write(ctx, src, l) l.termiosMu.RUnlock() if err != nil { return 0, err } if notifyEcho { l.masterWaiter.Notify(waiter.ReadableEvents) } if n > 0 { l.replicaWaiter.Notify(waiter.ReadableEvents) return n, nil } return 0, syserror.ErrWouldBlock } func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error { return l.outQueue.readableSize(t, io, args) } func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) { l.termiosMu.RLock() // Ignore notifyEcho, as it cannot happen when reading from the output queue. n, pushed, _, err := l.outQueue.read(ctx, dst, l) l.termiosMu.RUnlock() if err != nil { return 0, err } if n > 0 { l.replicaWaiter.Notify(waiter.WritableEvents) if pushed { l.masterWaiter.Notify(waiter.ReadableEvents) } return n, nil } return 0, syserror.ErrWouldBlock } func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) { l.termiosMu.RLock() // Ignore notifyEcho, as it cannot happen when writing to the output queue. n, _, err := l.outQueue.write(ctx, src, l) l.termiosMu.RUnlock() if err != nil { return 0, err } if n > 0 { l.masterWaiter.Notify(waiter.ReadableEvents) return n, nil } return 0, syserror.ErrWouldBlock } // transformer is a helper interface to make it easier to stateify queue. type transformer interface { // transform functions require queue's mutex to be held. // The boolean indicates whether there was any echoed bytes. 
transform(*lineDiscipline, *queue, []byte) (int, bool) } // outputQueueTransformer implements transformer. It performs line discipline // transformations on the output queue. // // +stateify savable type outputQueueTransformer struct{} // transform does output processing for one end of the pty. See // drivers/tty/n_tty.c:do_output_char for an analogous kernel function. // // Preconditions: // * l.termiosMu must be held for reading. // * q.mu must be held. func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) (int, bool) { // transformOutput is effectively always in noncanonical mode, as the // master termios never has ICANON set. if !l.termios.OEnabled(linux.OPOST) { q.readBuf = append(q.readBuf, buf...) if len(q.readBuf) > 0 { q.readable = true } return len(buf), false } var ret int for len(buf) > 0 { size := l.peek(buf) cBytes := append([]byte{}, buf[:size]...) ret += size buf = buf[size:] // We're guaranteed that cBytes has at least one element. switch cBytes[0] { case '\n': if l.termios.OEnabled(linux.ONLRET) { l.column = 0 } if l.termios.OEnabled(linux.ONLCR) { q.readBuf = append(q.readBuf, '\r', '\n') continue } case '\r': if l.termios.OEnabled(linux.ONOCR) && l.column == 0 { continue } if l.termios.OEnabled(linux.OCRNL) { cBytes[0] = '\n' if l.termios.OEnabled(linux.ONLRET) { l.column = 0 } break } l.column = 0 case '\t': spaces := spacesPerTab - l.column%spacesPerTab if l.termios.OutputFlags&linux.TABDLY == linux.XTABS { l.column += spaces q.readBuf = append(q.readBuf, bytes.Repeat([]byte{' '}, spacesPerTab)...) continue } l.column += spaces case '\b': if l.column > 0 { l.column-- } default: l.column++ } q.readBuf = append(q.readBuf, cBytes...) } if len(q.readBuf) > 0 { q.readable = true } return ret, false } // inputQueueTransformer implements transformer. It performs line discipline // transformations on the input queue. // // +stateify savable type inputQueueTransformer struct{} // transform does input processing for one end of the pty. Characters read are // transformed according to flags set in the termios struct. See // drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel // function. // It returns an extra boolean indicating whether any characters need to be // echoed, in which case we need to notify readers. // // Preconditions: // * l.termiosMu must be held for reading. // * q.mu must be held. func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) (int, bool) { // If there's a line waiting to be read in canonical mode, don't write // anything else to the read buffer. if l.termios.LEnabled(linux.ICANON) && q.readable { return 0, false } maxBytes := nonCanonMaxBytes if l.termios.LEnabled(linux.ICANON) { maxBytes = canonMaxBytes } var ret int var notifyEcho bool for len(buf) > 0 && len(q.readBuf) < canonMaxBytes { size := l.peek(buf) cBytes := append([]byte{}, buf[:size]...) // We're guaranteed that cBytes has at least one element. switch cBytes[0] { case '\r': if l.termios.IEnabled(linux.IGNCR) { buf = buf[size:] ret += size continue } if l.termios.IEnabled(linux.ICRNL) { cBytes[0] = '\n' } case '\n': if l.termios.IEnabled(linux.INLCR) { cBytes[0] = '\r' } } // In canonical mode, we discard non-terminating characters // after the first 4095. if l.shouldDiscard(q, cBytes) { buf = buf[size:] ret += size continue } // Stop if the buffer would be overfilled. if len(q.readBuf)+size > maxBytes { break } buf = buf[size:] ret += size // If we get EOF, make the buffer available for reading. 
if l.termios.LEnabled(linux.ICANON) && l.termios.IsEOF(cBytes[0]) { q.readable = true break } q.readBuf = append(q.readBuf, cBytes...) // Anything written to the readBuf will have to be echoed. if l.termios.LEnabled(linux.ECHO) { l.outQueue.writeBytes(cBytes, l) notifyEcho = true } // If we finish a line, make it available for reading. if l.termios.LEnabled(linux.ICANON) && l.termios.IsTerminating(cBytes) { q.readable = true break } } // In noncanonical mode, everything is readable. if !l.termios.LEnabled(linux.ICANON) && len(q.readBuf) > 0 { q.readable = true } return ret, notifyEcho } // shouldDiscard returns whether c should be discarded. In canonical mode, if // too many bytes are enqueued, we keep reading input and discarding it until // we find a terminating character. Signal/echo processing still occurs. // // Precondition: // * l.termiosMu must be held for reading. // * q.mu must be held. func (l *lineDiscipline) shouldDiscard(q *queue, cBytes []byte) bool { return l.termios.LEnabled(linux.ICANON) && len(q.readBuf)+len(cBytes) >= canonMaxBytes && !l.termios.IsTerminating(cBytes) } // peek returns the size in bytes of the next character to process. As long as // b isn't empty, peek returns a value of at least 1. func (l *lineDiscipline) peek(b []byte) int { size := 1 // If UTF-8 support is enabled, runes might be multiple bytes. if l.termios.IEnabled(linux.IUTF8) { _, size = utf8.DecodeRune(b) } return size }
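// A minimal sketch of the output-side ONLCR rule implemented by
// outputQueueTransformer.transform above (standalone and illustrative; it
// handles only '\n' and ignores the other OPOST cases):
//
//	func onlcr(in []byte) []byte {
//		out := make([]byte, 0, len(in)+bytes.Count(in, []byte{'\n'}))
//		for _, b := range in {
//			if b == '\n' {
//				// ONLCR: NL is emitted as CR NL.
//				out = append(out, '\r', '\n')
//				continue
//			}
//			out = append(out, b)
//		}
//		return out
//	}
//
// So writing "hi\n" to the replica with OPOST|ONLCR set yields "hi\r\n" on
// the master side.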
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package raw provides the implementation of raw sockets (see raw(7)).
Raw // sockets allow applications to: // // * manually write and inspect transport layer headers and payloads // * receive all traffic of a given transport protocol (e.g. ICMP or UDP) // * optionally write and inspect network layer headers of packets // // Raw sockets don't have any notion of ports, and incoming packets are // demultiplexed solely by protocol number. Thus, a raw UDP endpoint will // receive every UDP packet received by netstack. bind(2) and connect(2) can be // used to filter incoming packets by source and destination. package raw import ( "io" "time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) // +stateify savable type rawPacket struct { rawPacketEntry // data holds the actual packet data, including any headers and // payload. data buffer.VectorisedView `state:".(buffer.VectorisedView)"` receivedAt time.Time `state:".(int64)"` // senderAddr is the network address of the sender. senderAddr tcpip.FullAddress } // endpoint is the raw socket implementation of tcpip.Endpoint. It is legal to // have goroutines make concurrent calls into the endpoint. // // Lock order: // endpoint.mu // endpoint.rcvMu // // +stateify savable type endpoint struct { stack.TransportEndpointInfo tcpip.DefaultSocketOptionsHandler // The following fields are initialized at creation time and are // immutable. stack *stack.Stack `state:"manual"` waiterQueue *waiter.Queue associated bool // The following fields are used to manage the receive queue and are // protected by rcvMu. rcvMu sync.Mutex `state:"nosave"` rcvList rawPacketList rcvBufSize int rcvClosed bool // The following fields are protected by mu. mu sync.RWMutex `state:"nosave"` closed bool connected bool bound bool // route is the route to a remote network endpoint. It is set via // Connect(), and is valid only when connected is true. route *stack.Route `state:"manual"` stats tcpip.TransportEndpointStats `state:"nosave"` // owner is used to get uid and gid of the packet. owner tcpip.PacketOwner // ops is used to get socket level options. ops tcpip.SocketOptions // frozen indicates whether delivery of packets to the endpoint is // suspended (used during save/restore). frozen bool } // NewEndpoint returns a raw endpoint for the given protocols. func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { return newEndpoint(stack, netProto, transProto, waiterQueue, true /* associated */) } func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, tcpip.Error) { if netProto != header.IPv4ProtocolNumber && netProto != header.IPv6ProtocolNumber { return nil, &tcpip.ErrUnknownProtocol{} } e := &endpoint{ stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{ NetProto: netProto, TransProto: transProto, }, waiterQueue: waiterQueue, associated: associated, } e.ops.InitHandler(e, e.stack, tcpip.GetStackSendBufferLimits, tcpip.GetStackReceiveBufferLimits) e.ops.SetHeaderIncluded(!associated) e.ops.SetSendBufferSize(32*1024, false /* notify */) e.ops.SetReceiveBufferSize(32*1024, false /* notify */) // Override with stack defaults.
var ss tcpip.SendBufferSizeOption if err := s.Option(&ss); err == nil { e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) } var rs tcpip.ReceiveBufferSizeOption if err := s.Option(&rs); err == nil { e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */) } // Unassociated endpoints are write-only and users call Write() with IP // headers included. Because they're write-only, we don't need to // register with the stack. if !associated { e.ops.SetReceiveBufferSize(0, false /* notify */) e.waiterQueue = nil return e, nil } if err := e.stack.RegisterRawTransportEndpoint(e.NetProto, e.TransProto, e); err != nil { return nil, err } return e, nil } // Abort implements stack.TransportEndpoint.Abort. func (e *endpoint) Abort() { e.Close() } // Close implements tcpip.Endpoint.Close. func (e *endpoint) Close() { e.mu.Lock() defer e.mu.Unlock() if e.closed || !e.associated { return } e.stack.UnregisterRawTransportEndpoint(e.NetProto, e.TransProto, e) e.rcvMu.Lock() defer e.rcvMu.Unlock() // Clear the receive list. e.rcvClosed = true e.rcvBufSize = 0 for !e.rcvList.Empty() { e.rcvList.Remove(e.rcvList.Front()) } e.connected = false if e.route != nil { e.route.Release() e.route = nil } e.closed = true e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) } // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. func (*endpoint) ModerateRecvBuf(int) {} func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.mu.Lock() defer e.mu.Unlock() e.owner = owner } // Read implements tcpip.Endpoint.Read. func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { e.rcvMu.Lock() // If there's no data to read, return that read would block or that the // endpoint is closed. if e.rcvList.Empty() { var err tcpip.Error = &tcpip.ErrWouldBlock{} if e.rcvClosed { e.stats.ReadErrors.ReadClosed.Increment() err = &tcpip.ErrClosedForReceive{} } e.rcvMu.Unlock() return tcpip.ReadResult{}, err } pkt := e.rcvList.Front() if !opts.Peek { e.rcvList.Remove(pkt) e.rcvBufSize -= pkt.data.Size() } e.rcvMu.Unlock() res := tcpip.ReadResult{ Total: pkt.data.Size(), ControlMessages: tcpip.ControlMessages{ HasTimestamp: true, Timestamp: pkt.receivedAt.UnixNano(), }, } if opts.NeedRemoteAddr { res.RemoteAddr = pkt.senderAddr } n, err := pkt.data.ReadTo(dst, opts.Peek) if n == 0 && err != nil { return res, &tcpip.ErrBadBuffer{} } res.Count = n return res, nil } // Write implements tcpip.Endpoint.Write. func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { // We can create, but not write to, unassociated IPv6 endpoints. if !e.associated && e.TransportEndpointInfo.NetProto == header.IPv6ProtocolNumber { return 0, &tcpip.ErrInvalidOptionValue{} } if opts.To != nil { // Raw sockets do not support sending to an IPv4 address on an IPv6 endpoint. if e.TransportEndpointInfo.NetProto == header.IPv6ProtocolNumber && len(opts.To.Addr) != header.IPv6AddressSize { return 0, &tcpip.ErrInvalidOptionValue{} } } n, err := e.write(p, opts) switch err.(type) { case nil: e.stats.PacketsSent.Increment() case *tcpip.ErrMessageTooLong, *tcpip.ErrInvalidOptionValue: e.stats.WriteErrors.InvalidArgs.Increment() case *tcpip.ErrClosedForSend: e.stats.WriteErrors.WriteClosed.Increment() case *tcpip.ErrInvalidEndpointState: e.stats.WriteErrors.InvalidEndpointState.Increment() case *tcpip.ErrNoRoute, *tcpip.ErrBroadcastDisabled, *tcpip.ErrNetworkUnreachable: // Errors indicating any problem with IP routing of the packet.
e.stats.SendErrors.NoRoute.Increment() default: // For all other errors when writing to the network layer. e.stats.SendErrors.SendToNetworkFailed.Increment() } return n, err } func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { // MSG_MORE is unimplemented. This also means that MSG_EOR is a no-op. if opts.More { return 0, &tcpip.ErrInvalidOptionValue{} } payloadBytes, route, owner, err := func() ([]byte, *stack.Route, tcpip.PacketOwner, tcpip.Error) { e.mu.RLock() defer e.mu.RUnlock() if e.closed { return nil, nil, nil, &tcpip.ErrInvalidEndpointState{} } payloadBytes := make([]byte, p.Len()) if _, err := io.ReadFull(p, payloadBytes); err != nil { return nil, nil, nil, &tcpip.ErrBadBuffer{} } // Did the caller provide a destination? If not, use the connected // destination. if opts.To == nil { // If the user doesn't specify a destination, they should have // connected to another address. if !e.connected { return nil, nil, nil, &tcpip.ErrDestinationRequired{} } e.route.Acquire() return payloadBytes, e.route, e.owner, nil } // The caller provided a destination. Reject destination address if it // goes through a different NIC than the endpoint was bound to. nic := opts.To.NIC if e.bound && nic != 0 && nic != e.BindNICID { return nil, nil, nil, &tcpip.ErrNoRoute{} } // Find the route to the destination. If BindAddress is 0, // FindRoute will choose an appropriate source address. route, err := e.stack.FindRoute(nic, e.BindAddr, opts.To.Addr, e.NetProto, false) if err != nil { return nil, nil, nil, err } return payloadBytes, route, e.owner, nil }() if err != nil { return 0, err } defer route.Release() if e.ops.GetHeaderIncluded() { pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buffer.View(payloadBytes).ToVectorisedView(), }) if err := route.WriteHeaderIncludedPacket(pkt); err != nil { return 0, err } } else { pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(route.MaxHeaderLength()), Data: buffer.View(payloadBytes).ToVectorisedView(), }) pkt.Owner = owner if err := route.WritePacket(stack.NetworkHeaderParams{ Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS, }, pkt); err != nil { return 0, err } } return int64(len(payloadBytes)), nil } // Disconnect implements tcpip.Endpoint.Disconnect. func (*endpoint) Disconnect() tcpip.Error { return &tcpip.ErrNotSupported{} } // Connect implements tcpip.Endpoint.Connect. func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { // Raw sockets do not support connecting to an IPv4 address on an IPv6 endpoint. if e.TransportEndpointInfo.NetProto == header.IPv6ProtocolNumber && len(addr.Addr) != header.IPv6AddressSize { return &tcpip.ErrAddressFamilyNotSupported{} } e.mu.Lock() defer e.mu.Unlock() if e.closed { return &tcpip.ErrInvalidEndpointState{} } nic := addr.NIC if e.bound { if e.BindNICID == 0 { // If we're bound, but not to a specific NIC, the NIC // in addr will be used. Nothing to do here. } else if addr.NIC == 0 { // If we're bound to a specific NIC, but addr doesn't // specify a NIC, use the bound NIC. nic = e.BindNICID } else if addr.NIC != e.BindNICID { // We're bound and addr specifies a NIC. They must be // the same. return &tcpip.ErrInvalidEndpointState{} } } // Find a route to the destination. route, err := e.stack.FindRoute(nic, "", addr.Addr, e.NetProto, false) if err != nil { return err } if e.associated { // Re-register the endpoint with the appropriate NIC.
if err := e.stack.RegisterRawTransportEndpoint(e.NetProto, e.TransProto, e); err != nil { route.Release() return err } e.stack.UnregisterRawTransportEndpoint(e.NetProto, e.TransProto, e) e.RegisterNICID = nic } if e.route != nil { // If the endpoint was previously connected then release any previous route. e.route.Release() } e.route = route e.connected = true return nil } // Shutdown implements tcpip.Endpoint.Shutdown. It's a noop for raw sockets. func (e *endpoint) Shutdown(tcpip.ShutdownFlags) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() if !e.connected { return &tcpip.ErrNotConnected{} } return nil } // Listen implements tcpip.Endpoint.Listen. func (*endpoint) Listen(int) tcpip.Error { return &tcpip.ErrNotSupported{} } // Accept implements tcpip.Endpoint.Accept. func (*endpoint) Accept(*tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { return nil, nil, &tcpip.ErrNotSupported{} } // Bind implements tcpip.Endpoint.Bind. func (e *endpoint) Bind(addr tcpip.FullAddress) tcpip.Error { e.mu.Lock() defer e.mu.Unlock() // If a local address was specified, verify that it's valid. if len(addr.Addr) != 0 && e.stack.CheckLocalAddress(e.RegisterNICID, e.NetProto, addr.Addr) == 0 { return &tcpip.ErrBadLocalAddress{} } if e.associated { // Re-register the endpoint with the appropriate NIC. if err := e.stack.RegisterRawTransportEndpoint(e.NetProto, e.TransProto, e); err != nil { return err } e.stack.UnregisterRawTransportEndpoint(e.NetProto, e.TransProto, e) e.RegisterNICID = addr.NIC e.BindNICID = addr.NIC } e.BindAddr = addr.Addr e.bound = true return nil } // GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. func (*endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { return tcpip.FullAddress{}, &tcpip.ErrNotSupported{} } // GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. func (*endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { // Even a connected socket doesn't return a remote address. return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} } // Readiness implements tcpip.Endpoint.Readiness. func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // The endpoint is always writable. result := waiter.WritableEvents & mask // Determine whether the endpoint is readable. if (mask & waiter.ReadableEvents) != 0 { e.rcvMu.Lock() if !e.rcvList.Empty() || e.rcvClosed { result |= waiter.ReadableEvents } e.rcvMu.Unlock() } return result } // SetSockOpt implements tcpip.Endpoint.SetSockOpt. func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { switch opt.(type) { case *tcpip.SocketDetachFilterOption: return nil default: return &tcpip.ErrUnknownProtocolOption{} } } func (*endpoint) SetSockOptInt(tcpip.SockOptInt, int) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (*endpoint) GetSockOpt(tcpip.GettableSocketOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { switch opt { case tcpip.ReceiveQueueSizeOption: v := 0 e.rcvMu.Lock() if !e.rcvList.Empty() { p := e.rcvList.Front() v = p.data.Size() } e.rcvMu.Unlock() return v, nil default: return -1, &tcpip.ErrUnknownProtocolOption{} } } // HandlePacket implements stack.RawTransportEndpoint.HandlePacket. 
func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { e.mu.RLock() e.rcvMu.Lock() // Drop the packet if our buffer is currently full or if this is an unassociated // endpoint (i.e. endpoint created w/ IPPROTO_RAW). Such endpoints are send only. // See: https://man7.org/linux/man-pages/man7/raw.7.html // // An IPPROTO_RAW socket is send only. If you really want to receive // all IP packets, use a packet(7) socket with the ETH_P_IP protocol. // Note that packet sockets don't reassemble IP fragments, unlike raw // sockets. if e.rcvClosed || !e.associated { e.rcvMu.Unlock() e.mu.RUnlock() e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.ClosedReceiver.Increment() return } rcvBufSize := e.ops.GetReceiveBufferSize() if e.frozen || e.rcvBufSize >= int(rcvBufSize) { e.rcvMu.Unlock() e.mu.RUnlock() e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.ReceiveBufferOverflow.Increment() return } remoteAddr := pkt.Network().SourceAddress() if e.bound { // If bound to a NIC, only accept data for that NIC. if e.BindNICID != 0 && e.BindNICID != pkt.NICID { e.rcvMu.Unlock() e.mu.RUnlock() return } // If bound to an address, only accept data for that address. if e.BindAddr != "" && e.BindAddr != remoteAddr { e.rcvMu.Unlock() e.mu.RUnlock() return } } // If connected, only accept packets from the remote address we // connected to. if e.connected && e.route.RemoteAddress() != remoteAddr { e.rcvMu.Unlock() e.mu.RUnlock() return } wasEmpty := e.rcvBufSize == 0 // Push new packet into receive list and increment the buffer size. packet := &rawPacket{ senderAddr: tcpip.FullAddress{ NIC: pkt.NICID, Addr: remoteAddr, }, } // Raw IPv4 endpoints return the IP header, but IPv6 endpoints do not. // We copy headers' underlying bytes because pkt.*Header may point to // the middle of a slice, and another struct may point to the "outer" // slice. Save/restore doesn't support overlapping slices and will fail. var combinedVV buffer.VectorisedView if e.TransportEndpointInfo.NetProto == header.IPv4ProtocolNumber { network, transport := pkt.NetworkHeader().View(), pkt.TransportHeader().View() headers := make(buffer.View, 0, len(network)+len(transport)) headers = append(headers, network...) headers = append(headers, transport...) combinedVV = headers.ToVectorisedView() } else { combinedVV = append(buffer.View(nil), pkt.TransportHeader().View()...).ToVectorisedView() } combinedVV.Append(pkt.Data().ExtractVV()) packet.data = combinedVV packet.receivedAt = e.stack.Clock().Now() e.rcvList.PushBack(packet) e.rcvBufSize += packet.data.Size() e.rcvMu.Unlock() e.mu.RUnlock() e.stats.PacketsReceived.Increment() // Notify waiters that there's data to be read. if wasEmpty { e.waiterQueue.Notify(waiter.ReadableEvents) } } // State implements socket.Socket.State. func (e *endpoint) State() uint32 { return 0 } // Info returns a copy of the endpoint info. func (e *endpoint) Info() tcpip.EndpointInfo { e.mu.RLock() // Make a copy of the endpoint info. ret := e.TransportEndpointInfo e.mu.RUnlock() return &ret } // Stats returns a pointer to the endpoint stats. func (e *endpoint) Stats() tcpip.EndpointStats { return &e.stats } // Wait implements stack.TransportEndpoint.Wait. func (*endpoint) Wait() {} // LastError implements tcpip.Endpoint.LastError. func (*endpoint) LastError() tcpip.Error { return nil } // SocketOptions implements tcpip.Endpoint.SocketOptions. func (e *endpoint) SocketOptions() *tcpip.SocketOptions { return &e.ops } // freeze prevents any more packets from being delivered to the endpoint.
func (e *endpoint) freeze() { e.mu.Lock() e.frozen = true e.mu.Unlock() } // thaw unfreezes a previously frozen endpoint (see endpoint.freeze()), allowing // new packets to be delivered again. func (e *endpoint) thaw() { e.mu.Lock() e.frozen = false e.mu.Unlock() }
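For comparison with HandlePacket above, the same raw(7) semantics — demultiplexing purely by protocol number, with the IPv4 header included on receive — can be observed with a host raw socket. This is an illustrative sketch only, not netstack code; it requires CAP_NET_RAW and assumes a Linux host.

// Open an IPPROTO_ICMP raw socket and read one packet. As with the IPv4
// branch of HandlePacket above, the packet arrives with its IP header.
package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	fd, err := unix.Socket(unix.AF_INET, unix.SOCK_RAW, unix.IPPROTO_ICMP)
	if err != nil {
		panic(err) // typically EPERM without CAP_NET_RAW
	}
	defer unix.Close(fd)

	buf := make([]byte, 65536)
	n, from, err := unix.Recvfrom(fd, buf, 0)
	if err != nil {
		panic(err)
	}
	// The IP header is included, so the IHL field in the first byte tells
	// us where the ICMP message begins.
	ihl := int(buf[0]&0x0f) * 4
	fmt.Printf("got %d bytes from %v, ICMP type %d\n", n, from, buf[ihl])
}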
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" ) // Getcwd implements Linux syscall getcwd(2). func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() size := args[1].SizeT() root := t.FSContext().RootDirectoryVFS2() wd := t.FSContext().WorkingDirectoryVFS2() s, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd) root.DecRef(t) wd.DecRef(t) if err != nil { return 0, nil, err } // Note this is >= because we need a terminator. if uint(len(s)) >= size { return 0, nil, linuxerr.ERANGE } // Construct a byte slice containing a NUL terminator. buf := t.CopyScratchBuffer(len(s) + 1) copy(buf, s) buf[len(buf)-1] = 0 // Write the pathname slice. n, err := t.CopyOutBytes(addr, buf) if err != nil { return 0, nil, err } return uintptr(n), nil, nil } // Chdir implements Linux syscall chdir(2). func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() path, err := copyInPath(t, addr) if err != nil { return 0, nil, err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, }) if err != nil { return 0, nil, err } t.FSContext().SetWorkingDirectoryVFS2(t, vd) vd.DecRef(t) return 0, nil, nil } // Fchdir implements Linux syscall fchdir(2). func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() tpop, err := getTaskPathOperation(t, fd, fspath.Path{}, allowEmptyPath, nofollowFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, }) if err != nil { return 0, nil, err } t.FSContext().SetWorkingDirectoryVFS2(t, vd) vd.DecRef(t) return 0, nil, nil } // Chroot implements Linux syscall chroot(2).
func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() if !t.HasCapability(linux.CAP_SYS_CHROOT) { return 0, nil, linuxerr.EPERM } path, err := copyInPath(t, addr) if err != nil { return 0, nil, err } tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink) if err != nil { return 0, nil, err } defer tpop.Release(t) vd, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &tpop.pop, &vfs.GetDentryOptions{ CheckSearchable: true, }) if err != nil { return 0, nil, err } t.FSContext().SetRootDirectoryVFS2(t, vd) vd.DecRef(t) return 0, nil, nil }
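The >= check in Getcwd above (the buffer must hold the path plus a NUL terminator) matches getcwd(2), and the same contract is visible from userspace. An illustrative sketch, assuming a Linux (or gVisor) host:

// A 1-byte buffer cannot hold even "/" plus its terminator, so getcwd
// fails with ERANGE; a large buffer succeeds.
package main

import (
	"bytes"
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	small := make([]byte, 1)
	if _, err := unix.Getcwd(small); err != nil {
		fmt.Println("small buffer:", err) // ERANGE
	}

	big := make([]byte, 4096)
	n, err := unix.Getcwd(big)
	if err != nil {
		panic(err)
	}
	// Trim at the NUL terminator, if the returned length includes it.
	if i := bytes.IndexByte(big[:n], 0); i >= 0 {
		n = i
	}
	fmt.Println("cwd:", string(big[:n]))
}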
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package usage import ( "time" ) // CPUStats contains the subset of struct rusage fields that relate to CPU // scheduling. // // +stateify savable type CPUStats struct { // UserTime is the amount of time spent executing application code. UserTime time.Duration // SysTime is the amount of time spent executing sentry code. SysTime time.Duration // VoluntarySwitches is the number of times control has been voluntarily // ceded due to blocking, etc. VoluntarySwitches uint64 // InvoluntarySwitches (struct rusage::ru_nivcsw) is unsupported, since // "preemptive" scheduling is managed by the Go runtime, which doesn't // provide this information. } // Accumulate adds s2 to s. func (s *CPUStats) Accumulate(s2 CPUStats) { s.UserTime += s2.UserTime s.SysTime += s2.SysTime s.VoluntarySwitches += s2.VoluntarySwitches }
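Accumulate is a plain field-wise sum, so aggregating per-task stats into a total is just repeated accumulation. A self-contained illustrative sketch, with the type duplicated locally and hypothetical values:

package main

import (
	"fmt"
	"time"
)

// CPUStats mirrors the struct above for a standalone example.
type CPUStats struct {
	UserTime          time.Duration
	SysTime           time.Duration
	VoluntarySwitches uint64
}

func (s *CPUStats) Accumulate(s2 CPUStats) {
	s.UserTime += s2.UserTime
	s.SysTime += s2.SysTime
	s.VoluntarySwitches += s2.VoluntarySwitches
}

func main() {
	var total CPUStats
	for _, task := range []CPUStats{
		{UserTime: 2 * time.Second, SysTime: 500 * time.Millisecond, VoluntarySwitches: 10},
		{UserTime: 1 * time.Second, SysTime: 250 * time.Millisecond, VoluntarySwitches: 4},
	} {
		total.Accumulate(task)
	}
	fmt.Printf("%+v\n", total) // {UserTime:3s SysTime:750ms VoluntarySwitches:14}
}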
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "bytes" "fmt" "runtime" "runtime/trace" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/goid" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/hostcpu" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/platform" "gvisor.dev/gvisor/pkg/syserror" ) // A taskRunState is a reified state in the task state machine. See README.md // for details. The canonical list of all run states, as well as transitions // between them, is given in run_states.dot. // // The set of possible states is enumerable and completely defined by the // kernel package, so taskRunState would ideally be represented by a // discriminated union. However, Go does not support sum types. // // Hence, as with TaskStop, data-free taskRunStates should be represented as // typecast nils to avoid unnecessary allocation. type taskRunState interface { // execute executes the code associated with this state over the given task // and returns the following state. If execute returns nil, the task // goroutine should exit.
// // It is valid to tail-call a following state's execute to avoid the // overhead of converting the following state to an interface object and // checking for stops, provided that the tail-call cannot recurse. execute(*Task) taskRunState } // run runs the task goroutine. // // threadID is a dummy value set to the task's TID in the root PID namespace to // make it visible in stack dumps. A goroutine for a given task can be identified // by searching for Task.run()'s argument value. func (t *Task) run(threadID uintptr) { atomic.StoreInt64(&t.goid, goid.Get()) // Construct t.blockingTimer here. We do this here because we can't // reconstruct t.blockingTimer during restore in Task.afterLoad(), because // kernel.timekeeper.SetClocks() hasn't been called yet. blockingTimerNotifier, blockingTimerChan := ktime.NewChannelNotifier() t.blockingTimer = ktime.NewTimer(t.k.MonotonicClock(), blockingTimerNotifier) defer t.blockingTimer.Destroy() t.blockingTimerChan = blockingTimerChan // Activate our address space. t.Activate() // The corresponding t.Deactivate occurs in the exit path // (runExitMain.execute) so that when // Platform.CooperativelySharesAddressSpace() == true, we give up the // AddressSpace before the task goroutine finishes executing. // If this is a newly-started task, it should check for participation in // group stops. If this is a task resuming after restore, it was // interrupted by saving. In either case, the task is initially // interrupted. t.interruptSelf() for { // Explanation for this ordering: // // - A freshly-started task that is stopped should not do anything // before it enters the stop. // // - If taskRunState.execute returns nil, the task goroutine should // exit without checking for a stop. // // - Task.Start won't start Task.run if t.runState is nil, so this // ordering is safe. t.doStop() t.runState = t.runState.execute(t) if t.runState == nil { t.accountTaskGoroutineEnter(TaskGoroutineNonexistent) t.goroutineStopped.Done() t.tg.liveGoroutines.Done() t.tg.pidns.owner.liveGoroutines.Done() t.tg.pidns.owner.runningGoroutines.Done() t.p.Release() // Deferring this store triggers a false positive in the race // detector (https://github.com/golang/go/issues/42599). atomic.StoreInt64(&t.goid, 0) // Keep argument alive because stack trace for dead variables may not be correct. runtime.KeepAlive(threadID) return } } } // doStop is called by Task.run to block until the task is not stopped. func (t *Task) doStop() { if atomic.LoadInt32(&t.stopCount) == 0 { return } t.Deactivate() // NOTE(b/30316266): t.Activate() must be called without any locks held, so // this defer must precede the defer for unlocking the signal mutex. defer t.Activate() t.accountTaskGoroutineEnter(TaskGoroutineStopped) defer t.accountTaskGoroutineLeave(TaskGoroutineStopped) t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() t.tg.pidns.owner.runningGoroutines.Add(-1) defer t.tg.pidns.owner.runningGoroutines.Add(1) t.goroutineStopped.Add(-1) defer t.goroutineStopped.Add(1) for t.stopCount > 0 { t.endStopCond.Wait() } } func (*runApp) handleCPUIDInstruction(t *Task) error { if len(arch.CPUIDInstruction) == 0 { // CPUID emulation isn't supported, but this code can be // executed, because the ptrace platform returns // ErrContextSignalCPUID on page faults too. Look at // pkg/sentry/platform/ptrace/ptrace.go:context.Switch for more // details. return platform.ErrContextSignal } // Is this a CPUID instruction?
region := trace.StartRegion(t.traceContext, cpuidRegion) expected := arch.CPUIDInstruction[:] found := make([]byte, len(expected)) _, err := t.CopyInBytes(hostarch.Addr(t.Arch().IP()), found) if err == nil && bytes.Equal(expected, found) { // Skip the cpuid instruction. t.Arch().CPUIDEmulate(t) t.Arch().SetIP(t.Arch().IP() + uintptr(len(expected))) region.End() return nil } region.End() // Not an actual CPUID, but required copy-in. return platform.ErrContextSignal } // The runApp state checks for interrupts before executing untrusted // application code. // // +stateify savable type runApp struct{} func (app *runApp) execute(t *Task) taskRunState { if t.interrupted() { // Checkpointing instructs tasks to stop by sending an interrupt, so we // must check for stops before entering runInterrupt (instead of // tail-calling it). return (*runInterrupt)(nil) } // Execute any task work callbacks before returning to user space. if atomic.LoadInt32(&t.taskWorkCount) > 0 { t.taskWorkMu.Lock() queue := t.taskWork t.taskWork = nil atomic.StoreInt32(&t.taskWorkCount, 0) t.taskWorkMu.Unlock() // Do not hold taskWorkMu while executing task work, which may register // more work. for _, work := range queue { work.TaskWork(t) } } // We're about to switch to the application again. If there's still an // unhandled SyscallRestartErrno that wasn't translated to an EINTR, // restart the syscall that was interrupted. If there's a saved signal // mask, restore it. (Note that restoring the saved signal mask may unblock // a pending signal, causing another interruption, but that signal should // not interact with the interrupted syscall.) if t.haveSyscallReturn { if sre, ok := syserror.SyscallRestartErrnoFromReturn(t.Arch().Return()); ok { if sre == syserror.ERESTART_RESTARTBLOCK { t.Debugf("Restarting syscall %d with restart block after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) t.Arch().RestartSyscallWithRestartBlock() } else { t.Debugf("Restarting syscall %d after errno %d: not interrupted by handled signal", t.Arch().SyscallNo(), sre) t.Arch().RestartSyscall() } } t.haveSyscallReturn = false } if t.haveSavedSignalMask { t.SetSignalMask(t.savedSignalMask) t.haveSavedSignalMask = false if t.interrupted() { return (*runInterrupt)(nil) } } // Apply restartable sequences. if t.rseqPreempted { t.rseqPreempted = false if t.rseqAddr != 0 || t.oldRSeqCPUAddr != 0 { // Linux writes the CPU on every preemption. We only do // so if it changed. Thus we may delay delivery of // SIGSEGV if rseqAddr/oldRSeqCPUAddr is invalid. cpu := int32(hostcpu.GetCPU()) if t.rseqCPU != cpu { t.rseqCPU = cpu if err := t.rseqCopyOutCPU(); err != nil { t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err) t.forceSignal(linux.SIGSEGV, false) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) // Re-enter the task run loop for signal delivery. return (*runApp)(nil) } if err := t.oldRSeqCopyOutCPU(); err != nil { t.Debugf("Failed to copy CPU to %#x for old rseq: %v", t.oldRSeqCPUAddr, err) t.forceSignal(linux.SIGSEGV, false) t.SendSignal(SignalInfoPriv(linux.SIGSEGV)) // Re-enter the task run loop for signal delivery. return (*runApp)(nil) } } } t.rseqInterrupt() } // Check if we need to enable single-stepping. Tracers expect that the // kernel preserves the value of the single-step flag set by PTRACE_SETREGS // whether or not PTRACE_SINGLESTEP/PTRACE_SYSEMU_SINGLESTEP is used (this // includes our ptrace platform, by the way), so we should only clear the // single-step flag if we're responsible for setting it. 
(clearSinglestep // is therefore analogous to Linux's TIF_FORCED_TF.) // // Strictly speaking, we should also not clear the single-step flag if we // single-step through an instruction that sets the single-step flag // (arch/x86/kernel/step.c:is_setting_trap_flag()). But nobody sets their // own TF. (Famous last words, I know.) clearSinglestep := false if t.hasTracer() { t.tg.pidns.owner.mu.RLock() if t.ptraceSinglestep { clearSinglestep = !t.Arch().SingleStep() t.Arch().SetSingleStep() } t.tg.pidns.owner.mu.RUnlock() } region := trace.StartRegion(t.traceContext, runRegion) t.accountTaskGoroutineEnter(TaskGoroutineRunningApp) info, at, err := t.p.Switch(t, t.MemoryManager(), t.Arch(), t.rseqCPU) t.accountTaskGoroutineLeave(TaskGoroutineRunningApp) region.End() if clearSinglestep { t.Arch().ClearSingleStep() } switch err { case nil: // Handle application system call. return t.doSyscall() case platform.ErrContextInterrupt: // Interrupted by platform.Context.Interrupt(). Re-enter the run // loop to figure out why. return (*runApp)(nil) case platform.ErrContextSignalCPUID: if err := app.handleCPUIDInstruction(t); err == nil { // Resume execution. return (*runApp)(nil) } // The instruction at the given RIP was not a CPUID, and we // fall through to the default signal delivery behavior below. fallthrough case platform.ErrContextSignal: // Looks like a signal has been delivered to us. If it's a synchronous // signal (SEGV, SIGBUS, etc.), it should be sent to the application // thread that received it. sig := linux.Signal(info.Signo) // Was it a fault that we should handle internally? If so, this wasn't // an application-generated signal and we should continue execution // normally. if at.Any() { region := trace.StartRegion(t.traceContext, faultRegion) addr := hostarch.Addr(info.Addr()) err := t.MemoryManager().HandleUserFault(t, addr, at, hostarch.Addr(t.Arch().Stack())) region.End() if err == nil { // The fault was handled appropriately. // We can resume running the application. return (*runApp)(nil) } // Is this a vsyscall that we need to emulate? // // Note that we don't track vsyscalls as part of a // specific trace region. This is because regions don't // stack, and the actual system call will count as a // region. We should be able to easily identify // vsyscalls by having a <fault><syscall> pair. if at.Execute { if sysno, ok := t.image.st.LookupEmulate(addr); ok { return t.doVsyscall(addr, sysno) } } // Faults are common, log only at debug level. t.Debugf("Unhandled user fault: addr=%x ip=%x access=%v err=%v", addr, t.Arch().IP(), at, err) t.DebugDumpState() // Continue to signal handling. // // Convert a BusError to a SIGBUS from a SIGSEGV. All // other info bits stay the same (address, etc.). if _, ok := err.(*memmap.BusError); ok { sig = linux.SIGBUS info.Signo = int32(linux.SIGBUS) } } switch sig { case linux.SIGILL, linux.SIGSEGV, linux.SIGBUS, linux.SIGFPE, linux.SIGTRAP: // Synchronous signal. Send it to ourselves. Assume the signal is // legitimate and force it (work around the signal being ignored or // blocked) like Linux does. Conveniently, this is even the correct // behavior for SIGTRAP from single-stepping. t.forceSignal(linux.Signal(sig), false /* unconditional */) t.SendSignal(info) case platform.SignalInterrupt: // Assume that a call to platform.Context.Interrupt() misfired. case linux.SIGPROF: // It's a profiling interrupt: there's not much // we can do. We've already paid a decent cost // by intercepting the signal; at this point we // simply ignore it.
default: // Asynchronous signal. Let the system deal with it. t.k.sendExternalSignal(info, "application") } return (*runApp)(nil) case platform.ErrContextCPUPreempted: // Ensure that rseq critical sections are interrupted and per-thread // CPU values are updated before the next platform.Context.Switch(). t.rseqPreempted = true return (*runApp)(nil) default: // What happened? Can't continue. t.Warningf("Unexpected SwitchToApp error: %v", err) t.PrepareExit(linux.WaitStatusExit(int32(ExtractErrno(err, -1)))) return (*runExit)(nil) } } // assertTaskGoroutine panics if the caller is not running on t's task // goroutine. func (t *Task) assertTaskGoroutine() { if got, want := goid.Get(), atomic.LoadInt64(&t.goid); got != want { panic(fmt.Sprintf("running on goroutine %d (task goroutine for kernel.Task %p is %d)", got, t, want)) } } // GoroutineID returns the ID of t's task goroutine. func (t *Task) GoroutineID() int64 { return atomic.LoadInt64(&t.goid) } // waitGoroutineStoppedOrExited blocks until t's task goroutine stops or exits. func (t *Task) waitGoroutineStoppedOrExited() { t.goroutineStopped.Wait() } // WaitExited blocks until all task goroutines in tg have exited. // // WaitExited does not correspond to anything in Linux; it's provided so that // external callers of Kernel.CreateProcess can wait for the created thread // group to terminate. func (tg *ThreadGroup) WaitExited() { tg.liveGoroutines.Wait() } // Yield yields the processor for the calling task. func (t *Task) Yield() { atomic.AddUint64(&t.yieldCount, 1) runtime.Gosched() }
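Stripped of stops, accounting, and signal handling, the run loop above reduces to a small state machine: each state's execute returns the next state, and nil means the goroutine exits. A minimal standalone sketch with hypothetical states (not gVisor's own), including the "typecast nil" trick the taskRunState comment describes:

package main

import "fmt"

type runState interface {
	execute() runState
}

type runApp struct{ steps int }
type runExit struct{}

func (a *runApp) execute() runState {
	fmt.Println("running app, steps left:", a.steps)
	if a.steps == 0 {
		// Data-free states can be represented as typecast nils to
		// avoid allocation, as noted above.
		return (*runExit)(nil)
	}
	a.steps--
	return a // transition back to ourselves
}

func (*runExit) execute() runState {
	fmt.Println("exiting")
	return nil // nil tells the loop to stop
}

func main() {
	var s runState = &runApp{steps: 2}
	for s != nil {
		s = s.execute()
	}
}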
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bpf" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/syserror" ) const maxSyscallFilterInstructions = 1 << 15 // dataAsBPFInput returns d serialized as BPF input, only valid on the current // task goroutine. // // Note: this is called for every syscall, which is a very hot path. func dataAsBPFInput(t *Task, d *linux.SeccompData) bpf.Input { buf := t.CopyScratchBuffer(d.SizeBytes()) d.MarshalUnsafe(buf) return bpf.InputBytes{ Data: buf, // Go-marshal always uses the native byte order. Order: hostarch.ByteOrder, } } func seccompSiginfo(t *Task, errno, sysno int32, ip hostarch.Addr) *linux.SignalInfo { si := &linux.SignalInfo{ Signo: int32(linux.SIGSYS), Errno: errno, Code: linux.SYS_SECCOMP, } si.SetCallAddr(uint64(ip)) si.SetSyscall(sysno) si.SetArch(t.SyscallTable().AuditNumber) return si } // checkSeccompSyscall applies the task's seccomp filters before the execution // of syscall sysno at instruction pointer ip. (These parameters must be passed // in because vsyscalls do not use the values in t.Arch().) // // Preconditions: The caller must be running on the task goroutine. func (t *Task) checkSeccompSyscall(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) linux.BPFAction { result := linux.BPFAction(t.evaluateSyscallFilters(sysno, args, ip)) action := result & linux.SECCOMP_RET_ACTION switch action { case linux.SECCOMP_RET_TRAP: // "Results in the kernel sending a SIGSYS signal to the triggering // task without executing the system call. ... The SECCOMP_RET_DATA // portion of the return value will be passed as si_errno." - // Documentation/prctl/seccomp_filter.txt t.SendSignal(seccompSiginfo(t, int32(result.Data()), sysno, ip)) // "The return value register will contain an arch-dependent value." In // practice, it's ~always the syscall number. t.Arch().SetReturn(uintptr(sysno)) case linux.SECCOMP_RET_ERRNO: // "Results in the lower 16-bits of the return value being passed to // userland as the errno without executing the system call."
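// (Illustrative example, not from the original source: result.Data() is the low 16 bits of the filter's return value, so a filter returning SECCOMP_RET_ERRNO|1 makes the syscall return -1, which userspace reports as errno 1, i.e. EPERM.)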
t.Arch().SetReturn(-uintptr(result.Data())) case linux.SECCOMP_RET_TRACE: // "When returned, this value will cause the kernel to attempt to // notify a ptrace()-based tracer prior to executing the system call. // If there is no tracer present, -ENOSYS is returned to userland and // the system call is not executed." if !t.ptraceSeccomp(result.Data()) { // This useless-looking temporary is needed because Go. tmp := uintptr(unix.ENOSYS) t.Arch().SetReturn(-tmp) return linux.SECCOMP_RET_ERRNO } case linux.SECCOMP_RET_ALLOW: // "Results in the system call being executed." case linux.SECCOMP_RET_KILL_THREAD: // "Results in the task exiting immediately without executing the // system call. The exit status of the task will be SIGSYS, not // SIGKILL." default: // consistent with Linux return linux.SECCOMP_RET_KILL_THREAD } return action } func (t *Task) evaluateSyscallFilters(sysno int32, args arch.SyscallArguments, ip hostarch.Addr) uint32 { data := linux.SeccompData{ Nr: sysno, Arch: t.image.st.AuditNumber, InstructionPointer: uint64(ip), } // data.args is []uint64 and args is []arch.SyscallArgument (uintptr), so // we can't do any slicing tricks or even use copy/append here. for i, arg := range args { if i >= len(data.Args) { break } data.Args[i] = arg.Uint64() } input := dataAsBPFInput(t, &data) ret := uint32(linux.SECCOMP_RET_ALLOW) f := t.syscallFilters.Load() if f == nil { return ret } // "Every filter successfully installed will be evaluated (in reverse // order) for each system call the task makes." - kernel/seccomp.c for i := len(f.([]bpf.Program)) - 1; i >= 0; i-- { thisRet, err := bpf.Exec(f.([]bpf.Program)[i], input) if err != nil { t.Debugf("seccomp-bpf filter %d returned error: %v", i, err) thisRet = uint32(linux.SECCOMP_RET_KILL_THREAD) } // "If multiple filters exist, the return value for the evaluation of a // given system call will always use the highest precedent value." - // Documentation/prctl/seccomp_filter.txt // // (Note that this contradicts prctl(2): "If the filters permit prctl() // calls, then additional filters can be added; they are run in order // until the first non-allow result is seen." prctl(2) is incorrect.) // // "The ordering ensures that a min_t() over composed return values // always selects the least permissive choice." - // include/uapi/linux/seccomp.h if (thisRet & linux.SECCOMP_RET_ACTION) < (ret & linux.SECCOMP_RET_ACTION) { ret = thisRet } } return ret } // AppendSyscallFilter adds BPF program p as a system call filter. // // Preconditions: The caller must be running on the task goroutine. func (t *Task) AppendSyscallFilter(p bpf.Program, syncAll bool) error { // While syscallFilters is an atomic.Value, we must take the mutex to prevent // our read-copy-update from happening while another task is syncing syscall // filters to us; this keeps the filters in a consistent state. t.tg.signalHandlers.mu.Lock() defer t.tg.signalHandlers.mu.Unlock() // Cap the combined length of all syscall filters (plus a penalty of 4 // instructions per filter beyond the first) to maxSyscallFilterInstructions. // This restriction is inherited from Linux. totalLength := p.Length() var newFilters []bpf.Program if sf := t.syscallFilters.Load(); sf != nil { oldFilters := sf.([]bpf.Program) for _, f := range oldFilters { totalLength += f.Length() + 4 } newFilters = append(newFilters, oldFilters...)
} if totalLength > maxSyscallFilterInstructions { return syserror.ENOMEM } newFilters = append(newFilters, p) t.syscallFilters.Store(newFilters) if syncAll { // Note: No new privs is always assumed to be set. for ot := t.tg.tasks.Front(); ot != nil; ot = ot.Next() { if ot != t { var copiedFilters []bpf.Program copiedFilters = append(copiedFilters, newFilters...) ot.syscallFilters.Store(copiedFilters) } } } return nil } // SeccompMode returns a SECCOMP_MODE_* constant indicating the task's current // seccomp syscall filtering mode, appropriate for both prctl(PR_GET_SECCOMP) // and /proc/[pid]/status. func (t *Task) SeccompMode() int { f := t.syscallFilters.Load() if f != nil && len(f.([]bpf.Program)) > 0 { return linux.SECCOMP_MODE_FILTER } return linux.SECCOMP_MODE_NONE }
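The precedence rule quoted in evaluateSyscallFilters above — the result whose SECCOMP_RET_ACTION bits are numerically smallest wins, i.e. the least permissive action — can be checked with a few constants. An illustrative standalone sketch using Linux's documented action values, independent of the bpf package:

package main

import "fmt"

const (
	seccompRetActionMask = 0x7fff0000 // SECCOMP_RET_ACTION
	seccompRetKillThread = 0x00000000 // SECCOMP_RET_KILL_THREAD
	seccompRetErrno      = 0x00050000 // SECCOMP_RET_ERRNO
	seccompRetAllow      = 0x7fff0000 // SECCOMP_RET_ALLOW
)

// combine mirrors the min-over-actions selection in evaluateSyscallFilters:
// filters are walked newest-first and the smallest action value is kept.
func combine(results []uint32) uint32 {
	ret := uint32(seccompRetAllow)
	for i := len(results) - 1; i >= 0; i-- {
		if results[i]&seccompRetActionMask < ret&seccompRetActionMask {
			ret = results[i]
		}
	}
	return ret
}

func main() {
	// One filter allows the syscall, another forces ERRNO|EPERM; ERRNO has
	// the smaller action value, so it wins.
	fmt.Printf("%#x\n", combine([]uint32{seccompRetAllow, seccompRetErrno | 1})) // 0x50001
}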
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/usermem" ) // opathFD implements FileDescriptionImpl for a file description opened with O_PATH. // // +stateify savable type opathFD struct { vfsfd FileDescription FileDescriptionDefaultImpl BadLockFD } // Release implements FileDescriptionImpl.Release. func (fd *opathFD) Release(context.Context) { // noop } // Allocate implements FileDescriptionImpl.Allocate. func (fd *opathFD) Allocate(ctx context.Context, mode, offset, length uint64) error { return linuxerr.EBADF } // PRead implements FileDescriptionImpl.PRead. func (fd *opathFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { return 0, linuxerr.EBADF } // Read implements FileDescriptionImpl.Read. func (fd *opathFD) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { return 0, linuxerr.EBADF } // PWrite implements FileDescriptionImpl.PWrite. func (fd *opathFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { return 0, linuxerr.EBADF } // Write implements FileDescriptionImpl.Write. func (fd *opathFD) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { return 0, linuxerr.EBADF } // Ioctl implements FileDescriptionImpl.Ioctl. func (fd *opathFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { return 0, linuxerr.EBADF } // IterDirents implements FileDescriptionImpl.IterDirents. func (fd *opathFD) IterDirents(ctx context.Context, cb IterDirentsCallback) error { return linuxerr.EBADF } // Seek implements FileDescriptionImpl.Seek. func (fd *opathFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { return 0, linuxerr.EBADF } // ConfigureMMap implements FileDescriptionImpl.ConfigureMMap. func (fd *opathFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return linuxerr.EBADF } // ListXattr implements FileDescriptionImpl.ListXattr. func (fd *opathFD) ListXattr(ctx context.Context, size uint64) ([]string, error) { return nil, linuxerr.EBADF } // GetXattr implements FileDescriptionImpl.GetXattr.
func (fd *opathFD) GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) { return "", linuxerr.EBADF } // SetXattr implements FileDescriptionImpl.SetXattr. func (fd *opathFD) SetXattr(ctx context.Context, opts SetXattrOptions) error { return linuxerr.EBADF } // RemoveXattr implements FileDescriptionImpl.RemoveXattr. func (fd *opathFD) RemoveXattr(ctx context.Context, name string) error { return linuxerr.EBADF } // Sync implements FileDescriptionImpl.Sync. func (fd *opathFD) Sync(ctx context.Context) error { return linuxerr.EBADF } // SetStat implements FileDescriptionImpl.SetStat. func (fd *opathFD) SetStat(ctx context.Context, opts SetStatOptions) error { return linuxerr.EBADF } // Stat implements FileDescriptionImpl.Stat. func (fd *opathFD) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) { vfsObj := fd.vfsfd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vfsfd.vd, Start: fd.vfsfd.vd, }) stat, err := fd.vfsfd.vd.mount.fs.impl.StatAt(ctx, rp, opts) rp.Release(ctx) return stat, err } // StatFS returns metadata for the filesystem containing the file represented // by fd. func (fd *opathFD) StatFS(ctx context.Context) (linux.Statfs, error) { vfsObj := fd.vfsfd.vd.mount.vfs rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{ Root: fd.vfsfd.vd, Start: fd.vfsfd.vd, }) statfs, err := fd.vfsfd.vd.mount.fs.impl.StatFSAt(ctx, rp) rp.Release(ctx) return statfs, err }
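opathFD's EBADF-for-everything-except-stat behavior mirrors O_PATH on Linux: read(2) and write(2) fail with EBADF, while fstat(2) still succeeds. An illustrative host-side sketch of the same contract (the path is arbitrary; requires a Linux host):

package main

import (
	"fmt"

	"golang.org/x/sys/unix"
)

func main() {
	fd, err := unix.Open("/etc/hostname", unix.O_PATH, 0)
	if err != nil {
		panic(err)
	}
	defer unix.Close(fd)

	buf := make([]byte, 16)
	if _, err := unix.Read(fd, buf); err != nil {
		fmt.Println("read:", err) // EBADF, as in opathFD.Read above
	}

	var st unix.Stat_t
	if err := unix.Fstat(fd, &st); err == nil {
		fmt.Println("size:", st.Size) // stat still works, as in opathFD.Stat
	}
}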
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/safecopy"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/usage"
)

// existingPMAsLocked checks that pmas exist for all addresses in ar, and
// support access of type (at, ignorePermissions). If so, it returns an
// iterator to the pma containing ar.Start. Otherwise it returns a terminal
// iterator.
//
// Preconditions:
// * mm.activeMu must be locked.
// * ar.Length() != 0.
func (mm *MemoryManager) existingPMAsLocked(ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) pmaIterator {
	if checkInvariants {
		if !ar.WellFormed() || ar.Length() == 0 {
			panic(fmt.Sprintf("invalid ar: %v", ar))
		}
	}

	first := mm.pmas.FindSegment(ar.Start)
	pseg := first
	for pseg.Ok() {
		pma := pseg.ValuePtr()
		perms := pma.effectivePerms
		if ignorePermissions {
			perms = pma.maxPerms
		}
		if !perms.SupersetOf(at) {
			return pmaIterator{}
		}
		if needInternalMappings && pma.internalMappings.IsEmpty() {
			return pmaIterator{}
		}
		if ar.End <= pseg.End() {
			return first
		}
		pseg, _ = pseg.NextNonEmpty()
	}

	// Ran out of pmas before reaching ar.End.
	return pmaIterator{}
}

// existingVecPMAsLocked returns true if pmas exist for all addresses in ars,
// and support access of type (at, ignorePermissions).
//
// Preconditions: mm.activeMu must be locked.
func (mm *MemoryManager) existingVecPMAsLocked(ars hostarch.AddrRangeSeq, at hostarch.AccessType, ignorePermissions bool, needInternalMappings bool) bool { for ; !ars.IsEmpty(); ars = ars.Tail() { if ar := ars.Head(); ar.Length() != 0 && !mm.existingPMAsLocked(ar, at, ignorePermissions, needInternalMappings).Ok() { return false } } return true } // getPMAsLocked ensures that pmas exist for all addresses in ar, and support // access of type at. It returns: // // - An iterator to the pma containing ar.Start. If no pma contains ar.Start, // the iterator is unspecified. // // - An iterator to the gap after the last pma containing an address in ar. If // pmas exist for no addresses in ar, the iterator is to a gap that begins // before ar.Start. // // - An error that is non-nil if pmas exist for only a subset of ar. // // Preconditions: // * mm.mappingMu must be locked. // * mm.activeMu must be locked for writing. // * ar.Length() != 0. // * vseg.Range().Contains(ar.Start). // * vmas must exist for all addresses in ar, and support accesses of type at // (i.e. permission checks must have been performed against vmas). func (mm *MemoryManager) getPMAsLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (pmaIterator, pmaGapIterator, error) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !vseg.Ok() { panic("terminal vma iterator") } if !vseg.Range().Contains(ar.Start) { panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar)) } } // Page-align ar so that all AddrRanges are aligned. end, ok := ar.End.RoundUp() var alignerr error if !ok { end = ar.End.RoundDown() alignerr = linuxerr.EFAULT } ar = hostarch.AddrRange{ar.Start.RoundDown(), end} pstart, pend, perr := mm.getPMAsInternalLocked(ctx, vseg, ar, at) if pend.Start() <= ar.Start { return pmaIterator{}, pend, perr } // getPMAsInternalLocked may not have returned pstart due to iterator // invalidation. if !pstart.Ok() { pstart = mm.findOrSeekPrevUpperBoundPMA(ar.Start, pend) } if perr != nil { return pstart, pend, perr } return pstart, pend, alignerr } // getVecPMAsLocked ensures that pmas exist for all addresses in ars, and // support access of type at. It returns the subset of ars for which pmas // exist. If this is not equal to ars, it returns a non-nil error explaining // why. // // Preconditions: // * mm.mappingMu must be locked. // * mm.activeMu must be locked for writing. // * vmas must exist for all addresses in ars, and support accesses of type at // (i.e. permission checks must have been performed against vmas). func (mm *MemoryManager) getVecPMAsLocked(ctx context.Context, ars hostarch.AddrRangeSeq, at hostarch.AccessType) (hostarch.AddrRangeSeq, error) { for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { ar := arsit.Head() if ar.Length() == 0 { continue } if checkInvariants { if !ar.WellFormed() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } // Page-align ar so that all AddrRanges are aligned. 
end, ok := ar.End.RoundUp() var alignerr error if !ok { end = ar.End.RoundDown() alignerr = linuxerr.EFAULT } ar = hostarch.AddrRange{ar.Start.RoundDown(), end} _, pend, perr := mm.getPMAsInternalLocked(ctx, mm.vmas.FindSegment(ar.Start), ar, at) if perr != nil { return truncatedAddrRangeSeq(ars, arsit, pend.Start()), perr } if alignerr != nil { return truncatedAddrRangeSeq(ars, arsit, pend.Start()), alignerr } } return ars, nil } // getPMAsInternalLocked is equivalent to getPMAsLocked, with the following // exceptions: // // - getPMAsInternalLocked returns a pmaIterator on a best-effort basis (that // is, the returned iterator may be terminal, even if a pma that contains // ar.Start exists). Returning this iterator on a best-effort basis allows // callers that require it to use it when it's cheaply available, while also // avoiding the overhead of retrieving it when it's not. // // - getPMAsInternalLocked additionally requires that ar is page-aligned. // // getPMAsInternalLocked is an implementation helper for getPMAsLocked and // getVecPMAsLocked; other clients should call one of those instead. func (mm *MemoryManager) getPMAsInternalLocked(ctx context.Context, vseg vmaIterator, ar hostarch.AddrRange, at hostarch.AccessType) (pmaIterator, pmaGapIterator, error) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !vseg.Ok() { panic("terminal vma iterator") } if !vseg.Range().Contains(ar.Start) { panic(fmt.Sprintf("initial vma %v does not cover start of ar %v", vseg.Range(), ar)) } } mf := mm.mfp.MemoryFile() // Limit the range we allocate to ar, aligned to privateAllocUnit. maskAR := privateAligned(ar) didUnmapAS := false // The range in which we iterate vmas and pmas is still limited to ar, to // ensure that we don't allocate or COW-break a pma we don't need. pseg, pgap := mm.pmas.Find(ar.Start) pstart := pseg for { // Get pmas for this vma. vsegAR := vseg.Range().Intersect(ar) vma := vseg.ValuePtr() pmaLoop: for { switch { case pgap.Ok() && pgap.Start() < vsegAR.End: // Need a pma here. optAR := vseg.Range().Intersect(pgap.Range()) if checkInvariants { if optAR.Length() == 0 { panic(fmt.Sprintf("vseg %v and pgap %v do not overlap", vseg, pgap)) } } if vma.mappable == nil { // Private anonymous mappings get pmas by allocating. allocAR := optAR.Intersect(maskAR) fr, err := mf.Allocate(uint64(allocAR.Length()), usage.Anonymous) if err != nil { return pstart, pgap, err } if checkInvariants { if !fr.WellFormed() || fr.Length() != uint64(allocAR.Length()) { panic(fmt.Sprintf("Allocate(%v) returned invalid FileRange %v", allocAR.Length(), fr)) } } mm.addRSSLocked(allocAR) mm.incPrivateRef(fr) mf.IncRef(fr) pseg, pgap = mm.pmas.Insert(pgap, allocAR, pma{ file: mf, off: fr.Start, translatePerms: hostarch.AnyAccess, effectivePerms: vma.effectivePerms, maxPerms: vma.maxPerms, // Since we just allocated this memory and have the // only reference, the new pma does not need // copy-on-write. private: true, }).NextNonEmpty() pstart = pmaIterator{} // iterators invalidated } else { // Other mappings get pmas by translating. optMR := vseg.mappableRangeOf(optAR) reqAR := optAR.Intersect(ar) reqMR := vseg.mappableRangeOf(reqAR) perms := at if vma.private { // This pma will be copy-on-write; don't require write // permission, but do require read permission to // facilitate the copy. // // If at.Write is true, we will need to break // copy-on-write immediately, which occurs after // translation below. 
perms.Read = true perms.Write = false } ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms) if checkInvariants { if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil { panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err)) } } // Install a pma for each translation. if len(ts) == 0 { return pstart, pgap, err } pstart = pmaIterator{} // iterators invalidated for _, t := range ts { newpmaAR := vseg.addrRangeOf(t.Source) newpma := pma{ file: t.File, off: t.Offset, translatePerms: t.Perms, effectivePerms: vma.effectivePerms.Intersect(t.Perms), maxPerms: vma.maxPerms.Intersect(t.Perms), } if vma.private { newpma.effectivePerms.Write = false newpma.maxPerms.Write = false newpma.needCOW = true } mm.addRSSLocked(newpmaAR) t.File.IncRef(t.FileRange()) // This is valid because memmap.Mappable.Translate is // required to return Translations in increasing // Translation.Source order. pseg = mm.pmas.Insert(pgap, newpmaAR, newpma) pgap = pseg.NextGap() } // The error returned by Translate is only significant if // it occurred before ar.End. if err != nil && vseg.addrRangeOf(ts[len(ts)-1].Source).End < ar.End { return pstart, pgap, err } // Rewind pseg to the first pma inserted and continue the // loop to check if we need to break copy-on-write. pseg, pgap = mm.findOrSeekPrevUpperBoundPMA(vseg.addrRangeOf(ts[0].Source).Start, pgap), pmaGapIterator{} continue } case pseg.Ok() && pseg.Start() < vsegAR.End: oldpma := pseg.ValuePtr() if at.Write && mm.isPMACopyOnWriteLocked(vseg, pseg) { // Break copy-on-write by copying. if checkInvariants { if !oldpma.maxPerms.Read { panic(fmt.Sprintf("pma %v needs to be copied for writing, but is not readable: %v", pseg.Range(), oldpma)) } } // The majority of copy-on-write breaks on executable pages // come from: // // - The ELF loader, which must zero out bytes on the last // page of each segment after the end of the segment. // // - gdb's use of ptrace to insert breakpoints. // // Neither of these cases has enough spatial locality to // benefit from copying nearby pages, so if the vma is // executable, only copy the pages required. var copyAR hostarch.AddrRange if vseg.ValuePtr().effectivePerms.Execute { copyAR = pseg.Range().Intersect(ar) } else { copyAR = pseg.Range().Intersect(maskAR) } // Get internal mappings from the pma to copy from. if err := pseg.getInternalMappingsLocked(); err != nil { return pstart, pseg.PrevGap(), err } // Copy contents. fr, err := mf.AllocateAndFill(uint64(copyAR.Length()), usage.Anonymous, &safemem.BlockSeqReader{mm.internalMappingsLocked(pseg, copyAR)}) if _, ok := err.(safecopy.BusError); ok { // If we got SIGBUS during the copy, deliver SIGBUS to // userspace (instead of SIGSEGV) if we're breaking // copy-on-write due to application page fault. err = &memmap.BusError{err} } if fr.Length() == 0 { return pstart, pseg.PrevGap(), err } // Unmap all of maskAR, not just copyAR, to minimize host // syscalls. AddressSpace mappings must be removed before // mm.decPrivateRef(). if !didUnmapAS { mm.unmapASLocked(maskAR) didUnmapAS = true } // Replace the pma with a copy in the part of the address // range where copying was successful. This doesn't change // RSS. 
copyAR.End = copyAR.Start + hostarch.Addr(fr.Length()) if copyAR != pseg.Range() { pseg = mm.pmas.Isolate(pseg, copyAR) pstart = pmaIterator{} // iterators invalidated } oldpma = pseg.ValuePtr() if oldpma.private { mm.decPrivateRef(pseg.fileRange()) } oldpma.file.DecRef(pseg.fileRange()) mm.incPrivateRef(fr) mf.IncRef(fr) oldpma.file = mf oldpma.off = fr.Start oldpma.translatePerms = hostarch.AnyAccess oldpma.effectivePerms = vma.effectivePerms oldpma.maxPerms = vma.maxPerms oldpma.needCOW = false oldpma.private = true oldpma.internalMappings = safemem.BlockSeq{} // Try to merge the pma with its neighbors. if prev := pseg.PrevSegment(); prev.Ok() { if merged := mm.pmas.Merge(prev, pseg); merged.Ok() { pseg = merged pstart = pmaIterator{} // iterators invalidated } } if next := pseg.NextSegment(); next.Ok() { if merged := mm.pmas.Merge(pseg, next); merged.Ok() { pseg = merged pstart = pmaIterator{} // iterators invalidated } } // The error returned by AllocateAndFill is only // significant if it occurred before ar.End. if err != nil && pseg.End() < ar.End { return pstart, pseg.NextGap(), err } // Ensure pseg and pgap are correct for the next iteration // of the loop. pseg, pgap = pseg.NextNonEmpty() } else if !oldpma.translatePerms.SupersetOf(at) { // Get new pmas (with sufficient permissions) by calling // memmap.Mappable.Translate again. if checkInvariants { if oldpma.private { panic(fmt.Sprintf("private pma %v has non-maximal pma.translatePerms: %v", pseg.Range(), oldpma)) } } // Allow the entire pma to be replaced. optAR := pseg.Range() optMR := vseg.mappableRangeOf(optAR) reqAR := optAR.Intersect(ar) reqMR := vseg.mappableRangeOf(reqAR) perms := oldpma.translatePerms.Union(at) ts, err := vma.mappable.Translate(ctx, reqMR, optMR, perms) if checkInvariants { if err := memmap.CheckTranslateResult(reqMR, optMR, perms, ts, err); err != nil { panic(fmt.Sprintf("Mappable(%T).Translate(%v, %v, %v): %v", vma.mappable, reqMR, optMR, perms, err)) } } // Remove the part of the existing pma covered by new // Translations, then insert new pmas. This doesn't change // RSS. Note that we don't need to call unmapASLocked: any // existing AddressSpace mappings are still valid (though // less permissive than the new pmas indicate) until // Invalidate is called, and will be replaced by future // calls to mapASLocked. if len(ts) == 0 { return pstart, pseg.PrevGap(), err } transMR := memmap.MappableRange{ts[0].Source.Start, ts[len(ts)-1].Source.End} transAR := vseg.addrRangeOf(transMR) pseg = mm.pmas.Isolate(pseg, transAR) pseg.ValuePtr().file.DecRef(pseg.fileRange()) pgap = mm.pmas.Remove(pseg) pstart = pmaIterator{} // iterators invalidated for _, t := range ts { newpmaAR := vseg.addrRangeOf(t.Source) newpma := pma{ file: t.File, off: t.Offset, translatePerms: t.Perms, effectivePerms: vma.effectivePerms.Intersect(t.Perms), maxPerms: vma.maxPerms.Intersect(t.Perms), } if vma.private { newpma.effectivePerms.Write = false newpma.maxPerms.Write = false newpma.needCOW = true } t.File.IncRef(t.FileRange()) pseg = mm.pmas.Insert(pgap, newpmaAR, newpma) pgap = pseg.NextGap() } // The error returned by Translate is only significant if // it occurred before ar.End. if err != nil && pseg.End() < ar.End { return pstart, pgap, err } // Ensure pseg and pgap are correct for the next iteration // of the loop. if pgap.Range().Length() == 0 { pseg, pgap = pgap.NextSegment(), pmaGapIterator{} } else { pseg = pmaIterator{} } } else { // We have a usable pma; continue. 
pseg, pgap = pseg.NextNonEmpty() } default: break pmaLoop } } // Go to the next vma. if ar.End <= vseg.End() { if pgap.Ok() { return pstart, pgap, nil } return pstart, pseg.PrevGap(), nil } vseg = vseg.NextSegment() } } const ( // When memory is allocated for a private pma, align the allocated address // range to a privateAllocUnit boundary when possible. Larger values of // privateAllocUnit may reduce page faults by allowing fewer, larger pmas // to be mapped, but may result in larger amounts of wasted memory in the // presence of fragmentation. privateAllocUnit must be a power-of-2 // multiple of hostarch.PageSize. privateAllocUnit = hostarch.HugePageSize privateAllocMask = privateAllocUnit - 1 ) func privateAligned(ar hostarch.AddrRange) hostarch.AddrRange { aligned := hostarch.AddrRange{ar.Start &^ privateAllocMask, ar.End} if end := (ar.End + privateAllocMask) &^ privateAllocMask; end >= ar.End { aligned.End = end } if checkInvariants { if !aligned.IsSupersetOf(ar) { panic(fmt.Sprintf("aligned AddrRange %#v is not a superset of ar %#v", aligned, ar)) } } return aligned } // isPMACopyOnWriteLocked returns true if the contents of the pma represented // by pseg must be copied to a new private pma to be written to. // // If the pma is a copy-on-write private pma, and holds the only reference on // the memory it maps, isPMACopyOnWriteLocked will take ownership of the memory // and update the pma to indicate that it does not require copy-on-write. // // Preconditions: // * vseg.Range().IsSupersetOf(pseg.Range()). // * mm.mappingMu must be locked. // * mm.activeMu must be locked for writing. func (mm *MemoryManager) isPMACopyOnWriteLocked(vseg vmaIterator, pseg pmaIterator) bool { pma := pseg.ValuePtr() if !pma.needCOW { return false } if !pma.private { return true } // If we have the only reference on private memory to be copied, just take // ownership of it instead of copying. If we do hold the only reference, // additional references can only be taken by mm.Fork(), which is excluded // by mm.activeMu, so this isn't racy. mm.privateRefs.mu.Lock() defer mm.privateRefs.mu.Unlock() fr := pseg.fileRange() // This check relies on mm.privateRefs.refs being kept fully merged. rseg := mm.privateRefs.refs.FindSegment(fr.Start) if rseg.Ok() && rseg.Value() == 1 && fr.End <= rseg.End() { pma.needCOW = false // pma.private => pma.translatePerms == hostarch.AnyAccess vma := vseg.ValuePtr() pma.effectivePerms = vma.effectivePerms pma.maxPerms = vma.maxPerms return false } return true } // Invalidate implements memmap.MappingSpace.Invalidate. func (mm *MemoryManager) Invalidate(ar hostarch.AddrRange, opts memmap.InvalidateOpts) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } mm.activeMu.Lock() defer mm.activeMu.Unlock() if mm.captureInvalidations { mm.capturedInvalidations = append(mm.capturedInvalidations, invalidateArgs{ar, opts}) return } mm.invalidateLocked(ar, opts.InvalidatePrivate, true) } // invalidateLocked removes pmas and AddressSpace mappings of those pmas for // addresses in ar. // // Preconditions: // * mm.activeMu must be locked for writing. // * ar.Length() != 0. // * ar must be page-aligned. 
func (mm *MemoryManager) invalidateLocked(ar hostarch.AddrRange, invalidatePrivate, invalidateShared bool) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } var didUnmapAS bool pseg := mm.pmas.LowerBoundSegment(ar.Start) for pseg.Ok() && pseg.Start() < ar.End { pma := pseg.ValuePtr() if (invalidatePrivate && pma.private) || (invalidateShared && !pma.private) { pseg = mm.pmas.Isolate(pseg, ar) pma = pseg.ValuePtr() if !didUnmapAS { // Unmap all of ar, not just pseg.Range(), to minimize host // syscalls. AddressSpace mappings must be removed before // mm.decPrivateRef(). mm.unmapASLocked(ar) didUnmapAS = true } if pma.private { mm.decPrivateRef(pseg.fileRange()) } mm.removeRSSLocked(pseg.Range()) pma.file.DecRef(pseg.fileRange()) pseg = mm.pmas.Remove(pseg).NextSegment() } else { pseg = pseg.NextSegment() } } } // Pin returns the memmap.File ranges currently mapped by addresses in ar in // mm, acquiring a reference on the returned ranges which the caller must // release by calling Unpin. If not all addresses are mapped, Pin returns a // non-nil error. Note that Pin may return both a non-empty slice of // PinnedRanges and a non-nil error. // // Pin does not prevent mapped ranges from changing, making it unsuitable for // most I/O. It should only be used in contexts that would use get_user_pages() // in the Linux kernel. // // Preconditions: // * ar.Length() != 0. // * ar must be page-aligned. func (mm *MemoryManager) Pin(ctx context.Context, ar hostarch.AddrRange, at hostarch.AccessType, ignorePermissions bool) ([]PinnedRange, error) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 || !ar.IsPageAligned() { panic(fmt.Sprintf("invalid ar: %v", ar)) } } // Ensure that we have usable vmas. mm.mappingMu.RLock() vseg, vend, verr := mm.getVMAsLocked(ctx, ar, at, ignorePermissions) if vendaddr := vend.Start(); vendaddr < ar.End { if vendaddr <= ar.Start { mm.mappingMu.RUnlock() return nil, verr } ar.End = vendaddr } // Ensure that we have usable pmas. mm.activeMu.Lock() pseg, pend, perr := mm.getPMAsLocked(ctx, vseg, ar, at) mm.mappingMu.RUnlock() if pendaddr := pend.Start(); pendaddr < ar.End { if pendaddr <= ar.Start { mm.activeMu.Unlock() return nil, perr } ar.End = pendaddr } // Gather pmas. var prs []PinnedRange for pseg.Ok() && pseg.Start() < ar.End { psar := pseg.Range().Intersect(ar) f := pseg.ValuePtr().file fr := pseg.fileRangeOf(psar) f.IncRef(fr) prs = append(prs, PinnedRange{ Source: psar, File: f, Offset: fr.Start, }) pseg = pseg.NextSegment() } mm.activeMu.Unlock() // Return the first error in order of progress through ar. if perr != nil { return prs, perr } return prs, verr } // PinnedRanges are returned by MemoryManager.Pin. type PinnedRange struct { // Source is the corresponding range of addresses. Source hostarch.AddrRange // File is the mapped file. File memmap.File // Offset is the offset into File at which this PinnedRange begins. Offset uint64 } // FileRange returns the memmap.File offsets mapped by pr. func (pr PinnedRange) FileRange() memmap.FileRange { return memmap.FileRange{pr.Offset, pr.Offset + uint64(pr.Source.Length())} } // Unpin releases the reference held by prs. func Unpin(prs []PinnedRange) { for i := range prs { prs[i].File.DecRef(prs[i].FileRange()) } } // movePMAsLocked moves all pmas in oldAR to newAR. // // Preconditions: // * mm.activeMu must be locked for writing. // * oldAR.Length() != 0. // * oldAR.Length() <= newAR.Length(). // * !oldAR.Overlaps(newAR). 
// * mm.pmas.IsEmptyRange(newAR). // * oldAR and newAR must be page-aligned. func (mm *MemoryManager) movePMAsLocked(oldAR, newAR hostarch.AddrRange) { if checkInvariants { if !oldAR.WellFormed() || oldAR.Length() == 0 || !oldAR.IsPageAligned() { panic(fmt.Sprintf("invalid oldAR: %v", oldAR)) } if !newAR.WellFormed() || newAR.Length() == 0 || !newAR.IsPageAligned() { panic(fmt.Sprintf("invalid newAR: %v", newAR)) } if oldAR.Length() > newAR.Length() { panic(fmt.Sprintf("old address range %v may contain pmas that will not fit in new address range %v", oldAR, newAR)) } if oldAR.Overlaps(newAR) { panic(fmt.Sprintf("old and new address ranges overlap: %v, %v", oldAR, newAR)) } // mm.pmas.IsEmptyRange is checked by mm.pmas.Insert. } type movedPMA struct { oldAR hostarch.AddrRange pma pma } var movedPMAs []movedPMA pseg := mm.pmas.LowerBoundSegment(oldAR.Start) for pseg.Ok() && pseg.Start() < oldAR.End { pseg = mm.pmas.Isolate(pseg, oldAR) movedPMAs = append(movedPMAs, movedPMA{ oldAR: pseg.Range(), pma: pseg.Value(), }) pseg = mm.pmas.Remove(pseg).NextSegment() // No RSS change is needed since we're re-inserting the same pmas // below. } off := newAR.Start - oldAR.Start pgap := mm.pmas.FindGap(newAR.Start) for i := range movedPMAs { mpma := &movedPMAs[i] pmaNewAR := hostarch.AddrRange{mpma.oldAR.Start + off, mpma.oldAR.End + off} pgap = mm.pmas.Insert(pgap, pmaNewAR, mpma.pma).NextGap() } mm.unmapASLocked(oldAR) } // getPMAInternalMappingsLocked ensures that pmas for all addresses in ar have // cached internal mappings. It returns: // // - An iterator to the gap after the last pma with internal mappings // containing an address in ar. If internal mappings exist for no addresses in // ar, the iterator is to a gap that begins before ar.Start. // // - An error that is non-nil if internal mappings exist for only a subset of // ar. // // Preconditions: // * mm.activeMu must be locked for writing. // * pseg.Range().Contains(ar.Start). // * pmas must exist for all addresses in ar. // * ar.Length() != 0. // // Postconditions: getPMAInternalMappingsLocked does not invalidate iterators // into mm.pmas. func (mm *MemoryManager) getPMAInternalMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange) (pmaGapIterator, error) { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !pseg.Range().Contains(ar.Start) { panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) } } for { if err := pseg.getInternalMappingsLocked(); err != nil { return pseg.PrevGap(), err } if ar.End <= pseg.End() { return pseg.NextGap(), nil } pseg, _ = pseg.NextNonEmpty() } } // getVecPMAInternalMappingsLocked ensures that pmas for all addresses in ars // have cached internal mappings. It returns the subset of ars for which // internal mappings exist. If this is not equal to ars, it returns a non-nil // error explaining why. // // Preconditions: // * mm.activeMu must be locked for writing. // * pmas must exist for all addresses in ar. // // Postconditions: getVecPMAInternalMappingsLocked does not invalidate iterators // into mm.pmas. 
func (mm *MemoryManager) getVecPMAInternalMappingsLocked(ars hostarch.AddrRangeSeq) (hostarch.AddrRangeSeq, error) { for arsit := ars; !arsit.IsEmpty(); arsit = arsit.Tail() { ar := arsit.Head() if ar.Length() == 0 { continue } if pend, err := mm.getPMAInternalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); err != nil { return truncatedAddrRangeSeq(ars, arsit, pend.Start()), err } } return ars, nil } // internalMappingsLocked returns internal mappings for addresses in ar. // // Preconditions: // * mm.activeMu must be locked. // * Internal mappings must have been previously established for all addresses // in ar. // * ar.Length() != 0. // * pseg.Range().Contains(ar.Start). func (mm *MemoryManager) internalMappingsLocked(pseg pmaIterator, ar hostarch.AddrRange) safemem.BlockSeq { if checkInvariants { if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !pseg.Range().Contains(ar.Start) { panic(fmt.Sprintf("initial pma %v does not cover start of ar %v", pseg.Range(), ar)) } } if ar.End <= pseg.End() { // Since only one pma is involved, we can use pma.internalMappings // directly, avoiding a slice allocation. offset := uint64(ar.Start - pseg.Start()) return pseg.ValuePtr().internalMappings.DropFirst64(offset).TakeFirst64(uint64(ar.Length())) } var ims []safemem.Block for { pr := pseg.Range().Intersect(ar) for pims := pseg.ValuePtr().internalMappings.DropFirst64(uint64(pr.Start - pseg.Start())).TakeFirst64(uint64(pr.Length())); !pims.IsEmpty(); pims = pims.Tail() { ims = append(ims, pims.Head()) } if ar.End <= pseg.End() { break } pseg = pseg.NextSegment() } return safemem.BlockSeqFromSlice(ims) } // vecInternalMappingsLocked returns internal mappings for addresses in ars. // // Preconditions: // * mm.activeMu must be locked. // * Internal mappings must have been previously established for all addresses // in ars. func (mm *MemoryManager) vecInternalMappingsLocked(ars hostarch.AddrRangeSeq) safemem.BlockSeq { var ims []safemem.Block for ; !ars.IsEmpty(); ars = ars.Tail() { ar := ars.Head() if ar.Length() == 0 { continue } for pims := mm.internalMappingsLocked(mm.pmas.FindSegment(ar.Start), ar); !pims.IsEmpty(); pims = pims.Tail() { ims = append(ims, pims.Head()) } } return safemem.BlockSeqFromSlice(ims) } // incPrivateRef acquires a reference on private pages in fr. func (mm *MemoryManager) incPrivateRef(fr memmap.FileRange) { mm.privateRefs.mu.Lock() defer mm.privateRefs.mu.Unlock() refSet := &mm.privateRefs.refs seg, gap := refSet.Find(fr.Start) for { switch { case seg.Ok() && seg.Start() < fr.End: seg = refSet.Isolate(seg, fr) seg.SetValue(seg.Value() + 1) seg, gap = seg.NextNonEmpty() case gap.Ok() && gap.Start() < fr.End: seg, gap = refSet.InsertWithoutMerging(gap, gap.Range().Intersect(fr), 1).NextNonEmpty() default: refSet.MergeAdjacent(fr) return } } } // decPrivateRef releases a reference on private pages in fr. 
func (mm *MemoryManager) decPrivateRef(fr memmap.FileRange) { var freed []memmap.FileRange mm.privateRefs.mu.Lock() refSet := &mm.privateRefs.refs seg := refSet.LowerBoundSegment(fr.Start) for seg.Ok() && seg.Start() < fr.End { seg = refSet.Isolate(seg, fr) if old := seg.Value(); old == 1 { freed = append(freed, seg.Range()) seg = refSet.Remove(seg).NextSegment() } else { seg.SetValue(old - 1) seg = seg.NextSegment() } } refSet.MergeAdjacent(fr) mm.privateRefs.mu.Unlock() mf := mm.mfp.MemoryFile() for _, fr := range freed { mf.DecRef(fr) } } // addRSSLocked updates the current and maximum resident set size of a // MemoryManager to reflect the insertion of a pma at ar. // // Preconditions: mm.activeMu must be locked for writing. func (mm *MemoryManager) addRSSLocked(ar hostarch.AddrRange) { mm.curRSS += uint64(ar.Length()) if mm.curRSS > mm.maxRSS { mm.maxRSS = mm.curRSS } } // removeRSSLocked updates the current resident set size of a MemoryManager to // reflect the removal of a pma at ar. // // Preconditions: mm.activeMu must be locked for writing. func (mm *MemoryManager) removeRSSLocked(ar hostarch.AddrRange) { mm.curRSS -= uint64(ar.Length()) } // pmaSetFunctions implements segment.Functions for pmaSet. type pmaSetFunctions struct{} func (pmaSetFunctions) MinKey() hostarch.Addr { return 0 } func (pmaSetFunctions) MaxKey() hostarch.Addr { return ^hostarch.Addr(0) } func (pmaSetFunctions) ClearValue(pma *pma) { pma.file = nil pma.internalMappings = safemem.BlockSeq{} } func (pmaSetFunctions) Merge(ar1 hostarch.AddrRange, pma1 pma, ar2 hostarch.AddrRange, pma2 pma) (pma, bool) { if pma1.file != pma2.file || pma1.off+uint64(ar1.Length()) != pma2.off || pma1.translatePerms != pma2.translatePerms || pma1.effectivePerms != pma2.effectivePerms || pma1.maxPerms != pma2.maxPerms || pma1.needCOW != pma2.needCOW || pma1.private != pma2.private { return pma{}, false } // Discard internal mappings instead of trying to merge them, since merging // them requires an allocation and getting them again from the // memmap.File might not. pma1.internalMappings = safemem.BlockSeq{} return pma1, true } func (pmaSetFunctions) Split(ar hostarch.AddrRange, p pma, split hostarch.Addr) (pma, pma) { newlen1 := uint64(split - ar.Start) p2 := p p2.off += newlen1 if !p.internalMappings.IsEmpty() { p.internalMappings = p.internalMappings.TakeFirst64(newlen1) p2.internalMappings = p2.internalMappings.DropFirst64(newlen1) } return p, p2 } // findOrSeekPrevUpperBoundPMA returns mm.pmas.UpperBoundSegment(addr), but may do // so by scanning linearly backward from pgap. // // Preconditions: // * mm.activeMu must be locked. // * addr <= pgap.Start(). func (mm *MemoryManager) findOrSeekPrevUpperBoundPMA(addr hostarch.Addr, pgap pmaGapIterator) pmaIterator { if checkInvariants { if !pgap.Ok() { panic("terminal pma iterator") } if addr > pgap.Start() { panic(fmt.Sprintf("can't seek backward to %#x from %#x", addr, pgap.Start())) } } // Optimistically check if pgap.PrevSegment() is the PMA we're looking for, // which is the case if findOrSeekPrevUpperBoundPMA is called to find the // start of a range containing only a single PMA. if pseg := pgap.PrevSegment(); pseg.Start() <= addr { return pseg } return mm.pmas.UpperBoundSegment(addr) } // getInternalMappingsLocked ensures that pseg.ValuePtr().internalMappings is // non-empty. // // Preconditions: mm.activeMu must be locked for writing. 
func (pseg pmaIterator) getInternalMappingsLocked() error { pma := pseg.ValuePtr() if pma.internalMappings.IsEmpty() { // This must use maxPerms (instead of perms) because some permission // constraints are only visible to vmas; for example, mappings of // read-only files have vma.maxPerms.Write unset, but this may not be // visible to the memmap.Mappable. perms := pma.maxPerms // We will never execute application code through an internal mapping. perms.Execute = false ims, err := pma.file.MapInternal(pseg.fileRange(), perms) if err != nil { return err } pma.internalMappings = ims } return nil } func (pseg pmaIterator) fileRange() memmap.FileRange { return pseg.fileRangeOf(pseg.Range()) } // Preconditions: // * pseg.Range().IsSupersetOf(ar). // * ar.Length != 0. func (pseg pmaIterator) fileRangeOf(ar hostarch.AddrRange) memmap.FileRange { if checkInvariants { if !pseg.Ok() { panic("terminal pma iterator") } if !ar.WellFormed() || ar.Length() == 0 { panic(fmt.Sprintf("invalid ar: %v", ar)) } if !pseg.Range().IsSupersetOf(ar) { panic(fmt.Sprintf("ar %v out of bounds %v", ar, pseg.Range())) } } pma := pseg.ValuePtr() pstart := pseg.Start() return memmap.FileRange{pma.off + uint64(ar.Start-pstart), pma.off + uint64(ar.End-pstart)} }
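// pinnedBytesExample is an illustrative sketch, not part of the original
// file: it demonstrates the Pin/Unpin contract described above (Pin may
// return both a non-empty slice of PinnedRanges and a non-nil error, and
// every returned range must be released with Unpin). The function name and
// the use of hostarch.Read are assumptions for illustration only.
func (mm *MemoryManager) pinnedBytesExample(ctx context.Context, ar hostarch.AddrRange) (uint64, error) {
	prs, err := mm.Pin(ctx, ar, hostarch.Read, false /* ignorePermissions */)
	// Unpin releases the references acquired by Pin, including on partial
	// success.
	defer Unpin(prs)
	var n uint64
	for _, pr := range prs {
		// pr.Source is the pinned address range; pr.FileRange() gives the
		// corresponding memmap.File offsets.
		n += uint64(pr.Source.Length())
	}
	return n, err
}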
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package arp implements the ARP network protocol. It is used to resolve
// IPv4 addresses into link-local MAC addresses, and advertises IPv4
// addresses of its stack to the local network.
package arp

import (
	"fmt"
	"reflect"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/header/parse"
	"gvisor.dev/gvisor/pkg/tcpip/network/internal/ip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

const (
	// ProtocolNumber is the ARP protocol number.
	ProtocolNumber = header.ARPProtocolNumber
)

var _ stack.DuplicateAddressDetector = (*endpoint)(nil)
var _ stack.LinkAddressResolver = (*endpoint)(nil)
var _ ip.DADProtocol = (*endpoint)(nil)

// ARP endpoints need to implement stack.NetworkEndpoint because the stack
// considers the layer above the link-layer a network layer; the only
// facility provided by the stack to deliver packets to a layer above
// the link-layer is via stack.NetworkEndpoint.HandlePacket.
var _ stack.NetworkEndpoint = (*endpoint)(nil)

type endpoint struct {
	protocol *protocol

	// enabled is set to 1 when the NIC is enabled and 0 when it is disabled.
	//
	// Must be accessed using atomic operations.
enabled uint32 nic stack.NetworkInterface stats sharedStats mu struct { sync.Mutex dad ip.DAD } } // CheckDuplicateAddress implements stack.DuplicateAddressDetector. func (e *endpoint) CheckDuplicateAddress(addr tcpip.Address, h stack.DADCompletionHandler) stack.DADCheckAddressDisposition { e.mu.Lock() defer e.mu.Unlock() return e.mu.dad.CheckDuplicateAddressLocked(addr, h) } // SetDADConfigurations implements stack.DuplicateAddressDetector. func (e *endpoint) SetDADConfigurations(c stack.DADConfigurations) { e.mu.Lock() defer e.mu.Unlock() e.mu.dad.SetConfigsLocked(c) } // DuplicateAddressProtocol implements stack.DuplicateAddressDetector. func (*endpoint) DuplicateAddressProtocol() tcpip.NetworkProtocolNumber { return header.IPv4ProtocolNumber } // SendDADMessage implements ip.DADProtocol. func (e *endpoint) SendDADMessage(addr tcpip.Address, _ []byte) tcpip.Error { return e.sendARPRequest(header.IPv4Any, addr, header.EthernetBroadcastAddress) } func (e *endpoint) Enable() tcpip.Error { if !e.nic.Enabled() { return &tcpip.ErrNotPermitted{} } e.setEnabled(true) return nil } func (e *endpoint) Enabled() bool { return e.nic.Enabled() && e.isEnabled() } // isEnabled returns true if the endpoint is enabled, regardless of the // enabled status of the NIC. func (e *endpoint) isEnabled() bool { return atomic.LoadUint32(&e.enabled) == 1 } // setEnabled sets the enabled status for the endpoint. func (e *endpoint) setEnabled(v bool) { if v { atomic.StoreUint32(&e.enabled, 1) } else { atomic.StoreUint32(&e.enabled, 0) } } func (e *endpoint) Disable() { e.setEnabled(false) } // DefaultTTL is unused for ARP. It implements stack.NetworkEndpoint. func (*endpoint) DefaultTTL() uint8 { return 0 } func (e *endpoint) MTU() uint32 { lmtu := e.nic.MTU() return lmtu - uint32(e.MaxHeaderLength()) } func (e *endpoint) MaxHeaderLength() uint16 { return e.nic.MaxHeaderLength() + header.ARPSize } func (*endpoint) Close() {} func (*endpoint) WritePacket(*stack.Route, stack.NetworkHeaderParams, *stack.PacketBuffer) tcpip.Error { return &tcpip.ErrNotSupported{} } // NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber. func (*endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { return ProtocolNumber } // WritePackets implements stack.NetworkEndpoint.WritePackets. 
func (*endpoint) WritePackets(*stack.Route, stack.PacketBufferList, stack.NetworkHeaderParams) (int, tcpip.Error) { return 0, &tcpip.ErrNotSupported{} } func (*endpoint) WriteHeaderIncludedPacket(*stack.Route, *stack.PacketBuffer) tcpip.Error { return &tcpip.ErrNotSupported{} } func (e *endpoint) HandlePacket(pkt *stack.PacketBuffer) { stats := e.stats.arp stats.packetsReceived.Increment() if !e.isEnabled() { stats.disabledPacketsReceived.Increment() return } if _, _, ok := e.protocol.Parse(pkt); !ok { stats.malformedPacketsReceived.Increment() return } h := header.ARP(pkt.NetworkHeader().View()) if !h.IsValid() { stats.malformedPacketsReceived.Increment() return } switch h.Op() { case header.ARPRequest: stats.requestsReceived.Increment() localAddr := tcpip.Address(h.ProtocolAddressTarget()) if !e.nic.CheckLocalAddress(header.IPv4ProtocolNumber, localAddr) { stats.requestsReceivedUnknownTargetAddress.Increment() return // we have no useful answer, ignore the request } remoteAddr := tcpip.Address(h.ProtocolAddressSender()) remoteLinkAddr := tcpip.LinkAddress(h.HardwareAddressSender()) switch err := e.nic.HandleNeighborProbe(header.IPv4ProtocolNumber, remoteAddr, remoteLinkAddr); err.(type) { case nil: case *tcpip.ErrNotSupported: // The stack may support ARP but the NIC may not need link resolution. default: panic(fmt.Sprintf("unexpected error when informing NIC of neighbor probe message: %s", err)) } respPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(e.nic.MaxHeaderLength()) + header.ARPSize, }) packet := header.ARP(respPkt.NetworkHeader().Push(header.ARPSize)) respPkt.NetworkProtocolNumber = ProtocolNumber packet.SetIPv4OverEthernet() packet.SetOp(header.ARPReply) // TODO(gvisor.dev/issue/4582): check copied length once TAP devices have a // link address. _ = copy(packet.HardwareAddressSender(), e.nic.LinkAddress()) if n := copy(packet.ProtocolAddressSender(), h.ProtocolAddressTarget()); n != header.IPv4AddressSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize)) } origSender := h.HardwareAddressSender() if n := copy(packet.HardwareAddressTarget(), origSender); n != header.EthernetAddressSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.EthernetAddressSize)) } if n := copy(packet.ProtocolAddressTarget(), h.ProtocolAddressSender()); n != header.IPv4AddressSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize)) } // As per RFC 826, under Packet Reception: // Swap hardware and protocol fields, putting the local hardware and // protocol addresses in the sender fields. // // Send the packet to the (new) target hardware address on the same // hardware on which the request was received. if err := e.nic.WritePacketToRemote(tcpip.LinkAddress(origSender), ProtocolNumber, respPkt); err != nil { stats.outgoingRepliesDropped.Increment() } else { stats.outgoingRepliesSent.Increment() } case header.ARPReply: stats.repliesReceived.Increment() addr := tcpip.Address(h.ProtocolAddressSender()) linkAddr := tcpip.LinkAddress(h.HardwareAddressSender()) e.mu.Lock() e.mu.dad.StopLocked(addr, &stack.DADDupAddrDetected{HolderLinkAddress: linkAddr}) e.mu.Unlock() // The solicited, override, and isRouter flags are not available for ARP; // they are only available for IPv6 Neighbor Advertisements. 
switch err := e.nic.HandleNeighborConfirmation(header.IPv4ProtocolNumber, addr, linkAddr, stack.ReachabilityConfirmationFlags{ // Solicited and unsolicited (also referred to as gratuitous) ARP Replies // are handled equivalently to a solicited Neighbor Advertisement. Solicited: true, // If a different link address is received than the one cached, the entry // should always go to Stale. Override: false, // ARP does not distinguish between router and non-router hosts. IsRouter: false, }); err.(type) { case nil: case *tcpip.ErrNotSupported: // The stack may support ARP but the NIC may not need link resolution. default: panic(fmt.Sprintf("unexpected error when informing NIC of neighbor confirmation message: %s", err)) } } } // Stats implements stack.NetworkEndpoint. func (e *endpoint) Stats() stack.NetworkEndpointStats { return &e.stats.localStats } var _ stack.NetworkProtocol = (*protocol)(nil) type protocol struct { stack *stack.Stack options Options } func (p *protocol) Number() tcpip.NetworkProtocolNumber { return ProtocolNumber } func (p *protocol) MinimumPacketSize() int { return header.ARPSize } func (p *protocol) DefaultPrefixLen() int { return 0 } func (*protocol) ParseAddresses(buffer.View) (src, dst tcpip.Address) { return "", "" } func (p *protocol) NewEndpoint(nic stack.NetworkInterface, _ stack.TransportDispatcher) stack.NetworkEndpoint { e := &endpoint{ protocol: p, nic: nic, } e.mu.Lock() e.mu.dad.Init(&e.mu, p.options.DADConfigs, ip.DADOptions{ Clock: p.stack.Clock(), SecureRNG: p.stack.SecureRNG(), // ARP does not support sending nonce values. NonceSize: 0, Protocol: e, NICID: nic.ID(), }) e.mu.Unlock() tcpip.InitStatCounters(reflect.ValueOf(&e.stats.localStats).Elem()) stackStats := p.stack.Stats() e.stats.arp.init(&e.stats.localStats.ARP, &stackStats.ARP) return e } // LinkAddressProtocol implements stack.LinkAddressResolver.LinkAddressProtocol. func (*endpoint) LinkAddressProtocol() tcpip.NetworkProtocolNumber { return header.IPv4ProtocolNumber } // LinkAddressRequest implements stack.LinkAddressResolver.LinkAddressRequest. func (e *endpoint) LinkAddressRequest(targetAddr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) tcpip.Error { stats := e.stats.arp if len(remoteLinkAddr) == 0 { remoteLinkAddr = header.EthernetBroadcastAddress } if len(localAddr) == 0 { addr, err := e.nic.PrimaryAddress(header.IPv4ProtocolNumber) if err != nil { return err } if len(addr.Address) == 0 { stats.outgoingRequestInterfaceHasNoLocalAddressErrors.Increment() return &tcpip.ErrNetworkUnreachable{} } localAddr = addr.Address } else if !e.nic.CheckLocalAddress(header.IPv4ProtocolNumber, localAddr) { stats.outgoingRequestBadLocalAddressErrors.Increment() return &tcpip.ErrBadLocalAddress{} } return e.sendARPRequest(localAddr, targetAddr, remoteLinkAddr) } func (e *endpoint) sendARPRequest(localAddr, targetAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress) tcpip.Error { pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ ReserveHeaderBytes: int(e.MaxHeaderLength()), }) h := header.ARP(pkt.NetworkHeader().Push(header.ARPSize)) pkt.NetworkProtocolNumber = ProtocolNumber h.SetIPv4OverEthernet() h.SetOp(header.ARPRequest) // TODO(gvisor.dev/issue/4582): check copied length once TAP devices have a // link address. 
_ = copy(h.HardwareAddressSender(), e.nic.LinkAddress()) if n := copy(h.ProtocolAddressSender(), localAddr); n != header.IPv4AddressSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize)) } if n := copy(h.ProtocolAddressTarget(), targetAddr); n != header.IPv4AddressSize { panic(fmt.Sprintf("copied %d bytes, expected %d bytes", n, header.IPv4AddressSize)) } stats := e.stats.arp if err := e.nic.WritePacketToRemote(remoteLinkAddr, ProtocolNumber, pkt); err != nil { stats.outgoingRequestsDropped.Increment() return err } stats.outgoingRequestsSent.Increment() return nil } // ResolveStaticAddress implements stack.LinkAddressResolver.ResolveStaticAddress. func (*endpoint) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { if addr == header.IPv4Broadcast { return header.EthernetBroadcastAddress, true } if header.IsV4MulticastAddress(addr) { return header.EthernetAddressFromMulticastIPv4Address(addr), true } return tcpip.LinkAddress([]byte(nil)), false } // SetOption implements stack.NetworkProtocol.SetOption. func (*protocol) SetOption(tcpip.SettableNetworkProtocolOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // Option implements stack.NetworkProtocol.Option. func (*protocol) Option(tcpip.GettableNetworkProtocolOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // Close implements stack.TransportProtocol.Close. func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} // Parse implements stack.NetworkProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { return 0, false, parse.ARP(pkt) } // Options holds options to configure a protocol. type Options struct { // DADConfigs is the default DAD configurations used by ARP endpoints. DADConfigs stack.DADConfigurations } // NewProtocolWithOptions returns an ARP network protocol factory that // will return an ARP network protocol with the provided options. func NewProtocolWithOptions(opts Options) stack.NetworkProtocolFactory { return func(s *stack.Stack) stack.NetworkProtocol { return &protocol{ stack: s, options: opts, } } } // NewProtocol returns an ARP network protocol. func NewProtocol(s *stack.Stack) stack.NetworkProtocol { return NewProtocolWithOptions(Options{})(s) }
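// Illustrative usage sketch (an assumption, not part of this file): ARP is
// typically registered alongside IPv4 when a stack is created, since it
// resolves IPv4 addresses on broadcast links such as Ethernet:
//
//	s := stack.New(stack.Options{
//		NetworkProtocols: []stack.NetworkProtocolFactory{
//			ipv4.NewProtocol,
//			arp.NewProtocol,
//		},
//	})
//
// To override the default DAD configurations, NewProtocolWithOptions can be
// used in place of NewProtocol:
//
//	arp.NewProtocolWithOptions(arp.Options{
//		DADConfigs: stack.DADConfigurations{
//			DupAddrDetectTransmits: 1,
//			RetransmitTimer:        time.Second,
//		},
//	})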
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build amd64

package arch

import (
	"bytes"
	"fmt"
	"math/rand"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/cpuid"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
	"gvisor.dev/gvisor/pkg/sentry/arch/fpu"
	"gvisor.dev/gvisor/pkg/sentry/limits"
)

// Host specifies the host architecture.
const Host = AMD64

// These constants come directly from Linux.
const (
	// maxAddr64 is the maximum userspace address. It is TASK_SIZE in Linux
	// for a 64-bit process.
	maxAddr64 hostarch.Addr = (1 << 47) - hostarch.PageSize

	// maxStackRand64 is the maximum randomization to apply to the stack.
	// It is defined by arch/x86/mm/mmap.c:stack_maxrandom_size in Linux.
	maxStackRand64 = 16 << 30 // 16 GB

	// maxMmapRand64 is the maximum randomization to apply to the mmap
	// layout. It is defined by arch/x86/mm/mmap.c:arch_mmap_rnd in Linux.
	maxMmapRand64 = (1 << 28) * hostarch.PageSize

	// minGap64 is the minimum gap to leave at the top of the address space
	// for the stack. It is defined by arch/x86/mm/mmap.c:MIN_GAP in Linux.
	minGap64 = (128 << 20) + maxStackRand64

	// preferredPIELoadAddr is the standard Linux position-independent
	// executable base load address. It is ELF_ET_DYN_BASE in Linux.
	//
	// The Platform {Min,Max}UserAddress() may preclude loading at this
	// address. See other preferredFoo comments below.
	preferredPIELoadAddr hostarch.Addr = maxAddr64 / 3 * 2
)

// These constants are selected as heuristics to help make the Platform's
// potentially limited address space conform as closely to Linux as possible.
const (
	// Select a preferred minimum TopDownBase address.
	//
	// Some applications (TSAN and other *SANs) are very particular about
	// the way the Linux mmap allocator lays out the address space.
	//
	// TSAN in particular expects top down allocations to be made in the
	// range [0x7e8000000000, 0x800000000000).
	//
	// The minimum TopDownBase on Linux would be:
	// 0x800000000000 - minGap64 - maxMmapRand64 = 0x7efbf8000000.
	//
	// (minGap64 because TSAN uses a small RLIMIT_STACK.)
	//
	// 0x7e8000000000 is selected arbitrarily by TSAN to leave room for
	// allocations below TopDownBase.
	//
	// N.B. ASAN and MSAN are more forgiving; ASAN allows allocations all
	// the way down to 0x10007fff8000, and MSAN down to 0x700000000000.
	//
	// Of course, there is no hard minimum to allocation; an allocator can
	// search all the way from TopDownBase to Min. However, TSAN declared
	// their range "good enough".
	//
	// We would like to pick a TopDownBase such that it is unlikely that an
	// allocator will select an address below TSAN's minimum. We achieve
	// this by trying to leave a sizable gap below TopDownBase.
	//
	// This is all "preferred" because the layout min/max address may not
	// allow us to select such a TopDownBase, in which case we have to fall
	// back to a layout that TSAN may not be happy with.
	preferredTopDownAllocMin hostarch.Addr = 0x7e8000000000
	preferredAllocationGap                 = 128 << 30 // 128 GB
	preferredTopDownBaseMin                = preferredTopDownAllocMin + preferredAllocationGap

	// minMmapRand64 is the smallest we are willing to make the
	// randomization to stay above preferredTopDownBaseMin.
	minMmapRand64 = (1 << 26) * hostarch.PageSize
)

// context64 represents an AMD64 context.
//
// +stateify savable
type context64 struct {
	State
	sigFPState []fpu.State // fpstate to be restored on sigreturn.
}

// Arch implements Context.Arch.
func (c *context64) Arch() Arch {
	return AMD64
}

func (c *context64) copySigFPState() []fpu.State {
	var sigfps []fpu.State
	for _, s := range c.sigFPState {
		sigfps = append(sigfps, s.Fork())
	}
	return sigfps
}

func (c *context64) FloatingPointData() *fpu.State {
	return &c.State.fpState
}

// Fork returns an exact copy of this context.
func (c *context64) Fork() Context {
	return &context64{
		State:      c.State.Fork(),
		sigFPState: c.copySigFPState(),
	}
}

// Return returns the current syscall return value.
func (c *context64) Return() uintptr {
	return uintptr(c.Regs.Rax)
}

// SetReturn sets the syscall return value.
func (c *context64) SetReturn(value uintptr) {
	c.Regs.Rax = uint64(value)
}

// IP returns the current instruction pointer.
func (c *context64) IP() uintptr {
	return uintptr(c.Regs.Rip)
}

// SetIP sets the current instruction pointer.
func (c *context64) SetIP(value uintptr) {
	c.Regs.Rip = uint64(value)
}

// Stack returns the current stack pointer.
func (c *context64) Stack() uintptr {
	return uintptr(c.Regs.Rsp)
}

// SetStack sets the current stack pointer.
func (c *context64) SetStack(value uintptr) {
	c.Regs.Rsp = uint64(value)
}

// TLS returns the current TLS pointer.
func (c *context64) TLS() uintptr {
	return uintptr(c.Regs.Fs_base)
}

// SetTLS sets the current TLS pointer. Returns false if value is invalid.
func (c *context64) SetTLS(value uintptr) bool {
	if !isValidSegmentBase(uint64(value)) {
		return false
	}

	c.Regs.Fs = 0
	c.Regs.Fs_base = uint64(value)
	return true
}

// SetOldRSeqInterruptedIP implements Context.SetOldRSeqInterruptedIP.
func (c *context64) SetOldRSeqInterruptedIP(value uintptr) {
	c.Regs.R10 = uint64(value)
}

// Native returns the native type for the given val.
func (c *context64) Native(val uintptr) marshal.Marshallable {
	v := primitive.Uint64(val)
	return &v
}

// Value returns the generic val for the given native type.
func (c *context64) Value(val marshal.Marshallable) uintptr {
	return uintptr(*val.(*primitive.Uint64))
}

// Width returns the byte width of this architecture.
func (c *context64) Width() uint {
	return 8
}

// FeatureSet returns the FeatureSet in use.
func (c *context64) FeatureSet() *cpuid.FeatureSet {
	return c.State.FeatureSet
}

// mmapRand returns a random adjustment for randomizing an mmap layout.
func mmapRand(max uint64) hostarch.Addr {
	return hostarch.Addr(rand.Int63n(int64(max))).RoundDown()
}

// NewMmapLayout implements Context.NewMmapLayout consistently with Linux.
func (c *context64) NewMmapLayout(min, max hostarch.Addr, r *limits.LimitSet) (MmapLayout, error) {
	min, ok := min.RoundUp()
	if !ok {
		return MmapLayout{}, unix.EINVAL
	}
	if max > maxAddr64 {
		max = maxAddr64
	}
	max = max.RoundDown()

	if min > max {
		return MmapLayout{}, unix.EINVAL
	}

	stackSize := r.Get(limits.Stack)

	// MAX_GAP in Linux.
	maxGap := (max / 6) * 5
	gap := hostarch.Addr(stackSize.Cur)
	if gap < minGap64 {
		gap = minGap64
	}
	if gap > maxGap {
		gap = maxGap
	}
	defaultDir := MmapTopDown
	if stackSize.Cur == limits.Infinity {
		defaultDir = MmapBottomUp
	}

	topDownMin := max - gap - maxMmapRand64
	maxRand := hostarch.Addr(maxMmapRand64)
	if topDownMin < preferredTopDownBaseMin {
		// Try to keep TopDownBase above preferredTopDownBaseMin by
		// shrinking maxRand.
		maxAdjust := maxRand - minMmapRand64
		needAdjust := preferredTopDownBaseMin - topDownMin
		if needAdjust <= maxAdjust {
			maxRand -= needAdjust
		}
	}

	rnd := mmapRand(uint64(maxRand))
	l := MmapLayout{
		MinAddr: min,
		MaxAddr: max,
		// TASK_UNMAPPED_BASE in Linux.
		BottomUpBase:     (max/3 + rnd).RoundDown(),
		TopDownBase:      (max - gap - rnd).RoundDown(),
		DefaultDirection: defaultDir,
		// We may have reduced the maximum randomization to keep
		// TopDownBase above preferredTopDownBaseMin while maintaining
		// our stack gap. Stack allocations must use that max
		// randomization to avoid eating into the gap.
		MaxStackRand: uint64(maxRand),
	}

	// Final sanity check on the layout.
	if !l.Valid() {
		panic(fmt.Sprintf("Invalid MmapLayout: %+v", l))
	}

	return l, nil
}

// PIELoadAddress implements Context.PIELoadAddress.
func (c *context64) PIELoadAddress(l MmapLayout) hostarch.Addr {
	base := preferredPIELoadAddr
	max, ok := base.AddLength(maxMmapRand64)
	if !ok {
		panic(fmt.Sprintf("preferredPIELoadAddr %#x too large", base))
	}

	if max > l.MaxAddr {
		// preferredPIELoadAddr won't fit; fall back to the standard
		// Linux behavior of 2/3 of TopDownBase. TSAN won't like this.
		//
		// Don't bother trying to shrink the randomization for now.
		base = l.TopDownBase / 3 * 2
	}

	return base + mmapRand(maxMmapRand64)
}

// userStructSize is the size in bytes of Linux's struct user on amd64.
const userStructSize = 928

// PtracePeekUser implements Context.PtracePeekUser.
func (c *context64) PtracePeekUser(addr uintptr) (marshal.Marshallable, error) {
	if addr&7 != 0 || addr >= userStructSize {
		return nil, unix.EIO
	}
	// PTRACE_PEEKUSER and PTRACE_POKEUSER are only effective on regs and
	// u_debugreg, returning 0 or silently no-oping for other fields
	// respectively.
	if addr < uintptr(ptraceRegistersSize) {
		regs := c.ptraceGetRegs()
		buf := make([]byte, regs.SizeBytes())
		regs.MarshalUnsafe(buf)
		return c.Native(uintptr(hostarch.ByteOrder.Uint64(buf[addr:]))), nil
	}
	// Note: x86 debug registers are missing.
return c.Native(0), nil } // PtracePokeUser implements Context.PtracePokeUser. func (c *context64) PtracePokeUser(addr, data uintptr) error { if addr&7 != 0 || addr >= userStructSize { return unix.EIO } if addr < uintptr(ptraceRegistersSize) { regs := c.ptraceGetRegs() buf := make([]byte, regs.SizeBytes()) regs.MarshalUnsafe(buf) hostarch.ByteOrder.PutUint64(buf[addr:], uint64(data)) _, err := c.PtraceSetRegs(bytes.NewBuffer(buf)) return err } // Note: x86 debug registers are missing. return nil }
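The randomization-shrinking step in NewMmapLayout above is easier to see with concrete numbers. The following standalone sketch uses toy constants (not the real maxMmapRand64/minGap64 values) to show how the budget is reduced just enough to keep TopDownBase above the preferred minimum, but never below the minMmapRand64-style floor:

package main

import "fmt"

const (
	pageSize             = 4096
	maxMmapRand          = (1 << 28) * pageSize // toy stand-in for maxMmapRand64
	minMmapRand          = (1 << 26) * pageSize // floor, mirroring minMmapRand64
	preferredTopDownBase = 0x7e8000000000 + (128 << 30) // preferredTopDownBaseMin
)

// shrinkRand mirrors the maxRand adjustment in NewMmapLayout: shrink the
// randomization budget when the worst-case TopDownBase would fall below the
// preferred minimum, but only if the floor allows the full adjustment.
func shrinkRand(max, gap uint64) uint64 {
	rnd := uint64(maxMmapRand)
	topDownMin := max - gap - rnd
	if topDownMin < preferredTopDownBase {
		maxAdjust := rnd - minMmapRand
		needAdjust := uint64(preferredTopDownBase) - topDownMin
		if needAdjust <= maxAdjust {
			rnd -= needAdjust // keep TopDownBase above the preferred minimum
		}
	}
	return rnd
}

func main() {
	const max = uint64(0x800000000000) // toy 47-bit user address space limit
	fmt.Printf("gap  64GiB -> rand budget %#x\n", shrinkRand(max, 64<<30))  // no shrink needed
	fmt.Printf("gap 512GiB -> rand budget %#x\n", shrinkRand(max, 512<<30)) // budget shrunk
}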
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/hostfd" ) // handle represents a remote "open file descriptor", consisting of an opened // fid (p9.File) and optionally a host file descriptor. // // These are explicitly not savable. type handle struct { file p9file fd int32 // -1 if unavailable } // Preconditions: read || write. func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (handle, error) { _, newfile, err := file.walk(ctx, nil) if err != nil { return handle{fd: -1}, err } var flags p9.OpenFlags switch { case read && !write: flags = p9.ReadOnly case !read && write: flags = p9.WriteOnly case read && write: flags = p9.ReadWrite } if trunc { flags |= p9.OpenTruncate } fdobj, _, _, err := newfile.open(ctx, flags) if err != nil { newfile.close(ctx) return handle{fd: -1}, err } fd := int32(-1) if fdobj != nil { fd = int32(fdobj.Release()) } return handle{ file: newfile, fd: fd, }, nil } func (h *handle) isOpen() bool { return !h.file.isNil() } func (h *handle) close(ctx context.Context) { h.file.close(ctx) h.file = p9file{} if h.fd >= 0 { unix.Close(int(h.fd)) h.fd = -1 } } func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) { if dsts.IsEmpty() { return 0, nil } if h.fd >= 0 { ctx.UninterruptibleSleepStart(false) n, err := hostfd.Preadv2(h.fd, dsts, int64(offset), 0 /* flags */) ctx.UninterruptibleSleepFinish(false) return n, err } if dsts.NumBlocks() == 1 && !dsts.Head().NeedSafecopy() { n, err := h.file.readAt(ctx, dsts.Head().ToSlice(), offset) return uint64(n), err } // Buffer the read since p9.File.ReadAt() takes []byte.
buf := make([]byte, dsts.NumBytes()) n, err := h.file.readAt(ctx, buf, offset) if n == 0 { return 0, err } if cp, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:n]))); cperr != nil { return cp, cperr } return uint64(n), err } func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) { if srcs.IsEmpty() { return 0, nil } if h.fd >= 0 { ctx.UninterruptibleSleepStart(false) n, err := hostfd.Pwritev2(h.fd, srcs, int64(offset), 0 /* flags */) ctx.UninterruptibleSleepFinish(false) return n, err } if srcs.NumBlocks() == 1 && !srcs.Head().NeedSafecopy() { n, err := h.file.writeAt(ctx, srcs.Head().ToSlice(), offset) return uint64(n), err } // Buffer the write since p9.File.WriteAt() takes []byte. buf := make([]byte, srcs.NumBytes()) cp, cperr := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), srcs) if cp == 0 { return 0, cperr } n, err := h.file.writeAt(ctx, buf[:cp], offset) // err takes precedence over cperr. if err != nil { return uint64(n), err } return uint64(n), cperr }
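When no host FD is available and the destination spans multiple blocks (or needs safecopy), readToBlocksAt above stages the read through an ordinary []byte. Below is a hypothetical restatement of that pattern against an in-memory backing store; bufferedRead is not part of the real handle API, and only safemem calls that already appear above are used:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/safemem"
)

// bufferedRead reads into an intermediate []byte and then copies into the
// destination BlockSeq, mirroring the non-host-FD path of readToBlocksAt.
func bufferedRead(readAt func(buf []byte, off uint64) (int, error), dsts safemem.BlockSeq, off uint64) (uint64, error) {
	buf := make([]byte, dsts.NumBytes())
	n, err := readAt(buf, off)
	if n == 0 {
		return 0, err
	}
	// CopySeq stops at the end of the shorter sequence; buf[:n] is what the
	// backing store actually produced.
	if cp, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:n]))); cperr != nil {
		return cp, cperr
	}
	return uint64(n), err
}

func main() {
	backing := []byte("remote file contents")
	readAt := func(buf []byte, off uint64) (int, error) {
		return copy(buf, backing[off:]), nil
	}
	dst := make([]byte, 6)
	n, err := bufferedRead(readAt, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(dst)), 7)
	fmt.Println(n, err, string(dst)) // 6 <nil> "file c"
}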
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package mm import ( "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/kernel/shm" ) // DetachShm unmaps a sysv shared memory segment. func (mm *MemoryManager) DetachShm(ctx context.Context, addr hostarch.Addr) error { if addr != addr.RoundDown() { // "... shmaddr is not aligned on a page boundary." - man shmdt(2) return linuxerr.EINVAL } var detached *shm.Shm mm.mappingMu.Lock() defer mm.mappingMu.Unlock() // Find and remove the first vma containing an address >= addr that maps a // segment originally attached at addr. vseg := mm.vmas.LowerBoundSegment(addr) for vseg.Ok() { vma := vseg.ValuePtr() if shm, ok := vma.mappable.(*shm.Shm); ok && vseg.Start() >= addr && uint64(vseg.Start()-addr) == vma.off { detached = shm vseg = mm.unmapLocked(ctx, vseg.Range()).NextSegment() break } else { vseg = vseg.NextSegment() } } if detached == nil { // There is no shared memory segment attached at addr. return linuxerr.EINVAL } // Remove all vmas that could have been created by the same attach. end := addr + hostarch.Addr(detached.EffectiveSize()) for vseg.Ok() && vseg.End() <= end { vma := vseg.ValuePtr() if vma.mappable == detached && uint64(vseg.Start()-addr) == vma.off { vseg = mm.unmapLocked(ctx, vseg.Range()).NextSegment() } else { vseg = vseg.NextSegment() } } return nil }
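The two vma checks in DetachShm above (same mappable, and file offset equal to the distance from the attach address) are what distinguish pages of the original shmat() mapping from the same segment re-mapped elsewhere. A toy standalone model of that filter, using made-up types rather than the real mm structures:

package main

import "fmt"

type vma struct {
	start, end uint64
	segment    string // stand-in for the *shm.Shm mappable
	off        uint64 // segment offset at start
}

// detach drops every vma that could have come from the attach at addr: same
// segment, inside [addr, addr+size), and offset congruent with its position.
func detach(vmas []vma, addr uint64, segment string, size uint64) []vma {
	var kept []vma
	end := addr + size
	for _, v := range vmas {
		sameAttach := v.segment == segment &&
			v.start >= addr && v.end <= end &&
			v.start-addr == v.off
		if !sameAttach {
			kept = append(kept, v)
		}
	}
	return kept
}

func main() {
	vmas := []vma{
		{0x1000, 0x2000, "seg", 0},      // first page of the attach: removed
		{0x2000, 0x3000, "seg", 0x1000}, // second page, offsets line up: removed
		{0x3000, 0x4000, "seg", 0},      // same segment re-mapped: offset mismatch, kept
	}
	fmt.Println(detach(vmas, 0x1000, "seg", 0x3000))
}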
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package tcp contains the implementation of the TCP transport protocol. package tcp import ( "runtime" "strings" "time" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/header/parse" "gvisor.dev/gvisor/pkg/tcpip/seqnum" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" "gvisor.dev/gvisor/pkg/waiter" ) const ( // ProtocolNumber is the tcp protocol number. ProtocolNumber = header.TCPProtocolNumber // MinBufferSize is the smallest size of a receive or send buffer. MinBufferSize = 4 << 10 // 4096 bytes. // DefaultSendBufferSize is the default size of the send buffer for // an endpoint. DefaultSendBufferSize = 1 << 20 // 1MB // DefaultReceiveBufferSize is the default size of the receive buffer // for an endpoint. DefaultReceiveBufferSize = 1 << 20 // 1MB // MaxBufferSize is the largest size a receive/send buffer can grow to.
MaxBufferSize = 4 << 20 // 4MB // MaxUnprocessedSegments is the maximum number of unprocessed segments // that can be queued for a given endpoint. MaxUnprocessedSegments = 300 // DefaultTCPLingerTimeout is the amount of time that sockets linger in // FIN_WAIT_2 state before being marked closed. DefaultTCPLingerTimeout = 60 * time.Second // MaxTCPLingerTimeout is the maximum amount of time that sockets // linger in FIN_WAIT_2 state before being marked closed. MaxTCPLingerTimeout = 120 * time.Second // DefaultTCPTimeWaitTimeout is the amount of time that sockets linger // in TIME_WAIT state before being marked closed. DefaultTCPTimeWaitTimeout = 60 * time.Second // DefaultSynRetries is the default value for the number of SYN retransmits // before a connect is aborted. DefaultSynRetries = 6 ) const ( ccReno = "reno" ccCubic = "cubic" ) type protocol struct { stack *stack.Stack mu sync.RWMutex sackEnabled bool recovery tcpip.TCPRecovery delayEnabled bool alwaysUseSynCookies bool sendBufferSize tcpip.TCPSendBufferSizeRangeOption recvBufferSize tcpip.TCPReceiveBufferSizeRangeOption congestionControl string availableCongestionControl []string moderateReceiveBuffer bool lingerTimeout time.Duration timeWaitTimeout time.Duration timeWaitReuse tcpip.TCPTimeWaitReuseOption minRTO time.Duration maxRTO time.Duration maxRetries uint32 synRetries uint8 dispatcher dispatcher } // Number returns the tcp protocol number. func (*protocol) Number() tcpip.TransportProtocolNumber { return ProtocolNumber } // NewEndpoint creates a new tcp endpoint. func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { return newEndpoint(p.stack, netProto, waiterQueue), nil } // NewRawEndpoint creates a new raw TCP endpoint. Raw TCP sockets are currently // unsupported. It implements stack.TransportProtocol.NewRawEndpoint. func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { return raw.NewEndpoint(p.stack, netProto, header.TCPProtocolNumber, waiterQueue) } // MinimumPacketSize returns the minimum valid tcp packet size. func (*protocol) MinimumPacketSize() int { return header.TCPMinimumSize } // ParsePorts returns the source and destination ports stored in the given tcp // packet. func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err tcpip.Error) { h := header.TCP(v) return h.SourcePort(), h.DestinationPort(), nil } // QueuePacket queues packets targeted at an endpoint after hashing the packet // to a specific processing queue. Each queue is serviced by its own processor // goroutine which is responsible for dequeuing and doing full TCP dispatch of // the packet. func (p *protocol) QueuePacket(ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) { p.dispatcher.queuePacket(ep, id, p.stack.Clock(), pkt) } // HandleUnknownDestinationPacket handles packets targeted at this protocol but // that don't match any existing endpoint. // // RFC 793, page 36, states that "If the connection does not exist (CLOSED) then // a reset is sent in response to any incoming segment except another reset. In // particular, SYNs addressed to a non-existent connection are rejected by this // means." 
func (p *protocol) HandleUnknownDestinationPacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition { s := newIncomingSegment(id, p.stack.Clock(), pkt) defer s.decRef() if !s.parse(pkt.RXTransportChecksumValidated) || !s.csumValid { return stack.UnknownDestinationPacketMalformed } if !s.flags.Contains(header.TCPFlagRst) { replyWithReset(p.stack, s, stack.DefaultTOS, 0) } return stack.UnknownDestinationPacketHandled } // replyWithReset replies to the given segment with a reset segment. // // If the passed TTL is 0, then the route's default TTL will be used. func replyWithReset(st *stack.Stack, s *segment, tos, ttl uint8) tcpip.Error { route, err := st.FindRoute(s.nicID, s.dstAddr, s.srcAddr, s.netProto, false /* multicastLoop */) if err != nil { return err } defer route.Release() // Get the seqnum from the packet if the ack flag is set. seq := seqnum.Value(0) ack := seqnum.Value(0) flags := header.TCPFlagRst // As per RFC 793 page 35 (Reset Generation) // 1. If the connection does not exist (CLOSED) then a reset is sent // in response to any incoming segment except another reset. In // particular, SYNs addressed to a non-existent connection are rejected // by this means. // If the incoming segment has an ACK field, the reset takes its // sequence number from the ACK field of the segment, otherwise the // reset has sequence number zero and the ACK field is set to the sum // of the sequence number and segment length of the incoming segment. // The connection remains in the CLOSED state. if s.flags.Contains(header.TCPFlagAck) { seq = s.ackNumber } else { flags |= header.TCPFlagAck ack = s.sequenceNumber.Add(s.logicalLen()) } if ttl == 0 { ttl = route.DefaultTTL() } return sendTCP(route, tcpFields{ id: s.id, ttl: ttl, tos: tos, flags: flags, seq: seq, ack: ack, rcvWnd: 0, }, buffer.VectorisedView{}, stack.GSO{}, nil /* PacketOwner */) } // SetOption implements stack.TransportProtocol.SetOption. func (p *protocol) SetOption(option tcpip.SettableTransportProtocolOption) tcpip.Error { switch v := option.(type) { case *tcpip.TCPSACKEnabled: p.mu.Lock() p.sackEnabled = bool(*v) p.mu.Unlock() return nil case *tcpip.TCPRecovery: p.mu.Lock() p.recovery = *v p.mu.Unlock() return nil case *tcpip.TCPDelayEnabled: p.mu.Lock() p.delayEnabled = bool(*v) p.mu.Unlock() return nil case *tcpip.TCPSendBufferSizeRangeOption: if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max { return &tcpip.ErrInvalidOptionValue{} } p.mu.Lock() p.sendBufferSize = *v p.mu.Unlock() return nil case *tcpip.TCPReceiveBufferSizeRangeOption: if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max { return &tcpip.ErrInvalidOptionValue{} } p.mu.Lock() p.recvBufferSize = *v p.mu.Unlock() return nil case *tcpip.CongestionControlOption: for _, c := range p.availableCongestionControl { if string(*v) == c { p.mu.Lock() p.congestionControl = string(*v) p.mu.Unlock() return nil } } // linux returns ENOENT when an invalid congestion control // is specified. 
return &tcpip.ErrNoSuchFile{} case *tcpip.TCPModerateReceiveBufferOption: p.mu.Lock() p.moderateReceiveBuffer = bool(*v) p.mu.Unlock() return nil case *tcpip.TCPLingerTimeoutOption: p.mu.Lock() if *v < 0 { p.lingerTimeout = 0 } else { p.lingerTimeout = time.Duration(*v) } p.mu.Unlock() return nil case *tcpip.TCPTimeWaitTimeoutOption: p.mu.Lock() if *v < 0 { p.timeWaitTimeout = 0 } else { p.timeWaitTimeout = time.Duration(*v) } p.mu.Unlock() return nil case *tcpip.TCPTimeWaitReuseOption: if *v < tcpip.TCPTimeWaitReuseDisabled || *v > tcpip.TCPTimeWaitReuseLoopbackOnly { return &tcpip.ErrInvalidOptionValue{} } p.mu.Lock() p.timeWaitReuse = *v p.mu.Unlock() return nil case *tcpip.TCPMinRTOOption: p.mu.Lock() if *v < 0 { p.minRTO = MinRTO } else { p.minRTO = time.Duration(*v) } p.mu.Unlock() return nil case *tcpip.TCPMaxRTOOption: p.mu.Lock() if *v < 0 { p.maxRTO = MaxRTO } else { p.maxRTO = time.Duration(*v) } p.mu.Unlock() return nil case *tcpip.TCPMaxRetriesOption: p.mu.Lock() p.maxRetries = uint32(*v) p.mu.Unlock() return nil case *tcpip.TCPAlwaysUseSynCookies: p.mu.Lock() p.alwaysUseSynCookies = bool(*v) p.mu.Unlock() return nil case *tcpip.TCPSynRetriesOption: if *v < 1 || *v > 255 { return &tcpip.ErrInvalidOptionValue{} } p.mu.Lock() p.synRetries = uint8(*v) p.mu.Unlock() return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // Option implements stack.TransportProtocol.Option. func (p *protocol) Option(option tcpip.GettableTransportProtocolOption) tcpip.Error { switch v := option.(type) { case *tcpip.TCPSACKEnabled: p.mu.RLock() *v = tcpip.TCPSACKEnabled(p.sackEnabled) p.mu.RUnlock() return nil case *tcpip.TCPRecovery: p.mu.RLock() *v = p.recovery p.mu.RUnlock() return nil case *tcpip.TCPDelayEnabled: p.mu.RLock() *v = tcpip.TCPDelayEnabled(p.delayEnabled) p.mu.RUnlock() return nil case *tcpip.TCPSendBufferSizeRangeOption: p.mu.RLock() *v = p.sendBufferSize p.mu.RUnlock() return nil case *tcpip.TCPReceiveBufferSizeRangeOption: p.mu.RLock() *v = p.recvBufferSize p.mu.RUnlock() return nil case *tcpip.CongestionControlOption: p.mu.RLock() *v = tcpip.CongestionControlOption(p.congestionControl) p.mu.RUnlock() return nil case *tcpip.TCPAvailableCongestionControlOption: p.mu.RLock() *v = tcpip.TCPAvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " ")) p.mu.RUnlock() return nil case *tcpip.TCPModerateReceiveBufferOption: p.mu.RLock() *v = tcpip.TCPModerateReceiveBufferOption(p.moderateReceiveBuffer) p.mu.RUnlock() return nil case *tcpip.TCPLingerTimeoutOption: p.mu.RLock() *v = tcpip.TCPLingerTimeoutOption(p.lingerTimeout) p.mu.RUnlock() return nil case *tcpip.TCPTimeWaitTimeoutOption: p.mu.RLock() *v = tcpip.TCPTimeWaitTimeoutOption(p.timeWaitTimeout) p.mu.RUnlock() return nil case *tcpip.TCPTimeWaitReuseOption: p.mu.RLock() *v = p.timeWaitReuse p.mu.RUnlock() return nil case *tcpip.TCPMinRTOOption: p.mu.RLock() *v = tcpip.TCPMinRTOOption(p.minRTO) p.mu.RUnlock() return nil case *tcpip.TCPMaxRTOOption: p.mu.RLock() *v = tcpip.TCPMaxRTOOption(p.maxRTO) p.mu.RUnlock() return nil case *tcpip.TCPMaxRetriesOption: p.mu.RLock() *v = tcpip.TCPMaxRetriesOption(p.maxRetries) p.mu.RUnlock() return nil case *tcpip.TCPAlwaysUseSynCookies: p.mu.RLock() *v = tcpip.TCPAlwaysUseSynCookies(p.alwaysUseSynCookies) p.mu.RUnlock() return nil case *tcpip.TCPSynRetriesOption: p.mu.RLock() *v = tcpip.TCPSynRetriesOption(p.synRetries) p.mu.RUnlock() return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // Close implements stack.TransportProtocol.Close. 
func (p *protocol) Close() { p.dispatcher.close() } // Wait implements stack.TransportProtocol.Wait. func (p *protocol) Wait() { p.dispatcher.wait() } // Parse implements stack.TransportProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) bool { return parse.TCP(pkt) } // NewProtocol returns a TCP transport protocol. func NewProtocol(s *stack.Stack) stack.TransportProtocol { p := protocol{ stack: s, sendBufferSize: tcpip.TCPSendBufferSizeRangeOption{ Min: MinBufferSize, Default: DefaultSendBufferSize, Max: MaxBufferSize, }, recvBufferSize: tcpip.TCPReceiveBufferSizeRangeOption{ Min: MinBufferSize, Default: DefaultReceiveBufferSize, Max: MaxBufferSize, }, congestionControl: ccReno, availableCongestionControl: []string{ccReno, ccCubic}, lingerTimeout: DefaultTCPLingerTimeout, timeWaitTimeout: DefaultTCPTimeWaitTimeout, timeWaitReuse: tcpip.TCPTimeWaitReuseLoopbackOnly, synRetries: DefaultSynRetries, minRTO: MinRTO, maxRTO: MaxRTO, maxRetries: MaxRetries, // TODO(gvisor.dev/issue/5243): Set recovery to tcpip.TCPRACKLossDetection. recovery: 0, } p.dispatcher.init(s.Rand(), runtime.GOMAXPROCS(0)) return &p }
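protocol.SetOption and protocol.Option above are normally reached through the stack's option plumbing rather than called directly. A minimal sketch of that round trip, assuming the stack.New factory API and the SetTransportProtocolOption/TransportProtocolOption methods from the same tree:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
)

func main() {
	s := stack.New(stack.Options{
		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol},
		TransportProtocols: []stack.TransportProtocolFactory{tcp.NewProtocol},
	})

	// Enable SACK; this is routed to protocol.SetOption above.
	opt := tcpip.TCPSACKEnabled(true)
	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
		fmt.Println("SetTransportProtocolOption:", err)
	}

	// Read it back through protocol.Option.
	var got tcpip.TCPSACKEnabled
	if err := s.TransportProtocolOption(tcp.ProtocolNumber, &got); err == nil {
		fmt.Println("SACK enabled:", bool(got))
	}
}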
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs2 import ( "fmt" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // Getdents implements Linux syscall getdents(2). func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return getdents(t, args, false /* isGetdents64 */) } // Getdents64 implements Linux syscall getdents64(2). func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { return getdents(t, args, true /* isGetdents64 */) } func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (uintptr, *kernel.SyscallControl, error) { fd := args[0].Int() addr := args[1].Pointer() size := int(args[2].Uint()) file := t.GetFileVFS2(fd) if file == nil { return 0, nil, linuxerr.EBADF } defer file.DecRef(t) cb := getGetdentsCallback(t, addr, size, isGetdents64) err := file.IterDirents(t, cb) n := size - cb.remaining putGetdentsCallback(cb) if n == 0 { return 0, nil, err } return uintptr(n), nil, nil } type getdentsCallback struct { t *kernel.Task addr hostarch.Addr remaining int isGetdents64 bool } var getdentsCallbackPool = sync.Pool{ New: func() interface{} { return &getdentsCallback{} }, } func getGetdentsCallback(t *kernel.Task, addr hostarch.Addr, size int, isGetdents64 bool) *getdentsCallback { cb := getdentsCallbackPool.Get().(*getdentsCallback) *cb = getdentsCallback{ t: t, addr: addr, remaining: size, isGetdents64: isGetdents64, } return cb } func putGetdentsCallback(cb *getdentsCallback) { cb.t = nil getdentsCallbackPool.Put(cb) } // Handle implements vfs.IterDirentsCallback.Handle.
func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error { var buf []byte if cb.isGetdents64 { // struct linux_dirent64 { // ino64_t d_ino; /* 64-bit inode number */ // off64_t d_off; /* 64-bit offset to next structure */ // unsigned short d_reclen; /* Size of this dirent */ // unsigned char d_type; /* File type */ // char d_name[]; /* Filename (null-terminated) */ // }; size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name) size = (size + 7) &^ 7 // round up to multiple of 8 if size > cb.remaining { return linuxerr.EINVAL } buf = cb.t.CopyScratchBuffer(size) hostarch.ByteOrder.PutUint64(buf[0:8], dirent.Ino) hostarch.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff)) hostarch.ByteOrder.PutUint16(buf[16:18], uint16(size)) buf[18] = dirent.Type copy(buf[19:], dirent.Name) // Zero out all remaining bytes in buf, including the NUL terminator // after dirent.Name. bufTail := buf[19+len(dirent.Name):] for i := range bufTail { bufTail[i] = 0 } } else { // struct linux_dirent { // unsigned long d_ino; /* Inode number */ // unsigned long d_off; /* Offset to next linux_dirent */ // unsigned short d_reclen; /* Length of this linux_dirent */ // char d_name[]; /* Filename (null-terminated) */ // /* length is actually (d_reclen - 2 - // offsetof(struct linux_dirent, d_name)) */ // /* // char pad; // Zero padding byte // char d_type; // File type (only since Linux // // 2.6.4); offset is (d_reclen - 1) // */ // }; if cb.t.Arch().Width() != 8 { panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width())) } size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name) size = (size + 7) &^ 7 // round up to multiple of sizeof(long) if size > cb.remaining { return linuxerr.EINVAL } buf = cb.t.CopyScratchBuffer(size) hostarch.ByteOrder.PutUint64(buf[0:8], dirent.Ino) hostarch.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff)) hostarch.ByteOrder.PutUint16(buf[16:18], uint16(size)) copy(buf[18:], dirent.Name) // Zero out all remaining bytes in buf, including the NUL terminator // after dirent.Name and the zero padding byte between the name and // dirent type. bufTail := buf[18+len(dirent.Name) : size-1] for i := range bufTail { bufTail[i] = 0 } buf[size-1] = dirent.Type } n, err := cb.t.CopyOutBytes(cb.addr, buf) if err != nil { // Don't report partially-written dirents by advancing cb.addr or // cb.remaining. return err } cb.addr += hostarch.Addr(n) cb.remaining -= n return nil }
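The record sizing in Handle above is the same for both branches: a fixed header (ino 8 + off 8 + reclen 2 + type 1), the name, a NUL terminator, all rounded up to a multiple of 8 so records stay aligned. A standalone check of that arithmetic:

package main

import "fmt"

// direntSize mirrors the size computation in Handle: fixed fields plus the
// name plus a NUL, rounded up to a multiple of 8.
func direntSize(name string) int {
	size := 8 + 8 + 2 + 1 + 1 + len(name) // last +1 is the NUL terminator
	return (size + 7) &^ 7
}

func main() {
	for _, n := range []string{".", "..", "README.md"} {
		fmt.Printf("%-12q -> reclen %d\n", n, direntSize(n))
	}
}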
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package gofer import ( "fmt" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/p9" "gvisor.dev/gvisor/pkg/refsvfs2" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/pipe" "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) func (d *dentry) isDir() bool { return d.fileType() == linux.S_IFDIR } // Preconditions: // * filesystem.renameMu must be locked. // * d.dirMu must be locked. // * d.isDir(). // * child must be a newly-created dentry that has never had a parent. func (d *dentry) cacheNewChildLocked(child *dentry, name string) { d.IncRef() // reference held by child on its parent child.parent = d child.name = name if d.children == nil { d.children = make(map[string]*dentry) } d.children[name] = child } // Preconditions: // * d.dirMu must be locked. // * d.isDir(). func (d *dentry) cacheNegativeLookupLocked(name string) { // Don't cache negative lookups if InteropModeShared is in effect (since // this makes remote lookup unavoidable), or if d.isSynthetic() (in which // case the only files in the directory are those for which a dentry exists // in d.children). Instead, just delete any previously-cached dentry. if d.fs.opts.interop == InteropModeShared || d.isSynthetic() { delete(d.children, name) return } if d.children == nil { d.children = make(map[string]*dentry) } d.children[name] = nil } type createSyntheticOpts struct { name string mode linux.FileMode kuid auth.KUID kgid auth.KGID // The endpoint for a synthetic socket.
endpoint should be nil if the file // being created is not a socket. endpoint transport.BoundEndpoint // pipe should be nil if the file being created is not a pipe. pipe *pipe.VFSPipe } // createSyntheticChildLocked creates a synthetic file with the given name // in d. // // Preconditions: // * d.dirMu must be locked. // * d.isDir(). // * d does not already contain a child with the given name. func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) { now := d.fs.clock.Now().Nanoseconds() child := &dentry{ refs: 1, // held by d fs: d.fs, ino: d.fs.nextIno(), mode: uint32(opts.mode), uid: uint32(opts.kuid), gid: uint32(opts.kgid), blockSize: hostarch.PageSize, // arbitrary atime: now, mtime: now, ctime: now, btime: now, readFD: -1, writeFD: -1, mmapFD: -1, nlink: uint32(2), } refsvfs2.Register(child) switch opts.mode.FileType() { case linux.S_IFDIR: // Nothing else needs to be done. case linux.S_IFSOCK: child.endpoint = opts.endpoint case linux.S_IFIFO: child.pipe = opts.pipe default: panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType())) } child.pf.dentry = child child.vfsd.Init(child) d.cacheNewChildLocked(child, opts.name) d.syntheticChildren++ } // +stateify savable type directoryFD struct { fileDescription vfs.DirectoryFileDescriptionDefaultImpl mu sync.Mutex `state:"nosave"` off int64 dirents []vfs.Dirent } // Release implements vfs.FileDescriptionImpl.Release. func (fd *directoryFD) Release(context.Context) { } // IterDirents implements vfs.FileDescriptionImpl.IterDirents. func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error { fd.mu.Lock() defer fd.mu.Unlock() d := fd.dentry() if fd.dirents == nil { ds, err := d.getDirents(ctx) if err != nil { return err } fd.dirents = ds } d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent) if d.cachedMetadataAuthoritative() { d.touchAtime(fd.vfsfd.Mount()) } for fd.off < int64(len(fd.dirents)) { if err := cb.Handle(fd.dirents[fd.off]); err != nil { return err } fd.off++ } return nil } // Preconditions: // * d.isDir(). // * There exists at least one directoryFD representing d. func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) { // NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the // presence of concurrent mutation of an iterated directory, so // implementations may duplicate or omit entries in this case, which // violates POSIX semantics. Thus we read all directory entries while // holding d.dirMu to exclude directory mutations. (Note that it is // impossible for the client to exclude concurrent mutation from other // remote filesystem users. Since there is no way to detect if the server // has incorrectly omitted directory entries, we simply assume that the // server is well-behaved under InteropModeShared.) This is inconsistent // with Linux (which appears to assume that directory fids have the correct // semantics, and translates struct file_operations::readdir calls directly // to readdir RPCs), but is consistent with VFS1. // filesystem.renameMu is needed for d.parent, and must be locked before // dentry.dirMu. d.fs.renameMu.RLock() defer d.fs.renameMu.RUnlock() d.dirMu.Lock() defer d.dirMu.Unlock() if d.dirents != nil { return d.dirents, nil } // It's not clear if 9P2000.L's readdir is expected to return "." and "..", // so we generate them here. 
parent := genericParentOrSelf(d) dirents := []vfs.Dirent{ { Name: ".", Type: linux.DT_DIR, Ino: uint64(d.ino), NextOff: 1, }, { Name: "..", Type: uint8(atomic.LoadUint32(&parent.mode) >> 12), Ino: uint64(parent.ino), NextOff: 2, }, } var realChildren map[string]struct{} if !d.isSynthetic() { if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared { // Record the set of children d actually has so that we don't emit // duplicate entries for synthetic children. realChildren = make(map[string]struct{}) } off := uint64(0) const count = 64 * 1024 // for consistency with the vfs1 client d.handleMu.RLock() if d.readFile.isNil() { // This should not be possible because a readable handle should // have been opened when the calling directoryFD was opened. d.handleMu.RUnlock() panic("gofer.dentry.getDirents called without a readable handle") } for { p9ds, err := d.readFile.readdir(ctx, off, count) if err != nil { d.handleMu.RUnlock() return nil, err } if len(p9ds) == 0 { d.handleMu.RUnlock() break } for _, p9d := range p9ds { if p9d.Name == "." || p9d.Name == ".." { continue } dirent := vfs.Dirent{ Name: p9d.Name, Ino: d.fs.inoFromQIDPath(p9d.QID.Path), NextOff: int64(len(dirents) + 1), } // p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or // DMSOCKET. switch p9d.Type { case p9.TypeSymlink: dirent.Type = linux.DT_LNK case p9.TypeDir: dirent.Type = linux.DT_DIR default: dirent.Type = linux.DT_REG } dirents = append(dirents, dirent) if realChildren != nil { realChildren[p9d.Name] = struct{}{} } } off = p9ds[len(p9ds)-1].Offset } } // Emit entries for synthetic children. if d.syntheticChildren != 0 { for _, child := range d.children { if child == nil || !child.isSynthetic() { continue } if _, ok := realChildren[child.name]; ok { continue } dirents = append(dirents, vfs.Dirent{ Name: child.name, Type: uint8(atomic.LoadUint32(&child.mode) >> 12), Ino: uint64(child.ino), NextOff: int64(len(dirents) + 1), }) } } // Cache dirents for future directoryFDs if permitted. if d.cachedMetadataAuthoritative() { d.dirents = dirents } return dirents, nil } // Seek implements vfs.FileDescriptionImpl.Seek. func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { fd.mu.Lock() defer fd.mu.Unlock() switch whence { case linux.SEEK_SET: if offset < 0 { return 0, linuxerr.EINVAL } if offset == 0 { // Ensure that the next call to fd.IterDirents() calls // fd.dentry().getDirents(). fd.dirents = nil } fd.off = offset return fd.off, nil case linux.SEEK_CUR: offset += fd.off if offset < 0 { return 0, linuxerr.EINVAL } // Don't clear fd.dirents in this case, even if offset == 0. fd.off = offset return fd.off, nil default: return 0, linuxerr.EINVAL } } // Sync implements vfs.FileDescriptionImpl.Sync. func (fd *directoryFD) Sync(ctx context.Context) error { return fd.dentry().syncRemoteFile(ctx) }
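The uint8(mode >> 12) conversions in getDirents above work because the file-type bits of st_mode, shifted down by 12, coincide with the DT_* dirent type constants. A standalone check with the usual octal mode values:

package main

import "fmt"

const (
	sIFDIR = 0o040000 // S_IFDIR
	sIFREG = 0o100000 // S_IFREG
	sIFLNK = 0o120000 // S_IFLNK
	dtDIR  = 4        // DT_DIR
	dtREG  = 8        // DT_REG
	dtLNK  = 10       // DT_LNK
)

func main() {
	// Each file-type nibble of st_mode maps directly onto its DT_* value.
	fmt.Println(sIFDIR>>12 == dtDIR, sIFREG>>12 == dtREG, sIFLNK>>12 == dtLNK)
}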
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package header import ( "encoding/binary" "math" "gvisor.dev/gvisor/pkg/tcpip" ) const ( udpSrcPort = 0 udpDstPort = 2 udpLength = 4 udpChecksum = 6 ) const ( // UDPMaximumPacketSize is the largest possible UDP packet. UDPMaximumPacketSize = 0xffff ) // UDPFields contains the fields of a UDP packet. It is used to describe the // fields of a packet that needs to be encoded. type UDPFields struct { // SrcPort is the "source port" field of a UDP packet. SrcPort uint16 // DstPort is the "destination port" field of a UDP packet. DstPort uint16 // Length is the "length" field of a UDP packet. Length uint16 // Checksum is the "checksum" field of a UDP packet. Checksum uint16 } // UDP represents a UDP header stored in a byte array. type UDP []byte const ( // UDPMinimumSize is the minimum size of a valid UDP packet. UDPMinimumSize = 8 // UDPMaximumSize is the maximum size of a valid UDP packet. The length field // in the UDP header is 16 bits as per RFC 768. UDPMaximumSize = math.MaxUint16 // UDPProtocolNumber is UDP's transport protocol number. UDPProtocolNumber tcpip.TransportProtocolNumber = 17 ) // SourcePort returns the "source port" field of the UDP header. func (b UDP) SourcePort() uint16 { return binary.BigEndian.Uint16(b[udpSrcPort:]) } // DestinationPort returns the "destination port" field of the UDP header. func (b UDP) DestinationPort() uint16 { return binary.BigEndian.Uint16(b[udpDstPort:]) } // Length returns the "length" field of the UDP header. func (b UDP) Length() uint16 { return binary.BigEndian.Uint16(b[udpLength:]) } // Payload returns the data contained in the UDP datagram. func (b UDP) Payload() []byte { return b[UDPMinimumSize:] } // Checksum returns the "checksum" field of the UDP header. func (b UDP) Checksum() uint16 { return binary.BigEndian.Uint16(b[udpChecksum:]) } // SetSourcePort sets the "source port" field of the UDP header. func (b UDP) SetSourcePort(port uint16) { binary.BigEndian.PutUint16(b[udpSrcPort:], port) } // SetDestinationPort sets the "destination port" field of the UDP header. func (b UDP) SetDestinationPort(port uint16) { binary.BigEndian.PutUint16(b[udpDstPort:], port) } // SetChecksum sets the "checksum" field of the UDP header. func (b UDP) SetChecksum(checksum uint16) { binary.BigEndian.PutUint16(b[udpChecksum:], checksum) } // SetLength sets the "length" field of the UDP header.
func (b UDP) SetLength(length uint16) { binary.BigEndian.PutUint16(b[udpLength:], length) } // CalculateChecksum calculates the checksum of the UDP packet, given the // checksum of the network-layer pseudo-header and the checksum of the payload. func (b UDP) CalculateChecksum(partialChecksum uint16) uint16 { // Calculate the rest of the checksum. return Checksum(b[:UDPMinimumSize], partialChecksum) } // IsChecksumValid returns true iff the UDP header's checksum is valid. func (b UDP) IsChecksumValid(src, dst tcpip.Address, payloadChecksum uint16) bool { xsum := PseudoHeaderChecksum(UDPProtocolNumber, dst, src, b.Length()) xsum = ChecksumCombine(xsum, payloadChecksum) return b.CalculateChecksum(xsum) == 0xffff } // Encode encodes all the fields of the UDP header. func (b UDP) Encode(u *UDPFields) { binary.BigEndian.PutUint16(b[udpSrcPort:], u.SrcPort) binary.BigEndian.PutUint16(b[udpDstPort:], u.DstPort) binary.BigEndian.PutUint16(b[udpLength:], u.Length) binary.BigEndian.PutUint16(b[udpChecksum:], u.Checksum) } // SetSourcePortWithChecksumUpdate implements ChecksummableTransport. func (b UDP) SetSourcePortWithChecksumUpdate(new uint16) { old := b.SourcePort() b.SetSourcePort(new) b.SetChecksum(^checksumUpdate2ByteAlignedUint16(^b.Checksum(), old, new)) } // SetDestinationPortWithChecksumUpdate implements ChecksummableTransport. func (b UDP) SetDestinationPortWithChecksumUpdate(new uint16) { old := b.DestinationPort() b.SetDestinationPort(new) b.SetChecksum(^checksumUpdate2ByteAlignedUint16(^b.Checksum(), old, new)) } // UpdateChecksumPseudoHeaderAddress implements ChecksummableTransport. func (b UDP) UpdateChecksumPseudoHeaderAddress(old, new tcpip.Address, fullChecksum bool) { xsum := b.Checksum() if fullChecksum { xsum = ^xsum } xsum = checksumUpdate2ByteAlignedAddress(xsum, old, new) if fullChecksum { xsum = ^xsum } b.SetChecksum(xsum) }
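Filling in and then verifying a UDP checksum with the helpers above takes three inputs: the pseudo-header sum, the header, and the payload. A minimal sketch, assuming the string-backed tcpip.Address and the header.Checksum/PseudoHeaderChecksum helpers from the same tree:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
)

func main() {
	src := tcpip.Address("\x0a\x00\x00\x01") // 10.0.0.1
	dst := tcpip.Address("\x0a\x00\x00\x02") // 10.0.0.2
	payload := []byte("hello")

	pkt := make(header.UDP, header.UDPMinimumSize+len(payload))
	copy(pkt.Payload(), payload)
	pkt.Encode(&header.UDPFields{
		SrcPort: 1234,
		DstPort: 5678,
		Length:  uint16(len(pkt)), // header + payload
	})

	// Checksum = one's complement sum over pseudo-header, header, payload.
	xsum := header.PseudoHeaderChecksum(header.UDPProtocolNumber, src, dst, uint16(len(pkt)))
	xsum = header.Checksum(payload, xsum)
	pkt.SetChecksum(^pkt.CalculateChecksum(xsum))

	// Receive side: IsChecksumValid combines the pseudo-header and payload
	// sums, then folds in the header (which now carries the checksum).
	payloadXsum := header.Checksum(payload, 0)
	fmt.Println("valid:", pkt.IsChecksumValid(src, dst, payloadXsum))
}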
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package jenkins implements Jenkins's one_at_a_time, non-cryptographic hash // functions created by Bob Jenkins. // // See https://en.wikipedia.org/wiki/Jenkins_hash_function#cite_note-dobbsx-1 // package jenkins import ( "hash" ) // Sum32 represents Jenkins's one_at_a_time hash. // // Use the Sum32 type directly (as opposed to New32 below) // to avoid allocations. type Sum32 uint32 // New32 returns a new 32-bit Jenkins's one_at_a_time hash.Hash. // // Its Sum method will lay the value out in big-endian byte order. func New32() hash.Hash32 { var s Sum32 return &s } // Reset resets the hash to its initial state. func (s *Sum32) Reset() { *s = 0 } // Sum32 returns the hash value. func (s *Sum32) Sum32() uint32 { sCopy := *s sCopy += sCopy << 3 sCopy ^= sCopy >> 11 sCopy += sCopy << 15 return uint32(sCopy) } // Write adds more data to the running hash. // // It never returns an error. func (s *Sum32) Write(data []byte) (int, error) { sCopy := *s for _, b := range data { sCopy += Sum32(b) sCopy += sCopy << 10 sCopy ^= sCopy >> 6 } *s = sCopy return len(data), nil } // Size returns the number of bytes Sum will return. func (s *Sum32) Size() int { return 4 } // BlockSize returns the hash's underlying block size. func (s *Sum32) BlockSize() int { return 1 } // Sum appends the current hash to in and returns the resulting slice. // // It does not change the underlying hash state. func (s *Sum32) Sum(in []byte) []byte { v := s.Sum32() return append(in, byte(v>>24), byte(v>>16), byte(v>>8), byte(v)) }
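A small usage sketch of the allocation-free pattern the Sum32 comment above recommends: declare a Sum32 value directly instead of calling New32, then Write and read the digest:

package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/hash/jenkins"
)

func main() {
	var h jenkins.Sum32              // no allocation, unlike New32()
	h.Write([]byte("10.0.0.1:80"))   // e.g. hashing a connection tuple
	fmt.Printf("%#08x\n", h.Sum32()) // one_at_a_time digest
}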
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "golang.org/x/time/rate" ) const ( // icmpLimit is the default maximum number of ICMP messages permitted by this // rate limiter. icmpLimit = 1000 // icmpBurst is the default number of ICMP messages that can be sent in a single // burst. icmpBurst = 50 ) // ICMPRateLimiter is a global rate limiter that controls the generation of // ICMP messages generated by the stack. type ICMPRateLimiter struct { *rate.Limiter } // NewICMPRateLimiter returns a global rate limiter for controlling the rate // at which ICMP messages are generated by the stack. func NewICMPRateLimiter() *ICMPRateLimiter { return &ICMPRateLimiter{Limiter: rate.NewLimiter(icmpLimit, icmpBurst)} }
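Since ICMPRateLimiter just embeds a *rate.Limiter, callers gate message generation with Allow(): tokens refill at icmpLimit per second and icmpBurst caps how many can be consumed at once. A standalone sketch with the same limit/burst shape:

package main

import (
	"fmt"

	"golang.org/x/time/rate"
)

func main() {
	l := rate.NewLimiter(1000, 50) // same shape as NewICMPRateLimiter
	sent, dropped := 0, 0
	for i := 0; i < 100; i++ {
		if l.Allow() { // consume one token, or report the message dropped
			sent++
		} else {
			dropped++
		}
	}
	// In a tight loop the burst caps the first wave at roughly 50 messages.
	fmt.Println("sent:", sent, "dropped:", dropped)
}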
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package p9 import ( "fmt" "strconv" "strings" ) const ( // highestSupportedVersion is the highest supported version X in a // version string of the format 9P2000.L.Google.X. // // Clients are expected to start requesting this version number and // to continuously decrement it until a Tversion request succeeds. highestSupportedVersion uint32 = 13 // lowestSupportedVersion is the lowest supported version X in a // version string of the format 9P2000.L.Google.X. // // Clients are free to send a Tversion request at a version below this // value but are expected to encounter an Rlerror in response. lowestSupportedVersion uint32 = 0 // baseVersion is the base version of 9P that this package must always // support. It is equivalent to 9P2000.L.Google.0. baseVersion = "9P2000.L" ) // HighestVersionString returns the highest possible version string that a client // may request or a server may support. func HighestVersionString() string { return versionString(highestSupportedVersion) } // parseVersion parses a Tversion version string into a numeric version number // if the version string is supported by p9. Otherwise returns (0, false). // // From Tversion(9P): "Version strings are defined such that, if the client string // contains one or more period characters, the initial substring up to but not // including any single period in the version string defines a version of the protocol." // // p9 intentionally diverges from this and always requires that the version string // start with 9P2000.L to express that it is always compatible with 9P2000.L. The // only supported version extensions are of the format 9P2000.L.Google.X where X // is an ever increasing version counter. // // Version 9P2000.L.Google.0 implies 9P2000.L. // // New versions must always be a strict superset of 9P2000.L. A version increase must // define a predicate representing the feature extension introduced by that version. The // predicate must be commented and should take the format: // // // VersionSupportsX returns true if version v supports X and must be checked when ... // func VersionSupportsX(v int32) bool { // ... // } func parseVersion(str string) (uint32, bool) { // Special case the base version which lacks the ".Google.X" suffix. This // version always means version 0.
if str == baseVersion { return 0, true } substr := strings.Split(str, ".") if len(substr) != 4 { return 0, false } if substr[0] != "9P2000" || substr[1] != "L" || substr[2] != "Google" || len(substr[3]) == 0 { return 0, false } version, err := strconv.ParseUint(substr[3], 10, 32) if err != nil { return 0, false } return uint32(version), true } // versionString formats a p9 version number into a Tversion version string. func versionString(version uint32) string { // Special case the base version so that clients expecting this string // instead of the 9P2000.L.Google.0 equivalent get it. This is important // for backwards compatibility with legacy servers that check for exactly // the baseVersion and allow nothing else. if version == 0 { return baseVersion } return fmt.Sprintf("9P2000.L.Google.%d", version) } // VersionSupportsTflushf returns true if version v supports the Tflushf message. // This predicate must be checked by clients before attempting to make a Tflushf // request. If this predicate returns false, then clients may safely no-op. func VersionSupportsTflushf(v uint32) bool { return v >= 1 } // versionSupportsTwalkgetattr returns true if version v supports the // Twalkgetattr message. This predicate must be checked by clients before // attempting to make a Twalkgetattr request. func versionSupportsTwalkgetattr(v uint32) bool { return v >= 2 } // versionSupportsTucreation returns true if version v supports the Tucreation // messages (Tucreate, Tusymlink, Tumkdir, Tumknod). This predicate must be // checked by clients before attempting to make a Tucreation request. // If Tucreation messages are not supported, their non-UID supporting // counterparts (Tlcreate, Tsymlink, Tmkdir, Tmknod) should be used. func versionSupportsTucreation(v uint32) bool { return v >= 3 } // VersionSupportsConnect returns true if version v supports the Tlconnect // message. This predicate must be checked by clients // before attempting to make a Tlconnect request. If Tlconnect messages are not // supported, Tlopen should be used. func VersionSupportsConnect(v uint32) bool { return v >= 4 } // VersionSupportsAnonymous returns true if version v supports Tlconnect // with the AnonymousSocket mode. This predicate must be checked by clients // before attempting to use the AnonymousSocket Tlconnect mode. func VersionSupportsAnonymous(v uint32) bool { return v >= 5 } // VersionSupportsMultiUser returns true if version v supports multi-user fake // directory permissions and ID values. func VersionSupportsMultiUser(v uint32) bool { return v >= 6 } // versionSupportsTallocate returns true if version v supports Allocate(). func versionSupportsTallocate(v uint32) bool { return v >= 7 } // versionSupportsFlipcall returns true if version v supports IPC channels from // the flipcall package. Note that these must be negotiated, but this version // string indicates that such a facility exists. func versionSupportsFlipcall(v uint32) bool { return v >= 8 } // VersionSupportsOpenTruncateFlag returns true if version v supports // passing the OpenTruncate flag to Tlopen. func VersionSupportsOpenTruncateFlag(v uint32) bool { return v >= 9 } // versionSupportsGetSetXattr returns true if version v supports // the Tgetxattr and Tsetxattr messages. func versionSupportsGetSetXattr(v uint32) bool { return v >= 10 } // versionSupportsListRemoveXattr returns true if version v supports // the Tlistxattr and Tremovexattr messages. 
func versionSupportsListRemoveXattr(v uint32) bool { return v >= 11 } // versionSupportsTsetattrclunk returns true if version v supports // the Tsetattrclunk message. func versionSupportsTsetattrclunk(v uint32) bool { return v >= 12 } // versionSupportsTmultiGetAttr returns true if version v supports // the TmultiGetAttr message. func versionSupportsTmultiGetAttr(v uint32) bool { return v >= 13 }
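The downgrade loop that the highestSupportedVersion comment above expects from clients looks roughly like the following standalone sketch; tryVersion is a hypothetical stand-in for issuing a real Tversion request:

package main

import "fmt"

func main() {
	const serverMax = 9 // pretend the server stops at 9P2000.L.Google.9
	tryVersion := func(v uint32) bool { return v <= serverMax }

	version := uint32(13) // highestSupportedVersion at the time of writing
	for !tryVersion(version) {
		version-- // decrement until the server accepts
	}
	fmt.Printf("negotiated 9P2000.L.Google.%d\n", version)

	// Feature predicates then gate message use; e.g. the OpenTruncate flag
	// requires version 9 or later, per VersionSupportsOpenTruncateFlag.
	fmt.Println("can use OpenTruncate:", version >= 9)
}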
// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "io" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/fsmetric" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) // A FileDescription represents an open file description, which is the entity // referred to by a file descriptor (POSIX.1-2017 3.258 "Open File // Description"). // // FileDescriptions are reference-counted. Unless otherwise specified, all // FileDescription methods require that a reference is held. // // FileDescription is analogous to Linux's struct file. // // +stateify savable type FileDescription struct { FileDescriptionRefs // flagsMu protects `statusFlags`, `saved`, and `asyncHandler` below. flagsMu sync.Mutex `state:"nosave"` // statusFlags contains status flags, "initialized by open(2) and possibly // modified by fcntl()" - fcntl(2). statusFlags can be read using atomic // memory operations when it does not need to be synchronized with an // access to asyncHandler. statusFlags uint32 // saved is true after beforeSave is called. This is used to prevent // double-unregistration of asyncHandler. This does not work properly for // save-resume, which is not currently supported in gVisor (see b/26588733). saved bool `state:"nosave"` // asyncHandler handles O_ASYNC signal generation. It is set with the // F_SETOWN or F_SETOWN_EX fcntls. For asyncHandler to be used, O_ASYNC must // also be set by fcntl(2). asyncHandler FileAsync // epolls is the set of epollInterests registered for this FileDescription. // epolls is protected by epollMu. epollMu sync.Mutex `state:"nosave"` epolls map[*epollInterest]struct{} // vd is the filesystem location at which this FileDescription was opened. // A reference is held on vd. vd is immutable. vd VirtualDentry // opts contains options passed to FileDescription.Init(). opts is // immutable. opts FileDescriptionOptions // readable is MayReadFileWithOpenFlags(statusFlags). readable is // immutable. // // readable is analogous to Linux's FMODE_READ. readable bool // writable is MayWriteFileWithOpenFlags(statusFlags). If writable is true, // the FileDescription holds a write count on vd.mount. writable is // immutable. // // writable is analogous to Linux's FMODE_WRITE.
	writable bool

	// usedLockBSD is non-zero if the FileDescription was ever used to acquire
	// a BSD-style advisory lock; DecRef uses it to decide whether UnlockBSD
	// must be called on release. It is accessed using atomic memory
	// operations.
	usedLockBSD uint32

	// impl is the FileDescriptionImpl associated with this FileDescription.
	// impl is immutable. This should be the last field in FileDescription.
	impl FileDescriptionImpl
}

// FileDescriptionOptions contains options to FileDescription.Init().
//
// +stateify savable
type FileDescriptionOptions struct {
	// If AllowDirectIO is true, allow O_DIRECT to be set on the file.
	AllowDirectIO bool

	// If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE.
	DenyPRead bool

	// If DenyPWrite is true, calls to FileDescription.PWrite() return
	// ESPIPE.
	DenyPWrite bool

	// If UseDentryMetadata is true, calls to FileDescription methods that
	// interact with file and filesystem metadata (Stat, SetStat, StatFS,
	// ListXattr, GetXattr, SetXattr, RemoveXattr) are implemented by calling
	// the corresponding FilesystemImpl methods instead of the corresponding
	// FileDescriptionImpl methods.
	//
	// UseDentryMetadata is intended for file descriptions that are implemented
	// outside of individual filesystems, such as pipes, sockets, and device
	// special files. FileDescriptions for which UseDentryMetadata is true may
	// embed DentryMetadataFileDescriptionImpl to obtain appropriate
	// implementations of FileDescriptionImpl methods that should not be
	// called.
	UseDentryMetadata bool
}

// FileCreationFlags are the set of flags passed to FileDescription.Init() but
// omitted from FileDescription.StatusFlags().
const FileCreationFlags = linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC

// Init must be called before first use of fd. If it succeeds, it takes
// references on mnt and d. flags is the initial file description flags, which
// is usually the full set of flags passed to open(2).
func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error {
	writable := MayWriteFileWithOpenFlags(flags)
	if writable {
		if err := mnt.CheckBeginWrite(); err != nil {
			return err
		}
	}

	fd.InitRefs()

	// Remove "file creation flags" to mirror the behavior from file.f_flags in
	// fs/open.c:do_dentry_open.
	fd.statusFlags = flags &^ FileCreationFlags
	fd.vd = VirtualDentry{
		mount:  mnt,
		dentry: d,
	}
	mnt.IncRef()
	d.IncRef()
	fd.opts = *opts
	fd.readable = MayReadFileWithOpenFlags(flags)
	fd.writable = writable
	fd.impl = impl
	return nil
}

// DecRef decrements fd's reference count.
func (fd *FileDescription) DecRef(ctx context.Context) {
	fd.FileDescriptionRefs.DecRef(func() {
		// Generate inotify events.
		ev := uint32(linux.IN_CLOSE_NOWRITE)
		if fd.IsWritable() {
			ev = linux.IN_CLOSE_WRITE
		}
		fd.Dentry().InotifyWithParent(ctx, ev, 0, PathEvent)

		// Unregister fd from all epoll instances.
		fd.epollMu.Lock()
		epolls := fd.epolls
		fd.epolls = nil
		fd.epollMu.Unlock()
		for epi := range epolls {
			ep := epi.epoll
			ep.interestMu.Lock()
			// Check that epi has not been concurrently unregistered by
			// EpollInstance.DeleteInterest() or EpollInstance.Release().
			if _, ok := ep.interest[epi.key]; ok {
				fd.EventUnregister(&epi.waiter)
				ep.removeLocked(epi)
			}
			ep.interestMu.Unlock()
		}

		// If BSD locks were used, release any lock that it may have acquired.
		if atomic.LoadUint32(&fd.usedLockBSD) != 0 {
			fd.impl.UnlockBSD(context.Background(), fd)
		}

		// Release implementation resources.
fd.impl.Release(ctx) if fd.writable { fd.vd.mount.EndWrite() } fd.vd.DecRef(ctx) fd.flagsMu.Lock() if !fd.saved && fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil { fd.asyncHandler.Unregister(fd) } fd.asyncHandler = nil fd.flagsMu.Unlock() }) } // Mount returns the mount on which fd was opened. It does not take a reference // on the returned Mount. func (fd *FileDescription) Mount() *Mount { return fd.vd.mount } // Dentry returns the dentry at which fd was opened. It does not take a // reference on the returned Dentry. func (fd *FileDescription) Dentry() *Dentry { return fd.vd.dentry } // VirtualDentry returns the location at which fd was opened. It does not take // a reference on the returned VirtualDentry. func (fd *FileDescription) VirtualDentry() VirtualDentry { return fd.vd } // Options returns the options passed to fd.Init(). func (fd *FileDescription) Options() FileDescriptionOptions { return fd.opts } // StatusFlags returns file description status flags, as for fcntl(F_GETFL). func (fd *FileDescription) StatusFlags() uint32 { return atomic.LoadUint32(&fd.statusFlags) } // SetStatusFlags sets file description status flags, as for fcntl(F_SETFL). func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error { // Compare Linux's fs/fcntl.c:setfl(). oldFlags := fd.StatusFlags() // Linux documents this check as "O_APPEND cannot be cleared if the file is // marked as append-only and the file is open for write", which would make // sense. However, the check as actually implemented seems to be "O_APPEND // cannot be changed if the file is marked as append-only". if (flags^oldFlags)&linux.O_APPEND != 0 { stat, err := fd.Stat(ctx, StatOptions{ // There is no mask bit for stx_attributes. Mask: 0, // Linux just reads inode::i_flags directly. Sync: linux.AT_STATX_DONT_SYNC, }) if err != nil { return err } if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) { return linuxerr.EPERM } } if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) { stat, err := fd.Stat(ctx, StatOptions{ Mask: linux.STATX_UID, // Linux's inode_owner_or_capable() just reads inode::i_uid // directly. Sync: linux.AT_STATX_DONT_SYNC, }) if err != nil { return err } if stat.Mask&linux.STATX_UID == 0 { return linuxerr.EPERM } if !CanActAsOwner(creds, auth.KUID(stat.UID)) { return linuxerr.EPERM } } if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO { return linuxerr.EINVAL } // TODO(gvisor.dev/issue/1035): FileDescriptionImpl.SetOAsync()? const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK fd.flagsMu.Lock() if fd.asyncHandler != nil { // Use fd.statusFlags instead of oldFlags, which may have become outdated, // to avoid double registering/unregistering. if fd.statusFlags&linux.O_ASYNC == 0 && flags&linux.O_ASYNC != 0 { fd.asyncHandler.Register(fd) } else if fd.statusFlags&linux.O_ASYNC != 0 && flags&linux.O_ASYNC == 0 { fd.asyncHandler.Unregister(fd) } } atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags)) fd.flagsMu.Unlock() return nil } // IsReadable returns true if fd was opened for reading. func (fd *FileDescription) IsReadable() bool { return fd.readable } // IsWritable returns true if fd was opened for writing. func (fd *FileDescription) IsWritable() bool { return fd.writable } // Impl returns the FileDescriptionImpl associated with fd. 
func (fd *FileDescription) Impl() FileDescriptionImpl {
	return fd.impl
}

// FileDescriptionImpl contains implementation details for a FileDescription.
// Implementations of FileDescriptionImpl should contain their associated
// FileDescription by value as their first field.
//
// For all functions that return linux.Statx, Statx.Uid and Statx.Gid will
// be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and
// auth.KGID respectively).
//
// All methods may return errors not specified.
//
// FileDescriptionImpl is analogous to Linux's struct file_operations.
type FileDescriptionImpl interface {
	// Release is called when the associated FileDescription reaches zero
	// references.
	Release(ctx context.Context)

	// OnClose is called when a file descriptor representing the
	// FileDescription is closed. Note that returning a non-nil error does not
	// prevent the file descriptor from being closed.
	OnClose(ctx context.Context) error

	// Stat returns metadata for the file represented by the FileDescription.
	Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)

	// SetStat updates metadata for the file represented by the
	// FileDescription. Implementations are responsible for checking if the
	// operation can be performed (see vfs.CheckSetStat() for common checks).
	SetStat(ctx context.Context, opts SetStatOptions) error

	// StatFS returns metadata for the filesystem containing the file
	// represented by the FileDescription.
	StatFS(ctx context.Context) (linux.Statfs, error)

	// Allocate grows the file to offset + length bytes.
	// Only mode == 0 is supported currently.
	//
	// Allocate should return EISDIR on directories, ESPIPE on pipes, and ENODEV on
	// other files where it is not supported.
	//
	// Preconditions: The FileDescription was opened for writing.
	Allocate(ctx context.Context, mode, offset, length uint64) error

	// waiter.Waitable methods may be used to poll for I/O events.
	waiter.Waitable

	// PRead reads from the file into dst, starting at the given offset, and
	// returns the number of bytes read. PRead is permitted to return partial
	// reads with a nil error.
	//
	// Errors:
	//
	// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
	//
	// Preconditions:
	// * The FileDescription was opened for reading.
	// * FileDescriptionOptions.DenyPRead == false.
	PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)

	// Read is similar to PRead, but does not specify an offset.
	//
	// For files with an implicit FileDescription offset (e.g. regular files),
	// Read begins at the FileDescription offset, and advances the offset by
	// the number of bytes read; note that POSIX 2.9.7 "Thread Interactions
	// with Regular File Operations" requires that all operations that may
	// mutate the FileDescription offset are serialized.
	//
	// Errors:
	//
	// - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP.
	//
	// Preconditions: The FileDescription was opened for reading.
	Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error)

	// PWrite writes src to the file, starting at the given offset, and returns
	// the number of bytes written. PWrite is permitted to return partial
	// writes with a nil error.
	//
	// As in Linux (but not POSIX), if O_APPEND is in effect for the
	// FileDescription, PWrite should ignore the offset and append data to the
	// end of the file.
	//
	// Errors:
	//
	// - If opts.Flags specifies unsupported options, PWrite returns
	// EOPNOTSUPP.
// // Preconditions: // * The FileDescription was opened for writing. // * FileDescriptionOptions.DenyPWrite == false. PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) // Write is similar to PWrite, but does not specify an offset, which is // implied as for Read. // // Write is a FileDescriptionImpl method, instead of a wrapper around // PWrite that uses a FileDescription offset, to make it possible for // remote filesystems to implement O_APPEND correctly (i.e. atomically with // respect to writers outside the scope of VFS). // // Errors: // // - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP. // // Preconditions: The FileDescription was opened for writing. Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) // IterDirents invokes cb on each entry in the directory represented by the // FileDescription. If IterDirents has been called since the last call to // Seek, it continues iteration from the end of the last call. IterDirents(ctx context.Context, cb IterDirentsCallback) error // Seek changes the FileDescription offset (assuming one exists) and // returns its new value. // // For directories, if whence == SEEK_SET and offset == 0, the caller is // rewinddir(), such that Seek "shall also cause the directory stream to // refer to the current state of the corresponding directory" - // POSIX.1-2017. Seek(ctx context.Context, offset int64, whence int32) (int64, error) // Sync requests that cached state associated with the file represented by // the FileDescription is synchronized with persistent storage, and blocks // until this is complete. Sync(ctx context.Context) error // ConfigureMMap mutates opts to implement mmap(2) for the file. Most // implementations that support memory mapping can call // GenericConfigureMMap with the appropriate memmap.Mappable. ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error // Ioctl implements the ioctl(2) syscall. Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) // ListXattr returns all extended attribute names for the file. ListXattr(ctx context.Context, size uint64) ([]string, error) // GetXattr returns the value associated with the given extended attribute // for the file. GetXattr(ctx context.Context, opts GetXattrOptions) (string, error) // SetXattr changes the value associated with the given extended attribute // for the file. SetXattr(ctx context.Context, opts SetXattrOptions) error // RemoveXattr removes the given extended attribute from the file. RemoveXattr(ctx context.Context, name string) error // SupportsLocks indicates whether file locks are supported. SupportsLocks() bool // LockBSD tries to acquire a BSD-style advisory file lock. LockBSD(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, block lock.Blocker) error // UnlockBSD releases a BSD-style advisory file lock. UnlockBSD(ctx context.Context, uid lock.UniqueID) error // LockPOSIX tries to acquire a POSIX-style advisory file lock. LockPOSIX(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, r lock.LockRange, block lock.Blocker) error // UnlockPOSIX releases a POSIX-style advisory file lock. UnlockPOSIX(ctx context.Context, uid lock.UniqueID, ComputeLockRange lock.LockRange) error // TestPOSIX returns information about whether the specified lock can be held, in the style of the F_GETLK fcntl. 
	TestPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, r lock.LockRange) (linux.Flock, error)
}

// Dirent holds the information contained in struct linux_dirent64.
//
// +stateify savable
type Dirent struct {
	// Name is the filename.
	Name string

	// Type is the file type, a linux.DT_* constant.
	Type uint8

	// Ino is the inode number.
	Ino uint64

	// NextOff is the offset of the *next* Dirent in the directory; that is,
	// FileDescription.Seek(NextOff, SEEK_SET) (as called by seekdir(3)) will
	// cause the next call to FileDescription.IterDirents() to yield the next
	// Dirent. (The offset of the first Dirent in a directory is always 0.)
	NextOff int64
}

// IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents.
type IterDirentsCallback interface {
	// Handle handles the given iterated Dirent. If Handle returns a non-nil
	// error, FileDescriptionImpl.IterDirents must stop iteration and return
	// the error; the next call to FileDescriptionImpl.IterDirents should
	// restart with the same Dirent.
	Handle(dirent Dirent) error
}

// IterDirentsCallbackFunc implements IterDirentsCallback for a function with
// the semantics of IterDirentsCallback.Handle.
type IterDirentsCallbackFunc func(dirent Dirent) error

// Handle implements IterDirentsCallback.Handle.
func (f IterDirentsCallbackFunc) Handle(dirent Dirent) error {
	return f(dirent)
}

// OnClose is called when a file descriptor representing the FileDescription is
// closed. Returning a non-nil error should not prevent the file descriptor
// from being closed.
func (fd *FileDescription) OnClose(ctx context.Context) error {
	return fd.impl.OnClose(ctx)
}

// Stat returns metadata for the file represented by fd.
func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
	if fd.opts.UseDentryMetadata {
		vfsObj := fd.vd.mount.vfs
		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
			Root:  fd.vd,
			Start: fd.vd,
		})
		stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
		rp.Release(ctx)
		return stat, err
	}
	return fd.impl.Stat(ctx, opts)
}

// SetStat updates metadata for the file represented by fd.
func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error {
	if fd.opts.UseDentryMetadata {
		vfsObj := fd.vd.mount.vfs
		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
			Root:  fd.vd,
			Start: fd.vd,
		})
		err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts)
		rp.Release(ctx)
		return err
	}
	return fd.impl.SetStat(ctx, opts)
}

// StatFS returns metadata for the filesystem containing the file represented
// by fd.
func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
	if fd.opts.UseDentryMetadata {
		vfsObj := fd.vd.mount.vfs
		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
			Root:  fd.vd,
			Start: fd.vd,
		})
		statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp)
		rp.Release(ctx)
		return statfs, err
	}
	return fd.impl.StatFS(ctx)
}

// Allocate grows the file represented by fd to offset + length bytes.
func (fd *FileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
	if !fd.IsWritable() {
		return linuxerr.EBADF
	}
	if err := fd.impl.Allocate(ctx, mode, offset, length); err != nil {
		return err
	}
	fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent)
	return nil
}

// Readiness implements waiter.Waitable.Readiness.
//
// It returns fd's I/O readiness.
func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { return fd.impl.Readiness(mask) } // EventRegister implements waiter.Waitable.EventRegister. // // It registers e for I/O readiness events in mask. func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { fd.impl.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. // // It unregisters e for I/O readiness events. func (fd *FileDescription) EventUnregister(e *waiter.Entry) { fd.impl.EventUnregister(e) } // PRead reads from the file represented by fd into dst, starting at the given // offset, and returns the number of bytes read. PRead is permitted to return // partial reads with a nil error. func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) { if fd.opts.DenyPRead { return 0, linuxerr.ESPIPE } if !fd.readable { return 0, linuxerr.EBADF } start := fsmetric.StartReadWait() n, err := fd.impl.PRead(ctx, dst, offset, opts) if n > 0 { fd.Dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, PathEvent) } fsmetric.Reads.Increment() fsmetric.FinishReadWait(fsmetric.ReadWait, start) return n, err } // Read is similar to PRead, but does not specify an offset. func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { if !fd.readable { return 0, linuxerr.EBADF } start := fsmetric.StartReadWait() n, err := fd.impl.Read(ctx, dst, opts) if n > 0 { fd.Dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, PathEvent) } fsmetric.Reads.Increment() fsmetric.FinishReadWait(fsmetric.ReadWait, start) return n, err } // PWrite writes src to the file represented by fd, starting at the given // offset, and returns the number of bytes written. PWrite is permitted to // return partial writes with a nil error. func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) { if fd.opts.DenyPWrite { return 0, linuxerr.ESPIPE } if !fd.writable { return 0, linuxerr.EBADF } n, err := fd.impl.PWrite(ctx, src, offset, opts) if n > 0 { fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent) } return n, err } // Write is similar to PWrite, but does not specify an offset. func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { if !fd.writable { return 0, linuxerr.EBADF } n, err := fd.impl.Write(ctx, src, opts) if n > 0 { fd.Dentry().InotifyWithParent(ctx, linux.IN_MODIFY, 0, PathEvent) } return n, err } // IterDirents invokes cb on each entry in the directory represented by fd. If // IterDirents has been called since the last call to Seek, it continues // iteration from the end of the last call. func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error { return fd.impl.IterDirents(ctx, cb) } // Seek changes fd's offset (assuming one exists) and returns its new value. func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { return fd.impl.Seek(ctx, offset, whence) } // Sync has the semantics of fsync(2). func (fd *FileDescription) Sync(ctx context.Context) error { return fd.impl.Sync(ctx) } // ConfigureMMap mutates opts to implement mmap(2) for the file represented by // fd. func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return fd.impl.ConfigureMMap(ctx, opts) } // Ioctl implements the ioctl(2) syscall. 
func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	return fd.impl.Ioctl(ctx, uio, args)
}

// ListXattr returns all extended attribute names for the file represented by
// fd.
//
// If the size of the list (including a NUL terminating byte after every entry)
// would exceed size, ERANGE may be returned (note that implementations
// are free to ignore size entirely and return without error). In all cases,
// if size is 0, the list should be returned without error, regardless of size.
func (fd *FileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
	if fd.opts.UseDentryMetadata {
		vfsObj := fd.vd.mount.vfs
		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
			Root:  fd.vd,
			Start: fd.vd,
		})
		names, err := fd.vd.mount.fs.impl.ListXattrAt(ctx, rp, size)
		rp.Release(ctx)
		return names, err
	}
	names, err := fd.impl.ListXattr(ctx, size)
	if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) {
		// Linux doesn't actually return EOPNOTSUPP in this case; instead,
		// fs/xattr.c:vfs_listxattr() falls back to allowing the security
		// subsystem to return security extended attributes, which by default
		// don't exist.
		return nil, nil
	}
	return names, err
}

// GetXattr returns the value associated with the given extended attribute for
// the file represented by fd.
//
// If the size of the return value exceeds opts.Size, ERANGE may be returned
// (note that implementations are free to ignore opts.Size entirely and return
// without error). In all cases, if opts.Size is 0, the value should be
// returned without error, regardless of size.
func (fd *FileDescription) GetXattr(ctx context.Context, opts *GetXattrOptions) (string, error) {
	if fd.opts.UseDentryMetadata {
		vfsObj := fd.vd.mount.vfs
		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
			Root:  fd.vd,
			Start: fd.vd,
		})
		val, err := fd.vd.mount.fs.impl.GetXattrAt(ctx, rp, *opts)
		rp.Release(ctx)
		return val, err
	}
	return fd.impl.GetXattr(ctx, *opts)
}

// SetXattr changes the value associated with the given extended attribute for
// the file represented by fd.
func (fd *FileDescription) SetXattr(ctx context.Context, opts *SetXattrOptions) error {
	if fd.opts.UseDentryMetadata {
		vfsObj := fd.vd.mount.vfs
		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
			Root:  fd.vd,
			Start: fd.vd,
		})
		err := fd.vd.mount.fs.impl.SetXattrAt(ctx, rp, *opts)
		rp.Release(ctx)
		return err
	}
	return fd.impl.SetXattr(ctx, *opts)
}

// RemoveXattr removes the given extended attribute from the file represented
// by fd.
func (fd *FileDescription) RemoveXattr(ctx context.Context, name string) error {
	if fd.opts.UseDentryMetadata {
		vfsObj := fd.vd.mount.vfs
		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
			Root:  fd.vd,
			Start: fd.vd,
		})
		err := fd.vd.mount.fs.impl.RemoveXattrAt(ctx, rp, name)
		rp.Release(ctx)
		return err
	}
	return fd.impl.RemoveXattr(ctx, name)
}

// SyncFS instructs the filesystem containing fd to execute the semantics of
// syncfs(2).
func (fd *FileDescription) SyncFS(ctx context.Context) error {
	return fd.vd.mount.fs.impl.Sync(ctx)
}

// MappedName implements memmap.MappingIdentity.MappedName.
func (fd *FileDescription) MappedName(ctx context.Context) string {
	vfsroot := RootFromContext(ctx)
	s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd)
	if vfsroot.Ok() {
		vfsroot.DecRef(ctx)
	}
	return s
}

// DeviceID implements memmap.MappingIdentity.DeviceID.
func (fd *FileDescription) DeviceID() uint64 { stat, err := fd.Stat(context.Background(), StatOptions{ // There is no STATX_DEV; we assume that Stat will return it if it's // available regardless of mask. Mask: 0, // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_sb->s_dev // directly. Sync: linux.AT_STATX_DONT_SYNC, }) if err != nil { return 0 } return uint64(linux.MakeDeviceID(uint16(stat.DevMajor), stat.DevMinor)) } // InodeID implements memmap.MappingIdentity.InodeID. func (fd *FileDescription) InodeID() uint64 { stat, err := fd.Stat(context.Background(), StatOptions{ Mask: linux.STATX_INO, // fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly. Sync: linux.AT_STATX_DONT_SYNC, }) if err != nil || stat.Mask&linux.STATX_INO == 0 { return 0 } return stat.Ino } // Msync implements memmap.MappingIdentity.Msync. func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error { return fd.Sync(ctx) } // SupportsLocks indicates whether file locks are supported. func (fd *FileDescription) SupportsLocks() bool { return fd.impl.SupportsLocks() } // LockBSD tries to acquire a BSD-style advisory file lock. func (fd *FileDescription) LockBSD(ctx context.Context, ownerPID int32, lockType lock.LockType, blocker lock.Blocker) error { atomic.StoreUint32(&fd.usedLockBSD, 1) return fd.impl.LockBSD(ctx, fd, ownerPID, lockType, blocker) } // UnlockBSD releases a BSD-style advisory file lock. func (fd *FileDescription) UnlockBSD(ctx context.Context) error { return fd.impl.UnlockBSD(ctx, fd) } // LockPOSIX locks a POSIX-style file range lock. func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, ownerPID int32, t lock.LockType, r lock.LockRange, block lock.Blocker) error { return fd.impl.LockPOSIX(ctx, uid, ownerPID, t, r, block) } // UnlockPOSIX unlocks a POSIX-style file range lock. func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, r lock.LockRange) error { return fd.impl.UnlockPOSIX(ctx, uid, r) } // TestPOSIX returns information about whether the specified lock can be held. func (fd *FileDescription) TestPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, r lock.LockRange) (linux.Flock, error) { return fd.impl.TestPOSIX(ctx, uid, t, r) } // ComputeLockRange computes the range of a file lock based on the given values. func (fd *FileDescription) ComputeLockRange(ctx context.Context, start uint64, length uint64, whence int16) (lock.LockRange, error) { var off int64 switch whence { case linux.SEEK_SET: off = 0 case linux.SEEK_CUR: // Note that Linux does not hold any mutexes while retrieving the file // offset, see fs/locks.c:flock_to_posix_lock and fs/locks.c:fcntl_setlk. curOff, err := fd.Seek(ctx, 0, linux.SEEK_CUR) if err != nil { return lock.LockRange{}, err } off = curOff case linux.SEEK_END: stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_SIZE}) if err != nil { return lock.LockRange{}, err } off = int64(stat.Size) default: return lock.LockRange{}, linuxerr.EINVAL } return lock.ComputeRange(int64(start), int64(length), off) } // A FileAsync sends signals to its owner when w is ready for IO. This is only // implemented by pkg/sentry/fasync:FileAsync, but we unfortunately need this // interface to avoid circular dependencies. type FileAsync interface { Register(w waiter.Waitable) Unregister(w waiter.Waitable) } // AsyncHandler returns the FileAsync for fd. 
func (fd *FileDescription) AsyncHandler() FileAsync { fd.flagsMu.Lock() defer fd.flagsMu.Unlock() return fd.asyncHandler } // SetAsyncHandler sets fd.asyncHandler if it has not been set before and // returns it. func (fd *FileDescription) SetAsyncHandler(newHandler func() FileAsync) FileAsync { fd.flagsMu.Lock() defer fd.flagsMu.Unlock() if fd.asyncHandler == nil { fd.asyncHandler = newHandler() if fd.statusFlags&linux.O_ASYNC != 0 { fd.asyncHandler.Register(fd) } } return fd.asyncHandler } // CopyRegularFileData copies data from srcFD to dstFD until reading from srcFD // returns EOF or an error. It returns the number of bytes copied. func CopyRegularFileData(ctx context.Context, dstFD, srcFD *FileDescription) (int64, error) { done := int64(0) buf := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size for { readN, readErr := srcFD.Read(ctx, buf, ReadOptions{}) if readErr != nil && readErr != io.EOF { return done, readErr } src := buf.TakeFirst64(readN) for src.NumBytes() != 0 { writeN, writeErr := dstFD.Write(ctx, src, WriteOptions{}) done += writeN src = src.DropFirst64(writeN) if writeErr != nil { return done, writeErr } } if readErr == io.EOF { return done, nil } } }
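// The following standalone snippet is an illustrative sketch, not part of the
// gVisor source. It demonstrates the bit manipulation used by
// FileDescription.SetStatusFlags above: only the bits in settableFlags may
// change, while other bits (such as the access mode) are preserved from the
// old value. The lowercase flag constants and mergeStatusFlags are inventions
// of this example; the values mirror Linux's x86-64 flag constants.
package main

import "fmt"

const (
	oRdwr     = 0x2     // O_RDWR access mode; not settable via F_SETFL
	oAppend   = 0x400   // O_APPEND
	oNonblock = 0x800   // O_NONBLOCK
	oNoatime  = 0x40000 // O_NOATIME
)

// mergeStatusFlags mirrors the update performed by SetStatusFlags:
// (oldFlags &^ settableFlags) | (flags & settableFlags).
func mergeStatusFlags(oldFlags, flags, settableFlags uint32) uint32 {
	return (oldFlags &^ settableFlags) | (flags & settableFlags)
}

func main() {
	settable := uint32(oAppend | oNonblock | oNoatime)
	old := uint32(oRdwr | oAppend) // opened with O_RDWR|O_APPEND
	requested := uint32(oNonblock) // caller asks for O_NONBLOCK only
	merged := mergeStatusFlags(old, requested, settable)
	// O_RDWR survives (not settable), O_APPEND is cleared (settable but not
	// requested), and O_NONBLOCK is set: merged == 0x802.
	fmt.Printf("old=%#x requested=%#x merged=%#x\n", old, requested, merged)
}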
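// Illustrative sketch, not part of the gVisor source: the read/drain loop in
// CopyRegularFileData above must tolerate short writes, since Write may
// consume only part of the buffer. The same pattern expressed over plain
// io.Reader and io.Writer looks like this; copyData and its buffer size are
// inventions of this example (io.Copy does the same job in the standard
// library, this just spells the loop out).
package main

import (
	"bytes"
	"fmt"
	"io"
	"strings"
)

// copyData copies src to dst until EOF, draining partial writes, and returns
// the number of bytes copied.
func copyData(dst io.Writer, src io.Reader) (int64, error) {
	var done int64
	buf := make([]byte, 32*1024) // arbitrary buffer size, as in the original
	for {
		readN, readErr := src.Read(buf)
		// Drain everything that was read, even if each Write is partial.
		rem := buf[:readN]
		for len(rem) > 0 {
			writeN, writeErr := dst.Write(rem)
			done += int64(writeN)
			rem = rem[writeN:]
			if writeErr != nil {
				return done, writeErr
			}
		}
		if readErr == io.EOF {
			return done, nil
		}
		if readErr != nil {
			return done, readErr
		}
	}
}

func main() {
	var dst bytes.Buffer
	n, err := copyData(&dst, strings.NewReader("hello, copy loop"))
	fmt.Println(n, err, dst.String())
}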
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package netlink

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserr"
)

// socketProviderVFS2 implements socket.ProviderVFS2.
type socketProviderVFS2 struct {
}

// Socket implements socket.ProviderVFS2.Socket.
func (*socketProviderVFS2) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
	// Netlink sockets must be specified as datagram or raw, but they
	// behave the same regardless of type.
	if stype != linux.SOCK_DGRAM && stype != linux.SOCK_RAW {
		return nil, syserr.ErrSocketNotSupported
	}

	provider, ok := protocols[protocol]
	if !ok {
		return nil, syserr.ErrProtocolNotSupported
	}

	p, err := provider(t)
	if err != nil {
		return nil, err
	}

	s, err := NewVFS2(t, stype, p)
	if err != nil {
		return nil, err
	}

	vfsfd := &s.vfsfd
	mnt := t.Kernel().SocketMount()
	d := sockfs.NewDentry(t, mnt)
	defer d.DecRef(t)
	if err := vfsfd.Init(s, linux.O_RDWR, mnt, d, &vfs.FileDescriptionOptions{
		DenyPRead:         true,
		DenyPWrite:        true,
		UseDentryMetadata: true,
	}); err != nil {
		return nil, syserr.FromError(err)
	}
	return vfsfd, nil
}

// Pair implements socket.ProviderVFS2.Pair by returning an error.
func (*socketProviderVFS2) Pair(*kernel.Task, linux.SockType, int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) {
	// Netlink sockets never support creating socket pairs.
	return nil, nil, syserr.ErrNotSupported
}
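// Illustrative sketch, not part of the gVisor source: the provider above is
// dispatched through a family-keyed registry like the one in
// pkg/sentry/socket (see RegisterProviderVFS2/NewVFS2 later in this
// document). This minimal model uses invented names (registry, newSocket,
// errUnsupported) to show the convention that a provider returning a zero
// value with a nil error means "protocol not supported here, try the next
// provider".
package main

import (
	"errors"
	"fmt"
)

var errUnsupported = errors.New("address family not supported")

type provider func(protocol int) (socket string, err error)

var registry = make(map[int][]provider)

func register(family int, p provider) { registry[family] = append(registry[family], p) }

// newSocket mirrors socket.NewVFS2: the first provider that returns a
// non-zero socket wins; an ("", nil) result means "not mine".
func newSocket(family, protocol int) (string, error) {
	for _, p := range registry[family] {
		s, err := p(protocol)
		if err != nil {
			return "", err
		}
		if s != "" {
			return s, nil
		}
	}
	return "", errUnsupported
}

func main() {
	const afNetlink = 16 // AF_NETLINK's Linux value
	register(afNetlink, func(protocol int) (string, error) {
		if protocol != 0 { // pretend only NETLINK_ROUTE (0) is implemented
			return "", nil
		}
		return "netlink-route-socket", nil
	})
	fmt.Println(newSocket(afNetlink, 0))
	fmt.Println(newSocket(afNetlink, 99))
}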
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mm

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
)

// Dumpability describes if and how core dumps should be created.
type Dumpability int

const (
	// NotDumpable indicates that core dumps should never be created.
	NotDumpable Dumpability = iota

	// UserDumpable indicates that core dumps should be created, owned by
	// the current user.
	UserDumpable

	// RootDumpable indicates that core dumps should be created, owned by
	// root.
	RootDumpable
)

// Dumpability returns the dumpability.
func (mm *MemoryManager) Dumpability() Dumpability {
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()
	return mm.dumpability
}

// SetDumpability sets the dumpability.
func (mm *MemoryManager) SetDumpability(d Dumpability) {
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()
	mm.dumpability = d
}

// ArgvStart returns the start of the application argument vector.
//
// There is no guarantee that this value is sensible w.r.t. ArgvEnd.
func (mm *MemoryManager) ArgvStart() hostarch.Addr {
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()
	return mm.argv.Start
}

// SetArgvStart sets the start of the application argument vector.
func (mm *MemoryManager) SetArgvStart(a hostarch.Addr) {
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()
	mm.argv.Start = a
}

// ArgvEnd returns the end of the application argument vector.
//
// There is no guarantee that this value is sensible w.r.t. ArgvStart.
func (mm *MemoryManager) ArgvEnd() hostarch.Addr {
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()
	return mm.argv.End
}

// SetArgvEnd sets the end of the application argument vector.
func (mm *MemoryManager) SetArgvEnd(a hostarch.Addr) {
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()
	mm.argv.End = a
}

// EnvvStart returns the start of the application environment vector.
//
// There is no guarantee that this value is sensible w.r.t. EnvvEnd.
func (mm *MemoryManager) EnvvStart() hostarch.Addr {
	mm.metadataMu.Lock()
	defer mm.metadataMu.Unlock()
	return mm.envv.Start
}

// SetEnvvStart sets the start of the application environment vector.
func (mm *MemoryManager) SetEnvvStart(a hostarch.Addr) { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() mm.envv.Start = a } // EnvvEnd returns the end of the application environment vector. // // There is no guarantee that this value is sensible w.r.t. EnvvStart. func (mm *MemoryManager) EnvvEnd() hostarch.Addr { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() return mm.envv.End } // SetEnvvEnd sets the end of the application environment vector. func (mm *MemoryManager) SetEnvvEnd(a hostarch.Addr) { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() mm.envv.End = a } // Auxv returns the current map of auxiliary vectors. func (mm *MemoryManager) Auxv() arch.Auxv { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() return append(arch.Auxv(nil), mm.auxv...) } // SetAuxv sets the entire map of auxiliary vectors. func (mm *MemoryManager) SetAuxv(auxv arch.Auxv) { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() mm.auxv = append(arch.Auxv(nil), auxv...) } // Executable returns the executable, if available. // // An additional reference will be taken in the case of a non-nil executable, // which must be released by the caller. func (mm *MemoryManager) Executable() fsbridge.File { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() if mm.executable == nil { return nil } mm.executable.IncRef() return mm.executable } // SetExecutable sets the executable. // // This takes a reference on d. func (mm *MemoryManager) SetExecutable(ctx context.Context, file fsbridge.File) { mm.metadataMu.Lock() // Grab a new reference. file.IncRef() // Set the executable. orig := mm.executable mm.executable = file mm.metadataMu.Unlock() // Release the old reference. // // Do this without holding the lock, since it may wind up doing some // I/O to sync the dirent, etc. if orig != nil { orig.DecRef(ctx) } } // VDSOSigReturn returns the address of vdso_sigreturn. func (mm *MemoryManager) VDSOSigReturn() uint64 { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() return mm.vdsoSigReturnAddr } // SetVDSOSigReturn sets the address of vdso_sigreturn. func (mm *MemoryManager) SetVDSOSigReturn(addr uint64) { mm.metadataMu.Lock() defer mm.metadataMu.Unlock() mm.vdsoSigReturnAddr = addr }
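// Illustrative sketch, not part of the gVisor source: Auxv and SetAuxv above
// copy the auxiliary vector with append(arch.Auxv(nil), ...) while holding
// metadataMu, so no caller ever aliases the MemoryManager's internal slice.
// This standalone model (metadata, pair, and Pairs are invented names) shows
// why the copy matters: handing out the internal slice would let callers
// mutate shared state without holding the lock.
package main

import (
	"fmt"
	"sync"
)

type pair struct{ Key, Val uint64 }

type metadata struct {
	mu    sync.Mutex
	pairs []pair
}

// Pairs returns a defensive copy, mirroring MemoryManager.Auxv.
func (m *metadata) Pairs() []pair {
	m.mu.Lock()
	defer m.mu.Unlock()
	return append([]pair(nil), m.pairs...)
}

// SetPairs stores a defensive copy, mirroring MemoryManager.SetAuxv.
func (m *metadata) SetPairs(p []pair) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.pairs = append([]pair(nil), p...)
}

func main() {
	var m metadata
	in := []pair{{Key: 6, Val: 4096}} // e.g. AT_PAGESZ
	m.SetPairs(in)
	in[0].Val = 0 // mutating the caller's slice does not affect m...
	out := m.Pairs()
	out[0].Val = 123          // ...nor does mutating the returned copy
	fmt.Println(m.Pairs()[0]) // still {6 4096}
}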
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package socket provides the interfaces that need to be provided by socket
// implementations and providers, as well as per-family demultiplexing of
// socket creation.
package socket

import (
	"bytes"
	"fmt"
	"sync/atomic"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal"
	"gvisor.dev/gvisor/pkg/sentry/device"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/usermem"
)

// ControlMessages represents the union of unix control messages and tcpip
// control messages.
type ControlMessages struct {
	Unix transport.ControlMessages
	IP   IPControlMessages
}

// packetInfoToLinux converts IPPacketInfo from tcpip format to Linux format.
func packetInfoToLinux(packetInfo tcpip.IPPacketInfo) linux.ControlMessageIPPacketInfo {
	var p linux.ControlMessageIPPacketInfo
	p.NIC = int32(packetInfo.NIC)
	copy(p.LocalAddr[:], []byte(packetInfo.LocalAddr))
	copy(p.DestinationAddr[:], []byte(packetInfo.DestinationAddr))
	return p
}

// errOriginToLinux maps tcpip socket origin to Linux socket origin constants.
func errOriginToLinux(origin tcpip.SockErrOrigin) uint8 {
	switch origin {
	case tcpip.SockExtErrorOriginNone:
		return linux.SO_EE_ORIGIN_NONE
	case tcpip.SockExtErrorOriginLocal:
		return linux.SO_EE_ORIGIN_LOCAL
	case tcpip.SockExtErrorOriginICMP:
		return linux.SO_EE_ORIGIN_ICMP
	case tcpip.SockExtErrorOriginICMP6:
		return linux.SO_EE_ORIGIN_ICMP6
	default:
		panic(fmt.Sprintf("unknown socket origin: %d", origin))
	}
}

// sockErrCmsgToLinux converts SockError control message from tcpip format to
// Linux format.
func sockErrCmsgToLinux(sockErr *tcpip.SockError) linux.SockErrCMsg {
	if sockErr == nil {
		return nil
	}

	ee := linux.SockExtendedErr{
		Errno:  uint32(syserr.TranslateNetstackError(sockErr.Err).ToLinux()),
		Origin: errOriginToLinux(sockErr.Cause.Origin()),
		Type:   sockErr.Cause.Type(),
		Code:   sockErr.Cause.Code(),
		Info:   sockErr.Cause.Info(),
	}

	switch sockErr.NetProto {
	case header.IPv4ProtocolNumber:
		errMsg := &linux.SockErrCMsgIPv4{SockExtendedErr: ee}
		if len(sockErr.Offender.Addr) > 0 {
			addr, _ := ConvertAddress(linux.AF_INET, sockErr.Offender)
			errMsg.Offender = *addr.(*linux.SockAddrInet)
		}
		return errMsg
	case header.IPv6ProtocolNumber:
		errMsg := &linux.SockErrCMsgIPv6{SockExtendedErr: ee}
		if len(sockErr.Offender.Addr) > 0 {
			addr, _ := ConvertAddress(linux.AF_INET6, sockErr.Offender)
			errMsg.Offender = *addr.(*linux.SockAddrInet6)
		}
		return errMsg
	default:
		panic(fmt.Sprintf("invalid net proto for creating SockErrCMsg: %d", sockErr.NetProto))
	}
}

// NewIPControlMessages converts the tcpip ControlMessages (which does not
// have a Linux-specific format) to the Linux format.
func NewIPControlMessages(family int, cmgs tcpip.ControlMessages) IPControlMessages {
	var orgDstAddr linux.SockAddr
	if cmgs.HasOriginalDstAddress {
		orgDstAddr, _ = ConvertAddress(family, cmgs.OriginalDstAddress)
	}
	return IPControlMessages{
		HasTimestamp:       cmgs.HasTimestamp,
		Timestamp:          cmgs.Timestamp,
		HasInq:             cmgs.HasInq,
		Inq:                cmgs.Inq,
		HasTOS:             cmgs.HasTOS,
		TOS:                cmgs.TOS,
		HasTClass:          cmgs.HasTClass,
		TClass:             cmgs.TClass,
		HasIPPacketInfo:    cmgs.HasIPPacketInfo,
		PacketInfo:         packetInfoToLinux(cmgs.PacketInfo),
		OriginalDstAddress: orgDstAddr,
		SockErr:            sockErrCmsgToLinux(cmgs.SockErr),
	}
}

// IPControlMessages contains socket control messages for IP sockets.
// This can contain Linux specific structures unlike tcpip.ControlMessages.
//
// +stateify savable
type IPControlMessages struct {
	// HasTimestamp indicates whether Timestamp is valid/set.
	HasTimestamp bool

	// Timestamp is the time (in ns) that the last packet used to create
	// the read data was received.
	Timestamp int64

	// HasInq indicates whether Inq is valid/set.
	HasInq bool

	// Inq is the number of bytes ready to be received.
	Inq int32

	// HasTOS indicates whether TOS is valid/set.
	HasTOS bool

	// TOS is the IPv4 type of service of the associated packet.
	TOS uint8

	// HasTClass indicates whether TClass is valid/set.
	HasTClass bool

	// TClass is the IPv6 traffic class of the associated packet.
	TClass uint32

	// HasIPPacketInfo indicates whether PacketInfo is set.
	HasIPPacketInfo bool

	// PacketInfo holds interface and address data on an incoming packet.
	PacketInfo linux.ControlMessageIPPacketInfo

	// OriginalDstAddress holds the original destination address
	// and port of the incoming packet.
	OriginalDstAddress linux.SockAddr

	// SockErr is the dequeued socket error on recvmsg(MSG_ERRQUEUE).
	SockErr linux.SockErrCMsg
}

// Release releases Unix domain socket credentials and rights.
func (c *ControlMessages) Release(ctx context.Context) {
	c.Unix.Release(ctx)
}

// Socket is an interface combining fs.FileOperations and SocketOps,
// representing a VFS1 socket file.
type Socket interface {
	fs.FileOperations
	SocketOps
}

// SocketVFS2 is an interface combining vfs.FileDescriptionImpl and SocketOps,
// representing a VFS2 socket file.
type SocketVFS2 interface {
	vfs.FileDescriptionImpl
	SocketOps
}

// SocketOps is the interface containing socket syscalls used by the syscall
// layer to redirect them to the appropriate implementation.
//
// It is implemented by both Socket and SocketVFS2.
type SocketOps interface {
	// Connect implements the connect(2) linux syscall.
	Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error

	// Accept implements the accept4(2) linux syscall.
	// Returns fd, real peer address length and error. Real peer address
	// length is only set if len(peer) > 0.
	Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error)

	// Bind implements the bind(2) linux syscall.
	Bind(t *kernel.Task, sockaddr []byte) *syserr.Error

	// Listen implements the listen(2) linux syscall.
	Listen(t *kernel.Task, backlog int) *syserr.Error

	// Shutdown implements the shutdown(2) linux syscall.
	Shutdown(t *kernel.Task, how int) *syserr.Error

	// GetSockOpt implements the getsockopt(2) linux syscall.
	GetSockOpt(t *kernel.Task, level int, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error)

	// SetSockOpt implements the setsockopt(2) linux syscall.
	SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error

	// GetSockName implements the getsockname(2) linux syscall.
	//
	// addrLen is the address length to be returned to the application, not
	// necessarily the actual length of the address.
	GetSockName(t *kernel.Task) (addr linux.SockAddr, addrLen uint32, err *syserr.Error)

	// GetPeerName implements the getpeername(2) linux syscall.
	//
	// addrLen is the address length to be returned to the application, not
	// necessarily the actual length of the address.
	GetPeerName(t *kernel.Task) (addr linux.SockAddr, addrLen uint32, err *syserr.Error)

	// RecvMsg implements the recvmsg(2) linux syscall.
	//
	// senderAddrLen is the address length to be returned to the application,
	// not necessarily the actual length of the address.
	//
	// flags control how RecvMsg should be completed. msgFlags indicate how
	// the RecvMsg call was completed. Note that control message truncation
	// may still be required even if the MSG_CTRUNC bit is not set in
	// msgFlags. In that case, the caller should set MSG_CTRUNC appropriately.
	//
	// If err != nil, the recv was not successful.
	RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages ControlMessages, err *syserr.Error)

	// SendMsg implements the sendmsg(2) linux syscall. SendMsg does not take
	// ownership of the ControlMessage on error.
	//
	// If n > 0, err will either be nil or an error from t.Block.
	SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages ControlMessages) (n int, err *syserr.Error)

	// SetRecvTimeout sets the timeout (in ns) for recv operations. Zero means
	// no timeout, and negative means DONTWAIT.
	SetRecvTimeout(nanoseconds int64)

	// RecvTimeout gets the current timeout (in ns) for recv operations. Zero
	// means no timeout, and negative means DONTWAIT.
	RecvTimeout() int64

	// SetSendTimeout sets the timeout (in ns) for send operations. Zero means
	// no timeout, and negative means DONTWAIT.
	SetSendTimeout(nanoseconds int64)

	// SendTimeout gets the current timeout (in ns) for send operations. Zero
	// means no timeout, and negative means DONTWAIT.
	SendTimeout() int64

	// State returns the current state of the socket, as represented by Linux in
	// procfs. The returned state value is protocol-specific.
	State() uint32

	// Type returns the family, socket type and protocol of the socket.
	Type() (family int, skType linux.SockType, protocol int)
}

// Provider is the interface implemented by providers of sockets for specific
// address families (e.g., AF_INET).
type Provider interface {
	// Socket creates a new socket.
	//
	// If a nil Socket _and_ a nil error is returned, it means that the
	// protocol is not supported. A non-nil error should only be returned
	// if the protocol is supported, but an error occurs during creation.
	Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error)

	// Pair creates a pair of connected sockets.
	//
	// See Socket for error information.
	Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error)
}

// families holds a map of all known address families and their providers.
var families = make(map[int][]Provider)

// RegisterProvider registers the provider of a given address family so that
// sockets of that type can be created via socket() and/or socketpair()
// syscalls.
//
// This should only be called during the initialization of the address family.
func RegisterProvider(family int, provider Provider) { families[family] = append(families[family], provider) } // New creates a new socket with the given family, type and protocol. func New(t *kernel.Task, family int, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) { for _, p := range families[family] { s, err := p.Socket(t, stype, protocol) if err != nil { return nil, err } if s != nil { t.Kernel().RecordSocket(s) return s, nil } } return nil, syserr.ErrAddressFamilyNotSupported } // Pair creates a new connected socket pair with the given family, type and // protocol. func Pair(t *kernel.Task, family int, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) { providers, ok := families[family] if !ok { return nil, nil, syserr.ErrAddressFamilyNotSupported } for _, p := range providers { s1, s2, err := p.Pair(t, stype, protocol) if err != nil { return nil, nil, err } if s1 != nil && s2 != nil { k := t.Kernel() k.RecordSocket(s1) k.RecordSocket(s2) return s1, s2, nil } } return nil, nil, syserr.ErrSocketNotSupported } // NewDirent returns a sockfs fs.Dirent that resides on device d. func NewDirent(ctx context.Context, d *device.Device) *fs.Dirent { ino := d.NextIno() iops := &fsutil.SimpleFileInode{ InodeSimpleAttributes: fsutil.NewInodeSimpleAttributes(ctx, fs.FileOwnerFromContext(ctx), fs.FilePermissions{ User: fs.PermMask{Read: true, Write: true}, }, linux.SOCKFS_MAGIC), } inode := fs.NewInode(ctx, iops, fs.NewPseudoMountSource(ctx), fs.StableAttr{ Type: fs.Socket, DeviceID: d.DeviceID(), InodeID: ino, BlockSize: hostarch.PageSize, }) // Dirent name matches net/socket.c:sockfs_dname. return fs.NewDirent(ctx, inode, fmt.Sprintf("socket:[%d]", ino)) } // ProviderVFS2 is the vfs2 interface implemented by providers of sockets for // specific address families (e.g., AF_INET). type ProviderVFS2 interface { // Socket creates a new socket. // // If a nil Socket _and_ a nil error is returned, it means that the // protocol is not supported. A non-nil error should only be returned // if the protocol is supported, but an error occurs during creation. Socket(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) // Pair creates a pair of connected sockets. // // See Socket for error information. Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) } // familiesVFS2 holds a map of all known address families and their providers. var familiesVFS2 = make(map[int][]ProviderVFS2) // RegisterProviderVFS2 registers the provider of a given address family so that // sockets of that type can be created via socket() and/or socketpair() // syscalls. // // This should only be called during the initialization of the address family. func RegisterProviderVFS2(family int, provider ProviderVFS2) { familiesVFS2[family] = append(familiesVFS2[family], provider) } // NewVFS2 creates a new socket with the given family, type and protocol. func NewVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) { for _, p := range familiesVFS2[family] { s, err := p.Socket(t, stype, protocol) if err != nil { return nil, err } if s != nil { t.Kernel().RecordSocketVFS2(s) return s, nil } } return nil, syserr.ErrAddressFamilyNotSupported } // PairVFS2 creates a new connected socket pair with the given family, type and // protocol. 
func PairVFS2(t *kernel.Task, family int, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) { providers, ok := familiesVFS2[family] if !ok { return nil, nil, syserr.ErrAddressFamilyNotSupported } for _, p := range providers { s1, s2, err := p.Pair(t, stype, protocol) if err != nil { return nil, nil, err } if s1 != nil && s2 != nil { k := t.Kernel() k.RecordSocketVFS2(s1) k.RecordSocketVFS2(s2) return s1, s2, nil } } return nil, nil, syserr.ErrSocketNotSupported } // SendReceiveTimeout stores timeouts for send and receive calls. // // It is meant to be embedded into Socket implementations to help satisfy the // interface. // // Care must be taken when copying SendReceiveTimeout as it contains atomic // variables. // // +stateify savable type SendReceiveTimeout struct { // send is length of the send timeout in nanoseconds. // // send must be accessed atomically. send int64 // recv is length of the receive timeout in nanoseconds. // // recv must be accessed atomically. recv int64 } // SetRecvTimeout implements Socket.SetRecvTimeout. func (to *SendReceiveTimeout) SetRecvTimeout(nanoseconds int64) { atomic.StoreInt64(&to.recv, nanoseconds) } // RecvTimeout implements Socket.RecvTimeout. func (to *SendReceiveTimeout) RecvTimeout() int64 { return atomic.LoadInt64(&to.recv) } // SetSendTimeout implements Socket.SetSendTimeout. func (to *SendReceiveTimeout) SetSendTimeout(nanoseconds int64) { atomic.StoreInt64(&to.send, nanoseconds) } // SendTimeout implements Socket.SendTimeout. func (to *SendReceiveTimeout) SendTimeout() int64 { return atomic.LoadInt64(&to.send) } // GetSockOptEmitUnimplementedEvent emits unimplemented event if name is valid. // It contains names that are valid for GetSockOpt when level is SOL_SOCKET. func GetSockOptEmitUnimplementedEvent(t *kernel.Task, name int) { switch name { case linux.SO_ACCEPTCONN, linux.SO_BPF_EXTENSIONS, linux.SO_COOKIE, linux.SO_DOMAIN, linux.SO_ERROR, linux.SO_GET_FILTER, linux.SO_INCOMING_NAPI_ID, linux.SO_MEMINFO, linux.SO_PEERCRED, linux.SO_PEERGROUPS, linux.SO_PEERNAME, linux.SO_PEERSEC, linux.SO_PROTOCOL, linux.SO_SNDLOWAT, linux.SO_TYPE: t.Kernel().EmitUnimplementedEvent(t) default: emitUnimplementedEvent(t, name) } } // SetSockOptEmitUnimplementedEvent emits unimplemented event if name is valid. // It contains names that are valid for SetSockOpt when level is SOL_SOCKET. func SetSockOptEmitUnimplementedEvent(t *kernel.Task, name int) { switch name { case linux.SO_ATTACH_BPF, linux.SO_ATTACH_FILTER, linux.SO_ATTACH_REUSEPORT_CBPF, linux.SO_ATTACH_REUSEPORT_EBPF, linux.SO_CNX_ADVICE, linux.SO_DETACH_FILTER, linux.SO_RCVBUFFORCE, linux.SO_SNDBUFFORCE: t.Kernel().EmitUnimplementedEvent(t) default: emitUnimplementedEvent(t, name) } } // emitUnimplementedEvent emits unimplemented event if name is valid. It // contains names that are common between Get and SetSocketOpt when level is // SOL_SOCKET. 
func emitUnimplementedEvent(t *kernel.Task, name int) { switch name { case linux.SO_BINDTODEVICE, linux.SO_BROADCAST, linux.SO_BSDCOMPAT, linux.SO_BUSY_POLL, linux.SO_DEBUG, linux.SO_DONTROUTE, linux.SO_INCOMING_CPU, linux.SO_KEEPALIVE, linux.SO_LINGER, linux.SO_LOCK_FILTER, linux.SO_MARK, linux.SO_MAX_PACING_RATE, linux.SO_NOFCS, linux.SO_OOBINLINE, linux.SO_PASSCRED, linux.SO_PASSSEC, linux.SO_PEEK_OFF, linux.SO_PRIORITY, linux.SO_RCVBUF, linux.SO_RCVLOWAT, linux.SO_RCVTIMEO, linux.SO_REUSEADDR, linux.SO_REUSEPORT, linux.SO_RXQ_OVFL, linux.SO_SELECT_ERR_QUEUE, linux.SO_SNDBUF, linux.SO_SNDTIMEO, linux.SO_TIMESTAMP, linux.SO_TIMESTAMPING, linux.SO_TIMESTAMPNS, linux.SO_TXTIME, linux.SO_WIFI_STATUS, linux.SO_ZEROCOPY: t.Kernel().EmitUnimplementedEvent(t) } } // UnmarshalSockAddr unmarshals memory representing a struct sockaddr to one of // the ABI socket address types. // // Precondition: data must be long enough to represent a socket address of the // given family. func UnmarshalSockAddr(family int, data []byte) linux.SockAddr { switch family { case unix.AF_INET: var addr linux.SockAddrInet addr.UnmarshalUnsafe(data[:addr.SizeBytes()]) return &addr case unix.AF_INET6: var addr linux.SockAddrInet6 addr.UnmarshalUnsafe(data[:addr.SizeBytes()]) return &addr case unix.AF_UNIX: var addr linux.SockAddrUnix addr.UnmarshalUnsafe(data[:addr.SizeBytes()]) return &addr case unix.AF_NETLINK: var addr linux.SockAddrNetlink addr.UnmarshalUnsafe(data[:addr.SizeBytes()]) return &addr default: panic(fmt.Sprintf("Unsupported socket family %v", family)) } } var sockAddrLinkSize = (&linux.SockAddrLink{}).SizeBytes() var sockAddrInetSize = (&linux.SockAddrInet{}).SizeBytes() var sockAddrInet6Size = (&linux.SockAddrInet6{}).SizeBytes() // Ntohs converts a 16-bit number from network byte order to host byte order. It // assumes that the host is little endian. func Ntohs(v uint16) uint16 { return v<<8 | v>>8 } // Htons converts a 16-bit number from host byte order to network byte order. It // assumes that the host is little endian. func Htons(v uint16) uint16 { return Ntohs(v) } // isLinkLocal determines if the given IPv6 address is link-local. This is the // case when it has the fe80::/10 prefix. This check is used to determine when // the NICID is relevant for a given IPv6 address. func isLinkLocal(addr tcpip.Address) bool { return len(addr) >= 2 && addr[0] == 0xfe && addr[1]&0xc0 == 0x80 } // ConvertAddress converts the given address to a native format. func ConvertAddress(family int, addr tcpip.FullAddress) (linux.SockAddr, uint32) { switch family { case linux.AF_UNIX: var out linux.SockAddrUnix out.Family = linux.AF_UNIX l := len([]byte(addr.Addr)) for i := 0; i < l; i++ { out.Path[i] = int8(addr.Addr[i]) } // Linux returns the used length of the address struct (including the // null terminator) for filesystem paths. The Family field is 2 bytes. // It is sometimes allowed to exclude the null terminator if the // address length is the max. Abstract and empty paths always return // the full exact length. if l == 0 || out.Path[0] == 0 || l == len(out.Path) { return &out, uint32(2 + l) } return &out, uint32(3 + l) case linux.AF_INET: var out linux.SockAddrInet copy(out.Addr[:], addr.Addr) out.Family = linux.AF_INET out.Port = Htons(addr.Port) return &out, uint32(sockAddrInetSize) case linux.AF_INET6: var out linux.SockAddrInet6 if len(addr.Addr) == header.IPv4AddressSize { // Copy address in v4-mapped format. 
copy(out.Addr[12:], addr.Addr) out.Addr[10] = 0xff out.Addr[11] = 0xff } else { copy(out.Addr[:], addr.Addr) } out.Family = linux.AF_INET6 out.Port = Htons(addr.Port) if isLinkLocal(addr.Addr) { out.Scope_id = uint32(addr.NIC) } return &out, uint32(sockAddrInet6Size) case linux.AF_PACKET: var out linux.SockAddrLink out.Family = linux.AF_PACKET out.InterfaceIndex = int32(addr.NIC) out.HardwareAddrLen = header.EthernetAddressSize copy(out.HardwareAddr[:], addr.Addr) return &out, uint32(sockAddrLinkSize) default: return nil, 0 } } // BytesToIPAddress converts an IPv4 or IPv6 address from the user to the // netstack representation, mapping the unspecified (all-zeros) address to the // empty address. func BytesToIPAddress(addr []byte) tcpip.Address { if bytes.Equal(addr, make([]byte, 4)) || bytes.Equal(addr, make([]byte, 16)) { return "" } return tcpip.Address(addr) } // AddressAndFamily reads a sockaddr struct from the given address and // converts it to the FullAddress format. It supports AF_UNIX, AF_INET, // AF_INET6, and AF_PACKET addresses. // // AddressAndFamily returns an address and its family. func AddressAndFamily(addr []byte) (tcpip.FullAddress, uint16, *syserr.Error) { // Make sure we have at least 2 bytes for the address family. if len(addr) < 2 { return tcpip.FullAddress{}, 0, syserr.ErrInvalidArgument } // Get the rest of the fields based on the address family. switch family := hostarch.ByteOrder.Uint16(addr); family { case linux.AF_UNIX: path := addr[2:] if len(path) > linux.UnixPathMax { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } // Drop the terminating NUL (if one exists) and everything after // it for filesystem (non-abstract) addresses. if len(path) > 0 && path[0] != 0 { if n := bytes.IndexByte(path[1:], 0); n >= 0 { path = path[:n+1] } } return tcpip.FullAddress{ Addr: tcpip.Address(path), }, family, nil case linux.AF_INET: var a linux.SockAddrInet if len(addr) < sockAddrInetSize { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } a.UnmarshalUnsafe(addr[:sockAddrInetSize]) out := tcpip.FullAddress{ Addr: BytesToIPAddress(a.Addr[:]), Port: Ntohs(a.Port), } return out, family, nil case linux.AF_INET6: var a linux.SockAddrInet6 if len(addr) < sockAddrInet6Size { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } a.UnmarshalUnsafe(addr[:sockAddrInet6Size]) out := tcpip.FullAddress{ Addr: BytesToIPAddress(a.Addr[:]), Port: Ntohs(a.Port), } if isLinkLocal(out.Addr) { out.NIC = tcpip.NICID(a.Scope_id) } return out, family, nil case linux.AF_PACKET: var a linux.SockAddrLink if len(addr) < sockAddrLinkSize { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } a.UnmarshalUnsafe(addr[:sockAddrLinkSize]) if a.Family != linux.AF_PACKET || a.HardwareAddrLen != header.EthernetAddressSize { return tcpip.FullAddress{}, family, syserr.ErrInvalidArgument } return tcpip.FullAddress{ NIC: tcpip.NICID(a.InterfaceIndex), Addr: tcpip.Address(a.HardwareAddr[:header.EthernetAddressSize]), }, family, nil case linux.AF_UNSPEC: return tcpip.FullAddress{}, family, nil default: return tcpip.FullAddress{}, 0, syserr.ErrAddressFamilyNotSupported } }
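The Ntohs/Htons helpers above are a single 16-bit byte swap, so applying the swap twice is the identity; that is why Htons can simply call Ntohs. A minimal standalone sketch of the round-trip (a hypothetical main package, not part of the sentry):

package main

import "fmt"

// ntohs mirrors the helpers above: a 16-bit byte swap that is its own
// inverse on a little-endian host.
func ntohs(v uint16) uint16 { return v<<8 | v>>8 }

func main() {
	port := uint16(8080)
	wire := ntohs(port) // host -> network is the same swap
	fmt.Printf("host %#04x -> network %#04x -> host %#04x\n", port, wire, ntohs(wire))
}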
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package linux import ( "io" "math" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/safemem" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/usermem" ) const ( _GRND_NONBLOCK = 0x1 _GRND_RANDOM = 0x2 ) // GetRandom implements the linux syscall getrandom(2). // // In a multi-tenant/shared environment, the only valid implementation is to // fetch data from the urandom pool, otherwise starvation attacks become // possible. The urandom pool is also expected to have plenty of entropy, thus // the GRND_RANDOM flag is ignored. The GRND_NONBLOCK flag does not apply, as // the pool will already be initialized. func GetRandom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { addr := args[0].Pointer() length := args[1].SizeT() flags := args[2].Int() // Flags are checked for validity but otherwise ignored. See above. if flags & ^(_GRND_NONBLOCK|_GRND_RANDOM) != 0 { return 0, nil, linuxerr.EINVAL } if length > math.MaxInt32 { length = math.MaxInt32 } ar, ok := addr.ToRange(uint64(length)) if !ok { return 0, nil, linuxerr.EFAULT } // "If the urandom source has been initialized, reads of up to 256 bytes // will always return as many bytes as requested and will not be // interrupted by signals. No such guarantees apply for larger buffer // sizes." - getrandom(2) min := int(length) if min > 256 { min = 256 } n, err := t.MemoryManager().CopyOutFrom(t, hostarch.AddrRangeSeqOf(ar), safemem.FromIOReader{&randReader{-1, min}}, usermem.IOOpts{ AddressSpaceActive: true, }) if n >= int64(min) { return uintptr(n), nil, nil } return 0, nil, err } // randReader is an io.Reader that handles partial reads from rand.Reader. type randReader struct { done int min int } // Read implements io.Reader.Read. func (r *randReader) Read(dst []byte) (int, error) { if r.done >= r.min { return rand.Reader.Read(dst) } min := r.min - r.done if min > len(dst) { min = len(dst) } return io.ReadAtLeast(rand.Reader, dst, min) }
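GetRandom only guarantees that the first min(length, 256) bytes are written without interruption, and randReader enforces that floor with io.ReadAtLeast. A self-contained sketch of the same contract against the standard library's crypto/rand (readRandom is an illustrative name, not the sentry's helper):

package main

import (
	"crypto/rand"
	"fmt"
	"io"
)

// readRandom mirrors the getrandom(2) contract enforced above: requests of
// up to 256 bytes are filled completely; larger buffers may legitimately
// return a partial read.
func readRandom(dst []byte) (int, error) {
	min := len(dst)
	if min > 256 {
		min = 256
	}
	return io.ReadAtLeast(rand.Reader, dst, min)
}

func main() {
	buf := make([]byte, 512)
	n, err := readRandom(buf)
	fmt.Println(n >= 256, err) // at least the first 256 bytes are guaranteed
}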
// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // FSContext contains filesystem context. // // This includes umask and working directory. // // +stateify savable type FSContext struct { FSContextRefs // mu protects below. mu sync.Mutex `state:"nosave"` // root is the filesystem root. Will be nil iff the FSContext has been // destroyed. root *fs.Dirent // rootVFS2 is the filesystem root. rootVFS2 vfs.VirtualDentry // cwd is the current working directory. Will be nil iff the FSContext // has been destroyed. cwd *fs.Dirent // cwdVFS2 is the current working directory. cwdVFS2 vfs.VirtualDentry // umask is the current file mode creation mask. When a thread using this // context invokes a syscall that creates a file, bits set in umask are // removed from the permissions that the file is created with. umask uint } // newFSContext returns a new filesystem context. func newFSContext(root, cwd *fs.Dirent, umask uint) *FSContext { root.IncRef() cwd.IncRef() f := FSContext{ root: root, cwd: cwd, umask: umask, } f.InitRefs() return &f } // NewFSContextVFS2 returns a new filesystem context. func NewFSContextVFS2(root, cwd vfs.VirtualDentry, umask uint) *FSContext { root.IncRef() cwd.IncRef() f := FSContext{ rootVFS2: root, cwdVFS2: cwd, umask: umask, } f.InitRefs() return &f } // DecRef implements RefCounter.DecRef. // // When f reaches zero references, DecRef will be called on both root and cwd // Dirents. // // Note that there may still be calls to WorkingDirectory() or RootDirectory() // (that return nil). This is because valid references may still be held via // proc files or other mechanisms.
func (f *FSContext) DecRef(ctx context.Context) { f.FSContextRefs.DecRef(func() { // Hold f.mu so that we don't race with RootDirectory() and // WorkingDirectory(). f.mu.Lock() defer f.mu.Unlock() if VFS2Enabled { f.rootVFS2.DecRef(ctx) f.rootVFS2 = vfs.VirtualDentry{} f.cwdVFS2.DecRef(ctx) f.cwdVFS2 = vfs.VirtualDentry{} } else { f.root.DecRef(ctx) f.root = nil f.cwd.DecRef(ctx) f.cwd = nil } }) } // Fork forks this FSContext. // // This is not a valid call after f is destroyed. func (f *FSContext) Fork() *FSContext { f.mu.Lock() defer f.mu.Unlock() if VFS2Enabled { if !f.cwdVFS2.Ok() { panic("FSContext.Fork() called after destroy") } f.cwdVFS2.IncRef() f.rootVFS2.IncRef() } else { if f.cwd == nil { panic("FSContext.Fork() called after destroy") } f.cwd.IncRef() f.root.IncRef() } ctx := &FSContext{ cwd: f.cwd, root: f.root, cwdVFS2: f.cwdVFS2, rootVFS2: f.rootVFS2, umask: f.umask, } ctx.InitRefs() return ctx } // WorkingDirectory returns the current working directory. // // This will return nil if called after f is destroyed, otherwise it will // return a Dirent with a reference taken. func (f *FSContext) WorkingDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() if f.cwd != nil { f.cwd.IncRef() } return f.cwd } // WorkingDirectoryVFS2 returns the current working directory. // // This will return an empty vfs.VirtualDentry if called after f is // destroyed, otherwise it will return a VirtualDentry with a reference taken. func (f *FSContext) WorkingDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() if f.cwdVFS2.Ok() { f.cwdVFS2.IncRef() } return f.cwdVFS2 } // SetWorkingDirectory sets the current working directory. // This will take an extra reference on the Dirent. // // This is not a valid call after f is destroyed. func (f *FSContext) SetWorkingDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetWorkingDirectory called with nil dirent") } f.mu.Lock() defer f.mu.Unlock() if f.cwd == nil { panic(fmt.Sprintf("FSContext.SetWorkingDirectory(%v) called after destroy", d)) } old := f.cwd f.cwd = d d.IncRef() old.DecRef(ctx) } // SetWorkingDirectoryVFS2 sets the current working directory. // This will take an extra reference on the VirtualDentry. // // This is not a valid call after f is destroyed. func (f *FSContext) SetWorkingDirectoryVFS2(ctx context.Context, d vfs.VirtualDentry) { f.mu.Lock() defer f.mu.Unlock() if !f.cwdVFS2.Ok() { panic(fmt.Sprintf("FSContext.SetWorkingDirectoryVFS2(%v) called after destroy", d)) } old := f.cwdVFS2 f.cwdVFS2 = d d.IncRef() old.DecRef(ctx) } // RootDirectory returns the current filesystem root. // // This will return nil if called after f is destroyed, otherwise it will // return a Dirent with a reference taken. func (f *FSContext) RootDirectory() *fs.Dirent { f.mu.Lock() defer f.mu.Unlock() if f.root != nil { f.root.IncRef() } return f.root } // RootDirectoryVFS2 returns the current filesystem root. // // This will return an empty vfs.VirtualDentry if called after f is // destroyed, otherwise it will return a VirtualDentry with a reference taken. func (f *FSContext) RootDirectoryVFS2() vfs.VirtualDentry { f.mu.Lock() defer f.mu.Unlock() if f.rootVFS2.Ok() { f.rootVFS2.IncRef() } return f.rootVFS2 } // SetRootDirectory sets the root directory. // This will take an extra reference on the Dirent. // // This is not a valid call after f is destroyed.
func (f *FSContext) SetRootDirectory(ctx context.Context, d *fs.Dirent) { if d == nil { panic("FSContext.SetRootDirectory called with nil dirent") } f.mu.Lock() defer f.mu.Unlock() if f.root == nil { panic(fmt.Sprintf("FSContext.SetRootDirectory(%v) called after destroy", d)) } old := f.root f.root = d d.IncRef() old.DecRef(ctx) } // SetRootDirectoryVFS2 sets the root directory. It takes a reference on vd. // // This is not a valid call after f is destroyed. func (f *FSContext) SetRootDirectoryVFS2(ctx context.Context, vd vfs.VirtualDentry) { if !vd.Ok() { panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry") } f.mu.Lock() if !f.rootVFS2.Ok() { f.mu.Unlock() panic(fmt.Sprintf("FSContext.SetRootDirectoryVFS2(%v) called after destroy", vd)) } old := f.rootVFS2 vd.IncRef() f.rootVFS2 = vd f.mu.Unlock() old.DecRef(ctx) } // Umask returns the current umask. func (f *FSContext) Umask() uint { f.mu.Lock() defer f.mu.Unlock() return f.umask } // SwapUmask atomically sets the current umask and returns the old umask. func (f *FSContext) SwapUmask(mask uint) uint { f.mu.Lock() defer f.mu.Unlock() old := f.umask f.umask = mask return old }
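The umask stored in FSContext is consumed at file-creation time by clearing the masked permission bits from the requested mode. A tiny standalone sketch of that masking rule (applyUmask is a hypothetical helper, not part of the sentry):

package main

import "fmt"

// applyUmask shows what the FSContext umask does at file creation: bits
// set in the mask are removed from the requested permissions.
func applyUmask(reqMode, umask uint) uint {
	return reqMode &^ umask
}

func main() {
	// A 0666 request under the common 022 umask yields 0644.
	fmt.Printf("%#o\n", applyUmask(0o666, 0o022))
}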
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fsbridge import ( "io" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/fspath" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/memmap" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/usermem" ) // VFSFile implements File interface over vfs.FileDescription. // // +stateify savable type VFSFile struct { file *vfs.FileDescription } var _ File = (*VFSFile)(nil) // NewVFSFile creates a new File over vfs.FileDescription. func NewVFSFile(file *vfs.FileDescription) File { return &VFSFile{file: file} } // PathnameWithDeleted implements File. func (f *VFSFile) PathnameWithDeleted(ctx context.Context) string { root := vfs.RootFromContext(ctx) defer root.DecRef(ctx) vfsObj := f.file.VirtualDentry().Mount().Filesystem().VirtualFilesystem() name, _ := vfsObj.PathnameWithDeleted(ctx, root, f.file.VirtualDentry()) return name } // ReadFull implements File. func (f *VFSFile) ReadFull(ctx context.Context, dst usermem.IOSequence, offset int64) (int64, error) { var total int64 for dst.NumBytes() > 0 { n, err := f.file.PRead(ctx, dst, offset+total, vfs.ReadOptions{}) total += n if err == io.EOF && total != 0 { return total, io.ErrUnexpectedEOF } else if err != nil { return total, err } dst = dst.DropFirst64(n) } return total, nil } // ConfigureMMap implements File. func (f *VFSFile) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error { return f.file.ConfigureMMap(ctx, opts) } // Type implements File. func (f *VFSFile) Type(ctx context.Context) (linux.FileMode, error) { stat, err := f.file.Stat(ctx, vfs.StatOptions{}) if err != nil { return 0, err } return linux.FileMode(stat.Mode).FileType(), nil } // IncRef implements File. func (f *VFSFile) IncRef() { f.file.IncRef() } // DecRef implements File. func (f *VFSFile) DecRef(ctx context.Context) { f.file.DecRef(ctx) } // FileDescription returns the FileDescription represented by f. It does not // take an additional reference on the returned FileDescription. func (f *VFSFile) FileDescription() *vfs.FileDescription { return f.file } // vfsLookup implements the Lookup interface using vfs.FileDescription. // // +stateify savable type vfsLookup struct { mntns *vfs.MountNamespace root vfs.VirtualDentry workingDir vfs.VirtualDentry } var _ Lookup = (*vfsLookup)(nil) // NewVFSLookup creates a new Lookup using VFS2.
func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry) Lookup { return &vfsLookup{ mntns: mntns, root: root, workingDir: workingDir, } } // OpenPath implements Lookup. // // remainingTraversals is not configurable in VFS2; all callers use the // default anyway. func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) { vfsObj := l.root.Mount().Filesystem().VirtualFilesystem() creds := auth.CredentialsFromContext(ctx) path := fspath.Parse(pathname) pop := &vfs.PathOperation{ Root: l.root, Start: l.workingDir, Path: path, FollowFinalSymlink: resolveFinal, } if path.Absolute { pop.Start = l.root } fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts) if err != nil { return nil, err } return &VFSFile{file: fd}, nil }
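OpenPath resolves absolute paths from the contextual root and relative paths from the working directory, which is the only use of the two VirtualDentry fields held by vfsLookup. A standalone sketch of just that start-selection rule (the dentry type here is a stand-in for vfs.VirtualDentry):

package main

import (
	"fmt"
	"strings"
)

// dentry is an illustrative stand-in; in OpenPath the values are
// vfs.VirtualDentry and the absolute check comes from fspath.Parse.
type dentry string

// startForPath mirrors OpenPath's rule: absolute paths resolve from the
// contextual root, everything else from the working directory.
func startForPath(root, cwd dentry, path string) dentry {
	if strings.HasPrefix(path, "/") {
		return root
	}
	return cwd
}

func main() {
	fmt.Println(startForPath("root", "cwd", "/etc/hosts")) // root
	fmt.Println(startForPath("root", "cwd", "etc/hosts"))  // cwd
}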
// Copyright 2020 The gVisor Authors.
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package fuse implements fusefs. package fuse import ( "math" "strconv" "sync" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/marshal" "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) // Name is the default filesystem name. const Name = "fuse" // maxActiveRequestsDefault is the default setting controlling the upper bound // on the number of active requests at any given time. const maxActiveRequestsDefault = 10000 // FilesystemType implements vfs.FilesystemType. // // +stateify savable type FilesystemType struct{} // +stateify savable type filesystemOptions struct { // mopts contains the raw, unparsed mount options passed to this filesystem. mopts string // uid of the mount owner. uid auth.KUID // gid of the mount owner. gid auth.KGID // rootMode specifies the file mode of the filesystem's root. rootMode linux.FileMode // maxActiveRequests specifies the maximum number of active requests that can // exist at any time. Any further requests will block when trying to // Call the server. maxActiveRequests uint64 // maxRead is the max number of bytes to read, // specified as "max_read" in fs parameters. // If not specified by the user, math.MaxUint32 is used as the default. maxRead uint32 // defaultPermissions is the default_permissions mount option. It instructs // the kernel to perform standard unix permission checks based on // ownership and mode bits, instead of deferring the check to the server. // // Immutable after mount. defaultPermissions bool // allowOther is the allow_other mount option. It allows processes that // don't own the FUSE mount to call into it. // // Immutable after mount. allowOther bool } // filesystem implements vfs.FilesystemImpl. // // +stateify savable type filesystem struct { kernfs.Filesystem devMinor uint32 // conn is used for communication between the FUSE server // daemon and the sentry fusefs. conn *connection // opts is the options the fusefs is initialized with. opts *filesystemOptions // umounted is true if filesystem.Release() has been called. umounted bool } // Name implements vfs.FilesystemType.Name. func (FilesystemType) Name() string { return Name } // Release implements vfs.FilesystemType.Release. func (FilesystemType) Release(ctx context.Context) {} // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { devMinor, err := vfsObj.GetAnonBlockDevMinor() if err != nil { return nil, nil, err } fsopts := filesystemOptions{mopts: opts.Data} mopts := vfs.GenericParseMountOptions(opts.Data) deviceDescriptorStr, ok := mopts["fd"] if !ok { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option fd missing") return nil, nil, linuxerr.EINVAL } delete(mopts, "fd") deviceDescriptor, err := strconv.ParseInt(deviceDescriptorStr, 10 /* base */, 32 /* bitSize */) if err != nil { ctx.Debugf("fusefs.FilesystemType.GetFilesystem: invalid fd: %q (%v)", deviceDescriptorStr, err) return nil, nil, linuxerr.EINVAL } kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("%s.GetFilesystem: couldn't get kernel task from context", fsType.Name()) return nil, nil, linuxerr.EINVAL } fuseFDGeneric := kernelTask.GetFileVFS2(int32(deviceDescriptor)) if fuseFDGeneric == nil { return nil, nil, linuxerr.EINVAL } defer fuseFDGeneric.DecRef(ctx) fuseFD, ok := fuseFDGeneric.Impl().(*DeviceFD) if !ok { log.Warningf("%s.GetFilesystem: device FD is %T, not a FUSE device", fsType.Name(), fuseFDGeneric) return nil, nil, linuxerr.EINVAL } // Parse and set all the other supported FUSE mount options. // TODO(gvisor.dev/issue/3229): Expand the supported mount options. if uidStr, ok := mopts["user_id"]; ok { delete(mopts, "user_id") uid, err := strconv.ParseUint(uidStr, 10, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid user_id: user_id=%s", fsType.Name(), uidStr) return nil, nil, linuxerr.EINVAL } kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) if !kuid.Ok() { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) return nil, nil, linuxerr.EINVAL } fsopts.uid = kuid } else { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option user_id missing") return nil, nil, linuxerr.EINVAL } if gidStr, ok := mopts["group_id"]; ok { delete(mopts, "group_id") gid, err := strconv.ParseUint(gidStr, 10, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid group_id: group_id=%s", fsType.Name(), gidStr) return nil, nil, linuxerr.EINVAL } kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) if !kgid.Ok() { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) return nil, nil, linuxerr.EINVAL } fsopts.gid = kgid } else { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option group_id missing") return nil, nil, linuxerr.EINVAL } if modeStr, ok := mopts["rootmode"]; ok { delete(mopts, "rootmode") mode, err := strconv.ParseUint(modeStr, 8, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid mode: %q", fsType.Name(), modeStr) return nil, nil, linuxerr.EINVAL } fsopts.rootMode = linux.FileMode(mode) } else { ctx.Warningf("fusefs.FilesystemType.GetFilesystem: mandatory mount option rootmode missing") return nil, nil, linuxerr.EINVAL } // Set the maxActiveRequests option.
fsopts.maxActiveRequests = maxActiveRequestsDefault if maxReadStr, ok := mopts["max_read"]; ok { delete(mopts, "max_read") maxRead, err := strconv.ParseUint(maxReadStr, 10, 32) if err != nil { log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr) return nil, nil, linuxerr.EINVAL } if maxRead < fuseMinMaxRead { maxRead = fuseMinMaxRead } fsopts.maxRead = uint32(maxRead) } else { fsopts.maxRead = math.MaxUint32 } if _, ok := mopts["default_permissions"]; ok { delete(mopts, "default_permissions") fsopts.defaultPermissions = true } if _, ok := mopts["allow_other"]; ok { delete(mopts, "allow_other") fsopts.allowOther = true } // Check for unparsed options. if len(mopts) != 0 { log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts) return nil, nil, linuxerr.EINVAL } // Create a new FUSE filesystem. fs, err := newFUSEFilesystem(ctx, vfsObj, &fsType, fuseFD, devMinor, &fsopts) if err != nil { log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err) return nil, nil, err } // Send a FUSE_INIT request to the FUSE daemon server before returning. // This call is not blocking. if err := fs.conn.InitSend(creds, uint32(kernelTask.ThreadID())); err != nil { log.Warningf("%s.InitSend: failed with error: %v", fsType.Name(), err) fs.VFSFilesystem().DecRef(ctx) // returned by newFUSEFilesystem return nil, nil, err } // root is the fusefs root directory. root := fs.newRoot(ctx, creds, fsopts.rootMode) return fs.VFSFilesystem(), root.VFSDentry(), nil } // newFUSEFilesystem creates a new FUSE filesystem. func newFUSEFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, fsType *FilesystemType, fuseFD *DeviceFD, devMinor uint32, opts *filesystemOptions) (*filesystem, error) { conn, err := newFUSEConnection(ctx, fuseFD, opts) if err != nil { log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err) return nil, linuxerr.EINVAL } fs := &filesystem{ devMinor: devMinor, opts: opts, conn: conn, } fs.VFSFilesystem().Init(vfsObj, fsType, fs) // FIXME(gvisor.dev/issue/4813): Doesn't conn or fs need to hold a // reference on fuseFD, since conn uses fuseFD for communication with the // server? Wouldn't doing so create a circular reference? fs.VFSFilesystem().IncRef() // for fuseFD.fs // FIXME(gvisor.dev/issue/4813): fuseFD.fs is accessed without // synchronization. fuseFD.fs = fs return fs, nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release(ctx context.Context) { fs.conn.fd.mu.Lock() fs.umounted = true fs.conn.Abort(ctx) // Notify all the waiters on this fd. fs.conn.fd.waitQueue.Notify(waiter.ReadableEvents) fs.conn.fd.mu.Unlock() fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) fs.Filesystem.Release(ctx) } // MountOptions implements vfs.FilesystemImpl.MountOptions. func (fs *filesystem) MountOptions() string { return fs.opts.mopts } // inode implements kernfs.Inode. // // +stateify savable type inode struct { inodeRefs kernfs.InodeAlwaysValid kernfs.InodeAttrs kernfs.InodeDirectoryNoNewChildren kernfs.InodeNotSymlink kernfs.OrderedChildren // fs is the owning filesystem. fs is immutable. fs *filesystem // metadataMu protects the metadata of this inode. metadataMu sync.Mutex nodeID uint64 locks vfs.FileLocks // size of the file. size uint64 // attributeVersion is the version of the inode's attributes. attributeVersion uint64 // attributeTime is the remaining valid time of the attributes. attributeTime uint64 // version of the inode.
version uint64 // link is the result of following a symbolic link. link string } func (fs *filesystem) newRoot(ctx context.Context, creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry { i := &inode{fs: fs, nodeID: 1} i.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755) i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) i.InitRefs() var d kernfs.Dentry d.InitRoot(&fs.Filesystem, i) return &d } func (fs *filesystem) newInode(ctx context.Context, nodeID uint64, attr linux.FUSEAttr) kernfs.Inode { i := &inode{fs: fs, nodeID: nodeID} creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.GID), EffectiveKUID: auth.KUID(attr.UID)} i.InodeAttrs.Init(ctx, &creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode)) atomic.StoreUint64(&i.size, attr.Size) i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) i.InitRefs() return i } // CheckPermissions implements kernfs.Inode.CheckPermissions. func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error { // Since FUSE operations are ultimately backed by a userspace process (the // fuse daemon), allowing a process to call into fusefs grants the daemon // ptrace-like capabilities over the calling process. Because of this, by // default FUSE only allows the mount owner to interact with the // filesystem. This explicitly excludes setuid/setgid processes. // // This behaviour can be overridden with the 'allow_other' mount option. // // See fs/fuse/dir.c:fuse_allow_current_process() in Linux. if !i.fs.opts.allowOther { if creds.RealKUID != i.fs.opts.uid || creds.EffectiveKUID != i.fs.opts.uid || creds.SavedKUID != i.fs.opts.uid || creds.RealKGID != i.fs.opts.gid || creds.EffectiveKGID != i.fs.opts.gid || creds.SavedKGID != i.fs.opts.gid { return linuxerr.EACCES } } // By default, fusefs delegates all permission checks to the server. // However, standard unix permission checks can be enabled with the // default_permissions mount option. if i.fs.opts.defaultPermissions { return i.InodeAttrs.CheckPermissions(ctx, creds, ats) } return nil } // Open implements kernfs.Inode.Open. func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { isDir := i.InodeAttrs.Mode().IsDir() // Return an error if a directory open was requested but the inode is not // a directory. if !isDir && opts.Mode.IsDir() { return nil, linuxerr.ENOTDIR } if opts.Flags&linux.O_LARGEFILE == 0 && atomic.LoadUint64(&i.size) > linux.MAX_NON_LFS { return nil, linuxerr.EOVERFLOW } var fd *fileDescription var fdImpl vfs.FileDescriptionImpl if isDir { directoryFD := &directoryFD{} fd = &(directoryFD.fileDescription) fdImpl = directoryFD } else { regularFD := &regularFileFD{} fd = &(regularFD.fileDescription) fdImpl = regularFD } // FOPEN_KEEP_CACHE is the default flag for noOpen. fd.OpenFlag = linux.FOPEN_KEEP_CACHE // Only send an open request when the FUSE server supports open or we are // opening a directory. if !i.fs.conn.noOpen || isDir { kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.Open: couldn't get kernel task from context") return nil, linuxerr.EINVAL } // Build the request. var opcode linux.FUSEOpcode if isDir { opcode = linux.FUSE_OPENDIR } else { opcode = linux.FUSE_OPEN } in := linux.FUSEOpenIn{Flags: opts.Flags & ^uint32(linux.O_CREAT|linux.O_EXCL|linux.O_NOCTTY)} if !i.fs.conn.atomicOTrunc { in.Flags &= ^uint32(linux.O_TRUNC) } // Send the request and receive the reply.
req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, &in) res, err := i.fs.conn.Call(kernelTask, req) if err != nil { return nil, err } if err := res.Error(); linuxerr.Equals(linuxerr.ENOSYS, err) && !isDir { i.fs.conn.noOpen = true } else if err != nil { return nil, err } else { out := linux.FUSEOpenOut{} if err := res.UnmarshalPayload(&out); err != nil { return nil, err } // Process the reply. fd.OpenFlag = out.OpenFlag if isDir { fd.OpenFlag &= ^uint32(linux.FOPEN_DIRECT_IO) } fd.Fh = out.Fh } } // TODO(gvisor.dev/issue/3234): invalidate mmap after implementing it for the FUSE inode. fd.DirectIO = fd.OpenFlag&linux.FOPEN_DIRECT_IO != 0 fdOptions := &vfs.FileDescriptionOptions{} if fd.OpenFlag&linux.FOPEN_NONSEEKABLE != 0 { fdOptions.DenyPRead = true fdOptions.DenyPWrite = true fd.Nonseekable = true } // If we don't send SETATTR before open (which is indicated by atomicOTrunc) // and O_TRUNC is set, update the inode's version number and clean existing data // by setting the file size to 0. if i.fs.conn.atomicOTrunc && opts.Flags&linux.O_TRUNC != 0 { i.fs.conn.mu.Lock() i.fs.conn.attributeVersion++ i.attributeVersion = i.fs.conn.attributeVersion atomic.StoreUint64(&i.size, 0) i.fs.conn.mu.Unlock() i.attributeTime = 0 } if err := fd.vfsfd.Init(fdImpl, opts.Flags, rp.Mount(), d.VFSDentry(), fdOptions); err != nil { return nil, err } return &fd.vfsfd, nil } // Lookup implements kernfs.Inode.Lookup. func (i *inode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) { in := linux.FUSELookupIn{Name: name} return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in) } // Keep implements kernfs.Inode.Keep. func (i *inode) Keep() bool { // Return true so that kernfs keeps the new dentry pointing to this // inode in the dentry tree. This is needed because inodes created via // Lookup are not temporary. They might refer to existing files on the // server that can be Unlink'd/Rmdir'd. return true } // IterDirents implements kernfs.Inode.IterDirents. func (*inode) IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) { return offset, nil } // NewFile implements kernfs.Inode.NewFile. func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) { kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context") return nil, linuxerr.EINVAL } in := linux.FUSECreateIn{ CreateMeta: linux.FUSECreateMeta{ Flags: opts.Flags, Mode: uint32(opts.Mode) | linux.S_IFREG, Umask: uint32(kernelTask.FSContext().Umask()), }, Name: name, } return i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in) } // NewNode implements kernfs.Inode.NewNode. func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (kernfs.Inode, error) { in := linux.FUSEMknodIn{ MknodMeta: linux.FUSEMknodMeta{ Mode: uint32(opts.Mode), Rdev: linux.MakeDeviceID(uint16(opts.DevMajor), opts.DevMinor), Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()), }, Name: name, } return i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in) } // NewSymlink implements kernfs.Inode.NewSymlink. func (i *inode) NewSymlink(ctx context.Context, name, target string) (kernfs.Inode, error) { in := linux.FUSESymLinkIn{ Name: name, Target: target, } return i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in) } // Unlink implements kernfs.Inode.Unlink.
func (i *inode) Unlink(ctx context.Context, name string, child kernfs.Inode) error { kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.Unlink: couldn't get kernel task from context") return linuxerr.EINVAL } in := linux.FUSEUnlinkIn{Name: name} req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in) res, err := i.fs.conn.Call(kernelTask, req) if err != nil { return err } // Only return the error; discard res. return res.Error() } // NewDir implements kernfs.Inode.NewDir. func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) { in := linux.FUSEMkdirIn{ MkdirMeta: linux.FUSEMkdirMeta{ Mode: uint32(opts.Mode), Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()), }, Name: name, } return i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in) } // RmDir implements kernfs.Inode.RmDir. func (i *inode) RmDir(ctx context.Context, name string, child kernfs.Inode) error { fusefs := i.fs task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx) in := linux.FUSERmDirIn{Name: name} req := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_RMDIR, &in) res, err := i.fs.conn.Call(task, req) if err != nil { return err } return res.Error() } // newEntry calls the FUSE server to create an entry and allocates a // corresponding entry according to the response. // Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and FUSE_LOOKUP. func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (kernfs.Inode, error) { kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context") return nil, linuxerr.EINVAL } req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload) res, err := i.fs.conn.Call(kernelTask, req) if err != nil { return nil, err } if err := res.Error(); err != nil { return nil, err } out := linux.FUSEEntryOut{} if err := res.UnmarshalPayload(&out); err != nil { return nil, err } if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) { return nil, syserror.EIO } child := i.fs.newInode(ctx, out.NodeID, out.Attr) return child, nil } // Getlink implements kernfs.Inode.Getlink. func (i *inode) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) { path, err := i.Readlink(ctx, mnt) return vfs.VirtualDentry{}, path, err } // Readlink implements kernfs.Inode.Readlink. func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) { if i.Mode().FileType()&linux.S_IFLNK == 0 { return "", linuxerr.EINVAL } if len(i.link) == 0 { kernelTask := kernel.TaskFromContext(ctx) if kernelTask == nil { log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context") return "", linuxerr.EINVAL } req := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{}) res, err := i.fs.conn.Call(kernelTask, req) if err != nil { return "", err } i.link = string(res.data[res.hdr.SizeBytes():]) if !mnt.Options().ReadOnly { i.attributeTime = 0 } } return i.link, nil } // getFUSEAttr returns a linux.FUSEAttr of this inode stored in the local cache.
// TODO(gvisor.dev/issue/3679): Add support for other fields. func (i *inode) getFUSEAttr() linux.FUSEAttr { return linux.FUSEAttr{ Ino: i.Ino(), Size: atomic.LoadUint64(&i.size), Mode: uint32(i.Mode()), } } // statFromFUSEAttr converts attributes from linux.FUSEAttr to linux.Statx. The // opts.Sync attribute is ignored since the synchronization is handled by the // FUSE server. func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx { var stat linux.Statx stat.Blksize = attr.BlkSize stat.DevMajor, stat.DevMinor = linux.UNNAMED_MAJOR, devMinor rdevMajor, rdevMinor := linux.DecodeDeviceID(attr.Rdev) stat.RdevMajor, stat.RdevMinor = uint32(rdevMajor), rdevMinor if mask&linux.STATX_MODE != 0 { stat.Mode = uint16(attr.Mode) } if mask&linux.STATX_NLINK != 0 { stat.Nlink = attr.Nlink } if mask&linux.STATX_UID != 0 { stat.UID = attr.UID } if mask&linux.STATX_GID != 0 { stat.GID = attr.GID } if mask&linux.STATX_ATIME != 0 { stat.Atime = linux.StatxTimestamp{ Sec: int64(attr.Atime), Nsec: attr.AtimeNsec, } } if mask&linux.STATX_MTIME != 0 { stat.Mtime = linux.StatxTimestamp{ Sec: int64(attr.Mtime), Nsec: attr.MtimeNsec, } } if mask&linux.STATX_CTIME != 0 { stat.Ctime = linux.StatxTimestamp{ Sec: int64(attr.Ctime), Nsec: attr.CtimeNsec, } } if mask&linux.STATX_INO != 0 { stat.Ino = attr.Ino } if mask&linux.STATX_SIZE != 0 { stat.Size = attr.Size } if mask&linux.STATX_BLOCKS != 0 { stat.Blocks = attr.Blocks } return stat } // getAttr gets the attributes of this inode by issuing a FUSE_GETATTR request // or reading from the local cache. It updates the corresponding attributes if // necessary. func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions, flags uint32, fh uint64) (linux.FUSEAttr, error) { attributeVersion := atomic.LoadUint64(&i.fs.conn.attributeVersion) // TODO(gvisor.dev/issue/3679): send the request only if // - the local cache is invalid for the fields specified in opts.Mask // - a forced update is requested // - i.attributeTime has expired // If the local cache is still valid, return the local cache. // Currently we always send a request, // and we always set the metadata with the new result, // unless attributeVersion has changed. task := kernel.TaskFromContext(ctx) if task == nil { log.Warningf("couldn't get kernel task from context") return linux.FUSEAttr{}, linuxerr.EINVAL } creds := auth.CredentialsFromContext(ctx) in := linux.FUSEGetAttrIn{ GetAttrFlags: flags, Fh: fh, } req := i.fs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_GETATTR, &in) res, err := i.fs.conn.Call(task, req) if err != nil { return linux.FUSEAttr{}, err } if err := res.Error(); err != nil { return linux.FUSEAttr{}, err } var out linux.FUSEGetAttrOut if err := res.UnmarshalPayload(&out); err != nil { return linux.FUSEAttr{}, err } // The local version is newer; return it and skip the update. if attributeVersion != 0 && atomic.LoadUint64(&i.attributeVersion) > attributeVersion { return i.getFUSEAttr(), nil } // Set the metadata of kernfs.InodeAttrs. if err := i.InodeAttrs.SetStat(ctx, fs, creds, vfs.SetStatOptions{ Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor), }); err != nil { return linux.FUSEAttr{}, err } // Set the size if no error (after SetStat() check). atomic.StoreUint64(&i.size, out.Attr.Size) return out.Attr, nil } // reviseAttr attempts to update the attributes for internal purposes // by calling getAttr with a pre-specified mask. // Used by read, write, lseek.
func (i *inode) reviseAttr(ctx context.Context, flags uint32, fh uint64) error { // Never need atime for internal purposes. _, err := i.getAttr(ctx, i.fs.VFSFilesystem(), vfs.StatOptions{ Mask: linux.STATX_BASIC_STATS &^ linux.STATX_ATIME, }, flags, fh) return err } // Stat implements kernfs.Inode.Stat. func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) { attr, err := i.getAttr(ctx, fs, opts, 0, 0) if err != nil { return linux.Statx{}, err } return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil } // DecRef implements kernfs.Inode.DecRef. func (i *inode) DecRef(ctx context.Context) { i.inodeRefs.DecRef(func() { i.Destroy(ctx) }) } // StatFS implements kernfs.Inode.StatFS. func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { // TODO(gvisor.dev/issues/3413): Complete the implementation of statfs. return vfs.GenericStatFS(linux.FUSE_SUPER_MAGIC), nil } // fattrMaskFromStats converts vfs.SetStatOptions.Stat.Mask to linux stats mask // aligned with the attribute mask defined in include/linux/fs.h. func fattrMaskFromStats(mask uint32) uint32 { var fuseAttrMask uint32 maskMap := map[uint32]uint32{ linux.STATX_MODE: linux.FATTR_MODE, linux.STATX_UID: linux.FATTR_UID, linux.STATX_GID: linux.FATTR_GID, linux.STATX_SIZE: linux.FATTR_SIZE, linux.STATX_ATIME: linux.FATTR_ATIME, linux.STATX_MTIME: linux.FATTR_MTIME, linux.STATX_CTIME: linux.FATTR_CTIME, } for statxMask, fattrMask := range maskMap { if mask&statxMask != 0 { fuseAttrMask |= fattrMask } } return fuseAttrMask } // SetStat implements kernfs.Inode.SetStat. func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error { return i.setAttr(ctx, fs, creds, opts, false, 0) } func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions, useFh bool, fh uint64) error { conn := i.fs.conn task := kernel.TaskFromContext(ctx) if task == nil { log.Warningf("couldn't get kernel task from context") return linuxerr.EINVAL } // We should retain the original file type when assigning new mode. fileType := uint16(i.Mode()) & linux.S_IFMT fattrMask := fattrMaskFromStats(opts.Stat.Mask) if useFh { fattrMask |= linux.FATTR_FH } in := linux.FUSESetAttrIn{ Valid: fattrMask, Fh: fh, Size: opts.Stat.Size, Atime: uint64(opts.Stat.Atime.Sec), Mtime: uint64(opts.Stat.Mtime.Sec), Ctime: uint64(opts.Stat.Ctime.Sec), AtimeNsec: opts.Stat.Atime.Nsec, MtimeNsec: opts.Stat.Mtime.Nsec, CtimeNsec: opts.Stat.Ctime.Nsec, Mode: uint32(fileType | opts.Stat.Mode), UID: opts.Stat.UID, GID: opts.Stat.GID, } req := conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_SETATTR, &in) res, err := conn.Call(task, req) if err != nil { return err } if err := res.Error(); err != nil { return err } out := linux.FUSEGetAttrOut{} if err := res.UnmarshalPayload(&out); err != nil { return err } // Set the metadata of kernfs.InodeAttrs. if err := i.InodeAttrs.SetStat(ctx, fs, creds, vfs.SetStatOptions{ Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor), }); err != nil { return err } return nil }
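fattrMaskFromStats maps STATX_* bits onto FUSE's FATTR_* bits one for one before the FUSE_SETATTR request is built. A self-contained sketch of the same mapping, with the relevant Linux ABI values re-declared locally so the example compiles on its own (the constant names are illustrative; setAttr uses the linux package's):

package main

import "fmt"

// Locally re-declared ABI values matching linux.STATX_*/linux.FATTR_*.
const (
	statxMode uint32 = 0x002
	statxUID  uint32 = 0x008
	statxGID  uint32 = 0x010
	statxSize uint32 = 0x200

	fattrMode uint32 = 1 << 0
	fattrUID  uint32 = 1 << 1
	fattrGID  uint32 = 1 << 2
	fattrSize uint32 = 1 << 3
)

// toFattrMask mirrors fattrMaskFromStats: every STATX bit that is set turns
// on the corresponding FATTR bit in the FUSE_SETATTR request.
func toFattrMask(mask uint32) uint32 {
	var out uint32
	pairs := map[uint32]uint32{
		statxMode: fattrMode,
		statxUID:  fattrUID,
		statxGID:  fattrGID,
		statxSize: fattrSize,
	}
	for statx, fattr := range pairs {
		if mask&statx != 0 {
			out |= fattr
		}
	}
	return out
}

func main() {
	fmt.Printf("%#x\n", toFattrMask(statxMode|statxSize)) // 0x9 (FATTR_MODE|FATTR_SIZE)
}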
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package netfilter import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/syserr" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/stack" ) // matchMaker knows how to (un)marshal the matcher named name(). type matchMaker interface { // name is the matcher name as stored in the xt_entry_match struct. name() string // marshal converts from a stack.Matcher to an ABI struct. marshal(matcher matcher) []byte // unmarshal converts from the ABI matcher struct to a // stack.Matcher. unmarshal(task *kernel.Task, buf []byte, filter stack.IPHeaderFilter) (stack.Matcher, error) } type matcher interface { name() string } // matchMakers maps the name of supported matchers to the matchMaker that // marshals and unmarshals it. It is immutable after package initialization. var matchMakers = map[string]matchMaker{} // registerMatchMaker should be called by match extensions to register them // with the netfilter package. func registerMatchMaker(mm matchMaker) { if _, ok := matchMakers[mm.name()]; ok { panic(fmt.Sprintf("Multiple matches registered with name %q.", mm.name())) } matchMakers[mm.name()] = mm } func marshalMatcher(mr stack.Matcher) []byte { matcher := mr.(matcher) matchMaker, ok := matchMakers[matcher.name()] if !ok { panic(fmt.Sprintf("Unknown matcher of type %T.", matcher)) } return matchMaker.marshal(matcher) } // marshalEntryMatch creates a marshalled XTEntryMatch with the given name and // data appended at the end. func marshalEntryMatch(name string, data []byte) []byte { nflog("marshaling matcher %q", name) // We have to pad this struct size to a multiple of 8 bytes.
size := bits.AlignUp(linux.SizeOfXTEntryMatch+len(data), 8) matcher := linux.KernelXTEntryMatch{ XTEntryMatch: linux.XTEntryMatch{ MatchSize: uint16(size), }, Data: data, } copy(matcher.Name[:], name) buf := make([]byte, size) entryLen := matcher.XTEntryMatch.SizeBytes() matcher.XTEntryMatch.MarshalUnsafe(buf[:entryLen]) copy(buf[entryLen:], matcher.Data) return buf } func unmarshalMatcher(task *kernel.Task, match linux.XTEntryMatch, filter stack.IPHeaderFilter, buf []byte) (stack.Matcher, error) { matchMaker, ok := matchMakers[match.Name.String()] if !ok { return nil, fmt.Errorf("unsupported matcher with name %q", match.Name.String()) } return matchMaker.unmarshal(task, buf, filter) } // targetMaker knows how to (un)marshal a target. Once registered, // marshalTarget and unmarshalTarget can be used. type targetMaker interface { // id uniquely identifies the target. id() targetID // marshal converts from a target to an ABI struct. marshal(target target) []byte // unmarshal converts from the ABI matcher struct to a target. unmarshal(buf []byte, filter stack.IPHeaderFilter) (target, *syserr.Error) } // A targetID uniquely identifies a target. type targetID struct { // name is the target name as stored in the xt_entry_target struct. name string // networkProtocol is the protocol to which the target applies. networkProtocol tcpip.NetworkProtocolNumber // revision is the version of the target. revision uint8 } // target extends a stack.Target, allowing it to be used with the extension // system. The sentry only uses targets, never stack.Targets directly. type target interface { stack.Target id() targetID } // targetMakers maps the targetID of supported targets to the targetMaker that // marshals and unmarshals it. It is immutable after package initialization. var targetMakers = map[targetID]targetMaker{} func targetRevision(name string, netProto tcpip.NetworkProtocolNumber, rev uint8) (uint8, bool) { tid := targetID{ name: name, networkProtocol: netProto, revision: rev, } if _, ok := targetMakers[tid]; !ok { return 0, false } // Return the highest supported revision unless rev is higher. for _, other := range targetMakers { otherID := other.id() if name == otherID.name && netProto == otherID.networkProtocol && otherID.revision > rev { rev = uint8(otherID.revision) } } return rev, true } // registerTargetMaker should be called by target extensions to register them // with the netfilter package. func registerTargetMaker(tm targetMaker) { if _, ok := targetMakers[tm.id()]; ok { panic(fmt.Sprintf("multiple targets registered with name %q.", tm.id())) } targetMakers[tm.id()] = tm } func marshalTarget(tgt stack.Target) []byte { // The sentry only uses targets, never stack.Targets directly. target := tgt.(target) targetMaker, ok := targetMakers[target.id()] if !ok { panic(fmt.Sprintf("unknown target of type %T with id %+v.", target, target.id())) } return targetMaker.marshal(target) } func unmarshalTarget(target linux.XTEntryTarget, filter stack.IPHeaderFilter, buf []byte) (target, *syserr.Error) { tid := targetID{ name: target.Name.String(), networkProtocol: filter.NetworkProtocol(), revision: target.Revision, } targetMaker, ok := targetMakers[tid] if !ok { nflog("unsupported target with name %q", target.Name.String()) return nil, syserr.ErrInvalidArgument } return targetMaker.unmarshal(buf, filter) }
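Both matchers and targets use the same register-at-init pattern keyed by name or targetID, panicking on duplicates so conflicting extensions fail at startup rather than at rule-parsing time. A stripped-down standalone sketch of the pattern (maker, register, and udpMaker are illustrative names, not the sentry's):

package main

import "fmt"

// maker is a stand-in for matchMaker/targetMaker.
type maker interface{ name() string }

// makers plays the role of matchMakers/targetMakers: filled at init time,
// immutable afterwards.
var makers = map[string]maker{}

// register panics on duplicates, just like registerMatchMaker and
// registerTargetMaker above.
func register(m maker) {
	if _, ok := makers[m.name()]; ok {
		panic(fmt.Sprintf("multiple makers registered with name %q", m.name()))
	}
	makers[m.name()] = m
}

type udpMaker struct{}

func (udpMaker) name() string { return "udp" }

func main() {
	register(udpMaker{})
	_, ok := makers["udp"]
	fmt.Println(ok) // true
}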
// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "time" "gvisor.dev/gvisor/pkg/tcpip" ) const ( // MinBufferSize is the smallest size of a receive or send buffer. MinBufferSize = 4 << 10 // 4 KiB // DefaultBufferSize is the default size of the send/recv buffer for a // transport endpoint. DefaultBufferSize = 212 << 10 // 212 KiB // DefaultMaxBufferSize is the default maximum permitted size of a // send/receive buffer. DefaultMaxBufferSize = 4 << 20 // 4 MiB // defaultTCPInvalidRateLimit is the default value for // stack.TCPInvalidRateLimit. defaultTCPInvalidRateLimit = 500 * time.Millisecond ) // ReceiveBufferSizeOption is used by stack.(Stack*).Option/SetOption to // get/set the default, min and max receive buffer sizes. type ReceiveBufferSizeOption struct { Min int Default int Max int } // TCPInvalidRateLimitOption is used by stack.(Stack*).Option/SetOption to get/set // stack.tcpInvalidRateLimit. type TCPInvalidRateLimitOption time.Duration // SetOption allows setting stack wide options. func (s *Stack) SetOption(option interface{}) tcpip.Error { switch v := option.(type) { case tcpip.SendBufferSizeOption: // Make sure we don't allow lowering the buffer below minimum // required for stack to work. if v.Min < MinBufferSize { return &tcpip.ErrInvalidOptionValue{} } if v.Default < v.Min || v.Default > v.Max { return &tcpip.ErrInvalidOptionValue{} } s.mu.Lock() s.sendBufferSize = v s.mu.Unlock() return nil case tcpip.ReceiveBufferSizeOption: // Make sure we don't allow lowering the buffer below minimum // required for stack to work. if v.Min < MinBufferSize { return &tcpip.ErrInvalidOptionValue{} } if v.Default < v.Min || v.Default > v.Max { return &tcpip.ErrInvalidOptionValue{} } s.mu.Lock() s.receiveBufferSize = v s.mu.Unlock() return nil case TCPInvalidRateLimitOption: if v < 0 { return &tcpip.ErrInvalidOptionValue{} } s.mu.Lock() s.tcpInvalidRateLimit = time.Duration(v) s.mu.Unlock() return nil default: return &tcpip.ErrUnknownProtocolOption{} } } // Option allows retrieving stack wide options. func (s *Stack) Option(option interface{}) tcpip.Error { switch v := option.(type) { case *tcpip.SendBufferSizeOption: s.mu.RLock() *v = s.sendBufferSize s.mu.RUnlock() return nil case *tcpip.ReceiveBufferSizeOption: s.mu.RLock() *v = s.receiveBufferSize s.mu.RUnlock() return nil case *TCPInvalidRateLimitOption: s.mu.RLock() *v = TCPInvalidRateLimitOption(s.tcpInvalidRateLimit) s.mu.RUnlock() return nil default: return &tcpip.ErrUnknownProtocolOption{} } }
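SetOption accepts a buffer-size triple only when the minimum stays at or above MinBufferSize and the default lies within [Min, Max]. A standalone sketch of that validation (checkBufferSizeOption is a hypothetical helper, not a stack API):

package main

import (
	"errors"
	"fmt"
)

// minBufferSize matches the MinBufferSize constant above (4 KiB).
const minBufferSize = 4 << 10

var errInvalidOptionValue = errors.New("invalid option value")

// checkBufferSizeOption mirrors the validation in Stack.SetOption: the
// minimum must not drop below the stack's floor, and the default must lie
// within [min, max].
func checkBufferSizeOption(min, def, max int) error {
	if min < minBufferSize {
		return errInvalidOptionValue
	}
	if def < min || def > max {
		return errInvalidOptionValue
	}
	return nil
}

func main() {
	fmt.Println(checkBufferSizeOption(4<<10, 212<<10, 4<<20)) // <nil>
	fmt.Println(checkBufferSizeOption(1<<10, 212<<10, 4<<20)) // invalid option value
}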
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cgroupfs

import (
	"bytes"
	"fmt"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/usermem"
)

// +stateify savable
type jobController struct {
	controllerCommon
	id int64
}

var _ controller = (*jobController)(nil)

func newJobController(fs *filesystem) *jobController {
	c := &jobController{}
	c.controllerCommon.init(controllerJob, fs)
	return c
}

func (c *jobController) AddControlFiles(ctx context.Context, creds *auth.Credentials, _ *cgroupInode, contents map[string]kernfs.Inode) {
	contents["job.id"] = c.fs.newControllerWritableFile(ctx, creds, &jobIDData{c: c})
}

// +stateify savable
type jobIDData struct {
	c *jobController
}

// Generate implements vfs.DynamicBytesSource.Generate.
func (d *jobIDData) Generate(ctx context.Context, buf *bytes.Buffer) error {
	fmt.Fprintf(buf, "%d\n", d.c.id)
	return nil
}

// Write implements vfs.WritableDynamicBytesSource.Write.
func (d *jobIDData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
	val, n, err := parseInt64FromString(ctx, src, offset)
	if err != nil {
		return n, err
	}
	d.c.id = val
	return n, nil
}
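// Illustrative sketch (hypothetical mount point): from inside the sandbox,
// the job.id control file round-trips an int64 through the Write and Generate
// methods above. Assuming the job controller is mounted at
// /sys/fs/cgroup/job, an application could do:
//
//	_ = os.WriteFile("/sys/fs/cgroup/job/job.id", []byte("42"), 0644)
//	data, _ := os.ReadFile("/sys/fs/cgroup/job/job.id") // data == "42\n"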
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package ports provides PortManager that manages allocating, reserving and
// releasing ports.
package ports

import (
	"math"
	"math/rand"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
)

const (
	firstEphemeral = 16000

	anyIPAddress tcpip.Address = ""
)

// Reservation describes a port reservation.
type Reservation struct {
	// Networks is a list of network protocols to which the reservation
	// applies. Can be IPv4, IPv6, or both.
	Networks []tcpip.NetworkProtocolNumber

	// Transport is the transport protocol to which the reservation applies.
	Transport tcpip.TransportProtocolNumber

	// Addr is the address of the local endpoint.
	Addr tcpip.Address

	// Port is the local port number.
Port uint16 // Flags describe features of the reservation. Flags Flags // BindToDevice is the NIC to which the reservation applies. BindToDevice tcpip.NICID // Dest is the destination address. Dest tcpip.FullAddress } func (rs Reservation) dst() destination { return destination{ rs.Dest.Addr, rs.Dest.Port, } } type portDescriptor struct { network tcpip.NetworkProtocolNumber transport tcpip.TransportProtocolNumber port uint16 } type destination struct { addr tcpip.Address port uint16 } // destToCounter maps each destination to the FlagCounter that represents // endpoints to that destination. // // destToCounter is never empty. When it has no elements, it is removed from // the map that references it. type destToCounter map[destination]FlagCounter // intersectionFlags calculates the intersection of flag bit values which affect // the specified destination. // // If no destinations are present, all flag values are returned as there are no // entries to limit possible flag values of a new entry. // // In addition to the intersection, the number of intersecting refs is // returned. func (dc destToCounter) intersectionFlags(res Reservation) (BitFlags, int) { intersection := FlagMask var count int for dest, counter := range dc { if dest == res.dst() { intersection &= counter.SharedFlags() count++ continue } // Wildcard destinations affect all destinations for TupleOnly. if dest.addr == anyIPAddress || res.Dest.Addr == anyIPAddress { // Only bitwise and the TupleOnlyFlag. intersection &= (^TupleOnlyFlag) | counter.SharedFlags() count++ } } return intersection, count } // deviceToDest maps NICs to destinations for which there are port reservations. // // deviceToDest is never empty. When it has no elements, it is removed from the // map that references it. type deviceToDest map[tcpip.NICID]destToCounter // isAvailable checks whether binding is possible by device. If not binding to // a device, check against all FlagCounters. If binding to a specific device, // check against the unspecified device and the provided device. // // If either of the port reuse flags is enabled on any of the nodes, all nodes // sharing a port must share at least one reuse flag. This matches Linux's // behavior. func (dd deviceToDest) isAvailable(res Reservation, portSpecified bool) bool { flagBits := res.Flags.Bits() if res.BindToDevice == 0 { intersection := FlagMask for _, dest := range dd { flags, count := dest.intersectionFlags(res) if count == 0 { continue } intersection &= flags if intersection&flagBits == 0 { // Can't bind because the (addr,port) was // previously bound without reuse. return false } } if !portSpecified && res.Transport == header.TCPProtocolNumber { return false } return true } intersection := FlagMask if dests, ok := dd[0]; ok { var count int intersection, count = dests.intersectionFlags(res) if count > 0 { if intersection&flagBits == 0 { return false } if !portSpecified && res.Transport == header.TCPProtocolNumber { return false } } } if dests, ok := dd[res.BindToDevice]; ok { flags, count := dests.intersectionFlags(res) intersection &= flags if count > 0 { if intersection&flagBits == 0 { return false } if !portSpecified && res.Transport == header.TCPProtocolNumber { return false } } } return true } // addrToDevice maps IP addresses to NICs that have port reservations. type addrToDevice map[tcpip.Address]deviceToDest // isAvailable checks whether an IP address is available to bind to. If the // address is the "any" address, check all other addresses. 
// Otherwise, just check against the "any" address and the provided address.
func (ad addrToDevice) isAvailable(res Reservation, portSpecified bool) bool {
	if res.Addr == anyIPAddress {
		// If binding to the "any" address then check that there are no
		// conflicts with all addresses.
		for _, devices := range ad {
			if !devices.isAvailable(res, portSpecified) {
				return false
			}
		}
		return true
	}

	// Check that there is no conflict with the "any" address.
	if devices, ok := ad[anyIPAddress]; ok {
		if !devices.isAvailable(res, portSpecified) {
			return false
		}
	}

	// Check that there is no conflict with the provided address.
	if devices, ok := ad[res.Addr]; ok {
		if !devices.isAvailable(res, portSpecified) {
			return false
		}
	}

	return true
}

// PortManager manages allocating, reserving and releasing ports.
type PortManager struct {
	// mu protects allocatedPorts.
	// LOCK ORDERING: mu > ephemeralMu.
	mu sync.RWMutex
	// allocatedPorts is a nesting of maps that ultimately map Reservations
	// to FlagCounters describing whether the Reservation is valid and can
	// be reused.
	allocatedPorts map[portDescriptor]addrToDevice

	// ephemeralMu protects firstEphemeral and numEphemeral.
	ephemeralMu    sync.RWMutex
	firstEphemeral uint16
	numEphemeral   uint16

	// hint is used to pick ephemeral ports in a stable order for a given
	// port offset.
	//
	// hint must be accessed using the portHint/incPortHint helpers.
	// TODO(gvisor.dev/issue/940): S/R this field.
	hint uint32
}

// NewPortManager creates a new PortManager.
func NewPortManager() *PortManager {
	return &PortManager{
		allocatedPorts: make(map[portDescriptor]addrToDevice),
		firstEphemeral: firstEphemeral,
		numEphemeral:   math.MaxUint16 - firstEphemeral + 1,
	}
}

// PortTester indicates whether the passed in port is suitable. Returning an
// error causes the function to which the PortTester is passed to return that
// error.
type PortTester func(port uint16) (good bool, err tcpip.Error)

// PickEphemeralPort randomly chooses a starting point and iterates over all
// possible ephemeral ports, allowing the caller to decide whether a given port
// is suitable for its needs, and stopping when a port is found or an error
// occurs.
func (pm *PortManager) PickEphemeralPort(rng *rand.Rand, testPort PortTester) (port uint16, err tcpip.Error) {
	pm.ephemeralMu.RLock()
	firstEphemeral := pm.firstEphemeral
	numEphemeral := pm.numEphemeral
	pm.ephemeralMu.RUnlock()

	offset := uint32(rng.Int31n(int32(numEphemeral)))
	return pickEphemeralPort(offset, firstEphemeral, numEphemeral, testPort)
}

// portHint atomically reads and returns the pm.hint value.
func (pm *PortManager) portHint() uint32 {
	return atomic.LoadUint32(&pm.hint)
}

// incPortHint atomically increments pm.hint by 1.
func (pm *PortManager) incPortHint() {
	atomic.AddUint32(&pm.hint, 1)
}

// PickEphemeralPortStable starts at the specified offset + pm.portHint and
// iterates over all ephemeral ports, allowing the caller to decide whether a
// given port is suitable for its needs and stopping when a port is found or an
// error occurs.
func (pm *PortManager) PickEphemeralPortStable(offset uint32, testPort PortTester) (port uint16, err tcpip.Error) {
	pm.ephemeralMu.RLock()
	firstEphemeral := pm.firstEphemeral
	numEphemeral := pm.numEphemeral
	pm.ephemeralMu.RUnlock()

	p, err := pickEphemeralPort(pm.portHint()+offset, firstEphemeral, numEphemeral, testPort)
	if err == nil {
		pm.incPortHint()
	}
	return p, err
}

// pickEphemeralPort starts at the given offset from the first ephemeral port,
// iterates over the number of ports specified by count, and allows the caller
// to decide whether a given port is suitable for its needs, stopping when a
// port is found or an error occurs.
func pickEphemeralPort(offset uint32, first, count uint16, testPort PortTester) (port uint16, err tcpip.Error) {
	for i := uint32(0); i < uint32(count); i++ {
		port := uint16(uint32(first) + (offset+i)%uint32(count))
		ok, err := testPort(port)
		if err != nil {
			return 0, err
		}
		if ok {
			return port, nil
		}
	}
	return 0, &tcpip.ErrNoPortAvailable{}
}

// ReservePort marks a port/IP combination as reserved so that it cannot be
// reserved by another endpoint. If port is zero, ReservePort will search for
// an unreserved ephemeral port and reserve it, returning its value in the
// "port" return value.
//
// An optional PortTester can be passed in; if provided, it will be used to
// test whether the picked port can be used. The function should return true
// if the port is safe to use, false otherwise.
func (pm *PortManager) ReservePort(rng *rand.Rand, res Reservation, testPort PortTester) (reservedPort uint16, err tcpip.Error) {
	pm.mu.Lock()
	defer pm.mu.Unlock()

	// If a port is specified, just try to reserve it for all network
	// protocols.
	if res.Port != 0 {
		if !pm.reserveSpecificPortLocked(res, true /* portSpecified */) {
			return 0, &tcpip.ErrPortInUse{}
		}
		if testPort != nil {
			ok, err := testPort(res.Port)
			if err != nil {
				pm.releasePortLocked(res)
				return 0, err
			}
			if !ok {
				pm.releasePortLocked(res)
				return 0, &tcpip.ErrPortInUse{}
			}
		}
		return res.Port, nil
	}

	// A port wasn't specified, so try to find one.
	return pm.PickEphemeralPort(rng, func(p uint16) (bool, tcpip.Error) {
		res.Port = p
		if !pm.reserveSpecificPortLocked(res, false /* portSpecified */) {
			return false, nil
		}
		if testPort != nil {
			ok, err := testPort(p)
			if err != nil {
				pm.releasePortLocked(res)
				return false, err
			}
			if !ok {
				pm.releasePortLocked(res)
				return false, nil
			}
		}
		return true, nil
	})
}

// reserveSpecificPortLocked tries to reserve the given port on all given
// protocols.
func (pm *PortManager) reserveSpecificPortLocked(res Reservation, portSpecified bool) bool {
	// Make sure the port is available.
	for _, network := range res.Networks {
		desc := portDescriptor{network, res.Transport, res.Port}
		if addrs, ok := pm.allocatedPorts[desc]; ok {
			if !addrs.isAvailable(res, portSpecified) {
				return false
			}
		}
	}

	// Reserve port on all network protocols.
	flagBits := res.Flags.Bits()
	dst := res.dst()
	for _, network := range res.Networks {
		desc := portDescriptor{network, res.Transport, res.Port}
		addrToDev, ok := pm.allocatedPorts[desc]
		if !ok {
			addrToDev = make(addrToDevice)
			pm.allocatedPorts[desc] = addrToDev
		}
		devToDest, ok := addrToDev[res.Addr]
		if !ok {
			devToDest = make(deviceToDest)
			addrToDev[res.Addr] = devToDest
		}
		destToCntr := devToDest[res.BindToDevice]
		if destToCntr == nil {
			destToCntr = make(destToCounter)
		}
		counter := destToCntr[dst]
		counter.AddRef(flagBits)
		destToCntr[dst] = counter
		devToDest[res.BindToDevice] = destToCntr
	}

	return true
}

// ReserveTuple adds a port reservation for the tuple on all given protocols.
func (pm *PortManager) ReserveTuple(res Reservation) bool {
	flagBits := res.Flags.Bits()
	dst := res.dst()

	pm.mu.Lock()
	defer pm.mu.Unlock()

	// It is easier to undo the entire reservation, so if we find that the
	// tuple can't be fully added, finish and undo the whole thing.
	undo := false

	// Reserve port on all network protocols.
	for _, network := range res.Networks {
		desc := portDescriptor{network, res.Transport, res.Port}
		addrToDev, ok := pm.allocatedPorts[desc]
		if !ok {
			addrToDev = make(addrToDevice)
			pm.allocatedPorts[desc] = addrToDev
		}
		devToDest, ok := addrToDev[res.Addr]
		if !ok {
			devToDest = make(deviceToDest)
			addrToDev[res.Addr] = devToDest
		}
		destToCntr := devToDest[res.BindToDevice]
		if destToCntr == nil {
			destToCntr = make(destToCounter)
		}

		counter := destToCntr[dst]
		if counter.TotalRefs() != 0 && counter.SharedFlags()&flagBits == 0 {
			// Tuple already exists.
			undo = true
		}
		counter.AddRef(flagBits)
		destToCntr[dst] = counter
		devToDest[res.BindToDevice] = destToCntr
	}

	if undo {
		// releasePortLocked decrements the counts (rather than setting
		// them to zero), so it will undo the incorrect incrementing
		// above.
		pm.releasePortLocked(res)
		return false
	}

	return true
}

// ReleasePort releases the reservation on a port/IP combination so that it
// can be reserved by other endpoints.
func (pm *PortManager) ReleasePort(res Reservation) {
	pm.mu.Lock()
	defer pm.mu.Unlock()

	pm.releasePortLocked(res)
}

func (pm *PortManager) releasePortLocked(res Reservation) {
	dst := res.dst()
	for _, network := range res.Networks {
		desc := portDescriptor{network, res.Transport, res.Port}
		addrToDev, ok := pm.allocatedPorts[desc]
		if !ok {
			continue
		}
		devToDest, ok := addrToDev[res.Addr]
		if !ok {
			continue
		}
		destToCounter, ok := devToDest[res.BindToDevice]
		if !ok {
			continue
		}
		counter, ok := destToCounter[dst]
		if !ok {
			continue
		}
		counter.DropRef(res.Flags.Bits())
		if counter.TotalRefs() > 0 {
			destToCounter[dst] = counter
			continue
		}
		delete(destToCounter, dst)
		if len(destToCounter) > 0 {
			continue
		}
		delete(devToDest, res.BindToDevice)
		if len(devToDest) > 0 {
			continue
		}
		delete(addrToDev, res.Addr)
		if len(addrToDev) > 0 {
			continue
		}
		delete(pm.allocatedPorts, desc)
	}
}

// PortRange returns the UDP and TCP inclusive range of ephemeral ports used
// in both IPv4 and IPv6.
func (pm *PortManager) PortRange() (uint16, uint16) {
	pm.ephemeralMu.RLock()
	defer pm.ephemeralMu.RUnlock()
	return pm.firstEphemeral, pm.firstEphemeral + pm.numEphemeral - 1
}

// SetPortRange sets the UDP and TCP IPv4 and IPv6 ephemeral port range
// (inclusive).
func (pm *PortManager) SetPortRange(start uint16, end uint16) tcpip.Error {
	if start > end {
		return &tcpip.ErrInvalidPortRange{}
	}
	pm.ephemeralMu.Lock()
	defer pm.ephemeralMu.Unlock()
	pm.firstEphemeral = start
	pm.numEphemeral = end - start + 1
	return nil
}
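// Illustrative usage sketch (hypothetical values; rng is an assumed
// *rand.Rand): reserving a specific TCP port on IPv4 and releasing it again:
//
//	pm := NewPortManager()
//	res := Reservation{
//		Networks:  []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber},
//		Transport: header.TCPProtocolNumber,
//		Addr:      anyIPAddress,
//		Port:      8080,
//	}
//	port, err := pm.ReservePort(rng, res, nil /* testPort */)
//	// On success, port == 8080; a second identical reservation without
//	// compatible reuse flags would fail with *tcpip.ErrPortInUse.
//	pm.ReleasePort(res)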
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"fmt"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/refsvfs2"
	"gvisor.dev/gvisor/pkg/waiter"
)

// FilesystemImplSaveRestoreExtension is an optional extension to
// FilesystemImpl.
type FilesystemImplSaveRestoreExtension interface {
	// PrepareSave prepares this filesystem for serialization.
	PrepareSave(ctx context.Context) error

	// CompleteRestore completes restoration from checkpoint for this
	// filesystem after deserialization.
	CompleteRestore(ctx context.Context, opts CompleteRestoreOptions) error
}

// PrepareSave prepares all filesystems for serialization.
func (vfs *VirtualFilesystem) PrepareSave(ctx context.Context) error {
	failures := 0
	for fs := range vfs.getFilesystems() {
		if ext, ok := fs.impl.(FilesystemImplSaveRestoreExtension); ok {
			if err := ext.PrepareSave(ctx); err != nil {
				ctx.Warningf("%T.PrepareSave failed: %v", fs.impl, err)
				failures++
			}
		}
		fs.DecRef(ctx)
	}
	if failures != 0 {
		return fmt.Errorf("%d filesystems failed to prepare for serialization", failures)
	}
	return nil
}

// CompleteRestore completes restoration from checkpoint for all filesystems
// after deserialization.
func (vfs *VirtualFilesystem) CompleteRestore(ctx context.Context, opts *CompleteRestoreOptions) error {
	failures := 0
	for fs := range vfs.getFilesystems() {
		if ext, ok := fs.impl.(FilesystemImplSaveRestoreExtension); ok {
			if err := ext.CompleteRestore(ctx, *opts); err != nil {
				ctx.Warningf("%T.CompleteRestore failed: %v", fs.impl, err)
				failures++
			}
		}
		fs.DecRef(ctx)
	}
	if failures != 0 {
		return fmt.Errorf("%d filesystems failed to complete restore after deserialization", failures)
	}
	return nil
}

// CompleteRestoreOptions contains options to
// VirtualFilesystem.CompleteRestore() and
// FilesystemImplSaveRestoreExtension.CompleteRestore().
type CompleteRestoreOptions struct {
	// If ValidateFileSizes is true, filesystem implementations backed by
	// remote filesystems should verify that file sizes have not changed
	// between checkpoint and restore.
	ValidateFileSizes bool

	// If ValidateFileModificationTimestamps is true, filesystem
	// implementations backed by remote filesystems should validate that file
	// mtimes have not changed between checkpoint and restore.
	ValidateFileModificationTimestamps bool
}

// saveMounts is called by stateify.
func (vfs *VirtualFilesystem) saveMounts() []*Mount {
	if atomic.LoadPointer(&vfs.mounts.slots) == nil {
		// vfs.Init() was never called.
return nil } var mounts []*Mount vfs.mounts.Range(func(mount *Mount) bool { mounts = append(mounts, mount) return true }) return mounts } // saveKey is called by stateify. func (mnt *Mount) saveKey() VirtualDentry { return mnt.getKey() } // loadMounts is called by stateify. func (vfs *VirtualFilesystem) loadMounts(mounts []*Mount) { if mounts == nil { return } vfs.mounts.Init() for _, mount := range mounts { vfs.mounts.Insert(mount) } } // loadKey is called by stateify. func (mnt *Mount) loadKey(vd VirtualDentry) { mnt.setKey(vd) } func (mnt *Mount) afterLoad() { if atomic.LoadInt64(&mnt.refs) != 0 { refsvfs2.Register(mnt) } } // afterLoad is called by stateify. func (epi *epollInterest) afterLoad() { // Mark all epollInterests as ready after restore so that the next call to // EpollInstance.ReadEvents() rechecks their readiness. epi.Callback(nil, waiter.EventMaskFromLinux(epi.mask)) } // beforeSave is called by stateify. func (fd *FileDescription) beforeSave() { fd.saved = true if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil { fd.asyncHandler.Unregister(fd) } } // afterLoad is called by stateify. func (fd *FileDescription) afterLoad() { if fd.statusFlags&linux.O_ASYNC != 0 && fd.asyncHandler != nil { fd.asyncHandler.Register(fd) } }
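// Illustrative sketch (hypothetical type exampleFS): a filesystem
// implementation opts into the checkpoint/restore hooks by implementing the
// extension interface above; the bodies here are placeholders:
//
//	func (fs *exampleFS) PrepareSave(ctx context.Context) error {
//		// Flush or pin any state that cannot be serialized.
//		return nil
//	}
//
//	func (fs *exampleFS) CompleteRestore(ctx context.Context, opts CompleteRestoreOptions) error {
//		// Revalidate remote state, honoring opts.ValidateFileSizes and
//		// opts.ValidateFileModificationTimestamps.
//		return nil
//	}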
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package safecopy

import (
	"fmt"
	"runtime"
	"unsafe"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/abi/linux"
)

// maxRegisterSize is the maximum register size used in memcpy and memclr. It
// is used to decide by how much to rewind the copy (for memcpy) or zeroing
// (for memclr) before proceeding.
const maxRegisterSize = 16

// memcpy copies data from src to dst. If a SIGSEGV or SIGBUS signal is
// received during the copy, it returns the address that caused the fault and
// the number of the signal that was received. Otherwise, it returns an
// unspecified address and a signal number of 0.
//
// Data is copied in order, such that if a fault happens at address p, it is
// safe to assume that all data before p-maxRegisterSize has already been
// successfully copied.
//
//go:noescape
func memcpy(dst, src uintptr, n uintptr) (fault uintptr, sig int32)

// memclr sets the n bytes following ptr to zeroes. If a SIGSEGV or SIGBUS
// signal is received during the write, it returns the address that caused the
// fault and the number of the signal that was received. Otherwise, it returns
// an unspecified address and a signal number of 0.
//
// Data is written in order, such that if a fault happens at address p, it is
// safe to assume that all data before p-maxRegisterSize has already been
// successfully written.
//
//go:noescape
func memclr(ptr uintptr, n uintptr) (fault uintptr, sig int32)

// swapUint32 atomically stores new into *ptr and returns (the previous *ptr
// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
// value of old is unspecified, and sig is the number of the signal that was
// received.
//
// Preconditions: ptr must be aligned to a 4-byte boundary.
//
//go:noescape
func swapUint32(ptr unsafe.Pointer, new uint32) (old uint32, sig int32)

// swapUint64 atomically stores new into *ptr and returns (the previous *ptr
// value, 0). If a SIGSEGV or SIGBUS signal is received during the swap, the
// value of old is unspecified, and sig is the number of the signal that was
// received.
//
// Preconditions: ptr must be aligned to an 8-byte boundary.
//
//go:noescape
func swapUint64(ptr unsafe.Pointer, new uint64) (old uint64, sig int32)

// compareAndSwapUint32 is like sync/atomic.CompareAndSwapUint32, but returns
// (the value previously stored at ptr, 0). If a SIGSEGV or SIGBUS signal is
// received during the operation, the value of prev is unspecified, and sig is
// the number of the signal that was received.
//
// Preconditions: ptr must be aligned to a 4-byte boundary.
//
//go:noescape
func compareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (prev uint32, sig int32)

// loadUint32 is like sync/atomic.LoadUint32, but operates with user memory.
// It may fail with SIGSEGV or SIGBUS if it is received while reading from ptr.
//
// Preconditions: ptr must be aligned to a 4-byte boundary.
//
//go:noescape
func loadUint32(ptr unsafe.Pointer) (val uint32, sig int32)

// Return the start address of the functions above.
//
// In Go 1.17+, Go references to assembly functions resolve to an ABIInternal
// wrapper function rather than the function itself. We must reference from
// assembly to get the ABI0 (i.e., primary) address.
func addrOfMemcpy() uintptr
func addrOfMemclr() uintptr
func addrOfSwapUint32() uintptr
func addrOfSwapUint64() uintptr
func addrOfCompareAndSwapUint32() uintptr
func addrOfLoadUint32() uintptr

// CopyIn copies len(dst) bytes from src to dst. It returns the number of bytes
// copied and an error if SIGSEGV or SIGBUS is received while reading from src.
func CopyIn(dst []byte, src unsafe.Pointer) (int, error) {
	n, err := copyIn(dst, uintptr(src))
	runtime.KeepAlive(src)
	return n, err
}

// copyIn is the underlying definition for CopyIn.
func copyIn(dst []byte, src uintptr) (int, error) {
	toCopy := uintptr(len(dst))
	if len(dst) == 0 {
		return 0, nil
	}

	fault, sig := memcpy(uintptr(unsafe.Pointer(&dst[0])), src, toCopy)
	if sig == 0 {
		return len(dst), nil
	}

	if fault < src || fault >= src+toCopy {
		panic(fmt.Sprintf("CopyIn raised signal %d at %#x, which is outside source [%#x, %#x)", sig, fault, src, src+toCopy))
	}

	// memcpy might have ended the copy up to maxRegisterSize bytes before
	// fault, if an instruction caused a memory access that straddled two
	// pages, and the second one faulted. Try to copy up to the fault.
	var done int
	if fault-src > maxRegisterSize {
		done = int(fault - src - maxRegisterSize)
	}
	n, err := copyIn(dst[done:int(fault-src)], src+uintptr(done))
	done += n
	if err != nil {
		return done, err
	}
	return done, errorFromFaultSignal(fault, sig)
}

// CopyOut copies len(src) bytes from src to dst. It returns the number of
// bytes copied and an error if SIGSEGV or SIGBUS is received while writing to
// dst.
func CopyOut(dst unsafe.Pointer, src []byte) (int, error) {
	n, err := copyOut(uintptr(dst), src)
	runtime.KeepAlive(dst)
	return n, err
}

// copyOut is the underlying definition for CopyOut.
func copyOut(dst uintptr, src []byte) (int, error) { toCopy := uintptr(len(src)) if toCopy == 0 { return 0, nil } fault, sig := memcpy(dst, uintptr(unsafe.Pointer(&src[0])), toCopy) if sig == 0 { return len(src), nil } if fault < dst || fault >= dst+toCopy { panic(fmt.Sprintf("CopyOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, fault, dst, dst+toCopy)) } // memcpy might have ended the copy up to maxRegisterSize bytes before // fault, if an instruction caused a memory access that straddled two // pages, and the second one faulted. Try to copy up to the fault. var done int if fault-dst > maxRegisterSize { done = int(fault - dst - maxRegisterSize) } n, err := copyOut(dst+uintptr(done), src[done:int(fault-dst)]) done += n if err != nil { return done, err } return done, errorFromFaultSignal(fault, sig) } // Copy copies toCopy bytes from src to dst. It returns the number of bytes // copied and an error if SIGSEGV or SIGBUS is received while reading from src // or writing to dst. // // Data is copied in order; if [src, src+toCopy) and [dst, dst+toCopy) overlap, // the resulting contents of dst are unspecified. func Copy(dst, src unsafe.Pointer, toCopy uintptr) (uintptr, error) { n, err := copyN(uintptr(dst), uintptr(src), toCopy) runtime.KeepAlive(dst) runtime.KeepAlive(src) return n, err } // copyN is the underlying definition for Copy. func copyN(dst, src uintptr, toCopy uintptr) (uintptr, error) { if toCopy == 0 { return 0, nil } fault, sig := memcpy(dst, src, toCopy) if sig == 0 { return toCopy, nil } // Did the fault occur while reading from src or writing to dst? faultAfterSrc := ^uintptr(0) if fault >= src { faultAfterSrc = fault - src } faultAfterDst := ^uintptr(0) if fault >= dst { faultAfterDst = fault - dst } if faultAfterSrc >= toCopy && faultAfterDst >= toCopy { panic(fmt.Sprintf("Copy raised signal %d at %#x, which is outside source [%#x, %#x) and destination [%#x, %#x)", sig, fault, src, src+toCopy, dst, dst+toCopy)) } faultedAfter := faultAfterSrc if faultedAfter > faultAfterDst { faultedAfter = faultAfterDst } // memcpy might have ended the copy up to maxRegisterSize bytes before // fault, if an instruction caused a memory access that straddled two // pages, and the second one faulted. Try to copy up to the fault. var done uintptr if faultedAfter > maxRegisterSize { done = faultedAfter - maxRegisterSize } n, err := copyN(dst+done, src+done, faultedAfter-done) done += n if err != nil { return done, err } return done, errorFromFaultSignal(fault, sig) } // ZeroOut writes toZero zero bytes to dst. It returns the number of bytes // written and an error if SIGSEGV or SIGBUS is received while writing to dst. func ZeroOut(dst unsafe.Pointer, toZero uintptr) (uintptr, error) { n, err := zeroOut(uintptr(dst), toZero) runtime.KeepAlive(dst) return n, err } // zeroOut is the underlying definition for ZeroOut. func zeroOut(dst uintptr, toZero uintptr) (uintptr, error) { if toZero == 0 { return 0, nil } fault, sig := memclr(dst, toZero) if sig == 0 { return toZero, nil } if fault < dst || fault >= dst+toZero { panic(fmt.Sprintf("ZeroOut raised signal %d at %#x, which is outside destination [%#x, %#x)", sig, fault, dst, dst+toZero)) } // memclr might have ended the write up to maxRegisterSize bytes before // fault, if an instruction caused a memory access that straddled two // pages, and the second one faulted. Try to write up to the fault. 
	var done uintptr
	if fault-dst > maxRegisterSize {
		done = fault - dst - maxRegisterSize
	}
	n, err := zeroOut(dst+done, fault-dst-done)
	done += n
	if err != nil {
		return done, err
	}
	return done, errorFromFaultSignal(fault, sig)
}

// SwapUint32 is equivalent to sync/atomic.SwapUint32, except that it returns
// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is
// not aligned to a 4-byte boundary.
func SwapUint32(ptr unsafe.Pointer, new uint32) (uint32, error) {
	if addr := uintptr(ptr); addr&3 != 0 {
		return 0, AlignmentError{addr, 4}
	}
	old, sig := swapUint32(ptr, new)
	return old, errorFromFaultSignal(uintptr(ptr), sig)
}

// SwapUint64 is equivalent to sync/atomic.SwapUint64, except that it returns
// an error if SIGSEGV or SIGBUS is received while accessing ptr, or if ptr is
// not aligned to an 8-byte boundary.
func SwapUint64(ptr unsafe.Pointer, new uint64) (uint64, error) {
	if addr := uintptr(ptr); addr&7 != 0 {
		return 0, AlignmentError{addr, 8}
	}
	old, sig := swapUint64(ptr, new)
	return old, errorFromFaultSignal(uintptr(ptr), sig)
}

// CompareAndSwapUint32 is equivalent to atomicbitops.CompareAndSwapUint32,
// except that it returns an error if SIGSEGV or SIGBUS is received while
// accessing ptr, or if ptr is not aligned to a 4-byte boundary.
func CompareAndSwapUint32(ptr unsafe.Pointer, old, new uint32) (uint32, error) {
	if addr := uintptr(ptr); addr&3 != 0 {
		return 0, AlignmentError{addr, 4}
	}
	prev, sig := compareAndSwapUint32(ptr, old, new)
	return prev, errorFromFaultSignal(uintptr(ptr), sig)
}

// LoadUint32 is like sync/atomic.LoadUint32, but operates with user memory.
// It may fail with SIGSEGV or SIGBUS if it is received while reading from ptr.
//
// Preconditions: ptr must be aligned to a 4-byte boundary.
func LoadUint32(ptr unsafe.Pointer) (uint32, error) {
	if addr := uintptr(ptr); addr&3 != 0 {
		return 0, AlignmentError{addr, 4}
	}
	val, sig := loadUint32(ptr)
	return val, errorFromFaultSignal(uintptr(ptr), sig)
}

func errorFromFaultSignal(addr uintptr, sig int32) error {
	switch sig {
	case 0:
		return nil
	case int32(unix.SIGSEGV):
		return SegvError{addr}
	case int32(unix.SIGBUS):
		return BusError{addr}
	default:
		panic(fmt.Sprintf("safecopy got unexpected signal %d at address %#x", sig, addr))
	}
}

// ReplaceSignalHandler replaces the existing signal handler for the provided
// signal with the one that handles faults in safecopy-protected functions.
//
// It stores the value of the previously set handler in previous.
//
// This function will be called on initialization in order to install safecopy
// handlers for appropriate signals. These handlers will call the previous
// handler however, and if this function is being used externally then the
// same courtesy is expected.
func ReplaceSignalHandler(sig unix.Signal, handler uintptr, previous *uintptr) error {
	var sa linux.SigAction
	const maskLen = 8

	// Get the existing signal handler information, and save the current
	// handler. Once we replace it, we will use this pointer to fall back to
	// it when we receive other signals.
	if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(sig), 0, uintptr(unsafe.Pointer(&sa)), maskLen, 0, 0); e != 0 {
		return e
	}

	// Fail if there isn't a previous handler.
	if sa.Handler == 0 {
		return fmt.Errorf("previous handler for signal %x isn't set", sig)
	}

	*previous = uintptr(sa.Handler)

	// Install our own handler.
sa.Handler = uint64(handler) if _, _, e := unix.RawSyscall6(unix.SYS_RT_SIGACTION, uintptr(sig), uintptr(unsafe.Pointer(&sa)), 0, maskLen, 0, 0); e != 0 { return e } return nil }
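// Illustrative usage sketch (userAddr is a hypothetical user-memory address):
// a faulting CopyIn returns the count of bytes copied before the fault plus a
// typed error (SegvError or BusError) carrying the faulting address:
//
//	buf := make([]byte, 4096)
//	n, err := CopyIn(buf, unsafe.Pointer(userAddr))
//	if segv, ok := err.(SegvError); ok {
//		// buf[:n] holds the bytes copied before the fault at segv.Addr.
//	}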
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package mm provides a memory management subsystem. See README.md for a
// detailed overview.
//
// Lock order:
//
// fs locks, except for memmap.Mappable locks
//   mm.MemoryManager.metadataMu
//     mm.MemoryManager.mappingMu
//       Locks taken by memmap.Mappable methods other than Translate
//         mm.MemoryManager.activeMu
//           Locks taken by memmap.Mappable.Translate
//             mm.privateRefs.mu
//               platform.AddressSpace locks
//                 memmap.File locks
//                   mm.aioManager.mu
//                     mm.AIOContext.mu
//
// Only mm.MemoryManager.Fork is permitted to lock mm.MemoryManager.activeMu in
// multiple mm.MemoryManagers, as it does so in a well-defined order (forked
// child first).
package mm

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sync"
)

// MemoryManager implements a virtual address space.
//
// +stateify savable
type MemoryManager struct {
	// p and mfp are immutable.
	p   platform.Platform
	mfp pgalloc.MemoryFileProvider

	// haveASIO is the cached result of p.SupportsAddressSpaceIO(). Aside from
	// eliminating an indirect call in the hot I/O path, this makes
	// MemoryManager.asioEnabled() a leaf function, allowing it to be inlined.
	//
	// haveASIO is immutable.
	haveASIO bool `state:"nosave"`

	// layout is the memory layout.
	//
	// layout is set by the binary loader before the MemoryManager can be used.
	layout arch.MmapLayout

	// privateRefs stores reference counts for private memory (memory whose
	// ownership is shared by one or more pmas instead of being owned by a
	// memmap.Mappable).
	//
	// privateRefs is immutable.
	privateRefs *privateRefs

	// users is the number of dependencies on the mappings in the MemoryManager.
	// When the number of references in users reaches zero, all mappings are
	// unmapped.
	//
	// users is accessed using atomic memory operations.
	users int32

	// mappingMu is analogous to Linux's struct mm_struct::mmap_sem.
	mappingMu sync.RWMutex `state:"nosave"`

	// vmas stores virtual memory areas. Since vmas are stored by value,
	// clients should usually use vmaIterator.ValuePtr() instead of
	// vmaIterator.Value() to get a pointer to the vma rather than a copy.
	//
	// Invariants: vmas are always page-aligned.
	//
	// vmas is protected by mappingMu.
	vmas vmaSet

	// brk is the mm's brk, which is manipulated using the brk(2) system call.
	// The brk is initially set up by the loader which maps an executable
	// binary into the mm.
	//
	// brk is protected by mappingMu.
	brk hostarch.AddrRange

	// usageAS is vmas.Span(), cached to accelerate RLIMIT_AS checks.
	//
	// usageAS is protected by mappingMu.
	usageAS uint64

	// lockedAS is the combined size in bytes of all vmas with vma.mlockMode !=
	// memmap.MLockNone.
	//
	// lockedAS is protected by mappingMu.
	lockedAS uint64

	// dataAS is the size of private data segments, like mm_struct->data_vm.
	// It means the vma which is private, writable, not stack.
	//
	// dataAS is protected by mappingMu.
	dataAS uint64

	// New VMAs created by MMap use whichever of memmap.MMapOpts.MLockMode or
	// defMLockMode is greater.
	//
	// defMLockMode is protected by mappingMu.
	defMLockMode memmap.MLockMode

	// activeMu is loosely analogous to Linux's struct
	// mm_struct::page_table_lock.
	activeMu sync.RWMutex `state:"nosave"`

	// pmas stores platform mapping areas used to implement vmas. Since pmas
	// are stored by value, clients should usually use pmaIterator.ValuePtr()
	// instead of pmaIterator.Value() to get a pointer to the pma rather than
	// a copy.
	//
	// Inserting or removing segments from pmas should happen along with a
	// call to mm.insertRSS or mm.removeRSS.
	//
	// Invariants: pmas are always page-aligned. If a pma exists for a given
	// address, a vma must also exist for that address.
	//
	// pmas is protected by activeMu.
	pmas pmaSet

	// curRSS is pmas.Span(), cached to accelerate updates to maxRSS. It is
	// reported as the MemoryManager's RSS.
	//
	// curRSS should be modified only via insertRSS and removeRSS, not
	// directly.
	//
	// curRSS is protected by activeMu.
	curRSS uint64

	// maxRSS is the maximum resident set size in bytes of a MemoryManager.
	// It is tracked as the application adds and removes mappings to pmas.
	//
	// maxRSS should be modified only via insertRSS, not directly.
	//
	// maxRSS is protected by activeMu.
	maxRSS uint64

	// as is the platform.AddressSpace that pmas are mapped into. active is the
	// number of contexts that require as to be non-nil; if active == 0, as may
	// be nil.
	//
	// as is protected by activeMu. active is manipulated with atomic memory
	// operations; transitions to and from zero are additionally protected by
	// activeMu. (This is because such transitions may need to be atomic with
	// changes to as.)
	as     platform.AddressSpace `state:"nosave"`
	active int32                 `state:"zerovalue"`

	// unmapAllOnActivate indicates that the next Activate call should activate
	// an empty AddressSpace.
	//
	// This is used to ensure that an AddressSpace cached in
	// NewAddressSpace is not used after some change in the MemoryManager
	// or VMAs has made that AddressSpace stale.
	//
	// unmapAllOnActivate is protected by activeMu. It must only be set when
	// there is no active or cached AddressSpace. If as != nil, then
	// invalidations should be propagated immediately.
	unmapAllOnActivate bool `state:"nosave"`

	// If captureInvalidations is true, calls to MM.Invalidate() are recorded
	// in capturedInvalidations rather than being applied immediately to pmas.
	// This is to avoid a race condition in MM.Fork(); see that function for
	// details.
	//
	// Both captureInvalidations and capturedInvalidations are protected by
	// activeMu. Neither needs to be saved since captureInvalidations is only
	// enabled during MM.Fork(), during which saving can't occur.
	captureInvalidations  bool             `state:"zerovalue"`
	capturedInvalidations []invalidateArgs `state:"nosave"`

	metadataMu sync.Mutex `state:"nosave"`

	// argv is the application argv. This is set up by the loader and may be
	// modified by prctl(PR_SET_MM_ARG_START/PR_SET_MM_ARG_END). No
	// requirements apply to argv; we do not require that argv.WellFormed().
	//
	// argv is protected by metadataMu.
	argv hostarch.AddrRange

	// envv is the application envv. This is set up by the loader and may be
	// modified by prctl(PR_SET_MM_ENV_START/PR_SET_MM_ENV_END). No
	// requirements apply to envv; we do not require that envv.WellFormed().
	//
	// envv is protected by metadataMu.
	envv hostarch.AddrRange

	// auxv is the ELF's auxiliary vector.
	//
	// auxv is protected by metadataMu.
	auxv arch.Auxv

	// executable is the executable for this MemoryManager. If executable
	// is not nil, it holds a reference on the Dirent.
	//
	// executable is protected by metadataMu.
	executable fsbridge.File

	// dumpability describes if and how this MemoryManager may be dumped to
	// userspace.
	//
	// dumpability is protected by metadataMu.
	dumpability Dumpability

	// aioManager keeps track of AIOContexts used for async IOs. AIOManager
	// must be cloned when CLONE_VM is used.
	aioManager aioManager

	// sleepForActivation indicates whether the task should report to be
	// sleeping before trying to activate the address space. When set to true,
	// delays in activation are not reported as stuck tasks by the watchdog.
	sleepForActivation bool

	// vdsoSigReturnAddr is the address of 'vdso_sigreturn'.
	vdsoSigReturnAddr uint64

	// membarrierPrivateEnabled is non-zero if EnableMembarrierPrivate has
	// previously been called. Since, as of this writing,
	// MEMBARRIER_CMD_PRIVATE_EXPEDITED is implemented as a global memory
	// barrier, membarrierPrivateEnabled has no other effect.
	//
	// membarrierPrivateEnabled is accessed using atomic memory operations.
	membarrierPrivateEnabled uint32

	// membarrierRSeqEnabled is non-zero if EnableMembarrierRSeq has previously
	// been called.
	//
	// membarrierRSeqEnabled is accessed using atomic memory operations.
	membarrierRSeqEnabled uint32
}

// vma represents a virtual memory area.
// // +stateify savable type vma struct { // mappable is the virtual memory object mapped by this vma. If mappable is // nil, the vma represents an anonymous mapping. mappable memmap.Mappable // off is the offset into mappable at which this vma begins. If mappable is // nil, off is meaningless. off uint64 // To speedup VMA save/restore, we group and save the following booleans // as a single integer. // realPerms are the memory permissions on this vma, as defined by the // application. realPerms hostarch.AccessType `state:".(int)"` // effectivePerms are the memory permissions on this vma which are // actually used to control access. // // Invariant: effectivePerms == realPerms.Effective(). effectivePerms hostarch.AccessType `state:"manual"` // maxPerms limits the set of permissions that may ever apply to this // memory, as well as accesses for which usermem.IOOpts.IgnorePermissions // is true (e.g. ptrace(PTRACE_POKEDATA)). // // Invariant: maxPerms == maxPerms.Effective(). maxPerms hostarch.AccessType `state:"manual"` // private is true if this is a MAP_PRIVATE mapping, such that writes to // the mapping are propagated to a copy. private bool `state:"manual"` // growsDown is true if the mapping may be automatically extended downward // under certain conditions. If growsDown is true, mappable must be nil. // // There is currently no corresponding growsUp flag; in Linux, the only // architectures that can have VM_GROWSUP mappings are ia64, parisc, and // metag, none of which we currently support. growsDown bool `state:"manual"` // dontfork is the MADV_DONTFORK setting for this vma configured by madvise(). dontfork bool mlockMode memmap.MLockMode // numaPolicy is the NUMA policy for this vma set by mbind(). numaPolicy linux.NumaPolicy // numaNodemask is the NUMA nodemask for this vma set by mbind(). numaNodemask uint64 // If id is not nil, it controls the lifecycle of mappable and provides vma // metadata shown in /proc/[pid]/maps, and the vma holds a reference. id memmap.MappingIdentity // If hint is non-empty, it is a description of the vma printed in // /proc/[pid]/maps. hint takes priority over id.MappedName(). 
hint string } const ( vmaRealPermsRead = 1 << iota vmaRealPermsWrite vmaRealPermsExecute vmaEffectivePermsRead vmaEffectivePermsWrite vmaEffectivePermsExecute vmaMaxPermsRead vmaMaxPermsWrite vmaMaxPermsExecute vmaPrivate vmaGrowsDown ) func (v *vma) saveRealPerms() int { var b int if v.realPerms.Read { b |= vmaRealPermsRead } if v.realPerms.Write { b |= vmaRealPermsWrite } if v.realPerms.Execute { b |= vmaRealPermsExecute } if v.effectivePerms.Read { b |= vmaEffectivePermsRead } if v.effectivePerms.Write { b |= vmaEffectivePermsWrite } if v.effectivePerms.Execute { b |= vmaEffectivePermsExecute } if v.maxPerms.Read { b |= vmaMaxPermsRead } if v.maxPerms.Write { b |= vmaMaxPermsWrite } if v.maxPerms.Execute { b |= vmaMaxPermsExecute } if v.private { b |= vmaPrivate } if v.growsDown { b |= vmaGrowsDown } return b } func (v *vma) loadRealPerms(b int) { if b&vmaRealPermsRead > 0 { v.realPerms.Read = true } if b&vmaRealPermsWrite > 0 { v.realPerms.Write = true } if b&vmaRealPermsExecute > 0 { v.realPerms.Execute = true } if b&vmaEffectivePermsRead > 0 { v.effectivePerms.Read = true } if b&vmaEffectivePermsWrite > 0 { v.effectivePerms.Write = true } if b&vmaEffectivePermsExecute > 0 { v.effectivePerms.Execute = true } if b&vmaMaxPermsRead > 0 { v.maxPerms.Read = true } if b&vmaMaxPermsWrite > 0 { v.maxPerms.Write = true } if b&vmaMaxPermsExecute > 0 { v.maxPerms.Execute = true } if b&vmaPrivate > 0 { v.private = true } if b&vmaGrowsDown > 0 { v.growsDown = true } } // pma represents a platform mapping area. // // +stateify savable type pma struct { // file is the file mapped by this pma. Only pmas for which file == // MemoryManager.mfp.MemoryFile() may be saved. pmas hold a reference to // the corresponding file range while they exist. file memmap.File `state:"nosave"` // off is the offset into file at which this pma begins. // // Note that pmas do *not* hold references on offsets in file! If private // is true, MemoryManager.privateRefs holds the reference instead. If // private is false, the corresponding memmap.Mappable holds the reference // instead (per memmap.Mappable.Translate requirement). off uint64 // translatePerms is the permissions returned by memmap.Mappable.Translate. // If private is true, translatePerms is hostarch.AnyAccess. translatePerms hostarch.AccessType // effectivePerms is the permissions allowed for non-ignorePermissions // accesses. maxPerms is the permissions allowed for ignorePermissions // accesses. These are vma.effectivePerms and vma.maxPerms respectively, // masked by pma.translatePerms and with Write disallowed if pma.needCOW is // true. // // These are stored in the pma so that the IO implementation can avoid // iterating mm.vmas when pmas already exist. effectivePerms hostarch.AccessType maxPerms hostarch.AccessType // needCOW is true if writes to the mapping must be propagated to a copy. needCOW bool // private is true if this pma represents private memory. // // If private is true, file must be MemoryManager.mfp.MemoryFile(), the pma // holds a reference on the mapped memory that is tracked in privateRefs, // and calls to Invalidate for which // memmap.InvalidateOpts.InvalidatePrivate is false should ignore the pma. // // If private is false, this pma caches a translation from the // corresponding vma's memmap.Mappable.Translate. private bool // If internalMappings is not empty, it is the cached return value of // file.MapInternal for the memmap.FileRange mapped by this pma. 
internalMappings safemem.BlockSeq `state:"nosave"` } // +stateify savable type privateRefs struct { mu sync.Mutex `state:"nosave"` // refs maps offsets into MemoryManager.mfp.MemoryFile() to the number of // pmas (or, equivalently, MemoryManagers) that share ownership of the // memory at that offset. refs fileRefcountSet } type invalidateArgs struct { ar hostarch.AddrRange opts memmap.InvalidateOpts } // fileRefcountSetFunctions implements segment.Functions for fileRefcountSet. type fileRefcountSetFunctions struct{} func (fileRefcountSetFunctions) MinKey() uint64 { return 0 } func (fileRefcountSetFunctions) MaxKey() uint64 { return ^uint64(0) } func (fileRefcountSetFunctions) ClearValue(_ *int32) { } func (fileRefcountSetFunctions) Merge(_ memmap.FileRange, rc1 int32, _ memmap.FileRange, rc2 int32) (int32, bool) { return rc1, rc1 == rc2 } func (fileRefcountSetFunctions) Split(_ memmap.FileRange, rc int32, _ uint64) (int32, int32) { return rc, rc }
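// Illustrative sketch: the saveRealPerms/loadRealPerms helpers above
// round-trip the vma booleans through a single bitmask, e.g.:
//
//	v := vma{
//		realPerms: hostarch.AccessType{Read: true, Write: true},
//		private:   true,
//	}
//	b := v.saveRealPerms() // vmaRealPermsRead|vmaRealPermsWrite|vmaPrivate
//	var w vma
//	w.loadRealPerms(b) // w.realPerms and w.private now match v
//
// (effectivePerms and maxPerms are encoded and decoded the same way by the
// remaining bits.)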
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package marshal

// Marshal returns the serialized contents of m in a newly allocated
// byte slice.
func Marshal(m Marshallable) []byte {
	buf := make([]byte, m.SizeBytes())
	m.MarshalUnsafe(buf)
	return buf
}
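// Illustrative usage sketch (assuming a go_marshal-generated Marshallable
// type such as linux.Timespec):
//
//	var ts linux.Timespec
//	buf := Marshal(&ts) // len(buf) == ts.SizeBytes()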
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs2

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
)

// Mmap implements Linux syscall mmap(2).
func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	prot := args[2].Int()
	flags := args[3].Int()
	fd := args[4].Int()
	fixed := flags&linux.MAP_FIXED != 0
	private := flags&linux.MAP_PRIVATE != 0
	shared := flags&linux.MAP_SHARED != 0
	anon := flags&linux.MAP_ANONYMOUS != 0
	map32bit := flags&linux.MAP_32BIT != 0

	// Require exactly one of MAP_PRIVATE and MAP_SHARED.
	if private == shared {
		return 0, nil, linuxerr.EINVAL
	}

	opts := memmap.MMapOpts{
		Length:   args[1].Uint64(),
		Offset:   args[5].Uint64(),
		Addr:     args[0].Pointer(),
		Fixed:    fixed,
		Unmap:    fixed,
		Map32Bit: map32bit,
		Private:  private,
		Perms: hostarch.AccessType{
			Read:    linux.PROT_READ&prot != 0,
			Write:   linux.PROT_WRITE&prot != 0,
			Execute: linux.PROT_EXEC&prot != 0,
		},
		MaxPerms:  hostarch.AnyAccess,
		GrowsDown: linux.MAP_GROWSDOWN&flags != 0,
		Precommit: linux.MAP_POPULATE&flags != 0,
	}
	if linux.MAP_LOCKED&flags != 0 {
		opts.MLockMode = memmap.MLockEager
	}
	defer func() {
		if opts.MappingIdentity != nil {
			opts.MappingIdentity.DecRef(t)
		}
	}()

	if !anon {
		// Convert the passed FD to a file reference.
		file := t.GetFileVFS2(fd)
		if file == nil {
			return 0, nil, linuxerr.EBADF
		}
		defer file.DecRef(t)

		// mmap unconditionally requires that the FD is readable.
		if !file.IsReadable() {
			return 0, nil, linuxerr.EACCES
		}
		// MAP_SHARED requires that the FD be writable for PROT_WRITE.
		if shared && !file.IsWritable() {
			opts.MaxPerms.Write = false
		}

		if err := file.ConfigureMMap(t, &opts); err != nil {
			return 0, nil, err
		}
	} else if shared {
		// Back shared anonymous mappings with an anonymous tmpfs file.
		opts.Offset = 0
		file, err := tmpfs.NewZeroFile(t, t.Credentials(), t.Kernel().ShmMount(), opts.Length)
		if err != nil {
			return 0, nil, err
		}
		defer file.DecRef(t)
		if err := file.ConfigureMMap(t, &opts); err != nil {
			return 0, nil, err
		}
	}

	rv, err := t.MemoryManager().MMap(t, opts)
	return uintptr(rv), nil, err
}
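// Illustrative mapping (hypothetical application call): for
//
//	mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)
//
// the opts built above would carry Length: 4096, Private: true, and Perms
// {Read: true, Write: true}; since MAP_ANONYMOUS and MAP_PRIVATE are both
// set, neither the file branch nor the tmpfs branch runs, and the mapping is
// handled entirely by t.MemoryManager().MMap.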
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package limits provides resource limits.
package limits

import (
	"golang.org/x/sys/unix"

	"gvisor.dev/gvisor/pkg/sync"
)

// LimitType defines a type of resource limit.
type LimitType int

// Set of constants defining the different types of resource limits.
const (
	CPU LimitType = iota
	FileSize
	Data
	Stack
	Core
	Rss
	ProcessCount
	NumberOfFiles
	MemoryLocked
	AS
	Locks
	SignalsPending
	MessageQueueBytes
	Nice
	RealTimePriority
	Rttime
)

// Infinity is a constant representing a resource with no limit.
const Infinity = ^uint64(0)

// Limit specifies a system limit.
//
// +stateify savable
type Limit struct {
	// Cur specifies the current limit.
	Cur uint64
	// Max specifies the maximum settable limit.
	Max uint64
}

// LimitSet represents the Limits that correspond to each LimitType.
//
// +stateify savable
type LimitSet struct {
	mu   sync.Mutex `state:"nosave"`
	data map[LimitType]Limit
}

// NewLimitSet creates a new, empty LimitSet.
func NewLimitSet() *LimitSet {
	return &LimitSet{
		data: make(map[LimitType]Limit),
	}
}

// GetCopy returns a clone of the LimitSet.
func (l *LimitSet) GetCopy() *LimitSet {
	l.mu.Lock()
	defer l.mu.Unlock()
	copyData := make(map[LimitType]Limit)
	for k, v := range l.data {
		copyData[k] = v
	}
	return &LimitSet{
		data: copyData,
	}
}

// Get returns the resource limit associated with LimitType t. If no limit has
// been set, it defaults to an infinite limit (Infinity).
func (l *LimitSet) Get(t LimitType) Limit {
	l.mu.Lock()
	defer l.mu.Unlock()
	s, ok := l.data[t]
	if !ok {
		return Limit{Cur: Infinity, Max: Infinity}
	}
	return s
}

// GetCapped returns the current value for the limit, capped as specified.
func (l *LimitSet) GetCapped(t LimitType, max uint64) uint64 {
	s := l.Get(t)
	if s.Cur == Infinity || s.Cur > max {
		return max
	}
	return s.Cur
}

// SetUnchecked assigns value v to resource of LimitType t.
func (l *LimitSet) SetUnchecked(t LimitType, v Limit) {
	l.mu.Lock()
	defer l.mu.Unlock()
	l.data[t] = v
}

// Set assigns value v to resource of LimitType t and returns the old value.
// privileged should be true only when either the caller has CAP_SYS_RESOURCE
// or when creating limits for a new kernel.
func (l *LimitSet) Set(t LimitType, v Limit, privileged bool) (Limit, error) {
	l.mu.Lock()
	defer l.mu.Unlock()

	// If a limit is already set, make sure the new limit doesn't
	// exceed the previous max limit.
	if _, ok := l.data[t]; ok {
		// Unprivileged users can only lower their hard limits.
		if l.data[t].Max < v.Max && !privileged {
			return Limit{}, unix.EPERM
		}
		if v.Cur > v.Max {
			return Limit{}, unix.EINVAL
		}
	}
	old := l.data[t]
	l.data[t] = v
	return old, nil
}
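// Illustrative use of LimitSet, not part of the original sources: it shows
// Set's privilege semantics (unprivileged callers may only lower hard
// limits) and GetCapped's clamping. The chosen limit values are arbitrary.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/sentry/limits"
)

func main() {
	ls := limits.NewLimitSet()
	// Privileged: may set any Cur <= Max.
	ls.Set(limits.NumberOfFiles, limits.Limit{Cur: 1024, Max: 4096}, true /* privileged */)
	// Unprivileged: raising the hard limit fails with EPERM.
	if _, err := ls.Set(limits.NumberOfFiles, limits.Limit{Cur: 1024, Max: 8192}, false); err != nil {
		fmt.Println("raise hard limit:", err)
	}
	// GetCapped clamps the current value to the supplied maximum.
	fmt.Println(ls.GetCapped(limits.NumberOfFiles, 512)) // 512
}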
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tmpfs

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
)

// socketFile is a socket (=S_IFSOCK) tmpfs file.
//
// +stateify savable
type socketFile struct {
	inode inode
	ep    transport.BoundEndpoint
}

func (fs *filesystem) newSocketFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, ep transport.BoundEndpoint, parentDir *directory) *inode {
	file := &socketFile{ep: ep}
	file.inode.init(file, fs, kuid, kgid, mode, parentDir)
	file.inode.nlink = 1 // from parent directory
	return &file.inode
}
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs2

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/pipefs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

// Pipe implements Linux syscall pipe(2).
func Pipe(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	return 0, nil, pipe2(t, addr, 0)
}

// Pipe2 implements Linux syscall pipe2(2).
func Pipe2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	addr := args[0].Pointer()
	flags := args[1].Int()
	return 0, nil, pipe2(t, addr, flags)
}

func pipe2(t *kernel.Task, addr hostarch.Addr, flags int32) error {
	if flags&^(linux.O_NONBLOCK|linux.O_CLOEXEC) != 0 {
		return linuxerr.EINVAL
	}
	r, w, err := pipefs.NewConnectedPipeFDs(t, t.Kernel().PipeMount(), uint32(flags&linux.O_NONBLOCK))
	if err != nil {
		return err
	}
	defer r.DecRef(t)
	defer w.DecRef(t)

	fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{r, w}, kernel.FDFlags{
		CloseOnExec: flags&linux.O_CLOEXEC != 0,
	})
	if err != nil {
		return err
	}
	if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil {
		for _, fd := range fds {
			if _, file := t.FDTable().Remove(t, fd); file != nil {
				file.DecRef(t)
			}
		}
		return err
	}
	return nil
}
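// Illustrative sketch, not part of the original sources: the pipe2 check
// above uses Go's AND-NOT operator to reject any flag outside the accepted
// set. The constant values mirror Linux's O_* ABI on amd64 but are
// redeclared locally so the snippet is self-contained.
package main

import "fmt"

const (
	oNonblock = 0x800   // O_NONBLOCK
	oCloexec  = 0x80000 // O_CLOEXEC
)

func validPipe2Flags(flags int32) bool {
	// Clear the accepted bits; anything left over is unsupported.
	return flags&^(oNonblock|oCloexec) == 0
}

func main() {
	fmt.Println(validPipe2Flags(oCloexec))          // true
	fmt.Println(validPipe2Flags(oCloexec | 0x4000)) // false (stray bit)
}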
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/bpf"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
)

// userSockFprog is equivalent to Linux's struct sock_fprog on amd64.
//
// +marshal
type userSockFprog struct {
	// Len is the length of the filter in BPF instructions.
	Len uint16

	_ [6]byte // padding for alignment

	// Filter is a user pointer to the struct sock_filter array that makes up
	// the filter program. Filter is a uint64 rather than a hostarch.Addr
	// because hostarch.Addr is actually uintptr, which is not a fixed-size
	// type.
	Filter uint64
}

// seccomp applies a seccomp policy to the current task.
func seccomp(t *kernel.Task, mode, flags uint64, addr hostarch.Addr) error {
	// We only support SECCOMP_SET_MODE_FILTER at the moment.
	if mode != linux.SECCOMP_SET_MODE_FILTER {
		// Unsupported mode.
		return linuxerr.EINVAL
	}

	tsync := flags&linux.SECCOMP_FILTER_FLAG_TSYNC != 0

	// The only flag we support now is SECCOMP_FILTER_FLAG_TSYNC.
	if flags&^linux.SECCOMP_FILTER_FLAG_TSYNC != 0 {
		// Unsupported flag.
		return linuxerr.EINVAL
	}

	var fprog userSockFprog
	if _, err := fprog.CopyIn(t, addr); err != nil {
		return err
	}
	filter := make([]linux.BPFInstruction, int(fprog.Len))
	if _, err := linux.CopyBPFInstructionSliceIn(t, hostarch.Addr(fprog.Filter), filter); err != nil {
		return err
	}
	compiledFilter, err := bpf.Compile(filter)
	if err != nil {
		t.Debugf("Invalid seccomp-bpf filter: %v", err)
		return linuxerr.EINVAL
	}

	return t.AppendSyscallFilter(compiledFilter, tsync)
}

// Seccomp implements Linux syscall seccomp(2).
func Seccomp(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, seccomp(t, args[0].Uint64(), args[1].Uint64(), args[2].Pointer())
}
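// Illustrative check, not part of the original sources: it shows why
// userSockFprog pads Len and stores Filter as uint64. The struct must match
// Linux's 16-byte struct sock_fprog layout on amd64 exactly, with Filter at
// offset 8; a local mirror of the struct is used so the snippet is
// self-contained.
package main

import (
	"fmt"
	"unsafe"
)

type userSockFprog struct {
	Len    uint16
	_      [6]byte // padding for alignment
	Filter uint64
}

func main() {
	var f userSockFprog
	fmt.Println(unsafe.Sizeof(f))          // 16
	fmt.Println(unsafe.Offsetof(f.Filter)) // 8
}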
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package hostmm

import (
	"golang.org/x/sys/unix"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/log"
)

var (
	haveMembarrierGlobal           = false
	haveMembarrierPrivateExpedited = false
)

func init() {
	supported, _, e := unix.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_QUERY, 0 /* flags */, 0 /* unused */)
	if e != 0 {
		if e != unix.ENOSYS {
			log.Warningf("membarrier(MEMBARRIER_CMD_QUERY) failed: %s", e.Error())
		}
		return
	}
	// We don't use MEMBARRIER_CMD_GLOBAL_EXPEDITED because this sends IPIs to
	// all CPUs running tasks that have previously invoked
	// MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, which presents a DoS risk.
	// (MEMBARRIER_CMD_GLOBAL is synchronize_rcu(), i.e. it waits for an RCU
	// grace period to elapse without bothering other CPUs.
	// MEMBARRIER_CMD_PRIVATE_EXPEDITED sends IPIs only to CPUs running tasks
	// sharing the caller's MM.)
	if supported&linux.MEMBARRIER_CMD_GLOBAL != 0 {
		haveMembarrierGlobal = true
	}
	if req := uintptr(linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED | linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED); supported&req == req {
		if _, _, e := unix.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0 /* flags */, 0 /* unused */); e != 0 {
			log.Warningf("membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) failed: %s", e.Error())
		} else {
			haveMembarrierPrivateExpedited = true
		}
	}
}

// HaveGlobalMemoryBarrier returns true if GlobalMemoryBarrier is supported.
func HaveGlobalMemoryBarrier() bool {
	return haveMembarrierGlobal
}

// GlobalMemoryBarrier blocks until "all running threads [in the host OS] have
// passed through a state where all memory accesses to user-space addresses
// match program order between entry to and return from [GlobalMemoryBarrier]",
// as for membarrier(2).
//
// Preconditions: HaveGlobalMemoryBarrier() == true.
func GlobalMemoryBarrier() error {
	if _, _, e := unix.Syscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_GLOBAL, 0 /* flags */, 0 /* unused */); e != 0 {
		return e
	}
	return nil
}

// HaveProcessMemoryBarrier returns true if ProcessMemoryBarrier is supported.
func HaveProcessMemoryBarrier() bool {
	return haveMembarrierPrivateExpedited
}

// ProcessMemoryBarrier is equivalent to GlobalMemoryBarrier, but only
// synchronizes with threads sharing a virtual address space (from the host OS'
// perspective) with the calling thread.
//
// Preconditions: HaveProcessMemoryBarrier() == true.
func ProcessMemoryBarrier() error {
	if _, _, e := unix.RawSyscall(unix.SYS_MEMBARRIER, linux.MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0 /* flags */, 0 /* unused */); e != 0 {
		return e
	}
	return nil
}
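// Illustrative caller, not part of the original sources: feature probing
// happens once in the package's init(), so callers are expected to gate on
// the Have* predicates before issuing a barrier, roughly as follows.
package main

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/sentry/hostmm"
)

func main() {
	if !hostmm.HaveProcessMemoryBarrier() {
		fmt.Println("membarrier(PRIVATE_EXPEDITED) unavailable on this host")
		return
	}
	if err := hostmm.ProcessMemoryBarrier(); err != nil {
		fmt.Println("barrier failed:", err)
	}
}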
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package usermem

import (
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/hostarch"
)

// IOCopyContext wraps an object implementing usermem.IO to implement
// marshal.CopyContext.
type IOCopyContext struct {
	Ctx  context.Context
	IO   IO
	Opts IOOpts
}

// CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer.
func (i *IOCopyContext) CopyScratchBuffer(size int) []byte {
	return make([]byte, size)
}

// CopyOutBytes implements marshal.CopyContext.CopyOutBytes.
func (i *IOCopyContext) CopyOutBytes(addr hostarch.Addr, b []byte) (int, error) {
	return i.IO.CopyOut(i.Ctx, addr, b, i.Opts)
}

// CopyInBytes implements marshal.CopyContext.CopyInBytes.
func (i *IOCopyContext) CopyInBytes(addr hostarch.Addr, b []byte) (int, error) {
	return i.IO.CopyIn(i.Ctx, addr, b, i.Opts)
}
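// An illustrative sketch, not part of the original sources. Given some
// usermem.IO implementation io (for example, a task's memory manager), a
// context ctx, and a user address addr (all assumed here), IOCopyContext
// adapts the IO for the marshal package's generated Copy* helpers:
//
//	cc := &usermem.IOCopyContext{Ctx: ctx, IO: io, Opts: usermem.IOOpts{}}
//	var st linux.Statx
//	if _, err := st.CopyIn(cc, addr); err != nil {
//		// handle partial or failed copy
//	}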
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import "sync/atomic"

// TaskWorker is a deferred task.
//
// This must be savable.
type TaskWorker interface {
	// TaskWork will be executed prior to returning to user space. Note that
	// TaskWork may call RegisterWork again, but this will not be executed until
	// the next return to user space, unlike in Linux. This effectively allows
	// registration of indefinite user return hooks, but not by default.
	TaskWork(t *Task)
}

// RegisterWork can be used to register additional task work that will be
// performed prior to returning to user space. See TaskWorker.TaskWork for
// semantics regarding registration.
func (t *Task) RegisterWork(work TaskWorker) {
	t.taskWorkMu.Lock()
	defer t.taskWorkMu.Unlock()
	atomic.AddInt32(&t.taskWorkCount, 1)
	t.taskWork = append(t.taskWork, work)
}
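// An illustrative TaskWorker, not part of the original sources; the type and
// its behavior are hypothetical. Because re-registration from TaskWork is
// deferred to the *next* return to user space (per the note above), a worker
// that re-registers itself runs once per return:
//
//	type pollHook struct{}
//
//	func (h pollHook) TaskWork(t *Task) {
//		// ... per-return work ...
//		t.RegisterWork(h) // queued for the next return to user space
//	}
//
//	// With a *Task t in hand: t.RegisterWork(pollHook{})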
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"bytes"
	"fmt"
	"math"
	"sort"
	"strings"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/refsvfs2"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/syserror"
)

// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
// (Mount.fs), which applies to path resolution in the context of a particular
// Mount (Mount.key.parent).
//
// Mounts are reference-counted. Unless otherwise specified, all Mount methods
// require that a reference is held.
//
// Mount and Filesystem are distinct types because it's possible for a single
// Filesystem to be mounted at multiple locations and/or in multiple mount
// namespaces.
//
// Mount is analogous to Linux's struct mount. (gVisor does not distinguish
// between struct mount and struct vfsmount.)
//
// +stateify savable
type Mount struct {
	// vfs, fs, root are immutable. References are held on fs and root.
	// Note that for a disconnected mount, root may be nil.
	//
	// Invariant: if not nil, root belongs to fs.
	vfs  *VirtualFilesystem
	fs   *Filesystem
	root *Dentry

	// ID is the immutable mount ID.
	ID uint64

	// Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
	// for MS_RDONLY which is tracked in "writers". Immutable.
	Flags MountFlags

	// key is protected by VirtualFilesystem.mountMu and
	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
	// key.parent and key.point if they are not nil.
	//
	// Invariant: key.parent != nil iff key.point != nil. key.point belongs to
	// key.parent.fs.
	key mountKey `state:".(VirtualDentry)"`

	// ns is the namespace in which this Mount was mounted. ns is protected by
	// VirtualFilesystem.mountMu.
	ns *MountNamespace

	// The lower 63 bits of refs are a reference count. The MSB of refs is set
	// if the Mount has been eagerly umounted, as by umount(2) without the
	// MNT_DETACH flag. refs is accessed using atomic memory operations.
	refs int64

	// children is the set of all Mounts for which Mount.key.parent is this
	// Mount. children is protected by VirtualFilesystem.mountMu.
	children map[*Mount]struct{}

	// umounted is true if VFS.umountRecursiveLocked() has been called on this
	// Mount. VirtualFilesystem does not hold a reference on Mounts for which
	// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
	umounted bool

	// The lower 63 bits of writers is the number of calls to
	// Mount.CheckBeginWrite() that have not yet been paired with a call to
	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
	// writers is accessed using atomic memory operations.
	writers int64
}

func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
	mnt := &Mount{
		ID:    atomic.AddUint64(&vfs.lastMountID, 1),
		Flags: opts.Flags,
		vfs:   vfs,
		fs:    fs,
		root:  root,
		ns:    mntns,
		refs:  1,
	}
	if opts.ReadOnly {
		mnt.setReadOnlyLocked(true)
	}
	refsvfs2.Register(mnt)
	return mnt
}

// Options returns a copy of the MountOptions currently applicable to mnt.
func (mnt *Mount) Options() MountOptions {
	mnt.vfs.mountMu.Lock()
	defer mnt.vfs.mountMu.Unlock()
	return MountOptions{
		Flags:    mnt.Flags,
		ReadOnly: mnt.ReadOnly(),
	}
}

// A MountNamespace is a collection of Mounts.
//
// MountNamespaces are reference-counted. Unless otherwise specified, all
// MountNamespace methods require that a reference is held.
//
// MountNamespace is analogous to Linux's struct mnt_namespace.
//
// +stateify savable
type MountNamespace struct {
	MountNamespaceRefs

	// Owner is the user namespace that owns this mount namespace.
	Owner *auth.UserNamespace

	// root is the MountNamespace's root mount. root is immutable.
	root *Mount

	// mountpoints maps all Dentries which are mount points in this namespace
	// to the number of Mounts for which they are mount points. mountpoints is
	// protected by VirtualFilesystem.mountMu.
	//
	// mountpoints is used to determine if a Dentry can be moved or removed
	// (which requires that the Dentry is not a mount point in the calling
	// namespace).
	//
	// mountpoints is maintained even if there are no references held on the
	// MountNamespace; this is required to ensure that
	// VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate
	// correctly on unreferenced MountNamespaces.
	mountpoints map[*Dentry]uint32
}

// NewMountNamespace returns a new mount namespace with a root filesystem
// configured by the given arguments. A reference is taken on the returned
// MountNamespace.
func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*MountNamespace, error) {
	rft := vfs.getFilesystemType(fsTypeName)
	if rft == nil {
		ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
		return nil, linuxerr.ENODEV
	}
	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
	if err != nil {
		return nil, err
	}
	mntns := &MountNamespace{
		Owner:       creds.UserNamespace,
		mountpoints: make(map[*Dentry]uint32),
	}
	mntns.InitRefs()
	mntns.root = newMount(vfs, fs, root, mntns, opts)
	return mntns, nil
}

// NewDisconnectedMount returns a Mount representing fs with the given root
// (which may be nil). The new Mount is not associated with any MountNamespace
// and is not connected to any other Mounts. References are taken on fs and
// root.
func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) (*Mount, error) {
	fs.IncRef()
	if root != nil {
		root.IncRef()
	}
	return newMount(vfs, fs, root, nil /* mntns */, opts), nil
}

// MountDisconnected creates a Filesystem configured by the given arguments,
// then returns a Mount representing it. The new Mount is not associated with
// any MountNamespace and is not connected to any other Mounts.
func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) {
	rft := vfs.getFilesystemType(fsTypeName)
	if rft == nil {
		return nil, linuxerr.ENODEV
	}
	if !opts.InternalMount && !rft.opts.AllowUserMount {
		return nil, linuxerr.ENODEV
	}
	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
	if err != nil {
		return nil, err
	}
	defer root.DecRef(ctx)
	defer fs.DecRef(ctx)
	return vfs.NewDisconnectedMount(fs, root, opts)
}

// ConnectMountAt connects mnt at the path represented by target.
//
// Preconditions: mnt must be disconnected.
func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error {
	// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
	// lock ordering.
	vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
	if err != nil {
		return err
	}
	vfs.mountMu.Lock()
	vdDentry := vd.dentry
	vdDentry.mu.Lock()
	for {
		if vd.mount.umounted || vdDentry.dead {
			vdDentry.mu.Unlock()
			vfs.mountMu.Unlock()
			vd.DecRef(ctx)
			return syserror.ENOENT
		}
		// vd might have been mounted over between vfs.GetDentryAt() and
		// vfs.mountMu.Lock().
		if !vdDentry.isMounted() {
			break
		}
		nextmnt := vfs.mounts.Lookup(vd.mount, vdDentry)
		if nextmnt == nil {
			break
		}
		// It's possible that nextmnt has been umounted but not disconnected,
		// in which case vfs no longer holds a reference on it, and the last
		// reference may be concurrently dropped even though we're holding
		// vfs.mountMu.
		if !nextmnt.tryIncMountedRef() {
			break
		}
		// This can't fail since we're holding vfs.mountMu.
		nextmnt.root.IncRef()
		vdDentry.mu.Unlock()
		vd.DecRef(ctx)
		vd = VirtualDentry{
			mount:  nextmnt,
			dentry: nextmnt.root,
		}
		vdDentry = vd.dentry
		vdDentry.mu.Lock()
	}
	// TODO(gvisor.dev/issue/1035): Linux requires that either both the mount
	// point and the mount root are directories, or neither are, and returns
	// ENOTDIR if this is not the case.
	mntns := vd.mount.ns
	vfs.mounts.seq.BeginWrite()
	vfs.connectLocked(mnt, vd, mntns)
	vfs.mounts.seq.EndWrite()
	vdDentry.mu.Unlock()
	vfs.mountMu.Unlock()
	return nil
}

// MountAt creates and mounts a Filesystem configured by the given arguments.
// The VirtualFilesystem will hold a reference to the Mount until it is
// unmounted.
//
// This method returns the mounted Mount without a reference, for convenience
// during VFS setup when there is no chance of racing with unmount.
func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) {
	mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts)
	if err != nil {
		return nil, err
	}
	defer mnt.DecRef(ctx)
	if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil {
		return nil, err
	}
	return mnt, nil
}

// UmountAt removes the Mount at the given path.
func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
	if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
		return linuxerr.EINVAL
	}

	// MNT_FORCE is currently unimplemented except for the permission check.
	// Force unmounting specifically requires CAP_SYS_ADMIN in the root user
	// namespace, and not in the owner user namespace for the target mount. See
	// fs/namespace.c:SYSCALL_DEFINE2(umount, ...)
	if opts.Flags&linux.MNT_FORCE != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
		return linuxerr.EPERM
	}

	vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{})
	if err != nil {
		return err
	}
	defer vd.DecRef(ctx)
	if vd.dentry != vd.mount.root {
		return linuxerr.EINVAL
	}
	vfs.mountMu.Lock()
	if mntns := MountNamespaceFromContext(ctx); mntns != nil {
		defer mntns.DecRef(ctx)
		if mntns != vd.mount.ns {
			vfs.mountMu.Unlock()
			return linuxerr.EINVAL
		}

		if vd.mount == vd.mount.ns.root {
			vfs.mountMu.Unlock()
			return linuxerr.EINVAL
		}
	}

	// TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's
	// root, which we don't implement yet (we'll just fail it since the caller
	// holds a reference on it).

	vfs.mounts.seq.BeginWrite()
	if opts.Flags&linux.MNT_DETACH == 0 {
		if len(vd.mount.children) != 0 {
			vfs.mounts.seq.EndWrite()
			vfs.mountMu.Unlock()
			return linuxerr.EBUSY
		}
		// We are holding a reference on vd.mount.
		expectedRefs := int64(1)
		if !vd.mount.umounted {
			expectedRefs = 2
		}
		if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB
			vfs.mounts.seq.EndWrite()
			vfs.mountMu.Unlock()
			return linuxerr.EBUSY
		}
	}
	vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{
		eager:               opts.Flags&linux.MNT_DETACH == 0,
		disconnectHierarchy: true,
	}, nil, nil)
	vfs.mounts.seq.EndWrite()
	vfs.mountMu.Unlock()
	for _, vd := range vdsToDecRef {
		vd.DecRef(ctx)
	}
	for _, mnt := range mountsToDecRef {
		mnt.DecRef(ctx)
	}
	return nil
}

// +stateify savable
type umountRecursiveOptions struct {
	// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
	// on umounted mounts fail.
	//
	// eager is analogous to Linux's UMOUNT_SYNC.
	eager bool

	// If disconnectHierarchy is true, Mounts that are umounted hierarchically
	// should be disconnected from their parents. (Mounts whose parents are not
	// umounted, which in most cases means the Mount passed to the initial call
	// to umountRecursiveLocked, are unconditionally disconnected for
	// consistency with Linux.)
	//
	// disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED.
	disconnectHierarchy bool
}

// umountRecursiveLocked marks mnt and its descendants as umounted. It does not
// release mount or dentry references; instead, it appends VirtualDentries and
// Mounts on which references must be dropped to vdsToDecRef and mountsToDecRef
// respectively, and returns updated slices. (This is necessary because
// filesystem locks possibly taken by DentryImpl.DecRef() may precede
// vfs.mountMu in the lock order, and Mount.DecRef() may lock vfs.mountMu.)
//
// umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree().
//
// Preconditions:
// * vfs.mountMu must be locked.
// * vfs.mounts.seq must be in a writer critical section.
func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) {
	if !mnt.umounted {
		mnt.umounted = true
		mountsToDecRef = append(mountsToDecRef, mnt)
		if parent := mnt.parent(); parent != nil && (opts.disconnectHierarchy || !parent.umounted) {
			vdsToDecRef = append(vdsToDecRef, vfs.disconnectLocked(mnt))
		}
	}
	if opts.eager {
		for {
			refs := atomic.LoadInt64(&mnt.refs)
			if refs < 0 {
				break
			}
			if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs|math.MinInt64) {
				break
			}
		}
	}
	for child := range mnt.children {
		vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(child, opts, vdsToDecRef, mountsToDecRef)
	}
	return vdsToDecRef, mountsToDecRef
}

// connectLocked makes vd the mount parent/point for mnt. It consumes
// references held by vd.
//
// Preconditions:
// * vfs.mountMu must be locked.
// * vfs.mounts.seq must be in a writer critical section.
// * d.mu must be locked.
// * mnt.parent() == nil, i.e. mnt must not already be connected.
func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
	if checkInvariants {
		if mnt.parent() != nil {
			panic("VFS.connectLocked called on connected mount")
		}
	}
	mnt.IncRef() // dropped by callers of umountRecursiveLocked
	mnt.setKey(vd)
	if vd.mount.children == nil {
		vd.mount.children = make(map[*Mount]struct{})
	}
	vd.mount.children[mnt] = struct{}{}
	atomic.AddUint32(&vd.dentry.mounts, 1)
	mnt.ns = mntns
	mntns.mountpoints[vd.dentry]++
	vfs.mounts.insertSeqed(mnt)
	vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
	if !ok {
		vfsmpmounts = make(map[*Mount]struct{})
		vfs.mountpoints[vd.dentry] = vfsmpmounts
	}
	vfsmpmounts[mnt] = struct{}{}
}

// disconnectLocked makes mnt have no mount parent/point and returns its old
// mount parent/point with a reference held.
//
// Preconditions:
// * vfs.mountMu must be locked.
// * vfs.mounts.seq must be in a writer critical section.
// * mnt.parent() != nil.
func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
	vd := mnt.getKey()
	if checkInvariants {
		if vd.mount == nil {
			panic("VFS.disconnectLocked called on disconnected mount")
		}
	}
	mnt.loadKey(VirtualDentry{})
	delete(vd.mount.children, mnt)
	atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
	mnt.ns.mountpoints[vd.dentry]--
	if mnt.ns.mountpoints[vd.dentry] == 0 {
		delete(mnt.ns.mountpoints, vd.dentry)
	}
	vfs.mounts.removeSeqed(mnt)
	vfsmpmounts := vfs.mountpoints[vd.dentry]
	delete(vfsmpmounts, mnt)
	if len(vfsmpmounts) == 0 {
		delete(vfs.mountpoints, vd.dentry)
	}
	return vd
}

// tryIncMountedRef increments mnt's reference count and returns true. If mnt's
// reference count is already zero, or has been eagerly umounted,
// tryIncMountedRef does nothing and returns false.
//
// tryIncMountedRef does not require that a reference is held on mnt.
func (mnt *Mount) tryIncMountedRef() bool {
	for {
		r := atomic.LoadInt64(&mnt.refs)
		if r <= 0 { // r < 0 => MSB set => eagerly unmounted
			return false
		}
		if atomic.CompareAndSwapInt64(&mnt.refs, r, r+1) {
			if mnt.LogRefs() {
				refsvfs2.LogTryIncRef(mnt, r+1)
			}
			return true
		}
	}
}

// IncRef increments mnt's reference count.
func (mnt *Mount) IncRef() {
	// In general, negative values for mnt.refs are valid because the MSB is
	// the eager-unmount bit.
	r := atomic.AddInt64(&mnt.refs, 1)
	if mnt.LogRefs() {
		refsvfs2.LogIncRef(mnt, r)
	}
}
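// An illustrative note, not part of the original source: the refs encoding
// above can be exercised in isolation. With the MSB as the eager-unmount
// flag and the low 63 bits as the count,
//
//	refs := int64(2)          // two live references
//	refs |= math.MinInt64     // eager unmount: set the MSB
//	_ = refs < 0              // true: tryIncMountedRef now fails
//	_ = refs &^ math.MinInt64 // 2: count survives, as read by DecRef/UmountAt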
// DecRef decrements mnt's reference count.
func (mnt *Mount) DecRef(ctx context.Context) {
	r := atomic.AddInt64(&mnt.refs, -1)
	if mnt.LogRefs() {
		refsvfs2.LogDecRef(mnt, r)
	}
	if r&^math.MinInt64 == 0 { // mask out MSB
		refsvfs2.Unregister(mnt)
		mnt.destroy(ctx)
	}
}

func (mnt *Mount) destroy(ctx context.Context) {
	var vd VirtualDentry
	if mnt.parent() != nil {
		mnt.vfs.mountMu.Lock()
		mnt.vfs.mounts.seq.BeginWrite()
		vd = mnt.vfs.disconnectLocked(mnt)
		mnt.vfs.mounts.seq.EndWrite()
		mnt.vfs.mountMu.Unlock()
	}
	if mnt.root != nil {
		mnt.root.DecRef(ctx)
	}
	mnt.fs.DecRef(ctx)
	if vd.Ok() {
		vd.DecRef(ctx)
	}
}

// RefType implements refsvfs2.CheckedObject.Type.
func (mnt *Mount) RefType() string {
	return "vfs.Mount"
}

// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
func (mnt *Mount) LeakMessage() string {
	return fmt.Sprintf("[vfs.Mount %p] reference count of %d instead of 0", mnt, atomic.LoadInt64(&mnt.refs))
}

// LogRefs implements refsvfs2.CheckedObject.LogRefs.
//
// This should only be set to true for debugging purposes, as it can generate
// an extremely large amount of output and drastically degrade performance.
func (mnt *Mount) LogRefs() bool {
	return false
}

// DecRef decrements mntns' reference count.
func (mntns *MountNamespace) DecRef(ctx context.Context) {
	vfs := mntns.root.fs.VirtualFilesystem()
	mntns.MountNamespaceRefs.DecRef(func() {
		vfs.mountMu.Lock()
		vfs.mounts.seq.BeginWrite()
		vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{
			disconnectHierarchy: true,
		}, nil, nil)
		vfs.mounts.seq.EndWrite()
		vfs.mountMu.Unlock()
		for _, vd := range vdsToDecRef {
			vd.DecRef(ctx)
		}
		for _, mnt := range mountsToDecRef {
			mnt.DecRef(ctx)
		}
	})
}

// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
// a reference on the returned Mount. If (mnt, d) is not a mount point,
// getMountAt returns nil.
//
// getMountAt is analogous to Linux's fs/namei.c:follow_mount().
//
// Preconditions: References are held on mnt and d.
func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount {
	// The first mount is special-cased:
	//
	// - The caller is assumed to have checked d.isMounted() already. (This
	// isn't a precondition because it doesn't matter for correctness.)
	//
	// - We return nil, instead of mnt, if there is no mount at (mnt, d).
	//
	// - We don't drop the caller's references on mnt and d.
retryFirst:
	next := vfs.mounts.Lookup(mnt, d)
	if next == nil {
		return nil
	}
	if !next.tryIncMountedRef() {
		// Raced with umount.
		goto retryFirst
	}
	mnt = next
	d = next.root
	// We don't need to take Dentry refs anywhere in this function because
	// Mounts hold references on Mount.root, which is immutable.
	for d.isMounted() {
		next := vfs.mounts.Lookup(mnt, d)
		if next == nil {
			break
		}
		if !next.tryIncMountedRef() {
			// Raced with umount.
			continue
		}
		mnt.DecRef(ctx)
		mnt = next
		d = next.root
	}
	return mnt
}

// getMountpointAt returns the mount point for the stack of Mounts including
// mnt. It takes a reference on the returned VirtualDentry. If no such mount
// point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil).
//
// Preconditions:
// * References are held on mnt and root.
// * vfsroot is not (mnt, mnt.root).
func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
	// The first mount is special-cased:
	//
	// - The caller must have already checked mnt against vfsroot.
	//
	// - We return nil, instead of mnt, if there is no mount point for mnt.
	//
	// - We don't drop the caller's reference on mnt.
retryFirst:
	epoch := vfs.mounts.seq.BeginRead()
	parent, point := mnt.parent(), mnt.point()
	if !vfs.mounts.seq.ReadOk(epoch) {
		goto retryFirst
	}
	if parent == nil {
		return VirtualDentry{}
	}
	if !parent.tryIncMountedRef() {
		// Raced with umount.
		goto retryFirst
	}
	if !point.TryIncRef() {
		// Since Mount holds a reference on Mount.key.point, this can only
		// happen due to a racing change to Mount.key.
		parent.DecRef(ctx)
		goto retryFirst
	}
	if !vfs.mounts.seq.ReadOk(epoch) {
		point.DecRef(ctx)
		parent.DecRef(ctx)
		goto retryFirst
	}
	mnt = parent
	d := point
	for {
		if mnt == vfsroot.mount && d == vfsroot.dentry {
			break
		}
		if d != mnt.root {
			break
		}
	retryNotFirst:
		epoch := vfs.mounts.seq.BeginRead()
		parent, point := mnt.parent(), mnt.point()
		if !vfs.mounts.seq.ReadOk(epoch) {
			goto retryNotFirst
		}
		if parent == nil {
			break
		}
		if !parent.tryIncMountedRef() {
			// Raced with umount.
			goto retryNotFirst
		}
		if !point.TryIncRef() {
			// Since Mount holds a reference on Mount.key.point, this can
			// only happen due to a racing change to Mount.key.
			parent.DecRef(ctx)
			goto retryNotFirst
		}
		if !vfs.mounts.seq.ReadOk(epoch) {
			point.DecRef(ctx)
			parent.DecRef(ctx)
			goto retryNotFirst
		}
		d.DecRef(ctx)
		mnt.DecRef(ctx)
		mnt = parent
		d = point
	}
	return VirtualDentry{mnt, d}
}

// SetMountReadOnly sets the mount as ReadOnly.
func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error {
	vfs.mountMu.Lock()
	defer vfs.mountMu.Unlock()
	return mnt.setReadOnlyLocked(ro)
}

// CheckBeginWrite increments the counter of in-progress write operations on
// mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns
// EROFS.
//
// If CheckBeginWrite succeeds, EndWrite must be called when the write
// operation is finished.
func (mnt *Mount) CheckBeginWrite() error {
	if atomic.AddInt64(&mnt.writers, 1) < 0 {
		atomic.AddInt64(&mnt.writers, -1)
		return linuxerr.EROFS
	}
	return nil
}

// EndWrite indicates that a write operation signaled by a previous successful
// call to CheckBeginWrite has finished.
func (mnt *Mount) EndWrite() {
	atomic.AddInt64(&mnt.writers, -1)
}

// Preconditions: VirtualFilesystem.mountMu must be locked.
func (mnt *Mount) setReadOnlyLocked(ro bool) error {
	if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro {
		return nil
	}
	if ro {
		if !atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) {
			return linuxerr.EBUSY
		}
		return nil
	}
	// Unset MSB without dropping any temporary increments from failed calls to
	// mnt.CheckBeginWrite().
	atomic.AddInt64(&mnt.writers, math.MinInt64)
	return nil
}

// ReadOnly returns true if mount is readonly.
func (mnt *Mount) ReadOnly() bool {
	return atomic.LoadInt64(&mnt.writers) < 0
}

// Filesystem returns the mounted Filesystem. It does not take a reference on
// the returned Filesystem.
func (mnt *Mount) Filesystem() *Filesystem {
	return mnt.fs
}

// submountsLocked returns this Mount and all Mounts that are descendants of
// it.
//
// Precondition: mnt.vfs.mountMu must be held.
func (mnt *Mount) submountsLocked() []*Mount {
	mounts := []*Mount{mnt}
	for m := range mnt.children {
		mounts = append(mounts, m.submountsLocked()...)
	}
	return mounts
}

// Root returns the mount's root. It does not take a reference on the returned
// Dentry.
func (mnt *Mount) Root() *Dentry {
	return mnt.root
}

// Root returns mntns' root. It does not take a reference on the returned
// Dentry.
func (mntns *MountNamespace) Root() VirtualDentry {
	vd := VirtualDentry{
		mount:  mntns.root,
		dentry: mntns.root.root,
	}
	return vd
}

// GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf.
//
// Preconditions: taskRootDir.Ok().
func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
	rootMnt := taskRootDir.mount
	vfs.mountMu.Lock()
	mounts := rootMnt.submountsLocked()
	// Take a reference on mounts since we need to drop vfs.mountMu before
	// calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()).
	for _, mnt := range mounts {
		mnt.IncRef()
	}
	vfs.mountMu.Unlock()
	defer func() {
		for _, mnt := range mounts {
			mnt.DecRef(ctx)
		}
	}()
	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })

	for _, mnt := range mounts {
		// Get the path to this mount relative to task root.
		mntRootVD := VirtualDentry{
			mount:  mnt,
			dentry: mnt.root,
		}
		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
		if err != nil {
			// For some reason we didn't get a path. Log a warning
			// and run with empty path.
			ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err)
			path = ""
		}
		if path == "" {
			// Either an error occurred, or path is not reachable
			// from root.
			break
		}

		opts := "rw"
		if mnt.ReadOnly() {
			opts = "ro"
		}
		if mnt.Flags.NoATime {
			opts += ",noatime"
		}
		if mnt.Flags.NoExec {
			opts += ",noexec"
		}
		if mopts := mnt.fs.Impl().MountOptions(); mopts != "" {
			opts += "," + mopts
		}

		// Format:
		// <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
		//
		// The "needs dump" and "fsck order" flags are always 0, which
		// is allowed.
		fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0)
	}
}

// GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to
// buf.
//
// Preconditions: taskRootDir.Ok().
func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
	rootMnt := taskRootDir.mount
	vfs.mountMu.Lock()
	mounts := rootMnt.submountsLocked()
	// Take a reference on mounts since we need to drop vfs.mountMu before
	// calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or
	// vfs.StatAt() (=> FilesystemImpl.StatAt()).
	for _, mnt := range mounts {
		mnt.IncRef()
	}
	vfs.mountMu.Unlock()
	defer func() {
		for _, mnt := range mounts {
			mnt.DecRef(ctx)
		}
	}()
	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })

	creds := auth.CredentialsFromContext(ctx)
	for _, mnt := range mounts {
		// Get the path to this mount relative to task root.
		mntRootVD := VirtualDentry{
			mount:  mnt,
			dentry: mnt.root,
		}
		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
		if err != nil {
			// For some reason we didn't get a path. Log a warning
			// and run with empty path.
			ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err)
			path = ""
		}
		if path == "" {
			// Either an error occurred, or path is not reachable
			// from root.
			break
		}
		// Stat the mount root to get the major/minor device numbers.
		pop := &PathOperation{
			Root:  mntRootVD,
			Start: mntRootVD,
		}
		statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{})
		if err != nil {
			// Well that's not good. Ignore this mount.
			ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err)
			break
		}

		// Format:
		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)

		// (1) Mount ID.
		fmt.Fprintf(buf, "%d ", mnt.ID)

		// (2) Parent ID (or this ID if there is no parent).
		// Note that even if the call to mnt.parent() races with Mount
		// destruction (which is possible since we're not holding vfs.mountMu),
		// its Mount.ID will still be valid.
		pID := mnt.ID
		if p := mnt.parent(); p != nil {
			pID = p.ID
		}
		fmt.Fprintf(buf, "%d ", pID)

		// (3) Major:Minor device ID. We don't have a superblock, so we
		// just use the root inode device number.
		fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor)

		// (4) Root: the pathname of the directory in the filesystem
		// which forms the root of this mount.
		//
		// NOTE(b/78135857): This will always be "/" until we implement
		// bind mounts.
		fmt.Fprintf(buf, "/ ")

		// (5) Mount point (relative to process root).
		fmt.Fprintf(buf, "%s ", manglePath(path))

		// (6) Mount options.
		opts := "rw"
		if mnt.ReadOnly() {
			opts = "ro"
		}
		if mnt.Flags.NoATime {
			opts += ",noatime"
		}
		if mnt.Flags.NoExec {
			opts += ",noexec"
		}
		fmt.Fprintf(buf, "%s ", opts)

		// (7) Optional fields: zero or more fields of the form "tag[:value]".
		// (8) Separator: the end of the optional fields is marked by a single hyphen.
		fmt.Fprintf(buf, "- ")

		// (9) Filesystem type.
		fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name())

		// (10) Mount source: filesystem-specific information or "none".
		fmt.Fprintf(buf, "none ")

		// (11) Superblock options, and final newline.
		fmt.Fprintf(buf, "%s\n", superBlockOpts(path, mnt))
	}
}

// manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents.
// See Linux fs/seq_file.c:mangle_path.
func manglePath(p string) string {
	r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
	return r.Replace(p)
}

// superBlockOpts returns the super block options string for the mount at the
// given path.
func superBlockOpts(mountPath string, mnt *Mount) string {
	// Compose super block options by combining global mount flags with
	// FS-specific mount options.
	opts := "rw"
	if mnt.ReadOnly() {
		opts = "ro"
	}
	if mopts := mnt.fs.Impl().MountOptions(); mopts != "" {
		opts += "," + mopts
	}

	// NOTE(b/147673608): If the mount is a ramdisk-based fake cgroupfs, we
	// also need to include the cgroup name in the options. For now we just
	// read that from the path. Note that this is only possible when "cgroup"
	// isn't registered as a valid filesystem type.
	//
	// TODO(gvisor.dev/issue/190): Once we remove fake cgroupfs support, we
	// should remove this.
	if cgroupfs := mnt.vfs.getFilesystemType("cgroup"); cgroupfs != nil && cgroupfs.opts.AllowUserMount {
		// Real cgroupfs available.
		return opts
	}
	if mnt.fs.FilesystemType().Name() == "cgroup" {
		splitPath := strings.Split(mountPath, "/")
		cgroupType := splitPath[len(splitPath)-1]
		opts += "," + cgroupType
	}
	return opts
}
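// Illustrative, runnable demo of the path mangling above, not part of the
// original sources: whitespace and backslashes become octal escapes, as in
// Linux's fs/seq_file.c:mangle_path. The replacer is copied from manglePath.
package main

import (
	"fmt"
	"strings"
)

func main() {
	r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
	fmt.Println(r.Replace("/mnt/my disk")) // /mnt/my\040disk
}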
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package inet

// Namespace represents a network namespace. See network_namespaces(7).
//
// +stateify savable
type Namespace struct {
	// stack is the network stack implementation of this network namespace.
	stack Stack `state:"nosave"`

	// creator allows the kernel to create new network stacks for network
	// namespaces. If nil, no networking will function if the network is
	// namespaced.
	//
	// At afterLoad(), creator will be used to create the network stack.
	// Stateify needs to wait for this field to be loaded before calling
	// afterLoad().
	creator NetworkStackCreator `state:"wait"`

	// isRoot indicates whether this is the root network namespace.
	isRoot bool
}

// NewRootNamespace creates the root network namespace, with creator allowing
// new network namespaces to be created. If creator is nil, no networking will
// function if the network is namespaced.
func NewRootNamespace(stack Stack, creator NetworkStackCreator) *Namespace {
	return &Namespace{
		stack:   stack,
		creator: creator,
		isRoot:  true,
	}
}

// NewNamespace creates a new network namespace from the root.
func NewNamespace(root *Namespace) *Namespace {
	n := &Namespace{
		creator: root.creator,
	}
	n.init()
	return n
}

// Stack returns the network stack of n. Stack may return nil if no network
// stack is configured.
func (n *Namespace) Stack() Stack {
	return n.stack
}

// IsRoot returns whether n is the root network namespace.
func (n *Namespace) IsRoot() bool {
	return n.isRoot
}

// RestoreRootStack restores the root network namespace with stack. This should
// only be called when restoring the kernel.
func (n *Namespace) RestoreRootStack(stack Stack) {
	if !n.isRoot {
		panic("RestoreRootStack can only be called on root network namespace")
	}
	if n.stack != nil {
		panic("RestoreRootStack called after a stack has already been set")
	}
	n.stack = stack
}

func (n *Namespace) init() {
	// The root network namespace will have its stack assigned later.
	if n.isRoot {
		return
	}
	if n.creator != nil {
		var err error
		n.stack, err = n.creator.CreateStack()
		if err != nil {
			panic(err)
		}
	}
}

// afterLoad is invoked by stateify.
func (n *Namespace) afterLoad() {
	n.init()
}

// NetworkStackCreator allows new instances of a network stack to be created.
// It is used by the kernel to create new network namespaces when requested.
type NetworkStackCreator interface {
	// CreateStack creates a new network stack for a network namespace.
	CreateStack() (Stack, error)
}
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package futex provides an implementation of the futex interface as found in
// the Linux kernel. It allows one to easily transform Wait() calls into waits
// on a channel, which is useful in a Go-based kernel, for example.
package futex

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
)

// KeyKind indicates the type of a Key.
type KeyKind int

const (
	// KindPrivate indicates a private futex (a futex syscall with the
	// FUTEX_PRIVATE_FLAG set).
	KindPrivate KeyKind = iota

	// KindSharedPrivate indicates a shared futex on a private memory mapping.
	// Although KindPrivate and KindSharedPrivate futexes both use memory
	// addresses to identify futexes, they do not interoperate (in Linux, the
	// two are distinguished by the FUT_OFF_MMSHARED flag, which is used in key
	// comparison).
	KindSharedPrivate

	// KindSharedMappable indicates a shared futex on a memory mapping other
	// than a private anonymous memory mapping.
	KindSharedMappable
)

// Key represents something that a futex waiter may wait on.
type Key struct {
	// Kind is the type of the Key.
	Kind KeyKind

	// Mappable is the memory-mapped object that is represented by the Key.
	// Mappable is always nil if Kind is not KindSharedMappable, and may be
	// nil even if it is.
	Mappable memmap.Mappable

	// MappingIdentity is the MappingIdentity associated with Mappable.
	// MappingIdentity is always nil if Mappable is nil, and may be nil even
	// if it isn't.
	MappingIdentity memmap.MappingIdentity

	// If Kind is KindPrivate or KindSharedPrivate, Offset is the represented
	// memory address. Otherwise, Offset is the represented offset into
	// Mappable.
	Offset uint64
}

func (k *Key) release(t Target) {
	if k.MappingIdentity != nil {
		k.MappingIdentity.DecRef(t)
	}
	k.Mappable = nil
	k.MappingIdentity = nil
}

func (k *Key) clone() Key {
	if k.MappingIdentity != nil {
		k.MappingIdentity.IncRef()
	}
	return *k
}

// Preconditions: k.Kind == KindPrivate or KindSharedPrivate.
func (k *Key) addr() hostarch.Addr {
	return hostarch.Addr(k.Offset)
}

// matches returns true if a wakeup on k2 should wake a waiter waiting on k.
func (k *Key) matches(k2 *Key) bool {
	// k.MappingIdentity is ignored; it's only used for reference counting.
	return k.Kind == k2.Kind && k.Mappable == k2.Mappable && k.Offset == k2.Offset
}

// Target abstracts memory accesses and keys.
type Target interface {
	context.Context

	// SwapUint32 gives access to hostarch.IO.SwapUint32.
	SwapUint32(addr hostarch.Addr, new uint32) (uint32, error)

	// CompareAndSwapUint32 gives access to
	// hostarch.IO.CompareAndSwapUint32.
	CompareAndSwapUint32(addr hostarch.Addr, old, new uint32) (uint32, error)

	// LoadUint32 gives access to hostarch.IO.LoadUint32.
	LoadUint32(addr hostarch.Addr) (uint32, error)

	// GetSharedKey returns a Key with kind KindSharedPrivate or
	// KindSharedMappable corresponding to the memory mapped at address addr.
	//
	// If GetSharedKey returns a Key with a non-nil MappingIdentity, a
	// reference is held on the MappingIdentity, which must be dropped by the
	// caller when the Key is no longer in use.
	GetSharedKey(addr hostarch.Addr) (Key, error)
}

// check performs a basic equality check on the given address.
func check(t Target, addr hostarch.Addr, val uint32) error {
	cur, err := t.LoadUint32(addr)
	if err != nil {
		return err
	}
	if cur != val {
		return linuxerr.EAGAIN
	}
	return nil
}

// atomicOp performs a complex operation on the given address.
func atomicOp(t Target, addr hostarch.Addr, opIn uint32) (bool, error) {
	opType := (opIn >> 28) & 0xf
	cmp := (opIn >> 24) & 0xf
	opArg := (opIn >> 12) & 0xfff
	cmpArg := opIn & 0xfff

	if opType&linux.FUTEX_OP_OPARG_SHIFT != 0 {
		opArg = 1 << opArg
		opType &^= linux.FUTEX_OP_OPARG_SHIFT // Clear flag.
	}

	var (
		oldVal uint32
		err    error
	)
	if opType == linux.FUTEX_OP_SET {
		oldVal, err = t.SwapUint32(addr, opArg)
		if err != nil {
			return false, err
		}
	} else {
		for {
			oldVal, err = t.LoadUint32(addr)
			if err != nil {
				return false, err
			}
			var newVal uint32
			switch opType {
			case linux.FUTEX_OP_ADD:
				newVal = oldVal + opArg
			case linux.FUTEX_OP_OR:
				newVal = oldVal | opArg
			case linux.FUTEX_OP_ANDN:
				newVal = oldVal &^ opArg
			case linux.FUTEX_OP_XOR:
				newVal = oldVal ^ opArg
			default:
				return false, syserror.ENOSYS
			}
			prev, err := t.CompareAndSwapUint32(addr, oldVal, newVal)
			if err != nil {
				return false, err
			}
			if prev == oldVal {
				break // Success.
			}
		}
	}

	switch cmp {
	case linux.FUTEX_OP_CMP_EQ:
		return oldVal == cmpArg, nil
	case linux.FUTEX_OP_CMP_NE:
		return oldVal != cmpArg, nil
	case linux.FUTEX_OP_CMP_LT:
		return oldVal < cmpArg, nil
	case linux.FUTEX_OP_CMP_LE:
		return oldVal <= cmpArg, nil
	case linux.FUTEX_OP_CMP_GT:
		return oldVal > cmpArg, nil
	case linux.FUTEX_OP_CMP_GE:
		return oldVal >= cmpArg, nil
	default:
		return false, syserror.ENOSYS
	}
}

// Waiter is the struct that gets enqueued into buckets for wake up routines
// and requeue routines to scan and notify. Once a Waiter has been enqueued by
// WaitPrepare(), callers may listen on C for wake up events.
type Waiter struct {
	// Synchronization:
	//
	// - A Waiter that is not enqueued in a bucket is exclusively owned (no
	// synchronization applies).
	//
	// - A Waiter is enqueued in a bucket by calling WaitPrepare(). After
	// this, waiterEntry, bucket, and key are protected by the bucket.mu
	// ("bucket lock") of the containing bucket, and bitmask is immutable.
	// Note that since bucket is mutated using atomic memory operations,
	// bucket.Load() may be called without holding the bucket lock, although
	// it may change racily. See WaitComplete().
	//
	// - A Waiter is only guaranteed to be no longer queued after calling
	// WaitComplete().

	// waiterEntry links Waiter into bucket.waiters.
	waiterEntry

	// bucket is the bucket this waiter is queued in. If bucket is nil, the
	// waiter is not waiting and is not in any bucket.
	bucket AtomicPtrBucket

	// C is sent to when the Waiter is woken.
	C chan struct{}

	// key is what this waiter is waiting on.
	key Key

	// The bitmask we're waiting on.
	// This is used in the case of a FUTEX_WAKE_BITSET.
	bitmask uint32

	// tid is the thread ID for the waiter in case this is a PI mutex.
	tid uint32
}

// NewWaiter returns a new unqueued Waiter.
func NewWaiter() *Waiter {
	return &Waiter{
		C: make(chan struct{}, 1),
	}
}

// woken returns true if w has been woken since the last call to WaitPrepare.
func (w *Waiter) woken() bool {
	return len(w.C) != 0
}

// bucket holds a list of waiters for a given address hash.
// // +stateify savable type bucket struct { // mu protects waiters and contained Waiter state. See comment in Waiter. mu sync.Mutex `state:"nosave"` waiters waiterList `state:"zerovalue"` } // wakeLocked wakes up to n waiters matching the bitmask at the addr for this // bucket and returns the number of waiters woken. // // Preconditions: b.mu must be locked. func (b *bucket) wakeLocked(key *Key, bitmask uint32, n int) int { done := 0 for w := b.waiters.Front(); done < n && w != nil; { if !w.key.matches(key) || w.bitmask&bitmask == 0 { // Not matching. w = w.Next() continue } // Remove from the bucket and wake the waiter. woke := w w = w.Next() // Next iteration. b.wakeWaiterLocked(woke) done++ } return done } func (b *bucket) wakeWaiterLocked(w *Waiter) { // Remove from the bucket and wake the waiter. b.waiters.Remove(w) w.C <- struct{}{} // NOTE: The above channel write establishes a write barrier according // to the memory model, so nothing may be ordered around it. Since // we've dequeued w and will never touch it again, we can safely // store nil to w.bucket here and allow the WaitComplete() to // short-circuit grabbing the bucket lock. If they somehow miss the // store, we are still holding the lock, so we can know that they won't // dequeue w, assume it's free and have the below operation // afterwards. w.bucket.Store(nil) } // requeueLocked takes n waiters from the bucket and moves them to naddr on the // bucket "to". // // Preconditions: b and to must be locked. func (b *bucket) requeueLocked(t Target, to *bucket, key, nkey *Key, n int) int { done := 0 for w := b.waiters.Front(); done < n && w != nil; { if !w.key.matches(key) { // Not matching. w = w.Next() continue } requeued := w w = w.Next() // Next iteration. b.waiters.Remove(requeued) requeued.key.release(t) requeued.key = nkey.clone() to.waiters.PushBack(requeued) requeued.bucket.Store(to) done++ } return done } const ( // bucketCount is the number of buckets per Manager. By having many of // these we reduce contention when concurrent yet unrelated calls are made. bucketCount = 1 << bucketCountBits bucketCountBits = 10 ) // getKey returns a Key representing address addr in c. func getKey(t Target, addr hostarch.Addr, private bool) (Key, error) { // Ensure the address is aligned. // It must be a DWORD boundary. if addr&0x3 != 0 { return Key{}, linuxerr.EINVAL } if private { return Key{Kind: KindPrivate, Offset: uint64(addr)}, nil } return t.GetSharedKey(addr) } // bucketIndexForAddr returns the index into Manager.buckets for addr. func bucketIndexForAddr(addr hostarch.Addr) uintptr { // - The bottom 2 bits of addr must be 0, per getKey. // // - On amd64, the top 16 bits of addr (bits 48-63) must be equal to bit 47 // for a canonical address, and (on all existing platforms) bit 47 must be // 0 for an application address. // // Thus 19 bits of addr are "useless" for hashing, leaving only 45 "useful" // bits. We choose one of the simplest possible hash functions that at // least uses all 45 useful bits in the output, given that bucketCountBits // == 10. This hash function also has the property that it will usually map // adjacent addresses to adjacent buckets, slightly improving memory // locality when an application synchronization structure uses multiple // nearby futexes. // // Note that despite the large number of arithmetic operations in the // function, many components can be computed in parallel, such that the // critical path is 1 bit shift + 3 additions (2 in h1, then h1 + h2). 
This // is also why h1 and h2 are grouped separately; for "(addr >> 2) + ... + // (addr >> 42)" without any additional grouping, the compiler puts all 4 // additions in the critical path. h1 := uintptr(addr>>2) + uintptr(addr>>12) + uintptr(addr>>22) h2 := uintptr(addr>>32) + uintptr(addr>>42) return (h1 + h2) % bucketCount } // Manager holds futex state for a single virtual address space. // // +stateify savable type Manager struct { // privateBuckets holds buckets for KindPrivate and KindSharedPrivate // futexes. privateBuckets [bucketCount]bucket `state:"zerovalue"` // sharedBucket is the bucket for KindSharedMappable futexes. sharedBucket // may be shared by multiple Managers. The sharedBucket pointer is // immutable. sharedBucket *bucket } // NewManager returns an initialized futex manager. func NewManager() *Manager { return &Manager{ sharedBucket: &bucket{}, } } // Fork returns a new Manager. Shared futex clients using the returned Manager // may interoperate with those using m. func (m *Manager) Fork() *Manager { return &Manager{ sharedBucket: m.sharedBucket, } } // lockBucket returns a locked bucket for the given key. // +checklocksacquire:b.mu func (m *Manager) lockBucket(k *Key) (b *bucket) { if k.Kind == KindSharedMappable { b = m.sharedBucket } else { b = &m.privateBuckets[bucketIndexForAddr(k.addr())] } b.mu.Lock() return b } // lockBuckets returns locked buckets for the given keys. // +checklocksacquire:b1.mu // +checklocksacquire:b2.mu func (m *Manager) lockBuckets(k1, k2 *Key) (b1 *bucket, b2 *bucket) { // Buckets must be consistently ordered to avoid circular lock // dependencies. We order buckets in m.privateBuckets by index (lowest // index first), and all buckets in m.privateBuckets precede // m.sharedBucket. // Handle the common case first: if k1.Kind != KindSharedMappable && k2.Kind != KindSharedMappable { i1 := bucketIndexForAddr(k1.addr()) i2 := bucketIndexForAddr(k2.addr()) b1 = &m.privateBuckets[i1] b2 = &m.privateBuckets[i2] switch { case i1 < i2: b1.mu.Lock() b2.mu.Lock() case i2 < i1: b2.mu.Lock() b1.mu.Lock() default: b1.mu.Lock() } return b1, b2 // +checklocksforce } // At least one of b1 or b2 should be m.sharedBucket. b1 = m.sharedBucket b2 = m.sharedBucket if k1.Kind != KindSharedMappable { b1 = m.lockBucket(k1) } else if k2.Kind != KindSharedMappable { b2 = m.lockBucket(k2) } m.sharedBucket.mu.Lock() return b1, b2 // +checklocksforce } // unlockBuckets unlocks two buckets. // +checklocksrelease:b1.mu // +checklocksrelease:b2.mu func (m *Manager) unlockBuckets(b1, b2 *bucket) { b1.mu.Unlock() if b1 != b2 { b2.mu.Unlock() } return // +checklocksforce } // Wake wakes up to n waiters matching the bitmask on the given addr. // The number of waiters woken is returned. func (m *Manager) Wake(t Target, addr hostarch.Addr, private bool, bitmask uint32, n int) (int, error) { // This function is very hot; avoid defer. k, err := getKey(t, addr, private) if err != nil { return 0, err } b := m.lockBucket(&k) r := b.wakeLocked(&k, bitmask, n) b.mu.Unlock() k.release(t) return r, nil } func (m *Manager) doRequeue(t Target, addr, naddr hostarch.Addr, private bool, checkval bool, val uint32, nwake int, nreq int) (int, error) { k1, err := getKey(t, addr, private) if err != nil { return 0, err } defer k1.release(t) k2, err := getKey(t, naddr, private) if err != nil { return 0, err } defer k2.release(t) b1, b2 := m.lockBuckets(&k1, &k2) defer m.unlockBuckets(b1, b2) if checkval { if err := check(t, addr, val); err != nil { return 0, err } } // Wake the number required. 
	done := b1.wakeLocked(&k1, ^uint32(0), nwake)

	// Requeue the number required.
	b1.requeueLocked(t, b2, &k1, &k2, nreq)

	return done, nil
}

// Requeue wakes up to nwake waiters on the given addr, and unconditionally
// requeues up to nreq waiters on naddr.
func (m *Manager) Requeue(t Target, addr, naddr hostarch.Addr, private bool, nwake int, nreq int) (int, error) {
	return m.doRequeue(t, addr, naddr, private, false, 0, nwake, nreq)
}

// RequeueCmp atomically checks that the addr contains val (via the Target),
// wakes up to nwake waiters on addr, and then unconditionally requeues up to
// nreq waiters on naddr.
func (m *Manager) RequeueCmp(t Target, addr, naddr hostarch.Addr, private bool, val uint32, nwake int, nreq int) (int, error) {
	return m.doRequeue(t, addr, naddr, private, true, val, nwake, nreq)
}

// WakeOp atomically applies op to the memory address addr2, wakes up to
// nwake1 waiters unconditionally from addr1, and, based on the original value
// at addr2 and a comparison encoded in op, wakes up to nwake2 waiters from
// addr2. It returns the total number of waiters woken.
func (m *Manager) WakeOp(t Target, addr1, addr2 hostarch.Addr, private bool, nwake1 int, nwake2 int, op uint32) (int, error) {
	k1, err := getKey(t, addr1, private)
	if err != nil {
		return 0, err
	}
	defer k1.release(t)
	k2, err := getKey(t, addr2, private)
	if err != nil {
		return 0, err
	}
	defer k2.release(t)
	b1, b2 := m.lockBuckets(&k1, &k2)
	defer m.unlockBuckets(b1, b2)

	done := 0
	cond, err := atomicOp(t, addr2, op)
	if err != nil {
		return 0, err
	}

	// Wake up to nwake1 entries from the first bucket.
	done = b1.wakeLocked(&k1, ^uint32(0), nwake1)

	// Wake up to nwake2 entries from the second bucket if the operation
	// yielded true.
	if cond {
		done += b2.wakeLocked(&k2, ^uint32(0), nwake2)
	}

	return done, nil
}

// WaitPrepare atomically checks that addr contains val (via the Target), then
// enqueues w to be woken by a send to w.C. If WaitPrepare returns nil, the
// Waiter must be subsequently removed by calling WaitComplete, whether or not
// a wakeup is received on w.C.
func (m *Manager) WaitPrepare(w *Waiter, t Target, addr hostarch.Addr, private bool, val uint32, bitmask uint32) error {
	k, err := getKey(t, addr, private)
	if err != nil {
		return err
	}
	// Ownership of k is transferred to w below.

	// Prepare the Waiter before taking the bucket lock.
	select {
	case <-w.C:
	default:
	}
	w.key = k
	w.bitmask = bitmask

	b := m.lockBucket(&k)
	// This function is very hot; avoid defer.

	// Perform our atomic check.
	if err := check(t, addr, val); err != nil {
		b.mu.Unlock()
		w.key.release(t)
		return err
	}

	// Add the waiter to the bucket.
	b.waiters.PushBack(w)
	w.bucket.Store(b)

	b.mu.Unlock()
	return nil
}

// WaitComplete must be called when a Waiter previously added by WaitPrepare
// is no longer eligible to be woken.
func (m *Manager) WaitComplete(w *Waiter, t Target) {
	// Remove w from the bucket it's in.
	for {
		b := w.bucket.Load()

		// If b is nil, the waiter isn't in any bucket anymore. This can't be
		// racy because the waiter can't be concurrently re-queued in another
		// bucket.
		if b == nil {
			break
		}

		// Take the bucket lock. Note that without holding the bucket lock,
		// the waiter is not guaranteed to stay in that bucket, so after we
		// take the bucket lock, we must ensure that the bucket hasn't
		// changed: if it happens to have changed, we release the old bucket
		// lock and try again with the new bucket; if it hasn't changed, we
		// know it won't change now because we hold the lock.
		b.mu.Lock()
		if b != w.bucket.Load() {
			b.mu.Unlock()
			continue
		}

		// Remove waiter from bucket.
		b.waiters.Remove(w)
		w.bucket.Store(nil)
		b.mu.Unlock()
		break
	}

	// Release references held by the waiter.
	w.key.release(t)
}

// LockPI attempts to lock the futex following the Priority-inheritance futex
// rules. The lock is acquired only when 'addr' points to 0. The TID of the
// calling task is written to 'addr' to indicate that the futex is owned. It
// returns true if the futex was successfully acquired.
//
// FUTEX_OWNER_DIED is only set by Linux when robust lists are in use (see
// exit_robust_list()). Since we don't support robust lists, it is never set
// here, although it is handled below.
func (m *Manager) LockPI(w *Waiter, t Target, addr hostarch.Addr, tid uint32, private, try bool) (bool, error) {
	k, err := getKey(t, addr, private)
	if err != nil {
		return false, err
	}
	// Ownership of k is transferred to w below.

	// Prepare the Waiter before taking the bucket lock.
	select {
	case <-w.C:
	default:
	}
	w.key = k
	w.tid = tid

	b := m.lockBucket(&k)
	// Hot function: avoid defers.

	success, err := m.lockPILocked(w, t, addr, tid, b, try)
	if err != nil {
		w.key.release(t)
		b.mu.Unlock()
		return false, err
	}
	if success || try {
		// Release waiter if it's not going to be a wait.
		w.key.release(t)
	}
	b.mu.Unlock()
	return success, nil
}

func (m *Manager) lockPILocked(w *Waiter, t Target, addr hostarch.Addr, tid uint32, b *bucket, try bool) (bool, error) {
	for {
		cur, err := t.LoadUint32(addr)
		if err != nil {
			return false, err
		}
		if (cur & linux.FUTEX_TID_MASK) == tid {
			return false, linuxerr.EDEADLK
		}

		if (cur & linux.FUTEX_TID_MASK) == 0 {
			// No owner and no waiters, try to acquire the futex.

			// Set TID and preserve owner died status.
			val := tid
			val |= cur & linux.FUTEX_OWNER_DIED
			prev, err := t.CompareAndSwapUint32(addr, cur, val)
			if err != nil {
				return false, err
			}
			if prev != cur {
				// CAS failed, retry...
				// Linux reacquires the bucket lock on retries, which will
				// re-lookup the mapping at the futex address. However,
				// retrying while holding the lock is more efficient and
				// reduces the chance of another conflict.
				continue
			}
			// Futex acquired.
			return true, nil
		}

		// Futex is already owned, prepare to wait.

		if try {
			// Caller doesn't want to wait.
			return false, nil
		}

		// Set waiters bit if not set yet.
		if cur&linux.FUTEX_WAITERS == 0 {
			prev, err := t.CompareAndSwapUint32(addr, cur, cur|linux.FUTEX_WAITERS)
			if err != nil {
				return false, err
			}
			if prev != cur {
				// CAS failed, retry...
				continue
			}
		}

		// Add the waiter to the bucket.
		b.waiters.PushBack(w)
		w.bucket.Store(b)
		return false, nil
	}
}

// UnlockPI unlocks the futex following the Priority-inheritance futex rules.
// The address provided must contain the caller's TID. If there are waiters,
// the TID of the next waiter (FIFO) is written to the given address, and that
// waiter is woken up. If there are no waiters, 0 is written to the address.
func (m *Manager) UnlockPI(t Target, addr hostarch.Addr, tid uint32, private bool) error {
	k, err := getKey(t, addr, private)
	if err != nil {
		return err
	}
	b := m.lockBucket(&k)

	err = m.unlockPILocked(t, addr, tid, b, &k)

	k.release(t)
	b.mu.Unlock()
	return err
}

func (m *Manager) unlockPILocked(t Target, addr hostarch.Addr, tid uint32, b *bucket, key *Key) error {
	cur, err := t.LoadUint32(addr)
	if err != nil {
		return err
	}

	if (cur & linux.FUTEX_TID_MASK) != tid {
		return linuxerr.EPERM
	}

	var next *Waiter  // Who's the next owner?
	var next2 *Waiter // Who's the one after that?
for w := b.waiters.Front(); w != nil; w = w.Next() { if !w.key.matches(key) { continue } if next == nil { next = w } else { next2 = w break } } if next == nil { // It's safe to set 0 because there are no waiters, no new owner, and the // executing task is the current owner (no owner died bit). prev, err := t.CompareAndSwapUint32(addr, cur, 0) if err != nil { return err } if prev != cur { // Let user mode handle CAS races. This is different than lock, which // retries when CAS fails. return linuxerr.EAGAIN } return nil } // Set next owner's TID, waiters if there are any. Resets owner died bit, if // set, because the executing task takes over as the owner. val := next.tid if next2 != nil { val |= linux.FUTEX_WAITERS } prev, err := t.CompareAndSwapUint32(addr, cur, val) if err != nil { return err } if prev != cur { return linuxerr.EINVAL } b.wakeWaiterLocked(next) return nil }
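// Example (sketch): atomicOp above decodes the 32-bit FUTEX_WAKE_OP operand
// as opType (bits 31:28), cmp (27:24), opArg (23:12), and cmpArg (11:0).
// encodeFutexOp is a hypothetical helper, not part of this package, that
// packs the fields exactly as atomicOp unpacks them, mirroring Linux's
// FUTEX_OP(op, oparg, cmp, cmparg) macro from include/uapi/linux/futex.h.
//
//	func encodeFutexOp(op, oparg, cmp, cmparg uint32) uint32 {
//		return (op&0xf)<<28 | (cmp&0xf)<<24 | (oparg&0xfff)<<12 | cmparg&0xfff
//	}
//
//	// "Add 1 to *addr2, then wake waiters on addr2 if the old value was
//	// greater than 0" (FUTEX_OP_ADD == 1, FUTEX_OP_CMP_GT == 4):
//	op := encodeFutexOp(linux.FUTEX_OP_ADD, 1, linux.FUTEX_OP_CMP_GT, 0)
//	_ = op // 0x14001000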
// Copyright 2019 The gVisor Authors.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build go1.13
// +build !go1.18

// When updating the build constraint (above), check that syncMutex matches
// the standard library sync.Mutex definition.

package sync

import (
	"sync"
	"sync/atomic"
	"unsafe"
)

// CrossGoroutineMutex is equivalent to Mutex, but it need not be unlocked by
// the same goroutine that locked the mutex.
type CrossGoroutineMutex struct {
	sync.Mutex
}

type syncMutex struct {
	state int32
	sema  uint32
}

func (m *CrossGoroutineMutex) state() *int32 {
	return &(*syncMutex)(unsafe.Pointer(&m.Mutex)).state
}

// Lock locks the underlying Mutex.
// +checklocksignore
func (m *CrossGoroutineMutex) Lock() {
	m.Mutex.Lock()
}

// Unlock unlocks the underlying Mutex.
// +checklocksignore
func (m *CrossGoroutineMutex) Unlock() {
	m.Mutex.Unlock()
}

const (
	mutexUnlocked = 0
	mutexLocked   = 1
)

// TryLock tries to acquire the mutex. It returns true if it succeeds and
// false otherwise. TryLock does not block.
func (m *CrossGoroutineMutex) TryLock() bool {
	if atomic.CompareAndSwapInt32(m.state(), mutexUnlocked, mutexLocked) {
		if RaceEnabled {
			RaceAcquire(unsafe.Pointer(&m.Mutex))
		}
		return true
	}
	return false
}

// Mutex is a mutual exclusion lock. The zero value for a Mutex is an unlocked
// mutex.
//
// A Mutex must not be copied after first use.
//
// A Mutex must be unlocked by the same goroutine that locked it. This
// invariant is enforced with the 'checklocks' build tag.
type Mutex struct {
	m CrossGoroutineMutex
}

// Lock locks m. If the lock is already in use, the calling goroutine blocks
// until the mutex is available.
// +checklocksignore
func (m *Mutex) Lock() {
	noteLock(unsafe.Pointer(m))
	m.m.Lock()
}

// Unlock unlocks m.
//
// Preconditions:
// * m is locked.
// * m was locked by this goroutine.
// +checklocksignore
func (m *Mutex) Unlock() {
	noteUnlock(unsafe.Pointer(m))
	m.m.Unlock()
}

// TryLock tries to acquire the mutex. It returns true if it succeeds and
// false otherwise. TryLock does not block.
// +checklocksignore
func (m *Mutex) TryLock() bool {
	// Note lock first to enforce proper locking even if unsuccessful.
	noteLock(unsafe.Pointer(m))
	locked := m.m.TryLock()
	if !locked {
		noteUnlock(unsafe.Pointer(m))
	}
	return locked
}
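// Example (sketch): unlike this package's Mutex, whose checklocks contract
// requires same-goroutine unlock, a CrossGoroutineMutex may be handed off and
// unlocked from another goroutine. Assumes only the API defined above.
//
//	var mu sync.CrossGoroutineMutex // gvisor.dev/gvisor/pkg/sync
//	mu.Lock()
//	done := make(chan struct{})
//	go func() {
//		mu.Unlock() // legal for CrossGoroutineMutex
//		close(done)
//	}()
//	<-done
//	if mu.TryLock() { // non-blocking; succeeds now that mu is free
//		mu.Unlock()
//	}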
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package memxattr provides a default, in-memory extended attribute
// implementation.
package memxattr

import (
	"strings"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
)

// SimpleExtendedAttributes implements extended attributes using a map of
// names to values.
//
// SimpleExtendedAttributes calls vfs.CheckXattrPermissions, so callers are
// not required to do so.
//
// +stateify savable
type SimpleExtendedAttributes struct {
	// mu protects the below fields.
	mu     sync.RWMutex `state:"nosave"`
	xattrs map[string]string
}

// GetXattr returns the value at 'name'.
func (x *SimpleExtendedAttributes) GetXattr(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, opts *vfs.GetXattrOptions) (string, error) {
	if err := vfs.CheckXattrPermissions(creds, vfs.MayRead, mode, kuid, opts.Name); err != nil {
		return "", err
	}

	x.mu.RLock()
	value, ok := x.xattrs[opts.Name]
	x.mu.RUnlock()
	if !ok {
		return "", linuxerr.ENODATA
	}
	// Check that the size of the buffer provided in getxattr(2) is large
	// enough to contain the value.
	if opts.Size != 0 && uint64(len(value)) > opts.Size {
		return "", linuxerr.ERANGE
	}
	return value, nil
}

// SetXattr sets 'value' at 'name'.
func (x *SimpleExtendedAttributes) SetXattr(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, opts *vfs.SetXattrOptions) error {
	if err := vfs.CheckXattrPermissions(creds, vfs.MayWrite, mode, kuid, opts.Name); err != nil {
		return err
	}

	x.mu.Lock()
	defer x.mu.Unlock()
	if x.xattrs == nil {
		if opts.Flags&linux.XATTR_REPLACE != 0 {
			return linuxerr.ENODATA
		}
		x.xattrs = make(map[string]string)
	}

	_, ok := x.xattrs[opts.Name]
	if ok && opts.Flags&linux.XATTR_CREATE != 0 {
		return linuxerr.EEXIST
	}
	if !ok && opts.Flags&linux.XATTR_REPLACE != 0 {
		return linuxerr.ENODATA
	}

	x.xattrs[opts.Name] = opts.Value
	return nil
}

// ListXattr returns all names in xattrs.
func (x *SimpleExtendedAttributes) ListXattr(creds *auth.Credentials, size uint64) ([]string, error) {
	// Keep track of the size of the buffer needed in listxattr(2) for the
	// list.
	listSize := 0
	x.mu.RLock()
	names := make([]string, 0, len(x.xattrs))
	haveCap := creds.HasCapability(linux.CAP_SYS_ADMIN)
	for n := range x.xattrs {
		// Hide extended attributes in the "trusted" namespace from
		// non-privileged users. This is consistent with Linux's
		// fs/xattr.c:simple_xattr_list().
if !haveCap && strings.HasPrefix(n, linux.XATTR_TRUSTED_PREFIX) { continue } names = append(names, n) // Add one byte per null terminator. listSize += len(n) + 1 } x.mu.RUnlock() if size != 0 && uint64(listSize) > size { return nil, linuxerr.ERANGE } return names, nil } // RemoveXattr removes the xattr at 'name'. func (x *SimpleExtendedAttributes) RemoveXattr(creds *auth.Credentials, mode linux.FileMode, kuid auth.KUID, name string) error { if err := vfs.CheckXattrPermissions(creds, vfs.MayWrite, mode, kuid, name); err != nil { return err } x.mu.Lock() defer x.mu.Unlock() if _, ok := x.xattrs[name]; !ok { return linuxerr.ENODATA } delete(x.xattrs, name) return nil }
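// Example (sketch): exercising SimpleExtendedAttributes. The credentials
// wiring (auth.NewRootCredentials with a new root user namespace) is an
// assumption for illustration; any *auth.Credentials works.
//
//	creds := auth.NewRootCredentials(auth.NewRootUserNamespace())
//	var x memxattr.SimpleExtendedAttributes
//	if err := x.SetXattr(creds, 0644, auth.KUID(0), &vfs.SetXattrOptions{
//		Name:  "user.checksum",
//		Value: "abc123",
//	}); err != nil {
//		panic(err)
//	}
//	// Size == 0 means the caller imposes no buffer limit.
//	val, err := x.GetXattr(creds, 0644, auth.KUID(0), &vfs.GetXattrOptions{
//		Name: "user.checksum",
//	})
//	_, _ = val, err // "abc123", nil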
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tmpfs

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
)

// +stateify savable
type namedPipe struct {
	inode inode

	pipe *pipe.VFSPipe
}

// Preconditions:
// * fs.mu must be locked.
// * rp.Mount().CheckBeginWrite() has been called successfully.
func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) *inode {
	file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize)}
	file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode, parentDir)
	file.inode.nlink = 1 // Only the parent has a link.
	return &file.inode
}
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ipv4

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/network/internal/ip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

var _ stack.IPNetworkEndpointStats = (*Stats)(nil)

// Stats holds statistics related to the IPv4 protocol family.
type Stats struct {
	// IP holds IPv4 statistics.
	IP tcpip.IPStats

	// IGMP holds IGMP statistics.
	IGMP tcpip.IGMPStats

	// ICMP holds ICMPv4 statistics.
	ICMP tcpip.ICMPv4Stats
}

// IsNetworkEndpointStats implements stack.NetworkEndpointStats.
func (*Stats) IsNetworkEndpointStats() {}

// IPStats implements stack.IPNetworkEndpointStats.
func (s *Stats) IPStats() *tcpip.IPStats {
	return &s.IP
}

type sharedStats struct {
	localStats Stats
	ip         ip.MultiCounterIPStats
	icmp       multiCounterICMPv4Stats
	igmp       multiCounterIGMPStats
}

// LINT.IfChange(multiCounterICMPv4PacketStats)

type multiCounterICMPv4PacketStats struct {
	echoRequest    tcpip.MultiCounterStat
	echoReply      tcpip.MultiCounterStat
	dstUnreachable tcpip.MultiCounterStat
	srcQuench      tcpip.MultiCounterStat
	redirect       tcpip.MultiCounterStat
	timeExceeded   tcpip.MultiCounterStat
	paramProblem   tcpip.MultiCounterStat
	timestamp      tcpip.MultiCounterStat
	timestampReply tcpip.MultiCounterStat
	infoRequest    tcpip.MultiCounterStat
	infoReply      tcpip.MultiCounterStat
}

func (m *multiCounterICMPv4PacketStats) init(a, b *tcpip.ICMPv4PacketStats) {
	m.echoRequest.Init(a.EchoRequest, b.EchoRequest)
	m.echoReply.Init(a.EchoReply, b.EchoReply)
	m.dstUnreachable.Init(a.DstUnreachable, b.DstUnreachable)
	m.srcQuench.Init(a.SrcQuench, b.SrcQuench)
	m.redirect.Init(a.Redirect, b.Redirect)
	m.timeExceeded.Init(a.TimeExceeded, b.TimeExceeded)
	m.paramProblem.Init(a.ParamProblem, b.ParamProblem)
	m.timestamp.Init(a.Timestamp, b.Timestamp)
	m.timestampReply.Init(a.TimestampReply, b.TimestampReply)
	m.infoRequest.Init(a.InfoRequest, b.InfoRequest)
	m.infoReply.Init(a.InfoReply, b.InfoReply)
}

// LINT.ThenChange(../../tcpip.go:ICMPv4PacketStats)

// LINT.IfChange(multiCounterICMPv4SentPacketStats)

type multiCounterICMPv4SentPacketStats struct {
	multiCounterICMPv4PacketStats
	dropped     tcpip.MultiCounterStat
	rateLimited tcpip.MultiCounterStat
}

func (m *multiCounterICMPv4SentPacketStats) init(a, b *tcpip.ICMPv4SentPacketStats) {
	m.multiCounterICMPv4PacketStats.init(&a.ICMPv4PacketStats, &b.ICMPv4PacketStats)
m.dropped.Init(a.Dropped, b.Dropped) m.rateLimited.Init(a.RateLimited, b.RateLimited) } // LINT.ThenChange(../../tcpip.go:ICMPv4SentPacketStats) // LINT.IfChange(multiCounterICMPv4ReceivedPacketStats) type multiCounterICMPv4ReceivedPacketStats struct { multiCounterICMPv4PacketStats invalid tcpip.MultiCounterStat } func (m *multiCounterICMPv4ReceivedPacketStats) init(a, b *tcpip.ICMPv4ReceivedPacketStats) { m.multiCounterICMPv4PacketStats.init(&a.ICMPv4PacketStats, &b.ICMPv4PacketStats) m.invalid.Init(a.Invalid, b.Invalid) } // LINT.ThenChange(../../tcpip.go:ICMPv4ReceivedPacketStats) // LINT.IfChange(multiCounterICMPv4Stats) type multiCounterICMPv4Stats struct { packetsSent multiCounterICMPv4SentPacketStats packetsReceived multiCounterICMPv4ReceivedPacketStats } func (m *multiCounterICMPv4Stats) init(a, b *tcpip.ICMPv4Stats) { m.packetsSent.init(&a.PacketsSent, &b.PacketsSent) m.packetsReceived.init(&a.PacketsReceived, &b.PacketsReceived) } // LINT.ThenChange(../../tcpip.go:ICMPv4Stats) // LINT.IfChange(multiCounterIGMPPacketStats) type multiCounterIGMPPacketStats struct { membershipQuery tcpip.MultiCounterStat v1MembershipReport tcpip.MultiCounterStat v2MembershipReport tcpip.MultiCounterStat leaveGroup tcpip.MultiCounterStat } func (m *multiCounterIGMPPacketStats) init(a, b *tcpip.IGMPPacketStats) { m.membershipQuery.Init(a.MembershipQuery, b.MembershipQuery) m.v1MembershipReport.Init(a.V1MembershipReport, b.V1MembershipReport) m.v2MembershipReport.Init(a.V2MembershipReport, b.V2MembershipReport) m.leaveGroup.Init(a.LeaveGroup, b.LeaveGroup) } // LINT.ThenChange(../../tcpip.go:IGMPPacketStats) // LINT.IfChange(multiCounterIGMPSentPacketStats) type multiCounterIGMPSentPacketStats struct { multiCounterIGMPPacketStats dropped tcpip.MultiCounterStat } func (m *multiCounterIGMPSentPacketStats) init(a, b *tcpip.IGMPSentPacketStats) { m.multiCounterIGMPPacketStats.init(&a.IGMPPacketStats, &b.IGMPPacketStats) m.dropped.Init(a.Dropped, b.Dropped) } // LINT.ThenChange(../../tcpip.go:IGMPSentPacketStats) // LINT.IfChange(multiCounterIGMPReceivedPacketStats) type multiCounterIGMPReceivedPacketStats struct { multiCounterIGMPPacketStats invalid tcpip.MultiCounterStat checksumErrors tcpip.MultiCounterStat unrecognized tcpip.MultiCounterStat } func (m *multiCounterIGMPReceivedPacketStats) init(a, b *tcpip.IGMPReceivedPacketStats) { m.multiCounterIGMPPacketStats.init(&a.IGMPPacketStats, &b.IGMPPacketStats) m.invalid.Init(a.Invalid, b.Invalid) m.checksumErrors.Init(a.ChecksumErrors, b.ChecksumErrors) m.unrecognized.Init(a.Unrecognized, b.Unrecognized) } // LINT.ThenChange(../../tcpip.go:IGMPReceivedPacketStats) // LINT.IfChange(multiCounterIGMPStats) type multiCounterIGMPStats struct { packetsSent multiCounterIGMPSentPacketStats packetsReceived multiCounterIGMPReceivedPacketStats } func (m *multiCounterIGMPStats) init(a, b *tcpip.IGMPStats) { m.packetsSent.init(&a.PacketsSent, &b.PacketsSent) m.packetsReceived.init(&a.PacketsReceived, &b.PacketsReceived) } // LINT.ThenChange(../../tcpip.go:IGMPStats)
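// Example (sketch): the multi-counter pattern above fans one increment out to
// both the endpoint-local counter and the stack-wide counter. A toy,
// self-contained analog of tcpip.MultiCounterStat (not the real type):
//
//	type multiCounter struct{ dsts []*uint64 }
//
//	func (m *multiCounter) init(a, b *uint64) { m.dsts = []*uint64{a, b} }
//
//	func (m *multiCounter) increment() {
//		for _, d := range m.dsts {
//			atomic.AddUint64(d, 1) // sync/atomic
//		}
//	}
//
//	var local, global uint64
//	var c multiCounter
//	c.init(&local, &global)
//	c.increment() // local == 1 && global == 1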
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"math"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// MAX_RW_COUNT is the maximum size in bytes of a single read or write.
// Reads and writes that exceed this size may be silently truncated.
// (Linux: include/linux/fs.h:MAX_RW_COUNT)
var MAX_RW_COUNT = int(hostarch.Addr(math.MaxInt32).RoundDown())

// Activate ensures that the task has an active address space.
func (t *Task) Activate() {
	if mm := t.MemoryManager(); mm != nil {
		if err := mm.Activate(t); err != nil {
			panic("unable to activate mm: " + err.Error())
		}
	}
}

// Deactivate relinquishes the task's active address space.
func (t *Task) Deactivate() {
	if mm := t.MemoryManager(); mm != nil {
		mm.Deactivate()
	}
}

// CopyInBytes is a fast version of CopyIn if the caller can serialize the
// data without reflection and pass in a byte slice.
//
// This Task's AddressSpace must be active.
func (t *Task) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) {
	return t.MemoryManager().CopyIn(t, addr, dst, usermem.IOOpts{
		AddressSpaceActive: true,
	})
}

// CopyOutBytes is a fast version of CopyOut if the caller can serialize the
// data without reflection and pass in a byte slice.
// // This Task's AddressSpace must be active. func (t *Task) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) { return t.MemoryManager().CopyOut(t, addr, src, usermem.IOOpts{ AddressSpaceActive: true, }) } // CopyInString copies a NUL-terminated string of length at most maxlen in from // the task's memory. The copy will fail with syscall.EFAULT if it traverses // user memory that is unmapped or not readable by the user. // // This Task's AddressSpace must be active. func (t *Task) CopyInString(addr hostarch.Addr, maxlen int) (string, error) { return usermem.CopyStringIn(t, t.MemoryManager(), addr, maxlen, usermem.IOOpts{ AddressSpaceActive: true, }) } // CopyInVector copies a NULL-terminated vector of strings from the task's // memory. The copy will fail with syscall.EFAULT if it traverses // user memory that is unmapped or not readable by the user. // // maxElemSize is the maximum size of each individual element. // // maxTotalSize is the maximum total length of all elements plus the total // number of elements. For example, the following strings correspond to // the following set of sizes: // // { "a", "b", "c" } => 6 (3 for lengths, 3 for elements) // { "abc" } => 4 (3 for length, 1 for elements) // // This Task's AddressSpace must be active. func (t *Task) CopyInVector(addr hostarch.Addr, maxElemSize, maxTotalSize int) ([]string, error) { var v []string for { argAddr := t.Arch().Native(0) if _, err := argAddr.CopyIn(t, addr); err != nil { return v, err } if t.Arch().Value(argAddr) == 0 { break } // Each string has a zero terminating byte counted, so copying out a string // requires at least one byte of space. Also, see the calculation below. if maxTotalSize <= 0 { return nil, syserror.ENOMEM } thisMax := maxElemSize if maxTotalSize < thisMax { thisMax = maxTotalSize } arg, err := t.CopyInString(hostarch.Addr(t.Arch().Value(argAddr)), thisMax) if err != nil { return v, err } v = append(v, arg) addr += hostarch.Addr(t.Arch().Width()) maxTotalSize -= len(arg) + 1 } return v, nil } // CopyOutIovecs converts src to an array of struct iovecs and copies it to the // memory mapped at addr. // // Preconditions: Same as usermem.IO.CopyOut, plus: // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. func (t *Task) CopyOutIovecs(addr hostarch.Addr, src hostarch.AddrRangeSeq) error { switch t.Arch().Width() { case 8: const itemLen = 16 if _, ok := addr.AddLength(uint64(src.NumRanges()) * itemLen); !ok { return linuxerr.EFAULT } b := t.CopyScratchBuffer(itemLen) for ; !src.IsEmpty(); src = src.Tail() { ar := src.Head() hostarch.ByteOrder.PutUint64(b[0:8], uint64(ar.Start)) hostarch.ByteOrder.PutUint64(b[8:16], uint64(ar.Length())) if _, err := t.CopyOutBytes(addr, b); err != nil { return err } addr += itemLen } default: return syserror.ENOSYS } return nil } // CopyInIovecs copies an array of numIovecs struct iovecs from the memory // mapped at addr, converts them to hostarch.AddrRanges, and returns them as a // hostarch.AddrRangeSeq. // // CopyInIovecs shares the following properties with Linux's // lib/iov_iter.c:import_iovec() => fs/read_write.c:rw_copy_check_uvector(): // // - If the length of any AddrRange would exceed the range of an ssize_t, // CopyInIovecs returns EINVAL. // // - If the length of any AddrRange would cause its end to overflow, // CopyInIovecs returns EFAULT. // // - If any AddrRange would include addresses outside the application address // range, CopyInIovecs returns EFAULT. 
// // - The combined length of all AddrRanges is limited to MAX_RW_COUNT. If the // combined length of all AddrRanges would otherwise exceed this amount, ranges // beyond MAX_RW_COUNT are silently truncated. // // Preconditions: Same as usermem.IO.CopyIn, plus: // * The caller must be running on the task goroutine. // * t's AddressSpace must be active. func (t *Task) CopyInIovecs(addr hostarch.Addr, numIovecs int) (hostarch.AddrRangeSeq, error) { if numIovecs == 0 { return hostarch.AddrRangeSeq{}, nil } var dst []hostarch.AddrRange if numIovecs > 1 { dst = make([]hostarch.AddrRange, 0, numIovecs) } switch t.Arch().Width() { case 8: const itemLen = 16 if _, ok := addr.AddLength(uint64(numIovecs) * itemLen); !ok { return hostarch.AddrRangeSeq{}, linuxerr.EFAULT } b := t.CopyScratchBuffer(itemLen) for i := 0; i < numIovecs; i++ { if _, err := t.CopyInBytes(addr, b); err != nil { return hostarch.AddrRangeSeq{}, err } base := hostarch.Addr(hostarch.ByteOrder.Uint64(b[0:8])) length := hostarch.ByteOrder.Uint64(b[8:16]) if length > math.MaxInt64 { return hostarch.AddrRangeSeq{}, linuxerr.EINVAL } ar, ok := t.MemoryManager().CheckIORange(base, int64(length)) if !ok { return hostarch.AddrRangeSeq{}, linuxerr.EFAULT } if numIovecs == 1 { // Special case to avoid allocating dst. return hostarch.AddrRangeSeqOf(ar).TakeFirst(MAX_RW_COUNT), nil } dst = append(dst, ar) addr += itemLen } default: return hostarch.AddrRangeSeq{}, syserror.ENOSYS } // Truncate to MAX_RW_COUNT. var total uint64 for i := range dst { dstlen := uint64(dst[i].Length()) if rem := uint64(MAX_RW_COUNT) - total; rem < dstlen { dst[i].End -= hostarch.Addr(dstlen - rem) dstlen = rem } total += dstlen } return hostarch.AddrRangeSeqFromSlice(dst), nil } // SingleIOSequence returns a usermem.IOSequence representing [addr, // addr+length) in t's address space. If this contains addresses outside the // application address range, it returns EFAULT. If length exceeds // MAX_RW_COUNT, the range is silently truncated. // // SingleIOSequence is analogous to Linux's // lib/iov_iter.c:import_single_range(). (Note that the non-vectorized read and // write syscalls in Linux do not use import_single_range(). However they check // access_ok() in fs/read_write.c:vfs_read/vfs_write, and overflowing address // ranges are truncated to MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) func (t *Task) SingleIOSequence(addr hostarch.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { if length > MAX_RW_COUNT { length = MAX_RW_COUNT } ar, ok := t.MemoryManager().CheckIORange(addr, int64(length)) if !ok { return usermem.IOSequence{}, linuxerr.EFAULT } return usermem.IOSequence{ IO: t.MemoryManager(), Addrs: hostarch.AddrRangeSeqOf(ar), Opts: opts, }, nil } // IovecsIOSequence returns a usermem.IOSequence representing the array of // iovcnt struct iovecs at addr in t's address space. opts applies to the // returned IOSequence, not the reading of the struct iovec array. // // IovecsIOSequence is analogous to Linux's lib/iov_iter.c:import_iovec(). // // Preconditions: Same as Task.CopyInIovecs. 
func (t *Task) IovecsIOSequence(addr hostarch.Addr, iovcnt int, opts usermem.IOOpts) (usermem.IOSequence, error) { if iovcnt < 0 || iovcnt > linux.UIO_MAXIOV { return usermem.IOSequence{}, linuxerr.EINVAL } ars, err := t.CopyInIovecs(addr, iovcnt) if err != nil { return usermem.IOSequence{}, err } return usermem.IOSequence{ IO: t.MemoryManager(), Addrs: ars, Opts: opts, }, nil } type taskCopyContext struct { ctx context.Context t *Task opts usermem.IOOpts } // CopyContext returns a marshal.CopyContext that copies to/from t's address // space using opts. func (t *Task) CopyContext(ctx context.Context, opts usermem.IOOpts) *taskCopyContext { return &taskCopyContext{ ctx: ctx, t: t, opts: opts, } } // CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer. func (cc *taskCopyContext) CopyScratchBuffer(size int) []byte { if ctxTask, ok := cc.ctx.(*Task); ok { return ctxTask.CopyScratchBuffer(size) } return make([]byte, size) } func (cc *taskCopyContext) getMemoryManager() (*mm.MemoryManager, error) { cc.t.mu.Lock() tmm := cc.t.MemoryManager() cc.t.mu.Unlock() if !tmm.IncUsers() { return nil, linuxerr.EFAULT } return tmm, nil } // CopyInBytes implements marshal.CopyContext.CopyInBytes. func (cc *taskCopyContext) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) { tmm, err := cc.getMemoryManager() if err != nil { return 0, err } defer tmm.DecUsers(cc.ctx) return tmm.CopyIn(cc.ctx, addr, dst, cc.opts) } // CopyOutBytes implements marshal.CopyContext.CopyOutBytes. func (cc *taskCopyContext) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) { tmm, err := cc.getMemoryManager() if err != nil { return 0, err } defer tmm.DecUsers(cc.ctx) return tmm.CopyOut(cc.ctx, addr, src, cc.opts) } type ownTaskCopyContext struct { t *Task opts usermem.IOOpts } // OwnCopyContext returns a marshal.CopyContext that copies to/from t's address // space using opts. The returned CopyContext may only be used by t's task // goroutine. // // Since t already implements marshal.CopyContext, this is only needed to // override the usermem.IOOpts used for the copy. func (t *Task) OwnCopyContext(opts usermem.IOOpts) *ownTaskCopyContext { return &ownTaskCopyContext{ t: t, opts: opts, } } // CopyScratchBuffer implements marshal.CopyContext.CopyScratchBuffer. func (cc *ownTaskCopyContext) CopyScratchBuffer(size int) []byte { return cc.t.CopyScratchBuffer(size) } // CopyInBytes implements marshal.CopyContext.CopyInBytes. func (cc *ownTaskCopyContext) CopyInBytes(addr hostarch.Addr, dst []byte) (int, error) { return cc.t.MemoryManager().CopyIn(cc.t, addr, dst, cc.opts) } // CopyOutBytes implements marshal.CopyContext.CopyOutBytes. func (cc *ownTaskCopyContext) CopyOutBytes(addr hostarch.Addr, src []byte) (int, error) { return cc.t.MemoryManager().CopyOut(cc.t, addr, src, cc.opts) }
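// Example (sketch): CopyOutIovecs/CopyInIovecs above marshal each 64-bit
// struct iovec as 16 bytes: an 8-byte base followed by an 8-byte length, in
// hostarch.ByteOrder (little-endian on supported platforms). A standalone
// round-trip of that layout using the standard encoding/binary package:
//
//	b := make([]byte, 16)
//	binary.LittleEndian.PutUint64(b[0:8], 0x7f0000001000) // iov_base
//	binary.LittleEndian.PutUint64(b[8:16], 4096)          // iov_len
//
//	base := binary.LittleEndian.Uint64(b[0:8])    // becomes hostarch.Addr
//	length := binary.LittleEndian.Uint64(b[8:16]) // rejected if > MaxInt64
//	_, _ = base, length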
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tmpfs

import (
	"fmt"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/syserror"
)

// Sync implements vfs.FilesystemImpl.Sync.
func (fs *filesystem) Sync(ctx context.Context) error {
	// All filesystem state is in-memory.
	return nil
}

// stepLocked resolves rp.Component() to an existing file, starting from the
// given directory.
//
// stepLocked is loosely analogous to fs/namei.c:walk_component().
//
// Preconditions:
// * filesystem.mu must be locked.
// * !rp.Done().
func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
	dir, ok := d.inode.impl.(*directory)
	if !ok {
		return nil, linuxerr.ENOTDIR
	}
	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
		return nil, err
	}
afterSymlink:
	name := rp.Component()
	if name == "." {
		rp.Advance()
		return d, nil
	}
	if name == ".." {
		if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
			return nil, err
		} else if isRoot || d.parent == nil {
			rp.Advance()
			return d, nil
		}
		if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
			return nil, err
		}
		rp.Advance()
		return d.parent, nil
	}
	if len(name) > linux.NAME_MAX {
		return nil, linuxerr.ENAMETOOLONG
	}
	child, ok := dir.childMap[name]
	if !ok {
		return nil, syserror.ENOENT
	}
	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
		return nil, err
	}
	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
		// Symlink traversal updates access time.
		child.inode.touchAtime(rp.Mount())
		if err := rp.HandleSymlink(symlink.target); err != nil {
			return nil, err
		}
		goto afterSymlink // don't check the current directory again
	}
	rp.Advance()
	return child, nil
}

// walkParentDirLocked resolves all but the last path component of rp to an
// existing directory, starting from the given directory (which is usually
// rp.Start().Impl().(*dentry)). It does not check that the returned directory
// is searchable by the provider of rp.
//
// walkParentDirLocked is loosely analogous to Linux's
// fs/namei.c:path_parentat().
// // Preconditions: // * filesystem.mu must be locked. // * !rp.Done(). func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) { for !rp.Final() { next, err := stepLocked(ctx, rp, d) if err != nil { return nil, err } d = next } dir, ok := d.inode.impl.(*directory) if !ok { return nil, linuxerr.ENOTDIR } return dir, nil } // resolveLocked resolves rp to an existing file. // // resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). // // Preconditions: filesystem.mu must be locked. func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) { d := rp.Start().Impl().(*dentry) for !rp.Done() { next, err := stepLocked(ctx, rp, d) if err != nil { return nil, err } d = next } if rp.MustBeDir() && !d.inode.isDir() { return nil, linuxerr.ENOTDIR } return d, nil } // doCreateAt checks that creating a file at rp is permitted, then invokes // create to do so. // // doCreateAt is loosely analogous to a conjunction of Linux's // fs/namei.c:filename_create() and done_path_create(). // // Preconditions: // * !rp.Done(). // * For the final path component in rp, !rp.ShouldFollowSymlink(). func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { fs.mu.Lock() defer fs.mu.Unlock() parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) if err != nil { return err } // Order of checks is important. First check if parent directory can be // executed, then check for existence, and lastly check if mount is writable. if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return err } name := rp.Component() if name == "." || name == ".." { return linuxerr.EEXIST } if len(name) > linux.NAME_MAX { return linuxerr.ENAMETOOLONG } if _, ok := parentDir.childMap[name]; ok { return linuxerr.EEXIST } if !dir && rp.MustBeDir() { return syserror.ENOENT } // tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only // be dead if it was deleted. if parentDir.dentry.vfsd.IsDead() { return syserror.ENOENT } mnt := rp.Mount() if err := mnt.CheckBeginWrite(); err != nil { return err } defer mnt.EndWrite() if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { return err } if err := create(parentDir, name); err != nil { return err } ev := linux.IN_CREATE if dir { ev |= linux.IN_ISDIR } parentDir.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) parentDir.inode.touchCMtime() return nil } // AccessAt implements vfs.Filesystem.Impl.AccessAt. func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return err } return d.inode.checkPermissions(creds, ats) } // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { fs.mu.RLock() defer fs.mu.RUnlock() d, err := resolveLocked(ctx, rp) if err != nil { return nil, err } if opts.CheckSearchable { if !d.inode.isDir() { return nil, linuxerr.ENOTDIR } if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { return nil, err } } d.IncRef() return &d.vfsd, nil } // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. 
// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	dir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
	if err != nil {
		return nil, err
	}
	dir.dentry.IncRef()
	return &dir.dentry.vfsd, nil
}

// LinkAt implements vfs.FilesystemImpl.LinkAt.
func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
		if rp.Mount() != vd.Mount() {
			return linuxerr.EXDEV
		}
		d := vd.Dentry().Impl().(*dentry)
		i := d.inode
		if i.isDir() {
			return linuxerr.EPERM
		}
		if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
			return err
		}
		if i.nlink == 0 {
			return syserror.ENOENT
		}
		if i.nlink == maxLinks {
			return linuxerr.EMLINK
		}
		i.incLinksLocked()
		i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */)
		parentDir.insertChildLocked(fs.newDentry(i), name)
		return nil
	})
}

// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
	return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error {
		creds := rp.Credentials()
		if parentDir.inode.nlink == maxLinks {
			return linuxerr.EMLINK
		}
		parentDir.inode.incLinksLocked() // from child's ".."
		childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)
		parentDir.insertChildLocked(&childDir.dentry, name)
		return nil
	})
}

// MknodAt implements vfs.FilesystemImpl.MknodAt.
func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
		creds := rp.Credentials()
		var childInode *inode
		switch opts.Mode.FileType() {
		case linux.S_IFREG:
			childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)
		case linux.S_IFIFO:
			childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)
		case linux.S_IFBLK:
			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor, parentDir)
		case linux.S_IFCHR:
			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor, parentDir)
		case linux.S_IFSOCK:
			childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint, parentDir)
		default:
			return linuxerr.EINVAL
		}
		child := fs.newDentry(childInode)
		parentDir.insertChildLocked(child, name)
		return nil
	})
}

// OpenAt implements vfs.FilesystemImpl.OpenAt.
func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	if opts.Flags&linux.O_TMPFILE != 0 {
		// Not yet supported.
		return nil, linuxerr.EOPNOTSUPP
	}

	// Handle O_CREAT and !O_CREAT separately, since in the latter case we
	// don't need fs.mu for writing.
	if opts.Flags&linux.O_CREAT == 0 {
		fs.mu.RLock()
		d, err := resolveLocked(ctx, rp)
		if err != nil {
			fs.mu.RUnlock()
			return nil, err
		}
		d.IncRef()
		defer d.DecRef(ctx)
		fs.mu.RUnlock()
		return d.open(ctx, rp, &opts, false /* afterCreate */)
	}

	mustCreate := opts.Flags&linux.O_EXCL != 0
	start := rp.Start().Impl().(*dentry)
	fs.mu.Lock()
	unlocked := false
	unlock := func() {
		if !unlocked {
			fs.mu.Unlock()
			unlocked = true
		}
	}
	defer unlock()
	if rp.Done() {
		// Reject attempts to open mount root directory with O_CREAT.
		if rp.MustBeDir() {
			return nil, syserror.EISDIR
		}
		if mustCreate {
			return nil, linuxerr.EEXIST
		}
		start.IncRef()
		defer start.DecRef(ctx)
		unlock()
		return start.open(ctx, rp, &opts, false /* afterCreate */)
	}
afterTrailingSymlink:
	parentDir, err := walkParentDirLocked(ctx, rp, start)
	if err != nil {
		return nil, err
	}
	// Check for search permission in the parent directory.
	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
		return nil, err
	}
	// Reject attempts to open directories with O_CREAT.
	if rp.MustBeDir() {
		return nil, syserror.EISDIR
	}
	name := rp.Component()
	if name == "." || name == ".." {
		return nil, syserror.EISDIR
	}
	if len(name) > linux.NAME_MAX {
		return nil, linuxerr.ENAMETOOLONG
	}
	// Determine whether or not we need to create a file.
	child, ok := parentDir.childMap[name]
	if !ok {
		// Already checked for searchability above; now check for writability.
		if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
			return nil, err
		}
		if err := rp.Mount().CheckBeginWrite(); err != nil {
			return nil, err
		}
		defer rp.Mount().EndWrite()
		// Create and open the child.
		creds := rp.Credentials()
		child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir))
		parentDir.insertChildLocked(child, name)
		child.IncRef()
		defer child.DecRef(ctx)
		unlock()
		fd, err := child.open(ctx, rp, &opts, true)
		if err != nil {
			return nil, err
		}
		parentDir.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
		parentDir.inode.touchCMtime()
		return fd, nil
	}
	if mustCreate {
		return nil, linuxerr.EEXIST
	}
	// Is the file mounted over?
	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
		return nil, err
	}
	// Do we need to resolve a trailing symlink?
	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
		// Symlink traversal updates access time.
		child.inode.touchAtime(rp.Mount())
		if err := rp.HandleSymlink(symlink.target); err != nil {
			return nil, err
		}
		start = &parentDir.dentry
		goto afterTrailingSymlink
	}
	if rp.MustBeDir() && !child.inode.isDir() {
		return nil, linuxerr.ENOTDIR
	}
	child.IncRef()
	defer child.DecRef(ctx)
	unlock()
	return child.open(ctx, rp, &opts, false)
}

// Preconditions: The caller must hold no locks (since opening pipes may block
// indefinitely).
func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
	ats := vfs.AccessTypesForOpenFlags(opts)
	if !afterCreate {
		if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil {
			return nil, err
		}
	}
	switch impl := d.inode.impl.(type) {
	case *regularFile:
		var fd regularFileFD
		fd.LockFD.Init(&d.inode.locks)
		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
			return nil, err
		}
		if !afterCreate && opts.Flags&linux.O_TRUNC != 0 {
			if _, err := impl.truncate(0); err != nil {
				return nil, err
			}
		}
		if fd.vfsfd.IsWritable() {
			fsmetric.TmpfsOpensW.Increment()
		} else if fd.vfsfd.IsReadable() {
			fsmetric.TmpfsOpensRO.Increment()
		}
		return &fd.vfsfd, nil
	case *directory:
		// Can't open directories writably.
		if ats&vfs.MayWrite != 0 {
			return nil, syserror.EISDIR
		}
		var fd directoryFD
		fd.LockFD.Init(&d.inode.locks)
		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
			return nil, err
		}
		return &fd.vfsfd, nil
	case *symlink:
		// Can't open symlinks without O_PATH, which is handled at the VFS layer.
		return nil, linuxerr.ELOOP
	case *namedPipe:
		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks)
	case *deviceFile:
		return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
	case *socketFile:
		return nil, linuxerr.ENXIO
	default:
		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
	}
}

// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	d, err := resolveLocked(ctx, rp)
	if err != nil {
		return "", err
	}
	symlink, ok := d.inode.impl.(*symlink)
	if !ok {
		return "", linuxerr.EINVAL
	}
	symlink.inode.touchAtime(rp.Mount())
	return symlink.target, nil
}

// RenameAt implements vfs.FilesystemImpl.RenameAt.
func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
	// Resolve newParentDir first to verify that it's on this Mount.
	fs.mu.Lock()
	defer fs.mu.Unlock()
	newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
	if err != nil {
		return err
	}

	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
		// TODO(b/145974740): Support other renameat2 flags.
		return linuxerr.EINVAL
	}

	newName := rp.Component()
	if newName == "." || newName == ".." {
		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
			return linuxerr.EEXIST
		}
		return linuxerr.EBUSY
	}
	mnt := rp.Mount()
	if mnt != oldParentVD.Mount() {
		return linuxerr.EXDEV
	}
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()

	oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory)
	if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	renamed, ok := oldParentDir.childMap[oldName]
	if !ok {
		return syserror.ENOENT
	}
	if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil {
		return err
	}
	// Note that we don't need to call rp.CheckMount(), since if renamed is a
	// mount point then we want to rename the mount point, not anything in the
	// mounted filesystem.
	if renamed.inode.isDir() {
		if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) {
			return linuxerr.EINVAL
		}
		if oldParentDir != newParentDir {
			// Writability is needed to change renamed's "..".
			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
				return err
			}
		}
	} else {
		if opts.MustBeDir || rp.MustBeDir() {
			return linuxerr.ENOTDIR
		}
	}

	if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	replaced, ok := newParentDir.childMap[newName]
	if ok {
		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
			return linuxerr.EEXIST
		}
		replacedDir, ok := replaced.inode.impl.(*directory)
		if ok {
			if !renamed.inode.isDir() {
				return syserror.EISDIR
			}
			if len(replacedDir.childMap) != 0 {
				return linuxerr.ENOTEMPTY
			}
		} else {
			if rp.MustBeDir() {
				return linuxerr.ENOTDIR
			}
			if renamed.inode.isDir() {
				return linuxerr.ENOTDIR
			}
		}
	} else {
		if renamed.inode.isDir() && newParentDir.inode.nlink == maxLinks {
			return linuxerr.EMLINK
		}
	}
	// tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can
	// only be dead if it was deleted.
	if newParentDir.dentry.vfsd.IsDead() {
		return syserror.ENOENT
	}

	// Linux places this check before some of those above; we do it here for
	// simplicity, under the assumption that applications are not intentionally
	// doing noop renames expecting them to succeed where non-noop renames
	// would fail.
	if renamed == replaced {
		return nil
	}
	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	var replacedVFSD *vfs.Dentry
	if replaced != nil {
		replacedVFSD = &replaced.vfsd
	}
	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
		return err
	}
	if replaced != nil {
		newParentDir.removeChildLocked(replaced)
		if replaced.inode.isDir() {
			// Remove links for replaced/. and replaced/..
			replaced.inode.decLinksLocked(ctx)
			newParentDir.inode.decLinksLocked(ctx)
		}
		replaced.inode.decLinksLocked(ctx)
	}
	oldParentDir.removeChildLocked(renamed)
	newParentDir.insertChildLocked(renamed, newName)
	vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
	oldParentDir.inode.touchCMtime()
	if oldParentDir != newParentDir {
		if renamed.inode.isDir() {
			oldParentDir.inode.decLinksLocked(ctx)
			newParentDir.inode.incLinksLocked()
		}
		newParentDir.inode.touchCMtime()
	}
	renamed.inode.touchCtime()
	vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir())
	return nil
}

// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
	fs.mu.Lock()
	defer fs.mu.Unlock()
	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
	if err != nil {
		return err
	}
	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	name := rp.Component()
	if name == "." {
		return linuxerr.EINVAL
	}
	if name == ".." {
		return linuxerr.ENOTEMPTY
	}
	child, ok := parentDir.childMap[name]
	if !ok {
		return syserror.ENOENT
	}
	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
		return err
	}
	childDir, ok := child.inode.impl.(*directory)
	if !ok {
		return linuxerr.ENOTDIR
	}
	if len(childDir.childMap) != 0 {
		return linuxerr.ENOTEMPTY
	}
	mnt := rp.Mount()
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()
	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
		return err
	}
	parentDir.removeChildLocked(child)
	parentDir.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
	// Remove links for child, child/., and child/..
	child.inode.decLinksLocked(ctx)
	child.inode.decLinksLocked(ctx)
	parentDir.inode.decLinksLocked(ctx)
	vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
	parentDir.inode.touchCMtime()
	return nil
}

// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
	fs.mu.RLock()
	d, err := resolveLocked(ctx, rp)
	if err != nil {
		fs.mu.RUnlock()
		return err
	}
	err = d.inode.setStat(ctx, rp.Credentials(), &opts)
	fs.mu.RUnlock()
	if err != nil {
		return err
	}

	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
	}
	return nil
}

// StatAt implements vfs.FilesystemImpl.StatAt.
func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	d, err := resolveLocked(ctx, rp)
	if err != nil {
		return linux.Statx{}, err
	}
	var stat linux.Statx
	d.inode.statTo(&stat)
	return stat, nil
}

// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	if _, err := resolveLocked(ctx, rp); err != nil {
		return linux.Statfs{}, err
	}
	return globalStatfs, nil
}

// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
		creds := rp.Credentials()
		child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target, parentDir))
		parentDir.insertChildLocked(child, name)
		return nil
	})
}

// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
	fs.mu.Lock()
	defer fs.mu.Unlock()
	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
	if err != nil {
		return err
	}
	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	name := rp.Component()
	if name == "." || name == ".." {
		return syserror.EISDIR
	}
	child, ok := parentDir.childMap[name]
	if !ok {
		return syserror.ENOENT
	}
	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
		return err
	}
	if child.inode.isDir() {
		return syserror.EISDIR
	}
	if rp.MustBeDir() {
		return linuxerr.ENOTDIR
	}
	mnt := rp.Mount()
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()
	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
		return err
	}

	// Generate inotify events. Note that this must take place before the link
	// count of the child is decremented, or else the watches may be dropped
	// before these events are added.
	vfs.InotifyRemoveChild(ctx, &child.inode.watches, &parentDir.inode.watches, name)

	parentDir.removeChildLocked(child)
	child.inode.decLinksLocked(ctx)
	vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
	parentDir.inode.touchCMtime()
	return nil
}

// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	d, err := resolveLocked(ctx, rp)
	if err != nil {
		return nil, err
	}
	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
		return nil, err
	}
	switch impl := d.inode.impl.(type) {
	case *socketFile:
		if impl.ep == nil {
			return nil, linuxerr.ECONNREFUSED
		}
		return impl.ep, nil
	default:
		return nil, linuxerr.ECONNREFUSED
	}
}

// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	d, err := resolveLocked(ctx, rp)
	if err != nil {
		return nil, err
	}
	return d.inode.listXattr(rp.Credentials(), size)
}

// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	d, err := resolveLocked(ctx, rp)
	if err != nil {
		return "", err
	}
	return d.inode.getXattr(rp.Credentials(), &opts)
}

// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
	fs.mu.RLock()
	d, err := resolveLocked(ctx, rp)
	if err != nil {
		fs.mu.RUnlock()
		return err
	}
	err = d.inode.setXattr(rp.Credentials(), &opts)
	fs.mu.RUnlock()
	if err != nil {
		return err
	}

	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
	return nil
}

// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
	fs.mu.RLock()
	d, err := resolveLocked(ctx, rp)
	if err != nil {
		fs.mu.RUnlock()
		return err
	}
	err = d.inode.removeXattr(rp.Credentials(), name)
	fs.mu.RUnlock()
	if err != nil {
		return err
	}

	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
	return nil
}

// PrependPath implements vfs.FilesystemImpl.PrependPath.
func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	mnt := vd.Mount()
	d := vd.Dentry().Impl().(*dentry)
	for {
		if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
			return vfs.PrependPathAtVFSRootError{}
		}
		if &d.vfsd == mnt.Root() {
			return nil
		}
		if d.parent == nil {
			if d.name != "" {
				// This file must have been created by
				// newUnlinkedRegularFileDescription(). In Linux,
				// mm/shmem.c:__shmem_file_setup() =>
				// fs/file_table.c:alloc_file_pseudo() sets the created
				// dentry's dentry_operations to anon_ops, for which d_dname ==
				// simple_dname. fs/d_path.c:simple_dname() defines the
				// dentry's pathname to be its name, prefixed with "/" and
				// suffixed with " (deleted)".
				b.PrependComponent("/" + d.name)
				b.AppendString(" (deleted)")
				return vfs.PrependPathSyntheticError{}
			}
			return vfs.PrependPathAtNonMountRootError{}
		}
		b.PrependComponent(d.name)
		d = d.parent
	}
}

// MountOptions implements vfs.FilesystemImpl.MountOptions.
func (fs *filesystem) MountOptions() string {
	return fs.mopts
}
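// PrependPath walks from the dentry up toward the mount root, prepending one
// component per step, so the path builder must make prepends cheap. The
// standalone sketch below models that usage pattern with a toy builder; the
// toyBuilder type and its methods are hypothetical stand-ins for
// fspath.Builder's PrependComponent/AppendString, not gVisor's implementation.

package main

import "fmt"

type node struct {
	name   string
	parent *node
}

// toyBuilder models prepend-heavy path construction by storing components in
// reverse (leaf-first) order and joining them on demand.
type toyBuilder struct {
	rev    []string // components, leaf first
	suffix string
}

func (b *toyBuilder) PrependComponent(c string) { b.rev = append(b.rev, c) }
func (b *toyBuilder) AppendString(s string)     { b.suffix += s }

func (b *toyBuilder) String() string {
	out := ""
	for i := len(b.rev) - 1; i >= 0; i-- {
		out += "/" + b.rev[i]
	}
	return out + b.suffix
}

func main() {
	// root -> "a" -> "b", mirroring the d = d.parent walk in PrependPath.
	root := &node{name: ""}
	a := &node{name: "a", parent: root}
	leaf := &node{name: "b", parent: a}

	var b toyBuilder
	for d := leaf; d.parent != nil; d = d.parent {
		b.PrependComponent(d.name)
	}
	fmt.Println(b.String()) // /a/b
}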
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
)

func lookupCaps(t *kernel.Task, tid kernel.ThreadID) (permitted, inheritable, effective auth.CapabilitySet, err error) {
	if tid < 0 {
		err = linuxerr.EINVAL
		return
	}
	if tid > 0 {
		t = t.PIDNamespace().TaskWithID(tid)
	}
	if t == nil {
		err = linuxerr.ESRCH
		return
	}
	creds := t.Credentials()
	permitted, inheritable, effective = creds.PermittedCaps, creds.InheritableCaps, creds.EffectiveCaps
	return
}

// Capget implements Linux syscall capget.
func Capget(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	hdrAddr := args[0].Pointer()
	dataAddr := args[1].Pointer()

	var hdr linux.CapUserHeader
	if _, err := hdr.CopyIn(t, hdrAddr); err != nil {
		return 0, nil, err
	}
	// hdr.Pid doesn't need to be valid if this capget() is a "version probe"
	// (hdr.Version is unrecognized and dataAddr is null), so we can't do the
	// lookup yet.
	switch hdr.Version {
	case linux.LINUX_CAPABILITY_VERSION_1:
		if dataAddr == 0 {
			return 0, nil, nil
		}
		p, i, e, err := lookupCaps(t, kernel.ThreadID(hdr.Pid))
		if err != nil {
			return 0, nil, err
		}
		data := linux.CapUserData{
			Effective:   uint32(e),
			Permitted:   uint32(p),
			Inheritable: uint32(i),
		}
		_, err = data.CopyOut(t, dataAddr)
		return 0, nil, err

	case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3:
		if dataAddr == 0 {
			return 0, nil, nil
		}
		p, i, e, err := lookupCaps(t, kernel.ThreadID(hdr.Pid))
		if err != nil {
			return 0, nil, err
		}
		data := [2]linux.CapUserData{
			{
				Effective:   uint32(e),
				Permitted:   uint32(p),
				Inheritable: uint32(i),
			},
			{
				Effective:   uint32(e >> 32),
				Permitted:   uint32(p >> 32),
				Inheritable: uint32(i >> 32),
			},
		}
		_, err = linux.CopyCapUserDataSliceOut(t, dataAddr, data[:])
		return 0, nil, err

	default:
		hdr.Version = linux.HighestCapabilityVersion
		if _, err := hdr.CopyOut(t, hdrAddr); err != nil {
			return 0, nil, err
		}
		if dataAddr != 0 {
			return 0, nil, linuxerr.EINVAL
		}
		return 0, nil, nil
	}
}

// Capset implements Linux syscall capset.
func Capset(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	hdrAddr := args[0].Pointer()
	dataAddr := args[1].Pointer()

	var hdr linux.CapUserHeader
	if _, err := hdr.CopyIn(t, hdrAddr); err != nil {
		return 0, nil, err
	}
	switch hdr.Version {
	case linux.LINUX_CAPABILITY_VERSION_1:
		if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() {
			return 0, nil, linuxerr.EPERM
		}
		var data linux.CapUserData
		if _, err := data.CopyIn(t, dataAddr); err != nil {
			return 0, nil, err
		}
		p := auth.CapabilitySet(data.Permitted) & auth.AllCapabilities
		i := auth.CapabilitySet(data.Inheritable) & auth.AllCapabilities
		e := auth.CapabilitySet(data.Effective) & auth.AllCapabilities
		return 0, nil, t.SetCapabilitySets(p, i, e)

	case linux.LINUX_CAPABILITY_VERSION_2, linux.LINUX_CAPABILITY_VERSION_3:
		if tid := kernel.ThreadID(hdr.Pid); tid != 0 && tid != t.ThreadID() {
			return 0, nil, linuxerr.EPERM
		}
		var data [2]linux.CapUserData
		if _, err := linux.CopyCapUserDataSliceIn(t, dataAddr, data[:]); err != nil {
			return 0, nil, err
		}
		p := (auth.CapabilitySet(data[0].Permitted) | (auth.CapabilitySet(data[1].Permitted) << 32)) & auth.AllCapabilities
		i := (auth.CapabilitySet(data[0].Inheritable) | (auth.CapabilitySet(data[1].Inheritable) << 32)) & auth.AllCapabilities
		e := (auth.CapabilitySet(data[0].Effective) | (auth.CapabilitySet(data[1].Effective) << 32)) & auth.AllCapabilities
		return 0, nil, t.SetCapabilitySets(p, i, e)

	default:
		hdr.Version = linux.HighestCapabilityVersion
		if _, err := hdr.CopyOut(t, hdrAddr); err != nil {
			return 0, nil, err
		}
		return 0, nil, linuxerr.EINVAL
	}
}
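// For _LINUX_CAPABILITY_VERSION_2/3, a 64-bit capability set travels as two
// 32-bit words: data[0] carries bits 0-31 and data[1] carries bits 32-63,
// exactly as split and merged in Capget/Capset above. This standalone sketch
// checks that the arithmetic round-trips; capUserData is a hypothetical
// stand-in for linux.CapUserData, with only the Effective field exercised.

package main

import "fmt"

type capUserData struct {
	Effective, Permitted, Inheritable uint32
}

// split mirrors Capget's VERSION_2/3 arm: low word first, high word second.
func split(set uint64) [2]capUserData {
	return [2]capUserData{
		{Effective: uint32(set)},       // bits 0-31
		{Effective: uint32(set >> 32)}, // bits 32-63
	}
}

// merge mirrors Capset's VERSION_2/3 arm.
func merge(data [2]capUserData) uint64 {
	return uint64(data[0].Effective) | uint64(data[1].Effective)<<32
}

func main() {
	// CAP_SYS_ADMIN is bit 21 and CAP_CHECKPOINT_RESTORE is bit 40, so this
	// set has one bit in each word.
	set := uint64(1)<<21 | uint64(1)<<40
	data := split(set)
	fmt.Printf("low=%#x high=%#x roundtrip=%v\n",
		data[0].Effective, data[1].Effective, merge(data) == set)
}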
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sync"
)

// SignalHandlers holds information about signal actions.
//
// +stateify savable
type SignalHandlers struct {
	// mu protects actions, as well as the signal state of all tasks and thread
	// groups using this SignalHandlers object. (See comment on
	// ThreadGroup.signalHandlers.)
	mu sync.Mutex `state:"nosave"`

	// actions is the action to be taken upon receiving each signal.
	actions map[linux.Signal]linux.SigAction
}

// NewSignalHandlers returns a new SignalHandlers specifying all default
// actions.
func NewSignalHandlers() *SignalHandlers {
	return &SignalHandlers{
		actions: make(map[linux.Signal]linux.SigAction),
	}
}

// Fork returns a copy of sh for a new thread group.
func (sh *SignalHandlers) Fork() *SignalHandlers {
	sh2 := NewSignalHandlers()
	sh.mu.Lock()
	defer sh.mu.Unlock()
	for sig, act := range sh.actions {
		sh2.actions[sig] = act
	}
	return sh2
}

// CopyForExec returns a copy of sh for a thread group that is undergoing an
// execve. (See comments in Task.finishExec.)
func (sh *SignalHandlers) CopyForExec() *SignalHandlers {
	sh2 := NewSignalHandlers()
	sh.mu.Lock()
	defer sh.mu.Unlock()
	for sig, act := range sh.actions {
		if act.Handler == linux.SIG_IGN {
			sh2.actions[sig] = linux.SigAction{
				Handler: linux.SIG_IGN,
			}
		}
	}
	return sh2
}

// IsIgnored returns true if the signal is ignored.
func (sh *SignalHandlers) IsIgnored(sig linux.Signal) bool {
	sh.mu.Lock()
	defer sh.mu.Unlock()
	sa, ok := sh.actions[sig]
	return ok && sa.Handler == linux.SIG_IGN
}

// dequeueAction returns the SigAction that should be used to handle sig.
//
// Preconditions: sh.mu must be locked.
func (sh *SignalHandlers) dequeueAction(sig linux.Signal) linux.SigAction {
	act := sh.actions[sig]
	if act.Flags&linux.SA_RESETHAND != 0 {
		delete(sh.actions, sig)
	}
	return act
}
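// dequeueAction implements SA_RESETHAND (the historical SA_ONESHOT): taking
// the action also restores the default disposition by deleting the map entry,
// so a missing entry means SIG_DFL. A standalone model of that reset follows;
// the sigAction type and handler strings are hypothetical simplifications of
// the kernel's types, and saResetHand uses SA_RESETHAND's Linux value on
// x86-64/arm64.

package main

import "fmt"

const saResetHand = 0x80000000 // linux.SA_RESETHAND on x86-64/arm64

type sigAction struct {
	handler string
	flags   uint64
}

// dequeue mirrors SignalHandlers.dequeueAction: the zero-value action (default
// disposition) is returned when no entry exists, and SA_RESETHAND entries are
// removed as they are consumed.
func dequeue(actions map[int]sigAction, sig int) sigAction {
	act := actions[sig]
	if act.flags&saResetHand != 0 {
		delete(actions, sig)
	}
	return act
}

func main() {
	const sigusr1 = 10
	actions := map[int]sigAction{
		sigusr1: {handler: "user handler", flags: saResetHand},
	}
	fmt.Printf("%q\n", dequeue(actions, sigusr1).handler) // "user handler" (first delivery)
	fmt.Printf("%q\n", dequeue(actions, sigusr1).handler) // "" i.e. default (handler was reset)
}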
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package proc implements a partial in-memory file system for procfs.
package proc

import (
	"fmt"
	"strconv"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

const (
	// Name is the default filesystem name.
	Name = "proc"

	defaultMaxCachedDentries = uint64(1000)
)

// FilesystemType is the factory class for procfs.
//
// +stateify savable
type FilesystemType struct{}

// Name implements vfs.FilesystemType.Name.
func (FilesystemType) Name() string {
	return Name
}

// Release implements vfs.FilesystemType.Release.
func (FilesystemType) Release(ctx context.Context) {}

// +stateify savable
type filesystem struct {
	kernfs.Filesystem

	devMinor uint32
}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
	k := kernel.KernelFromContext(ctx)
	if k == nil {
		return nil, nil, fmt.Errorf("procfs requires a kernel")
	}
	pidns := kernel.PIDNamespaceFromContext(ctx)
	if pidns == nil {
		return nil, nil, fmt.Errorf("procfs requires a PID namespace")
	}
	devMinor, err := vfsObj.GetAnonBlockDevMinor()
	if err != nil {
		return nil, nil, err
	}

	mopts := vfs.GenericParseMountOptions(opts.Data)
	maxCachedDentries := defaultMaxCachedDentries
	if str, ok := mopts["dentry_cache_limit"]; ok {
		delete(mopts, "dentry_cache_limit")
		maxCachedDentries, err = strconv.ParseUint(str, 10, 64)
		if err != nil {
			ctx.Warningf("proc.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
			return nil, nil, linuxerr.EINVAL
		}
	}

	procfs := &filesystem{
		devMinor: devMinor,
	}
	procfs.MaxCachedDentries = maxCachedDentries
	procfs.VFSFilesystem().Init(vfsObj, &ft, procfs)

	var fakeCgroupControllers map[string]string
	if opts.InternalData != nil {
		data := opts.InternalData.(*InternalData)
		fakeCgroupControllers = data.Cgroups
	}

	inode := procfs.newTasksInode(ctx, k, pidns, fakeCgroupControllers)
	var dentry kernfs.Dentry
	dentry.InitRoot(&procfs.Filesystem, inode)
	return procfs.VFSFilesystem(), dentry.VFSDentry(), nil
}

// Release implements vfs.FilesystemImpl.Release.
func (fs *filesystem) Release(ctx context.Context) {
	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
	fs.Filesystem.Release(ctx)
}

// MountOptions implements vfs.FilesystemImpl.MountOptions.
func (fs *filesystem) MountOptions() string {
	return fmt.Sprintf("dentry_cache_limit=%d", fs.MaxCachedDentries)
}

// dynamicInode is an overfitted interface for common Inodes with
// dynamicByteSource types used in procfs.
//
// +stateify savable
type dynamicInode interface {
	kernfs.Inode
	vfs.DynamicBytesSource

	Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode)
}

func (fs *filesystem) newInode(ctx context.Context, creds *auth.Credentials, perm linux.FileMode, inode dynamicInode) dynamicInode {
	inode.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), inode, perm)
	return inode
}

// +stateify savable
type staticFile struct {
	kernfs.DynamicBytesFile
	vfs.StaticData
}

var _ dynamicInode = (*staticFile)(nil)

func newStaticFile(data string) *staticFile {
	return &staticFile{StaticData: vfs.StaticData{Data: data}}
}

func (fs *filesystem) newStaticDir(ctx context.Context, creds *auth.Credentials, children map[string]kernfs.Inode) kernfs.Inode {
	return kernfs.NewStaticDir(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, children, kernfs.GenericDirectoryFDOptions{
		SeekEnd: kernfs.SeekEndZero,
	})
}

// InternalData contains internal data passed in to the procfs mount via
// vfs.GetFilesystemOptions.InternalData.
//
// +stateify savable
type InternalData struct {
	Cgroups map[string]string
}

// +stateify savable
type implStatFS struct{}

// StatFS implements kernfs.Inode.StatFS.
func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
	return vfs.GenericStatFS(linux.PROC_SUPER_MAGIC), nil
}
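// GetFilesystem accepts a dentry_cache_limit=N mount option: it deletes the
// key from the parsed option map (so unknown leftovers could be rejected) and
// returns EINVAL for non-numeric values. A standalone sketch of that
// parse-and-consume pattern; parseMountOptions here is a hypothetical
// simplification of vfs.GenericParseMountOptions, not the real function.

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseMountOptions splits "k1=v1,k2" mount data into a map, roughly the
// shape that vfs.GenericParseMountOptions produces.
func parseMountOptions(data string) map[string]string {
	m := make(map[string]string)
	for _, opt := range strings.Split(data, ",") {
		if opt == "" {
			continue
		}
		k, v, _ := strings.Cut(opt, "=")
		m[k] = v
	}
	return m
}

func main() {
	const defaultMaxCachedDentries = uint64(1000)

	mopts := parseMountOptions("dentry_cache_limit=5000")
	maxCachedDentries := defaultMaxCachedDentries
	if str, ok := mopts["dentry_cache_limit"]; ok {
		delete(mopts, "dentry_cache_limit") // consume the recognized key
		n, err := strconv.ParseUint(str, 10, 64)
		if err != nil {
			fmt.Println("invalid dentry_cache_limit:", str) // GetFilesystem returns EINVAL here
			return
		}
		maxCachedDentries = n
	}
	fmt.Println(maxCachedDentries, "leftover options:", mopts) // 5000 leftover options: map[]
}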
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
)

const (
	// MaxSACKBlocks is the maximum number of SACK blocks stored
	// at receiver side.
	MaxSACKBlocks = 6
)

// UpdateSACKBlocks updates the list of SACK blocks to include the segment
// specified by segStart->segEnd. If the segment happens to be an out of order
// delivery then the first block in the sack.blocks always includes the
// segment identified by segStart->segEnd.
func UpdateSACKBlocks(sack *SACKInfo, segStart seqnum.Value, segEnd seqnum.Value, rcvNxt seqnum.Value) {
	newSB := header.SACKBlock{Start: segStart, End: segEnd}

	// Ignore any invalid SACK blocks or blocks that are before rcvNxt as
	// those bytes have already been acked.
	if newSB.End.LessThanEq(newSB.Start) || newSB.End.LessThan(rcvNxt) {
		return
	}

	if sack.NumBlocks == 0 {
		sack.Blocks[0] = newSB
		sack.NumBlocks = 1
		return
	}
	n := 0
	for i := 0; i < sack.NumBlocks; i++ {
		start, end := sack.Blocks[i].Start, sack.Blocks[i].End
		if end.LessThanEq(rcvNxt) {
			// Discard any sack blocks that are before rcvNxt as
			// those have already been acked.
			continue
		}
		if newSB.Start.LessThanEq(end) && start.LessThanEq(newSB.End) {
			// Merge this SACK block into newSB and discard this SACK
			// block.
			if start.LessThan(newSB.Start) {
				newSB.Start = start
			}
			if newSB.End.LessThan(end) {
				newSB.End = end
			}
		} else {
			// Save this block.
			sack.Blocks[n] = sack.Blocks[i]
			n++
		}
	}
	if rcvNxt.LessThan(newSB.Start) {
		// If this was an out of order segment then make sure that the
		// first SACK block is the one that includes the segment.
		//
		// See the first bullet point in
		// https://tools.ietf.org/html/rfc2018#section-4
		if n == MaxSACKBlocks {
			// If the number of SACK blocks is equal to
			// MaxSACKBlocks then discard the last SACK block.
			n--
		}
		for i := n - 1; i >= 0; i-- {
			sack.Blocks[i+1] = sack.Blocks[i]
		}
		sack.Blocks[0] = newSB
		n++
	}
	sack.NumBlocks = n
}

// TrimSACKBlockList updates the SACK block list by removing any block that
// ends at or before rcvNxt and shrinking any block that starts before rcvNxt.
func TrimSACKBlockList(sack *SACKInfo, rcvNxt seqnum.Value) {
	n := 0
	for i := 0; i < sack.NumBlocks; i++ {
		if sack.Blocks[i].End.LessThanEq(rcvNxt) {
			continue
		}
		if sack.Blocks[i].Start.LessThan(rcvNxt) {
			// Shrink this SACK block.
			sack.Blocks[i].Start = rcvNxt
		}
		sack.Blocks[n] = sack.Blocks[i]
		n++
	}
	sack.NumBlocks = n
}
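// UpdateSACKBlocks folds every existing block that overlaps or abuts the new
// segment into one block and, for out-of-order segments, moves that block to
// the front (RFC 2018 section 4, first bullet). The standalone sketch below
// traces that merge with concrete sequence numbers; it is a model with plain
// ints rather than an import of the package above, and it omits the
// MaxSACKBlocks trimming and sequence-number wraparound handling.

package main

import "fmt"

type block struct{ start, end int }

// update mirrors UpdateSACKBlocks' merge loop: acked blocks are dropped,
// blocks touching [segStart, segEnd) are folded into it, survivors are kept,
// and the merged block leads the list when the segment is out of order.
func update(blocks []block, segStart, segEnd, rcvNxt int) []block {
	nsb := block{segStart, segEnd}
	var kept []block
	for _, b := range blocks {
		if b.end <= rcvNxt {
			continue // already covered by the cumulative ACK
		}
		if nsb.start <= b.end && b.start <= nsb.end {
			if b.start < nsb.start {
				nsb.start = b.start
			}
			if b.end > nsb.end {
				nsb.end = b.end
			}
		} else {
			kept = append(kept, b)
		}
	}
	if rcvNxt < nsb.start {
		// Out-of-order segment: its block goes first.
		return append([]block{nsb}, kept...)
	}
	return kept
}

func main() {
	// rcvNxt = 100 throughout; three out-of-order segments arrive.
	var blocks []block
	blocks = update(blocks, 500, 600, 100) // [{500 600}]
	blocks = update(blocks, 300, 400, 100) // [{300 400} {500 600}]
	blocks = update(blocks, 400, 500, 100) // gap filler merges both neighbors
	fmt.Println(blocks)                    // [{300 600}]
}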
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
)

// SessionID is the public identifier.
type SessionID ThreadID

// ProcessGroupID is the public identifier.
type ProcessGroupID ThreadID

// Session contains a leader threadgroup and a list of ProcessGroups.
//
// +stateify savable
type Session struct {
	SessionRefs

	// leader is the originator of the Session.
	//
	// Note that this may no longer be running (and may be reaped), so the
	// ID is cached upon initial creation. The leader is still required
	// however, since its PIDNamespace defines the scope of the Session.
	//
	// The leader is immutable.
	leader *ThreadGroup

	// id is the cached identifier in the leader's namespace.
	//
	// The id is immutable.
	id SessionID

	// foreground is the foreground process group.
	//
	// This is protected by TaskSet.mu.
	foreground *ProcessGroup

	// ProcessGroups is a list of process groups in this Session. This is
	// protected by TaskSet.mu.
	processGroups processGroupList

	// sessionEntry is the embed for TaskSet.sessions. This is protected by
	// TaskSet.mu.
	sessionEntry
}

// DecRef drops a reference.
//
// Precondition: callers must hold TaskSet.mu for writing.
func (s *Session) DecRef() {
	s.SessionRefs.DecRef(func() {
		// Remove translations from the leader.
		for ns := s.leader.pidns; ns != nil; ns = ns.parent {
			id := ns.sids[s]
			delete(ns.sids, s)
			delete(ns.sessions, id)
		}

		// Remove from the list of global Sessions.
		s.leader.pidns.owner.sessions.Remove(s)
	})
}

// ProcessGroup contains an originator threadgroup and a parent Session.
//
// +stateify savable
type ProcessGroup struct {
	refs ProcessGroupRefs

	// originator is the originator of the group.
	//
	// See note re: leader in Session. The same applies here.
	//
	// The originator is immutable.
	originator *ThreadGroup

	// id is the cached identifier in the originator's namespace.
	//
	// The id is immutable.
	id ProcessGroupID

	// Session is the parent Session.
	//
	// The session is immutable.
	session *Session

	// ancestors is the number of thread groups in this process group whose
	// parent is in a different process group in the same session.
	//
	// The name is derived from the fact that process groups where
	// ancestors is zero are considered "orphans".
	//
	// ancestors is protected by TaskSet.mu.
	ancestors uint32

	// processGroupEntry is the embedded entry for Sessions.groups. This is
	// protected by TaskSet.mu.
	processGroupEntry
}

// Originator returns the originator of the process group.
func (pg *ProcessGroup) Originator() *ThreadGroup {
	return pg.originator
}

// IsOrphan returns true if this process group is an orphan.
func (pg *ProcessGroup) IsOrphan() bool {
	ts := pg.originator.TaskSet()
	ts.mu.RLock()
	defer ts.mu.RUnlock()
	return pg.ancestors == 0
}

// incRefWithParent grabs a reference.
//
// This function is called when this ProcessGroup is being associated with some
// new ThreadGroup, tg. parentPG is the ProcessGroup of tg's parent
// ThreadGroup. If tg is init, then parentPG may be nil.
//
// Precondition: callers must hold TaskSet.mu for writing.
func (pg *ProcessGroup) incRefWithParent(parentPG *ProcessGroup) {
	// We acquire an "ancestor" reference in the case of a nil parent.
	// This is because the process being associated is init, and init can
	// never be orphaned (we count it as always having an ancestor).
	if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) {
		pg.ancestors++
	}

	pg.refs.IncRef()
}

// decRefWithParent drops a reference.
//
// parentPG is per incRefWithParent.
//
// Precondition: callers must hold TaskSet.mu for writing.
func (pg *ProcessGroup) decRefWithParent(parentPG *ProcessGroup) {
	// See incRefWithParent regarding parent == nil.
	if pg != parentPG && (parentPG == nil || pg.session == parentPG.session) {
		pg.ancestors--
	}

	alive := true
	pg.refs.DecRef(func() {
		alive = false // don't bother with handleOrphan.

		// Remove translations from the originator.
		for ns := pg.originator.pidns; ns != nil; ns = ns.parent {
			id := ns.pgids[pg]
			delete(ns.pgids, pg)
			delete(ns.processGroups, id)
		}

		// Remove the list of process groups.
		pg.session.processGroups.Remove(pg)
		pg.session.DecRef()
	})
	if alive {
		pg.handleOrphan()
	}
}

// parentPG returns the parent process group.
//
// Precondition: callers must hold TaskSet.mu.
func (tg *ThreadGroup) parentPG() *ProcessGroup {
	if tg.leader.parent != nil {
		return tg.leader.parent.tg.processGroup
	}
	return nil
}

// handleOrphan checks whether the process group is an orphan and has any
// stopped jobs. If yes, then appropriate signals are delivered to each thread
// group within the process group.
//
// Precondition: callers must hold TaskSet.mu for writing.
func (pg *ProcessGroup) handleOrphan() {
	// Check if this process is an orphan.
	if pg.ancestors != 0 {
		return
	}

	// See if there are any stopped jobs.
	hasStopped := false
	pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) {
		if tg.processGroup != pg {
			return
		}
		tg.signalHandlers.mu.Lock()
		if tg.groupStopComplete {
			hasStopped = true
		}
		tg.signalHandlers.mu.Unlock()
	})
	if !hasStopped {
		return
	}

	// Deliver appropriate signals to all thread groups.
	pg.originator.pidns.owner.forEachThreadGroupLocked(func(tg *ThreadGroup) {
		if tg.processGroup != pg {
			return
		}
		tg.signalHandlers.mu.Lock()
		tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGHUP), true /* group */)
		tg.leader.sendSignalLocked(SignalInfoPriv(linux.SIGCONT), true /* group */)
		tg.signalHandlers.mu.Unlock()
	})
}

// Session returns the process group's session without taking a reference.
func (pg *ProcessGroup) Session() *Session {
	return pg.session
}

// SendSignal sends a signal to all processes inside the process group. It is
// analogous to kernel/signal.c:kill_pgrp.
func (pg *ProcessGroup) SendSignal(info *linux.SignalInfo) error {
	tasks := pg.originator.TaskSet()
	tasks.mu.RLock()
	defer tasks.mu.RUnlock()

	var lastErr error
	for tg := range tasks.Root.tgids {
		if tg.processGroup == pg {
			tg.signalHandlers.mu.Lock()
			infoCopy := *info
			if err := tg.leader.sendSignalLocked(&infoCopy, true /* group */); err != nil {
				lastErr = err
			}
			tg.signalHandlers.mu.Unlock()
		}
	}
	return lastErr
}

// CreateSession creates a new Session, with the ThreadGroup as the leader.
//
// EPERM may be returned if either the given ThreadGroup is already a Session
// leader, or a ProcessGroup already exists for the ThreadGroup's ID.
func (tg *ThreadGroup) CreateSession() error {
	tg.pidns.owner.mu.Lock()
	defer tg.pidns.owner.mu.Unlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	return tg.createSession()
}

// createSession creates a new session for a threadgroup.
//
// Precondition: callers must hold TaskSet.mu and the signal mutex for writing.
func (tg *ThreadGroup) createSession() error {
	// Get the ID for this thread in the current namespace.
	id := tg.pidns.tgids[tg]

	// Check if this ThreadGroup already leads a Session, or
	// if the proposed group is already taken.
	for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() {
		if s.leader.pidns != tg.pidns {
			continue
		}
		if s.leader == tg {
			return linuxerr.EPERM
		}
		if s.id == SessionID(id) {
			return linuxerr.EPERM
		}
		for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() {
			if pg.id == ProcessGroupID(id) {
				return linuxerr.EPERM
			}
		}
	}

	// Create a new Session, with a single reference.
	s := &Session{
		id:     SessionID(id),
		leader: tg,
	}
	s.InitRefs()

	// Create a new ProcessGroup, belonging to that Session.
	// This also has a single reference (assigned below).
	//
	// Note that since this is a new session and a new process group, there
	// will be zero ancestors for this process group. (It is an orphan at
	// this point.)
	pg := &ProcessGroup{
		id:         ProcessGroupID(id),
		originator: tg,
		session:    s,
		ancestors:  0,
	}
	pg.refs.InitRefs()

	// Tie them and return the result.
	s.processGroups.PushBack(pg)
	tg.pidns.owner.sessions.PushBack(s)

	// Leave the current group, and assign the new one.
	if tg.processGroup != nil {
		oldParentPG := tg.parentPG()
		tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
			childTG.processGroup.incRefWithParent(pg)
			childTG.processGroup.decRefWithParent(oldParentPG)
		})
		// If tg.processGroup is an orphan, decRefWithParent will lock
		// the signal mutex of each thread group in tg.processGroup.
		// However, tg's signal mutex may already be locked at this
		// point. We change tg's process group before calling
		// decRefWithParent to avoid locking tg's signal mutex twice.
		oldPG := tg.processGroup
		tg.processGroup = pg
		oldPG.decRefWithParent(oldParentPG)
	} else {
		// The current process group may be nil only in the case of an
		// unparented thread group (i.e. the init process). This would
		// not normally occur, but we allow it for the convenience of
		// CreateSession working from that point. There will be no
		// child processes. We always say that the very first group
		// created has ancestors (avoids checks elsewhere).
		//
		// Note that this mirrors the parent == nil logic in
		// incRef/decRef/reparent, which counts nil as an ancestor.
		tg.processGroup = pg
		tg.processGroup.ancestors++
	}

	// Ensure a translation is added to all namespaces.
	for ns := tg.pidns; ns != nil; ns = ns.parent {
		local := ns.tgids[tg]
		ns.sids[s] = SessionID(local)
		ns.sessions[SessionID(local)] = s
		ns.pgids[pg] = ProcessGroupID(local)
		ns.processGroups[ProcessGroupID(local)] = pg
	}

	// Disconnect from the controlling terminal.
	tg.tty = nil

	return nil
}

// CreateProcessGroup creates a new process group.
//
// An EPERM error will be returned if the ThreadGroup belongs to a different
// Session, is a Session leader or the group already exists.
func (tg *ThreadGroup) CreateProcessGroup() error {
	tg.pidns.owner.mu.Lock()
	defer tg.pidns.owner.mu.Unlock()

	// Get the ID for this thread in the current namespace.
	id := tg.pidns.tgids[tg]

	// Check whether a process still exists or not.
	if id == 0 {
		return linuxerr.ESRCH
	}

	// Per above, check for a Session leader or existing group.
	for s := tg.pidns.owner.sessions.Front(); s != nil; s = s.Next() {
		if s.leader.pidns != tg.pidns {
			continue
		}
		if s.leader == tg {
			return linuxerr.EPERM
		}
		for pg := s.processGroups.Front(); pg != nil; pg = pg.Next() {
			if pg.id == ProcessGroupID(id) {
				return linuxerr.EPERM
			}
		}
	}

	// Create a new ProcessGroup, belonging to the current Session.
	//
	// We manually adjust the ancestors if the parent is in the same
	// session.
	tg.processGroup.session.IncRef()
	pg := ProcessGroup{
		id:         ProcessGroupID(id),
		originator: tg,
		session:    tg.processGroup.session,
	}
	pg.refs.InitRefs()
	if tg.leader.parent != nil && tg.leader.parent.tg.processGroup.session == pg.session {
		pg.ancestors++
	}

	// Assign the new process group; adjust children.
	oldParentPG := tg.parentPG()
	tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
		childTG.processGroup.incRefWithParent(&pg)
		childTG.processGroup.decRefWithParent(oldParentPG)
	})
	tg.processGroup.decRefWithParent(oldParentPG)
	tg.processGroup = &pg

	// Add the new process group to the session.
	pg.session.processGroups.PushBack(&pg)

	// Ensure this translation is added to all namespaces.
	for ns := tg.pidns; ns != nil; ns = ns.parent {
		local := ns.tgids[tg]
		ns.pgids[&pg] = ProcessGroupID(local)
		ns.processGroups[ProcessGroupID(local)] = &pg
	}

	return nil
}

// JoinProcessGroup joins an existing process group.
//
// This function will return EACCES if an exec has been performed since fork
// by the given ThreadGroup, and EPERM if the Sessions are not the same or the
// group does not exist.
//
// If checkExec is set, then the join is not permitted after the process has
// executed exec at least once.
func (tg *ThreadGroup) JoinProcessGroup(pidns *PIDNamespace, pgid ProcessGroupID, checkExec bool) error {
	pidns.owner.mu.Lock()
	defer pidns.owner.mu.Unlock()

	// Lookup the ProcessGroup.
	pg := pidns.processGroups[pgid]
	if pg == nil {
		return linuxerr.EPERM
	}

	// Disallow the join if an execve has been performed, per POSIX.
	if checkExec && tg.execed {
		return linuxerr.EACCES
	}

	// See if it's in the same session as ours.
	if pg.session != tg.processGroup.session {
		return linuxerr.EPERM
	}

	// Join the group; adjust children.
	parentPG := tg.parentPG()
	pg.incRefWithParent(parentPG)
	tg.forEachChildThreadGroupLocked(func(childTG *ThreadGroup) {
		childTG.processGroup.incRefWithParent(pg)
		childTG.processGroup.decRefWithParent(tg.processGroup)
	})
	tg.processGroup.decRefWithParent(parentPG)
	tg.processGroup = pg

	return nil
}

// Session returns the ThreadGroup's Session.
//
// A reference is not taken on the session.
func (tg *ThreadGroup) Session() *Session {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	return tg.processGroup.session
}

// IDOfSession returns the ID of the Session s in PID namespace ns.
//
// If the session isn't visible in this namespace, zero will be returned. It
// is the caller's responsibility to check that before using this function.
func (ns *PIDNamespace) IDOfSession(s *Session) SessionID {
	ns.owner.mu.RLock()
	defer ns.owner.mu.RUnlock()
	return ns.sids[s]
}

// SessionWithID returns the Session with the given ID in the PID namespace ns,
// or nil if that given ID is not defined in this namespace.
//
// A reference is not taken on the session.
func (ns *PIDNamespace) SessionWithID(id SessionID) *Session {
	ns.owner.mu.RLock()
	defer ns.owner.mu.RUnlock()
	return ns.sessions[id]
}

// ProcessGroup returns the ThreadGroup's ProcessGroup.
//
// A reference is not taken on the process group.
func (tg *ThreadGroup) ProcessGroup() *ProcessGroup {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	return tg.processGroup
}

// IDOfProcessGroup returns the ID of the process group pg in PID namespace ns.
//
// The same constraints apply as for IDOfSession.
func (ns *PIDNamespace) IDOfProcessGroup(pg *ProcessGroup) ProcessGroupID {
	ns.owner.mu.RLock()
	defer ns.owner.mu.RUnlock()
	return ns.pgids[pg]
}

// ProcessGroupWithID returns the ProcessGroup with the given ID in the PID
// namespace ns, or nil if that given ID is not defined in this namespace.
//
// A reference is not taken on the process group.
func (ns *PIDNamespace) ProcessGroupWithID(id ProcessGroupID) *ProcessGroup {
	ns.owner.mu.RLock()
	defer ns.owner.mu.RUnlock()
	return ns.processGroups[id]
}
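// A process group's "ancestors" count tracks members whose parent sits in a
// different process group of the same session; when it reaches zero the group
// is an orphan, and handleOrphan delivers SIGHUP then SIGCONT if the group
// contains a stopped job. A standalone model of just that counting rule
// follows; the group type and session strings are hypothetical
// simplifications of ProcessGroup/Session.

package main

import "fmt"

type group struct {
	session   string
	ancestors int
}

// incRefWithParent mirrors ProcessGroup.incRefWithParent's rule: a joining
// member contributes an "ancestor" when its parent lives in a different group
// of the same session, or when it has no parent at all (init is counted as
// always having an ancestor).
func (g *group) incRefWithParent(parent *group) {
	if g != parent && (parent == nil || g.session == parent.session) {
		g.ancestors++
	}
}

func main() {
	shell := &group{session: "s1"}
	job := &group{session: "s1"}

	shell.incRefWithParent(nil)   // init-like leader: always counted
	job.incRefWithParent(shell)   // child whose parent is in another group, same session
	job.incRefWithParent(job)     // sibling whose parent is in the same group: not counted
	fmt.Println(shell.ancestors, job.ancestors) // 1 1

	// If the shell-side member later leaves (decRefWithParent), job's count
	// drops to zero: the job group becomes an orphan and handleOrphan runs.
}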
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/syserror"
)

// CopyInSigSet copies in a sigset_t, checks its size, and ensures that KILL and
// STOP are clear.
//
// TODO(gvisor.dev/issue/1624): This is only exported because
// syscalls/vfs2/signal.go depends on it. Once vfs1 is deleted and the vfs2
// syscalls are moved into this package, then they can be unexported.
func CopyInSigSet(t *kernel.Task, sigSetAddr hostarch.Addr, size uint) (linux.SignalSet, error) {
	if size != linux.SignalSetSize {
		return 0, linuxerr.EINVAL
	}
	b := t.CopyScratchBuffer(8)
	if _, err := t.CopyInBytes(sigSetAddr, b); err != nil {
		return 0, err
	}
	mask := hostarch.ByteOrder.Uint64(b[:])
	return linux.SignalSet(mask) &^ kernel.UnblockableSignals, nil
}

// copyOutSigSet copies out a sigset_t.
func copyOutSigSet(t *kernel.Task, sigSetAddr hostarch.Addr, mask linux.SignalSet) error {
	b := t.CopyScratchBuffer(8)
	hostarch.ByteOrder.PutUint64(b, uint64(mask))
	_, err := t.CopyOutBytes(sigSetAddr, b)
	return err
}

// copyInSigSetWithSize copies in a structure as below
//
//	struct {
//		sigset_t* sigset_addr;
//		size_t sizeof_sigset;
//	};
//
// and returns sigset_addr and size.
func copyInSigSetWithSize(t *kernel.Task, addr hostarch.Addr) (hostarch.Addr, uint, error) {
	switch t.Arch().Width() {
	case 8:
		in := t.CopyScratchBuffer(16)
		if _, err := t.CopyInBytes(addr, in); err != nil {
			return 0, 0, err
		}
		maskAddr := hostarch.Addr(hostarch.ByteOrder.Uint64(in[0:]))
		maskSize := uint(hostarch.ByteOrder.Uint64(in[8:]))
		return maskAddr, maskSize, nil
	default:
		return 0, 0, syserror.ENOSYS
	}
}
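// Both helpers above move an 8-byte sigset_t through an application buffer in
// the task's native byte order, and CopyInSigSet strips KILL and STOP on the
// way in. A standalone sketch of the same round trip using encoding/binary
// directly (little-endian is assumed, as on x86-64 and arm64); the
// unblockable mask here models kernel.UnblockableSignals.

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// In a sigset_t, bit N-1 represents signal N. SIGKILL is 9 and SIGSTOP
	// is 19, so these two bits can never be blocked.
	unblockable := uint64(1)<<(9-1) | uint64(1)<<(19-1)

	// copyOutSigSet direction: mask -> 8 raw bytes.
	mask := uint64(1)<<(10-1) | unblockable // ask to block SIGUSR1+SIGKILL+SIGSTOP
	var b [8]byte
	binary.LittleEndian.PutUint64(b[:], mask)

	// CopyInSigSet direction: 8 raw bytes -> mask, with KILL/STOP cleared.
	in := binary.LittleEndian.Uint64(b[:]) &^ unblockable
	fmt.Printf("%#x\n", in) // 0x200: only SIGUSR1 remains blocked
}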
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package auth

import (
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
)

// Credentials contains information required to authorize privileged operations
// in a user namespace.
//
// +stateify savable
type Credentials struct {
	// Real/effective/saved user/group IDs in the root user namespace. None of
	// these should ever be NoID.
	RealKUID      KUID
	EffectiveKUID KUID
	SavedKUID     KUID
	RealKGID      KGID
	EffectiveKGID KGID
	SavedKGID     KGID

	// Filesystem user/group IDs are not implemented. "... setfsuid() is
	// nowadays unneeded and should be avoided in new applications (likewise
	// for setfsgid(2))." - setfsuid(2)

	// Supplementary groups used by set/getgroups.
	//
	// ExtraKGIDs slices are immutable, allowing multiple Credentials with the
	// same ExtraKGIDs to share the same slice.
	ExtraKGIDs []KGID

	// The capability sets applicable to this set of credentials.
	PermittedCaps   CapabilitySet
	InheritableCaps CapabilitySet
	EffectiveCaps   CapabilitySet
	BoundingCaps    CapabilitySet
	// Ambient capabilities were not introduced until Linux 4.3.

	// KeepCaps is the flag for PR_SET_KEEPCAPS, which allows capabilities to
	// be maintained after a switch from root user to non-root user via
	// setuid().
	KeepCaps bool

	// The user namespace associated with the owner of the credentials.
	UserNamespace *UserNamespace
}

// NewAnonymousCredentials returns a set of credentials with no capabilities in
// any user namespace.
func NewAnonymousCredentials() *Credentials {
	// Create a new root user namespace. Since the new namespace's owner is
	// KUID 0 and the returned credentials have non-zero KUID/KGID, the
	// returned credentials do not have any capabilities in the new namespace.
	// Since the new namespace is not part of any existing user namespace
	// hierarchy, the returned credentials do not have any capabilities in any
	// other namespace.
return &Credentials{ RealKUID: NobodyKUID, EffectiveKUID: NobodyKUID, SavedKUID: NobodyKUID, RealKGID: NobodyKGID, EffectiveKGID: NobodyKGID, SavedKGID: NobodyKGID, UserNamespace: NewRootUserNamespace(), } } // NewRootCredentials returns a set of credentials with KUID and KGID 0 (i.e. // global root) in user namespace ns. func NewRootCredentials(ns *UserNamespace) *Credentials { // I can't find documentation for this anywhere, but it's correct for the // inheritable capability set to be initially empty (the capabilities test // checks for this property). return &Credentials{ RealKUID: RootKUID, EffectiveKUID: RootKUID, SavedKUID: RootKUID, RealKGID: RootKGID, EffectiveKGID: RootKGID, SavedKGID: RootKGID, PermittedCaps: AllCapabilities, EffectiveCaps: AllCapabilities, BoundingCaps: AllCapabilities, UserNamespace: ns, } } // NewUserCredentials returns a set of credentials based on the given UID, GIDs, // and capabilities in a given namespace. If all arguments are their zero // values, this returns the same credentials as NewRootCredentials. func NewUserCredentials(kuid KUID, kgid KGID, extraKGIDs []KGID, capabilities *TaskCapabilities, ns *UserNamespace) *Credentials { creds := NewRootCredentials(ns) // Set the UID. uid := kuid creds.RealKUID = uid creds.EffectiveKUID = uid creds.SavedKUID = uid // Set GID. gid := kgid creds.RealKGID = gid creds.EffectiveKGID = gid creds.SavedKGID = gid // Set additional GIDs. creds.ExtraKGIDs = append(creds.ExtraKGIDs, extraKGIDs...) // Set capabilities. if capabilities != nil { creds.PermittedCaps = capabilities.PermittedCaps creds.EffectiveCaps = capabilities.EffectiveCaps creds.BoundingCaps = capabilities.BoundingCaps creds.InheritableCaps = capabilities.InheritableCaps // TODO(gvisor.dev/issue/3166): Support ambient capabilities. } else { // If no capabilities are specified, grant capabilities consistent with // setresuid + setresgid from NewRootCredentials to the given uid and // gid. if kuid == RootKUID { creds.PermittedCaps = AllCapabilities creds.EffectiveCaps = AllCapabilities } else { creds.PermittedCaps = 0 creds.EffectiveCaps = 0 } creds.BoundingCaps = AllCapabilities } return creds } // Fork generates an identical copy of a set of credentials. func (c *Credentials) Fork() *Credentials { nc := new(Credentials) *nc = *c // Copy-by-value; this is legal for all fields. return nc } // InGroup returns true if c is in group kgid. Compare Linux's // kernel/groups.c:in_group_p(). func (c *Credentials) InGroup(kgid KGID) bool { if c.EffectiveKGID == kgid { return true } for _, extraKGID := range c.ExtraKGIDs { if extraKGID == kgid { return true } } return false } // HasCapabilityIn returns true if c has capability cp in ns. func (c *Credentials) HasCapabilityIn(cp linux.Capability, ns *UserNamespace) bool { for { // "1. A process has a capability inside a user namespace if it is a member // of that namespace and it has the capability in its effective capability // set." - user_namespaces(7) if c.UserNamespace == ns { return CapabilitySetOf(cp)&c.EffectiveCaps != 0 } // "3. ... A process that resides in the parent of the user namespace and // whose effective user ID matches the owner of the namespace has all // capabilities in the namespace." if c.UserNamespace == ns.parent && c.EffectiveKUID == ns.owner { return true } // "2. If a process has a capability in a user namespace, then it has that // capability in all child (and further removed descendant) namespaces as // well." 
if ns.parent == nil { return false } ns = ns.parent } } // HasCapability returns true if c has capability cp in its user namespace. func (c *Credentials) HasCapability(cp linux.Capability) bool { return c.HasCapabilityIn(cp, c.UserNamespace) } // UseUID checks that c can use uid in its user namespace, then translates it // to the root user namespace. // // The checks UseUID does are common, but you should verify that it's doing // exactly what you want. func (c *Credentials) UseUID(uid UID) (KUID, error) { // uid must be mapped. kuid := c.UserNamespace.MapToKUID(uid) if !kuid.Ok() { return NoID, linuxerr.EINVAL } // If c has CAP_SETUID, then it can use any UID in its user namespace. if c.HasCapability(linux.CAP_SETUID) { return kuid, nil } // Otherwise, c must already have the UID as its real, effective, or saved // set-user-ID. if kuid == c.RealKUID || kuid == c.EffectiveKUID || kuid == c.SavedKUID { return kuid, nil } return NoID, linuxerr.EPERM } // UseGID checks that c can use gid in its user namespace, then translates it // to the root user namespace. func (c *Credentials) UseGID(gid GID) (KGID, error) { kgid := c.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return NoID, linuxerr.EINVAL } if c.HasCapability(linux.CAP_SETGID) { return kgid, nil } if kgid == c.RealKGID || kgid == c.EffectiveKGID || kgid == c.SavedKGID { return kgid, nil } return NoID, linuxerr.EPERM } // SetUID translates the provided uid to the root user namespace and updates c's // uids to it. This performs no permissions or capabilities checks, the caller // is responsible for ensuring the calling context is permitted to modify c. func (c *Credentials) SetUID(uid UID) error { kuid := c.UserNamespace.MapToKUID(uid) if !kuid.Ok() { return linuxerr.EINVAL } c.RealKUID = kuid c.EffectiveKUID = kuid c.SavedKUID = kuid return nil } // SetGID translates the provided gid to the root user namespace and updates c's // gids to it. This performs no permissions or capabilities checks, the caller // is responsible for ensuring the calling context is permitted to modify c. func (c *Credentials) SetGID(gid GID) error { kgid := c.UserNamespace.MapToKGID(gid) if !kgid.Ok() { return linuxerr.EINVAL } c.RealKGID = kgid c.EffectiveKGID = kgid c.SavedKGID = kgid return nil }
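The namespace walk in HasCapabilityIn is easier to see with toy types. Below is a hedged sketch of the three user_namespaces(7) rules the loop applies; ns and hasCapIn are made-up names, not gVisor's types.

package main

import "fmt"

// ns is a toy user namespace: a parent pointer and the KUID of its creator.
type ns struct {
	parent *ns
	owner  int
}

// hasCapIn walks from target toward the root, mirroring the loop above.
func hasCapIn(capsIn *ns, effKUID int, target *ns, hasEffectiveCap bool) bool {
	for {
		// Rule 1: a member of the namespace with the cap in its effective set.
		if capsIn == target {
			return hasEffectiveCap
		}
		// Rule 3: a process in the parent namespace whose effective UID owns it.
		if target.parent == capsIn && effKUID == target.owner {
			return true
		}
		if target.parent == nil {
			return false
		}
		// Rule 2: capabilities propagate to all descendant namespaces.
		target = target.parent
	}
}

func main() {
	root := &ns{}
	child := &ns{parent: root, owner: 1000}
	grandchild := &ns{parent: child, owner: 0}
	fmt.Println(hasCapIn(root, 0, grandchild, true)) // true, via rule 2
}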
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"gvisor.dev/gvisor/pkg/sync"
)

// segmentQueue is a bounded, thread-safe queue of TCP segments.
//
// +stateify savable
type segmentQueue struct {
	mu     sync.Mutex  `state:"nosave"`
	list   segmentList `state:"wait"`
	ep     *endpoint
	frozen bool
}

// emptyLocked determines if the queue is empty.
//
// Preconditions: q.mu must be held.
func (q *segmentQueue) emptyLocked() bool {
	return q.list.Empty()
}

// empty determines if the queue is empty.
func (q *segmentQueue) empty() bool {
	q.mu.Lock()
	r := q.emptyLocked()
	q.mu.Unlock()

	return r
}

// enqueue adds the given segment to the queue.
//
// Returns true when the segment is successfully added to the queue, in which
// case ownership of the reference is transferred to the queue. Returns false
// if the queue is full, in which case ownership is retained by the caller.
func (q *segmentQueue) enqueue(s *segment) bool {
	// The endpoint's receive buffer size and memory usage must be read
	// without holding q.mu to avoid lock order inversion.
	bufSz := q.ep.ops.GetReceiveBufferSize()
	used := q.ep.receiveMemUsed()
	q.mu.Lock()
	// Allow zero-sized segments (ACKs/FINs/RSTs etc.) even if the segment
	// queue is currently full.
	allow := (used <= int(bufSz) || s.payloadSize() == 0) && !q.frozen

	if allow {
		q.list.PushBack(s)
		// Set the owner now that the endpoint owns the segment.
		s.setOwner(q.ep, recvQ)
	}
	q.mu.Unlock()

	return allow
}

// dequeue removes and returns the next segment from the queue, if one exists.
// Ownership is transferred to the caller, who is responsible for decrementing
// the ref count when done.
func (q *segmentQueue) dequeue() *segment {
	q.mu.Lock()
	s := q.list.Front()
	if s != nil {
		q.list.Remove(s)
	}
	q.mu.Unlock()

	return s
}

// freeze prevents any more segments from being added to the queue, i.e. all
// future calls to segmentQueue.enqueue will return false without adding the
// segment, until the queue is unfrozen with a corresponding segmentQueue.thaw
// call.
func (q *segmentQueue) freeze() {
	q.mu.Lock()
	q.frozen = true
	q.mu.Unlock()
}

// thaw unfreezes a queue previously frozen with segmentQueue.freeze() and
// allows new segments to be queued again.
func (q *segmentQueue) thaw() {
	q.mu.Lock()
	q.frozen = false
	q.mu.Unlock()
}
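To make the admission rule in enqueue concrete, here is a self-contained sketch with int payloads standing in for segments (boundedQueue is a made-up name, not the real segmentQueue): zero-sized items are admitted even when the queue is over its limit, and nothing is admitted while frozen.

package main

import (
	"fmt"
	"sync"
)

// boundedQueue models segmentQueue's admission rule.
type boundedQueue struct {
	mu     sync.Mutex
	items  []int
	used   int
	limit  int
	frozen bool
}

func (q *boundedQueue) enqueue(size int) bool {
	q.mu.Lock()
	defer q.mu.Unlock()
	// Zero-sized entries (think ACK/FIN/RST) bypass the size bound.
	allow := (q.used <= q.limit || size == 0) && !q.frozen
	if allow {
		q.items = append(q.items, size)
		q.used += size
	}
	return allow
}

func main() {
	q := &boundedQueue{limit: 1}
	fmt.Println(q.enqueue(2)) // true: under the limit when checked
	fmt.Println(q.enqueue(2)) // false: over the limit now
	fmt.Println(q.enqueue(0)) // true: zero-sized always fits
}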
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pipe

import (
	"unsafe"
)

// lockTwoPipes locks both x.mu and y.mu in an order that is guaranteed to be
// consistent for both lockTwoPipes(x, y) and lockTwoPipes(y, x), such that
// concurrent calls cannot deadlock.
//
// Preconditions: x != y.
// +checklocksacquire:x.mu
// +checklocksacquire:y.mu
func lockTwoPipes(x, y *Pipe) {
	// Lock the two pipes in order of increasing address.
	if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) {
		x.mu.Lock()
		y.mu.Lock()
	} else {
		y.mu.Lock()
		x.mu.Lock()
	}
}
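The same address-ordering trick, extracted into a runnable stdlib demo (lockBoth is a hypothetical stand-in for lockTwoPipes): whichever mutex lives at the lower address is locked first, so lockBoth(a, b) and lockBoth(b, a) cannot deadlock against each other.

package main

import (
	"fmt"
	"sync"
	"unsafe"
)

// lockBoth locks two mutexes in order of increasing address.
func lockBoth(x, y *sync.Mutex) {
	if uintptr(unsafe.Pointer(x)) < uintptr(unsafe.Pointer(y)) {
		x.Lock()
		y.Lock()
	} else {
		y.Lock()
		x.Lock()
	}
}

func main() {
	var a, b sync.Mutex
	var wg sync.WaitGroup
	for i := 0; i < 100; i++ {
		wg.Add(2)
		// Opposite argument orders would deadlock with naive locking.
		go func() { defer wg.Done(); lockBoth(&a, &b); b.Unlock(); a.Unlock() }()
		go func() { defer wg.Done(); lockBoth(&b, &a); a.Unlock(); b.Unlock() }()
	}
	wg.Wait()
	fmt.Println("no deadlock")
}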
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build linux

package syserr

import (
	"fmt"

	"golang.org/x/sys/unix"
)

const maxErrno = 134

type linuxHostTranslation struct {
	err *Error
	ok  bool
}

var linuxHostTranslations [maxErrno]linuxHostTranslation

// FromHost translates a unix.Errno to a corresponding Error value.
func FromHost(err unix.Errno) *Error {
	if int(err) >= len(linuxHostTranslations) || !linuxHostTranslations[err].ok {
		panic(fmt.Sprintf("unknown host errno %q (%d)", err.Error(), err))
	}
	return linuxHostTranslations[err].err
}

func addLinuxHostTranslation(host unix.Errno, trans *Error) {
	if linuxHostTranslations[host].ok {
		panic(fmt.Sprintf("duplicate translation for host errno %q (%d)", host.Error(), host))
	}
	linuxHostTranslations[host] = linuxHostTranslation{err: trans, ok: true}
}
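A stdlib-only sketch of the lookup-table pattern above: a fixed array indexed by errno, with an ok bit so a missing translation is distinguishable from a zero value. The names entry, add, and fromHost are illustrative, not the package's API.

package main

import "fmt"

const maxErrno = 134

// entry pairs a translation with an ok bit, like linuxHostTranslation.
type entry struct {
	name string
	ok   bool
}

var table [maxErrno]entry

// add registers a translation, panicking on duplicates at init time.
func add(errno int, name string) {
	if table[errno].ok {
		panic(fmt.Sprintf("duplicate translation for errno %d", errno))
	}
	table[errno] = entry{name: name, ok: true}
}

// fromHost looks up a translation, panicking on unknown errnos.
func fromHost(errno int) string {
	if errno >= len(table) || !table[errno].ok {
		panic(fmt.Sprintf("unknown host errno %d", errno))
	}
	return table[errno].name
}

func main() {
	add(2, "ENOENT")
	fmt.Println(fromHost(2))
}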
17 1 16 16 16 16 1 1 16 4 1 16 11 16 1 16 1 16 1 1 16 1 16 16 1 16 16 16 16 16 16 1 16 16 9 8 16 16 1 16 16 14 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs2 import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/marshal/primitive" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/eventfd" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/mm" slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/usermem" ) // IoSubmit implements linux syscall io_submit(2). func IoSubmit(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { id := args[0].Uint64() nrEvents := args[1].Int() addr := args[2].Pointer() if nrEvents < 0 { return 0, nil, linuxerr.EINVAL } for i := int32(0); i < nrEvents; i++ { // Copy in the callback address. var cbAddr hostarch.Addr switch t.Arch().Width() { case 8: var cbAddrP primitive.Uint64 if _, err := cbAddrP.CopyIn(t, addr); err != nil { if i > 0 { // Some successful. return uintptr(i), nil, nil } // Nothing done. return 0, nil, err } cbAddr = hostarch.Addr(cbAddrP) default: return 0, nil, syserror.ENOSYS } // Copy in this callback. var cb linux.IOCallback if _, err := cb.CopyIn(t, cbAddr); err != nil { if i > 0 { // Some have been successful. return uintptr(i), nil, nil } // Nothing done. return 0, nil, err } // Process this callback. if err := submitCallback(t, id, &cb, cbAddr); err != nil { if i > 0 { // Partial success. return uintptr(i), nil, nil } // Nothing done. return 0, nil, err } // Advance to the next one. addr += hostarch.Addr(t.Arch().Width()) } return uintptr(nrEvents), nil, nil } // submitCallback processes a single callback. func submitCallback(t *kernel.Task, id uint64, cb *linux.IOCallback, cbAddr hostarch.Addr) error { if cb.Reserved2 != 0 { return linuxerr.EINVAL } fd := t.GetFileVFS2(cb.FD) if fd == nil { return linuxerr.EBADF } defer fd.DecRef(t) // Was there an eventFD? Extract it. 
var eventFD *vfs.FileDescription if cb.Flags&linux.IOCB_FLAG_RESFD != 0 { eventFD = t.GetFileVFS2(cb.ResFD) if eventFD == nil { return linuxerr.EBADF } defer eventFD.DecRef(t) // Check that it is an eventfd. if _, ok := eventFD.Impl().(*eventfd.EventFileDescription); !ok { return linuxerr.EINVAL } } ioseq, err := memoryFor(t, cb) if err != nil { return err } // Check offset for reads/writes. switch cb.OpCode { case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: if cb.Offset < 0 { return linuxerr.EINVAL } } // Prepare the request. aioCtx, ok := t.MemoryManager().LookupAIOContext(t, id) if !ok { return linuxerr.EINVAL } if err := aioCtx.Prepare(); err != nil { return err } if eventFD != nil { // The request is set. Make sure there's a ref on the file. // // This is necessary when the callback executes on completion, // which is also what will release this reference. eventFD.IncRef() } // Perform the request asynchronously. fd.IncRef() t.QueueAIO(getAIOCallback(t, fd, eventFD, cbAddr, cb, ioseq, aioCtx)) return nil } func getAIOCallback(t *kernel.Task, fd, eventFD *vfs.FileDescription, cbAddr hostarch.Addr, cb *linux.IOCallback, ioseq usermem.IOSequence, aioCtx *mm.AIOContext) kernel.AIOCallback { return func(ctx context.Context) { // Release references after completing the callback. defer fd.DecRef(ctx) if eventFD != nil { defer eventFD.DecRef(ctx) } if aioCtx.Dead() { aioCtx.CancelPendingRequest() return } ev := &linux.IOEvent{ Data: cb.Data, Obj: uint64(cbAddr), } var err error switch cb.OpCode { case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PREADV: ev.Result, err = fd.PRead(ctx, ioseq, cb.Offset, vfs.ReadOptions{}) case linux.IOCB_CMD_PWRITE, linux.IOCB_CMD_PWRITEV: ev.Result, err = fd.PWrite(ctx, ioseq, cb.Offset, vfs.WriteOptions{}) case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC: err = fd.Sync(ctx) } // Update the result. if err != nil { err = slinux.HandleIOErrorVFS2(ctx, ev.Result != 0 /* partial */, err, nil /* never interrupted */, "aio", fd) ev.Result = -int64(kernel.ExtractErrno(err, 0)) } // Queue the result for delivery. aioCtx.FinishRequest(ev) // Notify the event file if one was specified. This needs to happen // *after* queueing the result to avoid racing with the thread we may // wake up. if eventFD != nil { eventFD.Impl().(*eventfd.EventFileDescription).Signal(1) } } } // memoryFor returns appropriate memory for the given callback. func memoryFor(t *kernel.Task, cb *linux.IOCallback) (usermem.IOSequence, error) { bytes := int(cb.Bytes) if bytes < 0 { // Linux also requires that this field fit in ssize_t. return usermem.IOSequence{}, linuxerr.EINVAL } // Since this I/O will be asynchronous with respect to t's task goroutine, // we have no guarantee that t's AddressSpace will be active during the // I/O. switch cb.OpCode { case linux.IOCB_CMD_PREAD, linux.IOCB_CMD_PWRITE: return t.SingleIOSequence(hostarch.Addr(cb.Buf), bytes, usermem.IOOpts{ AddressSpaceActive: false, }) case linux.IOCB_CMD_PREADV, linux.IOCB_CMD_PWRITEV: return t.IovecsIOSequence(hostarch.Addr(cb.Buf), bytes, usermem.IOOpts{ AddressSpaceActive: false, }) case linux.IOCB_CMD_FSYNC, linux.IOCB_CMD_FDSYNC, linux.IOCB_CMD_NOOP: return usermem.IOSequence{}, nil default: // Not a supported command. return usermem.IOSequence{}, linuxerr.EINVAL } }
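IoSubmit's copy-in loop above implements io_submit(2)'s partial-success convention: once at least one control block has been queued, a later failure reports the count queued so far instead of an error. A compact, stdlib-only sketch of just that convention (submitAll is a made-up name):

package main

import (
	"errors"
	"fmt"
)

// submitAll mirrors the loop in IoSubmit: a failure on the first operation
// surfaces the error, while a failure after at least one success returns the
// partial count with no error.
func submitAll(ops []func() error) (int, error) {
	for i, op := range ops {
		if err := op(); err != nil {
			if i > 0 {
				return i, nil // some succeeded: report partial count
			}
			return 0, err // nothing done: report the error
		}
	}
	return len(ops), nil
}

func main() {
	ok := func() error { return nil }
	fail := func() error { return errors.New("bad iocb") }
	fmt.Println(submitAll([]func() error{ok, fail, ok})) // 1 <nil>
	fmt.Println(submitAll([]func() error{fail, ok}))     // 0 bad iocb
}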
31 31 31 31 31 31 31 31 31 31 1 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 // Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stack import ( "math" "math/rand" "sync" "time" "gvisor.dev/gvisor/pkg/tcpip" ) const ( // defaultBaseReachableTime is the default base duration for computing the // random reachable time. // // Reachable time is the duration for which a neighbor is considered // reachable after a positive reachability confirmation is received. It is a // function of a uniformly distributed random value between the minimum and // maximum random factors, multiplied by the base reachable time. Using a // random component eliminates the possibility that Neighbor Unreachability // Detection messages will synchronize with each other. // // Default taken from REACHABLE_TIME of RFC 4861 section 10. defaultBaseReachableTime = 30 * time.Second // minimumBaseReachableTime is the minimum base duration for computing the // random reachable time. // // Minimum = 1ms minimumBaseReachableTime = time.Millisecond // defaultMinRandomFactor is the default minimum value of the random factor // used for computing reachable time. // // Default taken from MIN_RANDOM_FACTOR of RFC 4861 section 10. defaultMinRandomFactor = 0.5 // defaultMaxRandomFactor is the default maximum value of the random factor // used for computing reachable time. // // The default value depends on the value of MinRandomFactor. 
// If MinRandomFactor is less than MAX_RANDOM_FACTOR of RFC 4861 section 10, // the value from the RFC will be used; otherwise, the default is // MinRandomFactor multiplied by three. defaultMaxRandomFactor = 1.5 // defaultRetransmitTimer is the default amount of time to wait between // sending reachability probes. // // Default taken from RETRANS_TIMER of RFC 4861 section 10. defaultRetransmitTimer = time.Second // minimumRetransmitTimer is the minimum amount of time to wait between // sending reachability probes. // // Note, RFC 4861 does not impose a minimum Retransmit Timer, but we do here // to make sure the messages are not sent all at once. We also come to this // value because in the RetransmitTimer field of a Router Advertisement, a // value of 0 means unspecified, so the smallest valid value is 1. Note, the // unit of the RetransmitTimer field in the Router Advertisement is // milliseconds. minimumRetransmitTimer = time.Millisecond // defaultDelayFirstProbeTime is the default duration to wait for a // non-Neighbor-Discovery related protocol to reconfirm reachability after // entering the DELAY state. After this time, a reachability probe will be // sent and the entry will transition to the PROBE state. // // Default taken from DELAY_FIRST_PROBE_TIME of RFC 4861 section 10. defaultDelayFirstProbeTime = 5 * time.Second // defaultMaxMulticastProbes is the default number of reachabililty probes // to send before concluding negative reachability and deleting the neighbor // entry from the INCOMPLETE state. // // Default taken from MAX_MULTICAST_SOLICIT of RFC 4861 section 10. defaultMaxMulticastProbes = 3 // defaultMaxUnicastProbes is the default number of reachability probes to // send before concluding retransmission from within the PROBE state should // cease and the entry SHOULD be deleted. // // Default taken from MAX_UNICASE_SOLICIT of RFC 4861 section 10. defaultMaxUnicastProbes = 3 // defaultMaxAnycastDelayTime is the default time in which the stack SHOULD // delay sending a response for a random time between 0 and this time, if the // target address is an anycast address. // // Default taken from MAX_ANYCAST_DELAY_TIME of RFC 4861 section 10. defaultMaxAnycastDelayTime = time.Second // defaultMaxReachbilityConfirmations is the default amount of unsolicited // reachability confirmation messages a node MAY send to all-node multicast // address when it determines its link-layer address has changed. // // Default taken from MAX_NEIGHBOR_ADVERTISEMENT of RFC 4861 section 10. defaultMaxReachbilityConfirmations = 3 ) // NUDDispatcher is the interface integrators of netstack must implement to // receive and handle NUD related events. type NUDDispatcher interface { // OnNeighborAdded will be called when a new entry is added to a NIC's (with // ID nicID) neighbor table. // // This function is permitted to block indefinitely without interfering with // the stack's operation. // // May be called concurrently. OnNeighborAdded(tcpip.NICID, NeighborEntry) // OnNeighborChanged will be called when an entry in a NIC's (with ID nicID) // neighbor table changes state and/or link address. // // This function is permitted to block indefinitely without interfering with // the stack's operation. // // May be called concurrently. OnNeighborChanged(tcpip.NICID, NeighborEntry) // OnNeighborRemoved will be called when an entry is removed from a NIC's // (with ID nicID) neighbor table. // // This function is permitted to block indefinitely without interfering with // the stack's operation. 
	//
	// May be called concurrently.
	OnNeighborRemoved(tcpip.NICID, NeighborEntry)
}

// ReachabilityConfirmationFlags describes the flags used within a reachability
// confirmation (e.g. ARP reply or Neighbor Advertisement for ARP or NDP,
// respectively).
type ReachabilityConfirmationFlags struct {
	// Solicited indicates that the advertisement was sent in response to a
	// reachability probe.
	Solicited bool

	// Override indicates that the reachability confirmation should override an
	// existing neighbor cache entry and update the cached link-layer address.
	// When Override is not set the confirmation will not update a cached
	// link-layer address, but will update an existing neighbor cache entry for
	// which no link-layer address is known.
	Override bool

	// IsRouter indicates that the sender is a router.
	IsRouter bool
}

// NUDConfigurations is the NUD configurations for the netstack. This is used
// by the neighbor cache to operate the NUD state machine on each device in the
// local network.
type NUDConfigurations struct {
	// BaseReachableTime is the base duration for computing the random reachable
	// time.
	//
	// Reachable time is the duration for which a neighbor is considered
	// reachable after a positive reachability confirmation is received. It is a
	// function of a uniformly distributed random value between MinRandomFactor
	// and MaxRandomFactor multiplied by BaseReachableTime. Using a random
	// component eliminates the possibility that Neighbor Unreachability
	// Detection messages will synchronize with each other.
	//
	// After this time, a neighbor entry will transition from REACHABLE to STALE
	// state.
	//
	// Must be greater than 0.
	BaseReachableTime time.Duration

	// LearnBaseReachableTime enables learning BaseReachableTime during runtime
	// from the neighbor discovery protocol, if supported.
	//
	// TODO(gvisor.dev/issue/2240): Implement this NUD configuration option.
	LearnBaseReachableTime bool

	// MinRandomFactor is the minimum value of the random factor used for
	// computing reachable time.
	//
	// See BaseReachableTime for more information on computing the reachable
	// time.
	//
	// Must be greater than 0.
	MinRandomFactor float32

	// MaxRandomFactor is the maximum value of the random factor used for
	// computing reachable time.
	//
	// See BaseReachableTime for more information on computing the reachable
	// time.
	//
	// Must be greater than or equal to MinRandomFactor.
	MaxRandomFactor float32

	// RetransmitTimer is the duration between retransmission of reachability
	// probes in the PROBE state.
	RetransmitTimer time.Duration

	// LearnRetransmitTimer enables learning RetransmitTimer during runtime from
	// the neighbor discovery protocol, if supported.
	//
	// TODO(gvisor.dev/issue/2241): Implement this NUD configuration option.
	LearnRetransmitTimer bool

	// DelayFirstProbeTime is the duration to wait for a non-Neighbor-Discovery
	// related protocol to reconfirm reachability after entering the DELAY
	// state. After this time, a reachability probe will be sent and the entry
	// will transition to the PROBE state.
	//
	// Must be greater than 0.
	DelayFirstProbeTime time.Duration

	// MaxMulticastProbes is the number of reachability probes to send before
	// concluding negative reachability and deleting the neighbor entry from
	// the INCOMPLETE state.
	//
	// Must be greater than 0.
	MaxMulticastProbes uint32

	// MaxUnicastProbes is the number of reachability probes to send before
	// concluding retransmission from within the PROBE state should cease and
	// the entry SHOULD be deleted.
	//
	// Must be greater than 0.
	MaxUnicastProbes uint32

	// MaxAnycastDelayTime is the time in which the stack SHOULD delay sending
	// a response for a random time between 0 and this time, if the target
	// address is an anycast address.
	//
	// TODO(gvisor.dev/issue/2242): Use this option when sending solicited
	// neighbor confirmations to anycast addresses and proxying neighbor
	// confirmations.
	MaxAnycastDelayTime time.Duration

	// MaxReachabilityConfirmations is the number of unsolicited reachability
	// confirmation messages a node MAY send to the all-nodes multicast address
	// when it determines its link-layer address has changed.
	//
	// TODO(gvisor.dev/issue/2246): Discuss if implementation of this NUD
	// configuration option is necessary.
	MaxReachabilityConfirmations uint32
}

// DefaultNUDConfigurations returns a NUDConfigurations populated with the
// default values defined by RFC 4861 section 10.
func DefaultNUDConfigurations() NUDConfigurations {
	return NUDConfigurations{
		BaseReachableTime:            defaultBaseReachableTime,
		LearnBaseReachableTime:       true,
		MinRandomFactor:              defaultMinRandomFactor,
		MaxRandomFactor:              defaultMaxRandomFactor,
		RetransmitTimer:              defaultRetransmitTimer,
		LearnRetransmitTimer:         true,
		DelayFirstProbeTime:          defaultDelayFirstProbeTime,
		MaxMulticastProbes:           defaultMaxMulticastProbes,
		MaxUnicastProbes:             defaultMaxUnicastProbes,
		MaxAnycastDelayTime:          defaultMaxAnycastDelayTime,
		MaxReachabilityConfirmations: defaultMaxReachbilityConfirmations,
	}
}

// resetInvalidFields modifies an invalid NUDConfigurations with valid values.
// If invalid values are present in c, the corresponding default values will be
// used instead. This is needed to check, and conditionally fix, user-specified
// NUDConfigurations.
func (c *NUDConfigurations) resetInvalidFields() {
	if c.BaseReachableTime < minimumBaseReachableTime {
		c.BaseReachableTime = defaultBaseReachableTime
	}
	if c.MinRandomFactor <= 0 {
		c.MinRandomFactor = defaultMinRandomFactor
	}
	if c.MaxRandomFactor < c.MinRandomFactor {
		c.MaxRandomFactor = calcMaxRandomFactor(c.MinRandomFactor)
	}
	if c.RetransmitTimer < minimumRetransmitTimer {
		c.RetransmitTimer = defaultRetransmitTimer
	}
	if c.DelayFirstProbeTime == 0 {
		c.DelayFirstProbeTime = defaultDelayFirstProbeTime
	}
	if c.MaxMulticastProbes == 0 {
		c.MaxMulticastProbes = defaultMaxMulticastProbes
	}
	if c.MaxUnicastProbes == 0 {
		c.MaxUnicastProbes = defaultMaxUnicastProbes
	}
}

// calcMaxRandomFactor calculates the maximum value of the random factor used
// for computing reachable time. This function is necessary for when the
// default specified in RFC 4861 section 10 is less than the current
// MinRandomFactor.
//
// Assumes minRandomFactor is positive since validation of the minimum value
// should come before the validation of the maximum.
func calcMaxRandomFactor(minRandomFactor float32) float32 {
	if minRandomFactor > defaultMaxRandomFactor {
		return minRandomFactor * 3
	}
	return defaultMaxRandomFactor
}

// NUDState stores the state needed for computing reachable time.
type NUDState struct {
	clock tcpip.Clock
	rng   *rand.Rand

	mu struct {
		sync.RWMutex

		config NUDConfigurations

		// reachableTime is the duration to wait for a REACHABLE entry to
		// transition into STALE after inactivity. This value is calculated
		// with the algorithm defined in RFC 4861 section 6.3.2.
		reachableTime time.Duration

		expiration            time.Time
		prevBaseReachableTime time.Duration
		prevMinRandomFactor   float32
		prevMaxRandomFactor   float32
	}
}

// NewNUDState returns a new NUDState using c as the configuration and the
// specified random number generator for use in recomputing ReachableTime.
func NewNUDState(c NUDConfigurations, clock tcpip.Clock, rng *rand.Rand) *NUDState { s := &NUDState{ clock: clock, rng: rng, } s.mu.config = c return s } // Config returns the NUD configuration. func (s *NUDState) Config() NUDConfigurations { s.mu.RLock() defer s.mu.RUnlock() return s.mu.config } // SetConfig replaces the existing NUD configurations with c. func (s *NUDState) SetConfig(c NUDConfigurations) { s.mu.Lock() defer s.mu.Unlock() s.mu.config = c } // ReachableTime returns the duration to wait for a REACHABLE entry to // transition into STALE after inactivity. This value is recalculated for new // values of BaseReachableTime, MinRandomFactor, and MaxRandomFactor using the // algorithm defined in RFC 4861 section 6.3.2. func (s *NUDState) ReachableTime() time.Duration { s.mu.Lock() defer s.mu.Unlock() if s.clock.Now().After(s.mu.expiration) || s.mu.config.BaseReachableTime != s.mu.prevBaseReachableTime || s.mu.config.MinRandomFactor != s.mu.prevMinRandomFactor || s.mu.config.MaxRandomFactor != s.mu.prevMaxRandomFactor { s.recomputeReachableTimeLocked() } return s.mu.reachableTime } // recomputeReachableTimeLocked forces a recalculation of ReachableTime using // the algorithm defined in RFC 4861 section 6.3.2. // // This SHOULD automatically be invoked during certain situations, as per // RFC 4861 section 6.3.4: // // If the received Reachable Time value is non-zero, the host SHOULD set its // BaseReachableTime variable to the received value. If the new value // differs from the previous value, the host SHOULD re-compute a new random // ReachableTime value. ReachableTime is computed as a uniformly // distributed random value between MIN_RANDOM_FACTOR and MAX_RANDOM_FACTOR // times the BaseReachableTime. Using a random component eliminates the // possibility that Neighbor Unreachability Detection messages will // synchronize with each other. // // In most cases, the advertised Reachable Time value will be the same in // consecutive Router Advertisements, and a host's BaseReachableTime rarely // changes. In such cases, an implementation SHOULD ensure that a new // random value gets re-computed at least once every few hours. // // s.mu MUST be locked for writing. func (s *NUDState) recomputeReachableTimeLocked() { s.mu.prevBaseReachableTime = s.mu.config.BaseReachableTime s.mu.prevMinRandomFactor = s.mu.config.MinRandomFactor s.mu.prevMaxRandomFactor = s.mu.config.MaxRandomFactor randomFactor := s.mu.config.MinRandomFactor + s.rng.Float32()*(s.mu.config.MaxRandomFactor-s.mu.config.MinRandomFactor) // Check for overflow, given that minRandomFactor and maxRandomFactor are // guaranteed to be positive numbers. if math.MaxInt64/randomFactor < float32(s.mu.config.BaseReachableTime) { s.mu.reachableTime = time.Duration(math.MaxInt64) } else if randomFactor == 1 { // Avoid loss of precision when a large base reachable time is used. s.mu.reachableTime = s.mu.config.BaseReachableTime } else { reachableTime := int64(float32(s.mu.config.BaseReachableTime) * randomFactor) s.mu.reachableTime = time.Duration(reachableTime) } s.mu.expiration = s.clock.Now().Add(2 * time.Hour) }
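A stand-alone sketch of the RFC 4861 section 6.3.2 computation that recomputeReachableTimeLocked performs: reachable time is the base duration scaled by a uniformly distributed random factor. The overflow and precision special cases handled above are omitted here for brevity.

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// reachableTime computes base * uniform(minFactor, maxFactor).
func reachableTime(base time.Duration, minFactor, maxFactor float32, rng *rand.Rand) time.Duration {
	factor := minFactor + rng.Float32()*(maxFactor-minFactor)
	return time.Duration(float32(base) * factor)
}

func main() {
	rng := rand.New(rand.NewSource(1))
	for i := 0; i < 3; i++ {
		// With the defaults (30s base, factors 0.5..1.5) this lands in 15s..45s.
		fmt.Println(reachableTime(30*time.Second, 0.5, 1.5, rng))
	}
}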
8 8 12 12 8 8 7 1 6 2 5 3 2 6 3 4 3 1 1 2 97 96 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 // Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package tundev implements the /dev/net/tun device. package tundev import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/errors/linuxerr" "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs" "gvisor.dev/gvisor/pkg/sentry/inet" "gvisor.dev/gvisor/pkg/sentry/kernel" "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/tcpip/link/tun" "gvisor.dev/gvisor/pkg/usermem" "gvisor.dev/gvisor/pkg/waiter" ) const ( netTunDevMajor = 10 netTunDevMinor = 200 ) // tunDevice implements vfs.Device for /dev/net/tun. // // +stateify savable type tunDevice struct{} // Open implements vfs.Device.Open. func (tunDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { fd := &tunFD{} if err := fd.vfsfd.Init(fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{ UseDentryMetadata: true, }); err != nil { return nil, err } return &fd.vfsfd, nil } // tunFD implements vfs.FileDescriptionImpl for /dev/net/tun. // // +stateify savable type tunFD struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl vfs.DentryMetadataFileDescriptionImpl vfs.NoLockFD device tun.Device } // Ioctl implements vfs.FileDescriptionImpl.Ioctl. func (fd *tunFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) { request := args[1].Uint() data := args[2].Pointer() t := kernel.TaskFromContext(ctx) if t == nil { panic("Ioctl should be called from a task context") } switch request { case linux.TUNSETIFF: if !t.HasCapability(linux.CAP_NET_ADMIN) { return 0, linuxerr.EPERM } stack, ok := t.NetworkContext().(*netstack.Stack) if !ok { return 0, linuxerr.EINVAL } var req linux.IFReq if _, err := req.CopyIn(t, data); err != nil { return 0, err } // Validate flags. 
		flags, err := netstack.LinuxToTUNFlags(hostarch.ByteOrder.Uint16(req.Data[:]))
		if err != nil {
			return 0, err
		}
		return 0, fd.device.SetIff(stack.Stack, req.Name(), flags)

	case linux.TUNGETIFF:
		var req linux.IFReq
		copy(req.IFName[:], fd.device.Name())
		hostarch.ByteOrder.PutUint16(req.Data[:], netstack.TUNFlagsToLinux(fd.device.Flags()))
		_, err := req.CopyOut(t, data)
		return 0, err

	default:
		return 0, linuxerr.ENOTTY
	}
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *tunFD) Release(ctx context.Context) {
	fd.device.Release(ctx)
}

// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *tunFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	return fd.Read(ctx, dst, opts)
}

// Read implements vfs.FileDescriptionImpl.Read.
func (fd *tunFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	data, err := fd.device.Read()
	if err != nil {
		return 0, err
	}
	n, err := dst.CopyOut(ctx, data)
	if n > 0 && n < len(data) {
		// Not an error for partial copying. Packet truncated.
		err = nil
	}
	return int64(n), err
}

// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *tunFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	return fd.Write(ctx, src, opts)
}

// Write implements vfs.FileDescriptionImpl.Write.
func (fd *tunFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	data := make([]byte, src.NumBytes())
	if _, err := src.CopyIn(ctx, data); err != nil {
		return 0, err
	}
	return fd.device.Write(data)
}

// Readiness implements waiter.Waitable.Readiness.
func (fd *tunFD) Readiness(mask waiter.EventMask) waiter.EventMask {
	return fd.device.Readiness(mask)
}

// EventRegister implements waiter.Waitable.EventRegister.
func (fd *tunFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
	fd.device.EventRegister(e, mask)
}

// EventUnregister implements waiter.Waitable.EventUnregister.
func (fd *tunFD) EventUnregister(e *waiter.Entry) {
	fd.device.EventUnregister(e)
}

// IsNetTunSupported returns whether the /dev/net/tun device is supported for s.
func IsNetTunSupported(s inet.Stack) bool {
	_, ok := s.(*netstack.Stack)
	return ok
}

// Register registers all devices implemented by this package in vfsObj.
func Register(vfsObj *vfs.VirtualFilesystem) error {
	return vfsObj.RegisterDevice(vfs.CharDevice, netTunDevMajor, netTunDevMinor, tunDevice{}, &vfs.RegisterDeviceOptions{})
}

// CreateDevtmpfsFiles creates device special files in dev representing all
// devices implemented by this package.
func CreateDevtmpfsFiles(ctx context.Context, dev *devtmpfs.Accessor) error {
	return dev.CreateDeviceFile(ctx, "net/tun", vfs.CharDevice, netTunDevMajor, netTunDevMinor, 0666 /* mode */)
}
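The partial-copy handling in tunFD.Read above is the usual packet-read semantic: a short copy means the packet was truncated, not that the read failed. A trivial stdlib illustration (readPacket is a hypothetical name):

package main

import "fmt"

// readPacket copies a packet into dst; a short copy silently truncates.
func readPacket(packet, dst []byte) (int, error) {
	n := copy(dst, packet)
	// A short copy here means truncation, not failure, so no error is
	// returned and the caller sees only the bytes that fit.
	return n, nil
}

func main() {
	pkt := []byte{0xde, 0xad, 0xbe, 0xef}
	buf := make([]byte, 2)
	n, err := readPacket(pkt, buf)
	fmt.Println(n, err, buf) // 2 <nil> [222 173]
}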
31 30 18 11 15 1 14 15 1 14 5 3 2 5 3 2 2 31 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 // Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package icmp contains the implementation of the ICMP and IPv6-ICMP transport // protocols for use in ping. package icmp import ( "fmt" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/raw" "gvisor.dev/gvisor/pkg/waiter" ) const ( // ProtocolNumber4 is the ICMP protocol number. ProtocolNumber4 = header.ICMPv4ProtocolNumber // ProtocolNumber6 is the IPv6-ICMP protocol number. ProtocolNumber6 = header.ICMPv6ProtocolNumber ) // protocol implements stack.TransportProtocol. type protocol struct { stack *stack.Stack number tcpip.TransportProtocolNumber } // Number returns the ICMP protocol number. func (p *protocol) Number() tcpip.TransportProtocolNumber { return p.number } func (p *protocol) netProto() tcpip.NetworkProtocolNumber { switch p.number { case ProtocolNumber4: return header.IPv4ProtocolNumber case ProtocolNumber6: return header.IPv6ProtocolNumber } panic(fmt.Sprint("unknown protocol number: ", p.number)) } // NewEndpoint creates a new icmp endpoint. It implements // stack.TransportProtocol.NewEndpoint. func (p *protocol) NewEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { if netProto != p.netProto() { return nil, &tcpip.ErrUnknownProtocol{} } return newEndpoint(p.stack, netProto, p.number, waiterQueue) } // NewRawEndpoint creates a new raw icmp endpoint. It implements // stack.TransportProtocol.NewRawEndpoint. func (p *protocol) NewRawEndpoint(netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, tcpip.Error) { if netProto != p.netProto() { return nil, &tcpip.ErrUnknownProtocol{} } return raw.NewEndpoint(p.stack, netProto, p.number, waiterQueue) } // MinimumPacketSize returns the minimum valid icmp packet size. func (p *protocol) MinimumPacketSize() int { switch p.number { case ProtocolNumber4: return header.ICMPv4MinimumSize case ProtocolNumber6: return header.ICMPv6MinimumSize } panic(fmt.Sprint("unknown protocol number: ", p.number)) } // ParsePorts in case of ICMP sets src to 0, dst to ICMP ID, and err to nil. 
func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err tcpip.Error) { switch p.number { case ProtocolNumber4: hdr := header.ICMPv4(v) return 0, hdr.Ident(), nil case ProtocolNumber6: hdr := header.ICMPv6(v) return 0, hdr.Ident(), nil } panic(fmt.Sprint("unknown protocol number: ", p.number)) } // HandleUnknownDestinationPacket handles packets targeted at this protocol but // that don't match any existing endpoint. func (*protocol) HandleUnknownDestinationPacket(stack.TransportEndpointID, *stack.PacketBuffer) stack.UnknownDestinationPacketDisposition { return stack.UnknownDestinationPacketHandled } // SetOption implements stack.TransportProtocol.SetOption. func (*protocol) SetOption(tcpip.SettableTransportProtocolOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // Option implements stack.TransportProtocol.Option. func (*protocol) Option(tcpip.GettableTransportProtocolOption) tcpip.Error { return &tcpip.ErrUnknownProtocolOption{} } // Close implements stack.TransportProtocol.Close. func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} // Parse implements stack.TransportProtocol.Parse. func (*protocol) Parse(pkt *stack.PacketBuffer) bool { // Right now, the Parse() method is tied to enabled protocols passed into // stack.New. This works for UDP and TCP, but we handle ICMP traffic even // when netstack users don't pass ICMP as a supported protocol. return false } // NewProtocol4 returns an ICMPv4 transport protocol. func NewProtocol4(s *stack.Stack) stack.TransportProtocol { return &protocol{stack: s, number: ProtocolNumber4} } // NewProtocol6 returns an ICMPv6 transport protocol. func NewProtocol6(s *stack.Stack) stack.TransportProtocol { return &protocol{stack: s, number: ProtocolNumber6} }
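A stdlib-only sketch of what ParsePorts extracts for ICMPv4: the echo Identifier, which sits at bytes 4..5 of the ICMP header (after type, code, and checksum), serves as the destination "port", and the source is always 0. The name icmpPorts is illustrative.

package main

import (
	"encoding/binary"
	"fmt"
)

// icmpPorts mirrors ParsePorts for ICMPv4 echo messages: src is 0 and dst is
// the big-endian Identifier field at offset 4.
func icmpPorts(hdr []byte) (src, dst uint16) {
	return 0, binary.BigEndian.Uint16(hdr[4:6])
}

func main() {
	// Type=8 (echo request), code=0, checksum=0, ident=0x1234, seq=1.
	hdr := []byte{8, 0, 0, 0, 0x12, 0x34, 0, 1}
	fmt.Println(icmpPorts(hdr)) // 0 4660
}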
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pgalloc

import (
	"gvisor.dev/gvisor/pkg/context"
)

// contextID is this package's type for context.Context.Value keys.
type contextID int

const (
	// CtxMemoryFile is a Context.Value key for a MemoryFile.
	CtxMemoryFile contextID = iota

	// CtxMemoryFileProvider is a Context.Value key for a MemoryFileProvider.
	CtxMemoryFileProvider
)

// MemoryFileFromContext returns the MemoryFile used by ctx, or nil if no such
// MemoryFile exists.
func MemoryFileFromContext(ctx context.Context) *MemoryFile {
	if v := ctx.Value(CtxMemoryFile); v != nil {
		return v.(*MemoryFile)
	}
	return nil
}

// MemoryFileProviderFromContext returns the MemoryFileProvider used by ctx,
// or nil if no such MemoryFileProvider exists.
func MemoryFileProviderFromContext(ctx context.Context) MemoryFileProvider {
	if v := ctx.Value(CtxMemoryFileProvider); v != nil {
		return v.(MemoryFileProvider)
	}
	return nil
}
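The same typed-key pattern shown with the stdlib context package (ctxKey, keyValue, and valueFromContext are illustrative names; gVisor's context is a different type with the same Value semantics): an unexported key type prevents collisions, and a nil-checked type assertion mirrors MemoryFileFromContext.

package main

import (
	"context"
	"fmt"
)

// ctxKey is an unexported key type, so other packages cannot collide.
type ctxKey int

const keyValue ctxKey = iota

// valueFromContext retrieves the value, or a zero value if none is attached.
func valueFromContext(ctx context.Context) string {
	if v := ctx.Value(keyValue); v != nil {
		return v.(string)
	}
	return ""
}

func main() {
	ctx := context.WithValue(context.Background(), keyValue, "memfile")
	fmt.Println(valueFromContext(ctx))                         // memfile
	fmt.Println(valueFromContext(context.Background()) == "") // true
}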
// Copyright 2021 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package fpu provides basic floating point helpers.
package fpu

import (
	"fmt"
	"reflect"
)

// State represents floating point state.
//
// This is a simple byte slice, but may have architecture-specific methods
// attached to it.
type State []byte

// ErrLoadingState indicates a failed restore due to unusable floating point
// state.
type ErrLoadingState struct {
	// supportedFeatures is the supported floating point state.
	supportedFeatures uint64

	// savedFeatures is the saved floating point state.
	savedFeatures uint64
}

// Error returns a sensible description of the restore error.
func (e ErrLoadingState) Error() string {
	return fmt.Sprintf("floating point state contains unsupported features; supported: %#x saved: %#x", e.supportedFeatures, e.savedFeatures)
}

// alignedBytes returns a slice of size bytes, aligned in memory to the given
// alignment. This is used because we require certain structures to be aligned
// in a specific way (for example, the X86 floating point data).
func alignedBytes(size, alignment uint) []byte {
	data := make([]byte, size+alignment-1)
	offset := uint(reflect.ValueOf(data).Index(0).Addr().Pointer() % uintptr(alignment))
	if offset == 0 {
		return data[:size:size]
	}
	return data[alignment-offset:][:size:size]
}
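A runnable check of the over-allocate-and-reslice trick used by alignedBytes: allocate size+alignment-1 bytes, then slide forward to the first aligned byte, pinning the capacity so appends cannot spill past size. This sketch takes the address with unsafe instead of reflect, which is equivalent for the purpose of the demonstration.

package main

import (
	"fmt"
	"unsafe"
)

// alignedBytes returns a size-byte slice whose backing array starts at an
// address divisible by alignment.
func alignedBytes(size, alignment uint) []byte {
	data := make([]byte, size+alignment-1)
	offset := uint(uintptr(unsafe.Pointer(&data[0])) % uintptr(alignment))
	if offset == 0 {
		return data[:size:size]
	}
	return data[alignment-offset:][:size:size]
}

func main() {
	b := alignedBytes(512, 64)
	// Verify the alignment and the pinned length/capacity.
	fmt.Println(uintptr(unsafe.Pointer(&b[0]))%64 == 0, len(b), cap(b)) // true 512 512
}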
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"sync/atomic"
	"unsafe"

	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

type descriptorTable struct {
	// slice is a *[]unsafe.Pointer, where each element is actually a
	// *descriptor object, updated atomically.
	//
	// Changes to the slice itself require holding FDTable.mu.
	slice unsafe.Pointer `state:".(map[int32]*descriptor)"`
}

// initNoLeakCheck initializes the table without enabling leak checking.
//
// This is used when loading an FDTable after S/R, during which the ref count
// object itself will enable leak checking if necessary.
func (f *FDTable) initNoLeakCheck() {
	var slice []unsafe.Pointer // Empty slice.
	atomic.StorePointer(&f.slice, unsafe.Pointer(&slice))
}

// init initializes the table with leak checking.
func (f *FDTable) init() {
	f.initNoLeakCheck()
	f.InitRefs()
}

// get gets a file entry.
//
// The boolean indicates whether this was in range.
//
//go:nosplit
func (f *FDTable) get(fd int32) (*fs.File, FDFlags, bool) {
	file, _, flags, ok := f.getAll(fd)
	return file, flags, ok
}

// getVFS2 gets a file entry.
//
// The boolean indicates whether this was in range.
//
//go:nosplit
func (f *FDTable) getVFS2(fd int32) (*vfs.FileDescription, FDFlags, bool) {
	_, file, flags, ok := f.getAll(fd)
	return file, flags, ok
}

// getAll gets a file entry.
//
// The boolean indicates whether this was in range.
//
//go:nosplit
func (f *FDTable) getAll(fd int32) (*fs.File, *vfs.FileDescription, FDFlags, bool) {
	slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
	if fd >= int32(len(slice)) {
		return nil, nil, FDFlags{}, false
	}
	d := (*descriptor)(atomic.LoadPointer(&slice[fd]))
	if d == nil {
		return nil, nil, FDFlags{}, true
	}
	if d.file != nil && d.fileVFS2 != nil {
		panic("VFS1 and VFS2 files set")
	}
	return d.file, d.fileVFS2, d.flags, true
}

// CurrentMaxFDs returns the number of file descriptors that may be stored in f
// without reallocation.
func (f *FDTable) CurrentMaxFDs() int {
	slice := *(*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice))
	return len(slice)
}

// set sets an entry for VFS1; see setAll().
//
// Precondition: mu must be held.
func (f *FDTable) set(ctx context.Context, fd int32, file *fs.File, flags FDFlags) *fs.File { dropFile, _ := f.setAll(ctx, fd, file, nil, flags) return dropFile } // setVFS2 sets an entry for VFS2, refer to setAll(). // // Precondition: mu must be held. func (f *FDTable) setVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) *vfs.FileDescription { _, dropFile := f.setAll(ctx, fd, nil, file, flags) return dropFile } // setAll sets the file description referred to by fd to file/fileVFS2. If // file/fileVFS2 are non-nil, it takes a reference on them. If setAll replaces // an existing file description, it returns it with the FDTable's reference // transferred to the caller, which must call f.drop/dropVFS2() on the returned // file after unlocking f.mu. // // Precondition: mu must be held. func (f *FDTable) setAll(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) (*fs.File, *vfs.FileDescription) { if file != nil && fileVFS2 != nil { panic("VFS1 and VFS2 files set") } slicePtr := (*[]unsafe.Pointer)(atomic.LoadPointer(&f.slice)) // Grow the table as required. if last := int32(len(*slicePtr)); fd >= last { end := fd + 1 if end < 2*last { end = 2 * last } newSlice := append(*slicePtr, make([]unsafe.Pointer, end-last)...) slicePtr = &newSlice atomic.StorePointer(&f.slice, unsafe.Pointer(slicePtr)) } slice := *slicePtr var desc *descriptor if file != nil || fileVFS2 != nil { desc = &descriptor{ file: file, fileVFS2: fileVFS2, flags: flags, } } // Update the single element. orig := (*descriptor)(atomic.SwapPointer(&slice[fd], unsafe.Pointer(desc))) // Acquire a table reference. if desc != nil { switch { case desc.file != nil: if orig == nil || desc.file != orig.file { desc.file.IncRef() } case desc.fileVFS2 != nil: if orig == nil || desc.fileVFS2 != orig.fileVFS2 { desc.fileVFS2.IncRef() } } } // Adjust used. switch { case orig == nil && desc != nil: atomic.AddInt32(&f.used, 1) case orig != nil && desc == nil: atomic.AddInt32(&f.used, -1) } if orig != nil { switch { case orig.file != nil: if desc == nil || desc.file != orig.file { return orig.file, nil } case orig.fileVFS2 != nil: if desc == nil || desc.fileVFS2 != orig.fileVFS2 { return nil, orig.fileVFS2 } } } return nil, nil }
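The atomic pointer-to-slice scheme above allows lock-free readers while writers grow the table under FDTable.mu. A hedged, stdlib-only sketch of the publish-on-grow step (table, load, and grow are made-up names; the real code holds a mutex around growth, which is omitted here):

package main

import (
	"fmt"
	"sync/atomic"
	"unsafe"
)

// table holds an atomically swappable pointer to a slice header.
type table struct {
	slice unsafe.Pointer // *[]unsafe.Pointer
}

// load is the reader side: a single atomic load, no locks.
func (t *table) load() []unsafe.Pointer {
	return *(*[]unsafe.Pointer)(atomic.LoadPointer(&t.slice))
}

// grow publishes a longer slice. Readers of the old slice keep a consistent
// view; new readers observe the longer one.
func (t *table) grow(end int) {
	old := t.load()
	if end <= len(old) {
		return
	}
	newSlice := append(append([]unsafe.Pointer(nil), old...), make([]unsafe.Pointer, end-len(old))...)
	atomic.StorePointer(&t.slice, unsafe.Pointer(&newSlice))
}

func main() {
	t := &table{}
	empty := []unsafe.Pointer{}
	atomic.StorePointer(&t.slice, unsafe.Pointer(&empty))
	t.grow(8)
	fmt.Println(len(t.load())) // 8
}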
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

// This file implements the task exit cycle:
//
// - Tasks are asynchronously requested to exit with Task.Kill.
//
// - When able, the task goroutine enters the exit path starting from state
//   runExit.
//
// - Other tasks observe completed exits with Task.Wait (which implements the
//   wait*() family of syscalls).

import (
	"errors"
	"fmt"
	"strconv"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/waiter"
)

// TaskExitState represents a step in the task exit path.
//
// "Exiting" and "exited" are often ambiguous; prefer to name specific states.
type TaskExitState int

const (
	// TaskExitNone indicates that the task has not begun exiting.
	TaskExitNone TaskExitState = iota

	// TaskExitInitiated indicates that the task goroutine has entered the exit
	// path, and the task is no longer eligible to participate in group stops
	// or group signal handling. TaskExitInitiated is analogous to Linux's
	// PF_EXITING.
	TaskExitInitiated

	// TaskExitZombie indicates that the task has released its resources, and
	// the task no longer prevents a sibling thread from completing execve.
	TaskExitZombie

	// TaskExitDead indicates that the task's thread IDs have been released,
	// and the task no longer prevents its thread group leader from being
	// reaped. ("Reaping" refers to the transitioning of a task from
	// TaskExitZombie to TaskExitDead.)
	TaskExitDead
)

// String implements fmt.Stringer.
func (t TaskExitState) String() string {
	switch t {
	case TaskExitNone:
		return "TaskExitNone"
	case TaskExitInitiated:
		return "TaskExitInitiated"
	case TaskExitZombie:
		return "TaskExitZombie"
	case TaskExitDead:
		return "TaskExitDead"
	default:
		return strconv.Itoa(int(t))
	}
}

// killLocked marks t as killed by enqueueing a SIGKILL, without causing the
// thread-group-affecting side effects SIGKILL usually has.
//
// Preconditions: The signal mutex must be locked.
func (t *Task) killLocked() {
	// Clear killable stops.
	if t.stop != nil && t.stop.Killable() {
		t.endInternalStopLocked()
	}
	t.pendingSignals.enqueue(&linux.SignalInfo{
		Signo: int32(linux.SIGKILL),
		// Linux just sets SIGKILL in the pending signal bitmask without
		// enqueueing an actual siginfo, such that
		// kernel/signal.c:collect_signal() initializes si_code to SI_USER.
		Code: linux.SI_USER,
	}, nil)
	t.interrupt()
}

// killed returns true if t has a SIGKILL pending. killed is analogous to
// Linux's fatal_signal_pending().
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) killed() bool {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	return t.killedLocked()
}

func (t *Task) killedLocked() bool {
	return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0
}

// PrepareExit indicates an exit with the given status.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) PrepareExit(ws linux.WaitStatus) {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	t.exitStatus = ws
}

// PrepareGroupExit indicates a group exit with status ws to t's thread group.
//
// PrepareGroupExit is analogous to Linux's do_group_exit(), except that it
// does not tail-call do_exit(), and that it *does* set Task.exitStatus.
// (Linux does not do so until within do_exit(), since it reuses exit_code for
// ptrace.)
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) PrepareGroupExit(ws linux.WaitStatus) {
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	if t.tg.exiting || t.tg.execing != nil {
		// Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e.
		// this "group exit" is being executed by the killed sibling of an
		// execing task, then Task.Execve never set t.tg.exitStatus, so it's
		// still the zero value. This is consistent with Linux, both in intent
		// ("all other threads ... report death as if they exited via _exit(2)
		// with exit code 0" - ptrace(2), "execve under ptrace") and in
		// implementation (compare fs/exec.c:de_thread() =>
		// kernel/signal.c:zap_other_threads() and
		// kernel/exit.c:do_group_exit() =>
		// include/linux/sched.h:signal_group_exit()).
		t.exitStatus = t.tg.exitStatus
		return
	}
	t.tg.exiting = true
	t.tg.exitStatus = ws
	t.exitStatus = ws
	for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
		if sibling != t {
			sibling.killLocked()
		}
	}
}
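// The sketch below is illustrative only and is not part of the original
// source: it shows, under assumed names, how an exit_group-style syscall
// might drive PrepareGroupExit and then hand control to the exit path.
// exampleExitGroup and its parameters are hypothetical.
func exampleExitGroup(t *Task, code int32) taskRunState {
	// Mark the whole thread group as exiting with the given status; sibling
	// tasks are killed so they also enter the exit path.
	t.PrepareGroupExit(linux.WaitStatusExit(code))
	// Hand control to the exit path; the task goroutine proceeds from
	// runExit.
	return (*runExit)(nil)
}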
// Kill requests that all tasks in ts exit as if group exiting with status ws.
// Kill does not wait for tasks to exit.
//
// Kill has no analogue in Linux; it's provided for save/restore only.
func (ts *TaskSet) Kill(ws linux.WaitStatus) {
	ts.mu.Lock()
	defer ts.mu.Unlock()
	ts.Root.exiting = true
	for t := range ts.Root.tids {
		t.tg.signalHandlers.mu.Lock()
		if !t.tg.exiting {
			t.tg.exiting = true
			t.tg.exitStatus = ws
		}
		t.killLocked()
		t.tg.signalHandlers.mu.Unlock()
	}
}

// advanceExitStateLocked checks that t's current exit state is oldExit, then
// sets it to newExit. If t's current exit state is not oldExit,
// advanceExitStateLocked panics.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) {
	if t.exitState != oldExit {
		panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState))
	}
	t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit)
	t.exitState = newExit
}

// runExit is the entry point into the task exit path.
//
// +stateify savable
type runExit struct{}

func (*runExit) execute(t *Task) taskRunState {
	t.ptraceExit()
	return (*runExitMain)(nil)
}

// +stateify savable
type runExitMain struct{}

func (*runExitMain) execute(t *Task) taskRunState {
	t.traceExitEvent()
	lastExiter := t.exitThreadGroup()
	t.ResetKcov()

	// If the task has a cleartid, and the thread group wasn't killed by a
	// signal, handle that before releasing the MM.
	if t.cleartid != 0 {
		t.tg.signalHandlers.mu.Lock()
		signaled := t.tg.exiting && t.tg.exitStatus.Signaled()
		t.tg.signalHandlers.mu.Unlock()
		if !signaled {
			zero := ThreadID(0)
			if _, err := zero.CopyOut(t, t.cleartid); err == nil {
				t.Futex().Wake(t, t.cleartid, false, ^uint32(0), 1)
			}
			// If the CopyOut fails, there's nothing we can do.
		}
	}

	// Handle the robust futex list.
	t.exitRobustList()

	// Deactivate the address space and update max RSS before releasing the
	// task's MM.
	t.Deactivate()
	t.tg.pidns.owner.mu.Lock()
	t.updateRSSLocked()
	t.tg.pidns.owner.mu.Unlock()
	t.mu.Lock()
	t.image.release()
	t.mu.Unlock()

	// Releasing the MM unblocks a blocked CLONE_VFORK parent.
	t.unstopVforkParent()

	t.fsContext.DecRef(t)
	t.fdTable.DecRef(t)

	// Detach task from all cgroups. This must happen before potentially the
	// last ref to the cgroupfs mount is dropped below.
	t.LeaveCgroups()

	t.mu.Lock()
	if t.mountNamespaceVFS2 != nil {
		t.mountNamespaceVFS2.DecRef(t)
		t.mountNamespaceVFS2 = nil
	}
	t.ipcns.DecRef(t)
	t.mu.Unlock()

	// If this is the last task to exit from the thread group, release the
	// thread group's resources.
	if lastExiter {
		t.tg.Release(t)
	}

	// Detach tracees.
	t.exitPtrace()

	// Reparent the task's children.
	t.exitChildren()

	// Don't tail-call runExitNotify, as exitChildren may have initiated a stop
	// to wait for a PID namespace to die.
	return (*runExitNotify)(nil)
}

// exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread
// group that it is no longer eligible to participate in group activities. It
// returns true if t is the last task in its thread group to call
// exitThreadGroup.
func (t *Task) exitThreadGroup() bool {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	t.tg.signalHandlers.mu.Lock()
	// Can't defer unlock: see below.
	t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated)
	t.tg.activeTasks--
	last := t.tg.activeTasks == 0

	// Ensure that someone will handle the signals we can't.
	t.setSignalMaskLocked(^linux.SignalSet(0))

	// Check if this task's exit interacts with an initiated group stop.
	if !t.groupStopPending {
		t.tg.signalHandlers.mu.Unlock()
		return last
	}
	t.groupStopPending = false
	sig := t.tg.groupStopSignal
	notifyParent := t.participateGroupStopLocked()
	// signalStop must be called with t's signal mutex unlocked.
	t.tg.signalHandlers.mu.Unlock()
	if notifyParent && t.tg.leader.parent != nil {
		t.tg.leader.parent.signalStop(t, linux.CLD_STOPPED, int32(sig))
		t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop)
	}
	return last
}

func (t *Task) exitChildren() {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	newParent := t.findReparentTargetLocked()
	if newParent == nil {
		// "If the init process of a PID namespace terminates, the kernel
		// terminates all of the processes in the namespace via a SIGKILL
		// signal." - pid_namespaces(7)
		t.Debugf("Init process terminating, killing namespace")
		t.tg.pidns.exiting = true
		for other := range t.tg.pidns.tgids {
			if other == t.tg {
				continue
			}
			other.signalHandlers.mu.Lock()
			other.leader.sendSignalLocked(&linux.SignalInfo{
				Signo: int32(linux.SIGKILL),
			}, true /* group */)
			other.signalHandlers.mu.Unlock()
		}
		// TODO(b/37722272): The init process waits for all processes in the
		// namespace to exit before completing its own exit
		// (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all
		// other tasks in the namespace are dead, except possibly for this
		// thread group's leader (which can't be reaped until this task exits).
	}
	// This is correct even if newParent is nil (it ensures that children don't
	// wait for a parent to reap them.)
	for c := range t.children {
		if sig := c.ParentDeathSignal(); sig != 0 {
			siginfo := &linux.SignalInfo{
				Signo: int32(sig),
				Code:  linux.SI_USER,
			}
			siginfo.SetPID(int32(c.tg.pidns.tids[t]))
			siginfo.SetUID(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow()))
			c.tg.signalHandlers.mu.Lock()
			c.sendSignalLocked(siginfo, true /* group */)
			c.tg.signalHandlers.mu.Unlock()
		}
		c.reparentLocked(newParent)
		if newParent != nil {
			newParent.children[c] = struct{}{}
		}
	}
}

// findReparentTargetLocked returns the task to which t's children should be
// reparented. If no such task exists, findReparentTargetLocked returns nil.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) findReparentTargetLocked() *Task {
	// Reparent to any sibling in the same thread group that hasn't begun
	// exiting.
	if t2 := t.tg.anyNonExitingTaskLocked(); t2 != nil {
		return t2
	}
	// "A child process that is orphaned within the namespace will be
	// reparented to [the init process for the namespace] ..." -
	// pid_namespaces(7)
	if init := t.tg.pidns.tasks[InitTID]; init != nil {
		return init.tg.anyNonExitingTaskLocked()
	}
	return nil
}

func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task {
	for t := tg.tasks.Front(); t != nil; t = t.Next() {
		if t.exitState == TaskExitNone {
			return t
		}
	}
	return nil
}

// reparentLocked changes t's parent. The new parent may be nil.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) reparentLocked(parent *Task) {
	oldParent := t.parent
	t.parent = parent
	if oldParent != nil {
		delete(oldParent.children, t)
	}
	if parent != nil {
		parent.children[t] = struct{}{}
	}
	// If a thread group leader's parent changes, reset the thread group's
	// termination signal to SIGCHLD and re-check exit notification. (Compare
	// kernel/exit.c:reparent_leader().)
	if t != t.tg.leader {
		return
	}
	if oldParent == nil && parent == nil {
		return
	}
	if oldParent != nil && parent != nil && oldParent.tg == parent.tg {
		return
	}
	t.tg.terminationSignal = linux.SIGCHLD
	if t.exitParentNotified && !t.exitParentAcked {
		t.exitParentNotified = false
		t.exitNotifyLocked(false)
	}
}

// When a task exits, other tasks in the system, notably the task's parent and
// ptracer, may want to be notified. The exit notification system ensures that
// interested tasks receive signals and/or are woken from blocking calls to
// wait*() syscalls; these notifications must be resolved before exiting tasks
// can be reaped and disappear from the system.
//
// Each task may have a parent task and/or a tracer task. If both a parent and
// a tracer exist, they may be the same task, different tasks in the same
// thread group, or tasks in different thread groups. (In the last case, Linux
// refers to the task as being ptrace-reparented due to an implementation
// detail; we avoid this terminology to avoid confusion.)
//
// A thread group is *empty* if all non-leader tasks in the thread group are
// dead, and the leader is either a zombie or dead. The exit of a thread group
// leader is never waitable - by either the parent or tracer - until the thread
// group is empty.
//
// There are a few ways for an exit notification to be resolved:
//
// - The exit notification may be acknowledged by a call to Task.Wait with
//   WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall).
//
// - If the notified party is the parent, and the parent thread group is not
//   also the tracer thread group, and the notification signal is SIGCHLD, the
//   parent may explicitly ignore the notification (see quote in exitNotify).
//   Note that it's possible for the notified party to ignore the signal in
//   other cases, but the notification is only resolved under the above
//   conditions. (Actually, there is one exception; see the last paragraph of
//   the "leader, has tracer, tracer thread group is parent thread group" case
//   below.)
//
// - If the notified party is the parent, and the parent does not exist, the
//   notification is resolved as if ignored. (This is only possible in the
//   sentry. In Linux, the only task / thread group without a parent is global
//   init, and killing global init causes a kernel panic.)
//
// - If the notified party is a tracer, the tracer may detach the traced task.
//   (Zombie tasks cannot be ptrace-attached, so the reverse is not possible.)
//
// In addition, if the notified party is the parent, the parent may exit and
// cause the notifying task to be reparented to another thread group. This does
// not resolve the notification; instead, the notification must be resent to
// the new parent.
//
// The series of notifications generated for a given task's exit depend on
// whether it is a thread group leader; whether the task is ptraced; and, if
// so, whether the tracer thread group is the same as the parent thread group.
//
// - Non-leader, no tracer: No notification is generated; the task is reaped
//   immediately.
//
// - Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer
//   notification is resolved (by waiting or detaching), the task is reaped.
//   (For non-leaders, whether the tracer and parent thread groups are the same
//   is irrelevant.)
//
// - Leader, no tracer: The task remains a zombie, with no notification sent,
//   until all other tasks in the thread group are dead. (In Linux terms, this
//   condition is indicated by include/linux/sched.h:thread_group_empty();
//   tasks are removed from their thread_group list in
//   kernel/exit.c:release_task() => __exit_signal() => __unhash_process().)
//   Then the thread group's termination signal is sent to the parent. When the
//   parent notification is resolved (by waiting or ignoring), the task is
//   reaped.
//
// - Leader, has tracer, tracer thread group is not parent thread group:
//   SIGCHLD is sent to the tracer. When the tracer notification is resolved
//   (by waiting or detaching), and all other tasks in the thread group are
//   dead, the thread group's termination signal is sent to the parent. (Note
//   that the tracer cannot resolve the exit notification by waiting until the
//   thread group is empty.) When the parent notification is resolved, the task
//   is reaped.
//
// - Leader, has tracer, tracer thread group is parent thread group:
//
//   If all other tasks in the thread group are dead, the thread group's
//   termination signal is sent to the parent. At this point, the notification
//   can only be resolved by waiting. If the parent detaches from the task as a
//   tracer, the notification is not resolved, but the notification can now be
//   resolved by waiting or ignoring. When the parent notification is resolved,
//   the task is reaped.
//
//   If at least one task in the thread group is not dead, SIGCHLD is sent to
//   the parent. At this point, the notification cannot be resolved at all;
//   once the thread group becomes empty, it can be resolved only by waiting.
//   If the parent detaches from the task as a tracer before all remaining
//   tasks die, then exit notification proceeds as in the case where the leader
//   never had a tracer. If the parent detaches from the task as a tracer after
//   all remaining tasks die, the notification is not resolved, but the
//   notification can now be resolved by waiting or ignoring. When the parent
//   notification is resolved, the task is reaped.
//
// In both of the above cases, when the parent detaches from the task as a
// tracer while the thread group is empty, whether or not the parent resolves
// the notification by ignoring it is based on the parent's SIGCHLD signal
// action, whether or not the thread group's termination signal is SIGCHLD
// (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()).
//
// There is one final wrinkle: A leader can become a non-leader due to a
// sibling execve. In this case, the execing thread detaches the leader's
// tracer (if one exists) and reaps the leader immediately. In Linux, this is
// in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked().

// +stateify savable
type runExitNotify struct{}

func (*runExitNotify) execute(t *Task) taskRunState {
	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()
	t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie)
	t.tg.liveTasks--
	// Check if this completes a sibling's execve.
	if t.tg.execing != nil && t.tg.liveTasks == 1 {
		// execing blocks the addition of new tasks to the thread group, so
		// the sole living task must be the execing one.
		e := t.tg.execing
		e.tg.signalHandlers.mu.Lock()
		if _, ok := e.stop.(*execStop); ok {
			e.endInternalStopLocked()
		}
		e.tg.signalHandlers.mu.Unlock()
	}
	t.exitNotifyLocked(false)
	// The task goroutine will now exit.
	return nil
}

// exitNotifyLocked is called after changes to t's state that affect exit
// notification.
//
// If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace;
// thanks to Linux's haphazard implementation of this functionality, such cases
// determine whether parent notifications are ignored based on the parent's
// handling of SIGCHLD, regardless of what the exited task's thread group's
// termination signal is.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) exitNotifyLocked(fromPtraceDetach bool) {
	if t.exitState != TaskExitZombie {
		return
	}
	if !t.exitTracerNotified {
		t.exitTracerNotified = true
		tracer := t.Tracer()
		if tracer == nil {
			t.exitTracerAcked = true
		} else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg {
			// Don't set exitParentNotified if t is non-leader, even if the
			// tracer is in the parent thread group, so that if the parent
			// detaches the following call to exitNotifyLocked passes through
			// the !exitParentNotified case below and causes t to be reaped
			// immediately.
			//
			// Tracer notification doesn't care about
			// SIG_IGN/SA_NOCLDWAIT.
			tracer.tg.signalHandlers.mu.Lock()
			tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */)
			tracer.tg.signalHandlers.mu.Unlock()
			// Wake EventTraceeStop waiters as well since this task will never
			// ptrace-stop again.
			tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop)
		} else {
			// t is a leader and the tracer is in the parent thread group.
			t.exitParentNotified = true
			sig := linux.SIGCHLD
			if t.tg.tasksCount == 1 {
				sig = t.tg.terminationSignal
			}
			// This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either
			// (in Linux, the check in do_notify_parent() is gated by
			// !tsk->ptrace.)
			t.parent.tg.signalHandlers.mu.Lock()
			t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */)
			t.parent.tg.signalHandlers.mu.Unlock()
			// See below for rationale for this event mask.
			t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
		}
	}
	if t.exitTracerAcked && !t.exitParentNotified {
		if t != t.tg.leader {
			t.exitParentNotified = true
			t.exitParentAcked = true
		} else if t.tg.tasksCount == 1 {
			t.exitParentNotified = true
			if t.parent == nil {
				t.exitParentAcked = true
			} else {
				// "POSIX.1-2001 specifies that if the disposition of SIGCHLD is
				// set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see
				// sigaction(2)), then children that terminate do not become
				// zombies and a call to wait() or waitpid() will block until all
				// children have terminated, and then fail with errno set to
				// ECHILD. (The original POSIX standard left the behavior of
				// setting SIGCHLD to SIG_IGN unspecified. Note that even though
				// the default disposition of SIGCHLD is "ignore", explicitly
				// setting the disposition to SIG_IGN results in different
				// treatment of zombie process children.) Linux 2.6 conforms to
				// this specification." - wait(2)
				//
				// Some undocumented Linux-specific details:
				//
				// - All of the above is ignored if the termination signal isn't
				//   SIGCHLD.
				//
				// - SA_NOCLDWAIT causes the leader to be immediately reaped, but
				//   does not suppress the SIGCHLD.
				signalParent := t.tg.terminationSignal.IsValid()
				t.parent.tg.signalHandlers.mu.Lock()
				if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach {
					if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok {
						if act.Handler == linux.SIG_IGN {
							t.exitParentAcked = true
							signalParent = false
						} else if act.Flags&linux.SA_NOCLDWAIT != 0 {
							t.exitParentAcked = true
						}
					}
				}
				if signalParent {
					t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */)
				}
				t.parent.tg.signalHandlers.mu.Unlock()
				// If a task in the parent was waiting for a child group stop
				// or continue, it needs to be notified of the exit, because
				// there may be no remaining eligible tasks (so that wait
				// should return ECHILD).
				t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
			}
		}
	}
	if t.exitTracerAcked && t.exitParentAcked {
		t.advanceExitStateLocked(TaskExitZombie, TaskExitDead)
		for ns := t.tg.pidns; ns != nil; ns = ns.parent {
			tid := ns.tids[t]
			delete(ns.tasks, tid)
			delete(ns.tids, t)
			if t == t.tg.leader {
				delete(ns.tgids, t.tg)
			}
		}
		t.tg.exitedCPUStats.Accumulate(t.CPUStats())
		t.tg.ioUsage.Accumulate(t.ioUsage)
		t.tg.signalHandlers.mu.Lock()
		t.tg.tasks.Remove(t)
		t.tg.tasksCount--
		tc := t.tg.tasksCount
		t.tg.signalHandlers.mu.Unlock()
		if tc == 1 && t != t.tg.leader {
			// Our fromPtraceDetach doesn't matter here (in Linux terms, this
			// is via a call to release_task()).
			t.tg.leader.exitNotifyLocked(false)
		} else if tc == 0 {
			t.tg.processGroup.decRefWithParent(t.tg.parentPG())
		}
		if t.parent != nil {
			delete(t.parent.children, t)
			// Do not clear t.parent. It may still be needed after the task
			// has exited (for example, to perform ptrace access checks on
			// /proc/[pid] files).
		}
	}
}

// Preconditions: The TaskSet mutex must be locked.
func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *linux.SignalInfo {
	info := &linux.SignalInfo{
		Signo: int32(sig),
	}
	info.SetPID(int32(receiver.tg.pidns.tids[t]))
	info.SetUID(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
	if t.exitStatus.Signaled() {
		info.Code = linux.CLD_KILLED
		info.SetStatus(int32(t.exitStatus.TerminationSignal()))
	} else {
		info.Code = linux.CLD_EXITED
		info.SetStatus(int32(t.exitStatus.ExitStatus()))
	}
	// TODO(b/72102453): Set utime, stime.
	return info
}

// ExitStatus returns t's exit status, which is only guaranteed to be
// meaningful if t.ExitState() != TaskExitNone.
func (t *Task) ExitStatus() linux.WaitStatus {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	t.tg.signalHandlers.mu.Lock()
	defer t.tg.signalHandlers.mu.Unlock()
	return t.exitStatus
}

// ExitStatus returns the exit status that would be returned by a consuming
// wait*() on tg.
func (tg *ThreadGroup) ExitStatus() linux.WaitStatus {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	tg.signalHandlers.mu.Lock()
	defer tg.signalHandlers.mu.Unlock()
	if tg.exiting {
		return tg.exitStatus
	}
	return tg.leader.exitStatus
}

// TerminationSignal returns the thread group's termination signal, which is
// the signal that will be sent to its leader's parent when all threads have
// exited.
func (tg *ThreadGroup) TerminationSignal() linux.Signal {
	tg.pidns.owner.mu.RLock()
	defer tg.pidns.owner.mu.RUnlock()
	return tg.terminationSignal
}
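// The helper below is an illustrative sketch, not part of the original
// source. It shows that TaskExitState values are ordered and can be compared
// directly, as the wait path does with checks like
// "exitState >= TaskExitInitiated". The name exampleHasInitiatedExit is
// hypothetical.
func exampleHasInitiatedExit(t *Task) bool {
	// Any state past TaskExitNone means the task has entered the exit path
	// and can no longer participate in group stops.
	return t.ExitState() >= TaskExitInitiated
}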
// Task events that can be waited for.
const (
	// EventExit represents an exit notification generated for a child thread
	// group leader or a tracee under the conditions specified in the comment
	// above runExitNotify.
	EventExit waiter.EventMask = 1 << iota

	// EventChildGroupStop occurs when a child thread group completes a group
	// stop (i.e. all tasks in the child thread group have entered a stopped
	// state as a result of a group stop).
	EventChildGroupStop

	// EventTraceeStop occurs when a task that is ptraced by a task in the
	// notified thread group enters a ptrace stop (see ptrace(2)).
	EventTraceeStop

	// EventGroupContinue occurs when a child thread group, or a thread group
	// whose leader is ptraced by a task in the notified thread group, that had
	// initiated or completed a group stop leaves the group stop, due to the
	// child thread group or any task in the child thread group being sent
	// SIGCONT.
	EventGroupContinue
)

// WaitOptions controls the behavior of Task.Wait.
type WaitOptions struct {
	// If SpecificTID is non-zero, only events from the task with thread ID
	// SpecificTID are eligible to be waited for. SpecificTID is resolved in
	// the PID namespace of the waiter (the method receiver of Task.Wait). If
	// no such task exists, or that task would not otherwise be eligible to be
	// waited for by the waiting task, then there are no waitable tasks and
	// Wait will return ECHILD.
	SpecificTID ThreadID

	// If SpecificPGID is non-zero, only events from ThreadGroups with a
	// matching ProcessGroupID are eligible to be waited for. (Same
	// constraints as SpecificTID apply.)
	SpecificPGID ProcessGroupID

	// Terminology note: Per waitpid(2), "a clone child is one which delivers
	// no signal, or a signal other than SIGCHLD to its parent upon
	// termination." In Linux, termination signal is technically a per-task
	// property rather than a per-thread-group property. However, clone()
	// forces no termination signal for tasks created with CLONE_THREAD, and
	// execve() resets the termination signal to SIGCHLD, so all
	// non-group-leader threads have no termination signal and are therefore
	// "clone tasks".
	//
	// If NonCloneTasks is true, events from non-clone tasks are eligible to be
	// waited for.
	NonCloneTasks bool

	// If CloneTasks is true, events from clone tasks are eligible to be waited
	// for.
	CloneTasks bool

	// If SiblingChildren is true, events from children tasks of any task
	// in the thread group of the waiter are eligible to be waited for.
	SiblingChildren bool

	// Events is a bitwise combination of the events defined above that specify
	// what events are of interest to the call to Wait.
	Events waiter.EventMask

	// If ConsumeEvent is true, the Wait should consume the event such that it
	// cannot be returned by a future Wait. Note that if a task exit is
	// consumed in this way, in most cases the task will be reaped.
	ConsumeEvent bool

	// If BlockInterruptErr is not nil, Wait will block until either an event
	// is available or there are no tasks that could produce a waitable event;
	// if that blocking is interrupted, Wait returns BlockInterruptErr. If
	// BlockInterruptErr is nil, Wait will not block.
	BlockInterruptErr error
}

// Preconditions: The TaskSet mutex must be locked (for reading or writing).
func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace, tracee bool) bool {
	if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] {
		return false
	}
	if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] {
		return false
	}
	// Tracees are always eligible.
	if tracee {
		return true
	}
	if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD {
		return o.NonCloneTasks
	}
	return o.CloneTasks
}
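// The sketch below is illustrative only and is not part of the original
// source: it shows how a wait4-style caller might assemble WaitOptions and
// drive Task.Wait. exampleWait4 and its parameters are hypothetical; the
// choice of EINTR for BlockInterruptErr mirrors how wait4 is interrupted by
// signals, but is an assumption here.
func exampleWait4(t *Task, pid int32, block bool) (linux.WaitStatus, error) {
	opts := WaitOptions{
		// wait4 reaps non-clone children (those whose termination signal is
		// SIGCHLD) and consumes the event it returns.
		Events:        EventExit | EventChildGroupStop | EventGroupContinue,
		NonCloneTasks: true,
		ConsumeEvent:  true,
	}
	if pid > 0 {
		// Wait for a specific thread ID, resolved in the waiter's PID
		// namespace.
		opts.SpecificTID = ThreadID(pid)
	}
	if block {
		opts.BlockInterruptErr = linuxerr.EINTR
	}
	wr, err := t.Wait(&opts)
	if err != nil {
		return 0, err
	}
	return wr.Status, nil
}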
// ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g.
// waitpid(WNOHANG)) that find no waitable events, but determine that waitable
// events may exist in the future. (In contrast, if a non-blocking or blocking
// Wait determines that there are no tasks that can produce a waitable event,
// Task.Wait returns ECHILD.)
var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events")

// WaitResult contains information about a waited-for event.
type WaitResult struct {
	// Task is the task that reported the event.
	Task *Task

	// TID is the thread ID of Task in the PID namespace of the task that
	// called Wait (that is, the method receiver of the call to Task.Wait). TID
	// is provided because consuming exit waits cause the thread ID to be
	// deallocated.
	TID ThreadID

	// UID is the real UID of Task in the user namespace of the task that
	// called Wait.
	UID auth.UID

	// Event is exactly one of the events defined above.
	Event waiter.EventMask

	// Status is the wait status associated with the event.
	Status linux.WaitStatus
}

// Wait waits for an event from a thread group that is a child of t's thread
// group, or a task in such a thread group, or a task that is ptraced by t,
// subject to the options specified in opts.
func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) {
	if opts.BlockInterruptErr == nil {
		return t.waitOnce(opts)
	}
	w, ch := waiter.NewChannelEntry(nil)
	t.tg.eventQueue.EventRegister(&w, opts.Events)
	defer t.tg.eventQueue.EventUnregister(&w)
	for {
		wr, err := t.waitOnce(opts)
		if err != ErrNoWaitableEvent {
			// This includes err == nil.
			return wr, err
		}
		if err := t.Block(ch); err != nil {
			return wr, syserror.ConvertIntr(err, opts.BlockInterruptErr)
		}
	}
}

func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) {
	anyWaitableTasks := false

	t.tg.pidns.owner.mu.Lock()
	defer t.tg.pidns.owner.mu.Unlock()

	if opts.SiblingChildren {
		// We can wait on the children and tracees of any task in the
		// same thread group.
		for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() {
			wr, any := t.waitParentLocked(opts, parent)
			if wr != nil {
				return wr, nil
			}
			anyWaitableTasks = anyWaitableTasks || any
		}
	} else {
		// We can only wait on this task.
		var wr *WaitResult
		wr, anyWaitableTasks = t.waitParentLocked(opts, t)
		if wr != nil {
			return wr, nil
		}
	}

	if anyWaitableTasks {
		return nil, ErrNoWaitableEvent
	}
	return nil, linuxerr.ECHILD
}

// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) waitParentLocked(opts *WaitOptions, parent *Task) (*WaitResult, bool) {
	anyWaitableTasks := false

	for child := range parent.children {
		if !opts.matchesTask(child, parent.tg.pidns, false) {
			continue
		}
		// Non-leaders don't notify parents on exit and aren't eligible to
		// be waited on.
		if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked {
			anyWaitableTasks = true
			if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil {
				return wr, anyWaitableTasks
			}
		}
		// Check for group stops and continues. Tasks that have passed
		// TaskExitInitiated can no longer participate in group stops.
		if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 {
			continue
		}
		if child.exitState >= TaskExitInitiated {
			continue
		}
		// If the waiter is in the same thread group as the task's
		// tracer, do not report its group stops; they will be reported
		// as ptrace stops instead. This also skips checking for group
		// continues, but they'll be checked for when scanning tracees
		// below. (Per kernel/exit.c:wait_consider_task(): "If a
		// ptracer wants to distinguish the two events for its own
		// children, it should create a separate process which takes
		// the role of real parent.")
		if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg {
			continue
		}
		anyWaitableTasks = true
		if opts.Events&EventChildGroupStop != 0 {
			if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil {
				return wr, anyWaitableTasks
			}
		}
		if opts.Events&EventGroupContinue != 0 {
			if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil {
				return wr, anyWaitableTasks
			}
		}
	}
	for tracee := range parent.ptraceTracees {
		if !opts.matchesTask(tracee, parent.tg.pidns, true) {
			continue
		}
		// Non-leaders do notify tracers on exit.
		if opts.Events&EventExit != 0 && !tracee.exitTracerAcked {
			anyWaitableTasks = true
			if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil {
				return wr, anyWaitableTasks
			}
		}
		if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 {
			continue
		}
		if tracee.exitState >= TaskExitInitiated {
			continue
		}
		anyWaitableTasks = true
		if opts.Events&EventTraceeStop != 0 {
			if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil {
				return wr, anyWaitableTasks
			}
		}
		if opts.Events&EventGroupContinue != 0 {
			if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil {
				return wr, anyWaitableTasks
			}
		}
	}

	return nil, anyWaitableTasks
}
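// The helper below is an illustrative sketch, not part of the original
// source. It restates the zombie-leader rule enforced by
// waitCollectZombieLocked below: a thread group leader's exit is never
// waitable until the thread group is otherwise empty (tasksCount == 1). The
// name exampleLeaderWaitable is hypothetical, and the caller is assumed to
// hold the TaskSet mutex.
func exampleLeaderWaitable(t *Task) bool {
	if t != t.tg.leader {
		// Non-leader exits don't wait on thread group emptiness.
		return true
	}
	return t.tg.tasksCount == 1
}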
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult {
	if asPtracer && !target.exitTracerNotified {
		return nil
	}
	if !asPtracer && !target.exitParentNotified {
		return nil
	}
	// Zombied thread group leaders are never waitable until their thread group
	// is otherwise empty. Usually this is caught by the
	// target.exitParentNotified check above, but if t is both (in the thread
	// group of) target's tracer and parent, asPtracer may be true.
	if target == target.tg.leader && target.tg.tasksCount != 1 {
		return nil
	}
	pid := t.tg.pidns.tids[target]
	uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
	status := target.exitStatus
	if !opts.ConsumeEvent {
		return &WaitResult{
			Task:   target,
			TID:    pid,
			UID:    uid,
			Event:  EventExit,
			Status: status,
		}
	}
	// Surprisingly, the exit status reported by a non-consuming wait can
	// differ from that reported by a consuming wait; the latter will return
	// the group exit code if one is available.
	if target.tg.exiting {
		status = target.tg.exitStatus
	}
	// t may be (in the thread group of) target's parent, tracer, or both. We
	// don't need to check for !exitTracerAcked because tracees are detached
	// here, and we don't need to check for !exitParentAcked because zombies
	// will be reaped here.
	if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified {
		target.exitTracerAcked = true
		target.ptraceTracer.Store((*Task)(nil))
		delete(t.ptraceTracees, target)
	}
	if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified {
		target.exitParentAcked = true
		if target == target.tg.leader {
			// target.tg.exitedCPUStats doesn't include target.CPUStats() yet,
			// and won't until after target.exitNotifyLocked() (maybe). Include
			// target.CPUStats() explicitly. This is consistent with Linux,
			// which accounts an exited task's cputime to its thread group in
			// kernel/exit.c:release_task() => __exit_signal(), and uses
			// thread_group_cputime_adjusted() in wait_task_zombie().
			t.tg.childCPUStats.Accumulate(target.CPUStats())
			t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats)
			t.tg.childCPUStats.Accumulate(target.tg.childCPUStats)
			// Update t's child max resident set size. The size will be the
			// maximum of this thread's size and all its children's sizes.
			if t.tg.childMaxRSS < target.tg.maxRSS {
				t.tg.childMaxRSS = target.tg.maxRSS
			}
			if t.tg.childMaxRSS < target.tg.childMaxRSS {
				t.tg.childMaxRSS = target.tg.childMaxRSS
			}
		}
	}
	target.exitNotifyLocked(false)
	return &WaitResult{
		Task:   target,
		TID:    pid,
		UID:    uid,
		Event:  EventExit,
		Status: status,
	}
}

// updateRSSLocked updates t.tg.maxRSS.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) updateRSSLocked() {
	if mmMaxRSS := t.MemoryManager().MaxResidentSetSize(); t.tg.maxRSS < mmMaxRSS {
		t.tg.maxRSS = mmMaxRSS
	}
}

// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult {
	target.tg.signalHandlers.mu.Lock()
	defer target.tg.signalHandlers.mu.Unlock()
	if !target.tg.groupStopWaitable {
		return nil
	}
	pid := t.tg.pidns.tids[target]
	uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
	sig := target.tg.groupStopSignal
	if opts.ConsumeEvent {
		target.tg.groupStopWaitable = false
	}
	return &WaitResult{
		Task:   target,
		TID:    pid,
		UID:    uid,
		Event:  EventChildGroupStop,
		Status: linux.WaitStatusStopped(uint32(sig)),
	}
}

// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult {
	target.tg.signalHandlers.mu.Lock()
	defer target.tg.signalHandlers.mu.Unlock()
	if !target.tg.groupContWaitable {
		return nil
	}
	pid := t.tg.pidns.tids[target]
	uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
	if opts.ConsumeEvent {
		target.tg.groupContWaitable = false
	}
	return &WaitResult{
		Task:   target,
		TID:    pid,
		UID:    uid,
		Event:  EventGroupContinue,
		Status: linux.WaitStatusContinued(),
	}
}

// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult {
	target.tg.signalHandlers.mu.Lock()
	defer target.tg.signalHandlers.mu.Unlock()
	if target.stop == nil {
		return nil
	}
	if _, ok := target.stop.(*ptraceStop); !ok {
		return nil
	}
	if target.ptraceCode == 0 {
		return nil
	}
	pid := t.tg.pidns.tids[target]
	uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
	code := target.ptraceCode
	if opts.ConsumeEvent {
		target.ptraceCode = 0
	}
	return &WaitResult{
		Task:   target,
		TID:    pid,
		UID:    uid,
		Event:  EventTraceeStop,
		Status: linux.WaitStatusStopped(uint32(code)),
	}
}

// ExitState returns t's current progress through the exit path.
func (t *Task) ExitState() TaskExitState {
	t.tg.pidns.owner.mu.RLock()
	defer t.tg.pidns.owner.mu.RUnlock()
	return t.exitState
}

// ParentDeathSignal returns t's parent death signal.
func (t *Task) ParentDeathSignal() linux.Signal {
	t.mu.Lock()
	defer t.mu.Unlock()
	return t.parentDeathSignal
}

// SetParentDeathSignal sets t's parent death signal.
func (t *Task) SetParentDeathSignal(sig linux.Signal) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.parentDeathSignal = sig
}
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package netlink provides core functionality for netlink sockets.
package netlink

import (
	"io"
	"math"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/abi/linux/errno"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/hostarch"
	"gvisor.dev/gvisor/pkg/marshal"
	"gvisor.dev/gvisor/pkg/marshal/primitive"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/device"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/socket"
	"gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserr"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/usermem"
	"gvisor.dev/gvisor/pkg/waiter"
)

const sizeOfInt32 int = 4

const (
	// minSendBufferSize is the smallest size of a send buffer.
	minSendBufferSize = 4 << 10 // 4096 bytes.

	// defaultSendBufferSize is the default size for the send buffer.
	defaultSendBufferSize = 16 * 1024

	// maxSendBufferSize is the largest size a send buffer can grow to.
	maxSendBufferSize = 4 << 20 // 4MB
)

var errNoFilter = syserr.New("no filter attached", errno.ENOENT)

// netlinkSocketDevice is the netlink socket virtual device.
var netlinkSocketDevice = device.NewAnonDevice()

// LINT.IfChange

// Socket is the base socket type for netlink sockets.
//
// This implementation only supports userspace sending and receiving messages
// to/from the kernel.
//
// Socket implements socket.Socket and transport.Credentialer.
//
// +stateify savable
type Socket struct {
	fsutil.FilePipeSeek             `state:"nosave"`
	fsutil.FileNotDirReaddir        `state:"nosave"`
	fsutil.FileNoFsync              `state:"nosave"`
	fsutil.FileNoMMap               `state:"nosave"`
	fsutil.FileNoSplice             `state:"nosave"`
	fsutil.FileNoopFlush            `state:"nosave"`
	fsutil.FileUseInodeUnstableAttr `state:"nosave"`

	socketOpsCommon
}

// socketOpsCommon contains the socket operations common to VFS1 and VFS2.
//
// +stateify savable
type socketOpsCommon struct {
	socket.SendReceiveTimeout

	// ports provides netlink port allocation.
	ports *port.Manager

	// protocol is the netlink protocol implementation.
	protocol Protocol

	// skType is the socket type. This is either SOCK_DGRAM or SOCK_RAW for
	// netlink sockets.
	skType linux.SockType

	// ep is a datagram unix endpoint used to buffer messages sent from the
	// kernel to userspace. RecvMsg reads messages from this endpoint.
	ep transport.Endpoint

	// connection is the kernel's connection to ep, used to write messages
	// sent to userspace.
	connection transport.ConnectedEndpoint

	// mu protects the fields below.
	mu sync.Mutex `state:"nosave"`

	// bound indicates that portID is valid.
	bound bool

	// portID is the port ID allocated for this socket.
	portID int32

	// sendBufferSize is the send buffer "size". We don't actually have a
	// fixed buffer but only consume this many bytes.
	sendBufferSize uint32

	// filter indicates that this socket has a BPF filter "installed".
	//
	// TODO(gvisor.dev/issue/1119): We don't actually support filtering,
	// this is just bookkeeping for tracking add/remove.
	filter bool
}

var _ socket.Socket = (*Socket)(nil)
var _ transport.Credentialer = (*Socket)(nil)

// NewSocket creates a new Socket.
func NewSocket(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socket, *syserr.Error) {
	// Datagram endpoint used to buffer kernel -> user messages.
	ep := transport.NewConnectionless(t)

	// Bind the endpoint for good measure so we can connect to it. The
	// bound address will never be exposed.
	if err := ep.Bind(tcpip.FullAddress{Addr: "dummy"}, nil); err != nil {
		ep.Close(t)
		return nil, err
	}

	// Create a connection from which the kernel can write messages.
	connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t)
	if err != nil {
		ep.Close(t)
		return nil, err
	}

	return &Socket{
		socketOpsCommon: socketOpsCommon{
			ports:          t.Kernel().NetlinkPorts(),
			protocol:       protocol,
			skType:         skType,
			ep:             ep,
			connection:     connection,
			sendBufferSize: defaultSendBufferSize,
		},
	}, nil
}

// Release implements fs.FileOperations.Release.
func (s *socketOpsCommon) Release(ctx context.Context) {
	s.connection.Release(ctx)
	s.ep.Close(ctx)

	if s.bound {
		s.ports.Release(s.protocol.Protocol(), s.portID)
	}
}

// Readiness implements waiter.Waitable.Readiness.
func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask {
	// ep holds messages to be read and thus handles EventIn readiness.
	ready := s.ep.Readiness(mask)

	if mask&waiter.WritableEvents != 0 {
		// sendMsg handles messages synchronously and is thus always
		// ready for writing.
		ready |= waiter.WritableEvents
	}

	return ready
}

// EventRegister implements waiter.Waitable.EventRegister.
func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
	s.ep.EventRegister(e, mask)
	// Writable readiness never changes, so no registration is needed.
}

// EventUnregister implements waiter.Waitable.EventUnregister.
func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) {
	s.ep.EventUnregister(e)
}

// Passcred implements transport.Credentialer.Passcred.
func (s *socketOpsCommon) Passcred() bool {
	return s.ep.SocketOptions().GetPassCred()
}

// ConnectedPasscred implements transport.Credentialer.ConnectedPasscred.
func (s *socketOpsCommon) ConnectedPasscred() bool {
	// This socket is connected to the kernel, which doesn't need creds.
	//
	// This is arbitrary, as ConnectedPasscred on this type has no callers.
	return false
}

// Ioctl implements fs.FileOperations.Ioctl.
func (*Socket) Ioctl(context.Context, *fs.File, usermem.IO, arch.SyscallArguments) (uintptr, error) {
	// TODO(b/68878065): no ioctls supported.
	return 0, linuxerr.ENOTTY
}

// ExtractSockAddr extracts the SockAddrNetlink from b.
func ExtractSockAddr(b []byte) (*linux.SockAddrNetlink, *syserr.Error) {
	if len(b) < linux.SockAddrNetlinkSize {
		return nil, syserr.ErrBadAddress
	}

	var sa linux.SockAddrNetlink
	sa.UnmarshalUnsafe(b[:sa.SizeBytes()])

	if sa.Family != linux.AF_NETLINK {
		return nil, syserr.ErrInvalidArgument
	}

	return &sa, nil
}
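// The snippet below is an illustrative sketch, not part of the original
// source: it round-trips a linux.SockAddrNetlink through its marshalled form
// to show the byte layout ExtractSockAddr expects from bind(2)/connect(2)
// callers. exampleExtractSockAddr is a hypothetical name.
func exampleExtractSockAddr() (*linux.SockAddrNetlink, *syserr.Error) {
	sa := linux.SockAddrNetlink{
		Family: linux.AF_NETLINK,
		PortID: 1234,
	}
	// Marshal into the wire layout a userspace caller would pass.
	buf := make([]byte, sa.SizeBytes())
	sa.MarshalUnsafe(buf)
	// ExtractSockAddr validates the length and the address family.
	return ExtractSockAddr(buf)
}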
// bindPort binds this socket to a port, preferring 'port' if it is available.
//
// port of 0 defaults to the ThreadGroup ID.
//
// Preconditions: mu is held.
func (s *socketOpsCommon) bindPort(t *kernel.Task, port int32) *syserr.Error {
	if s.bound {
		// Re-binding is only allowed if the port doesn't change.
		if port != s.portID {
			return syserr.ErrInvalidArgument
		}
		return nil
	}

	if port == 0 {
		port = int32(t.ThreadGroup().ID())
	}
	port, ok := s.ports.Allocate(s.protocol.Protocol(), port)
	if !ok {
		return syserr.ErrBusy
	}

	s.portID = port
	s.bound = true
	return nil
}

// Bind implements socket.Socket.Bind.
func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
	a, err := ExtractSockAddr(sockaddr)
	if err != nil {
		return err
	}

	// No support for multicast groups yet.
	if a.Groups != 0 {
		return syserr.ErrPermissionDenied
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	return s.bindPort(t, int32(a.PortID))
}

// Connect implements socket.Socket.Connect.
func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
	a, err := ExtractSockAddr(sockaddr)
	if err != nil {
		return err
	}

	// No support for multicast groups yet.
	if a.Groups != 0 {
		return syserr.ErrPermissionDenied
	}

	s.mu.Lock()
	defer s.mu.Unlock()

	if a.PortID == 0 {
		// Netlink sockets default to connected to the kernel, but
		// connecting anyway automatically binds if not already bound.
		if !s.bound {
			// Pass port 0 to get an auto-selected port ID.
			return s.bindPort(t, 0)
		}
		return nil
	}

	// We don't support non-kernel destination ports. Linux returns EPERM
	// if applications attempt to do this without NL_CFG_F_NONROOT_SEND, so
	// we emulate that.
	return syserr.ErrPermissionDenied
}

// Accept implements socket.Socket.Accept.
func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
	// Netlink sockets never support accept.
	return 0, nil, 0, syserr.ErrNotSupported
}

// Listen implements socket.Socket.Listen.
func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
	// Netlink sockets never support listen.
	return syserr.ErrNotSupported
}

// Shutdown implements socket.Socket.Shutdown.
func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
	// Netlink sockets never support shutdown.
	return syserr.ErrNotSupported
}

// GetSockOpt implements socket.Socket.GetSockOpt.
func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
	switch level {
	case linux.SOL_SOCKET:
		switch name {
		case linux.SO_SNDBUF:
			if outLen < sizeOfInt32 {
				return nil, syserr.ErrInvalidArgument
			}
			s.mu.Lock()
			defer s.mu.Unlock()
			return primitive.AllocateInt32(int32(s.sendBufferSize)), nil

		case linux.SO_RCVBUF:
			if outLen < sizeOfInt32 {
				return nil, syserr.ErrInvalidArgument
			}
			// We don't have a limit on the receive buffer size.
			return primitive.AllocateInt32(math.MaxInt32), nil

		case linux.SO_PASSCRED:
			if outLen < sizeOfInt32 {
				return nil, syserr.ErrInvalidArgument
			}
			var passcred primitive.Int32
			if s.Passcred() {
				passcred = 1
			}
			return &passcred, nil

		default:
			socket.GetSockOptEmitUnimplementedEvent(t, name)
		}

	case linux.SOL_NETLINK:
		switch name {
		case linux.NETLINK_BROADCAST_ERROR,
			linux.NETLINK_CAP_ACK,
			linux.NETLINK_DUMP_STRICT_CHK,
			linux.NETLINK_EXT_ACK,
			linux.NETLINK_LIST_MEMBERSHIPS,
			linux.NETLINK_NO_ENOBUFS,
			linux.NETLINK_PKTINFO:
			t.Kernel().EmitUnimplementedEvent(t)
		}
	}
	// TODO(b/68878065): other sockopts are not supported.
	return nil, syserr.ErrProtocolNotAvailable
}

// SetSockOpt implements socket.Socket.SetSockOpt.
func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
	switch level {
	case linux.SOL_SOCKET:
		switch name {
		case linux.SO_SNDBUF:
			if len(opt) < sizeOfInt32 {
				return syserr.ErrInvalidArgument
			}
			size := hostarch.ByteOrder.Uint32(opt)
			if size < minSendBufferSize {
				size = minSendBufferSize
			} else if size > maxSendBufferSize {
				size = maxSendBufferSize
			}
			s.mu.Lock()
			s.sendBufferSize = size
			s.mu.Unlock()
			return nil

		case linux.SO_RCVBUF:
			if len(opt) < sizeOfInt32 {
				return syserr.ErrInvalidArgument
			}
			// We don't have a limit on the receive buffer size, so just
			// accept anything as valid for compatibility.
			return nil

		case linux.SO_PASSCRED:
			if len(opt) < sizeOfInt32 {
				return syserr.ErrInvalidArgument
			}
			passcred := hostarch.ByteOrder.Uint32(opt)
			s.ep.SocketOptions().SetPassCred(passcred != 0)
			return nil

		case linux.SO_ATTACH_FILTER:
			// TODO(gvisor.dev/issue/1119): We don't actually
			// support filtering. If this socket can't ever send
			// messages, then there is nothing to filter and we can
			// advertise support. Otherwise, be conservative and
			// return an error.
			if s.protocol.CanSend() {
				socket.SetSockOptEmitUnimplementedEvent(t, name)
				return syserr.ErrProtocolNotAvailable
			}

			s.mu.Lock()
			s.filter = true
			s.mu.Unlock()
			return nil

		case linux.SO_DETACH_FILTER:
			// TODO(gvisor.dev/issue/1119): See above.
			if s.protocol.CanSend() {
				socket.SetSockOptEmitUnimplementedEvent(t, name)
				return syserr.ErrProtocolNotAvailable
			}

			s.mu.Lock()
			filter := s.filter
			s.filter = false
			s.mu.Unlock()

			if !filter {
				return errNoFilter
			}
			return nil

		default:
			socket.SetSockOptEmitUnimplementedEvent(t, name)
		}

	case linux.SOL_NETLINK:
		switch name {
		case linux.NETLINK_ADD_MEMBERSHIP,
			linux.NETLINK_BROADCAST_ERROR,
			linux.NETLINK_CAP_ACK,
			linux.NETLINK_DROP_MEMBERSHIP,
			linux.NETLINK_DUMP_STRICT_CHK,
			linux.NETLINK_EXT_ACK,
			linux.NETLINK_LISTEN_ALL_NSID,
			linux.NETLINK_NO_ENOBUFS,
			linux.NETLINK_PKTINFO:
			t.Kernel().EmitUnimplementedEvent(t)
		}
	}
	// TODO(b/68878065): other sockopts are not supported.
	return syserr.ErrProtocolNotAvailable
}

// GetSockName implements socket.Socket.GetSockName.
func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	sa := &linux.SockAddrNetlink{
		Family: linux.AF_NETLINK,
		PortID: uint32(s.portID),
	}
	return sa, uint32(sa.SizeBytes()), nil
}

// GetPeerName implements socket.Socket.GetPeerName.
func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
	sa := &linux.SockAddrNetlink{
		Family: linux.AF_NETLINK,
		// TODO(b/68878065): Support non-kernel peers. For now the peer
		// must be the kernel.
		PortID: 0,
	}
	return sa, uint32(sa.SizeBytes()), nil
}

// RecvMsg implements socket.Socket.RecvMsg.
func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
	from := &linux.SockAddrNetlink{
		Family: linux.AF_NETLINK,
		PortID: 0,
	}
	fromLen := uint32(from.SizeBytes())

	trunc := flags&linux.MSG_TRUNC != 0

	r := unix.EndpointReader{
		Ctx:      t,
		Endpoint: s.ep,
		Peek:     flags&linux.MSG_PEEK != 0,
	}

	doRead := func() (int64, error) {
		return dst.CopyOutFrom(t, &r)
	}

	// If MSG_TRUNC is set with a zero byte destination then we still need
	// to read the message and discard it, or in the case where MSG_PEEK is
	// set, leave it be. In both cases the full message length must be
	// returned.
	if trunc && dst.Addrs.NumBytes() == 0 {
		doRead = func() (int64, error) {
			err := r.Truncate()
			// Always return zero for bytes read since the destination size is
			// zero.
			return 0, err
		}
	}

	if n, err := doRead(); err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 {
		var mflags int
		if n < int64(r.MsgSize) {
			mflags |= linux.MSG_TRUNC
		}
		if trunc {
			n = int64(r.MsgSize)
		}
		return int(n), mflags, from, fromLen, socket.ControlMessages{}, syserr.FromError(err)
	}

	// We'll have to block. Register for notification and keep trying to
	// receive all the data.
	e, ch := waiter.NewChannelEntry(nil)
	s.EventRegister(&e, waiter.ReadableEvents)
	defer s.EventUnregister(&e)

	for {
		if n, err := doRead(); err != syserror.ErrWouldBlock {
			var mflags int
			if n < int64(r.MsgSize) {
				mflags |= linux.MSG_TRUNC
			}
			if trunc {
				n = int64(r.MsgSize)
			}
			return int(n), mflags, from, fromLen, socket.ControlMessages{}, syserr.FromError(err)
		}

		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
			}
			return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
		}
	}
}

// Read implements fs.FileOperations.Read.
func (s *Socket) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
	if dst.NumBytes() == 0 {
		return 0, nil
	}
	return dst.CopyOutFrom(ctx, &unix.EndpointReader{
		Endpoint: s.ep,
	})
}

// kernelSCM implements control.SCMCredentials with credentials that represent
// the kernel itself rather than a Task.
//
// +stateify savable
type kernelSCM struct{}

// Equals implements transport.CredentialsControlMessage.Equals.
func (kernelSCM) Equals(oc transport.CredentialsControlMessage) bool {
	_, ok := oc.(kernelSCM)
	return ok
}

// Credentials implements control.SCMCredentials.Credentials.
func (kernelSCM) Credentials(*kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) {
	return 0, auth.RootUID, auth.RootGID
}

// kernelCreds is the concrete version of kernelSCM used in all creds.
var kernelCreds = &kernelSCM{}

// sendResponse sends the response messages in ms back to userspace.
func (s *socketOpsCommon) sendResponse(ctx context.Context, ms *MessageSet) *syserr.Error {
	// Linux combines multiple netlink messages into a single datagram.
	bufs := make([][]byte, 0, len(ms.Messages))
	for _, m := range ms.Messages {
		bufs = append(bufs, m.Finalize())
	}

	// All messages are from the kernel.
	cms := transport.ControlMessages{
		Credentials: kernelCreds,
	}

	if len(bufs) > 0 {
		// RecvMsg never receives the address, so we don't need to send
		// one.
		_, notify, err := s.connection.Send(ctx, bufs, cms, tcpip.FullAddress{})
		// If the buffer is full, we simply drop messages, just like
		// Linux.
		if err != nil && err != syserr.ErrWouldBlock {
			return err
		}
		if notify {
			s.connection.SendNotify()
		}
	}

	// N.B. multi-part messages should still send NLMSG_DONE even if
	// MessageSet contains no messages.
	//
	// N.B. NLMSG_DONE is always sent in a different datagram. See
	// net/netlink/af_netlink.c:netlink_dump.
	if ms.Multi {
		m := NewMessage(linux.NetlinkMessageHeader{
			Type:   linux.NLMSG_DONE,
			Flags:  linux.NLM_F_MULTI,
			Seq:    ms.Seq,
			PortID: uint32(ms.PortID),
		})

		// Add the dump_done_errno payload.
		m.Put(primitive.AllocateInt64(0))

		_, notify, err := s.connection.Send(ctx, [][]byte{m.Finalize()}, cms, tcpip.FullAddress{})
		if err != nil && err != syserr.ErrWouldBlock {
			return err
		}
		if notify {
			s.connection.SendNotify()
		}
	}

	return nil
}
m.Put(primitive.AllocateInt64(0)) _, notify, err := s.connection.Send(ctx, [][]byte{m.Finalize()}, cms, tcpip.FullAddress{}) if err != nil && err != syserr.ErrWouldBlock { return err } if notify { s.connection.SendNotify() } } return nil } func dumpErrorMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet, err *syserr.Error) { m := ms.AddMessage(linux.NetlinkMessageHeader{ Type: linux.NLMSG_ERROR, }) m.Put(&linux.NetlinkErrorMessage{ Error: int32(-err.ToLinux()), Header: hdr, }) } func dumpAckMesage(hdr linux.NetlinkMessageHeader, ms *MessageSet) { m := ms.AddMessage(linux.NetlinkMessageHeader{ Type: linux.NLMSG_ERROR, }) m.Put(&linux.NetlinkErrorMessage{ Error: 0, Header: hdr, }) } // processMessages handles each message in buf, passing it to the protocol // handler for final handling. func (s *socketOpsCommon) processMessages(ctx context.Context, buf []byte) *syserr.Error { for len(buf) > 0 { msg, rest, ok := ParseMessage(buf) if !ok { // Linux ignores messages that are too short. See // net/netlink/af_netlink.c:netlink_rcv_skb. break } buf = rest hdr := msg.Header() // Ignore control messages. if hdr.Type < linux.NLMSG_MIN_TYPE { continue } ms := NewMessageSet(s.portID, hdr.Seq) if err := s.protocol.ProcessMessage(ctx, msg, ms); err != nil { dumpErrorMesage(hdr, ms, err) } else if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK { dumpAckMesage(hdr, ms) } if err := s.sendResponse(ctx, ms); err != nil { return err } } return nil } // sendMsg is the core of message send, used for SendMsg and Write. func (s *socketOpsCommon) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) { dstPort := int32(0) if len(to) != 0 { a, err := ExtractSockAddr(to) if err != nil { return 0, err } // No support for multicast groups yet. if a.Groups != 0 { return 0, syserr.ErrPermissionDenied } dstPort = int32(a.PortID) } if dstPort != 0 { // Non-kernel destinations not supported yet. Treat as if // NL_CFG_F_NONROOT_SEND is not set. return 0, syserr.ErrPermissionDenied } s.mu.Lock() defer s.mu.Unlock() // For simplicity, and consistency with Linux, we copy in the entire // message up front. if src.NumBytes() > int64(s.sendBufferSize) { return 0, syserr.ErrMessageTooLong } buf := make([]byte, src.NumBytes()) n, err := src.CopyIn(ctx, buf) // io.EOF can be only returned if src is a file, this means that // sendMsg is called from splice and the error has to be ignored in // this case. if err == io.EOF { err = nil } if err != nil { // Don't partially consume messages. return 0, syserr.FromError(err) } if err := s.processMessages(ctx, buf); err != nil { return 0, err } return n, nil } // SendMsg implements socket.Socket.SendMsg. func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) { return s.sendMsg(t, src, to, flags, controlMessages) } // Write implements fs.FileOperations.Write. func (s *Socket) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) { n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{}) return int64(n), err.ToError() } // State implements socket.Socket.State. func (s *socketOpsCommon) State() uint32 { return s.ep.State() } // Type implements socket.Socket.Type. 
func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) { return linux.AF_NETLINK, s.skType, s.protocol.Protocol() } // LINT.ThenChange(./socket_vfs2.go)
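The parse loop in processMessages above relies on the standard netlink wire format: a 16-byte nlmsghdr (length, type, flags, sequence, port ID) followed by a payload, with each message padded to 4-byte alignment so several messages can be packed back to back in one datagram. The following is a minimal, self-contained sketch of that framing, assuming little-endian encoding; the parseHeader helper and its exact handling of malformed input are assumptions of this sketch, not gVisor's actual ParseMessage.

package main

import (
	"encoding/binary"
	"fmt"
)

// nlMsgHdr mirrors the standard 16-byte netlink message header
// (struct nlmsghdr). Illustrative only.
type nlMsgHdr struct {
	Len    uint32
	Type   uint16
	Flags  uint16
	Seq    uint32
	PortID uint32
}

const (
	nlMsgHdrLen  = 16
	nlMsgAlignTo = 4
)

// nlMsgAlign rounds n up to the next 4-byte boundary.
func nlMsgAlign(n uint32) uint32 {
	return (n + nlMsgAlignTo - 1) &^ (nlMsgAlignTo - 1)
}

// parseHeader (hypothetical) extracts one message and returns the rest,
// mimicking the "msg, rest, ok := ParseMessage(buf)" shape above.
func parseHeader(buf []byte) (hdr nlMsgHdr, payload, rest []byte, ok bool) {
	if len(buf) < nlMsgHdrLen {
		return nlMsgHdr{}, nil, nil, false
	}
	hdr.Len = binary.LittleEndian.Uint32(buf[0:4])
	hdr.Type = binary.LittleEndian.Uint16(buf[4:6])
	hdr.Flags = binary.LittleEndian.Uint16(buf[6:8])
	hdr.Seq = binary.LittleEndian.Uint32(buf[8:12])
	hdr.PortID = binary.LittleEndian.Uint32(buf[12:16])
	if hdr.Len < nlMsgHdrLen || uint32(len(buf)) < hdr.Len {
		return nlMsgHdr{}, nil, nil, false
	}
	payload = buf[nlMsgHdrLen:hdr.Len]
	// The next message begins at the aligned end of this one.
	next := nlMsgAlign(hdr.Len)
	if uint32(len(buf)) < next {
		next = uint32(len(buf))
	}
	return hdr, payload, buf[next:], true
}

func main() {
	// Two back-to-back messages: a header-only message, then one with a
	// 3-byte payload padded out to 4 bytes.
	buf := make([]byte, 16+20)
	binary.LittleEndian.PutUint32(buf[0:4], 16)
	binary.LittleEndian.PutUint16(buf[4:6], 0x12)
	binary.LittleEndian.PutUint32(buf[16:20], 19)
	for len(buf) > 0 {
		hdr, payload, rest, ok := parseHeader(buf)
		if !ok {
			break // Too short: ignored, like netlink_rcv_skb.
		}
		buf = rest
		fmt.Printf("type=%#x len=%d payload=%d bytes\n", hdr.Type, hdr.Len, len(payload))
	}
}

Note how a short or inconsistent header breaks the loop instead of returning an error, mirroring the "Linux ignores messages that are too short" comment in processMessages.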
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcpip

import (
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/sync"
)

// SocketOptionsHandler holds methods that help define endpoint specific
// behavior for socket level socket options. These must be implemented by
// endpoints to get notified when socket level options are set.
type SocketOptionsHandler interface {
	// OnReuseAddressSet is invoked when SO_REUSEADDR is set for an endpoint.
	OnReuseAddressSet(v bool)

	// OnReusePortSet is invoked when SO_REUSEPORT is set for an endpoint.
	OnReusePortSet(v bool)

	// OnKeepAliveSet is invoked when SO_KEEPALIVE is set for an endpoint.
	OnKeepAliveSet(v bool)

	// OnDelayOptionSet is invoked when TCP_NODELAY is set for an endpoint.
	// Note that v will be the inverse of the TCP_NODELAY option.
	OnDelayOptionSet(v bool)

	// OnCorkOptionSet is invoked when TCP_CORK is set for an endpoint.
	OnCorkOptionSet(v bool)

	// LastError is invoked when SO_ERROR is read for an endpoint.
	LastError() Error

	// UpdateLastError updates the endpoint specific last error field.
	UpdateLastError(err Error)

	// HasNIC is invoked to check if the NIC is valid for SO_BINDTODEVICE.
	HasNIC(v int32) bool

	// OnSetSendBufferSize is invoked when the send buffer size for an endpoint
	// is changed. The handler is invoked with the new value for the socket
	// send buffer size and returns the value that was actually set.
	OnSetSendBufferSize(v int64) (newSz int64)

	// OnSetReceiveBufferSize is invoked when SO_RCVBUF is set for an endpoint.
	OnSetReceiveBufferSize(v, oldSz int64) (newSz int64)
}

// DefaultSocketOptionsHandler is an embeddable type that provides no-op
// implementations for SocketOptionsHandler methods.
type DefaultSocketOptionsHandler struct{}

var _ SocketOptionsHandler = (*DefaultSocketOptionsHandler)(nil)

// OnReuseAddressSet implements SocketOptionsHandler.OnReuseAddressSet.
func (*DefaultSocketOptionsHandler) OnReuseAddressSet(bool) {}

// OnReusePortSet implements SocketOptionsHandler.OnReusePortSet.
func (*DefaultSocketOptionsHandler) OnReusePortSet(bool) {}

// OnKeepAliveSet implements SocketOptionsHandler.OnKeepAliveSet.
func (*DefaultSocketOptionsHandler) OnKeepAliveSet(bool) {}

// OnDelayOptionSet implements SocketOptionsHandler.OnDelayOptionSet.
func (*DefaultSocketOptionsHandler) OnDelayOptionSet(bool) {}

// OnCorkOptionSet implements SocketOptionsHandler.OnCorkOptionSet.
func (*DefaultSocketOptionsHandler) OnCorkOptionSet(bool) {}

// LastError implements SocketOptionsHandler.LastError.
func (*DefaultSocketOptionsHandler) LastError() Error {
	return nil
}

// UpdateLastError implements SocketOptionsHandler.UpdateLastError.
func (*DefaultSocketOptionsHandler) UpdateLastError(Error) {}

// HasNIC implements SocketOptionsHandler.HasNIC.
func (*DefaultSocketOptionsHandler) HasNIC(int32) bool {
	return false
}

// OnSetSendBufferSize implements SocketOptionsHandler.OnSetSendBufferSize.
func (*DefaultSocketOptionsHandler) OnSetSendBufferSize(v int64) (newSz int64) {
	return v
}

// OnSetReceiveBufferSize implements SocketOptionsHandler.OnSetReceiveBufferSize.
func (*DefaultSocketOptionsHandler) OnSetReceiveBufferSize(v, oldSz int64) (newSz int64) {
	return v
}

// StackHandler holds methods to access the stack options. These must be
// implemented by the stack.
type StackHandler interface {
	// Option allows retrieving stack wide options.
	Option(option interface{}) Error

	// TransportProtocolOption allows retrieving individual protocol level
	// option values.
	TransportProtocolOption(proto TransportProtocolNumber, option GettableTransportProtocolOption) Error
}

// SocketOptions contains all the variables which store values for SOL_SOCKET,
// SOL_IP, SOL_IPV6 and SOL_TCP level options.
//
// +stateify savable
type SocketOptions struct {
	handler SocketOptionsHandler

	// stackHandler is initialized at creation time and will not change.
	stackHandler StackHandler `state:"manual"`

	// These fields are accessed and modified using atomic operations.

	// broadcastEnabled determines whether datagram sockets are allowed to
	// send packets to a broadcast address.
	broadcastEnabled uint32

	// passCredEnabled determines whether SCM_CREDENTIALS socket control
	// messages are enabled.
	passCredEnabled uint32

	// noChecksumEnabled determines whether UDP checksum is disabled while
	// transmitting for this socket.
	noChecksumEnabled uint32

	// reuseAddressEnabled determines whether Bind() should allow reuse of
	// local address.
	reuseAddressEnabled uint32

	// reusePortEnabled determines whether to permit multiple sockets to be
	// bound to an identical socket address.
	reusePortEnabled uint32

	// keepAliveEnabled determines whether TCP keepalive is enabled for this
	// socket.
	keepAliveEnabled uint32

	// multicastLoopEnabled determines whether multicast packets sent over a
	// non-loopback interface will be looped back.
	multicastLoopEnabled uint32

	// receiveTOSEnabled is used to specify if the TOS ancillary message is
	// passed with incoming packets.
	receiveTOSEnabled uint32

	// receiveTClassEnabled is used to specify if the IPV6_TCLASS ancillary
	// message is passed with incoming packets.
	receiveTClassEnabled uint32

	// receivePacketInfoEnabled is used to specify if more information is
	// provided with incoming packets, such as interface index and address.
	receivePacketInfoEnabled uint32

	// hdrIncludedEnabled is used to indicate for a raw endpoint that all
	// packets being written have an IP header and the endpoint should not
	// attach an IP header.
	hdrIncludedEnabled uint32

	// v6OnlyEnabled is used to determine whether an IPv6 socket is to be
	// restricted to sending and receiving IPv6 packets only.
	v6OnlyEnabled uint32

	// quickAckEnabled is used to represent the value of the TCP_QUICKACK
	// option. It currently does not have any effect on the TCP endpoint.
	quickAckEnabled uint32

	// delayOptionEnabled is used to specify if data should be sent out
	// immediately by the transport protocol. For TCP, it determines if the
	// Nagle algorithm is on or off.
	delayOptionEnabled uint32

	// corkOptionEnabled is used to specify if data should be held until
	// segments are full by the TCP transport protocol.
	corkOptionEnabled uint32

	// receiveOriginalDstAddress is used to specify if the original destination
	// of the incoming packet should be returned as an ancillary message.
	receiveOriginalDstAddress uint32

	// recvErrEnabled determines whether extended reliable error message
	// passing is enabled.
	recvErrEnabled uint32

	// errQueue is the per-socket error queue. It is protected by errQueueMu.
	errQueueMu sync.Mutex `state:"nosave"`
	errQueue   sockErrorList

	// bindToDevice determines the device to which the socket is bound.
	bindToDevice int32

	// getSendBufferLimits provides the handler to get the min, default and
	// max size for the send buffer. It is initialized at creation time and
	// will not change.
	getSendBufferLimits GetSendBufferLimits `state:"manual"`

	// sendBufSizeMu protects sendBufferSize and calls to
	// handler.OnSetSendBufferSize.
	sendBufSizeMu sync.Mutex `state:"nosave"`

	// sendBufferSize determines the send buffer size for this socket.
	sendBufferSize int64

	// getReceiveBufferLimits provides the handler to get the min, default and
	// max size for the receive buffer. It is initialized at creation time and
	// will not change.
	getReceiveBufferLimits GetReceiveBufferLimits `state:"manual"`

	// receiveBufSizeMu protects receiveBufferSize and calls to
	// handler.OnSetReceiveBufferSize.
	receiveBufSizeMu sync.Mutex `state:"nosave"`

	// receiveBufferSize determines the receive buffer size for this socket.
	receiveBufferSize int64

	// mu protects the access to the below fields.
	mu sync.Mutex `state:"nosave"`

	// linger determines the amount of time the socket should linger before
	// close. We currently implement this option for TCP sockets only.
	linger LingerOption
}

// InitHandler initializes the handler. This must be called before using the
// socket options utility.
func (so *SocketOptions) InitHandler(handler SocketOptionsHandler, stack StackHandler, getSendBufferLimits GetSendBufferLimits, getReceiveBufferLimits GetReceiveBufferLimits) {
	so.handler = handler
	so.stackHandler = stack
	so.getSendBufferLimits = getSendBufferLimits
	so.getReceiveBufferLimits = getReceiveBufferLimits
}

func storeAtomicBool(addr *uint32, v bool) {
	var val uint32
	if v {
		val = 1
	}
	atomic.StoreUint32(addr, val)
}

// SetLastError sets the last error for a socket.
func (so *SocketOptions) SetLastError(err Error) {
	so.handler.UpdateLastError(err)
}

// GetBroadcast gets value for SO_BROADCAST option.
func (so *SocketOptions) GetBroadcast() bool {
	return atomic.LoadUint32(&so.broadcastEnabled) != 0
}

// SetBroadcast sets value for SO_BROADCAST option.
func (so *SocketOptions) SetBroadcast(v bool) {
	storeAtomicBool(&so.broadcastEnabled, v)
}

// GetPassCred gets value for SO_PASSCRED option.
func (so *SocketOptions) GetPassCred() bool {
	return atomic.LoadUint32(&so.passCredEnabled) != 0
}

// SetPassCred sets value for SO_PASSCRED option.
func (so *SocketOptions) SetPassCred(v bool) {
	storeAtomicBool(&so.passCredEnabled, v)
}

// GetNoChecksum gets value for SO_NO_CHECK option.
func (so *SocketOptions) GetNoChecksum() bool {
	return atomic.LoadUint32(&so.noChecksumEnabled) != 0
}

// SetNoChecksum sets value for SO_NO_CHECK option.
func (so *SocketOptions) SetNoChecksum(v bool) {
	storeAtomicBool(&so.noChecksumEnabled, v)
}

// GetReuseAddress gets value for SO_REUSEADDR option.
func (so *SocketOptions) GetReuseAddress() bool {
	return atomic.LoadUint32(&so.reuseAddressEnabled) != 0
}

// SetReuseAddress sets value for SO_REUSEADDR option.
func (so *SocketOptions) SetReuseAddress(v bool) {
	storeAtomicBool(&so.reuseAddressEnabled, v)
	so.handler.OnReuseAddressSet(v)
}

// GetReusePort gets value for SO_REUSEPORT option.
func (so *SocketOptions) GetReusePort() bool {
	return atomic.LoadUint32(&so.reusePortEnabled) != 0
}

// SetReusePort sets value for SO_REUSEPORT option.
func (so *SocketOptions) SetReusePort(v bool) {
	storeAtomicBool(&so.reusePortEnabled, v)
	so.handler.OnReusePortSet(v)
}

// GetKeepAlive gets value for SO_KEEPALIVE option.
func (so *SocketOptions) GetKeepAlive() bool {
	return atomic.LoadUint32(&so.keepAliveEnabled) != 0
}

// SetKeepAlive sets value for SO_KEEPALIVE option.
func (so *SocketOptions) SetKeepAlive(v bool) {
	storeAtomicBool(&so.keepAliveEnabled, v)
	so.handler.OnKeepAliveSet(v)
}

// GetMulticastLoop gets value for IP_MULTICAST_LOOP option.
func (so *SocketOptions) GetMulticastLoop() bool {
	return atomic.LoadUint32(&so.multicastLoopEnabled) != 0
}

// SetMulticastLoop sets value for IP_MULTICAST_LOOP option.
func (so *SocketOptions) SetMulticastLoop(v bool) {
	storeAtomicBool(&so.multicastLoopEnabled, v)
}

// GetReceiveTOS gets value for IP_RECVTOS option.
func (so *SocketOptions) GetReceiveTOS() bool {
	return atomic.LoadUint32(&so.receiveTOSEnabled) != 0
}

// SetReceiveTOS sets value for IP_RECVTOS option.
func (so *SocketOptions) SetReceiveTOS(v bool) {
	storeAtomicBool(&so.receiveTOSEnabled, v)
}

// GetReceiveTClass gets value for IPV6_RECVTCLASS option.
func (so *SocketOptions) GetReceiveTClass() bool {
	return atomic.LoadUint32(&so.receiveTClassEnabled) != 0
}

// SetReceiveTClass sets value for IPV6_RECVTCLASS option.
func (so *SocketOptions) SetReceiveTClass(v bool) {
	storeAtomicBool(&so.receiveTClassEnabled, v)
}

// GetReceivePacketInfo gets value for IP_PKTINFO option.
func (so *SocketOptions) GetReceivePacketInfo() bool {
	return atomic.LoadUint32(&so.receivePacketInfoEnabled) != 0
}

// SetReceivePacketInfo sets value for IP_PKTINFO option.
func (so *SocketOptions) SetReceivePacketInfo(v bool) {
	storeAtomicBool(&so.receivePacketInfoEnabled, v)
}

// GetHeaderIncluded gets value for IP_HDRINCL option.
func (so *SocketOptions) GetHeaderIncluded() bool {
	return atomic.LoadUint32(&so.hdrIncludedEnabled) != 0
}

// SetHeaderIncluded sets value for IP_HDRINCL option.
func (so *SocketOptions) SetHeaderIncluded(v bool) {
	storeAtomicBool(&so.hdrIncludedEnabled, v)
}

// GetV6Only gets value for IPV6_V6ONLY option.
func (so *SocketOptions) GetV6Only() bool {
	return atomic.LoadUint32(&so.v6OnlyEnabled) != 0
}

// SetV6Only sets value for IPV6_V6ONLY option.
//
// Preconditions: the backing TCP or UDP endpoint must be in initial state.
func (so *SocketOptions) SetV6Only(v bool) {
	storeAtomicBool(&so.v6OnlyEnabled, v)
}

// GetQuickAck gets value for TCP_QUICKACK option.
func (so *SocketOptions) GetQuickAck() bool {
	return atomic.LoadUint32(&so.quickAckEnabled) != 0
}

// SetQuickAck sets value for TCP_QUICKACK option.
func (so *SocketOptions) SetQuickAck(v bool) {
	storeAtomicBool(&so.quickAckEnabled, v)
}

// GetDelayOption gets inverted value for TCP_NODELAY option.
func (so *SocketOptions) GetDelayOption() bool {
	return atomic.LoadUint32(&so.delayOptionEnabled) != 0
}

// SetDelayOption sets inverted value for TCP_NODELAY option.
func (so *SocketOptions) SetDelayOption(v bool) {
	storeAtomicBool(&so.delayOptionEnabled, v)
	so.handler.OnDelayOptionSet(v)
}

// GetCorkOption gets value for TCP_CORK option.
func (so *SocketOptions) GetCorkOption() bool {
	return atomic.LoadUint32(&so.corkOptionEnabled) != 0
}

// SetCorkOption sets value for TCP_CORK option.
func (so *SocketOptions) SetCorkOption(v bool) {
	storeAtomicBool(&so.corkOptionEnabled, v)
	so.handler.OnCorkOptionSet(v)
}

// GetReceiveOriginalDstAddress gets value for IP(V6)_RECVORIGDSTADDR option.
func (so *SocketOptions) GetReceiveOriginalDstAddress() bool {
	return atomic.LoadUint32(&so.receiveOriginalDstAddress) != 0
}

// SetReceiveOriginalDstAddress sets value for IP(V6)_RECVORIGDSTADDR option.
func (so *SocketOptions) SetReceiveOriginalDstAddress(v bool) {
	storeAtomicBool(&so.receiveOriginalDstAddress, v)
}

// GetRecvError gets value for IP*_RECVERR option.
func (so *SocketOptions) GetRecvError() bool {
	return atomic.LoadUint32(&so.recvErrEnabled) != 0
}

// SetRecvError sets value for IP*_RECVERR option.
func (so *SocketOptions) SetRecvError(v bool) {
	storeAtomicBool(&so.recvErrEnabled, v)
	if !v {
		so.pruneErrQueue()
	}
}

// GetLastError gets value for SO_ERROR option.
func (so *SocketOptions) GetLastError() Error {
	return so.handler.LastError()
}

// GetOutOfBandInline gets value for SO_OOBINLINE option.
func (*SocketOptions) GetOutOfBandInline() bool {
	return true
}

// SetOutOfBandInline sets value for SO_OOBINLINE option. We currently do not
// support disabling this option.
func (*SocketOptions) SetOutOfBandInline(bool) {}

// GetLinger gets value for SO_LINGER option.
func (so *SocketOptions) GetLinger() LingerOption {
	so.mu.Lock()
	linger := so.linger
	so.mu.Unlock()
	return linger
}

// SetLinger sets value for SO_LINGER option.
func (so *SocketOptions) SetLinger(linger LingerOption) {
	so.mu.Lock()
	so.linger = linger
	so.mu.Unlock()
}

// SockErrOrigin represents the constants for error origin.
type SockErrOrigin uint8

const (
	// SockExtErrorOriginNone represents an unknown error origin.
	SockExtErrorOriginNone SockErrOrigin = iota

	// SockExtErrorOriginLocal indicates a local error.
	SockExtErrorOriginLocal

	// SockExtErrorOriginICMP indicates an IPv4 ICMP error.
	SockExtErrorOriginICMP

	// SockExtErrorOriginICMP6 indicates an IPv6 ICMP error.
	SockExtErrorOriginICMP6
)

// IsICMPErr indicates if the error originated from an ICMP error.
func (origin SockErrOrigin) IsICMPErr() bool {
	return origin == SockExtErrorOriginICMP || origin == SockExtErrorOriginICMP6
}

// SockErrorCause is the cause of a socket error.
type SockErrorCause interface {
	// Origin is the source of the error.
	Origin() SockErrOrigin

	// Type is the origin specific type of error.
	Type() uint8

	// Code is the origin and type specific error code.
	Code() uint8

	// Info is any extra information about the error.
	Info() uint32
}

// LocalSockError is a socket error that originated from the local host.
//
// +stateify savable
type LocalSockError struct {
	info uint32
}

// Origin implements SockErrorCause.
func (*LocalSockError) Origin() SockErrOrigin {
	return SockExtErrorOriginLocal
}

// Type implements SockErrorCause.
func (*LocalSockError) Type() uint8 {
	return 0
}

// Code implements SockErrorCause.
func (*LocalSockError) Code() uint8 {
	return 0
}

// Info implements SockErrorCause.
func (l *LocalSockError) Info() uint32 {
	return l.info
}

// SockError represents a queue entry in the per-socket error queue.
//
// +stateify savable
type SockError struct {
	sockErrorEntry

	// Err is the error caused by the errant packet.
	Err Error

	// Cause is the detailed cause of the error.
	Cause SockErrorCause

	// Payload is the errant packet's payload.
	Payload []byte

	// Dst is the original destination address of the errant packet.
	Dst FullAddress

	// Offender is the original sender address of the errant packet.
	Offender FullAddress

	// NetProto is the network protocol being used to transmit the packet.
	NetProto NetworkProtocolNumber
}

// pruneErrQueue resets the queue.
func (so *SocketOptions) pruneErrQueue() {
	so.errQueueMu.Lock()
	so.errQueue.Reset()
	so.errQueueMu.Unlock()
}

// DequeueErr dequeues a socket extended error from the error queue and returns
// it. Returns nil if the queue is empty.
func (so *SocketOptions) DequeueErr() *SockError {
	so.errQueueMu.Lock()
	defer so.errQueueMu.Unlock()

	err := so.errQueue.Front()
	if err != nil {
		so.errQueue.Remove(err)
	}
	return err
}

// PeekErr returns the error in the front of the error queue. Returns nil if
// the error queue is empty.
func (so *SocketOptions) PeekErr() *SockError {
	so.errQueueMu.Lock()
	defer so.errQueueMu.Unlock()
	return so.errQueue.Front()
}

// QueueErr inserts the error at the back of the error queue.
//
// Preconditions: so.GetRecvError() == true.
func (so *SocketOptions) QueueErr(err *SockError) {
	so.errQueueMu.Lock()
	defer so.errQueueMu.Unlock()
	so.errQueue.PushBack(err)
}

// QueueLocalErr queues a local error onto the local queue.
func (so *SocketOptions) QueueLocalErr(err Error, net NetworkProtocolNumber, info uint32, dst FullAddress, payload []byte) {
	so.QueueErr(&SockError{
		Err:      err,
		Cause:    &LocalSockError{info: info},
		Payload:  payload,
		Dst:      dst,
		NetProto: net,
	})
}

// GetBindToDevice gets value for SO_BINDTODEVICE option.
func (so *SocketOptions) GetBindToDevice() int32 {
	return atomic.LoadInt32(&so.bindToDevice)
}

// SetBindToDevice sets value for SO_BINDTODEVICE option. If bindToDevice is
// zero, the socket device binding is removed.
func (so *SocketOptions) SetBindToDevice(bindToDevice int32) Error {
	if bindToDevice != 0 && !so.handler.HasNIC(bindToDevice) {
		return &ErrUnknownDevice{}
	}

	atomic.StoreInt32(&so.bindToDevice, bindToDevice)
	return nil
}

// SendBufferLimits returns the [min, max) range of allowable send buffer
// sizes.
func (so *SocketOptions) SendBufferLimits() (min, max int64) {
	limits := so.getSendBufferLimits(so.stackHandler)
	return int64(limits.Min), int64(limits.Max)
}

// GetSendBufferSize gets value for SO_SNDBUF option.
func (so *SocketOptions) GetSendBufferSize() int64 {
	so.sendBufSizeMu.Lock()
	defer so.sendBufSizeMu.Unlock()
	return so.sendBufferSize
}

// SetSendBufferSize sets value for SO_SNDBUF option. notify indicates if the
// stack handler should be invoked to set the send buffer size.
func (so *SocketOptions) SetSendBufferSize(sendBufferSize int64, notify bool) {
	so.sendBufSizeMu.Lock()
	defer so.sendBufSizeMu.Unlock()

	if notify {
		sendBufferSize = so.handler.OnSetSendBufferSize(sendBufferSize)
	}
	so.sendBufferSize = sendBufferSize
}

// ReceiveBufferLimits returns the [min, max) range of allowable receive buffer
// sizes.
func (so *SocketOptions) ReceiveBufferLimits() (min, max int64) {
	limits := so.getReceiveBufferLimits(so.stackHandler)
	return int64(limits.Min), int64(limits.Max)
}

// GetReceiveBufferSize gets value for SO_RCVBUF option.
func (so *SocketOptions) GetReceiveBufferSize() int64 {
	so.receiveBufSizeMu.Lock()
	defer so.receiveBufSizeMu.Unlock()
	return so.receiveBufferSize
}

// SetReceiveBufferSize sets the value of the SO_RCVBUF option, optionally
// notifying the owning endpoint.
func (so *SocketOptions) SetReceiveBufferSize(receiveBufferSize int64, notify bool) {
	so.receiveBufSizeMu.Lock()
	defer so.receiveBufSizeMu.Unlock()

	if notify {
		receiveBufferSize = so.handler.OnSetReceiveBufferSize(receiveBufferSize, so.receiveBufferSize)
	}
	so.receiveBufferSize = receiveBufferSize
}
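SocketOptions couples plain atomic flag storage with per-option handler callbacks: each setter stores a uint32-encoded bool and, for options the endpoint must react to (SO_REUSEADDR, SO_KEEPALIVE, TCP_NODELAY, TCP_CORK, buffer sizes), then invokes the matching SocketOptionsHandler hook, with DefaultSocketOptionsHandler supplying no-op defaults so an endpoint overrides only the hooks it cares about. Below is a self-contained toy sketch of that embed-and-override pattern; it deliberately reimplements the shape with hypothetical names (handler, options, tcpEndpoint) instead of importing gVisor, so it is an illustration of the design, not the real API.

package main

import (
	"fmt"
	"sync/atomic"
)

// handler mirrors a small slice of SocketOptionsHandler (hypothetical).
type handler interface {
	OnKeepAliveSet(v bool)
}

// defaultHandler plays the role of DefaultSocketOptionsHandler: embed it to
// inherit no-op implementations and override only what you need.
type defaultHandler struct{}

func (defaultHandler) OnKeepAliveSet(bool) {}

// options mirrors the atomic-uint32-as-bool layout used by SocketOptions.
type options struct {
	handler          handler
	keepAliveEnabled uint32
}

// storeAtomicBool encodes a bool as 0/1, exactly as in the file above.
func storeAtomicBool(addr *uint32, v bool) {
	var val uint32
	if v {
		val = 1
	}
	atomic.StoreUint32(addr, val)
}

// SetKeepAlive stores the flag and notifies the endpoint, mirroring the
// store-then-callback order of SocketOptions.SetKeepAlive.
func (o *options) SetKeepAlive(v bool) {
	storeAtomicBool(&o.keepAliveEnabled, v)
	o.handler.OnKeepAliveSet(v)
}

// GetKeepAlive is lock-free, like the getters above.
func (o *options) GetKeepAlive() bool {
	return atomic.LoadUint32(&o.keepAliveEnabled) != 0
}

// tcpEndpoint (hypothetical) overrides just the hook it cares about.
type tcpEndpoint struct {
	defaultHandler
	timersArmed bool
}

func (e *tcpEndpoint) OnKeepAliveSet(v bool) { e.timersArmed = v }

func main() {
	ep := &tcpEndpoint{}
	opts := &options{handler: ep}
	opts.SetKeepAlive(true)
	fmt.Println(opts.GetKeepAlive(), ep.timersArmed) // true true
}

Encoding each boolean as a uint32 keeps the getters lock-free, while state with cross-field invariants, such as linger and the error queue, falls back to a mutex; that split is visible directly in the SocketOptions struct above.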
poll(0x0, 0x0, 0x7fff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f00000027c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
timer_settime(0x0, 0x0, &(0x7f0000000200)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
clone(0x2008321cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x16, 0x0, @thr={0x0, 0x0}}, &(0x7f00000003c0)=<r2=>0x0)
timer_getoverrun(0x0)
timer_settime(r2, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
pselect6(0x0, 0xffffffffffffffff, 0x0, 0x0, &(0x7f0000000280), 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
statfs(&(0x7f0000000000)='\x00', 0x0)
r0 = gettid()
rt_sigqueueinfo(r0, 0x2b, &(0x7f0000000100))
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = semget$private(0x0, 0x1, 0x0)
semctl$GETNCNT(r0, 0x4, 0xe, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigreturn()
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = syz_open_pts(r0, 0x0)
ioctl$TIOCSCTTY(r1, 0x540e, 0x0)
exit(0x0)
clone(0xf38055be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000080)='cgroup.controllers\x00', 0x275a, 0x0)
mmap(&(0x7f0000002000/0xb000)=nil, 0xb000, 0x0, 0x1ca011, r0, 0x0)
exit(0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
chmod(&(0x7f0000000180)='./file0\x00', 0x23f)
mkdir(&(0x7f0000000040)='./file0/file1\x00', 0x0)
newfstatat(0xffffffffffffff9c, &(0x7f0000000240)='./file0/file1\x00', &(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, <r2=>0x0}, 0x0)
setreuid(0x0, r2)
rmdir(&(0x7f0000000100)='./file0/file1\x00')
rt_sigreturn()
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000080)={0x3, &(0x7f00000000c0)=[{0x25, 0x0, 0x1}, {}, {0x6, 0x0, 0x0, 0x7ffffff6}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r0 = socket$packet(0x11, 0x2, 0x300)
recvmmsg(r0, &(0x7f000000ac80)=[{{&(0x7f0000000000)=@in={0x2, 0x0, @local}, 0x80, &(0x7f00000002c0)=[{&(0x7f0000000a40)=""/4096, 0x1000}], 0x1}}], 0x1, 0x20, 0x0)
syz_emit_ethernet(0x36, &(0x7f0000000a00)={@local, @remote, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "a04237", 0x0, 0x0, 0x0, @loopback, @rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01'}}}}, 0x0)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
sendto$inet6(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
connect(r0, &(0x7f00000011c0)=@in={0x2, 0x4e64, @empty}, 0x80)
connect(r0, &(0x7f0000000000)=@nl=@unspec, 0x80)
clone(0x10045be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x3, 0x0, 0x0)
r1 = getpid()
tgkill(r1, r1, 0x3b)
clone(0x100041be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
r1 = syz_open_procfs(0x0, &(0x7f0000000a80)='net/netstat\x00')
read$FUSE(r1, &(0x7f0000002180)={0x2020, 0x0, 0x0, 0x0, 0x0, <r2=>0x0}, 0x2020)
setpgid(0x0, r2)
tgkill(r0, r0, 0x27)
rt_sigprocmask(0x0, &(0x7f0000000100)={[0xfffffffffffe]}, 0x0, 0x8)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
getresuid(0x0, 0x0, 0x0)
r0 = gettid()
rt_sigqueueinfo(r0, 0xa, &(0x7f0000000040))
ppoll(0x0, 0x0, 0x0, &(0x7f0000000180), 0x8)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
capset(&(0x7f0000000040)={0x20080522}, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x4c})
exit_group(0x0)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
sendmmsg$inet6(r0, &(0x7f000000b080)=[{{&(0x7f0000005600)={0xa, 0x4e24, 0x0, @remote}, 0x1c, 0x0}}, {{&(0x7f0000005740)={0xa, 0x0, 0x0, @local}, 0xfffffe07, 0x0}}], 0x2, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
semctl$SEM_STAT(0x0, 0x0, 0x12, &(0x7f0000000280)=""/210)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000140)={0x2, &(0x7f00000000c0)=[{0x87}, {0x6}]})
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
mkdirat(0xffffffffffffff9c, &(0x7f0000000180)='./file1\x00', 0x0)
r0 = openat$dir(0xffffffffffffff9c, &(0x7f0000000380)='./file1\x00', 0x0, 0x0)
unlinkat(0xffffffffffffff9c, &(0x7f00000000c0)='./file1\x00', 0x200)
mkdirat(r0, &(0x7f0000000000)='./file1\x00', 0x0)
rt_sigreturn()
clone(0x10024100, 0x0, 0x0, 0x0, 0x0)
r0 = timerfd_create(0x0, 0x0)
r1 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='fdinfo/3\x00')
close(r0)
read$FUSE(r1, 0x0, 0x0)
rt_sigreturn()
r0 = epoll_create(0x1f)
lseek(r0, 0x0, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f0000000000)='gid_map\x00')
r1 = semget$private(0x0, 0x4000000009, 0x0)
semop(r1, &(0x7f00000001c0)=[{0x0, 0x4}, {0x0, 0x7fff}], 0x2)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
pwrite64(r0, 0x0, 0x0, 0x100000800)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
statx(0xffffffffffffffff, &(0x7f0000000900)='./file0\x00', 0x0, 0x0, 0x0)
exit(0x0)
r0 = openat(0xffffffffffffffff, &(0x7f00000002c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
time(&(0x7f0000000180))
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0xfbe000)=nil, 0xfbe000, 0x7, 0x31, 0xffffffffffffffff, 0x0)
r2 = creat(&(0x7f0000000100)='./bus\x00', 0x0)
io_setup(0x202, &(0x7f0000000200)=<r3=>0x0)
io_submit(r3, 0x1, &(0x7f0000000540)=[&(0x7f00000000c0)={0x0, 0x0, 0x0, 0x1, 0x0, r2, 0x0}])
io_getevents(r3, 0x88a, 0x7fffffffffffebd, &(0x7f00000000c0)=[{}, {}, {}, {}, {}], &(0x7f0000000000)={0x77359400})
io_destroy(r3)
mknod$loop(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
mount(&(0x7f00000003c0)=ANY=[], &(0x7f0000000040)='./file0\x00', &(0x7f0000000140)='sysfs\x00', 0x0, 0x0)
faccessat(0xffffffffffffff9c, &(0x7f00000000c0)='./file0\x00', 0x0)
r0 = memfd_create(&(0x7f00000021c0)='/proc/self/net/pfkey\x00', 0x0)
pwrite64(r0, &(0x7f0000000080)="fd04fa8b67a3feeaf2d248da49c39d841e88899271961ffe41e2ebc9436677bbd49d366646e351d2389eb64718353f1f3f67970e12d75a", 0xfffffffffffffd2c, 0xfc1)
pwrite64(r0, 0x0, 0x0, 0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
r1 = epoll_create(0x46e4)
epoll_ctl$EPOLL_CTL_ADD(r1, 0x300, r0, &(0x7f0000000000))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = getpid()
rt_tgsigqueueinfo(r1, r1, 0x13, &(0x7f0000000100))
ptrace(0x10, r1)
ptrace$getregset(0x4205, r1, 0x202, &(0x7f0000000080)={&(0x7f0000000040)=""/61, 0xffffff78})
r0 = socket$inet_udp(0x2, 0x2, 0x0)
setsockopt$inet_int(r0, 0x0, 0x2, &(0x7f0000000000)=0x4, 0x4)
syz_mount_image$fuse(0x0, &(0x7f0000000080)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
r0 = open$dir(&(0x7f0000000840)='./file0\x00', 0x0, 0x0)
mkdirat(r0, &(0x7f0000000000)='./file0\x00', 0x0)
renameat(0xffffffffffffff9c, &(0x7f00000003c0)='./file0\x00', 0xffffffffffffff9c, &(0x7f0000000400)='./file0\x00')
readlink(0x0, &(0x7f0000000040)=""/18, 0x12)
clone(0x5100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
connect$inet6(r0, &(0x7f0000000180)={0xa, 0x0, 0x0, @dev, 0x9}, 0x1c)
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x180000f, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x200300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
wait4(0x0, 0x0, 0x80000002, 0x0)
r1 = getpid()
rt_tgsigqueueinfo(r1, r1, 0x16, &(0x7f0000000000))
ptrace(0x10, r1)
ptrace$setregs(0xd, r1, 0x0, &(0x7f0000000080)="be9ff483111ec7c05a6e35766a9c5cd98ed812fee8ee677c468e2d01bb01fd560342c1891c9b259ef048c5ac173518e9cd261fa6cbe6a89b00bbcac9c7a8fc13d6d5661f30c63f72be485d2065e695187bb1482dff9c9d341184640629dc64bb37212a404898297b90eb535ba521052c06a3f59c8a96155e941ed41bc723c4062d6dc6418cd0808ff3")
ptrace$getregset(0x4204, r1, 0x2, &(0x7f00000005c0)={0x0, 0x10})
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f00000000c0), 0x280e00, 0x0)
flock(r0, 0x1)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0)
select(0x0, 0x0, 0x0, 0x0, &(0x7f0000000100))
exit_group(0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
getsockopt$inet_opts(r0, 0x0, 0xb, &(0x7f0000000140)=""/181, &(0x7f0000000080)=0xb5)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6806300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setuid(0xee01)
r0 = gettid()
tkill(r0, 0x25)
lstat(&(0x7f0000000140)='.\x00', &(0x7f0000000380)={0x0, 0x0, 0x0, 0x0, <r1=>0x0})
setresuid(0x0, r1, 0x0)
prlimit64(r0, 0x0, 0x0, 0x0)
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = semget$private(0x0, 0x1, 0x0)
semctl$GETZCNT(r1, 0x0, 0xf, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000080)={0x2, &(0x7f0000000040)=[{0x4d, 0x0, 0xfc}, {0x6}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
fsetxattr$security_evm(r0, &(0x7f0000000000), &(0x7f0000000080)=@v2={0x3}, 0x9, 0x0)
syz_emit_ethernet(0x3e, &(0x7f0000000040)={@broadcast, @empty, @void, {@ipv4={0x800, @tipc={{0x6, 0x4, 0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x6, 0x0, @empty, @broadcast, {[@timestamp={0x44, 0x4, 0x6c, 0x0, 0xf}]}}, @payload_conn={{{0x18, 0x0, 0x0, 0x0, 0x0, 0x6}}}}}}}, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
set_robust_list(&(0x7f0000000080), 0x18)
socketpair$unix(0x1, 0x100000000001, 0x0, &(0x7f0000000300)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
pipe(&(0x7f0000000340)={0xffffffffffffffff, <r2=>0xffffffffffffffff})
sched_setaffinity(0x0, 0x8, &(0x7f0000000100)=0xbaa)
splice(r0, 0x0, r2, 0x0, 0x1, 0x0)
write$FUSE_INIT(r2, &(0x7f0000000380)={0x843b99d860c24bb7}, 0xd03cb0e)
close(r1)
mknodat$loop(0xffffffffffffff9c, &(0x7f0000000080)='./file0\x00', 0xc000, 0x1)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000700)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
connect$unix(r0, &(0x7f00000000c0)=@file={0x1, './file0\x00'}, 0x6e)
clone(0x30005100, 0x0, 0x0, 0x0, 0x0)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmsg(r0, &(0x7f0000000600)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000080)=[{0x30, 0x0, 0xb, "aa7fe604f81831e6c1e9f33a2b77b58c8331e78e2e11f1ef79"}], 0x30}, 0x0)
r1 = gettid()
tgkill(r1, r1, 0x35)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffffff, &(0x7f0000000440)='/proc/self/exe\x00', 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000100)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r1, &(0x7f0000d06ff8)='./file0\x00')
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
creat(&(0x7f0000000140)='./file0\x00', 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
read$FUSE(r2, &(0x7f0000000880)={0x2020, 0x0, 0x0, <r3=>0x0}, 0x2020)
newfstatat(0xffffffffffffff9c, &(0x7f00000002c0)='.\x00', &(0x7f0000001500)={0x0, 0x0, 0x0, 0x0, <r4=>0x0}, 0x0)
setreuid(0x0, r4)
setresuid(0x0, r3, 0x0)
exit(0x0)
openat$ptmx(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f00000000c0)='fd/3\x00')
r1 = openat(0xffffffffffffff9c, &(0x7f0000000440)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x1, 0x0, 0x0)
ioctl$TCSETSW(r0, 0x5403, &(0x7f0000000040)={0xfffffff8, 0x0, 0x0, 0x7fff, 0x0, "c9356ed144fecd0802eca9b82dae125bcff26b"})
writev(r0, &(0x7f0000000100)=[{&(0x7f0000000180)="f470e00082e967881b3ddf5c5b0e6ea6297ddc62fb4b2905dc5d72dea1", 0x1d}], 0x1)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
lseek(r0, 0x0, 0x0)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
connect$inet(r0, &(0x7f0000000000)={0x2, 0x0, @dev}, 0x10)
r1 = inotify_init1(0x0)
dup3(r1, r0, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = signalfd(0xffffffffffffffff, &(0x7f0000000080), 0x8)
sendto$packet(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = gettid()
r2 = socket$packet(0x11, 0x2, 0x300)
sendto$packet(r2, 0x0, 0x0, 0x0, &(0x7f0000000080)={0x11, 0x0, 0x0, 0x1, 0x0, 0x6, @link_local}, 0x12)
tgkill(r0, r1, 0x24)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
pipe(&(0x7f00000000c0)={<r1=>0xffffffffffffffff})
fcntl$setlease(r1, 0x408, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000040)={<r1=>0xffffffffffffffff})
shutdown(r1, 0x302f53404f8a3e8a)
rt_sigqueueinfo(r0, 0x4, &(0x7f0000000000))
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r1 = getpid()
rt_tgsigqueueinfo(r1, r1, 0x16, &(0x7f0000000180))
ptrace(0x10, r1)
ptrace$cont(0x9, r1, 0x0, 0x200)
mbind(&(0x7f0000ffb000/0x2000)=nil, 0x2000, 0x0, &(0x7f0000000200), 0x1, 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
r1 = socket$packet(0x11, 0x2, 0x300)
sendmmsg$inet(r0, &(0x7f0000000500)=[{{&(0x7f0000000040)={0x2, 0x4e22, @remote}, 0x10, 0x0}}], 0x1, 0x0)
recvmsg(r1, &(0x7f0000001b80)={&(0x7f0000001840)=@xdp={0x2c, 0x0, <r2=>0x0}, 0x80, 0x0}, 0x0)
sendmmsg$inet(r0, &(0x7f0000002280)=[{{&(0x7f0000000480)={0x2, 0x4e22}, 0x10, 0x0, 0x0, &(0x7f0000001bc0)=[@ip_pktinfo={{0x1c, 0x0, 0x8, {r2, @multicast1, @remote}}}, @ip_retopts={{0x3c, 0x0, 0x7, {[@timestamp_addr={0x44, 0x14, 0x85, 0x1, 0x0, [{@empty}, {@initdev={0xac, 0x1e, 0x0, 0x0}}]}, @ra={0x94, 0x4}, @rr={0x7, 0x13, 0x0, [@loopback, @broadcast, @local, @broadcast]}]}}}], 0x60}}], 0x1, 0x0)
mlockall(0x1)
mremap(&(0x7f0000a94000/0x2000)=nil, 0x2000, 0x800000, 0x3, &(0x7f0000130000/0x800000)=nil)
perf_event_open(&(0x7f0000000100)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x6, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={0x0}}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
mbind(&(0x7f00003b5000/0x800000)=nil, 0x801100, 0x0, 0x0, 0x0, 0x2)
ioctl$PERF_EVENT_IOC_QUERY_BPF(0xffffffffffffffff, 0xc008240a, 0x0)
openat(0xffffffffffffff9c, 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
fgetxattr(0xffffffffffffffff, 0x0, 0x0, 0x0)
fallocate(r0, 0x0, 0x102000006, 0x6)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = gettid()
pipe(&(0x7f0000000300)={0xffffffffffffffff, <r2=>0xffffffffffffffff})
bind$inet6(r2, 0x0, 0x0)
rt_tgsigqueueinfo(r0, r1, 0x2c, &(0x7f00000005c0))
clone(0x2000204d5fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
syz_emit_ethernet(0x2a, &(0x7f0000000000)={@random="aec15f0663ac", @local, @void, {@arp={0x806, @ether_ipv4={0x1, 0x800, 0x6, 0x4, 0x1, @random="8153ee79177f", @loopback, @remote, @broadcast}}}}, 0x0)
syz_extract_tcp_res$synack(0x0, 0x1, 0x0)
exit_group(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat$random(0xffffffffffffff9c, &(0x7f00000007c0), 0x2081, 0x0)
pwritev(r1, &(0x7f0000000e00)=[{0x0}, {&(0x7f00000013c0)='Z', 0x1}], 0x2, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = inotify_init()
fremovexattr(r0, &(0x7f0000000000)=@random={'btrfs.', 'security.capability\x00'})
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
clone(0xd00c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6(0xa, 0x2, 0x0)
bind$inet6(r0, &(0x7f0000000000)={0xa, 0x0, 0x0, @remote}, 0x1c)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000440)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
mknod(&(0x7f0000000040)='./file0\x00', 0x1040, 0x0)
creat(&(0x7f0000000080)='./file0\x00', 0x0)
timer_create(0x0, &(0x7f0000000280)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f00009b1ffc))
timer_settime(0x0, 0x0, &(0x7f0000000000)={{0x0, 0x989680}, {0x0, 0x989680}}, 0x0)
r1 = gettid()
creat(&(0x7f0000000200)='./file0\x00', 0x0)
r2 = openat(0xffffffffffffffff, &(0x7f0000000440)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
tkill(r1, 0x1000000000016)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x2, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
getrusage(0xffffffffffffffff, &(0x7f0000000080))
r0 = inotify_init1(0x0)
fcntl$setown(r0, 0x8, 0xffffffffffffffff)
fcntl$getown(r0, 0x9)
clone(0x9b8271be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = semget$private(0x0, 0x20000000102, 0x0)
semop(r0, &(0x7f00000001c0)=[{}], 0x1)
exit(0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setgroups(0x40000089, &(0x7f0000000000)=[0x0])
rt_sigreturn()
capget(&(0x7f0000000080)={0x19980330}, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
fcntl$lock(r0, 0x7, &(0x7f0000000040)={0x0, 0x0, 0x400, 0xfffffffffffffffb, 0xffffffffffffffff})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setresuid(0x0, 0xee01, 0xffffffffffffffff)
r1 = semget$private(0x0, 0x3, 0x0)
setresuid(0x0, 0x0, 0x0)
semctl$IPC_SET(r1, 0x0, 0xb, 0x0)
creat(&(0x7f0000000540)='./file0\x00', 0x0)
mount(&(0x7f0000000000)=ANY=[], &(0x7f00000000c0)='./file0\x00', &(0x7f0000000080)='sysfs\x00', 0x0, 0x0)
r0 = socket$unix(0x1, 0x1, 0x0)
connect$unix(r0, &(0x7f0000000100)=@file={0x1, './file0\x00'}, 0x6e)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f0000000000)='oom_score_adj\x00')
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
write$tcp_mem(r0, &(0x7f0000000180)={0x20004fff}, 0x48)
exit(0x0)
clone(0x20081004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = shmget$private(0x0, 0x2000, 0x0, &(0x7f0000ffc000/0x2000)=nil)
r2 = syz_open_procfs$namespace(0xffffffffffffffff, &(0x7f0000000000)='ns/pid\x00')
fstat(r2, &(0x7f0000000100)={0x0, 0x0, 0x0, 0x0, <r3=>0x0})
setuid(r3)
r4 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r4, 0x0)
preadv(r4, &(0x7f0000000280), 0x1, 0x0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER(0x1, 0x0, &(0x7f0000000040)={0x2, &(0x7f00000000c0)=[{0x1e}, {0x6}]})
shmctl$IPC_SET(r1, 0x1, &(0x7f0000000000)={{0x0}})
tkill(r0, 0x25)
clone(0x0, 0x0, 0x0, 0x0, 0x0)
prctl$PR_SET_CHILD_SUBREAPER(0x24, 0x1)
clone(0x0, 0x0, 0x0, 0x0, 0x0)
prctl$PR_SET_CHILD_SUBREAPER(0x24, 0x1)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
chdir(0x0)
rename(&(0x7f0000000080)='./file0\x00', &(0x7f00000000c0)='./file0/file1\x00')
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000180)='cmdline\x00')
renameat(r0, &(0x7f0000000000)='./file0\x00', r0, &(0x7f0000000040)='./file0\x00')
poll(0x0, 0x0, 0x7fff)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f00000027c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
timer_settime(0x0, 0x0, &(0x7f0000000200)={{0x0, 0x989680}, {0x0, 0x3938700}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x16, 0x0, @thr={0x0, 0x0}}, &(0x7f00000003c0)=<r2=>0x0)
r3 = epoll_create(0xc07)
r4 = socket$inet6_tcp(0xa, 0x1, 0x0)
epoll_ctl$EPOLL_CTL_ADD(r3, 0x1, r4, &(0x7f0000000300))
timer_settime(r2, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x3938700}}, 0x0)
close(r4)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000000000)='net/unix\x00')
readlinkat(r0, &(0x7f00000002c0)='./file0\x00', &(0x7f0000000340)=""/248, 0xf8)
exit(0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
setsockopt$inet6_tcp_int(r0, 0x6, 0x13, &(0x7f00000000c0)=0x100000001, 0x4)
connect$inet6(r0, &(0x7f0000000080), 0x1c)
r1 = dup2(r0, r0)
setsockopt$inet6_tcp_TCP_REPAIR_OPTIONS(r1, 0x6, 0x16, &(0x7f0000000440), 0x131f64)
setsockopt$sock_linger(r1, 0x1, 0x1b, &(0x7f0000000100), 0x8)
getcwd(&(0x7f0000000000)=""/234, 0xea)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
pipe2(&(0x7f0000000080)={0xffffffffffffffff, <r0=>0xffffffffffffffff}, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
fallocate(r0, 0x0, 0x0, 0x2)
rt_sigreturn()
clone(0x2000204d5fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
openat$null(0xffffffffffffff9c, &(0x7f0000000000), 0x61e0c1, 0x0)
exit_group(0x0)
r0 = socket$inet_icmp(0x2, 0x2, 0x1)
getsockname(r0, &(0x7f0000000000)=@pppol2tp={0x18, 0x1, {0x0, <r1=>0xffffffffffffffff, {0x2, 0x0, @broadcast}}}, &(0x7f0000000080)=0x80)
fstatfs(r1, &(0x7f0000000180)=""/43)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = epoll_create(0x10007fff)
ioctl$TCSETS(r0, 0x40045431, &(0x7f0000000040))
r2 = syz_open_pts(r0, 0x0)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800001, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r4 = syz_open_pts(r0, 0x42c02)
dup3(r1, r0, 0x0)
dup2(r2, r4)
setreuid(0xee01, 0xee01)
prctl$PR_SET_MM(0x23, 0x0, &(0x7f0000ffc000/0x2000)=nil)
clone(0x30005100, 0x0, 0x0, 0x0, 0x0)
r0 = gettid()
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000000)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
getsockopt$sock_cred(r1, 0x1, 0x9, 0x0, &(0x7f0000000080))
tgkill(r0, r0, 0x10)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = memfd_create(&(0x7f0000000280)='0/i\x8d\xe1\xa1=\x90E\xf5:)uYZ\x99h\r\x9c\xfa66\xd8\xac\xbdE\xdc~P5n-4\x01\xc3uM,\xa4&>%p4\x1c\x04\xdd\xd6\xb1g\xd3\x11E\xb7\xf9\xad\x1f\xef.\xf2\xa6\x05\xf9\x12\xcb\xb7Po', 0x0)
r1 = fcntl$dupfd(r0, 0x0, r0)
mmap(&(0x7f0000000000/0x600000)=nil, 0x600000, 0x0, 0x11, r1, 0x0)
set_robust_list(&(0x7f0000000180), 0x18)
exit_group(0x0)
r0 = socket$packet(0x11, 0x2, 0x300)
getsockopt$IP_SET_OP_GET_BYINDEX(r0, 0x1, 0x26, 0x0, &(0x7f00000001c0))
syz_mount_image$tmpfs(&(0x7f0000000000), &(0x7f0000000100)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
syz_mount_image$tmpfs(&(0x7f0000000000), &(0x7f0000000100)='./file0\x00', 0x0, 0x1, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER(0x1, 0x0, &(0x7f0000000000)={0x2, &(0x7f0000000080)=[{0x87}, {0x6, 0x0, 0x0, 0xffffffff}]})
preadv(r0, &(0x7f0000000280), 0x100000000000008d, 0x4, 0x0)
r0 = socket$netlink(0x10, 0x3, 0x0)
accept$packet(r0, 0x0, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
newfstatat(0xffffffffffffff9c, 0x0, 0x0, 0x0)
exit_group(0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000002040), 0x200900, 0x0)
mmap(&(0x7f0000ffc000/0x4000)=nil, 0x4000, 0x0, 0x11, r0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000000)={0x2, &(0x7f00000000c0)=[{0x9}, {0x16}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r1 = shmget$private(0x0, 0x4000, 0x0, &(0x7f000076b000/0x4000)=nil)
socket(0x11, 0x0, 0x0)
shmat(r1, &(0x7f00001b1000/0xc00000)=nil, 0x5000)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
symlink(&(0x7f00000000c0)='./file0\x00', &(0x7f0000000100)='./file0\x00')
lstat(&(0x7f0000000180)='./file0\x00', &(0x7f0000000300)={0x0, 0x0, 0x0, 0x0, <r0=>0x0})
setresuid(0xffffffffffffffff, r0, 0x0)
lsetxattr$trusted_overlay_redirect(&(0x7f0000000000)='./file0\x00', &(0x7f0000000040), 0x0, 0x0, 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
getsockopt$inet_int(r0, 0x0, 0x22, &(0x7f00000000c0), &(0x7f0000000180)=0x4)
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
openat$cgroup_netprio_ifpriomap(r0, &(0x7f0000000040), 0x2, 0x0)
rt_sigreturn()
perf_event_open(&(0x7f0000000000)={0x1000000002, 0x80, 0x12, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={0x0}}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
clone(0x0, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit(0x0)
clone(0x210612c17c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = getpgrp(0xffffffffffffffff)
prctl$PR_SET_PTRACER(0x59616d61, r1)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
exit_group(0x0)
ptrace(0x4206, r0)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
select(0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)={0xfffffffffffffffe})
exit(0x0)
clone(0x106300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r0, &(0x7f0000000180)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
bind$unix(r0, &(0x7f0000003000)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
prlimit64(0x0, 0x0, &(0x7f0000000080), 0x0)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
ioctl$TIOCSPGRP(r0, 0x5410, &(0x7f0000000040)=0xffffffffffffffff)
rt_sigreturn()
r0 = eventfd(0x0)
io_setup(0x7, &(0x7f0000000000)=<r1=>0x0)
r2 = syz_open_procfs$namespace(0x0, &(0x7f0000000040)='ns/pid\x00')
io_submit(r1, 0x2, &(0x7f0000002440)=[&(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, 0x0, r2, 0x0}, &(0x7f0000000100)={0x0, 0x0, 0x0, 0x7, 0x0, r0, &(0x7f0000000180)="d037a08965772f2078ac62f7401a8b67c54181223e412771b2b5896a98fc6253c0003935b9ea75d36270f60fa724a91637915774ca279cd324e72a079d4075b5b8f97d93e9b0dcad8ba7743f2ccb77eb1e5767823a65f504cff244dbdde45a57fda1a74c7fc10a1747b2a712f12ae24447651d7d04ff66e2da5772ede060e9cbf8ad7fe86b5fedd89ee9b1af62ca905f510567280760d829cb03e94bf92e67f8d4f98bfdbf6d34edb55d4537085ca8cd0f77e7d8207dff4ce14daa14754cb7f1944cf8cad87849e4bafd759345f451468e5ae4eae3d2b62f18de946e5028044b2735196f557b8d105c7810c2cdb28b63293170c57d209535e153d9795ef0c4d880f4dfcea10c8120ac672aa2a17912c4db8a1fcda24b09d86cdc9ec0a80cbfc928383218a98e9eb9f3d922f6ad0705cc4807b6f3cb8ff0df0ebbc34682c503d444f973df721e7b821c3c3977b04e534efa75bf3480fd8481b53d61091d9f684059710b3b7475ee1d0d295bc67962429605785e4f36f8f6331619755f73296de27dfc926c05b8ce13fbb56eec33c45f30453885edde8fe4f7f6e52141e4cfc0765b9c911cc5af2aa5c2cd9fcfa2ab2bfb268b17ac686a1ded096e905d5fb13ba454704992f276ddf8b8ea4a8a7deb12e2df365fc0506a9d1d5414ae053d823db3d590c5e7486d5cad30596285363f9cbe972ce217f7824e8d70254dd286a28095563c6888539e134f0498a10ec7604e566b84bb57e28acfc2fa6c425a263de3158fd2a5656f1423abe5b18b459b6b9c160c2ab63c0dc7a1e5095e8dc9081483cf61f8666297c6ea736c0880866979f9c1ec06efda9dc9f30219711d70f7325223859bff24acb01efc2d1835a224a9062d82da934a7c9be6a0d29694f4c2dcb602963fae1739abcc59a40d291c841c3fa474d4715d94a18cf2fa26bc0b2ca8791829c2cb6e9dba3c6f9e6b244d0c5cbbdecdc31ecb82bab3f99806a87db4fc02299dcdd7976e728264a0b181818741f6820a72ee4d925858b74f2ad72aeb66b70a8f5f1fcb1f3d423c0605c310d0cde28fb93db8cd47209627b01aa45cb39e3870b8232aa2852b7004eab55c562e2e6f8f0c3d14890653eaf59daead165a7f51e2a3644f94d02c6e8ac070ff884f7883717073bd883826433c89791fdb6e9fb7c34b26990bb79bd8fe5d19215a7c9377ad01d1bee3822dfcccc74582df772e5279b5ccd591e71fe99f6a4314bfe909f72eada568a170b6888ad9ccd8ddaee4722f66b4f23cd81b55da7af48d41fbc8a9fe0a4c3379a7298bda146754ace187f1849ee7b4e37b0f24a82f349d6c5980f7748ee8f74c6c223bf2ad007442ec749c144fd9fe934170d8c122f4cc22dff69208a1fb1d0d8f65a6997e1511f4abf424737e9ddff306606b546dbd160aee4b96fc43154cfd5b043438222746a8e090433d2b", 0x401}])
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt$inet_mreqn(r0, 0x6, 0x7, 0x0, &(0x7f0000000f40))
r0 = syz_open_procfs(0x0, &(0x7f0000000080)='cgroup\x00')
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = open(&(0x7f0000002000)='./bus\x00', 0x141042, 0x0)
sendfile(r2, r0, 0x0, 0xf6c1)
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r0 = fork()
ptrace(0x10, r0)
ptrace$cont(0xffffffffffffffff, r0, 0x0, 0x0)
exit_group(0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
bind$inet6(r0, &(0x7f0000000500)={0xa, 0x2}, 0x1c)
listen(r0, 0x0)
r1 = socket$inet(0x2, 0x801, 0x0)
r2 = dup(r1)
connect$inet(r2, &(0x7f0000000240)={0x2, 0x2, @loopback}, 0x10)
write$binfmt_misc(r2, 0x0, 0x1)
creat(&(0x7f0000000080)='./file0\x00', 0x0)
mount(&(0x7f0000000380)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000140)='devtmpfs\x00', 0x10409, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000000000)='mounts\x00')
preadv(r0, 0x0, 0x0, 0x0, 0x0)
r0 = gettid()
r1 = getpgrp(0x0)
tgkill(r0, r1, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$unix(0x1, 0x1, 0x0)
sendmsg$unix(r0, 0x0, 0x10)
r1 = open(&(0x7f00000001c0)='./bus\x00', 0x140042, 0x0)
fallocate(r1, 0x0, 0x0, 0x7fffffff)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000340), 0x0, 0x0)
sendmsg$sock(r0, 0x0, 0x0)
rt_sigreturn()
timer_create(0x0, &(0x7f0000000300)={0x0, 0x12}, &(0x7f0000000140))
r0 = eventfd2(0x0, 0x0)
read$eventfd(r0, &(0x7f00000000c0), 0x250ce47f)
readv(r0, &(0x7f0000000600)=[{&(0x7f00000001c0)=""/213, 0xd5}], 0x1)
timer_settime(0x0, 0x0, &(0x7f0000000080)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x14, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000040)=<r1=>0x0)
timer_settime(r1, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x3524c100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='gid_map\x00')
pwrite64(r0, 0x0, 0x0, 0x80)
pwrite64(r0, 0x0, 0x0, 0x100000001)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
syz_open_procfs(r0, &(0x7f0000000000)='comm\x00')
exit_group(0x0)
syz_open_procfs(0x0, &(0x7f00000000c0)='fd/3\x00')
gettid()
gettid()
tgkill(0x0, 0x0, 0x0)
clone(0x200800059fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit_group(0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_TCP_CONGESTION(r0, 0x6, 0xd, &(0x7f0000000080)='cubic\x00', 0x5)
r0 = openat$full(0xffffffffffffff9c, &(0x7f0000000000), 0x302, 0x0)
write$FUSE_NOTIFY_STORE(r0, 0x0, 0x57)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000140)='oom_score_adj\x00')
write$tcp_mem(r1, 0x0, 0x0)
r2 = gettid()
tgkill(r2, r2, 0x10)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x4000000206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
fcntl$F_SET_FILE_RW_HINT(r0, 0x40e, 0x0)
exit(0x0)
exit_group(0x0)
clone(0x4000002206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
rt_tgsigqueueinfo(r0, r0, 0x16, &(0x7f0000000000))
r1 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
r2 = dup(r1)
ioctl$PERF_EVENT_IOC_ENABLE(r2, 0x8912, 0x400200)
ptrace(0x10, r0)
ptrace$peeksig(0x2, r0, &(0x7f0000000080), &(0x7f0000000e40))
clone(0x20087104ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
syz_open_procfs(0x0, &(0x7f0000000000)='comm\x00')
exit_group(0x0)
perf_event_open(&(0x7f0000000280)={0x2, 0x70, 0x12, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000000100)='fd/3\x00')
write(r0, &(0x7f00000001c0), 0x0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = memfd_create(&(0x7f0000000780)='\x00', 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x0, 0x11, r0, 0x0)
pselect6(0x0, 0x0, 0x0, 0x0, &(0x7f00000001c0), 0x0)
exit_group(0x0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
fcntl$lock(r0, 0x7, &(0x7f0000000180))
fcntl$lock(r0, 0x7, &(0x7f0000000000)={0x0, 0x0, 0x4, 0x4000001})
fcntl$lock(r0, 0x7, &(0x7f00000011c0)={0x0, 0x0, 0x800, 0xfffffffffffffffd, 0xffffffffffffffff})
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000a, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
fcntl$lock(r0, 0x7, &(0x7f0000000200)={0x0, 0x0, 0x7})
fcntl$lock(r0, 0x7, &(0x7f0000000040)={0x2, 0x0, 0x7fff, 0x0, 0xffffffffffffffff})
exit(0x0)
mknod$loop(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
mount(&(0x7f0000000280)=ANY=[], &(0x7f00000001c0)='./file0\x00', &(0x7f0000000180)='cgroup\x00', 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x0, 0x0)
r1 = openat$cgroup_procs(r0, &(0x7f0000000040)='tasks\x00', 0x2, 0x0)
preadv(r1, 0x0, 0x0, 0x0, 0x0)
clone(0x200802047fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
pause()
ptrace(0x10, r0)
ptrace$getregs(0x3, r0, 0x100, &(0x7f0000000000)=""/4092)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000080)={<r0=>0xffffffffffffffff})
getsockopt$sock_timeval(r0, 0x1, 0x15, 0x0, &(0x7f0000000000))
r1 = gettid()
tkill(r1, 0x25)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000280)={0x2, &(0x7f00000000c0)=[{0x54}, {0x6, 0x0, 0x0, 0x7fffffff}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$unix(0x1, 0x1, 0x0)
fcntl$setsig(r0, 0xa, 0x41)
rt_sigreturn()
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ioctl$BTRFS_IOC_BALANCE(r0, 0x5000940c, 0x0)
rt_sigreturn()
r0 = semget$private(0x0, 0x4000000009, 0x0)
semtimedop(r0, &(0x7f0000000080)=[{0x0, 0x1}], 0x1, 0x0)
semop(r0, &(0x7f0000000000)=[{0x0, 0xfffe}], 0x1)
semop(r0, &(0x7f0000000040)=[{}, {}], 0x2)
semctl$SETALL(r0, 0x0, 0x11, &(0x7f0000000100))
rt_sigprocmask(0x0, &(0x7f0000000100)={[0xfffffffffffe]}, 0x0, 0x8)
r0 = gettid()
timer_create(0x0, &(0x7f0000000180)={0x0, 0x17, 0x4, @tid=r0}, &(0x7f0000000000))
timer_settime(0x0, 0x0, &(0x7f0000000300)={{0x0, 0x1}, {0x0, 0xe4c}}, 0x0)
clock_gettime(0x0, &(0x7f00000002c0)={0x0, <r1=>0x0})
timer_settime(0x0, 0x0, &(0x7f0000000340)={{0x77359400}, {0x0, r1+10000000}}, 0x0)
ppoll(0x0, 0x0, &(0x7f0000000140)={0x0, 0x989680}, &(0x7f0000000080), 0x8)
mount(&(0x7f0000000000)=@nullb, &(0x7f0000000080)='./file0\x00', &(0x7f0000000040)='cgroup\x00', 0x0, &(0x7f00000000c0)=',!0\x00')
r0 = openat(0xffffffffffffff9c, &(0x7f0000000440)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
prctl$PR_SET_SECCOMP(0x16, 0x2, &(0x7f0000000080)={0x0, 0x0})
rt_sigreturn()
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
unshare(0x8000000)
r1 = semget$private(0x0, 0x4000, 0x0)
semctl$IPC_RMID(r1, 0x0, 0x0)
r2 = ioctl$TIOCGPTPEER(r0, 0x5441, 0x2400000000)
unshare(0x40000000)
sendfile(r2, r0, 0x0, 0x10001)
semop(0x0, &(0x7f0000000000)=[{0x0, 0x8, 0x1800}], 0x1)
r3 = semget(0x0, 0x2, 0x224)
semctl$GETPID(r3, 0x1, 0xb, &(0x7f0000000040)=""/163)
unshare(0x28000400)
open$dir(&(0x7f0000000180)='./file0\x00', 0x240, 0x0)
setxattr$trusted_overlay_redirect(&(0x7f0000000000)='./file0\x00', &(0x7f0000000040), 0x0, 0x0, 0x0)
setxattr(&(0x7f00000000c0)='./file0\x00', &(0x7f0000000140)=@known='trusted.overlay.redirect\x00', 0x0, 0x0, 0x3)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER(0x1, 0x0, &(0x7f0000000100)={0x2, &(0x7f00000000c0)=[{0x81}, {0x6}]})
r0 = fork()
ptrace(0x10, r0)
fork()
ptrace$pokeuser(0x6, r0, 0x0, 0x0)
ptrace$PTRACE_SECCOMP_GET_METADATA(0x420d, 0x0, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = memfd_create(&(0x7f0000000200)='H)\xd4\x98#\'%nody\xed\xb0/\xa5\x7f\xfb\xd0ql\x86\xcd\xf6\x14\x93\xb0\x7f\x0eK@.\xc8\xa5\xb31\x10\x0f/;7\xce\xc7\xe3)L\x83\x1c\x06\xb7+&\x88i/\xdb6\xd6\xe26\xdd\xbd\xf9\x0e\xc1*\xbf\xe8,\xc3\xcb\xac\xfeq\x91\x17%M\xf6\x1d\xc6\xa7\xaf\xa0\xb0\xfc\xff\x13u\x98\xd7\xf5\x81\x12\xf4d\xc5\x94A\x03\xa8g\x18\xf5\xa5\x84\xb33H.\xce\xcd|\xf9\x86\xb7s\xf4\xb3)~\x83\xd6\xd7\x03\xcdz\xa6\x9b\x176\xb9\x90\xe3\xfb', 0x0)
fcntl$addseals(r0, 0x409, 0x0)
rt_sigreturn()
r0 = fork()
ptrace(0x4206, r0)
ptrace(0x8, r0)
r0 = memfd_create(&(0x7f00000002c0)='#\'%noY%v\x00\x7f\xe5\xd0ql\xe2m]\'\xe5+\xe8\xe1g\x9d\x8ef\x069\x9b\x93\xb0\x7f_,y<~\xab\x84\x00\x00\x00\x00\x00\x14\x14}\n\x81\xc7\x85|oC\xca\v\x00\xba]fn\r\xdf!\x94\x0f\xaf\xb7\x93\xe8\xb6\xc3N\x16&\xf9{\xaf;\xcf\x8c\xa8\xb9\x06\xaf\xd0\xfb:\x90LNF;\x02\x00\x00/1\xb9V\xf0*\xcb\xdc\x05n<\xcf$\xbb\bLY\xdf \x98q\xb2v\\\xa9\xcf*tM\a\xc43\xd0d\xee\x13Q\xd7\xf4\xef\xac\xa7\x01\xb4\x8c\xc1\x8c\x04,\xe0r\x01\xff\xff\xd9V!>.\x8b\xcf5\x0f\x96\xe6`\xa5.j\xe3D)0\x86\xe1\x81FX\xb9\xaa\xe4\xd2\xaf\xf61\xf9_-\xa3\xb2dM\xee\v ', 0x0)
write(r0, &(0x7f0000002000)='+', 0x1)
sendfile(r0, r0, &(0x7f0000000200), 0x87)
sendfile(r0, r0, &(0x7f00000001c0), 0xfec)
mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x5, 0x11, r0, 0x0)
creat(&(0x7f0000000080)='./bus\x00', 0x0)
clone(0x2006d380, 0x0, 0x0, 0x0, 0x0)
sendmsg$sock(0xffffffffffffffff, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000000)=[@timestamping={{0x14, 0x1, 0x25, 0x2}}], 0x18}, 0x0)
r0 = openat$tun(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$TUNSETIFF(r0, 0x400454ca, &(0x7f0000000000)={'syz_tun\x00'})
rt_sigreturn()
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
pipe2(&(0x7f00000001c0)={0xffffffffffffffff, <r0=>0xffffffffffffffff}, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x1, 0x0, 0x0)
openat(r0, &(0x7f0000000300)='./file1\x00', 0x140c0, 0x0)
r2 = gettid()
tkill(r2, 0x18)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x143042, 0x0)
connect$unix(r0, 0x0, 0x0)
clone(0x20043045d7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit_group(0x0)
exit(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
timer_create(0x0, &(0x7f0000000240)={0x0, 0x0, 0x1, @thr={0x0, 0x0}}, &(0x7f0000000280))
timer_settime(0x0, 0x0, &(0x7f0000000100)={{0x77359400}, {0x0, 0x989680}}, &(0x7f0000004c80))
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r0, 0x0)
rt_sigtimedwait(&(0x7f0000002380), 0x0, &(0x7f0000002400), 0x8)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
capset(&(0x7f0000000040)={0x20080522}, 0x0)
exit_group(0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0xfbe000)=nil, 0xfbe000, 0x2, 0x31, 0xffffffffffffffff, 0x0)
perf_event_open(&(0x7f0000000000)={0x2, 0x70, 0xfd, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r2 = semget$private(0x0, 0x4, 0x0)
semtimedop(r2, &(0x7f0000000080)=[{0x0, 0xffff}, {}], 0x2, &(0x7f00000000c0)={0x0, 0x989680})
clone(0x38004100, 0x0, 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
openat(0xffffffffffffff9c, &(0x7f00000000c0)='./file0\x00', 0x103041, 0x0)
getxattr(&(0x7f0000000040)='./file0\x00', &(0x7f0000000080)=@known='user.incfs.size\x00', 0x0, 0x0)
exit_group(0x0)
rt_sigreturn()
set_robust_list(&(0x7f0000000100), 0xc)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
mount$overlay(0x40000a, &(0x7f0000000000)='./file0\x00', &(0x7f00000000c0), 0x0, &(0x7f0000000140)=ANY=[@ANYRESDEC=0x0])
recvmmsg(r0, &(0x7f0000002d40)=[{{0x0, 0x0, 0x0}}], 0x1, 0x0, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setrlimit(0x0, &(0x7f0000000080))
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = semget$private(0x0, 0x3, 0x198)
semop(r1, &(0x7f0000000040)=[{0x0, 0x6}, {}], 0x2)
semop(r1, &(0x7f0000000080)=[{}, {0x1, 0xfffc}], 0x2)
semop(r1, &(0x7f0000000000)=[{0x1, 0x9}], 0x1)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
faccessat(0xffffffffffffffff, &(0x7f0000000200)='./file1\x00', 0x0)
tkill(r0, 0x18)
r0 = creat(&(0x7f0000000040)='./bus\x00', 0x0)
lseek(r0, 0xfffffffffffffe01, 0x1)
socket$inet_icmp_raw(0x2, 0x3, 0x1)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
syz_open_procfs(0x0, &(0x7f0000000100)='fd/3\x00')
r0 = socket$packet(0x11, 0x2, 0x300)
ioctl$sock_SIOCGIFINDEX(r0, 0x8933, &(0x7f0000000040)={'wlan0\x00', <r1=>0x0})
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
sendto$packet(r0, 0x0, 0x0, 0x0, &(0x7f00000001c0)={0x11, 0x0, r1, 0x1, 0x0, 0x6, @link_local}, 0x14)
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r0 = fork()
ptrace(0x10, r0)
ptrace$setregs(0xd, r0, 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
exit_group(0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mkdir(&(0x7f0000fd5ff8)='./file0\x00', 0x0)
r1 = creat(&(0x7f0000df1000)='./file0/bus\x00', 0x0)
fcntl$lock(r1, 0x7, &(0x7f0000027000)={0x1})
pwrite64(0xffffffffffffffff, 0x0, 0x0, 0x0)
fcntl$lock(r1, 0x6, &(0x7f0000000080)={0x2, 0x0, 0x0, 0x6})
syz_open_dev$char_raw(&(0x7f0000000040), 0x1, 0xa4fd3f45d7cb7006)
clone(0x2000204d5fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
madvise(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x15)
exit_group(0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = memfd_create(&(0x7f0000001fc1)='\x00\xac=\x9d\xd2\xdb\xe6\xbf\xb4\b\xedcJ\x8e\x84\xd4N\x12\x9b\x1f\t\xbd\x11+\x86T\x16\xa3\xb3\xae0\x9f9?\xefo\xa4k\x012>\xa1\x9c\x86x\x1c\x9f\x84\x195\xde\x97_\t~\xf3Y\x12\"p^\xc1\x0f', 0x0)
write(r0, &(0x7f0000002000)='/', 0x1)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x4, 0x11, r0, 0x0)
r1 = getpid()
r2 = gettid()
rt_tgsigqueueinfo(r1, r2, 0x13, &(0x7f0000000100))
ptrace(0x10, r1)
ptrace$getregset(0x4204, r1, 0x202, &(0x7f0000000040)={0x0})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x38004100, 0x0, 0x0, 0x0, 0x0)
semget$private(0x0, 0x0, 0x0)
rt_sigreturn()
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt$inet_tcp_buf(r0, 0x6, 0xd, &(0x7f00000000c0)=""/2, &(0x7f0000000100)=0x2)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='gid_map\x00')
pwrite64(r0, 0x0, 0x0, 0x100000001)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = syz_open_procfs(0x0, &(0x7f0000000080)='fd\x00')
fstat(r2, &(0x7f0000000340)={0x0, 0x0, 0x0, 0x0, <r3=>0x0})
setuid(r3)
write$tcp_mem(r0, &(0x7f0000000000), 0x48)
exit(0x0)
r0 = signalfd(0xffffffffffffffff, &(0x7f0000000000), 0x8)
mknodat(r0, &(0x7f0000000100)='./file0\x00', 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
unlinkat(r0, &(0x7f0000000040)='./file0\x00', 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = gettid()
tkill(r2, 0x25)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000080)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
setsockopt(r1, 0x6, 0x6, 0x0, 0x0)
tkill(r0, 0x40)
timer_create(0x0, &(0x7f0000000080)={0x0, 0x12}, &(0x7f0000000140))
timer_settime(0x0, 0x0, &(0x7f000006b000)={{0x0, 0x989680}, {0x0, 0x989680}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x14, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000040)=<r0=>0x0)
timer_settime(r0, 0x0, &(0x7f00000000c0)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
r1 = socket$unix(0x1, 0x5, 0x0)
r2 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r2, &(0x7f0000003000)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
listen(r2, 0x0)
dup3(r2, r1, 0x0)
accept(r1, 0x0, 0x0)
r0 = socket$packet(0x11, 0x2, 0x300)
connect$packet(r0, &(0x7f0000000940)={0x11, 0x0, 0x0, 0x1, 0x0, 0x6, @random="fc7d4c78eb1f"}, 0x14)
pipe(&(0x7f00000002c0)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = socket$inet_udp(0x2, 0x2, 0x0)
r3 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x1, 0x0, 0x0)
close(r2)
openat$zero(0xffffffffffffff9c, &(0x7f0000000040), 0x2, 0x0)
write$binfmt_misc(r1, &(0x7f0000000000)=ANY=[], 0xfffffecc)
splice(r0, 0x0, r2, 0x0, 0x4ffe0, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r2 = perf_event_open(&(0x7f000025c000)={0x2, 0x70, 0x15, 0x0, 0x0, 0x0, 0x0, 0x1, 0x824b0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x8000}, 0x0, 0xdf7fffffffffffff, 0xffffffffffffffff, 0x0)
r3 = perf_event_open(&(0x7f000001d000)={0x1, 0x70}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
mmap(&(0x7f0000ffc000/0x3000)=nil, 0x3000, 0x0, 0x11, r3, 0x0)
ioctl$PERF_EVENT_IOC_SET_OUTPUT(r2, 0x2405, r3)
openat$zero(0xffffffffffffff9c, &(0x7f0000000040), 0x2a0c0, 0x0)
r0 = socket$unix(0x1, 0x1, 0x0)
r1 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r1, &(0x7f0000003000)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
listen(r1, 0x1ff)
connect$unix(r0, &(0x7f0000000280)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
r2 = dup3(r1, r0, 0x0)
accept4$packet(r2, 0x0, &(0x7f0000000300), 0x80800)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$inet6_tcp_int(r0, 0x6, 0x2, &(0x7f00000008c0), &(0x7f0000000900)=0x4)
poll(0x0, 0x0, 0x7fff)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f00000027c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
r2 = socket$nl_route(0x10, 0x3, 0x0)
getsockopt$sock_timeval(r2, 0x1, 0x0, 0x0, &(0x7f0000000100))
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
timer_settime(0x0, 0x0, &(0x7f0000000200)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f0000000140)={0x0, 0x16, 0x0, @thr={0x0, 0x0}}, &(0x7f00000003c0)=<r3=>0x0)
timer_settime(r3, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x3938700}}, 0x0)
r0 = socket$inet(0x10, 0x80002, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x800003, 0x12, r1, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
sendmsg(r0, &(0x7f0000000000)={0x0, 0x0, &(0x7f0000000080)=[{&(0x7f0000000100)="24000000140007041dfffd946f6105000a0000e8fe020801040008000800070004000000280000001100ffffba16a0aa1c0900000000000012000000000000eff24d8238cfa47e23f7efbf54", 0x4c}], 0x1}, 0x0)
syz_emit_ethernet(0x32, &(0x7f0000000080)={@multicast, @multicast, @void, {@ipv4={0x800, @icmp={{0x7, 0x4, 0x0, 0x0, 0x24, 0x0, 0x0, 0x0, 0x1, 0x0, @rand_addr=0x64010101, @local, {[@cipso={0x86, 0x6}]}}, @info_request={0x8}}}}}, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
prlimit64(0x0, 0x7, &(0x7f0000000000), 0x0)
pipe2(0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000080), 0x0)
r0 = socket$netlink(0x10, 0x3, 0x0)
sendmsg$netlink(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000001c0)=[{&(0x7f0000000200)={0x20, 0x12, 0x1, 0x0, 0x0, "", [@typed={0xc, 0x0, 0x0, 0x0, @u64=0x2}, @generic="8c"]}, 0x20}], 0x1}, 0x0)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mknod$loop(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
lsetxattr$smack_xattr_label(&(0x7f0000000040)='./file0\x00', &(0x7f0000000080)='security.SMACK64MMAP\x00', 0x0, 0x0, 0x0)
mount(&(0x7f0000000280)=ANY=[], &(0x7f00000001c0)='./file0\x00', &(0x7f0000000180)='cgroup\x00', 0x0, 0x0)
exit(0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000380)={0x2, 0x4e22}, 0x10)
listen(r0, 0xffd7)
syz_emit_ethernet(0x42, &(0x7f0000000000)={@local, @remote, @void, {@ipv4={0x800, @tcp={{0x5, 0x4, 0x0, 0x0, 0x34, 0x0, 0x0, 0x0, 0x6, 0x0, @dev, @local}, {{0x0, 0x4e22, 0x41424344, 0x41424344, 0x0, 0x6, 0x8, 0x2, 0x0, 0x0, 0x0, {[@timestamp={0x4, 0xa}]}}}}}}}, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r1 = semget$private(0x0, 0x20000000102, 0x0)
semctl$GETVAL(r1, 0x0, 0xc, 0x0)
clone(0x106300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$tcp_mem(0xffffff9c, &(0x7f0000000000)='/proc/sys/net/ipv4/tcp_rmem\x00', 0x1, 0x0)
pwritev2(r0, 0x0, 0x0, 0x0, 0x0, 0xc)
prlimit64(0x0, 0x0, &(0x7f0000000140), 0x0)
r0 = socket$inet6(0xa, 0x802, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f00000000c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setsockopt$inet6_udp_int(r0, 0x11, 0x67, &(0x7f00000003c0)=0x800000001, 0x4)
sendto$inet6(r0, 0x0, 0x0, 0x0, &(0x7f0000000180)={0xa, 0x4e23, 0x0, @ipv4={'\x00', '\xff\xff', @dev}}, 0x1f)
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
sendto$inet6(r0, &(0x7f0000000000)="8000000000000000", 0x8, 0x0, &(0x7f0000000140)={0xa, 0x0, 0x0, @local}, 0x1c)
r1 = memfd_create(&(0x7f0000001fc1)='\x00\xac=\x9d\xd2\xdb\xe6\xbf\xb4\b\xedcJ\x8e\x84\xd4N\x12\x9b\x1f\t\xbd\x11+\x86T\x16\xa3\xb3\xae0\x9f9?\xefo\xa4k\x012>\xa1\x9c\x86x\x1c\x9f\x84\x195\xde\x97_\t~\xf3Y\x12\"p^\xc1\x0f', 0x0)
write(r1, &(0x7f0000000140)='/', 0x1)
mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x4, 0x11, r1, 0x0)
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rt_sigreturn()
recvmmsg(r0, &(0x7f0000000400)=[{{0x0, 0x0, 0x0}}], 0x1, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = memfd_create(&(0x7f0000000100)='\vem1\xc1\xf8\xa6\x8dN\xc0\xa3w\xe2\xcb\xa2\xba\xe5\xf4\x97\xac#*\xff', 0x0)
write(r1, &(0x7f0000001a80)="06", 0x1)
mmap(&(0x7f0000000000/0x7000)=nil, 0x7000, 0x200000a, 0x11, r1, 0x0)
get_mempolicy(0x0, 0x0, 0x0, &(0x7f0000003000/0x3000)=nil, 0x3)
rt_tgsigqueueinfo(r0, r0, 0x34, &(0x7f0000000100))
r0 = openat(0xffffffffffffffff, &(0x7f000060cff8)='/', 0x0, 0x0)
renameat2(r0, &(0x7f0000000040)='./file0\x00', r0, &(0x7f0000000080)='./file0\x00', 0x0)
r0 = signalfd(0xffffffffffffffff, &(0x7f0000000100), 0x8)
r1 = signalfd(r0, &(0x7f0000000100), 0x8)
futimesat(r1, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
chmod(&(0x7f0000000180)='./file0\x00', 0x23f)
mkdir(&(0x7f0000000040)='./file0/file1\x00', 0x0)
newfstatat(0xffffffffffffff9c, &(0x7f0000000240)='./file0/file1\x00', &(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, <r0=>0x0}, 0x0)
setreuid(0x0, r0)
rename(&(0x7f0000000280)='./file0/file1\x00', &(0x7f00000002c0)='./file0\x00')
rt_sigreturn()
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = fork()
ptrace(0x10, r0)
rt_sigqueueinfo(r0, 0x9, &(0x7f0000000100)={0x0, 0x0, 0xfffffdfe})
rt_sigreturn()
r0 = socket$inet(0x2, 0x3, 0x2)
setsockopt$inet_mreqsrc(r0, 0x0, 0x27, &(0x7f0000000040)={@multicast2, @local}, 0xc)
r1 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r2 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r2, 0x0)
preadv(r1, &(0x7f00000001c0)=[{0x0, 0x2d}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0x1000007, 0x800000000009031, 0xffffffffffffffff, 0x0)
r3 = socket$inet(0x2, 0x3, 0x2)
setsockopt$inet_mreqsrc(r3, 0x0, 0x27, &(0x7f0000000040)={@multicast2, @local, @loopback}, 0xc)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
writev(0xffffffffffffffff, 0x0, 0x0)
exit(0x0)
ioctl$TCSETSW(0xffffffffffffffff, 0x5403, 0x0)
r0 = socket$inet6(0xa, 0x1, 0x0)
setsockopt$sock_timeval(r0, 0x1, 0x15, &(0x7f0000000180)={0x0, 0xea60}, 0x10)
connect$inet6(r0, &(0x7f0000000040)={0xa, 0x0, 0x0, @remote, 0x3}, 0x1c)
r1 = dup(r0)
sendto$inet6(r1, 0x0, 0x0, 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x320e100, 0x0, 0x0, 0x0, 0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000840)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
fcntl$lock(r0, 0x6, &(0x7f0000000180))
fcntl$lock(r0, 0x7, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x1})
fcntl$lock(r0, 0x7, &(0x7f00000011c0)={0x0, 0x0, 0x108800001, 0xc})
fcntl$lock(r0, 0x7, &(0x7f00000002c0)={0x0, 0x0, 0x64f7, 0x1f})
rt_sigreturn()
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
ioctl$FIOCLEX(r0, 0x5451)
r1 = syz_open_procfs(0x0, &(0x7f0000000000)='fdinfo/3\x00')
preadv(r1, &(0x7f0000001600)=[{&(0x7f00000002c0)=""/207, 0xcf}], 0x1, 0x0, 0x0)
clock_getres(0x2cf56f35a76177ff, 0x0)
socket$inet6_tcp(0xa, 0x1, 0x0)
mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
mount(0x0, &(0x7f0000027000)='./file0\x00', &(0x7f0000000080)='ramfs\x00', 0x2000050, 0x0)
r0 = creat(&(0x7f0000000100)='./file0/bus\x00', 0xbc9dc8fbd81cb4b1)
fcntl$lock(r0, 0x7, &(0x7f00000003c0)={0x1, 0x0, 0x0, 0x0, 0xffffffffffffffff})
unshare(0x40600)
pwritev(r0, &(0x7f0000000000)=[{&(0x7f00000000c0)='T', 0x1}], 0x1, 0x0, 0x0)
write$9p(0xffffffffffffffff, 0x0, 0x0)
close(0x4)
r0 = socket$packet(0x11, 0x2, 0x300)
syz_emit_ethernet(0x36, &(0x7f00000001c0)={@random="06c26f420a53", @empty, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "00ff80", 0x0, 0x0, 0x0, @dev, @local}}}}, 0x0)
recvmmsg(r0, &(0x7f0000004a00)=[{{0x0, 0x0, 0x0}}, {{0x0, 0x0, 0x0}}], 0x2, 0x40, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
seccomp$SECCOMP_SET_MODE_FILTER(0x1, 0x0, &(0x7f0000000080)={0x2, &(0x7f0000000040)=[{0xe4}, {0x6}]})
timer_create(0x0, 0x0, &(0x7f0000000400))
timer_settime(0x0, 0x1, &(0x7f0000000480)={{}, {0x0, 0x989680}}, 0x0)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000005240)={<r0=>0xffffffffffffffff})
getsockopt$sock_int(r0, 0x1, 0x1e, &(0x7f0000005280), &(0x7f00000064c0)=0x4)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
sched_setaffinity(0x0, 0x8, &(0x7f0000000100)=0x89)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000080)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
fcntl$lock(r1, 0x6, &(0x7f0000000180)={0x1})
fcntl$lock(r1, 0x5, &(0x7f0000000000)={0x1})
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendto(r1, &(0x7f0000002540)="06", 0x1, 0x0, 0x0, 0x0)
recvmmsg(r0, &(0x7f0000002380)=[{{0x0, 0x0, &(0x7f00000004c0)=[{&(0x7f00000000c0)=""/199, 0xc7}], 0x1}}], 0x1, 0x120, &(0x7f00000024c0)={0x0, 0x3938700})
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
capset(&(0x7f00000000c0), 0x0)
r0 = openat$null(0xffffffffffffff9c, &(0x7f0000000300), 0x0, 0x0)
lseek(r0, 0x0, 0x0)
r0 = openat$null(0xffffffffffffff9c, &(0x7f0000000140), 0x201, 0x0)
pwritev(r0, &(0x7f00000001c0)=[{&(0x7f0000000180)="da", 0x1}], 0x1, 0x0, 0x0)
creat(&(0x7f000000f1c0)='./file0\x00', 0x0)
stat(&(0x7f0000000080)='./file0\x00', &(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, <r0=>0x0})
setreuid(0x0, r0)
lremovexattr(&(0x7f0000000000)='./file0\x00', &(0x7f0000000040)=@known='trusted.syz\x00')
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
dup2(0xffffffffffffffff, 0xffffffffffffffff)
rt_sigreturn()
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = semget$private(0x0, 0x4, 0x0)
semctl$IPC_INFO(r0, 0x4, 0x10, 0x0)
setrlimit(0x0, &(0x7f0000000080))
syz_emit_ethernet(0x4e, &(0x7f0000000040)={@local, @local, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "9aeac6", 0x18, 0x3a, 0xff, @private2, @mcast2, {[], @ndisc_na={0x88, 0x0, 0x0, 0x61, '\x00', @private0}}}}}}, 0x0)
clone(0x5100, 0x0, 0x0, 0x0, 0x0)
getdents64(0xffffffffffffffff, 0x0, 0x0)
rt_sigreturn()
pipe(&(0x7f0000004880)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
write$FUSE_POLL(r1, &(0x7f00000048c0)={0x18}, 0x18)
write$FUSE_GETXATTR(r1, &(0x7f0000004900)={0x18}, 0x18)
read$FUSE(r0, &(0x7f0000004ac0)={0x2020, 0x0, 0x0, 0x0, 0x0, <r2=>0x0}, 0x2020)
capget(&(0x7f0000006b00)={0x19980330, r2}, &(0x7f0000006b40))
r0 = socket$packet(0x11, 0x3, 0x300)
ioctl$sock_inet_SIOCGIFNETMASK(r0, 0x891b, &(0x7f00000000c0)={'syz_tun\x00', {0x2, 0x0, @initdev}})
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$netlink(0x10, 0x3, 0x0)
setsockopt$SO_TIMESTAMPING(r0, 0x1, 0x1b, 0x0, 0x0)
r1 = gettid()
rt_sigqueueinfo(r1, 0xa, &(0x7f0000000040))
removexattr(&(0x7f0000000140)='./file0\x00', 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit_group(0x0)
perf_event_open(&(0x7f000025c000)={0x2, 0x70, 0x15}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = openat$zero(0xffffffffffffff9c, &(0x7f0000000180), 0x1, 0x0)
write$cgroup_subtree(r2, 0x0, 0x5)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
clone(0x4000000206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
syz_open_pts(r0, 0x0)
exit_group(0x0)
clone(0x28e4640, &(0x7f0000000100), 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
shmctl$SHM_INFO(0x0, 0xe, &(0x7f0000000000)=""/164)
r0 = socket$inet6(0xa, 0x1, 0x0)
getsockopt$inet6_int(r0, 0x29, 0x4a, 0x0, &(0x7f0000000280))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigaction(0xd, &(0x7f0000000080)={&(0x7f0000000000)="f346d37ff2f2438395ff1f965a00c4c1185c840561ee00007b7b01fa6f6aa1c463f962d60064660fd054060aa89ef37c6467f30f2a6cda00c4c1f91129", 0x88000000, 0x0}, 0x0, 0x8, &(0x7f0000000180))
pipe(&(0x7f0000000000)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff})
close(r1)
write$P9_RLINK(r2, &(0x7f0000000040)={0x7}, 0x7)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
getsockopt$inet_buf(r0, 0x0, 0x43, &(0x7f0000019300)=""/4096, &(0x7f0000000040)=0x1000)
perf_event_open(&(0x7f0000000000)={0x2, 0x80, 0x23, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={0x0}}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = fork()
ptrace$setopts(0x4206, r0, 0x0, 0x0)
ptrace(0x4207, r0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = memfd_create(&(0x7f0000000180)='&\x1f\xbb\xf0\xc0\xeb\x827\xfe*\x9dp\xc9@\x84\x95\xae\xb2\xba\x18\x0f\x0f\x8eMq\x1e\x8b\xf1\f6\xb5\xab=\xdc\xe3\xda+\x1bB\xda\"fiOh^\xa4\xad \x81\x19b\xcf\xf3\x00;\xd9\x17\xe3\xfb\xde\xb1\x96\xbcnZc\xe5\x14r8\xe6\xe7\xfa^\xdc\xef\xe9\n \x92\xe9D\xe0\x12\x16\xe2_\xb6\xeb\x8c\xa1\x13\xbf\xa4p\xdb\x80\x91\xdf\xc08\x81\"\xe6\xa2%\x96\v\xe9-\xab\x14.g\xe3g-}@h\x88\xe7\x9eEP#r\xf4\x88\xd1\xbf\xc8\xf8-\x95@\xae\x0f\xd96!\x1c$\xfa\x8cK\x84\xd7\xb9\xa8X\xffj\xf4\xc4\b\x00+\x15\x9c9\x89\x03\x9bC\x9f\x0e', 0x0)
writev(r0, &(0x7f0000000000)=[{0x0, 0x21}, {&(0x7f0000000040)="11", 0xffffffffffffff40}], 0x2)
exit_group(0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0xfbe000)=nil, 0xfbe000, 0x7, 0x31, 0xffffffffffffffff, 0x0)
r2 = creat(&(0x7f0000000400)='./bus\x00', 0x0)
ftruncate(r2, 0x208200)
r3 = open(&(0x7f0000000200)='./bus\x00', 0x10103e, 0x0)
mmap(&(0x7f0000000000/0x600000)=nil, 0x600000, 0x7ffffe, 0x4002011, r3, 0x0)
truncate(&(0x7f0000000040)='./bus\x00', 0x0)
syz_emit_ethernet(0x6e, &(0x7f0000000000)={@broadcast, @broadcast, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "00df5a", 0x38, 0x3a, 0x0, @rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01', @mcast2, {[], @dest_unreach={0x2, 0x0, 0x0, 0x40, '\x00', {0x0, 0x6, "000001", 0x0, 0x3a, 0x0, @dev, @loopback, [], "05335175feab2070"}}}}}}}, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
r1 = socket$netlink(0x10, 0x3, 0x0)
getsockopt$sock_int(r1, 0x1, 0x8, 0x0, &(0x7f0000002300))
rt_sigqueueinfo(r0, 0x39, &(0x7f0000000000))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x300000b, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigpending(&(0x7f0000000340), 0x8)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
munmap(&(0x7f0000000000/0x3000)=nil, 0x3000)
mincore(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x0)
rt_sigreturn()
poll(0x0, 0x0, 0x7fff)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f00000027c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
perf_event_open(0x0, 0x0, 0x0, 0xffffffffffffffff, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f00000005c0)={<r2=>0xffffffffffffffff, <r3=>0xffffffffffffffff})
write$binfmt_elf64(r2, &(0x7f0000000000)=ANY=[], 0x1)
setsockopt$sock_int(r3, 0x1, 0x200000010, &(0x7f00000000c0)=0x1, 0x4)
write$binfmt_elf32(r2, &(0x7f0000000600)=ANY=[], 0x1)
recvmmsg(r3, &(0x7f0000000040)=[{{0x0, 0x0, &(0x7f0000002b00)=[{&(0x7f0000002a80)=""/95, 0x5f}], 0x1}}], 0x400000000000170, 0x0, 0x0)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
timer_settime(0x0, 0x0, &(0x7f0000000200)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x16}, &(0x7f00000003c0)=<r4=>0x0)
timer_settime(r4, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x3938700}}, 0x0)
prctl$PR_SVE_SET_VL(0x26, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
unlink(&(0x7f0000000000)='./file0\x00')
fallocate(r0, 0x0, 0x102000006, 0x6)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000380)={<r0=>0xffffffffffffffff})
sendmmsg(r0, &(0x7f0000002e80)=[{{0x0, 0x0, 0x0}}, {{0x0, 0x0, 0x0, 0x0, &(0x7f00000025c0)=[{0x10, 0x1, 0x2}], 0x10}}], 0x2, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000d80)={<r0=>0xffffffffffffffff})
openat$cgroup(r0, &(0x7f00000005c0)='syz1\x00', 0x200002, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
mremap(&(0x7f0000a94000/0x2000)=nil, 0x2000, 0x800000, 0x3, &(0x7f0000130000/0x800000)=nil)
mlockall(0x3)
clone(0x0, 0x0, 0x0, 0x0, 0x0)
mmap(&(0x7f0000000000/0xfbe000)=nil, 0xfbe000, 0x0, 0x31, 0xffffffffffffffff, 0x0)
getgroups(0x0, &(0x7f0000000100))
prlimit64(0x0, 0x7, &(0x7f0000000140), 0x0)
timerfd_create(0x0, 0x0)
eventfd2(0x0, 0xc0801)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$unix(0x1, 0x5, 0x0)
bind$unix(r0, &(0x7f0000003000)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0xc)
listen(r0, 0x1)
r1 = socket$unix(0x1, 0x5, 0x0)
r2 = socket$unix(0x1, 0x5, 0x0)
connect(r2, &(0x7f0000931ff4)=@un=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0xc)
connect$unix(r1, &(0x7f0000000140)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
exit_group(0x0)
exit(0x0)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000180)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
setsockopt(r0, 0x1, 0xa, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
mkdir(&(0x7f0000000040)='./file0/file1\x00', 0x0)
getxattr(&(0x7f0000000100)='./file0/file1\x00', &(0x7f0000000240)=@random={'os2.', '-\x00'}, 0x0, 0x0)
exit_group(0x0)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000000), 0x141282, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
ftruncate(r0, 0x0)
r1 = gettid()
rt_sigqueueinfo(r1, 0x34, &(0x7f0000000140))
r0 = socket$inet6(0xa, 0x2, 0x0)
bind$inet6(r0, &(0x7f0000f5dfe4)={0xa, 0x4e20, 0x0, @empty}, 0x1c)
sendto$inet6(r0, 0x0, 0x0, 0x0, &(0x7f0000000240)={0xa, 0x4e20, 0x0, @empty}, 0x1c)
setsockopt$inet6_int(r0, 0x29, 0x42, &(0x7f0000000080)=0xf60, 0x4)
recvmmsg(r0, &(0x7f0000000200)=[{{0x0, 0x0, 0x0}}], 0x1, 0x0, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
pipe(&(0x7f00000017c0)={<r0=>0xffffffffffffffff})
fsetxattr$security_evm(r0, &(0x7f0000001280), &(0x7f0000001240)=@sha1={0x1, "018cee8a000000d3ce474200"}, 0xfffffffffffffe11, 0x0)
rt_sigreturn()
mknod(&(0x7f0000000040)='./file1\x00', 0x0, 0x0)
lsetxattr$trusted_overlay_upper(&(0x7f0000000300)='./file1\x00', &(0x7f0000000340), &(0x7f0000000380)={0x0, 0xfb, 0x39, 0x0, 0x0, "c4fb4f64f4c60703046c6b5bd5ab2b2a", "f003d70dd3d1fd4c835ee608155790fb87f42ae7e6b131d0414672a2fbc84aa55a83db65"}, 0x39, 0x0)
setxattr$incfs_metadata(&(0x7f0000000080)='./file1\x00', &(0x7f00000000c0), &(0x7f0000000140)="9157d3ee4ae2510e8c1d30b3a6dad58a8a7e4eb46a64fc357ed090884ece26778d69d9011a4602bd7015be470fbaf0948ebd5a82c6bde7660b", 0x39, 0x0)
lremovexattr(&(0x7f0000000180)='./file1\x00', &(0x7f00000001c0)=@known='trusted.overlay.upper\x00')
perf_event_open(&(0x7f0000000200)={0x2, 0x70, 0x42, 0x8001}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = socket$inet(0x2, 0xa, 0x0)
io_setup(0x8, &(0x7f0000000080)=<r1=>0x0)
io_getevents(r1, 0x3, 0x3, &(0x7f0000000180)=[{}, {}, {}], 0x0)
io_submit(r1, 0x1, &(0x7f00000003c0)=[&(0x7f0000000000)={0x0, 0x0, 0x0, 0x8, 0x0, r0, 0xffffffffffffffff}])
io_destroy(r1)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0, 0x10, 0xffffffffffffffff, 0x0)
clone(0x5fd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
wait4(0x0, 0x0, 0x40000000, 0x0)
clone(0x101efff, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
pipe(&(0x7f0000000040)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
write(r1, &(0x7f00000000c0)="6d527cd53870164a3a0d4b64fb0d7bebad2dce076e7768215970e33adf15173c9e665cff10727f62077ebc24a796b221a2f39fd294dc01861206b499138d02ebf3cfc3b11f0e18858568476bac483df9c4d0a61da2d2f9b7c4cb601c0141f209fc9e06d9457920a9a749a23ccd52eb91db5b189627774719cf91bd6e63a2b8a3b657c0e438ffc3e275b03ef0f384a0c1f20143b7b87f2e34729b000000805e0ad338423d4200f349c545516c46bb9f104a3816b12950faa20fab5827bc62a8d4cc12c4c8954308a933d63aa66cdb3646a37626de7361b5338c197dd3e6844dafcb4338dce0b79ee41da150eca12fbd36b4873ce8e4747b63e8830ee6c32f254d3779e13b27a15beaf488ea843600"/297, 0xffffffca)
ptrace(0x4206, r0)
wait4(0x0, 0x0, 0x0, &(0x7f00000015c0))
tkill(r0, 0x800000009)
pipe(&(0x7f00000000c0)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
write$binfmt_misc(r1, &(0x7f0000000100)=ANY=[], 0x4)
r2 = memfd_create(&(0x7f0000000780)='\x00', 0x0)
splice(r0, 0x0, r2, &(0x7f0000000080), 0x400000000003, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = memfd_create(&(0x7f0000000300)='\x00\xc1\xf9\xe7\x92\xc2\xe8\x82\x8e\t\x92\x84\x13\x11^h\x9a*\x0f]\x9d\xa7Z\xf8\xc1\x9cW\n\xa5\xbd\'f\x13\xc8\x8d\xc8\x9d|\x86\x99+\xfa\xef\x7f\xe9\xb2pP\xb88\xec\x84\x18\xa3\xa5^*\xc5.\xbc\xd4\x00&YY*\xa0,\x80\xef\x9e\xd7sY\x1b\x9c\b\x9b\x89\x82\x9dc\x06=\"\x8e\x10\x10\xa40\x8bJ\a\xc6\xe3K\x1a(\x9a\x7fXP\xdd\x1a\xae\b<\xc0\xee\xf8\x11\xd1<,T?\x80-\x9f\xc0[<\xe3(\xd1\xb7}\xa1:\x95m\xcf\x83t\xaa\xaa\x80\x06_\xb0\x81\xee\xdfa\xcd#\xaf\xb5=\xa1\x80\xbc\x89X\xd2\x1et\xee\x99A}\xf8\x1dN\xc6\xb1\x87\xd7\xb7\xfa\x94zt\xe9eI\xac[\xad\\b\x13\x9e\x7f\x01\xfd\xd4\x03\x82', 0x0)
syz_open_procfs(0x0, &(0x7f0000000080)='mountinfo\x00')
write$FUSE_DIRENT(r0, &(0x7f0000000080)=ANY=[], 0x29)
newfstatat(0xffffffffffffff9c, &(0x7f0000000280)='.\x00', &(0x7f0000001500)={0x0, 0x0, 0x0, 0x0, <r1=>0x0}, 0x0)
setreuid(0x0, r1)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x81, 0x11, r0, 0x0)
r2 = syz_open_procfs(0x0, &(0x7f0000000040)='fd/4\x00')
unlinkat(r2, &(0x7f0000000000)='./file1\x00', 0x200)
exit(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = timerfd_create(0x0, 0x0)
timerfd_gettime(r1, &(0x7f0000000100))
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000000)={0x2, &(0x7f0000000140)=[{0x64}, {0x6, 0x0, 0x0, 0x7ffffffa}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f00000000c0)='./file0\x00', 0x0)
mount(&(0x7f00000000c0)=ANY=[], &(0x7f0000000040)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x0, 0x0)
chdir(&(0x7f0000000000)='./file0\x00')
rt_sigreturn()
unlink(&(0x7f0000000100)='.\x00')
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = memfd_create(&(0x7f0000000380)='s\xf5\x89\x1c\x9e\xddF\x9c\x1drity\a\x00\b\x00\x00\x00\x00\x00\x00\xe02\xf9\xde\x7fD6\a\x93\xbf}\xdc-\x05\x8c\xa5\x80\xfcH\x0fi\x000\xf6\x1f~\x9e\xb4\xa8\x14\x93\xa3\xf3^\xfd.\xd1\xe8\xf0\xf8\x83I\x8b\xc7\x10\xd1g\x9fd$\x839\x1e\x88\xe3\x86\x19\x11\xabXK\xc4D\x8fZx\xe7\xe4\x98\x9bx\xfa\'0\xc9[\x9b=2\xfa\xe1\x8at\xd1I2\x14B\xb2\xe7;\xcau\xa7<E\x01U@\xb1n\x00\x00\x00\x00\x00\xd56\xa7\\\x91\x03\xcd;\xb3\x1aiO6\xe9\f\xfcH\xfd\x94\xe8\x1e2\x86W\xd1\x02\n\x10\xa4BE\xfe\x15]\xeb y\x99\xd6\xf7\xa0\xf5\x9b\x01\x00\x00\x00\x00\x00\x00\x00\xfd4\\\f\xb6MC\x80f+\xc4\xf6\x93\x87P@`{\xf9\xff;`\x89:w\xbe\xf3*\xbb/:\x9e?\x06\xdaF\x93@1riK\xc7/\xb3\xd9wT\xf0\xc5\xff\xcdQ\x12\xc9\x95\x95\xd8\x1e@g\x0fa\xd5\xd2\xa3<\xd0\x84,', 0x0)
mmap(&(0x7f0000001000/0x1000)=nil, 0x1000, 0x4, 0x11, r0, 0x0)
symlink(&(0x7f0000001000)='./file0\x00', 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r2 = socket$inet6_tcp(0xa, 0x1, 0x0)
fallocate(r2, 0x0, 0x102000006, 0x6)
r0 = open(&(0x7f0000000280)='./file0\x00', 0x3fc, 0x0)
flock(r0, 0x1)
r1 = open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
flock(r1, 0x2)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
dup2(r1, r0)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f00000002c0)={<r0=>0xffffffffffffffff})
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000a, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ioctl$sock_inet_SIOCGIFPFLAGS(r0, 0x8935, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
bind$inet6(r0, &(0x7f0000002700)={0xa, 0x4e20, 0x0, @empty}, 0x1c)
r1 = socket$inet6_icmp(0xa, 0x2, 0x3a)
bind$inet6(r1, &(0x7f0000002700)={0xa, 0x4e20, 0x0, @empty}, 0x1c)
poll(0x0, 0x0, 0x7fff)
clone(0x0, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f00000027c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
timer_settime(0x0, 0x0, &(0x7f0000000200)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x16, 0x0, @thr={0x0, 0x0}}, &(0x7f00000003c0)=<r2=>0x0)
timer_settime(r2, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
madvise(&(0x7f0000466000/0x2000)=nil, 0x2000, 0x13)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
r1 = syz_open_pts(r0, 0x0)
r2 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
fstat(r1, &(0x7f000000f780))
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
fstat(r0, &(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, <r1=>0x0})
setuid(r1)
mkdir(&(0x7f0000000040)='./file0/file1\x00', 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000080), 0x0)
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r0 = fork()
ptrace(0x10, r0)
clone(0xa912d700, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rt_sigreturn()
r1 = getpid()
r2 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
ptrace$peekuser(0x3, r0, 0x7fff)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800013, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigqueueinfo(r1, 0x39, &(0x7f0000000000))
clone(0x2000c500, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet6(0xa, 0x2, 0x0)
getsockopt$IP6T_SO_GET_ENTRIES(r0, 0x29, 0x41, &(0x7f0000000000)={'raw\x00', 0x4, "e2bfffc8"}, &(0x7f0000000080)=0x28)
rt_sigreturn()
io_setup(0xf92, &(0x7f0000000080)=<r0=>0x0)
r1 = eventfd2(0x0, 0x0)
io_submit(r0, 0x1, &(0x7f0000000340)=[&(0x7f0000000300)={0x0, 0x0, 0x0, 0x8, 0x0, r1, 0x0, 0x0, 0x0, 0x0, 0x3, r1}])
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f00000000c0), 0x280e00, 0x0)
flock(r0, 0x8)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f0000000080)='fd\x00')
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x0, 0x11, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x1, 0x0, 0x0)
exit(0x0)
r0 = gettid()
getpid()
timer_create(0x0, &(0x7f0000044000)={0x0, 0x12}, &(0x7f0000044000))
timer_settime(0x0, 0x0, &(0x7f000006b000)={{0x0, 0x8}, {0x0, 0x9}}, 0x0)
r1 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r1, 0x6, 0x10000000013, &(0x7f0000d06000)=0x1, 0x4)
setsockopt$SO_BINDTODEVICE(r1, 0x1, 0x19, &(0x7f0000000000)='veth1\x00', 0x10)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000140)={<r2=>0xffffffffffffffff})
r3 = fcntl$dupfd(r2, 0x0, r2)
ioctl$PERF_EVENT_IOC_ENABLE(r3, 0x8912, 0x400200)
connect$inet(r1, &(0x7f0000000080)={0x2, 0x0, @initdev={0xac, 0x1e, 0x0, 0x0}}, 0x10)
setsockopt$inet_tcp_TCP_REPAIR(r1, 0x6, 0x13, &(0x7f0000000100), 0x4)
setsockopt$sock_linger(r1, 0x1, 0xd, &(0x7f0000000140)={0x1, 0x1}, 0x8)
close(r1)
tkill(r0, 0x16)
io_setup(0x2, &(0x7f0000000100)=<r0=>0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = openat$fuse(0xffffffffffffff9c, &(0x7f0000000280), 0x2, 0x0)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r3, 0x0)
io_submit(r0, 0x2, &(0x7f0000000140)=[&(0x7f0000000180)={0x0, 0x0, 0x0, 0x0, 0x0, r2, 0x0}, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x7, 0x0, r1, &(0x7f0000000440)="80fc4c645f7a02db13a0374c21ae51497890ec02ae3bcc9a5ea8628d278146d85a8bc2b7c38ad1e3197e229176296f51025ee0f989a7ff6b91a73660a17f2932ca9038e6220d961d5a6c7050031e2e3d9ca746edbb589e218307e5b9f03dc23ebddd77c1f65c8b977fac4ef78b5d9bc62501a609ccf00462a88eaa6ece4ba6d8c8e781ea17e4f4bc26c23c5060123762f5704287cc897999d24662ee868a41c5ad6581636d913bd96ba788404a6256d1440496b8327c1b1feecdfb3043", 0xbd}])
r0 = socket$inet6(0xa, 0x801, 0x0)
setsockopt$inet6_int(r0, 0x29, 0x1a, &(0x7f0000000100)=0x1f, 0x4)
bind$inet6(r0, &(0x7f0000000000)={0xa, 0x0, 0x0, @ipv4={'\x00', '\xff\xff', @dev}}, 0x1c)
recvmmsg(0xffffffffffffffff, &(0x7f0000003140)=[{{&(0x7f0000000000)=@ipx, 0xffffffffffffffef, 0x0}}], 0x1, 0x0, 0x0)
perf_event_open(&(0x7f0000000080)={0x1, 0x70}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
clone(0x4000008006ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f0000000180)='fdinfo/3\x00')
exit(0x0)
preadv(r0, &(0x7f0000000500), 0x2cb, 0x0, 0x0)
r0 = socket$inet6(0xa, 0x3, 0x3a)
sendmmsg$sock(r0, &(0x7f0000001740)=[{{&(0x7f0000000000)=@l2tp6={0xa, 0x0, 0x0, @remote}, 0x80, &(0x7f00000011c0)=[{&(0x7f0000000080)="c9a7e8d5", 0x4}], 0x1}}, {{0x0, 0x0, 0x0}}], 0x2, 0x0)
r0 = creat(&(0x7f0000000400)='./file0\x00', 0x0)
capset(&(0x7f0000000000)={0x20080522}, &(0x7f0000000040))
fgetxattr(r0, &(0x7f0000000340)=@random={'user.', 'fou\x00'}, 0x0, 0x0)
setrlimit(0x40000000000008, &(0x7f0000000000)={0x4848, 0xfffffffffffff005})
capset(&(0x7f0000a31000)={0x20071026}, &(0x7f00009b3000))
mlock2(&(0x7f0000006000/0x3000)=nil, 0x3000, 0x0)
mlock(&(0x7f0000008000/0x3000)=nil, 0x3000)
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r0 = fork()
ptrace(0x10, r0)
ptrace$setopts(0x4200, r0, 0x0, 0x100054)
ptrace$cont(0x7, r0, 0x0, 0x6)
exit_group(0x0)
set_mempolicy(0x0, 0xfffffffffffffffe, 0x3aa)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
getrandom(0xfffffffffffffffd, 0xfe7b, 0x0)
r0 = gettid()
rt_sigqueueinfo(r0, 0xe, &(0x7f0000000040))
getresuid(&(0x7f0000005dc0), 0x0, 0x0)
accept$packet(0xffffffffffffffff, 0x0, &(0x7f0000009540))
fork()
getresuid(&(0x7f0000009580), &(0x7f00000095c0), &(0x7f0000009600))
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
r1 = getpid()
r2 = syz_open_pts(r0, 0x0)
ioctl$TIOCSPGRP(r2, 0x5410, 0x0)
rt_sigqueueinfo(r1, 0x39, &(0x7f0000000000))
r0 = socket$netlink(0x10, 0x3, 0x0)
connect$netlink(r0, &(0x7f0000000000)=@kern={0x10, 0x0, 0x0, 0x8188084}, 0xc)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000080)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
recvmmsg(r0, &(0x7f0000002a40)=[{{0x0, 0x0, 0x0}}, {{0x0, 0x0, &(0x7f00000010c0)=[{&(0x7f0000002640)=""/74, 0x4a}, {&(0x7f00000026c0)=""/61, 0x3d}, {&(0x7f0000000040)=""/14, 0xe}, {0x0, 0xffc5}, {&(0x7f00000000c0)=""/4096, 0x1000}], 0x5}}], 0x2, 0x0, &(0x7f0000002b40))
clone(0x5100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$inet6_tcp_int(r0, 0x6, 0x19, 0x0, &(0x7f0000000080))
rt_sigreturn()
clone(0x2e380, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mknod$loop(&(0x7f0000000200)='./file0\x00', 0x0, 0x1)
mount(&(0x7f0000000280)=ANY=[], &(0x7f00000000c0)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x0, 0x0)
utime(&(0x7f0000000000)='./file0\x00', 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
tkill(r0, 0x3a)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
poll(0x0, 0x0, 0x7fff)
clone(0x0, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f00000027c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
wait4(0x0, 0x0, 0x80000000, 0x0)
timer_settime(0x0, 0x0, &(0x7f0000000200)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x16, 0x0, @thr={0x0, 0x0}}, &(0x7f00000003c0)=<r2=>0x0)
timer_settime(r2, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000040)='net/if_inet6\x00')
read$FUSE(r1, &(0x7f0000000600)={0x2020}, 0x2020)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = memfd_create(&(0x7f0000001fc1)='\x00\xac=\x9d\xd2\xdb\xe6\xbf\xb4\b\xedcJ\x8e\x84\xd4N\x12\x9b\x1f\t\xbd\x11+\x86T\x16\xa3\xb3\xae0\x9f9?\xefo\xa4k\x012>\xa1\x9c\x86x\x1c\x9f\x84\x195\xde\x97_\t~\xf3Y\x12\"p^\xc1\x0f', 0x0)
readahead(r0, 0x0, 0x0)
setrlimit(0x0, &(0x7f0000000080))
exit(0x0)
r0 = socket$unix(0x1, 0x1, 0x0)
setsockopt$sock_int(r0, 0x1, 0x10, &(0x7f0000000240)=0xffffffff, 0x4)
r1 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r1, &(0x7f0000000000)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
listen(r1, 0x0)
connect$unix(r0, &(0x7f0000000140)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
r2 = accept(r1, 0x0, 0x0)
sendto$unix(r2, &(0x7f0000000280)="7f", 0x1, 0x0, 0x0, 0x0)
recvmsg(r0, &(0x7f0000000640)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=""/7, 0x7}, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f00000000c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mkdir(&(0x7f0000000040)='./file0\x00', 0x0)
r1 = open(&(0x7f0000000280)='.\x00', 0x0, 0x0)
chdir(&(0x7f00000001c0)='./file0\x00')
mkdirat(r1, &(0x7f0000000200)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x0)
renameat2(r1, &(0x7f00000004c0)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0xffffffffffffff9c, &(0x7f0000000140)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
mkdir(&(0x7f0000000300)='./file0\x00', 0xfffffffffffffffe)
mknod$loop(&(0x7f0000000940)='./file0/bus\x00', 0x6210, 0x0)
r1 = socket(0x11, 0x2, 0x0)
getsockopt$sock_cred(r1, 0x1, 0x11, &(0x7f0000caaffb)={0x0, <r2=>0x0}, &(0x7f0000cab000)=0xc)
chown(&(0x7f0000000140)='./file0/bus\x00', r2, 0x0)
open$dir(&(0x7f0000000040)='./file0/bus\x00', 0x40f40, 0x0)
io_setup(0x800, &(0x7f0000000880)=<r0=>0x0)
pipe2$9p(&(0x7f0000000000)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff}, 0x0)
io_submit(r0, 0x1, &(0x7f0000000300)=[&(0x7f0000000180)={0x0, 0x0, 0x0, 0x0, 0x0, r1, &(0x7f0000000100)="b9", 0x1}])
io_setup(0x4, &(0x7f0000000440)=<r3=>0x0)
io_destroy(r0)
io_submit(r3, 0x1, &(0x7f00000006c0)=[&(0x7f0000000480)={0x0, 0x0, 0x0, 0x1, 0x0, r2, &(0x7f0000001180)='U', 0x1}])
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800013, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
setitimer(0x2, &(0x7f00000001c0)={{0x0, 0x2710}, {0xfffffffffffffff7}}, 0x0)
rt_sigreturn()
clone(0x5100, 0x0, 0x0, 0x0, 0x0)
renameat2(0xffffffffffffffff, &(0x7f0000001180)='./file0\x00', 0xffffffffffffffff, 0x0, 0x0)
rt_sigreturn()
r0 = openat(0xffffffffffffffff, &(0x7f0000000440)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
fcntl$lock(r0, 0x6, &(0x7f0000000180))
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = gettid()
r2 = gettid()
mount$fuse(0xf0ffff, &(0x7f00000020c0)='./file0\x00', &(0x7f0000002100), 0x0, &(0x7f0000000040)=ANY=[@ANYBLOB='fd=', @ANYRESOCT=r2])
tkill(r1, 0xe)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_int(r0, 0x0, 0x14, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = memfd_create(&(0x7f0000001fc1)='\x00\xac=\x9d\xd2\xdb\xe6\xbf\xb4\b\xedcJ\x8e\x84\xd4N\x12\x9b\x1f\t\xbd\x11+\x86T\x16\xa3\xb3\xae0\x9f9?\xefo\xa4k\x012>\xa1\x9c\x86x\x1c\x9f\x84\x195\xde\x97_\t~\xf3Y\x12\"p^\xc1\x0f', 0x0)
write(r0, &(0x7f0000000140)='/', 0x1)
mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x4, 0x12, r0, 0x0)
r1 = gettid()
r2 = gettid()
r3 = syz_open_procfs(0x0, &(0x7f0000000300)='fdinfo\x00')
fchmod(r3, 0x0)
tgkill(r1, r2, 0x24)
syz_emit_ethernet(0x2a, &(0x7f0000001680)={@broadcast, @local, @void, {@arp={0x806, @generic={0x0, 0x0, 0x6, 0x0, 0x0, @multicast, "", @multicast, "330ce4c046f12b0a"}}}}, 0x0)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = getpid()
r2 = gettid()
tkill(r2, 0x1000000000016)
ptrace(0x4206, r1)
ptrace$cont(0x18, r2, 0x0, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f00000000c0)='fd\x00')
lseek(r0, 0x0, 0x2)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f00000001c0)='uid_map\x00')
pread64(r1, &(0x7f00000000c0)=""/250, 0xfa, 0x0)
clone(0x20004500, 0x0, 0x0, 0x0, 0x0)
r0 = socket(0x2, 0x803, 0xff)
r1 = dup(r0)
getdents(r1, 0x0, 0x0)
rt_sigreturn()
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r0 = fork()
ptrace(0x10, r0)
clone(0xa912d700, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rt_sigreturn()
ptrace$cont(0x1f, r0, 0x0, 0x3ff)
r1 = gettid()
tgkill(r1, r1, 0x10)
clone(0x6006f00, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = memfd_create(&(0x7f0000001fc1)='\x00\xac=\x9d\xd2\xdb\xe6\xbf\xb4\b\xedcJ\x8e\x84\xd4N\x12\x9b\x1f\t\xbd\x11+\x86T\x16\xa3\xb3\xae0\x9f9?\xefo\xa4k\x012>\xa1\x9c\x86x\x1c\x9f\x84\x195\xde\x97_\t~\xf3Y\x12\"p^\xc1\x0f', 0x0)
write(r0, &(0x7f0000002000)='/', 0x1)
write$binfmt_elf64(r0, &(0x7f0000000080)=ANY=[@ANYRESHEX], 0x1)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x4, 0x11, r0, 0x0)
rename(&(0x7f0000fdbff8)='./file0\x00', &(0x7f0000000000)='./file1\x00')
rt_sigreturn()
r0 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r0, &(0x7f00000001c0)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x56)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
statx(0xffffffffffffff9c, &(0x7f0000000100)='\xe9\x1fq\x89Y\x1e\x923aK\x00', 0x0, 0x10, &(0x7f00000014c0))
clone(0x4180, 0x0, 0x0, 0x0, 0x0)
get_robust_list(0xffffffffffffffff, 0x0, 0x0)
rt_sigreturn()
r0 = shmget$private(0x0, 0x4000, 0x0, &(0x7f0000ffc000/0x4000)=nil)
shmat(r0, &(0x7f0000ffd000/0x3000)=nil, 0x0)
shmctl$IPC_RMID(r0, 0x0)
shmctl$IPC_RMID(r0, 0x0)
r0 = socket$inet(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000080), 0x10)
connect$inet(r0, &(0x7f0000000000), 0x10)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r1 = semget$private(0x0, 0x3, 0x0)
semop(r1, &(0x7f0000000040)=[{0x3}], 0x1)
statx(r0, &(0x7f0000000080)='./file0\x00', 0x0, 0x0, 0x0)
semctl$IPC_SET(r1, 0x0, 0x1, &(0x7f0000000240)={{0x3, 0x0, 0xee00, 0xee01}})
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
clone(0x82106100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
getsockopt$IPT_SO_GET_ENTRIES(r0, 0x0, 0x41, &(0x7f0000000380)={'filter\x00', 0x4, "12033a27"}, &(0x7f00000002c0)=0x28)
rt_sigreturn()
clone(0x4000010006dfd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
sendto$inet(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)
exit(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat$tcp_mem(0xffffffffffffff9c, &(0x7f00000000c0)='/proc/sys/net/ipv4/tcp_wmem\x00', 0x1, 0x0)
write(r1, &(0x7f00000001c0)='7', 0x1)
openat$thread_pidfd(0xffffffffffffff9c, &(0x7f0000008480), 0xa3c2, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ioctl$sock_inet6_SIOCADDRT(0xffffffffffffffff, 0x890b, 0x0)
r1 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f00000000c0)='memory.events\x00', 0x26e1, 0x0)
write$cgroup_int(r1, &(0x7f0000000100), 0x12)
perf_event_open(&(0x7f0000000040)={0x0, 0x70, 0x0, 0xff, 0x0, 0x0, 0x0, 0x248000009}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
ioctl$PERF_EVENT_IOC_PERIOD(r1, 0x4030582a, &(0x7f0000000040))
write$cgroup_type(r1, &(0x7f0000000140), 0xffffff1f)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_MCAST_MSFILTER(r0, 0x0, 0xb, 0x0, 0x0)
rt_sigtimedwait(&(0x7f0000000300), 0x0, 0xfffffffffffffffe, 0x8)
r0 = socket$inet(0x2, 0x4000000000000001, 0x0)
sendto$inet(r0, 0x0, 0x0, 0x200007fd, &(0x7f0000e68000)={0x2, 0x0, @local}, 0x10)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x3000002, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = syz_open_procfs(0x0, &(0x7f00000000c0)='net/tcp\x00')
preadv(r2, &(0x7f00000005c0)=[{&(0x7f00000000c0)=""/227, 0xe3}], 0x1, 0x97e, 0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
mmap(&(0x7f00001d9000/0x3000)=nil, 0x3000, 0x0, 0x12, r0, 0x0)
bind$inet(r0, &(0x7f0000000200)={0x2, 0x4e20, @empty}, 0x10)
setsockopt$inet_tcp_int(r0, 0x6, 0x2, &(0x7f0000000040)=0x2800, 0x4)
setsockopt$SO_ATTACH_FILTER(r0, 0x1, 0x1a, &(0x7f0000000400)={0x1, &(0x7f00000000c0)=[{0x6, 0x0, 0x0, 0x7654}]}, 0x10)
setsockopt$inet_tcp_TCP_CONGESTION(r0, 0x6, 0xd, &(0x7f0000000080)='veno\x00', 0x5)
connect$inet(r0, &(0x7f0000000000)={0x2, 0x4e20, @dev={0xac, 0x14, 0x14, 0x1c}}, 0x10)
sendmsg$inet(r0, &(0x7f00000015c0)={0x0, 0x14, &(0x7f0000001600)=[{&(0x7f0000000240)=' ', 0xffffff1f}], 0x1}, 0x0)
recvmsg(r0, &(0x7f0000000580)={0x0, 0x2, &(0x7f0000000500)=[{&(0x7f0000000740)=""/4096, 0xa15b0}], 0x1}, 0x700)
connect$inet(r0, &(0x7f0000000100)={0x2, 0x4e21, @empty}, 0x10)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f00000009c0)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmmsg$sock(r0, &(0x7f0000002140)=[{{0x0, 0x0, 0x0}}, {{&(0x7f0000001f40)=@ll={0x11, 0x0, 0x0, 0x1, 0x0, 0x6, @remote}, 0x80, 0x0}}], 0x2, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000180)={0x2, &(0x7f0000000040)=[{0x1c}, {0x6, 0x0, 0x0, 0x7fffffff}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r0 = timerfd_create(0x0, 0x0)
r1 = timerfd_create(0x0, 0x0)
r2 = fcntl$dupfd(r1, 0x0, r0)
clock_gettime(0x0, &(0x7f0000000040)={0x0, <r3=>0x0})
timerfd_settime(r2, 0x0, &(0x7f0000000000)={{0x0, r3+60000000}, {0x0, 0x3938700}}, 0x0)
read$FUSE(r2, &(0x7f0000000080)={0x2020}, 0x2020)
read$FUSE(r2, &(0x7f0000002180)={0x2020}, 0x2020)
semget$private(0x0, 0x4000, 0x0)
unshare(0x8000000)
semget$private(0x0, 0x4000, 0x0)
clone(0x4000c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt$inet_mreqsrc(r1, 0x0, 0x50, 0x0, &(0x7f00000000c0))
tkill(r0, 0x18)
rt_sigprocmask(0x0, &(0x7f0000000100)={[0xfffffffffffe]}, 0x0, 0x8)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1000006, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = gettid()
rt_sigqueueinfo(r1, 0xa, &(0x7f0000000040))
r2 = openat$tun(0xffffffffffffff9c, &(0x7f0000000480), 0x0, 0x0)
ioctl$TUNSETIFF(r2, 0x400454ca, 0x0)
ppoll(0x0, 0x0, 0x0, &(0x7f0000000540), 0x8)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f00000027c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
r2 = getpid()
clone(0x404000, 0x0, 0x0, 0x0, 0x0)
ptrace(0x10, r2)
waitid(0x0, 0x0, 0x0, 0x80000003, 0x0)
io_setup(0x2, &(0x7f0000000000)=<r0=>0x0)
r1 = openat$null(0xffffffffffffff9c, &(0x7f0000000c00), 0x0, 0x0)
io_submit(r0, 0x1, &(0x7f00000002c0)=[&(0x7f0000000300)={0x0, 0x0, 0x0, 0x7, 0x0, r1, 0x0}])
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mremap(&(0x7f0000523000/0x4000)=nil, 0x4000, 0x2000, 0x0, &(0x7f00002e0000/0x2000)=nil)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000040), 0x8001, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
write$binfmt_aout(r0, &(0x7f00000001c0)=ANY=[], 0xff2e)
ioctl$TCSETS(r0, 0x40045431, &(0x7f0000000100)={0x0, 0x0, 0x0, 0x0, 0x0, "00e4d3f26c00000173d5e822a7632200"})
r2 = syz_open_pts(r0, 0x0)
r3 = dup3(r2, r0, 0x0)
ioctl$TIOCSETD(r0, 0x5423, &(0x7f0000000200)=0x3)
ppoll(&(0x7f00000000c0)=[{r3}], 0x22, 0x0, 0x0, 0x0)
clone(0xc0006300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$packet(0x11, 0x3, 0x300)
setsockopt$sock_int(r0, 0x1, 0x10, 0x0, 0x0)
r1 = gettid()
tkill(r1, 0x25)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f0000000140)='oom_score_adj\x00')
exit(0x0)
preadv(r0, 0x0, 0x0, 0x0, 0x0)
wait4(0x0, 0x0, 0x40000000, 0x0)
sendfile(r0, r0, 0x0, 0x1fe)
r1 = gettid()
r2 = gettid()
tgkill(r1, r2, 0x24)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
getrlimit(0x0, &(0x7f00000004c0))
r0 = socket$inet(0x2, 0x6000000000000003, 0x6)
setsockopt$SO_BINDTODEVICE(r0, 0x1, 0x19, &(0x7f0000000040)='sit0\x00', 0x10)
sendto$inet(r0, 0x0, 0x0, 0x400c0c0, &(0x7f00000000c0), 0x10)
sendto$inet(r0, 0x0, 0x0, 0x0, &(0x7f0000000080)={0x2, 0x0, @local}, 0x10)
sendto$inet(r0, 0x0, 0x0, 0x0, &(0x7f0000002880)={0x2, 0x0, @local}, 0x10)
clone(0x200802047fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
gettid()
r0 = getpid()
pause()
perf_event_open(0x0, 0x0, 0x0, 0xffffffffffffffff, 0x0)
ptrace(0x10, r0)
ptrace$peeksig(0x4201, r0, 0x0, &(0x7f0000002000))
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt$inet_IP_XFRM_POLICY(r0, 0x0, 0x21, 0x0, &(0x7f0000000480))
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000380)={0x2, 0x4e22}, 0x10)
listen(r0, 0x0)
syz_emit_ethernet(0x76, &(0x7f0000000080)={@local, @multicast, @void, {@ipv4={0x800, @tcp={{0x5, 0x4, 0x0, 0x0, 0x68, 0x0, 0x0, 0x0, 0x6, 0x0, @remote, @local}, {{0x0, 0x4e22, 0x41424344, 0x41424344, 0x0, 0x6, 0x6, 0x2, 0x0, 0x0, 0x0, {[@timestamp={0x3, 0xa}, @window={0x2, 0x3}, @mptcp=@synack={0x22, 0x10}, @generic={0x0, 0x6, "b5804178"}, @timestamp={0x8, 0xa}, @mptcp=@synack={0x1e, 0x10}]}}}}}}}, 0x0)
mkdir(&(0x7f0000000280)='./file0\x00', 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000000080)='fd\x00')
r1 = memfd_create(&(0x7f0000000300), 0x0)
write(r1, &(0x7f00000001c0)="6963e64243ea486da3a74e3deec6fc5bb9650b5de56946c568f95d22467190ba406d59a5958d6f156c9c8a2ac4677b00000000000000000000200000f8bf54da33", 0x27a)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x1, 0x11, r1, 0x0)
chroot(&(0x7f00000000c0)='./file0\x00')
readlinkat(r0, &(0x7f0000000040)='./file0\x00', &(0x7f0000019280)=""/102390, 0x192d4)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000040)={0x2, &(0x7f0000000080)=[{0xac}, {0x6, 0x0, 0x0, 0x50000}]})
close(0xffffffffffffffff)
recvmmsg(0xffffffffffffffff, &(0x7f0000005680)=[{{0x0, 0x0, &(0x7f0000000700)=[{0x0}, {0x0}, {&(0x7f0000000540)=""/166, 0xa6}], 0x3}}], 0x1, 0x0, 0x0)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000140)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmmsg$inet(r0, &(0x7f0000000280)=[{{&(0x7f0000000000)={0x2, 0x0, @local}, 0x0, &(0x7f0000000180)=[{&(0x7f0000000040)="5b9611adc34b3a2d510be666e8688d8c745103f1b6f9d84bee0ca10b87b70919f4028e2c13e6cb34839f2e246502958cf3d1d75162cd0708ca21b88c4588211d702cb5c8e51c9a9e70c391eb5fad2738af67e4b70f6462984c690e8f06123bac6cb6314347cdd4a83111dab02762606f4f304ccea8f621cb6c134971fa90279f5a4788de828af6961f0247349c4bf96af3e9caf651dee68ae5cca12274b88acbbebbf11cfce2ecfecfd64c424bc78233bc5f3343945e6b0e930fba21ba27e85fc9641f245f974f"}], 0x0, &(0x7f00000001c0)=[@ip_tos_u8, @ip_retopts, @ip_ttl, @ip_tos_int, @ip_pktinfo={{0x0, 0x0, 0x8, {0x0, @empty, @local}}}, @ip_pktinfo={{0x0, 0x0, 0x8, {0x0, @remote, @local}}}, @ip_ttl]}}], 0x400000000000043, 0x0)
r0 = openat$incfs(0xffffffffffffff9c, &(0x7f0000000080)='.pending_reads\x00', 0x40040, 0x0)
close(r0)
socket$inet_udp(0x2, 0x2, 0x0)
getsockopt$SO_TIMESTAMPING(r0, 0x1, 0x19, 0x0, &(0x7f0000000040))
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mkdir(&(0x7f0000000040)='./file0\x00', 0x0)
mount(0x0, &(0x7f0000000300)='./file0\x00', &(0x7f0000000340)='configfs\x00', 0x0, 0x0)
r1 = open(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
getdents64(r1, &(0x7f0000000100)=""/241, 0x18)
getdents64(r1, &(0x7f0000000380)=""/136, 0x88)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
getpeername(0xffffffffffffffff, 0x0, 0x0)
r0 = gettid()
rt_sigqueueinfo(r0, 0xc, &(0x7f0000000040))
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
connect(r0, &(0x7f0000000000)=@in6={0xa, 0x4e22, 0x0, @remote}, 0x80)
connect(r0, &(0x7f0000000080)=@nl=@unspec, 0x80)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
sendto$inet6(r0, 0x0, 0x0, 0x0, &(0x7f0000000200)={0xa, 0x0, 0x0, @dev}, 0x1c)
exit_group(0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
sendmmsg$inet(0xffffffffffffffff, 0x0, 0x0, 0x0)
rt_sigreturn()
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
perf_event_open(&(0x7f000025c000)={0x2, 0x70, 0x15, 0x0, 0x0, 0x0, 0x0, 0x1, 0x824b0}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
perf_event_open(&(0x7f000001d000)={0x0, 0x70}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
sigaltstack(&(0x7f000018c000/0x2000)=nil, &(0x7f00000000c0))
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffffff, &(0x7f0000000100)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
r1 = getpid()
r2 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
preadv(r2, &(0x7f0000000180)=[{&(0x7f0000000140)=""/33, 0x21}], 0x1, 0xa07, 0x0)
rt_tgsigqueueinfo(r1, r1, 0x16, &(0x7f0000000000))
ptrace(0x10, r1)
ptrace$setregs(0xd, r1, 0x0, &(0x7f0000000080)="be9ff483111ec7c05a6e35766a9c5cd98ed812fee8ee677c468e2d01bb01fd560342c1891c9b259ef048c5ac173518e9cd261fa6cbe6a89b00bbcac9c7a8fc13d6d5661f30c63f72be485d2065e695187bb1482dff9c9d341184640629dc64bb37212a404898297b90eb535ba521052c06a3f59c8a96155e941ed41bc723c4062d6dc6418cd0808ff3")
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit_group(0x0)
syz_mount_image$tmpfs(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
creat(&(0x7f0000000400)='./bus\x00', 0x0)
perf_event_open(&(0x7f0000000100)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3c4b}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
r0 = open(&(0x7f0000000200)='./bus\x00', 0x10103e, 0x0)
mmap(&(0x7f0000000000/0x600000)=nil, 0x600000, 0x7ffffe, 0x4002011, r0, 0x0)
ftruncate(r0, 0xcf01)
r1 = openat$random(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
preadv(r1, &(0x7f00000005c0)=[{&(0x7f0000000040)=""/81, 0x51}, {&(0x7f00000000c0)=""/214, 0xd6}, {&(0x7f00000001c0)=""/118, 0x76}, {&(0x7f0000000240)=""/171, 0xab}, {&(0x7f0000000300)=""/248, 0xf8}, {&(0x7f0000000480)=""/109, 0x6d}, {&(0x7f0000000400)=""/27, 0x1b}, {&(0x7f0000000500)=""/109, 0xffffff6b}, {&(0x7f0000000580)=""/57, 0x3a}], 0x9, 0x0, 0x0)
r0 = socket(0x2, 0x803, 0xff)
connect$inet(r0, &(0x7f0000000040)={0x2, 0x0, @dev}, 0x10)
shutdown(r0, 0x0)
perf_event_open(&(0x7f0000000100)={0x2, 0x70, 0x26, 0x1}, 0x0, 0xfffffffbffffffff, 0xffffffffffffffff, 0x0)
socket$inet(0x2, 0x0, 0x0)
write$binfmt_elf64(0xffffffffffffffff, 0x0, 0x0)
fcntl$setstatus(0xffffffffffffffff, 0x408, 0x0)
fstat(0xffffffffffffffff, 0x0)
setsockopt$inet_IP_XFRM_POLICY(0xffffffffffffffff, 0x0, 0x11, 0x0, 0x0)
sendfile(0xffffffffffffffff, 0xffffffffffffffff, 0x0, 0x0)
pipe(&(0x7f0000000180)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
pipe(&(0x7f0000000180)={0xffffffffffffffff, <r2=>0xffffffffffffffff})
splice(r0, 0x0, r2, 0x0, 0x10001, 0x0)
dup3(r0, r1, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x180000f, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
timer_gettime(0x0, 0x0)
gettid()
gettid()
tgkill(0x0, 0x0, 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
listen(r0, 0x0)
poll(&(0x7f0000000080)=[{r0, 0x1}], 0x1, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
execve(&(0x7f0000000040)='./file0\x00', 0xffffffffffffffff, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
preadv(r0, 0x0, 0x0, 0xfffffffffffffff9, 0x0)
setrlimit(0x0, &(0x7f0000000080))
r0 = syz_open_procfs(0x0, &(0x7f0000000340)='net/udp\x00')
r1 = socket$inet_udp(0x2, 0x2, 0x0)
connect$inet(r1, &(0x7f0000000080)={0x2, 0xffff, @local}, 0x10)
r2 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2000006, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
preadv(r0, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f00000000c0)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
fcntl$lock(r1, 0x7, &(0x7f0000002000))
fcntl$lock(r1, 0x7, &(0x7f0000000080)={0x1, 0x0, 0x3, 0xf75})
fcntl$lock(r1, 0x7, &(0x7f0000000100))
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000080), 0x80000000000a01, 0x0)
ioctl$TCSETSW(r0, 0x5403, &(0x7f0000000240)={0x0, 0x0, 0x9, 0x0, 0x0, "0000000000000000e2ffffffffffffff00"})
r1 = socket$inet_tcp(0x2, 0x1, 0x0)
write$binfmt_aout(r0, &(0x7f0000000440)=ANY=[@ANYBLOB="9262ed39a8055c20074f23d7ca61ca76135a531896ecebdd803d53ea7f3de7c180d6fc1e9003bf0f5ff2d023f8c0c562816608780e76cebf49a5da263940a0ef28bb034df8b5d2c3d9aa3fd03cbcf92bef13848bf9b1ff221736cc8590b94f46990d619f861ce4bff74188790b31b2c6848700008aedd9000100000000ac2305a4e8707240063332adec0e6bba41abcdd7a869c85239ea3e3b55382b41d94ffc94b8a80805bcee383099c9af59f2b6e95729aa34b9c5a0adc6b2ce5fb535e8f18e16955ba6a8bf9abdd98f27e62c18fa", @ANYRESDEC=r1, @ANYRESDEC=r0, @ANYBLOB="8a497f5425e16514144e314879b649820d905ea7ac278db00e3be0f68ad86c7253994d780fa5ae323ad88e8118606992be0f6bac0effead8ed8103009cf8f04c364ba767a98718fee069c8ce2fb11165ceed44295ee8775b15fb000000004a1b641161bd555b59c9db72f4d402781e9591f08380b31d350e7e951877b90cc6afaa207913e1f39903805d2e877cec73d5cef21fab6dd8c802188eba3f752310cf142075347316709fccb24282b019f8bd6185a32766db06536f908875e26983f90f176d55328dbae1dbd48dad10b3435262419cdbc9bfdd30e4ad5a202c63fc689f248b00e80c91e360ead03277e56b06b00c4b8bb9", @ANYRES32=r0, @ANYRES16=r0], 0xffffff78)
ioctl$TCSETS(r0, 0x40045431, &(0x7f0000000200)={0x0, 0x0, 0x0, 0x0, 0x0, "000000000000c5b877b77fcc63a500"})
socket$nl_route(0x10, 0x3, 0x0)
mkdir(&(0x7f0000000400)='./file0\x00', 0x0)
mount(0x0, &(0x7f0000000400)='./file0\x00', &(0x7f0000000280)='tmpfs\x00', 0x0, 0x0)
chroot(&(0x7f0000000000)='./file0/../file0\x00')
r2 = memfd_create(&(0x7f0000000080)='\xf3e\t\xa9\xff\vty\x01senux\x00', 0x1)
r3 = socket$inet6_tcp(0xa, 0x1, 0x0)
dup2(r3, r3)
pwrite64(r2, &(0x7f000003bfff)='/', 0x1, 0x0)
mmap(&(0x7f0000001000/0x1000)=nil, 0x1000, 0x4, 0x11, r2, 0x0)
lseek(r2, 0x0, 0x2)
sendfile(r2, r2, &(0x7f00000001c0), 0x7fa)
chdir(&(0x7f0000000140)='./file0\x00')
symlink(&(0x7f0000001000)='./file0\x00', &(0x7f00000000c0)='./file0\x00')
open(&(0x7f00000002c0)='./file0/../file0/file0\x00', 0x511681, 0x8)
r0 = openat$dir(0xffffffffffffff9c, &(0x7f0000000000)='.\x00', 0x0, 0x0)
setresuid(0xee01, 0x0, 0x0)
mkdirat(r0, &(0x7f0000000040)='./file0\x00', 0x0)
r1 = getuid()
setresuid(0xffffffffffffffff, r1, 0x0)
chdir(&(0x7f0000000080)='./file0\x00')
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0, 0x12, r0, 0x0)
r1 = fork()
tkill(r1, 0x16)
wait4(0x0, 0x0, 0x8, &(0x7f0000000000))
tgkill(r1, r1, 0x12)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x1, 0x0, 0x0)
r2 = syz_open_procfs(0x0, &(0x7f0000000300)='oom_score_adj\x00')
ftruncate(r2, 0x0)
rt_sigqueueinfo(r0, 0x39, &(0x7f0000000000))
syz_open_procfs$namespace(0x0, &(0x7f0000002040)='ns/pid_for_children\x00')
arch_prctl$ARCH_GET_FS(0x1003, &(0x7f0000000000))
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
getsockopt$IPT_SO_GET_INFO(r0, 0x0, 0x40, 0x0, &(0x7f0000000080))
r0 = socket$inet_udp(0x2, 0x2, 0x0)
connect$inet(r0, &(0x7f0000000040)={0x2, 0x0, @remote}, 0x10)
r1 = socket(0x400000000000010, 0x802, 0x0)
write(r1, &(0x7f00000000c0)="24000000200099f0003be90000ed190e020008160000100000ba10", 0x1b)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setsockopt$inet_mreqn(r0, 0x0, 0x20, &(0x7f0000000240)={@broadcast, @loopback}, 0xc)
r0 = syz_open_procfs(0x0, &(0x7f0000000000)='task\x00')
fstat(r0, &(0x7f0000000d40))
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
getsockopt$inet6_mreq(r0, 0x29, 0x1a, &(0x7f0000005f40)={@private0}, &(0x7f0000005f80)=0x14)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
unlinkat(r0, &(0x7f0000000040)='./file0\x00', 0x0)
r0 = openat$tcp_mem(0xffffffffffffff9c, &(0x7f0000000200)='/proc/sys/net/ipv4/tcp_wmem\x00', 0x1, 0x0)
write(r0, &(0x7f0000000240)='5', 0x1)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
getsockopt$EBT_SO_GET_INFO(r0, 0x0, 0x50, 0x0, &(0x7f0000000000)=0x1e)
rt_sigreturn()
r0 = socket$unix(0x1, 0x2, 0x0)
bind$unix(r0, &(0x7f0000000080)=@file={0x1, './file0\x00'}, 0x6e)
sendto$unix(r0, &(0x7f0000000180)='.', 0x1, 0x0, &(0x7f0000000000)=@file={0x1, './file0\x00'}, 0x6e)
recvfrom$unix(r0, &(0x7f0000000140)=""/34, 0x22, 0x0, &(0x7f00000012c0)=@file={0x0, './file0\x00'}, 0x6e)
utimensat(0xffffffffffffffff, 0x0, &(0x7f00000005c0)={{}, {0x0, 0xfffffffffffffffe}}, 0x0)
r0 = memfd_create(&(0x7f00000002c0)='#\'%nod%v\x00\x7f\xe5\xd0ql\x86\xc9\xe6\x14\x93\xb0\x7f_,y<~\xab\x84\x00\x00\x00\x00\x00\x00\x14}\n\x81\xc7\x85|oC\xca\v\xe3\xba]fn\r\xdf!\x94\x0f\xaf\xb7\x93\xe8\xb6\xc3N\x16&\xab\xf9{\xaf;\xcf\x8c\xa8\xb9\x06\xaf\xd0\xfb:\x90LNF\x13\x9f\xc2\xb7/1\xb9V\xf0*\xcb\xdc\x05n<\xcfi\x02=1\xda\"\xb3\xfe\xf3\x97\xd9\xa5b\xd4\x00Q$\xb2v\\\xa9\xcf*tw\x8a\n_)\x89A\x8f`R\x12zM\a\xc43\xd0d\xee\x13Q', 0x0)
write(r0, &(0x7f0000002000)='/', 0x1)
sendfile(r0, r0, &(0x7f0000000200), 0x87)
sendfile(r0, r0, &(0x7f00000001c0), 0xfec)
mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x4, 0x11, r0, 0x0)
unlink(&(0x7f0000000080)='./bus\x00')
getrlimit(0x0, &(0x7f0000000300))
r0 = openat$fuse(0xffffffffffffff9c, &(0x7f0000002080), 0x42, 0x0)
fcntl$lock(r0, 0x5, &(0x7f0000000080))
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
bind$inet6(r0, &(0x7f0000000040)={0xa, 0x4e22}, 0x36)
listen(r0, 0x0)
syz_emit_ethernet(0x4a, &(0x7f00000001c0)={@local, @local, @void, {@ipv6={0x86dd, @tcp={0x0, 0x6, '~\x00 ', 0x14, 0x6, 0x0, @remote, @local, {[], {{0x0, 0x4e22, 0x41424344, 0x41424344, 0x0, 0x0, 0x5, 0x10}}}}}}}, 0x0)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
ioctl$TCSETS(r0, 0x40045431, &(0x7f00003b9fdc)={0x0, 0x0, 0x0, 0x0, 0x0, "000000000000000000000000eb5e375f00"})
r1 = syz_open_pts(r0, 0x0)
dup3(r1, r0, 0x0)
sched_setscheduler(0x0, 0x5, &(0x7f0000000180))
socketpair$nbd(0x1, 0x1, 0x0, 0x0)
ioctl$TIOCGPGRP(r0, 0x540f, 0x0)
r0 = socket$inet(0x2, 0x3, 0x6)
r1 = socket$inet_tcp(0x2, 0x1, 0x0)
connect$inet(r1, &(0x7f0000000000)={0x2, 0x0, @local}, 0x10)
dup3(r1, r0, 0x0)
arch_prctl$ARCH_GET_FS(0x1003, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
epoll_ctl$EPOLL_CTL_ADD(0xffffffffffffffff, 0x1, 0xffffffffffffffff, 0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
fcntl$dupfd(r0, 0x0, 0xffffffffffffffff)
rt_sigreturn()
r0 = signalfd(0xffffffffffffffff, &(0x7f0000000140)={[0xffffffffdc45af1a]}, 0x8)
openat(r0, &(0x7f0000000180)='./file0\x00', 0x0, 0x80)
r1 = socket$netlink(0x10, 0x3, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
writev(r1, &(0x7f0000000200)=[{&(0x7f0000000100)="390000001300034700bb65e1c3e4ffff060000000100000045000000250000001900040004", 0x25}, {&(0x7f0000000040)="899f0ea1156626d6fb07e613d79d0b6d58348a37", 0x14}], 0x2)
r3 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_mreqn(r3, 0x0, 0x27, &(0x7f0000000000)={@multicast2, @loopback}, 0xc)
ioctl$BTRFS_IOC_SNAP_CREATE(r2, 0x50009401, &(0x7f00000003c0)={{r3}, "7f5412d12e41a7fa9b87636dfa8e3effa48030ac4125fda2c82d9e3125882649ef34cbb9f2398d3874c03453060a75cc3be39c139ae7a5b406491e5b105d83a0fe87f3e041a02d800e72ea0e7bff4eb92c3d19721a6942e9666a3b85703be0852fe5e86cd44b0e0b8e99517c49cdf2832b55bd7ce3fb35381690de07fcdc3f2052456a5f916afc8d6950935ea5f1424bf4e2fb8603688b5d63f6d947fc4131764d995c5bff33abb3df87586ee4a48566cfebfb3dac3ff1682910033f51c3dc8e50dcad55a640b1d7bd8fbe0cb0e8dae73aa6bbc338caad5fab414cc06d05bdcd814979eab4a05d98fc0bf1fabef0bc822d73990c636464f11b0227a412d80cf090c0673684e972f1718af47e4b65782ce44405bb1e652289b4e7c913a8b8e4d547cab3665a9fde91b67a3a552e54927af8b4def7e6e4d0122c0cd0c24baa7371afa63e872e3df2b904cf64d018f871be27f187c99eeab15a90e83eadd2b338d80701f1dd66d8da2ef24ccb6d4849c55453b6b14ecbdfdbdf231ea6c634dfb2c6acc84cc89ff47cb101a6071d2a35ba237d445202da6c193752f22f37a9e7a9ce6529acf1013703fbd11c268df453199d1ed96207885ef14cb3d9c3b28527f0dd494fd5e8f19f7b11f762055909deeba09d422f0e79a54714b8637e3fb217411794c35cc52b48f36f00eb3c055669a21ee56e616f5a0f05f4bd67dd54772acc78cdde13e4deb8a929bd36ffcbb471d195363a484a2a65422d3e083a5c764574d7f25005f86797e1c250bf04f99cabfe93fcd4d2ae50b7a8f926b2ecff79b618a70b0d0f2d248bbdbcff96dcb5f5c96e3c2e24460df6c4a069465daa4d0fe83df1133ac2b60a46e54de2f356393b10b3dc5937ad3302647aa25c907df1f56f93b4891f70a487581850a05871add0a9aceea948dce6af44a882fb1f02177aaed6b8aa702593a308ed28a93c00aa37f7a1d50c7f9bc65b43206a650ca296c8d5d9498207453752976980e605560c8078ab1c3306ccb9e327d892a1d56a023548fe3de607a2c1797632b0654cc12330f56335f9904a602eb6a55765d932384787dfefb841ca61a306c2e57c15d6c1f8fc919c63c81bc024cf20d23ac076518e90ff2272e390ef8adb3cffbdfc15ea45512daf7dae133dc215f78d049d7f71ff59787fdc75be864fad4fa50517acaaf03539451aa5197391897cd9be273aee8d02bafffd965bcaf768b612f33adcf5b28815213618f5e11b2dd09934ab07fff11fe6faa60d9e07d3e1f6301d97e6ac4688e98b6967175688586b445c701abd57441b5b95e8d00a24a2fdad5160f01a10ec6cfeaa3239c47d55637d11e8ead35e6fd0a8d6fe5c019f357212b6317aeae08d292c00345510162e768d9e8dcb8aec323f57e580cf6bae5fd9da38a8f39538ef6c7b82380a7d2d93ad84a394836b0bd9c4c85ea6814c9732b276738b66268c6124fec91a4852c0955376be3fc85ce9e8dadaeb39b00c5fa0e01835401a7f2f79319c65a1f20a94d1f0f5623b3bc7028668c273da733f85b684f77840a3d6daa1785eee3ee5199654a20c0de26b23d5bcad06232e89f72b3bb3390fbef2ef5f5f98f03946e0852f3996f51b2d97523b265f08713c105d79fdef205838a5a3aaf0b3e2ce79bbeebf7cdb1ce209965f625e30f59878943ce246146bdb38250f86c6804a7fdbf5c6cec50b430afcaeff034edd7f396de8440b63a24671682125e3114b29e084315f6519d8cea334226f597442b24c70d9dd692e7e16e5103a9c349820e9a0c1667eb056e1d4fa3bc06d440ea817eca8cbad851abc57c1a33fba62acafd048fa6ddd8fb79c188315222efb65aaa7050f519bea2a153a7b144c48d494050ee1b8f39434383281eea0424fc82eb617a407636c80c7825541245f6cbd65bb6252bd3aa2f62b0de9d0ab79eb5dba91309fb8a0881f58391f22e3a46320665811cf3eefaec74c0eedfa860f80805f810bd4ca356a606c3853f13018dca469d6ddb68579d373708af3bed8aaa4fc237090578d000a9c0e21e0338eb486959ca34b9ab2eb26c55e1fd3921526b31240c57ee0daa5ec2e85d8ff7ee7f16087196f14972e4e0371dbfcd7f278671345ca0bf2377e8b05405b4af5658ae068f843ffac1963c76235d473da86ffcd92e04ff3b5d171cb8b21ad033235ea55d05ab00d5249266c4902e998cd3fab779d2a335586cea5fb0c1301ca5349fe0002bc2592aadb5a2068c960b7b3c7479f387b13eceb8eb024a1f30d126b2ae1357b3f34a9394faf8490002d4c25eac58f6a46812e1a08d409d2d4a9a2817436d2054c82b85a4ae80a601d73cef85eeb408d4cfb40444a804b4f4ea764432e14063ff9b1fce6adc7d4ca3c50619b8e2df81b620d2b49b82ac4ff89f767033362af844675815ece3da6965f3e1b2fc29fb2894342a2c77924aa176278982f91840586​12ab31ffc5c15fd526c57af32d16392d87e721e150a88cc70061fc5510981b1204f8acda6cf26c4a811b782cfffcea43c728399824aea0a9a84a3f4e12ac0f5459619a7290c7379a512df10e2fe3ea591f383046bf316a78aca52f093560d15aae765d350a00046125ebd06e876b6d5a76ffd7df1bb537899648779cc302e4d3d09bfd8cacf52fd4f7d4e20da6ea4f798ae84b96e11b5c0aabbc840dc92980f217f652761475b1998970c126979665ce6bf629bd6a9c83f58b32ad4a0dab341421520922fd3e147f87812952e37164ce6db0c9a1ee10335491bea6a2bfee9ab5801e6f7d06f3ccbb2a96e137aead4d0c3ed8a521166fc5c6ee995d3e126703d3ef9c54720668d37163706693b5a10485e693b606c9fb2cbf425e186c2346f2de965dc4d766ca7bff388b54ba2eba3d1835748904676cfd0859a2fcabbbb3837fdf6b73d44b10d46508c71dd164395f21951ee07b4c36b4661f292c70f5204371728e4cc8750e35dc53a97c22829d741724ba6994b3960545bd6fe136f867d4669d76abf143e3176d5d0922fe9b9094c523c351cefd78805082fb0717f7fcbba92a973a94bc4e655ff44489c3b940c6aa256c7c211b1081c49b53c30a24979922f3fda105178a044d5642e67592ace3af280e3d97d321df078c9aec543319eca40216ea195072a13382dee390684f59e137af38775110c5a064cd18556e80b2998ef733f00921d0e339bd91bc2a52f44fce30e19ff251b72219c8b494a8bce0fe37e27a63cd9ec56810a3f9a79a1b6155ebc2a67d022bca05f7a08f881ddd3f848db7d357410d025c4775d1bf4b6e3cc6c28232edf8f62836dda0f7af5569c072ccc61ea53cac0cda2b9a1c5a862f4ecb5994709d7101d6363e4fff8cba9daf7810d195d5861fe18660359ce032e2ab9b8ed60fd0a0e38dbfb89c5835ababe5ed26be36e1963513d27d730936fae155dae15bddfac627c61204244ebf248b9b752083a287212071a7af6bc24a7592f11021cbdcbdba4f139a95cddfaa777680b44b259bea8c02f6d7dc6ea7923e8393768c4eadcb34ac7d4df007ecacbae84170d789264ced03aa4e1125f3afecd44f1d90b8b49d8c8c165b16e7089e675e6716a2d852dfdd07d03fa3431a97050c8a4daf0084687401f35726b646b99534d1e2735a40ad70cee6f59bd8d83d356aea7e5c33d8e9c707d3f1f5a5b4e4264e412b6316f43856cb678ba74334ad515aefa60583cde7a0686c4e39430640a0780de5281e6bd667ec010ec1920a2d47f7b6e5ed99d5fc4184283b295ce15793b5618a0602ab5be175489587d56ebc6f0acc7a0c073983184a9c33fbaae3ff573e53c8692e235308fb041abb8146d05c99a2bf308917551e26652f1791a1bab24af88f87ccbd1fa0d1bc71a4b8cc7a3b0b3ba4779cf499620d5278d2e3869f8efa2cbe0ec7e36a1a9a0c719179cba6ed819267dd53aa425c6fc745a2d42f174686f6421b31bdcc2791da6c9dfc5c9a4a4b0e9024df6f2600bc0f5dda8f974eb1f1b18d649a19d4db153575350a47ae08ee150892abd292be356f4bc2b42e46bee80479758f23e10b97ea0daf8661de371a9e105c44b5e68a0eed81535bd7dc693159268980e79b8666d17cf233eba05b02cef8a7cf9b6e0232f1f8ae501b270a877c449cc0f1170a44e070b0fb1b2dc5d07c230c39efa35109ab26367f85c4262f7c99402aad752f2525c8cd18046a2fa50aac5cd9cc9454df314269ba0e006e0c12b291c55e9e75ffadc665b371d49d5de58766979d5d022e91740dfcee772520f96262ae7e8bce76dfd0ecdc1a8080e4d50e19ecc1a991e1ac1195df39f03188f6b3ea80dbcc62ce1777b1641fa5a93897170ef7ef771c3b2c703a7ca6d310cbff59b6c0c9fc28f793ebcc8122d6d145683edb11654e815862257c22b70197495fec92b8f8c09568848070b5631b1a0d0594312a2db06e22f201f4b725c93d9f3140e8af2392ee5b4d91c46ef3dce8f37e09270de85b9d79bdf83338ec80d52de44153e87d0b3ce9278dd1868e5192f405d87774aab81ede36d61565e0427b07fcdf3a0071466fb99553696a2cd61f15d6821d0e124dde9b735e0cd358a5e33c8a33cbc7b0348156eef3d574e856f824f1ff4b66c2a0229e3ad623470b058affe8ec8911f27ed24bc3194a7cb95e82f29a05699b3197d56ba9db7ad0db71bfd4ac3e3d3d3fcaa495fb6893dbaf9589d448273d02d5f9f2c7fe2e08531fd2efa1c71ee4cb179049d9acd02807f7e1de4b6f77f6ab592c097b6f65cf5220b0d0fe037605bf7b80e88dc9f49f47c8a4822ab13d403dfa552f5cae2755a0cca9a1105020c6eef65441488f90fe50526f154b30562057d5244efdb827205bbda4ce9a81e3a341e1c370d2eef3027ae7f46fea3576ed267a3b40ac34e941ffcd4d39966d7b689b56b54f09ebe55c7743175d19175824efe7fc545d3ae8ae41469508783c5f0304a998b4af176691f773b4a7b13532026636b266734f9d44f09cf2dd74698ebf3c7ecce5030eebd23f74d0d50622f42579b444877e71fa98d040df8dfe8cdb2f161060bcfb3376f37f3a0b17c981081809d294aad6d86522a1dc1da29b86d9ec9ca14bf9a08e63e27928bc2fd5292db681281abeda4ebacd177e1d02d8aed125743628a2e94b5c42e9d3a194a60d1c62c7680d61959f5ed4d837f92d79dd9cbd1c095c2446d3608b744676e139285e18b565128f6db8c90ca524623b14a5e616ea54fd1be2e885a943328eea36d85cf5098f631cd3f86b16ba7c7f3ecfd7c82fe6e60fa4c42ca59fa51ae85b3f2a9fa5954bc5d7410be09d814181b78e5aff074781d557a1eeff57a11e2f898782636deb5f461171a74b9f7d7749e18cf2891a7f71739b8d002024d3422ee7e24de1036af59caa1bd92350b281efd8d1e55cf597c024fd1cd1db65c4a75ff56481aae94264eefc2f593f4a4b251d47a2fc4cd0c83d53ea98c9add90af7d542939bd240af3cc3e93477b1abed7eb2bb17c1b33ea800fb6582f49836877500109cdd176d570b0c934cc4c62c1e5b927541e54a59a2910da35cfd3239846ec1ce75a11a9e9315fc242275dcb3ab463518a9580c293045aeeb3b3ee1d1048703aeaa8b9c926f165d788b48b8d7c9f3038dedaddd1c718978f1e5c898b3de5fffa9ef0c1805293b156c53ec81456c4fbcf9ab60f08d9bea8c0e66d10930eff156cde7e47a57dd6cd0d310c90f5b1446edc90a6a25a44fafd0190065aac58672e7331703e0af94bbb21fd0774220f1207cc4555db57c591f44a69601369b94cade"})
r4 = socket$netlink(0x10, 0x3, 0x0)
r5 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r5, 0x0)
preadv(r5, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
writev(r4, &(0x7f0000000080)=[{&(0x7f00000000c0)="39000000130003470fbb65e1c3e4ffff06006000010000005600000025000000190004000400000007fd17e5ff8e0606040020000000000000", 0x39}], 0x1)
clone(0x2000c500, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
getsockopt$sock_int(r0, 0x1, 0x7, 0x0, &(0x7f0000000380))
rt_sigreturn()
clone(0x38004100, 0x0, 0x0, 0x0, 0x0)
sync_file_range(0xffffffffffffffff, 0x0, 0x0, 0x0)
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x13, 0xd9f, 0x0)
syz_mount_image$tmpfs(0x0, &(0x7f0000000340)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
clone(0x3000000a0160101, 0x0, 0x0, 0x0, 0x0)
mount$overlay(0x0, &(0x7f00000000c0)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', &(0x7f0000000000), 0x0, &(0x7f00000003c0)={[{@index_off}, {@upperdir={'upperdir', 0x3d, '\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/'}}]})
signalfd(0xffffffffffffffff, 0x0, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
r1 = getpid()
rt_tgsigqueueinfo(r1, r0, 0x15, &(0x7f00000000c0))
ptrace(0x10, r1)
openat$cgroup_ro(0xffffffffffffffff, &(0x7f0000000080)='cgroup.controllers\x00', 0x275a, 0x0)
perf_event_open(&(0x7f0000000180)={0x2, 0x80, 0x69, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
ptrace$setregs(0xd, r0, 0x0, &(0x7f0000000000))
ptrace$getregset(0x4204, r1, 0x200, &(0x7f0000000400)={&(0x7f0000001800)=""/4096, 0x1000})
clone(0x4000000206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
dup3(0xffffffffffffffff, 0xffffffffffffffff, 0x0)
exit_group(0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
chdir(0x0)
rename(0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat$null(0xffffffffffffff9c, &(0x7f0000000240), 0x0, 0x0)
flock(r0, 0x8)
clone(0x11000300, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
clone(0x54041bc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
rt_tgsigqueueinfo(r0, r0, 0x16, &(0x7f00000000c0))
ptrace(0x4206, r0)
ptrace$cont(0x7, r0, 0x0, 0x0)
r1 = gettid()
r2 = getpid()
rt_tgsigqueueinfo(r2, r2, 0x16, &(0x7f0000000000))
ptrace$setopts(0x4200, r1, 0x0, 0x100046)
exit(0x0)
r0 = semget$private(0x0, 0x4, 0x0)
semop(r0, &(0x7f0000000040)=[{0x0, 0xdb63}], 0x1)
semctl$GETNCNT(r0, 0x0, 0xe, 0x0)
mkdirat(0xffffffffffffff9c, &(0x7f0000000240)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
mkdirat(r0, &(0x7f0000000080)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x0)
perf_event_open(&(0x7f0000000000)={0x1000000002, 0x70, 0x800000000000013, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
mkdir(&(0x7f0000000100)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x0)
r1 = open(&(0x7f0000000000)='.\x00', 0x0, 0x0)
renameat2(r1, &(0x7f00000002c0)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', r0, &(0x7f0000000380)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
mount(&(0x7f0000000180)=ANY=[], &(0x7f0000000080)='./file0\x00', &(0x7f0000000000)='proc\x00', 0x0, 0x0)
unlink(&(0x7f0000000100)='./file0/../file0\x00')
rt_sigreturn()
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = creat(&(0x7f0000002680)='./file0\x00', 0x108)
write$binfmt_elf64(r0, &(0x7f0000000440)=ANY=[@ANYBLOB="7f454c4602010100000000800000000803003e0000000000200000e1ff00000040000000000000000000000100000000040000200000380001000000000000e2030000000000000000000000000000000040ffffffffec00000000000000128958"], 0x78)
execveat(0xffffffffffffff9c, &(0x7f0000000280)='./file0\x00', 0x0, 0x0, 0x0)
r1 = getpid()
tkill(r1, 0x40)
r0 = socket$unix(0x1, 0x5, 0x0)
sendmmsg$unix(r0, &(0x7f0000004840)=[{0x0, 0x0, 0x0, 0x0, &(0x7f00000023c0)=ANY=[@ANYBLOB="1c00000000000000010000001d"], 0x38}], 0x1, 0x0)
setreuid(0x0, 0xee01)
r0 = inotify_init1(0x0)
fcntl$setown(r0, 0x8, 0xffffffffffffffff)
fcntl$getownex(r0, 0x10, &(0x7f00000000c0)={0x0, <r1=>0x0})
tkill(r1, 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
bind$inet(r0, &(0x7f0000000100)={0x2, 0x0, @local}, 0x10)
setsockopt$sock_int(r0, 0x1, 0x6, &(0x7f0000000140)=0x32, 0x4)
connect$inet(r0, &(0x7f0000000280)={0x2, 0x0, @broadcast}, 0x10)
sendmmsg(r0, &(0x7f00000038c0), 0x4000000000000a8, 0xa700)
sendmmsg$sock(r0, &(0x7f0000000700)=[{{0x0, 0x0, 0x0}}, {{0x0, 0x0, 0x0, 0x0, &(0x7f0000000680)=[@timestamping={{0x14, 0x1, 0x25, 0x5}}], 0x18}}], 0x2, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x3000002, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = eventfd2(0x400, 0x80001)
r2 = dup(r1)
openat(0xffffffffffffff9c, 0x0, 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0, 0x10, 0xffffffffffffffff, 0x0)
read$FUSE(r2, &(0x7f0000000380)={0x2020}, 0x2020)
seccomp$SECCOMP_SET_MODE_FILTER(0x1, 0x0, &(0x7f0000000080)={0x3, &(0x7f0000000040)=[{}, {}, {0x2}]})
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff})
ioctl$sock_ifreq(r0, 0x8910, &(0x7f0000000040)={'bond_slave_1\x00', @ifru_map})
r0 = memfd_create(&(0x7f0000000180)='K\xde\xeb\xc2}]b\x95m|^\xee0z\x85\xb5\xb5S\xf4P\x1bG\x8c$\xee#9\xda\xf7\x9e-\xb7[\x96h\xdb\xd0+\x9f\r\x1c\xfd\x958w\x1e\xcf\xf7\xf4\x1b\tQ\x9f\x82\xd7\xc7Rd\xab\xc7\v\xe6\xc7\x87^\xdf\xdd\xc7s\xaf\x9f\xc6\x9f\x06,f\x9f\xba\x1c\x90\xd2w\xb6\xaand\x85N\xaf\xd0!\xcd\xce4R\xad\xd1\xaa>!\xea\x00\x00\xc3\x9e\xef\a\xff\x00\x00\x00\x00\x00\x00\xa1\xb3\xfa\x81\xb5\x00\x00\x00\x00#\\\x94\x91\x04\xaf7\x9b\xaf\xec\x9d\xa9\f\xa5\x16\x12&\b-\x93`\xfe\xde3\x94\x99\xc9\xcb\x99\xa6\xef\xaa_\xec\xe1+\xcd\x00\x1d\xd2:q\xd6\xdd\x82\xc9\xc1\x8b{\xf5\xa8qBNlde8\xec\x00\xcf\x06o\xa6\xd3kv\xa7i*\x87\xb8W\xd3\xa4', 0x0)
write(r0, &(0x7f00000000c0)="6963e64243ea486da3a74e3deec6fc5bb9650b5de56942c568f95d22467190ba406d59a5958d6f156c9c8a2aaeb53451af0ac47e0000000000200000f8bf54da33", 0x3f0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x1, 0x11, r0, 0x0)
r1 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
setsockopt$sock_timeval(r1, 0x1, 0x15, &(0x7f0000000140), 0x10)
get_mempolicy(&(0x7f0000000000), &(0x7f0000000040), 0x1000, &(0x7f0000ffd000/0x3000)=nil, 0x4)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r0, &(0x7f0000000040)=@file={0x1, './file0\x00'}, 0x6e)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000180)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
connect$unix(r1, &(0x7f00000001c0)=@file={0x1, './file0\x00'}, 0x6e)
exit_group(0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000600)={0x2, &(0x7f00000005c0)=[{0x34}, {0x6}]})
r0 = socket$inet6(0xa, 0x2, 0x0)
connect$inet6(r0, &(0x7f0000000000)={0xa, 0x0, 0x0, @loopback}, 0x1c)
r1 = openat(0xffffffffffffffff, &(0x7f00000002c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = syz_open_procfs(0x0, &(0x7f0000000040)='net/unix\x00')
sendfile(r0, r2, 0x0, 0xedc0)
clone(0x100041be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
statx(r0, &(0x7f0000000080)='./file0\x00', 0x0, 0x0, 0x0)
exit(0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = semget$private(0x0, 0x1, 0x0)
semctl$SETVAL(r0, 0x0, 0x10, 0xfffffffffffffffe)
exit(0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
rt_tgsigqueueinfo(r0, r0, 0x16, &(0x7f0000000000))
ptrace(0x10, r0)
ptrace$getregset(0x4202, r0, 0x0, 0x0)
pipe(&(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
read(r1, &(0x7f0000000040)=""/28, 0x1c)
write(r0, 0x0, 0x0)
syz_emit_ethernet(0x107, &(0x7f0000000480)={@link_local, @remote, @void, {@ipv4={0x800, @udp={{0x5, 0x4, 0x0, 0x0, 0xb0, 0x0, 0x0, 0x0, 0x11, 0x0, @private, @broadcast=0xe0000001}, {0x0, 0x0, 0x9c, 0x0, @opaque="8174b9193f286b3694a6d2dab8860c3187b14e476b3fc791dd683f23da29134646368eb023362bd91f67d60b479de11a32b6392be503d04524a59c543780b27e3f61fb3cd61140da8463194ec24c3903380b119e042667f642c06223285f65a0e288897bf9f0e6389673085b6246d182863eef4a937d7d29c17cff6d3fb06aad9d285571165328714482ddc31e0f66d51754a31c54bc94b5b6bb19da9f35fbf0fd81ba6efb7147085ea52d8371ba870bc1a8276d263a4d7c14c06d456762ba623709a184dacfc4b2f15ffc17688b280f384080d8cc34e6440c5227b731a076c03d97867995b74282c939b437eff2f75c11119b1a0e5d2aaaa5698d47563c0108b26faa15a57b9b36dfaf4718168a37efcdee6253f6e6669abae6d6d894f731166fb676e71c0a7b7143c1c97f4b67fbd8dafa840b4ec81bd7ddd5ff2a23804b285987750ed6ef22e1538e386438b7a8f7e8a5f40c0d6306db9dce627c7ccf15ffe6a402576ca1dc009be797e9f49d8bf579ec1c47c9ad53361db3669b96046a3317833109a2ba1bdc1f61e054a3e66890f3076a4c097b2fe0aeab47d048e643b699a4f8d34e17172bc52eb74a5f1966243361be032b676e0d1e80678d90731d988c7293199f2d1784427fdb89051d3150439fc4e62948b692fab64b24f46ac5a87ecfea676cd281c7a3ee97a92bdcfa31d3d95e0513d55e49556b1b2f6ad837ee56f62d0dfb9946af4f25c6379d2a38ffb66260af6d32672fe245ff08bdef329b41ef2b9a59df5b1d4a989954330b704bbd45d08daf36bcb3886359dc9bcf43f3d033ef5300a74d642fee3814087abcd89cfbe6da7b9c7f9aeb4bc90f35c853296ba2be3434789a9a8170a91b0288019db035ec6d973432b969db0a721208e4d12b079c3f7ef586739ef4bead38781e9ce950ae170c73ff4081bbb617d45564a8f2ef44405d4a55b443c7fa091d81e97bc1aa43fcf9b60d5aec2752dfa9c5ba49adbcdcb8f4129f058126dbc9d362f5a4d67cbd26961c56d80142ec1a3a6fcbce799811da3951b9fb14d87969335a3fdcfd21ec3e2400883782bcc836de26ccf7b28e4759765addbdef82ee8ce9f2e4f8967d63534748165a6fbc6b2d99588d6f0858e63a7f32e4598fde5c6793e5fe6bdcc1c7f7cf4023676f714fcbf5a9ccaee244be6544f1b885ed3d9b48cf922c54056c4efeb02b51f3cebc358e5009836f4746108b5616d25e493363df996cb72118cef6b51a9a13548157483d13339e7054271c097812a5767b335a55d90ee0beccb012eb0799cd64489d1fe12e84a077245f5f84072e5452ceba20eecf7c4c4d4de7fb51a46325f383a64376d93380cf20c402af9dd6b6a9418411477d34bb3335c903a6654748bdb8e7c271ce3e950c27088560e1b2f9763ae1969001a300bfd5e3e41b55693168a75ca8c14f7710d880f5f53a563b2b4d6639072850454bff58719173c0e568fb3d56107fff078a716b2a9702805fcc59da4b45704bcaa9d00fcc3f0f8336c4de0fda0251d2588e7d3bc62934ca5007c756071b7ba7f0da9f874149d13aa33b0ef1086c05200680824a32794998f6daa01995f4d3b9880839d1a7f67dbdbd45b5a4493b21cecc08f8114369509475b75eec1aac7b49837e5cb524b4cb3f34c52dd9d40ff5241c02b1f2bc03139255fefc5280b2275cea5c3c720b3ada2186c9409e97c955b722a6b8f00c1b30ca3efea6642cc146ffdbd030040d329ae3cf9020844bf6244f6106e63bd109829cecbae7b17695fbeaecdb7f5dd61f92d3e0f27a32f0f96be5654bf2ae19244265ab1e7430a166c54339f2d283c3f2eff163951874b819b1ee0a8ef9bb3529868560ebe9221820e3e1f5ac61ed28512e1ed737cd9bc7657fe01699226921a778603d32d9b6894436ab11348b9f27054d3452888a831f3efaa274e78d2ece9ac02c18602a91e09556f804fe1700ad85c191ae514a5c84c38d2c47f81192b7e276afa216d59301b1ce06cd37e4bad93b3890b8dd27d03ce6a225153261a0748af9bdf8bab36b741dde885170071e40bab43ee348cdb9880f0231df59a383829790f0f3e41373027763c26e44bf87e6c135cadd937ffead6ad4799d4c25572016abad88460b53dfa1cf323020d3974ebd101c69510384d058c37f8a64ffdebc5b26c02833c03552dfc8e832b398d9aa6ac8a68a9fa11b741840f9c308139786728fc5a80dd79c86ff97e6aa3dc4c2bfdb8fcdce1da3d1e58aee227702d369738832e0174d72af32576623e0d56e064e3cfce292a5d9365f4b93aebceab58194625b89609fefc48b488e3fee7340fec0425ba695149e80a160955e8275694198b006205e082c95a2f40e58d51808ded1c665ac738026b096528327954a72aae30d0588987f332eebb88d9200b5b9a2ea46c59cf906abbbd77a3ee408d3ba39988953aa18846162ad5cc98b39182a4b79ff544e36a9895cae23452fca675502f03656aa823c7f9aaa28b358f8fa01490cec55df73b620e062d5c6eb34e2128f9a624f651b66a38336f8b4aedbc7d70b19977cad8968d258fea896eb8eeba3a5f5da3406a53103cedcba171bc9f30d20ca20565f56c8912292aa4ff8ed917996cb42777c88b5cf1a67e075dbfdf660aa89055c13d3927f63a99d3dcc6ed283882b91a2a70107d57b9c258ecca08f6d79738353be1f7814a7de3c3698cee72f752846c7bb917f36b7a185418a0ae4639292c6cfdd582ca3f3fa22d424f1f88129854a27a2de4d37aff4cdfd955c20e999af304157112203526ce55b01798de3883daa72ddfd23988b512e7243e31fd468bf591b02fc62e1e5253a8810844baae22defaff6650b3c8a0e7d291367ae212e3e28c70f88aa0cbc804d4ec88a5a9b76d305aa4e99262194fe405d175b3c00524a4fa033aee81f2c92392993077f4a59a2aa8770929ddbd4cf616a05304d7289662aa512992430b038b134b567ee4b2e3775cddfa5611b70c461dc7f18c028c853dc5ee1cd8b39a0732a5fff3ffdd51c05b0c6844794c8d24272856f7f07ac2ef3f3372d47a2224a93dc1e51d5b426f3eb0349fbcbf6149aa77523e0d693aaef51b81495d934501c428fa7b98cde4dd71bc4a6e72bcbdbd54a6145b4f960fed065812cba48b88397fa877bba7f11a27efb0bbaa2f700e68f9afb254975ba8ca4ca985007f19ba439ebe68df5b94d616c18c67a790a28f883d934935e288c297ca9939c22778bbbd3dc91a4be5ed959043c43b1608fbbd9f7750edf4e7e44fc85bc98a2a35545e1f90f3c2566eb167598dc9efab9b70f4bcd895cc563ac32cac459f6f8be191bf98af71591356c855063c0bdd5274f19bfdbd9eb0a2caf6938571b8d544095c5427d468a4d11612b50c32aa3427947710f709828d83bc61608efb3825d50ed19ecc67bd95d145b23aca20828fadc5f4a70ff04aaf7e845e3d5f01342629fc35d19a744f66e10714f297631c9359c4aa94fb7fcb76ba5b0ec88d7dc77d87a79ab2e8573882db4877de92a6108292ec11e331fbc89541c49ed8a16024fc9d91a0ca8c5cbd193f331a52fb5ea9154e187285345be2d7198da1de0f332e8a4d2a5ad1cf2a6ae588456a9b5afc6d1357b70af3ab0a0da2700df487309357e9bc282920e46ddd00f59dc62cc39302f769fc7edd1332882800985bf7f35f2f7a0f61c98f47108116c49ae5714f7b408ed5dc956a507072128f8e38db48e8b88cc5ed15bd6402828b495864bdf0c5cc5dbee67b75a40bea3a1a84d617e9b12cbd6f062679ea2aac1bce0ba1170aa993bf3efedc8edbc28e6df8d61cdad13cdad3b94018ffba8b138605fd690110c65e1210801decedd08217587d19a177c5390ab02288449c3e25bb6aba285098e87a3fc1f0e427e70f7f8f8adbb34abdc1d993570d0e468c7fcdbda2ad06fe076194c4a95d02fa6503aeeb26986dc94816efe272d16d74495d565e7fe7728f4fbb8d9cfcf9d81836653ec6352e455791b30f38b7e33f34635371c94a71282c66f14ef2092c36921a941021a8513690696d28136683b6b26bc19b6783957e62c98fb6c71ab1f141ccea59b7094d6d8d3bd8a502b65e885298199ace086d23d92f1d23de85b1aab534b4e077632da83d09ba4fe69313349b16a7de4098cbfc3fa5a3a0c848e00ad3427fe7024681284d5caa191f90c0ae652e4d1ba1b4b13b1316dd0a2d58aadc16ad594bc6510c80fe82d99ba22b7ce445c14685532d7b2225d41209609f27437f5e1ebbb967dc2ab74bd10e4f3e938fa08f94a550c53d067d6b404177694bb41e36d72f4c3200da660f4b87cec0ef3e9d6eb47a8ff24b9b052c64aa39b72d6668f414c410b97eb00ffd1729053497bcc06f6ef81a0eacfcd7fdb235a214d9d671303bf8c38dd949d98c9bedec93a9f6b2eed7bd4f64c2a431ef1282662be908eb1e5ea8b4e525737da5e809f3f17593b07c27cfd1fc63e8b879a8ec0cb9c142335ca5bc1adbfcf4546c76f6d006bee28f1301c5e605394a6bd01ca1b9e301e047cdaedfeee8f4c3ef9ec52da03a126f41e3c396ab9d3c0267c8ad28fefc363b0ee5a06fab2ffd9c840bdf69c01a7e5840192fd4851ec5ba71b44f275a8896f28851bb809fe803830bd700df6876c5a12d9a0a2a79b518e32560df803d4e0b226b631df29e26e217d7ba3623fab7724b6ea95a5d2539ab01eb2c6d6da84eeecc2c7df0dbc2fd0bed12da656f5a5c3f71f29729d02ecad1df48e86746fb0618bf93cfeefe72e5c1f7baa9d7459cbf232052f10de99b6094d7513b2150874996a45bc69dc2d3e7d28bec48cef4f537995d523874bd189c67a4b3292384f6fdd99dca25aafdf50041cb8a7b88d193fff905b596d
e6f068bf4295b55ab78f00c3d50fd2323237e374bd9b3d8f820183da9b45c196906538457678d7f459c48522b543c2c4d7a9fe90ab94e306dafef5d8488d328807e681853c6bba12fadef71b5238cf509a747b2b6f2c43bf435ba6dcd2209302ffc196be44338af77c918587198e3ad710c710aed564fcf79ff210dfc7050b88ebfb75f62052eb4b4638c959398e8c7e510be21ba9984b4c80f991523249d62f354094e2f31c134d3c60e81afbec01142d3380ae5992b29f6b6758e95c93f38753fb3afec4f62fafb4eaee86331672a104a2bac3afae6acc2e0117678340310312dc51f72d7b840f308bc22adf251670c8566974c67a1da5238b98eba9a0d391dc4fca1fd29c93ca13ae67dc67a35e18fa1fcdfd5cdb5f733980ca9f2f04fd4d0728830eb7dc8c65b5e4bf14694d2aac21dfb88f7d50d7579498dbde5ebd063fc2f7703d1c322e72c7a69a0afb7e578e4ab4a650ca8333297ffa49e6444469ecaefbc810d22d2d0edbe28bc5e14ffb8000ed50e3fd3d1fddd27f32ab7c38cb7342ca9ad0d451efcf878eb5748fb65f11829efa4ebf600fe1e747dfafdd3a2089c6a3f73ec3c4dd3ec12982017bb7648e3934c0cd12b18c2044c3cc422dd58ca4bcc8850fe1afa0aba340470a0f13c5d3b1366a5c7c3f9a92ef6062123d1dab1873573d6ae2627441a6b3f003294b203fc041ad10c95a21db04d54fb94a0897b372b503866955009114d3ed23e4e69ec2790aabe99a7c390724dea099e296b43db2b1281b73708d757cad1a34e5eb6dd97a45a6accc7b687cb5095e55e98cd5757eaba533dbb0e310a7047675bb0018b51e12fc8df5c3ebbb8c07a62a6555ec3e7eb274e8e67fdeeab880be1ef516769c23926f85ab0fbcaf0f5ebc270aa3e58be53833c75bd431"}}}}}, 0x0)
r0 = openat$tun(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$TUNGETDEVNETNS(r0, 0x400454ca, 0x400030)
ioctl$TUNGETIFF(r0, 0x800454d2, &(0x7f0000000000)={'sit0\x00'})
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
chmod(&(0x7f0000000180)='./file0\x00', 0x23f)
mkdir(&(0x7f0000000040)='./file0/file1\x00', 0x0)
newfstatat(0xffffffffffffff9c, &(0x7f0000000140)='./file0\x00', &(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, <r0=>0x0}, 0x0)
setreuid(0x0, r0)
rmdir(&(0x7f00000004c0)='./file0/file1\x00')
exit_group(0x0)
r0 = openat$fuse(0xffffffffffffff9c, &(0x7f0000000000), 0x2, 0x0)
pwrite64(r0, 0x0, 0x0, 0xfffffffffffffffe)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
lseek(r0, 0xc0000000000, 0x3)
exit(0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x844640, &(0x7f0000000180), 0x0, 0x0, 0x0)
r0 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
getsockopt$IP6T_SO_GET_ENTRIES(r0, 0x29, 0x41, 0x0, &(0x7f0000000100)=0xe1)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat$tcp_mem(0xffffffffffffff9c, &(0x7f00000001c0)='/proc/sys/net/ipv4/tcp_wmem\x00', 0x1, 0x0)
sendfile(0xffffffffffffffff, r0, 0x0, 0x0)
rt_sigreturn()
clone(0x5100, 0x0, 0x0, 0x0, 0x0)
r0 = inotify_init1(0x0)
inotify_rm_watch(r0, 0x0)
rt_sigreturn()
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000080)={<r0=>0xffffffffffffffff})
sendmmsg(r0, &(0x7f0000006140)=[{{0x0, 0x0, 0x0}}, {{0x0, 0x0, 0x0, 0x0, &(0x7f0000001880)=[{0x18, 0x1, 0x1, "06"}], 0x18}}], 0x2, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000100)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000000c0)={0x0, <r2=>0x0})
sendmmsg(r2, &(0x7f0000008600)=[{{0x0, 0x0, &(0x7f0000003140)}}, {{&(0x7f00000072c0)=@un=@file={0x1, './file0\x00'}, 0xa, &(0x7f0000007380), 0x0, &(0x7f0000000600)}}], 0x2, 0x0)
syz_emit_ethernet(0x3e, &(0x7f0000000640)={@dev, @link_local, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "3314f1", 0x8, 0x0, 0x0, @remote, @mcast2, {[@fragment={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x67}]}}}}}, 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mount(&(0x7f0000000280)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000180)='sysfs\x00', 0x0, 0x0)
r1 = gettid()
tkill(r1, 0x18)
r0 = timerfd_create(0x0, 0x0)
r1 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
readv(r0, &(0x7f0000000180)=[{&(0x7f00000004c0)=""/94, 0x5e}], 0x1)
r2 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r2, 0x0)
preadv(r1, &(0x7f00000001c0)=[{0x0}], 0x1, 0x0, 0x0)
timerfd_settime(r0, 0x1, &(0x7f0000000040)={{0x77359400}, {0x77359400}}, 0x0)
clone(0x4000000206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
ioctl$KDSETMODE(r0, 0x4b3a, 0x0)
exit_group(0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$inet6_int(r0, 0x29, 0x19, 0x0, &(0x7f0000000180))
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = inotify_init1(0x0)
fcntl$setown(r0, 0x8, 0xffffffffffffffff)
fcntl$getownex(r0, 0x10, &(0x7f0000000080)={0x0, <r1=>0x0})
ptrace$setopts(0x4206, r1, 0x8, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x1, 0x0, 0x0)
r3 = gettid()
tkill(r3, 0x37)
mkdirat(0xffffffffffffff9c, &(0x7f0000000240)='./file0\x00', 0x0)
mkdirat$cgroup_root(0xffffffffffffff9c, &(0x7f0000000000)='./cgroup.net/syz1\x00', 0x1ff)
mount$fuse(0x20000000, &(0x7f00000004c0)='./file0\x00', 0x0, 0x7a04, 0x0)
chdir(&(0x7f0000000240)='./file0\x00')
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
flistxattr(r0, &(0x7f0000000080)=""/5, 0x5)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
llistxattr(&(0x7f00000000c0)='./bus\x00', &(0x7f0000000100)=""/44, 0x2c)
removexattr(&(0x7f0000000140)='./file0\x00', &(0x7f0000000180)=@known='user.incfs.id\x00')
creat(&(0x7f0000000040)='./bus\x00', 0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000000040)='task\x00')
lseek(r0, 0x0, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
lgetxattr(&(0x7f0000000280)='./file0\x00', 0x0, 0x0, 0xfffffffffffffd7e)
r0 = gettid()
tgkill(r0, r0, 0xf)
socketpair(0x1, 0x1, 0x0, &(0x7f0000000200)={<r0=>0xffffffffffffffff})
clock_gettime(0x4, &(0x7f0000000140)={<r1=>0x0})
setsockopt$sock_timeval(r0, 0x1, 0x14, &(0x7f0000000000)={r1}, 0x10)
recvfrom$inet6(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000180)={<r0=>0xffffffffffffffff})
setsockopt(r0, 0x1, 0x7, 0x0, 0x0)
r0 = epoll_create1(0x0)
pipe2(&(0x7f0000000040)={0xffffffffffffffff, <r1=>0xffffffffffffffff}, 0x0)
epoll_ctl$EPOLL_CTL_ADD(r0, 0x1, r1, &(0x7f0000000140))
r2 = epoll_create1(0x0)
epoll_ctl$EPOLL_CTL_ADD(r2, 0x1, r1, &(0x7f0000000080))
epoll_ctl$EPOLL_CTL_ADD(r2, 0x1, r0, &(0x7f00000000c0))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
fcntl$dupfd(r0, 0x0, 0xffffffffffffffff)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r1, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
madvise(&(0x7f0000a93000/0x4000)=nil, 0x4000, 0x80000000e)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
epoll_create1(0x0)
mremap(&(0x7f0000a96000/0x1000)=nil, 0x1000, 0x800000, 0x3, &(0x7f0000130000/0x800000)=nil)
madvise(&(0x7f0000388000/0x1000)=nil, 0x1000, 0x65)
shmget$private(0x0, 0x1000, 0x54000000, &(0x7f000024b000/0x1000)=nil)
r1 = shmget$private(0x0, 0x1000, 0x54001800, &(0x7f0000fff000/0x1000)=nil)
r2 = shmget$private(0x0, 0x3000, 0x20, &(0x7f00002c6000/0x3000)=nil)
shmctl$SHM_UNLOCK(r2, 0xc)
shmat(0xffffffffffffffff, &(0x7f0000ffc000/0x2000)=nil, 0x6000)
shmat(r1, &(0x7f0000000000/0x4000)=nil, 0xffffffffffffdfff)
shmat(r1, &(0x7f0000699000/0x1000)=nil, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000000)={<r3=>0xffffffffffffffff})
r4 = memfd_create(&(0x7f0000000080)=',\xae@\x00', 0x2)
mmap(&(0x7f00007c1000/0x2000)=nil, 0x2000, 0x1000008, 0x10010, r4, 0x1f98e000)
accept$unix(r3, 0x0, &(0x7f00000000c0))
r0 = socket$inet6(0xa, 0x1, 0x0)
setsockopt$inet6_int(r0, 0x29, 0x43, &(0x7f0000000040)=0xffffffff, 0x4)
alarm(0xffffffff00000001)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f00000002c0)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x0, 0x0)
chdir(&(0x7f0000000200)='./file0\x00')
mount(&(0x7f0000000680)=ANY=[], &(0x7f00000005c0)='./bus\x00', &(0x7f0000000600)='sysfs\x00', 0x0, 0x0)
open(&(0x7f0000000100)='./bus\x00', 0x11250c2, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000200)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x10012, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r1 = socket$unix(0x1, 0x1, 0x0)
fcntl$addseals(r1, 0x409, 0x0)
rt_sigreturn()
getresgid(&(0x7f0000000040), &(0x7f0000000080), 0x0)
lsetxattr$trusted_overlay_opaque(&(0x7f0000000000)='\x00', 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = getpid()
r2 = getpid()
rt_tgsigqueueinfo(r2, r1, 0x15, &(0x7f00000000c0))
ptrace(0x10, r2)
openat$cgroup_ro(0xffffffffffffffff, &(0x7f0000000080)='cgroup.controllers\x00', 0x275a, 0x0)
ptrace$setregs(0xd, r1, 0x0, &(0x7f0000000000))
ptrace$getregset(0x4204, r2, 0x2, &(0x7f0000000180)={0x0})
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
setsockopt$inet_icmp_ICMP_FILTER(r0, 0x1, 0x1b, &(0x7f0000000000), 0x4)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
creat(&(0x7f00000007c0)='./file0/file0\x00', 0x0)
r0 = gettid()
removexattr(&(0x7f0000000340)='./file0/file0\x00', &(0x7f0000000380)=@random={'btrfs.', 'fuse\x00'})
tgkill(r0, r0, 0xe)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(0xffffffffffffffff, 0x6, 0x0, 0x0, 0x0)
setsockopt$inet_tcp_int(0xffffffffffffffff, 0x6, 0x0, 0x0, 0x0)
bind$inet(r0, &(0x7f0000000000)={0x2, 0x4e21, @local}, 0x10)
connect$inet(r0, &(0x7f0000000080)={0x2, 0x4e21, @local}, 0x10)
open(0x0, 0x0, 0x0)
write$FUSE_NOTIFY_DELETE(0xffffffffffffffff, 0x0, 0x1f)
sendfile(0xffffffffffffffff, 0xffffffffffffffff, 0x0, 0x0)
ioctl$ifreq_SIOCGIFINDEX_wireguard(0xffffffffffffffff, 0x8933, 0x0)
sendto$inet(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)
sendto$inet(r0, &(0x7f0000000140)="11", 0xfe22, 0x0, 0x0, 0x0)
recvfrom$inet(r0, &(0x7f0000000200)=""/4084, 0xff4, 0x62, 0x0, 0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x6, &(0x7f0000000140)=0x7, 0x4)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = inotify_init1(0x0)
fcntl$setown(r0, 0x8, 0xffffffffffffffff)
fcntl$getownex(r0, 0x10, &(0x7f0000000140)={0x0, <r1=>0x0})
ptrace$setopts(0x4206, r1, 0x0, 0x0)
listen(0xffffffffffffffff, 0x0)
rt_sigqueueinfo(r1, 0x200000000012, &(0x7f0000000000)={0x0, 0x0, 0xfffffffffffffff9})
ptrace(0x4208, r1)
rt_sigreturn()
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000080)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
read(r0, 0x0, 0x0)
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = signalfd4(0xffffffffffffffff, &(0x7f0000000000), 0x8, 0x0)
readlinkat(r0, &(0x7f0000000080)='\x00', &(0x7f00000000c0)=""/169, 0xa9)
exit_group(0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f00000000c0)={0x0, 0x0})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x844640, &(0x7f0000000180), 0x0, 0x0, 0x0)
mknod(&(0x7f0000000000)='./file0\x00', 0x1000, 0x0)
open$dir(&(0x7f0000000040)='./file0\x00', 0x101003, 0x0)
clone(0x38004100, 0x0, 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f00000000c0)='./file0\x00', 0x103041, 0x0)
fremovexattr(r1, &(0x7f0000000100)=ANY=[])
exit_group(0x0)
rt_sigreturn()
clone(0x4000000206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
fcntl$setflags(0xffffffffffffffff, 0x2, 0x0)
exit_group(0x0)
perf_event_open(&(0x7f0000000040)={0x2, 0x70, 0x7f, 0x1}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
llistxattr(&(0x7f00000001c0)='./cgroup/cgroup.procs\x00', 0x0, 0x0)
r0 = open(&(0x7f0000103ff8)='./file0\x00', 0x141042, 0x0)
ftruncate(r0, 0x1000)
r1 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000000), 0x806, 0x0)
sendfile(r1, r0, 0x0, 0x40810ffe)
r2 = openat(0xffffffffffffffff, &(0x7f0000001940)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
readv(r1, &(0x7f0000000100)=[{&(0x7f0000000040)=""/4, 0x4}], 0x1)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x9, 0x12, r0, 0x0)
capget(&(0x7f0000000040), 0x0)
r1 = gettid()
tgkill(r1, r1, 0x24)
sched_setaffinity(0xffffffffffffffff, 0x8, &(0x7f0000000380))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r1 = socket$netlink(0x10, 0x3, 0x0)
getsockopt$sock_cred(r1, 0x1, 0x11, 0x0, &(0x7f0000cab000))
syz_mount_image$fuse(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rt_sigreturn()
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000200)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendto(r0, &(0x7f0000000600), 0xfffffd71, 0x0, &(0x7f00000027c0)=@alg={0x26, 'skcipher\x00', 0x0, 0x0, 'xts(twofish)\x00'}, 0x80)
rt_sigreturn()
clone(0x4000000206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
rt_tgsigqueueinfo(r0, r0, 0x0, 0x0)
exit_group(0x0)
syz_emit_ethernet(0x82, &(0x7f0000000200)={@link_local, @random="00801000", @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x74, 0x0, 0x0, 0x0, 0x1, 0x0, @initdev={0xac, 0x1e, 0x0, 0x0}, @local}, @time_exceeded={0x5, 0x0, 0x0, 0x3, 0x0, 0x0, {0x16, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, @loopback, @multicast2, {[@noop, @timestamp={0x44, 0x1c, 0x0, 0x0, 0x0, [0x0, 0x0, 0x0, 0x0, 0x0, 0x0]}, @timestamp_addr={0x44, 0x24, 0x0, 0x1, 0x0, [{@private}, {}, {@multicast2}, {}]}]}}}}}}}, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='gid_map\x00')
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x7800001, 0x11, r1, 0x0)
utimes(&(0x7f00000000c0)='./file0\x00', &(0x7f0000001200))
pwrite64(r0, 0x0, 0x0, 0x100000001)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f00000027c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
r2 = socket$unix(0x1, 0x2, 0x0)
r3 = fork()
ptrace(0x10, r3)
ptrace$setregset(0x4205, r3, 0x202, &(0x7f0000000080)={0x0})
ftruncate(r2, 0x20007ffefffc)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
prlimit64(0x0, 0x7, &(0x7f0000000000)={0x4, 0xfffffffffffffff7}, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
socket$inet_tcp(0x2, 0x1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
tkill(r0, 0x25)
socketpair(0x1, 0x3, 0x0, &(0x7f0000000100)={<r0=>0xffffffffffffffff})
ioctl$sock_inet_udp_SIOCOUTQ(r0, 0x5411, &(0x7f0000000000))
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mlockall(0xb735c6256c5ad4cf)
r0 = gettid()
tkill(r0, 0x18)
clone(0x200411277fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000080)={<r0=>0xffffffffffffffff})
fcntl$setownex(r0, 0xf, &(0x7f00000000c0))
ioctl$int_in(r0, 0x5452, &(0x7f0000000000)=0x20)
exit_group(0x0)
r0 = socket$inet6(0xa, 0x2, 0x0)
bind$inet6(r0, &(0x7f0000f5dfe4)={0xa, 0x4e20}, 0x1c)
sendto$inet6(r0, 0x0, 0x0, 0x0, &(0x7f0000000240)={0xa, 0x4e20, 0x0, @loopback}, 0x1c)
recvmmsg(r0, &(0x7f0000004940)=[{{0x0, 0x0, 0x0}}, {{0x0, 0x0, 0x0}}], 0x2, 0x0, &(0x7f0000004a80))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = socket$inet6(0xa, 0x800000000000002, 0x0)
connect$inet6(r1, &(0x7f0000000000)={0xa, 0x0, 0x0, @empty}, 0x1c)
writev(r1, &(0x7f0000000580)=[{0x0}, {&(0x7f0000000500)="80", 0x1}], 0x2)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
setsockopt$inet6_tcp_int(r0, 0x6, 0x4, &(0x7f00000001c0), 0x4)
rmdir(&(0x7f00000001c0)='./file0\x00')
r1 = memfd_create(&(0x7f0000000280)='-B\xd5NI\xc5j\xbappp\xf0\b\x84\xa2m\x00:)\x00\xbb\x8d\xac\xacva}knh#\b\x00\x00\x00\xc0:\x9cc\x10d\xee\xa9\x8bCc\xad\x89\x9ck\xde\xc5\xe9\xaa\x9b\xc3\x14\xd2\xd1y\x1f\x9e\x856\xddU\xa9=\xcdJx\xaa\x8f~\xb90a\xa9\xb2\x04K\x98\x93?\x88Q\xf7\xd6\x1d\xa1\xce\x8b\x19\xea\xef\xe3\xab\xb6\xa5$4\xd6\xfe7\x0f\xe7\xd9$\xce\x00\x00\x00\x00\xc9\xad\xd3g@\xe1\'s\x0e\x90\xf2\xcdr\xb8(\xb8\xd9\xa3\xc4p\xf4\\>A\x11U\x99\x8d\xa3\x86\xb7\x1d\x87j\xd3\xc4\xdf\x13/\x97Yy\x8b{\x1df\x8d/\x90\xd3<\xf8\x18\xa4\x88\xcf\x048\xb4\xbe\x00\x00\xb7\xd6\xa5&);\x1br\xd2\xa4\xba\x8b\xa7\x15\xbe\x95\xeb\x1bB\xacoyP\xbb\x1c\xb9S-\xe0oK\xac\x00;S\x8a\x01\xd2\xca\xa3\x1c]<\x04\xaf\x04\x9a\x9d\x84\xa5\x94J>F\xc5V\xc6\xfa\x8e\v\xe1\x82\x03`\xf8\xca\xf4\x89\r^Z44\x91\xeb\xf4$\xf3\x1d\xd5\xbd\xb6ZZ\xd8\xfdS\r\x98\x06/\x9a%m\xcf\xab u\xa6Fw\xde\xb4?\r\xbdK\xfb\xf2\x13\xb3\xfa\x00\xaaP\xc9t\x7f;A Y\x84\x17\x14\xa8\xb5\x0f\xc3i\x9a\x87W\x90h.\x8b\xf5\xf9\xc1\xf04\x9a\xf9DB|L\xbc^n\xd5\x85\xd7\xaf-}\xce\x0e\xcc{\xb1\x9d_\xb2BmU\xc2\xad2q\xd5t&v\x89O\xf0+Q?\xf5\x1eV\x8d[\x98\x11\f#\x13\xc7\xd9\x92\xcc\xf7\xfb\xd3\bGy\x98\x1b\xe7\x86i\xe1.\x1f\x9e\x8cPFYi\x94\x13\xddm\x9c\xbfV\xe7^@\xe0\xa3\xa5(\f\x18>94\xedZ\xa7\xe4\xb2\xb6.\bY\xa9\xff\xbb', 0x0)
pwrite64(r1, &(0x7f00000006c0)='/', 0x1, 0xffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
getitimer(0x7, &(0x7f00000005c0))
syz_emit_ethernet(0x66, &(0x7f0000000000)={@random="0dfd3992fa3c", @link_local, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "337403", 0x30, 0x0, 0x0, @initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @mcast2, {[@dstopts={0x0, 0x4, '\x00', [@calipso={0x7, 0x8}, @hao={0xc9, 0x10, @initdev={0xfe, 0x88, '\x00', 0x0, 0x0}}, @enc_lim, @enc_lim]}]}}}}}, 0x0)
r0 = socket$nl_route(0x10, 0x3, 0x0)
getsockopt(r0, 0x1, 0x7, 0x0, &(0x7f0000001480))
r0 = socket$inet(0x2, 0x2, 0x0)
bind$inet(r0, &(0x7f0000000040)={0x2, 0x1004e20, @dev}, 0x10)
connect$inet(r0, &(0x7f00000002c0)={0x2, 0x4e20, @empty}, 0x10)
setsockopt$inet_opts(r0, 0x0, 0xd, &(0x7f0000001a00)="9a", 0x1)
write(r0, 0x0, 0x0)
recvmmsg(r0, &(0x7f0000002700)=[{{&(0x7f0000001bc0)=@alg, 0x80, 0x0}}], 0x1, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
sendto$inet(0xffffffffffffffff, &(0x7f0000000000)="c04a3c61c8f06069becda749e845a8e754ee5a6997d91ec8aabf5325d5439ad1b2c2a135229954d2bd8d08d5dde39d566ac597304e9657246c159ac7f822527a4d62e73bc4e7bafb8300cc49a729d85bcfb636b8b10106d9338c9a250fc492eb10dd9c07013b8e7286207999f2784e9d7a48a86ebc30ded78348c6205c39b4d5113975dc3e54e97403bbd7f6fbd95efb851e21547783d956ae25bc47ba97b3480473459256e8b61978c90f032eb2d855403608206bc53dbd71046bac2280e1c3a42ec1", 0xc3, 0x0, 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000002400)='fdinfo\x00')
getdents(r0, &(0x7f0000000000)=""/90, 0x5a)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = epoll_create(0x20)
r1 = openat$tun(0xffffffffffffff9c, &(0x7f0000005e00), 0x0, 0x0)
epoll_ctl$EPOLL_CTL_DEL(r0, 0x2, r1)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
socketpair(0x1, 0x1, 0x0, &(0x7f00000000c0)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
getsockopt$inet_tcp_buf(r1, 0x6, 0xd, 0x0, &(0x7f0000000100))
rt_sigqueueinfo(r0, 0x3a, &(0x7f0000000000))
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
recvfrom$inet(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
fgetxattr(r1, &(0x7f0000000200)=@random={'system.', '/dev/zero\x00'}, 0x0, 0x0)
mkdir(&(0x7f0000000200)='./file0\x00', 0x0)
mkdir(&(0x7f00000002c0)='./file1\x00', 0x0)
mkdir(&(0x7f0000000140)='./bus\x00', 0x0)
mount$overlay(0x0, &(0x7f0000000040)='./bus\x00', &(0x7f0000000080), 0x0, &(0x7f0000000280)=ANY=[@ANYBLOB='upperdir=./bus,workdir=./file1,lowerdir=./file0'])
chdir(&(0x7f00000001c0)='./bus\x00')
r0 = open(&(0x7f0000000040)='.\x00', 0x0, 0x0)
getdents64(r0, 0x0, 0x0)
r1 = open(&(0x7f0000000040)='.\x00', 0x0, 0x0)
getdents64(r1, 0x0, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x200c2, 0x0)
getdents64(r1, &(0x7f00000000c0)=""/109, 0x6d)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
setsockopt$inet6_tcp_int(r0, 0x6, 0x5, 0x0, 0x0)
exit_group(0x0)
getrlimit(0x3, &(0x7f0000000140))
clone(0x9b8271be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000300)='fdinfo\x00')
fchdir(r1)
mknod(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
exit(0x0)
prctl$PR_SET_SECUREBITS(0x1c, 0x25)
setresuid(0xee01, 0xee00, 0x0)
r0 = socket(0x10, 0x2, 0x0)
getsockopt$sock_cred(r0, 0x1, 0x11, &(0x7f0000caaffb)={0x0, 0x0, <r1=>0x0}, &(0x7f0000cab000)=0xc)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setregid(0x0, r1)
setgroups(0x0, 0x0)
openat$tcp_mem(0xffffff9c, &(0x7f0000000080)='/proc/sys/net/ipv4/tcp_wmem\x00', 0x1, 0x0)
munmap(&(0x7f0000ffd000/0x2000)=nil, 0x2000)
sigaltstack(&(0x7f0000ffe000/0x2000)=nil, 0x0)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
getsockopt$inet6_int(r0, 0x29, 0x43, 0x0, &(0x7f0000000140))
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$unix(0x1, 0x5, 0x0)
bind$unix(r0, 0x0, 0x0)
rt_sigreturn()
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
ioctl$sock_SIOCOUTQ(r0, 0x5411, 0x0)
rt_sigreturn()
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket(0x11, 0x2, 0x2)
setsockopt$inet_mreq(r0, 0x0, 0x23, &(0x7f0000000000)={@loopback}, 0x8)
rt_sigreturn()
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f00000000c0)={0x2, &(0x7f0000000040)=[{0x5, 0x0, 0x0, 0xe800}, {0x6}]})
prctl$PR_SET_PDEATHSIG(0x1, 0x4036)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
setsockopt$inet_udp_encap(r0, 0x11, 0x64, &(0x7f0000000c40), 0x4)
r0 = openat$full(0xffffffffffffff9c, &(0x7f00000001c0), 0x0, 0x0)
r1 = ioctl$TUNGETDEVNETNS(r0, 0x5450, 0x0)
fcntl$addseals(r1, 0xb, 0x0)
clone(0x14004100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
fcntl$getflags(r1, 0x40a)
preadv(r1, &(0x7f0000000280), 0x1, 0x0, 0x0)
r2 = getpid()
rt_sigqueueinfo(r2, 0x39, &(0x7f0000002200))
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mknod(&(0x7f0000000080)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x0, 0x0)
r0 = gettid()
rt_sigqueueinfo(r0, 0xa, &(0x7f0000000040))
fork()
wait4(0x0, 0x0, 0x2, 0x0)
perf_event_open(0x0, 0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0)
r0 = fork()
prctl$PR_SET_PTRACER(0x59616d61, 0x0)
r1 = perf_event_open(0x0, 0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x1)
waitid(0x0, 0x0, &(0x7f0000000040), 0x0, &(0x7f00000000c0))
r2 = fork()
perf_event_open(0x0, r2, 0x0, r1, 0x0)
perf_event_open(0x0, 0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0)
perf_event_open(0x0, 0x0, 0x0, 0xffffffffffffffff, 0x0)
ptrace(0x10, r0)
ptrace$peek(0x1, r0, 0x0)
time(&(0x7f00000001c0))
syz_emit_ethernet(0x46, &(0x7f00000001c0)={@multicast, @local, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "3c3acc", 0x10, 0x0, 0x0, @empty, @local, {[@dstopts={0x2c, 0x0, '\x00', [@enc_lim, @padn]}]}}}}}, 0x0)
r0 = socket(0x10, 0x3, 0x0)
setsockopt(r0, 0x1, 0x7, &(0x7f00000001c0)="4085f0af", 0x4)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000000)={0x2, &(0x7f0000000040)=[{0x6c}, {0x6, 0x0, 0x0, 0x7fffffff}]})
openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000000)='cpuacct.usage_percpu_sys\x00', 0x275a, 0x0)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
fadvise64(0xffffffffffffffff, 0x0, 0xfffffffffffffffb, 0x0)
exit(0x0)
munmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000)
r0 = shmget$private(0x0, 0x4000, 0x0, &(0x7f0000ffa000/0x4000)=nil)
r1 = shmat(r0, &(0x7f0000ffb000/0x2000)=nil, 0x0)
mremap(&(0x7f0000fff000/0x1000)=nil, 0x1000, 0x1000, 0x3, &(0x7f0000ffc000/0x1000)=nil)
shmdt(r1)
r0 = socket$inet(0x2, 0x2, 0x0)
bind$inet(r0, &(0x7f0000000040)={0x2, 0x1004e20, @dev={0xac, 0x14, 0x14, 0x1}}, 0x10)
connect$inet(r0, &(0x7f00000002c0)={0x2, 0x4e20, @empty}, 0x10)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
setsockopt$inet_opts(r0, 0x0, 0xd, &(0x7f0000001a00)="9a", 0x1)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
write(r0, 0x0, 0x0)
recvmmsg(r0, &(0x7f0000002700)=[{{&(0x7f0000001bc0)=@alg, 0x80, 0x0}}], 0x1, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
sendto$inet6(r0, &(0x7f0000000000)="8000000000000000", 0x8, 0x0, &(0x7f0000000140)={0xa, 0x0, 0x0, @local}, 0x1c)
r1 = gettid()
rt_sigqueueinfo(r1, 0x2b, &(0x7f0000000100))
r0 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
getsockname(r0, 0x0, &(0x7f00000000c0))
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
execveat(r0, &(0x7f0000000340)='./file0\x00', 0x0, 0x0, 0x0)
rt_sigreturn()
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
execveat(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x400)
rt_sigreturn()
perf_event_open(&(0x7f0000000040)={0x2, 0x70, 0x7f, 0x1}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = openat$full(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
pipe2$9p(&(0x7f0000000100)={0xffffffffffffffff, <r1=>0xffffffffffffffff}, 0x0)
fcntl$setpipe(r1, 0x407, 0x200006)
splice(r0, 0x0, r1, 0x0, 0x55aa40be, 0x0)
setrlimit(0x7, &(0x7f00000001c0))
eventfd(0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='gid_map\x00')
pwrite64(r0, 0x0, 0x0, 0x100000001)
open$dir(&(0x7f0000001700)='./file0\x00', 0x412800, 0x0)
exit(0x0)
r0 = timerfd_create(0x0, 0x0)
r1 = dup2(r0, r0)
read$char_raw(r1, 0x0, 0x0)
futimesat(0xffffffffffffff9c, 0x0, &(0x7f0000000080)={{}, {0x0, 0xfffffffffffffcb3}})
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f00000007c0)='./file0\x00', 0x0)
mount(&(0x7f0000000800)=ANY=[], &(0x7f0000000040)='./file0\x00', &(0x7f0000000240)='tmpfs\x00', 0x1, 0x0)
chdir(&(0x7f0000000000)='./file0\x00')
rename(&(0x7f0000000140)='./file2\x00', &(0x7f0000000180)='./file2\x00')
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f00000000c0)='/proc/self/exe\x00', 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
perf_event_open(&(0x7f000025c000)={0x2, 0x80, 0x15, 0x0, 0x0, 0x0, 0x0, 0x1, 0x864a8, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={0x0}}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
getsockopt$IPT_SO_GET_ENTRIES(r2, 0x0, 0x41, &(0x7f0000000180)={'filter\x00', 0x5, "b933185dd8"}, &(0x7f0000000040)=0x29)
pipe2(&(0x7f0000003d80)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff}, 0x0)
write$char_usb(r1, &(0x7f0000000a00)="38f61ae88f001a53e3078e89024509c912d8f529607ec1c69b6b8bb9", 0x1c)
read$FUSE(r0, &(0x7f0000001a00)={0x2020, 0x0, 0x0, 0x0, 0x0, <r2=>0x0}, 0x2020)
wait4(r2, 0x0, 0x0, 0x0)
r0 = socket$nl_route(0x10, 0x3, 0x0)
getsockopt$sock_int(r0, 0x1, 0x7, &(0x7f0000000000), &(0x7f0000000040)=0x4)
prctl$PR_SET_MM_EXE_FILE(0x10, 0xd, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = memfd_create(&(0x7f0000000100)='\vem1\xc1\xf8\xa6\x8dN\xc0\xa3\\\xe2\xcb\xa2\xba\xe5\xf4\x97\xac#*\xff', 0x0)
write$FUSE_NOTIFY_STORE(r1, &(0x7f00000001c0)=ANY=[], 0x2c)
sendfile(r1, r1, &(0x7f0000001000), 0xffff)
mmap(&(0x7f0000000000/0x7000)=nil, 0x7000, 0x80000000004, 0x11, r1, 0x0)
setitimer(0x0, &(0x7f0000000080)={{0x0, 0x2710}}, 0x0)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
r1 = dup(r0)
setsockopt$inet_buf(r1, 0x0, 0x40, 0x0, 0x0)
r2 = gettid()
tkill(r2, 0x18)
mkdirat(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mount(&(0x7f0000000040)=ANY=[], &(0x7f0000002280)='./file0\x00', &(0x7f00000022c0)='devtmpfs\x00', 0x0, 0x0)
r1 = open$dir(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
getdents64(r1, &(0x7f0000000340)=""/185, 0xfffffffffffffd0d)
creat(&(0x7f0000000440)='./file0\x00', 0x0)
mount(&(0x7f0000000000)=ANY=[], &(0x7f0000000040)='./file0\x00', &(0x7f0000000080)='proc\x00', 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
unlink(&(0x7f0000000000)='./file0\x00')
rt_sigreturn()
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
fsetxattr(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
r0 = socket$unix(0x1, 0x2, 0x0)
setsockopt$sock_int(r0, 0x1, 0x8, &(0x7f0000000000), 0x4)
clone(0x1004100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='/proc/self/exe\x00', 0x0, 0x0)
newfstatat(0xffffffffffffff9c, &(0x7f0000000280)='.\x00', &(0x7f0000001500)={0x0, 0x0, 0x0, 0x0, <r1=>0x0}, 0x0)
setreuid(0x0, r1)
fsetxattr(r0, &(0x7f0000000000)=@random={'user.', '[+&\x00'}, 0x0, 0x0, 0x0)
rt_sigreturn()
r0 = socket$nl_route(0x10, 0x3, 0x0)
sendmmsg(r0, &(0x7f0000000000)=[{{0x0, 0x0, 0x0}}, {{&(0x7f0000000940)=@nl=@proc={0x10, 0x0, 0x0, 0x10000000}, 0x80, 0x0}}], 0x2, 0x0)
r0 = timerfd_create(0x0, 0x0)
fcntl$lock(r0, 0x6, &(0x7f0000000000)={0x2, 0x0, 0x0, 0x1, 0xffffffffffffffff})
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
fstatfs(r0, &(0x7f0000000300)=""/4096)
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
getsockopt$ARPT_SO_GET_ENTRIES(r0, 0x0, 0x21, 0x0, &(0x7f0000000100)=0x85)
rt_sigreturn()
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
syz_open_procfs(0x0, &(0x7f0000000000)='environ\x00')
exit_group(0x0)
r0 = syz_open_procfs(0x0, &(0x7f00000000c0)='fd/3\x00')
preadv(r0, &(0x7f0000000540)=[{&(0x7f0000000200)=""/27, 0x1b}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0xb635773f06ebbeee, 0x8031, 0xffffffffffffffff, 0x0)
r0 = open(&(0x7f0000000000)='./bus\x00', 0x109842, 0x0)
mmap(&(0x7f0000001000/0xa000)=nil, 0xa000, 0x800006, 0x11, r0, 0xc03000)
r1 = creat(&(0x7f0000000080)='./bus\x00', 0x0)
ioctl$FS_IOC_SETFLAGS(r1, 0x40086602, &(0x7f0000000040))
ftruncate(r1, 0x2008000)
r2 = socket$inet_udp(0x2, 0x2, 0x0)
connect$inet(r2, &(0x7f0000000480)={0x2, 0x0, @remote}, 0x10)
sendmmsg(r2, &(0x7f0000007fc0), 0x800001d, 0x0)
madvise(&(0x7f0000000000/0x600000)=nil, 0x60000b, 0x9)
mknod$loop(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
mount(&(0x7f0000000280)=ANY=[], &(0x7f00000000c0)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
statfs(&(0x7f0000002740)='./file0\x00', &(0x7f0000002780)=""/22)
socketpair(0x1, 0x1, 0x0, &(0x7f0000000180)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
setsockopt$inet6_tcp_TLS_RX(r0, 0x6, 0x2, &(0x7f0000000140)=@gcm_128={{}, "bb4cd2c68245de84", "f8ca44ef42db53f395a36ccf62ae6b78", "fa68512f", "c41f713db18e0e40"}, 0x28)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
execveat(0xffffffffffffffff, &(0x7f0000000100)='./file2\x00', 0x0, &(0x7f0000000240)=[&(0x7f00000001c0)='\xa1\x9fn\xb4dR\x04i5\x02\xac\xce\xe1\x88\x9d[@8\xd7\xce\x1f 9I\x7f\x15\x1d\x93=\xb5\xe7\\\'L\xe6\xd2\x8e\xbc)JtTDq\x81\xcf\x81\xba\xe51\xf5 \xc8\x10>\xc9\\\x85\x17L\xbf\xcf\x91\xdfM\xf3\x02^T*\x00\x02\xb9~B\x9f\xacl\x1d3\x06o\xf8\x16H\xaa*\x02\xf7\xfb\x06\xf1\x91\x92\xa8\xc2\xcb\xae\xb0\xb4\x93\xb8\x04\xf1\x99\xc2y'], 0x0)
rt_sigreturn()
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x7114d180, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
renameat2(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0xffffffffffffff9c, &(0x7f00000000c0)='./file0\x00', 0x1)
exit_group(0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
socket(0x0, 0x0, 0x0)
write$binfmt_elf32(0xffffffffffffffff, 0x0, 0x483)
bind$inet(0xffffffffffffffff, 0x0, 0x0)
bind$inet(r0, &(0x7f0000000000)={0x2, 0x4e21, @local}, 0x10)
connect$inet(r0, &(0x7f0000000180)={0x2, 0x4e21, @local}, 0x10)
socket(0x0, 0x0, 0x0)
setsockopt$inet_tcp_int(0xffffffffffffffff, 0x6, 0x0, 0x0, 0x0)
recvmmsg(r0, &(0x7f0000002440)=[{{0x0, 0x0, &(0x7f0000004940)=[{&(0x7f00000059c0)=""/102400, 0x19000}], 0x1}}, {{0x0, 0x0, &(0x7f0000002380)=[{&(0x7f0000000040)=""/231, 0x120}, {&(0x7f000001e9c0)=""/102400, 0x19000}], 0x2}}], 0x2, 0x0, 0x0)
sendto$inet(r0, &(0x7f0000000480)="fbbf0b5044e308cb7bd572aa2b42e9678bcf30eff9f3aed14dc94a114bd2b45956aebe2b108a87e865501a5f9e0383611afdd3f8bac3d5cfd7772a3ab48d0ba4b600731e357e38716c449fae7c28548a4f2105f44b8fd9b33041270ae01f1a405e3f650fc3b0926d481c364fca00000000000000006d3a3ede9fc738b8d86209c060161d5ddb5fcf3d09001117cdb9d055aa2d89fe3458720724853a876448d4a1fe9ef0569ad98a05ab5df763923b4e2c576e00000000000000000000000000000000002090666159e3075f7244cf4ec3d7814c0c934f44e200219e6dd7bc23397d5f2f2c76a5baddd0fd8c340362691ef226f7a0ac51b74b6be5ed6737948514cd466943d08eeb3895b80499da2b209da4f3ec5e3744ce3e863b0e04d0ec2f39edf50b6e08c4b47e448a35414763d687fbe3792ee15c5b9791310a346472723c100bf77a310b0ced8004b5ac6d48c40439f512e8ef34a53d65f55563f68136a577736ca5f6f66e01ef4ec2cdc8db34f6de50713adaa3f70189958263fddc1314f8a28ccdef6e1390c5fbaeadc3035d019f0dc75de307de6c0d010000000000000027083d1d5b4b013c503b863b560688d94de886b6dc73d5da2dfeff4bed1a49a975a6c8dbb480e4415ddca5657a5a8e3b111015499e952bb5e8d8f60de3d688df7802c6e8b27b31fac4e199038b79a3999920e634a5af162a9581b0e6647e410700246548234acacf9cb43ab332a37bbc926c39897395c974fda31536be523bf4260300730ae6136fecae5f0fa6ab2df8d98128b24589e3bbe5230e07dc5e0d65cc397e3f8204d48e59e8e294a6d7008ba8fba28cd5009fe1a7c569ce740078bf1c7389a6ba0f89257f0eac417aac0d2d89b05ee5dafa2f1d936c87264d077b2c0d5abdbc64ce943f895dd4c2e9dd7393543d89b00dc6b3a25045d4ec932366c67dfad087fa8dc104644828440bdf67dd97ebccb3bd", 0xfffffea5, 0xc000, 0x0, 0xfffffcef)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setsockopt$netlink_NETLINK_RX_RING(0xffffffffffffffff, 0x10e, 0x6, 0x0, 0x0)
shutdown(0xffffffffffffffff, 0x0)
rt_sigreturn()
r0 = creat(&(0x7f0000000140)='./bus\x00', 0x0)
rt_sigprocmask(0x0, &(0x7f0000000200)={[0xfffffffffffffffd]}, 0x0, 0x8)
setrlimit(0x1, &(0x7f0000000000))
io_setup(0x7, &(0x7f0000000280)=<r1=>0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
io_submit(r1, 0x0, 0x0)
io_submit(r1, 0xf10, &(0x7f0000000540)=[&(0x7f00000000c0)={0x0, 0x4000, 0x0, 0x1, 0x0, r0, &(0x7f0000000000), 0x200a00}])
unshare(0x2c020000)
semget$private(0x0, 0x403, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = semget$private(0x0, 0x3, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
semop(r1, &(0x7f00000003c0)=[{0x0, 0x0, 0x1000}, {0x4}], 0x2)
unshare(0x8020000)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
ioctl$KDSKBLED(r0, 0x4b65, 0x8000)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = getpid()
r2 = inotify_init1(0x0)
inotify_add_watch(r2, &(0x7f0000000040)='./file0\x00', 0x8100041a)
rt_sigqueueinfo(r1, 0x39, &(0x7f0000000000))
r0 = openat$full(0xffffffffffffff9c, &(0x7f0000000440), 0x200400, 0x0)
read$FUSE(r0, 0x0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000040)={0x2, &(0x7f0000000100)=[{0x84}, {0x6, 0x0, 0x0, 0xfffffffc}]})
socket$packet(0x11, 0x3, 0x300)
r0 = socket(0x1, 0x3, 0x0)
ioctl$sock_SIOCETHTOOL(r0, 0x891d, &(0x7f0000000300)={'lo\x00', 0x0})
r0 = openat$ptmx(0xffffff9c, &(0x7f0000000540), 0x0, 0x0)
fchown(r0, 0xffffffffffffffff, 0xffffffffffffffff)
mlock2(&(0x7f0000001000/0x3000)=nil, 0x3000, 0x76f2d6a68e1d8a44)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
faccessat(0xffffffffffffff9c, 0x0, 0x40)
exit(0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000000400)='./file0\x00')
chdir(&(0x7f00000001c0)='./file0\x00')
r1 = socket$unix(0x1, 0x2, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x100000000000008d, 0x4, 0x0)
connect$unix(r1, &(0x7f0000000080)=@file={0x1, './file0\x00'}, 0x6e)
rt_sigreturn()
syz_mount_image$tmpfs(0x0, &(0x7f0000000480)='./file1\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
llistxattr(&(0x7f00000003c0)='./file1\x00', &(0x7f0000000080)=""/177, 0xfffffebc)
chroot(&(0x7f0000000000)='\x00')
r0 = socket$inet_icmp(0x2, 0x2, 0x1)
connect$inet(r0, &(0x7f0000000040)={0x2, 0x0, @loopback}, 0x10)
connect$inet(r0, &(0x7f0000000200)={0x2, 0x0, @loopback}, 0x10)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000003480)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendto$unix(r1, &(0x7f0000000040)='j', 0x1, 0x0, 0x0, 0x0)
recvfrom$unix(r0, 0x0, 0x0, 0x122, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000040)={0x3, &(0x7f00000002c0)=[{0x6, 0x0, 0x0, 0x7fffff7d}, {0x2, 0x0, 0x0, 0xb}, {0x6}]})
openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
r0 = socket$netlink(0x10, 0x3, 0x0)
sendmsg$netlink(r0, &(0x7f00000000c0)={0x0, 0x0, &(0x7f0000000000)=[{&(0x7f0000000100)={0x1c, 0x1a, 0x1, 0x0, 0x0, "", [@nested={0xa, 0x0, 0x0, 0x1, [@generic="880200020010"]}]}, 0x1c}], 0x1}, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
unshare(0x20000400)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0xb8080100, 0x0, 0x0, 0x0, 0x0)
r1 = gettid()
tkill(r1, 0x18)
symlinkat(&(0x7f00000000c0)='/', 0xffffffffffffff9c, &(0x7f0000000240)='./file0\x00')
rmdir(&(0x7f0000000000)='./file0/file0\x00')
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat$fuse(0xffffffffffffff9c, &(0x7f0000000100), 0x2, 0x0)
tee(r1, r0, 0x8001, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$unix(0x1, 0x1, 0x0)
ioctl$sock_SIOCETHTOOL(r0, 0x8946, &(0x7f0000000080)={'veth0\x00', 0x0})
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
fallocate(r1, 0x0, 0x102000006, 0x6)
arch_prctl$ARCH_GET_GS(0x1004, &(0x7f0000000200))
mprotect(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x0)
time(&(0x7f0000000000))
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
lseek(r0, 0xfffffffffffffffc, 0x0)
rt_sigreturn()
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
semtimedop(0x0, &(0x7f0000000000)=[{}], 0x1, 0x0)
exit(0x0)
perf_event_open(&(0x7f0000000100)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3c43}, 0x0, 0x1, 0xffffffffffffffff, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
ioctl$BTRFS_IOC_SNAP_DESTROY(0xffffffffffffffff, 0x5000940f, &(0x7f0000000240)={{}, "32de4c21c5359a600f570d03091aea800cf29706eed9776d72238000f6af41a3bbe66a30c457403b8afd3d956b3481bdd0b8662d3edf31975e25f9549ce6ded1ceb9a5c485570478a28b80fad5b40798782e9fc80cb23b6985ed5d822124d8998d4d4244817007b971e76a0f4260e8e337327940b9981be3d971b6a291995571c9e3031031c3350a64fa68f8525122e436d987ab2b2e97513ee8f04b8e5e845f11c621df21d35eaac820e881e57f157420fade6587d5d6f707a6c3eabee0d2b7f9054321e5ef05174f22796bb72c6bf279b00dba910fa77a584d9615421cf481355100b81b7bc5cf5ced1e02abefed080ce72acdb0afdb3dc2253636b1ceb355f9453db02d5c7978f32dd4b246acef0ef573ba835bf02e9d50d74908db0edcec2c98ad9629f60d27aa4d81d41e4e9d310462be0b94ca79349b091943082476b1662691567b6a892d3b4112e1070902faa2a643743c54bad3af1cb674c574e74f677c365451be22769ee81739b8205ee7d88450c7c9ada3b0ae4c99222f324c7e42aa8edb20ffd21b08d6a6d5d62f1f28c826f7dae9b7c33508d55a9763082c4c94944f200899419134181018c6d7cb2f8ee6be5fc94dba3cb6c4ba7625eadc6088647ed693c7ccaf710830f23562b2fd2a763f56b13a628a6cf620b40b84320dd3efb42b1f313fd21c1d187b3b4df0dba85ff438dcf482fcb756f6d0886216b46a01e0d305db556b84b1d1337ad40bb34b9bec54a7283460ce99b840a5b0524d772ee60ac54aab4bf52db30f4cfeef99c45129d26ee921d557682bdbb3a3d1f707911e7cf086fe1665e8d851958105d0c08058a1ed1acecf3f8eeb639fc5e934d8efc809a9d6b5c92bbd1fc08faff4f441ff82242ec82c959a1ee80ee8fac419ce43b4141f09e92eadebe20456b1ef1c36a9d8cdbb52963140022b42bcb400eee26eea7c32ed2f66c189cf2f4c6ba0a57b335d5bd24303c67e919a323c9e70587cd02a40cfd0062b2320d3fdd70dcd149201a4376faeaaa62028aac18c8ad47ce02e877119e744c784d4ee73052e68df59b6837eba0d69a7dac0ad01a1ca3f150c3b9e4b1d9fd8dc4f9ed637af486668a8dbd25c11479b8f144c8c04b98fbdd144ad40848248046f6d11a3ccfe85e878196cc404988b43f52b408025eac509444e33b87ec2756bd7cba1acddde315ece2ccbc4c03b9b1e892048d6ff16fe3186fd6792c1fd134131108fd3dfa8927223fad77406a174c6fbce6b9c208dee95ac1b3cf0d6e21cf3075c862bc87c8c4e48df983d8537c72eeb28cd625642c8ff53374480d27253e4324cbeaea29425e227d1edc1df8278fe95a333ba0821b188af19f4baa1af96ca52529a8549dccc98f4ee7381b578bd43022f238c10bf20ddc53741c9b37dc7fd81f1abb39da2d8097029dd1d4d9cdf6c527b317bfdfd367aa2799b47b796cb262f7fa6ef3be760290863c677550fd816fec72cfe5e9c2d1ef92f758a4365423ad1f2c649b67bf9e56e0ca638ac7aa3fd10f4a01e76bbb2b008a9593ee121b1077e431b1c2984dffee639d03070d5b946f339e15616debf0563a9813748710ccf985d2cfc229b0252d9af31c8615ab554e7e262dd4b0d8f9d376da5236a33548313f9be056171b6c104953c11483f0ef8a7ab2e7d33599a083eceddaea36e0f19c46c2e9d22d79b6aed6e0614bf615eb684f736f387915d2f8cfd86b7ed9ca2388d459728fc07a3d6b778458a55bf97a4b34c16b4538c91e1b7fdd8a12fa05bc199b31f3fc3b997ee9917ce720ab5a80aba35f299950b8d553b5e5943ded76c0eb8bd1cd6f6315b914ce67c61c54930fd62cef0ff6c0b12a0f8d969f19b75e2d5ed0b9540906041658da8da1bab6b1004c65bc6422ae5036bff55516ebd816916023f5c0905d055920abe2dd7f8ef8b59e23ff01dbdf3c3f40a21d71c4d0fad9595911019055e4527ad3f7cde48900d9ce3e5e931b5b158d49686efb64aaa5f24c1f8852fdbd0f607b7a332a9bbf67749af5479e3f0267bfb0f374ebbd3beaae5c349570c5c65b868340a939eea3f8d99390d422db4200822675501c811d3a38fc713cb159b1a04802963728c5861629d07f9e2b8b5eb9d50f2f3691c6d71b39f013978de30e0dbdac04f3b5dd9f6cf46ea661576828cb9eb26de98a0e55e99da5778e4942ebbeeb426282038c656d502b1ec7824f8a8b90aab41a11d105b14834ffd5339db38a3068f6c5a1e6a58b8e5051198a5ffb13e079ccf728d98726991f5720f63e502ee7394dd729f0b9cedbb9234ba5a073c4af80ba505e46ec315bad8882e2b051e1dcc57d0992c2b2526ab5cc18f6fb5337d3e5622c9c0448bcade8eaa2ea0adaf97c6e03dbe9c30eda770186117fbea8d80feb111e214cb6cfdf58ccb70a367cf246234e067be908d049e1fa59f6eea1b6a1c12b641db10f60e25f6660965c8b673e1d1991d6323c7278a722af016232ab83beecf87dcc0ace57696c5ff432a6baca1e52f04b15c7b6a5b4a2fb7ab38cae81aa89dfedecbe90f93b21904384d1ef6a75a2cf2570243151dd28e6c6b268beb7ab2acdf0dd5688e63474914742b9b7f7082f1cbf6dda800c8d065b2a7c66d195a04670d89bfc61a529925515b5f979fd71f6a417c43417d23c6e70ba46f735729d2ed21507677aaa679ae0bb18b62ce97df8807cb2e9fc17788c8879a3718be16a5c1c234a20efb13b7f20120cff7980be6313d742b548f913a55ed5886b01a971c32ac7b1d9904aaaf756b1dcc401f8bbcdee9db0356c5df357de82e962bc32e840cfcb5ce3d2299302fbea5cc2f43175b8f948a6b1d92f20ae097bf16218b0a68c04f680d82ebe982f759cbe619316fdf217942d337f694ca10a9ef6aa567bbd834ead86f06612da4a23b1310f1331d253c3ce81d588249a0f546b85956405dfd5bf4042ba5d3dc319da307862a0c14de2539376f3805955d2901ca5fc8ec7163eb0d3968914801014844e287a247daaa3fc4e7808e0d05b9af9bd3e71883c5b5b70b26dadc1cd49cab7c3a605e277e52f226e7a64c7a5b92f2d2035a446c79d880a5172742221075dc0263d3db1f4e8200e02c8d5bf401254f9701d10efe35da58bed6e795e5840349b9d35b7424ae758fd1b7180b5596de6acf725917382b30c2c3579f18119e6763a9ee4cb4f71bff1d3828dbf4e97f5bc09f74d8496e9939244fd91536fc219d3981456eade86c47275d8056b5f3034b99f343596718e381c23644afd02d835f3ab411be8f089b9b40f27a6573a581d24310262f7a0d46d367174cdf2c7e78afaa5a0f6fcc3811f61abe17cb532f450622b4c7323e66fbe46502f9e454ae9c5b5c6d2b360b69d1a1689c6ab4d2ec9230b9f47dad70bdf077a81e90d08f22bf1ebde8e8f2ddc0ddf1a80e0d69effaee620816a2c54cd9da2a3100fb4f34b0c1a5a766fe9fd9b5ef6ba47783c0270af3762783c9d0ed3623b5908a9ff3a8ad8d2676aaf69870e51d9f1bd5021765b9ac6c0fee2846b39ef69c96df482fc2b2cf45ea1b5bd427ae18b3378ef1f760909966effee0060bca9cddc151db36d708a0715de06f6b705455ca77997b2da4c837768a9207f58845049ef3a37b195a5fa379b7b6bf8378f3d5c16b9d399b75af620048684a3c82bd8ca798f6c0b8880dd5b191e9dc56729268d568f2836a711f8959476a7130fad7a0d24682cc9ab73cb13c5fee9be163ae451c5da8d530dae978ad044bd7d19dc742058b410c7116ff91346ccc056ee18de22cb3913ce952d9c2b56576413e323abd39e6ac3ab23dcc2c8c06109ccdd261b70063686f3355ca0deba671c6c68b4b19da79dddf1408290c3cbb0a0b6b17c7e06379770a2e00ad663dc14a920f1c369ad4ab42b0df333a5557e84a4f876e31f0e9c27e8695a91e45c118ed759f41a18798594fb6341a7106756952d2bb26594c9dbdeedf362ad82df0095eb17d6aa52ead894e81101229cc802f5ded7c1937ba9370abbb876ff13e0d482910d7bca3239885c33c52ac1317c6a936b5ff339ebda9e0c0d1b5a20868965005c126c40bcc8d254107b96793bcc3da8dd81f19661ec82fc6d9818dbb884b30a34702f9472f0752a257a55e5d49b261150967d9386383f806a6801697d2d54492fb807b28604a4300706868cb886d1075e50e764431d8f7fd788de282d24e40ce42da33cd451d025d4dc4e1f2256959d2a7d2305cec8177e392b1019b59c63039b77b29073f89810a479a50c0cc2425aa3c6fd0f92842e3b8aca0d9229d5e6e8a0f9b60495b22e9bdcebb456bc3ec20168c4f55300c2b0fdc10de3f0de72df93222bff14a0edd275de187e4b8fea39bfff7226287454a838b66e6c9be95dc6f58a4a01a10d35073b9da7e393e372829a3fe3785d4565b2676ba9de3467a07c48a3dc0c9cf7b7efbbbf6a8bd437ac676bf71327b19d64327a06eac829e094755e18261641d04126cabe034e130343f01ef27bff6cf089aa58c3dcc55c05e921bac518c2566e0fb79b8bf2f8dec4963be21e86d12e4fec93cc66ae188f224d0d781a6dfe51716775bc49730ca94bf313cdbafa68879eaa2c5d78e27ae5a9f29b531a44499726277c38354a9b192cfc59235a799090c33f31abdf49a731375319a66e96eaa8d320eef1d6be96c05afa36e0a58e1fc1849bf8b59f1238553566a5eeedc189a6e0b8509eff118074c49a337059e3a87d2ad8854318ce2344af5c0ed2d0f1358fc5cf08c336a99201f157d26bedf12d20bf995cefa4f839c7eff4f169f433fd90d6db0abddabd40ee9bce3599746631a166c396cdaf22181eb338324283cdbd8aac76092a8b081d64e1f38a5435e9c981eb175bd27324f261f3e7d6912c6e3631a678d5de1dfeae06d8608c8d0ab842ae20fc1bb4b6e6137f68299687f48ac79de71230a4480d17444b5fe5db0a0bf4504e0463d344625eb52dfefd583dea9edb8813d441d7dec84f70c013ebf987e7dfdf55a7e69046b41836714588eb623c16a5677f38a265c65fc8185a77f755fb2dc652a0c68ab4e6b3e54ef72b0e58baf41364b1dd81dd315aabdb03c05c10c7586f082689ca0f3b3611279dcef81c742c75b2f939cccdda2016e1a6b2a9ddf0d803f40a939d444bcbd7d83083559405b2d2de3539cb594ee5f6b2765a3ee9970e5387d8dbbbbebe1f617b5157c5738ad28f394974e61d6d9808fcf831c4a9e631df44aecf88f7a3f6b2a8e9149e478ac6c6f252e97a56b8e5d486721674a4cb8ba758e04187ec2315b744b9206917554a5600a7db2f42ca4b1bcf958dab1c3c6502299c379f0026f7e8c0be0e01105dd4a174cf11e6827552092e1dffc43d478f55326c982ead3ace38928ae904bbde45ef86f133d9891578ae6253640fbc808c033923e5e6cae4e89cc58074d5e5a77c953c1efa39875e22a7090cb22566e0499e4fb23abffcb2db5d472e1e14a5b7fd7feb306bd4552868327a5af84888ad375ba76ed87363ff0173308815499486d408f2f5e1522404a6b61eb8e61175559a5edb83c44c03be845cac91642bb88087c6b426c7fd4c2372b994ebbb4b6de0a5bb148c623309621ce417dfc9230a39bc00e817e5792af9955e9504c715cb684a6013e1f7d1b42ea8ead5e0665a72359266a216bb0ec8daced3ecee3d969ab8cc1decde56fd47517d835e12295b5896d8db1b44174fac6508fb7314f3e7caa248fbfd474236d8f46939dfbf3a6e0a6eedf83b108a86dcb4033eb52f85915759ff31fb4bb829fd000"})
dup(0xffffffffffffffff)
socket$unix(0x1, 0x0, 0x0)
exit_group(0x4000800004)
r0 = syz_open_dev$tty1(0xc, 0x4, 0x1)
writev(r0, &(0x7f0000000080)=[{&(0x7f0000000100)="8e0ee8679b1b28429b334a", 0xff8d}], 0x1)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x0, 0x12, r1, 0x0)
r2 = gettid()
setsockopt$sock_timeval(r0, 0x1, 0x0, &(0x7f0000000040)={0x77359400}, 0x10)
r3 = getpid()
tgkill(r3, r2, 0x2f)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
setsockopt$inet6_tcp_int(r0, 0x6, 0x7, &(0x7f00000001c0)=0x4, 0x4)
tee(0xffffffffffffffff, 0xffffffffffffffff, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x2200c0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
readlinkat(r0, &(0x7f0000002300)='\x00', &(0x7f0000002340)=""/40, 0x28)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
setsockopt$inet6_int(r0, 0x29, 0x40, 0x0, 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
sendmsg$inet(r0, &(0x7f0000000140)={0x0, 0x0, 0x0}, 0x0)
bind$inet(r0, &(0x7f0000000000)={0x2, 0x0, @loopback}, 0x10)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit_group(0x0)
r0 = socket(0x400000010, 0x3, 0x0)
write(r0, &(0x7f00000006c0)="fc0000001400073eac093a00090007000aab0800080000000400e293210001c000000000060000000100000009000000fa2c1ec28656aaa79bffffffff000000bc00024000036c6c256f1a272fdf0d11512fd633d4400007f60eb9fa2e6b00000000fd368934d07302ade01720d7d5bbc91a3e2e80772c05f70c9ddef2fe082038f4f8b29d3ef3d92883170efdffffff3ae4f50504000000000040d815b2ccd243f295edbabc7c3f2eeb57d43dd16b176e83df150c3b8829411f46a6b56787e6e158a1ad0a4f41f0d48f6f0000080548deac270e37429f3694dec896592d69d381873cf1582740000000000000001ace36f071d0c227000000000000", 0xfc)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='gid_map\x00')
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setgid(0xee01)
r1 = inotify_init1(0x0)
fcntl$setown(r1, 0x8, 0xffffffffffffffff)
fcntl$getownex(r1, 0x10, &(0x7f0000000080)={0x0, <r2=>0x0})
ptrace$setopts(0x4206, r2, 0x0, 0x0)
timer_create(0x0, 0x0, &(0x7f0000000080))
timer_settime(0x0, 0x1, &(0x7f00000002c0)={{0x0, 0x989680}, {0x0, 0x3938700}}, 0x0)
pwrite64(r0, 0x0, 0x0, 0x100000001)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
openat$dir(0xffffffffffffff9c, &(0x7f00000001c0)='./file1\x00', 0x51f183, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x4000812, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000040)='net/snmp\x00')
read$FUSE(r1, 0x0, 0x0)
openat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x82541, 0x0)
setxattr$trusted_overlay_opaque(&(0x7f0000000000)='./file0\x00', &(0x7f0000000080), 0x0, 0x0, 0x0)
lgetxattr(&(0x7f0000000340)='./file0\x00', &(0x7f0000000380)=@known='trusted.overlay.opaque\x00', 0x0, 0xffffffffffffff7b)
clone(0x20806300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$full(0xffffffffffffff9c, &(0x7f0000000c00), 0x40102, 0x0)
r1 = socket$inet6_udp(0xa, 0x2, 0x0)
connect$inet6(r1, &(0x7f0000000300)={0xa, 0x4e28, 0x0, @loopback}, 0x1c)
connect$inet6(r1, &(0x7f0000000300)={0xa, 0x4e28, 0x0, @loopback}, 0x1c)
ftruncate(r0, 0x7fefffe1)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat$random(0xffffffffffffff9c, &(0x7f0000000180), 0x2100, 0x0)
close(r1)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
setsockopt$inet_icmp_ICMP_FILTER(r0, 0x1, 0x15, &(0x7f0000000040), 0x4)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000080), 0x80000000000a01, 0x0)
ioctl$TCSETSW(r0, 0x5403, &(0x7f0000000240)={0x0, 0x0, 0x0, 0xfffffffd, 0x0, "00800000a2b70021000010000093000400"})
write$binfmt_aout(r0, &(0x7f00000000c0)=ANY=[], 0xffffff78)
ioctl$TCSETS(r0, 0x40045431, &(0x7f0000000200))
r1 = syz_open_pts(r0, 0x0)
readv(r1, &(0x7f00000000c0)=[{&(0x7f0000001940)=""/4076, 0xfec}, {&(0x7f0000000280)=""/213, 0xd5}], 0x2)
ioctl$TCSETA(r1, 0x5406, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x2ea, 0x0, "0400"})
r2 = openat(0xffffffffffffff9c, &(0x7f00000008c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setsockopt$IP_VS_SO_SET_ZERO(0xffffffffffffffff, 0x0, 0x48f, &(0x7f0000000000)={0x0, @empty, 0x0, 0x0, 'ovf\x00'}, 0x2c)
read(r1, &(0x7f0000000100)=""/19, 0x13)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
r1 = socket$inet(0x2, 0x3, 0x1)
getsockopt$IPT_SO_GET_REVISION_TARGET(r1, 0x0, 0x43, &(0x7f0000000140)={'TPROXY\x00'}, &(0x7f0000000180)=0x1e)
tkill(r0, 0x40)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
recvfrom$unix(0xffffffffffffffff, 0x0, 0x0, 0x40000020, 0x0, 0x0)
rt_sigreturn()
r0 = socket(0x1, 0x1, 0x0)
ioctl$sock_inet6_tcp_SIOCINQ(r0, 0x541b, &(0x7f0000000040))
prctl$PR_SET_SECCOMP(0x16, 0x2, &(0x7f0000002300)={0x2, &(0x7f0000002400)=[{0x45, 0x1}, {0x6}]})
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
bind(r0, &(0x7f0000000000)=@in6={0xa, 0x0, 0x0, @dev={0xfe, 0x80, '\x00', 0x3c}, 0x2}, 0x80)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f00000000c0)='/proc/self/exe\x00', 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000080)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={0xffffffffffffffff, <r2=>0xffffffffffffffff})
fcntl$lock(r2, 0x7, &(0x7f0000000180))
fcntl$lock(r2, 0x7, &(0x7f0000000000)={0x0, 0x0, 0x4, 0x4000001})
fcntl$lock(r2, 0x7, &(0x7f00000011c0)={0x0, 0x0, 0x800, 0xfffffffffffffffd, 0xffffffffffffffff})
fcntl$lock(r2, 0x7, &(0x7f0000000040)={0x2, 0x0, 0x7fff, 0x0, 0xffffffffffffffff})
exit(0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000000000)='fdinfo\x00')
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
fchown(r0, 0xffffffffffffffff, 0x0)
clone(0x6900, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet(0x2, 0x801, 0x0)
bind$inet(r0, &(0x7f0000000040)={0x2, 0x0, @broadcast}, 0x10)
connect$inet(r0, &(0x7f0000000180)={0x2, 0x0, @local}, 0x10)
connect$inet(r0, &(0x7f0000000000), 0x10)
setrlimit(0x0, &(0x7f0000000080))
syz_emit_ethernet(0x42, &(0x7f0000000000)={@empty, @remote, @void, {@ipv4={0x800, @icmp={{0x6, 0x4, 0x0, 0x0, 0x34, 0x0, 0x0, 0x0, 0x1, 0x0, @remote, @local, {[@timestamp={0x44, 0x4}]}}, @redirect={0x5, 0x0, 0x0, @multicast1, {0x5, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @dev, @remote}}}}}}, 0x0)
r0 = openat$full(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
fremovexattr(r0, &(0x7f00000000c0)=@known='user.incfs.id\x00')
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f00000002c0)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x9, 0x0)
chdir(&(0x7f0000000200)='./file0\x00')
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
symlink(&(0x7f0000000040)='./bus\x00', &(0x7f00000000c0)='./file0\x00')
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = openat$full(0xffffffffffffff9c, &(0x7f0000004fc0), 0x0, 0x0)
lseek(r0, 0x0, 0x0)
perf_event_open(&(0x7f00000003c0)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3c40}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
set_mempolicy(0x4001, &(0x7f0000000140)=0x100000000000001, 0x9)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
pipe2(&(0x7f0000000080)={<r0=>0xffffffffffffffff}, 0x6000)
write$P9_RREMOVE(r0, 0x0, 0x0)
exit_group(0x0)
bind$inet(0xffffffffffffffff, 0x0, 0x0)
syz_emit_ethernet(0x32, &(0x7f0000000140)={@local, @remote, @void, {@ipv4={0x800, @udp={{0x5, 0x4, 0x0, 0x0, 0x24, 0x0, 0x0, 0x0, 0x11, 0x0, @empty, @broadcast=0xe0000001}, {0x0, 0x4e22, 0x10, 0x0, @gue={{0x2}}}}}}}, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
getgroups(0x1, &(0x7f0000000140)=[<r1=>0xffffffffffffffff])
getpid()
rt_sigqueueinfo(0x0, 0x0, 0x0)
setgroups(0x1, &(0x7f0000000180)=[r1])
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000240), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f00000001c0)=[{0x0}], 0x1, 0x0, 0x0)
msync(&(0x7f0000ff0000/0x1000)=nil, 0xffffffffdf00ffff, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000002040), 0x200100, 0x0)
fchown(r0, 0xffffffffffffffff, 0x0)
clone(0x20002000, 0x0, 0x0, 0x0, 0x0)
io_getevents(0x0, 0x0, 0x0, 0x0, 0x0)
clock_getres(0x0, &(0x7f00000002c0))
io_setup(0x1, &(0x7f0000000080)=<r0=>0x0)
clock_gettime(0x0, &(0x7f00000000c0)={0x0, <r1=>0x0})
io_getevents(r0, 0x2, 0x2, &(0x7f0000000200)=[{}, {}], &(0x7f0000000140)={0x0, r1+60000000})
syz_emit_ethernet(0xc6, &(0x7f0000000100)={@local, @empty, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "9ea4b1", 0x90, 0x0, 0x0, @remote, @mcast2, {[@hopopts={0x0, 0x10, '\x00', [@generic={0x0, 0x80, "329fdb593b65d7e693cd211c7dbee4bd889a9d67ffc1c75af89dfb7cb4f1886fd788be9af3ac4121fd5f181946569acf24a80dc8bc5b4d77cebafb690de8910dc84f668bd6b812f1c082163f0603d7b9890c5712a83ce69ad5f4dbf574ca5fc31c82f4b2a8cc27fd1bea5542f47671de379fc1822aea6c24cbeb081183cac2e0"}]}]}}}}}, 0x0)
clock_gettime(0x812de26aba8f82ab, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x42, 0x3, 0x0)
pipe(&(0x7f00000001c0)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff})
pipe(&(0x7f0000000080)={0xffffffffffffffff, <r3=>0xffffffffffffffff})
write$binfmt_aout(r2, 0x0, 0x20)
splice(r1, 0x0, r3, 0x0, 0x10000, 0x0)
dup2(r3, r2)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000000)={0x2, 0x0, @local}, 0x6)
r1 = gettid()
tgkill(r1, r1, 0x10)
r0 = openat$tun(0xffffffffffffff9c, &(0x7f0000000000), 0x210402, 0x0)
fcntl$setflags(r0, 0x2, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
readlink(&(0x7f00000012c0)='./file0\x00', &(0x7f0000001300)=""/54, 0x36)
clone(0x2000eb00, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
clone(0x0, 0x0, 0x0, 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
execveat(0xffffffffffffffff, &(0x7f0000000140)='./file0\x00', &(0x7f0000000240), 0x0, 0x0)
clone(0x30005100, 0x0, 0x0, 0x0, 0x0)
r1 = gettid()
rt_tgsigqueueinfo(r1, r1, 0x1, &(0x7f0000000440))
tgkill(r1, r1, 0x10)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r1 = syz_open_procfs$namespace(0x0, &(0x7f00000000c0)='ns/pid\x00')
fchown(r1, 0x0, 0xee01)
exit_group(0x0)
mprotect(&(0x7f0000fff000/0x1000)=nil, 0x7f7fffffb000, 0x3)
r0 = openat$pidfd(0xffffffffffffff9c, &(0x7f0000001300), 0x0, 0x0)
close(r0)
creat(&(0x7f0000001400)='./file0\x00', 0x0)
fsetxattr$trusted_overlay_origin(r0, &(0x7f00000025c0), 0x0, 0x0, 0x0)
fremovexattr(r0, &(0x7f0000000000)=@known='trusted.overlay.origin\x00')
set_mempolicy(0x1, 0x0, 0x0)
sync()
mkdir(0x0, 0x0)
ftruncate(0xffffffffffffffff, 0x8200)
write$cgroup_type(0xffffffffffffffff, &(0x7f0000000200), 0x175d900f)
recvmmsg(0xffffffffffffffff, &(0x7f0000007280)=[{{&(0x7f00000003c0)=@llc={0x1a, 0x0, 0x0, 0x0, 0x0, 0x0, @remote}, 0x80, &(0x7f0000000600)=[{&(0x7f0000007500)=""/73, 0x49}, {&(0x7f0000000480)=""/122, 0x7a}, {&(0x7f0000000500)=""/241, 0xf1}, {&(0x7f0000000680)=""/152, 0x98}], 0x4, &(0x7f0000000740)=""/255, 0xff}}, {{&(0x7f0000000840)=@can, 0x80, &(0x7f0000000d00)=[{0x0}, {&(0x7f00000009c0)=""/151, 0x97}, {&(0x7f0000000a80)=""/247, 0xf7}, {&(0x7f0000000b80)=""/163, 0xa3}, {&(0x7f0000000e80)=""/4096, 0x1000}, {&(0x7f0000000c40)=""/175, 0xaf}], 0x6}, 0x8}, {{0x0, 0x0, &(0x7f0000000dc0)=[{&(0x7f0000001e80)=""/231, 0xe7}, {&(0x7f0000001f80)=""/146, 0x92}, {&(0x7f0000002040)=""/230, 0xe6}], 0x3, &(0x7f0000002140)=""/185, 0xb9}, 0x1}, {{0x0, 0x0, &(0x7f00000022c0)=[{&(0x7f0000002280)=""/2, 0x2}], 0x1, &(0x7f0000002300)=""/252, 0xfc}, 0x8}, {{&(0x7f0000002400)=@tipc=@name, 0x80, &(0x7f0000002500)=[{&(0x7f0000002480)=""/113, 0x71}], 0x1}}, {{&(0x7f0000002700)=@in={0x2, 0x0, @initdev}, 0x80, &(0x7f0000003800)=[{&(0x7f0000002780)=""/34, 0x22}, {0x0}, {&(0x7f0000002800)=""/4096, 0x1000}], 0x3}, 0x5}, {{&(0x7f0000003840)=@phonet, 0x80, &(0x7f0000003cc0)=[{&(0x7f00000038c0)=""/130, 0x82}, {&(0x7f0000003980)=""/128, 0x80}, {&(0x7f0000003a00)}, {&(0x7f0000003a40)=""/107, 0x6b}, {&(0x7f0000003ac0)=""/151, 0x97}, {&(0x7f0000003b80)=""/81, 0x51}, {&(0x7f0000003c00)=""/162, 0xa2}], 0x7, &(0x7f0000003d40)}, 0x80000001}, {{&(0x7f0000003d80)=@in={0x2, 0x0, @initdev}, 0x80, &(0x7f0000003e40)=[{&(0x7f0000003e00)=""/7, 0x7}], 0x1, &(0x7f0000006280)=""/4096, 0x1000}, 0x800}, {{&(0x7f0000003e80)=@can, 0x80, &(0x7f0000004140), 0x0, &(0x7f0000004180)=""/69, 0x45}, 0x80000001}], 0x9, 0x40000120, &(0x7f0000004200)={0x0, 0x3938700})
read$FUSE(0xffffffffffffffff, 0x0, 0x0)
perf_event_open(&(0x7f0000000100)={0x1, 0x69, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3c41}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
r0 = socket$inet6(0xa, 0x2, 0x0)
bind$inet6(r0, &(0x7f0000000000)={0xa, 0x14e24}, 0x1c)
setsockopt$inet6_int(r0, 0x29, 0xb, &(0x7f00000000c0)=0x801, 0x4)
recvmmsg(r0, &(0x7f00000004c0), 0x2bd, 0x0, 0x0)
connect$inet6(r0, &(0x7f0000000640)={0xa, 0x1000000000004e24, 0x0, @empty}, 0x1c)
r1 = getpid()
sched_setscheduler(r1, 0x0, &(0x7f00000001c0))
read$FUSE(0xffffffffffffffff, &(0x7f0000004240)={0x2020}, 0x2020)
r2 = fcntl$dupfd(r0, 0x0, r0)
perf_event_open(&(0x7f0000000340)={0x0, 0x70, 0xfd, 0x3, 0x0, 0x8, 0x0, 0xffffffffa1def973, 0x1000, 0x2, 0x1, 0x0, 0x0, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x1, 0x1, 0x0, 0x1, 0x1, 0x1, 0x2, 0x0, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x95, 0x4, 0x0, 0x7838, 0x0, 0x1, 0x6, 0xa77, 0x2, 0x4b8}, r1, 0xa, r2, 0x2)
sendmmsg(r0, &(0x7f00000092c0), 0x4ff, 0x0)
r0 = creat(&(0x7f0000000040)='./file0\x00', 0x1e)
close(r0)
r1 = open(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
unlink(&(0x7f0000000040)='./file0\x00')
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x180000b, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
execveat(r1, &(0x7f0000000000)='\x00', 0x0, 0x0, 0x1000)
clone(0x100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = semget$private(0x0, 0x2, 0x0)
semtimedop(r1, &(0x7f0000000080)=[{0x0, 0xe265}], 0x7, 0x0)
ptrace(0x10, r0)
perf_event_open(0x0, 0x0, 0x0, 0xffffffffffffffff, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
r1 = socket$inet(0x2, 0x3, 0x1)
getsockopt$IPT_SO_GET_REVISION_TARGET(r1, 0x0, 0x43, 0x0, 0x0)
r2 = openat(0xffffffffffffffff, &(0x7f00000002c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
tkill(r0, 0x40)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000100)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000000c0)={0x0, <r1=>0x0})
sendmmsg(r1, &(0x7f0000008600)=[{{0x0, 0x0, &(0x7f0000003140)}}, {{&(0x7f00000072c0)=@un=@file={0x1, './file0\x00'}, 0xa, &(0x7f0000007380), 0x0, &(0x7f0000000600)}}], 0x2, 0x0)
r0 = signalfd(0xffffffffffffffff, &(0x7f0000000000), 0x8)
unlinkat(r0, &(0x7f0000000040)='./file0\x00', 0x0)
perf_event_open(&(0x7f0000000180)={0x2, 0x70, 0x69, 0x1}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = open(&(0x7f00009e1000)='./file0\x00', 0x102440, 0x0)
fcntl$setsig(r0, 0xa, 0x11)
fcntl$setlease(r0, 0x400, 0x1)
open(&(0x7f00009e1000)='./file0\x00', 0x0, 0x0)
close(r0)
creat(&(0x7f00000000c0)='./control\x00', 0x0)
r0 = inotify_init()
r1 = epoll_create(0x5)
epoll_ctl$EPOLL_CTL_ADD(r1, 0x1, r0, &(0x7f00007a8000))
perf_event_open(&(0x7f0000940000)={0x2, 0x70, 0xfffffffffffffffd, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r2 = inotify_add_watch(r0, &(0x7f0000000000)='./control\x00', 0x70)
epoll_wait(r1, &(0x7f0000000040)=[{}], 0x39f, 0xc36)
inotify_rm_watch(r0, r2)
epoll_ctl$EPOLL_CTL_MOD(r1, 0x3, r0, &(0x7f0000000080)={0xa4000011})
setitimer(0x2, &(0x7f0000000000)={{}, {0x77359400}}, 0x0)
getitimer(0x2, &(0x7f0000000080))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000440)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
clone(0xb00c2000, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
ioctl$sock_ipv4_tunnel_SIOCDELTUNNEL(r0, 0x8947, &(0x7f0000000040)={'tunl0\x00', 0x0})
clone(0x0, &(0x7f0000000080)="7f9d5671f006141b8e468fe245f8ec5e2a5e7287192526e5e0914834989d2dfe25664a675aed783239a47153dc5650a19d66b253426da3947820a01b9fb377ba02", &(0x7f0000000100), &(0x7f0000000140), 0x0)
clock_gettime(0x0, &(0x7f0000000000))
sched_getscheduler(0x0)
sched_rr_get_interval(0x0, &(0x7f0000000040))
clone(0x60804780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = creat(&(0x7f00000006c0)='./file0\x00', 0x0)
clone(0x247ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = syz_open_procfs(0x0, &(0x7f0000000040)='task\x00')
exit(0x0)
lseek(r1, 0x0, 0x0)
fallocate(r0, 0x0, 0x0, 0x4000000e)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
openat$sysfs(0xffffffffffffff9c, &(0x7f0000000140)='/sys/class/power_supply', 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x28041, 0x0)
r1 = inotify_init1(0x0)
dup2(r1, r0)
ioctl$TUNGETSNDBUF(r0, 0x800454d3, 0x0)
rt_sigreturn()
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x10000000013, &(0x7f0000000100)=0x1, 0x4)
setsockopt$inet_tcp_int(r0, 0x6, 0x14, &(0x7f0000788ffc)=0x100000001, 0x4)
bind$inet(r0, &(0x7f0000000000)={0x2, 0x4e21, @local}, 0x10)
connect$unix(r0, &(0x7f0000000180)=@file={0x0, './file0\x00'}, 0x6e)
connect$inet(r0, &(0x7f0000000080)={0x2, 0x4e21, @local}, 0x10)
sendto$inet(r0, &(0x7f0000000140)="1102962a3b814a508870092479bd5f", 0x1, 0x8000, 0x0, 0x59)
setsockopt$inet_tcp_TCP_REPAIR_WINDOW(r0, 0x6, 0x1d, &(0x7f00000000c0)={0x0, 0x0, 0xde}, 0x14)
recvfrom$inet(r0, &(0x7f0000000200)=""/4084, 0xff4, 0x62, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
execve(&(0x7f0000000040)='./file0\x00', 0x0, 0xffffffffffffffff)
r0 = semget$private(0x0, 0x2, 0x0)
r1 = semget(0x2, 0x2, 0x713)
semctl$SEM_STAT_ANY(r1, 0x0, 0x14, &(0x7f0000000000)=""/4096)
semctl$SETALL(r0, 0x0, 0x11, 0x0)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x0, 0x0, 0x0, 0x0, 0x0)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f00000000c0)='uid_map\x00')
ppoll(0x0, 0x0, 0x0, 0x0, 0x0)
write$tcp_mem(r0, &(0x7f0000001380)={0x0, 0x20, 0x8, 0x20, 0x1}, 0x48)
timer_create(0x0, &(0x7f0000000180)={0x0, 0x0, 0x3, @thr={0x0, 0x0}}, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x106300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rt_sigreturn()
mkdirat(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x0)
mount(&(0x7f0000000040)=ANY=[], &(0x7f0000002280)='./file0\x00', &(0x7f00000022c0)='devtmpfs\x00', 0x400, 0x0)
r1 = open$dir(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
getdents64(r1, &(0x7f0000000340)=""/185, 0xfffffffffffffd0d)
creat(&(0x7f0000000280)='./file0\x00', 0x0)
io_setup(0x103, &(0x7f0000000040)=<r0=>0x0)
r1 = openat$fuse(0xffffffffffffff9c, &(0x7f0000000140), 0x2, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x100000000000008d, 0x4, 0x0)
io_submit(r0, 0x0, 0x0)
mount$fuse(0x0, &(0x7f0000000400)='./file0\x00', &(0x7f0000000000), 0x0, &(0x7f0000000080)=ANY=[@ANYBLOB='fd=', @ANYRESHEX=r1, @ANYBLOB=',rootmode=00000000000000000040000,user_id=', @ANYRESDEC=0x0, @ANYBLOB=',group_id=', @ANYRESDEC=0x0])
io_submit(r0, 0x2, &(0x7f0000000640)=[&(0x7f0000000180)={0x0, 0x0, 0x0, 0x1, 0x0, r1, &(0x7f0000000200)="0210000006ffffffffff600000000000", 0x1002}])
sched_getparam(0xffffffffffffffff, 0xfffffffffffffffe)
r0 = socket$netlink(0x10, 0x3, 0x0)
sendmsg$netlink(r0, &(0x7f0000000900)={0x0, 0x0, &(0x7f0000000000)=[{&(0x7f0000001ac0)={0x334, 0x15, 0x1, 0x0, 0x0, "", [@nested={0x1c1, 0x0, 0x0, 0x1, [@generic="9f98c1e11b034e7190b1cc8af7832493126ee03522e79d52676666d6234ed168eaf9b09fc8d32f2a6e5d61915c144f7d221206adc69a81e2125db7478afc8c7043df9414609521d8f915719acc8d2216983786a4f5f635ff9f70a2c5b5e53b81e2513f1f9d2bf5864a1db00ceb08e93732e7bb7c846ffcf09fe1d7b707554e74468c0260aac56462770df83a995d76049e474cd7373198db689ef485b8731741160b369690b8930678143ef7f9aea3bbc71f3fb5ca487a0308ed16baba45cc5dae89b0fc0ad80e57d8d543a3459da552336965ac3c4bf3b70587a3170e1668c5e9faa353560af95984c6db51469e54d93967282d223c16f85710cd2d", @generic="0cbeb00f34495fed193b03d7368ca25cb4a084e25785e983d6a32001949c35e9897f68684d6b63f6ced110faa348ec001c7a5c38425d4d10a1d1991cfec58712975c0a2a9b19ee278a4d20b5357a91c7f6bfcaa057bea1b1fe86db2761136ea5023f57e41a0cdb50d5b294f8cc10b6bc6c408a8a3bb0682e87a55483d96098eb85a553a23acf0fb9aa578781afc4f0931996e66b90ac49cae094e89dd7307c61633c36f25793b6bbe4a9f5c595ca9ac4c5dffa7e6b3bead1446cb75d87891a984e"]}, @nested={0x15d, 0x0, 0x0, 0x1, [@generic="eda177ff86c7df84794cd6e7e9083b1cf6d275b4de5d3197f9e23d5de69d968a15787f49f2c4013fb3fe51391ef982bef49a9ac41d1d7f77d2f322b916cd125f63b716a278bfeaccb609abe8b56c7ef3be7442086c98ad2479ffe5cd0c0533583c8474d6f01fc4e8d525ac8175b61894359aa73bca746c20b0dcdbc5f2c584e5db43fa99ec55f9242dd98a3c", @generic="41d5ab0c2e9b0eb483e84f0b0484d9ebd0abb331d6b78a83a48b5dfcc6cff295b93fae668b09c98658909f18911b496a1223a6502f183045d1391d6dea7fbc37a0fafb390c144f8ff4eea7ce9bffe423df4623e2e8dee4c156be2764bed93727b75fce3c27cb9030b0f7ff61c1ab4cac7bcac30d41607c1ffba9e2d3c0e2c59eca836bc6b9a5ad220c1e653cdecd16aa95e4cd149ce9749302317d059346522d747244910baf6023f2eb4d0a3895d53c558750d53fa5777a0fe9214c499fb03b698fea7f1150f24f8619d79e4b"]}]}, 0x334}], 0x1}, 0x0)
syz_emit_ethernet(0x3e, &(0x7f00000000c0)={@multicast, @empty, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "69aedd", 0x8, 0x0, 0x0, @rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01', @local, {[@srh={0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x5}]}}}}}, 0x0)
capget(&(0x7f00000001c0)={0x0, 0xffffffffffffffff}, 0x0)
pipe2(&(0x7f0000000240)={0xffffffffffffffff, <r0=>0xffffffffffffffff}, 0x0)
write$FUSE_LK(r0, &(0x7f00000002c0)={0x28}, 0xfffffdef)
fcntl$setpipe(r0, 0x407, 0x0)
syz_emit_ethernet(0x95, &(0x7f0000000200)={@local, @remote, @void, {@ipv6={0x86dd, @tcp={0x0, 0x6, "c8ac9a", 0x0, 0x3a, 0x0, @private1, @local, {[@srh={0x0, 0x0, 0x4, 0x0, 0x0, 0x0, 0x0, [@mcast2, @mcast2, @mcast2, @local, @empty, @rand_addr=' \x01\x00', @ipv4={'\x00', '\xff\xff', @local}, @mcast2]}, @dstopts={0x0, 0x0, '\x00', [@hao={0xc9, 0x0, @private2}, @generic={0x0, 0x0, "46c24b0e65256608d6558f16448241d68479eed4524d088602481e3f1f249fb6d726cd3b9be56ac6115e22124f364ea51b4c28eb8d63aa8335c4791ee8f2bcc95ebbded2ed5cf648c331ebdf6c8c7b374a2a03907d016ce70ecb0aa40bf0eebf139cf21b17d0a81ec4995171a837e50e7fbb34b551f8e1e14456c8d5883bc8f37bb119d366569614dfa6ce1d76357acd4367db330ef94bb3a3d82bd5c491350599f30f701d7b9d37016858fb27694548b0c312f4685052d67e777dc7c29e564f7b"}, @generic={0x0, 0x0, "916c3dd8ea47e7f07f9ba7fcc9294381573cbe6977b9b5606bb4a77657bda28d0990fca8a7ff8665facb80d7a02b929027fbf76658f050dbb4fc09a02ddc79d149c6d47fc34f74ca51a09499de350a"}, @calipso={0x7, 0x0, {0x0, 0x0, 0x0, 0x0, [0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0]}}]}, @hopopts={0x0, 0x0, '\x00', [@generic={0x0, 0x0, "bd641f963fd9a5a220947b623d381d75686d0ed3aa38ea3e99f096435f43d02025de06d0287f3a4dbab33e90af75e5c3c8ca763b19ee9942cf46a6f1ed7430887e52f86d366a6cfe572f9cfb439fc4305710cf787bf2ba121157edcdf10e834a9fcb83c0bc657d5ddaaf6856315567486be584732121a581c3e55a8fc48a4d4858ec3cc185069c20808ebcf2a00a6e2bbcb1407030459d0c1080d632470ea5bbc29811ed0d379df3747619c3"}, @generic={0x0, 0x0, "8b31a6563a"}, @ra, @hao={0xc9, 0x0, @rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02'}]}, @routing={0x0, 0x0, 0x0, 0x0, 0x0, [@private0]}, @routing={0x0, 0x0, 0x0, 0x0, 0x0, [@empty, @mcast1, @rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02', @private1, @initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @private2, @empty, @mcast1, @mcast2]}, @hopopts={0x0, 0x0, '\x00', [@ra]}, @hopopts={0x0, 0x0, '\x00', [@hao={0xc9, 0x0, @dev}, @hao={0xc9, 0x0, @remote}]}], {{0x0, 0x0, 0x41424344, 0x41424344, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, {[@exp_smc, @sack={0x5, 0x0, [0x0]}, @sack_perm, @md5sig={0x13, 0x0, "0aa7d6ad33ca5914d036cd4aef803aa6"}, @mss, @mptcp=@ack={0x1e, 0x0, 0x0, 0x4, "330182d4b18b7d95b1a45bf20f"}]}}, {"7048f716999724bec14e93c6e1c6cad1a54ff245755705960faf6b70f8ee5376"}}}}}}}, 0x0)
syz_emit_ethernet(0x4e, &(0x7f0000000100)={@local, @dev, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0xb, "801d00", 0x18, 0x3a, 0x0, @empty, @mcast2, {[], @mld={0x0, 0x0, 0x0, 0x0, 0x0, @mcast2}}}}}}, 0x0)
io_setup(0x1, &(0x7f0000000300)=<r0=>0x0)
r1 = openat$tun(0xffffffffffffff9c, &(0x7f0000000480), 0x0, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
io_submit(r0, 0x2, &(0x7f0000000140)=[&(0x7f0000000180)={0x0, 0x0, 0x0, 0x0, 0x0, r1, 0x0}, &(0x7f0000000100)={0x0, 0x0, 0x0, 0x0, 0x0, r3, &(0x7f0000000200)="de", 0x1}])
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
capset(&(0x7f0000002ffa)={0x20080522}, &(0x7f0000002000))
prctl$PR_SET_MM_AUXV(0x18, 0xc, 0x0, 0x0)
rt_sigreturn()
clone(0x4000008006ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f0000000080)='stat\x00')
shutdown(r0, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f00000000c0)='fd\x00')
lseek(r1, 0x0, 0x4)
exit(0x0)
clone(0x30005100, 0x0, 0x0, 0x0, 0x0)
syz_open_procfs(0x0, &(0x7f00000000c0)='fd/3\x00')
r0 = gettid()
tgkill(r0, r0, 0x10)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
newfstatat(0xffffffffffffff9c, &(0x7f0000000280)='.\x00', &(0x7f0000001500)={0x0, 0x0, 0x0, 0x0, <r0=>0x0}, 0x0)
setreuid(0x0, r0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x1, 0x0, 0x0)
creat(&(0x7f0000000080)='./file0/file0\x00', 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000040)='memory.events\x00', 0x275a, 0x0)
writev(r0, &(0x7f0000000080)=[{&(0x7f00000002c0)="448154117c936cf04330955be3792c76bc5fb54d66535b6b8b6226c408a2a21d89134727dd58356fb5cc121243368b9ac0faafc9886f6fd5a637182c0dc0ecc40d8f", 0x42}, {&(0x7f0000000340)="740d65b6ad6587ec453d58e8989d255c0150e5443018d231ae343e4f684aa11fa878f0fcad9709fc4a2c05ac3c9820afb8d7afa0831a653b31ed17f65c6be165a4b4fa2adcc2b8d8628013e120a4eb869bda5f4cba8d5642f12d2405957e4bd5c904b55b7626aaaa5a9c6231cc04b5b310a70a96f0ca77696ccbfbf1d12c77cdb11dc21373d0d128edcec0c92b6ebe66ae984378ee3ff4f2b5d94f7b78ed6ab05c55e1d44b04904efe51a16ed6f01579d464dea5af47f0414b8539690d87", 0xbe}], 0x2)
mmap(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x1, 0x10012, r0, 0x0)
mkdirat$cgroup_root(0xffffffffffffff9c, &(0x7f0000000000)='./cgroup/syz0\x00', 0x1ff)
r0 = socket$unix(0x1, 0x1, 0x0)
getsockopt$sock_int(r0, 0x1, 0xa, &(0x7f0000000040), &(0x7f00000000c0)=0x4)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
unshare(0x4000000)
mknodat$null(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x1040, 0x103)
execve(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
openat$dir(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x60002, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = semget$private(0x0, 0x3, 0x1d4)
r2 = openat(0xffffffffffffffff, &(0x7f00000002c0)='/proc/self/exe\x00', 0x0, 0x0)
read$FUSE(r2, &(0x7f0000002f40)={0x2020, 0x0, 0x0, <r3=>0x0}, 0x2020)
r4 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
read$FUSE(r4, &(0x7f0000000280)={0x2020, 0x0, 0x0, 0x0, <r5=>0x0}, 0x2020)
setresgid(0x0, r5, 0x0)
setreuid(0x0, r3)
semctl$GETVAL(r1, 0x0, 0xc, 0x0)
clone(0x30005100, 0x0, 0x0, 0x0, 0x0)
rt_sigqueueinfo(0x0, 0x0, &(0x7f0000000080))
r0 = gettid()
tgkill(r0, r0, 0x10)
r0 = gettid()
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000140)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff})
ioctl$int_in(r1, 0x5452, &(0x7f00000002c0)=0xece)
fcntl$setsig(r1, 0xa, 0x12)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x1, 0x0, 0x0)
recvmmsg(r2, &(0x7f0000004800)=[{{0x0, 0x0, 0x0}}], 0x1, 0x0, 0x0)
dup2(r1, r2)
fcntl$setown(r2, 0x8, r0)
tkill(r0, 0x16)
r0 = socket$netlink(0x10, 0x3, 0x0)
ioctl$sock_SIOCSIFVLAN_ADD_VLAN_CMD(r0, 0x5452, 0x0)
mkdir(&(0x7f0000004840)='./file0\x00', 0x0)
statx(0xffffffffffffff9c, &(0x7f00000001c0)='./file0\x00', 0x4000, 0x0, &(0x7f0000000200))
r0 = socket(0xa, 0x2, 0x0)
sendmmsg$sock(r0, &(0x7f0000002480)=[{{&(0x7f0000000080)=@in={0x2, 0x4e20, @remote}, 0x80, 0x0}}, {{&(0x7f0000000300)=@ethernet={0x0, @random="21a1eb38948f"}, 0x80, 0x0}}], 0x2, 0x0)
setxattr$security_selinux(&(0x7f0000000040)='.\x00', &(0x7f0000000080), 0x0, 0x60, 0x0)
clone(0x38004100, 0x0, 0x0, 0x0, 0x0)
semget$private(0x0, 0x85583a7640102e2, 0x0)
rt_sigreturn()
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
faccessat(0xffffffffffffff9c, 0x0, 0x0)
exit(0x0)
r0 = openat$null(0xffffffffffffff9c, &(0x7f00000021c0), 0x0, 0x0)
read$FUSE(r0, 0x0, 0x0)
r0 = socket$nl_route(0x10, 0x3, 0x0)
dup3(r0, 0xffffffffffffffff, 0xde8f636b07168e06)
clone(0x4000000206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
splice(0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0, 0x88000cc, 0x0)
exit_group(0x0)
pipe(&(0x7f00000001c0)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
write$binfmt_misc(r1, &(0x7f0000000000)=ANY=[], 0xfffffecc)
pipe2(&(0x7f0000000000)={<r2=>0xffffffffffffffff, <r3=>0xffffffffffffffff}, 0x0)
ppoll(&(0x7f0000000080)=[{r2}], 0x1, 0x0, 0x0, 0x0)
r4 = openat(0xffffffffffffffff, &(0x7f0000000180)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r4, 0x0)
preadv(r4, &(0x7f0000000280), 0x1, 0x0, 0x0)
splice(r0, 0x0, r3, 0x0, 0x4ffe2, 0x0)
rt_sigprocmask(0x0, &(0x7f0000000100)={[0xfffffffffffe]}, 0x0, 0x8)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
fcntl$getflags(r1, 0x40a)
r2 = gettid()
rt_sigqueueinfo(r2, 0xa, &(0x7f0000000040))
ppoll(0x0, 0x0, 0x0, &(0x7f00000000c0), 0x8)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt$inet_int(r0, 0x0, 0x21, &(0x7f00000000c0), &(0x7f0000000100)=0x4)
mknod(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
mount(&(0x7f0000000600)=@md0, &(0x7f0000000640)='./file0\x00', &(0x7f0000000680)='sysfs\x00', 0x402, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
rt_tgsigqueueinfo(r0, r0, 0x16, &(0x7f0000000000))
ptrace(0x10, r0)
ptrace$setregs(0xd, r0, 0x0, &(0x7f0000000140)="be9ff483111ec7c05a6e35766a9c5cd98ed812fee8ee677c468e2d01bb01fd560342c1891c9b259ef048c5ac173518e9cd261fa6cbe6a89b00bbcac9c7a8fc13d6d5661f30c63f72be485d1465e695187bb1482dff9c9d341184640629dc64bb37212a404898297b90eb535ba521052c06a3f59c8a96155e941ed41bc723c4062d6dc6418cd0808ff3a491d6694464e1b0f399e5247ee4bd2f5fc3b5e90fe3b6fb")
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
r1 = syz_open_procfs(0x0, &(0x7f0000000100)='net\x00')
utimensat(r1, 0x0, 0x0, 0x0)
rt_sigqueueinfo(r0, 0x39, &(0x7f0000000000))
r0 = epoll_create(0x101)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
epoll_ctl$EPOLL_CTL_ADD(r0, 0x1, r0, 0x0)
r1 = socket$inet_tcp(0x2, 0x1, 0x0)
r2 = fcntl$dupfd(r1, 0x0, r1)
r3 = fcntl$dupfd(r2, 0x0, r0)
sendto$inet(r3, 0x0, 0x0, 0x0, 0x0, 0x0)
epoll_ctl$EPOLL_CTL_ADD(r2, 0x1, 0xffffffffffffffff, 0x0)
fcntl$addseals(r0, 0x409, 0x0)
rt_sigreturn()
r0 = openat$null(0xffffffffffffff9c, &(0x7f0000004500), 0x0, 0x0)
request_key(&(0x7f0000000000)='pkcs7_test\x00', 0x0, 0x0, 0x0)
fcntl$getownex(r0, 0x5, &(0x7f0000000000))
pipe2(&(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff}, 0x0)
splice(r0, &(0x7f0000000080), r1, 0x0, 0x100000000, 0x0)
mknod$loop(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
mount(&(0x7f0000000280)=ANY=[], &(0x7f00000001c0)='./file0\x00', &(0x7f0000000180)='cgroup\x00', 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x0, 0x0)
r1 = openat$cgroup_procs(r0, &(0x7f0000000040)='tasks\x00', 0x2, 0x0)
write$binfmt_script(r1, 0x0, 0x0)
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = memfd_create(&(0x7f0000001fc1)='\x00\xac=\x9d\xd2\xdb\xe6\xbf\xb4\b\xedcJ\x8e\x84\xd4N\x12\x9b\x1f\t\xbd\x11+\x86T\x16\xa3\xb3\xae0\x9f9?\xefo\xa4k\x012>\xa1\x9c\x86x\x1c\x9f\x84\x195\xde\x97_\t~\xf3Y\x12\"p^\xc1\x0f', 0x0)
write(r0, &(0x7f0000002000)='/', 0x1)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x4, 0x11, r0, 0x0)
readlinkat(0xffffffffffffffff, &(0x7f0000000000)='./file0\x00', &(0x7f00000000c0)=""/9, 0x5)
rt_sigreturn()
symlink(&(0x7f0000000540)='.\x00', &(0x7f0000000580)='./file0\x00')
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
readlink(&(0x7f0000000080)='./file0/file0\x00', &(0x7f00000000c0)=""/236, 0xec)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
sendmsg$inet6(r1, &(0x7f00000003c0)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000980)=ANY=[@ANYBLOB="68000000000000002900000019"], 0x68}, 0x0)
rt_sigqueueinfo(r0, 0xa, &(0x7f0000000040))
r0 = socket$inet_icmp(0x2, 0x2, 0x1)
sendto$inet(r0, &(0x7f0000000200)="080000000000e272", 0x8, 0x0, &(0x7f0000000240)={0x2, 0x0, @remote}, 0x10)
recvfrom$inet(r0, 0x0, 0x0, 0x0, &(0x7f0000000080)={0x2, 0x0, @remote}, 0x10)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
sendmsg$inet6(r0, &(0x7f0000000300)={0x0, 0x0, 0x0}, 0x0)
setsockopt$inet6_int(r0, 0x29, 0x1a, &(0x7f0000000000), 0x4)
r0 = inotify_init()
inotify_add_watch(r0, &(0x7f00000005c0)='.\x00', 0x60000f6)
r1 = open(&(0x7f0000000340)='./file0\x00', 0x1031c2, 0x0)
write$nbd(r1, &(0x7f0000000600)=ANY=[], 0x74)
sendfile(r1, r1, &(0x7f0000000200), 0xa198)
unlink(&(0x7f0000000000)='./file0\x00')
r0 = syz_open_procfs(0x0, &(0x7f0000000000)='io\x00')
r1 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
preadv(r0, &(0x7f0000000200)=[{&(0x7f0000001b80)=""/4096, 0x1000}], 0x1, 0x0, 0x0)
clone(0x4000008006ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f0000000080)='stat\x00')
setregid(0xffffffffffffffff, 0xee00)
exit(0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
preadv(r0, &(0x7f00000000c0)=[{&(0x7f0000000240)=""/4096, 0x1000}], 0x1, 0x0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER(0x1, 0x0, &(0x7f0000000140)={0x1, &(0x7f00000000c0)=[{0x6, 0x0, 0x0, 0xffffffff}]})
membarrier(0x0, 0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
getsockopt$sock_int(r0, 0x1, 0x9, &(0x7f0000000180), &(0x7f00000001c0)=0x4)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
bind$inet(r0, &(0x7f0000000000)={0x2, 0x4e22, @multicast1}, 0x10)
syz_emit_ethernet(0x86, &(0x7f0000000540)={@broadcast, @local, @void, {@ipv4={0x800, @udp={{0x5, 0x4, 0x0, 0x0, 0x78, 0x0, 0x0, 0x0, 0x11, 0x0, @private, @broadcast=0xe0000001}, {0x0, 0x4e22, 0x64, 0x0, @wg=@response={0x2, 0x0, 0x0, "5fe455845f5b99cff1e74c74ff3828424d7196db6fe35016b196bd3e72ccdfa4", "6fe031f0050b404a92bd51f376e33dcd", {"583f38b1b993003da631289a6a652100", "c28624eb806a31245bf038cc65937db3"}}}}}}}, 0x0)
ioctl$sock_inet_udp_SIOCINQ(r0, 0x541b, &(0x7f0000000040))
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$nl_route(0x10, 0x3, 0x0)
connect$netlink(r0, &(0x7f0000000000), 0xc)
bind$netlink(r0, &(0x7f0000000040), 0xc)
exit(0x0)
r0 = socket$inet6(0xa, 0x3, 0x6)
read$FUSE(r0, &(0x7f00000000c0)={0x2020}, 0x2020)
connect$inet6(r0, &(0x7f0000000080)={0xa, 0x0, 0x0, @loopback}, 0x1c)
sendto$inet6(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
mkdir(&(0x7f0000000040)='./file0\x00', 0x0)
mount$overlay(0x0, &(0x7f0000000200)='./file0\x00', &(0x7f00000000c0), 0x0, &(0x7f0000000240)=ANY=[@ANYBLOB='lowerdir=/:file0'])
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
openat(0xffffffffffffff9c, &(0x7f0000004280)='./file0\x00', 0x0, 0x0)
getdents(0xffffffffffffffff, 0x0, 0x0)
r1 = openat$fuse(0xffffffffffffff9c, &(0x7f0000002080), 0x42, 0x0)
mount$fuse(0x0, &(0x7f00000020c0)='./file0\x00', &(0x7f0000002100), 0x0, &(0x7f0000002140)={{'fd', 0x3d, r1}, 0x2c, {'rootmode', 0x3d, 0x4000}})
read$FUSE(r1, &(0x7f00000077c0)={0x2020, 0x0, <r2=>0x0}, 0x2020)
write$FUSE_INIT(r1, &(0x7f0000004200)={0x50, 0x0, r2, {0x7, 0x1f, 0x0, 0x26000}}, 0x50)
syz_fuse_handle_req(r1, &(0x7f00000042c0)='\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x0
0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x
00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\
x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00
r3 = open$dir(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
lseek(r3, 0x400000000000002, 0x0)
getdents64(r3, 0x0, 0xffffffffffffff7f)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
bind$inet6(r0, &(0x7f0000000000)={0xa, 0x0, 0x0, @ipv4={'\x00', '\xff\xff', @dev}}, 0x1c)
sendmsg$inet6(r0, &(0x7f0000000400)={&(0x7f0000000040)={0xa, 0x0, 0x0, @local, 0x100}, 0x1c, 0x0}, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
ioctl$sock_SIOCOUTQ(r0, 0x5411, 0x0)
exit_group(0x0)
semctl$IPC_STAT(0x0, 0x0, 0x2, &(0x7f0000000140)=""/84)
r0 = openat(0xffffffffffffffff, &(0x7f0000000440)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000840)={0xffffffffffffffff, <r2=>0xffffffffffffffff})
fork()
pipe2(&(0x7f0000000b40)={0xffffffffffffffff, <r3=>0xffffffffffffffff}, 0x0)
sendmmsg$unix(r2, &(0x7f0000000940)=[{0x0, 0x0, 0x0, 0x0, &(0x7f00000008c0)=[@rights={{0x14, 0x1, 0x1, [r2]}}, @rights={{0x14, 0x1, 0x1, [r3]}}], 0x30}], 0x1, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$unix(0x1, 0x2, 0x0)
setsockopt$sock_timeval(r0, 0x1, 0x14, 0x0, 0x0)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit(0x0)
rt_sigreturn()
clone(0x14244100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = semget$private(0x0, 0x20000000107, 0x0)
semctl$IPC_RMID(r0, 0xff7f0000, 0x10)
exit(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = memfd_create(&(0x7f0000000340)='-B\xd5NI\xc5j\xbappp\xf0\b\x84\xa2m\x00:)\x00\xbb\x8d\xac\xacva}knh#\xcb)\x0f\xc8\xc0:\x9cc\x10d\xee\xa9\x8bCc\xad\x89\x9ck\xde\xc5\xe96\xddU\xa9=\xcdJx\xaa\x8f~\xb90a\xa9\xb2\x04K\x98\x93?\x88Q\xf7\xd6\x1d\xa1\xce\x8b\x19\xea\xef\xe3\xab\xb6\xa5$4\xd6\xfe7\x0f\xe7\xd9$\xce \xabN\xae\xc9\xbd\xd3g@\xe1\'s\x0e\x90\xf2\xcdr\xb8(', 0x0)
write(r1, &(0x7f00000004c0)='1', 0x1)
mmap(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x1000004, 0x80051, r1, 0x0)
sendfile(r1, r1, &(0x7f0000000200), 0xff8)
r2 = gettid()
r3 = gettid()
syz_mount_image$fuse(0x0, &(0x7f0000000140)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
tgkill(r2, r3, 0x24)
r0 = socket(0x2, 0x3, 0x1)
getsockopt$IPT_SO_GET_ENTRIES(r0, 0x0, 0x41, 0x0, &(0x7f00000001c0)=0x100)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000180)={0x2, &(0x7f0000000000)=[{0x4d}, {0x6, 0x0, 0x0, 0x50000}]})
openat$cgroup_ro(0xffffffffffffff9c, 0x0, 0x100002, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000008740)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
getsockopt(r0, 0x6, 0x9, 0x0, &(0x7f0000000140))
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
setsockopt$inet6_tcp_int(r0, 0x6, 0x9, 0x0, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000100)={<r0=>0xffffffffffffffff})
bind$unix(r0, &(0x7f0000000080)=@file={0x1, './file0\x00'}, 0x6e)
connect$unix(r0, &(0x7f0000000000)=@file={0x1, './file0\x00'}, 0x6e)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0xff5000)=nil, 0xff5000, 0x0, 0x200000005c831, 0xffffffffffffffff, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
pipe(&(0x7f0000000000))
mbind(&(0x7f0000196000/0x2000)=nil, 0x2000, 0x0, 0x0, 0x0, 0x3)
perf_event_open(&(0x7f000001d000)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x413d}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
r0 = socket$inet6(0xa, 0x2, 0x0)
syz_open_procfs(0x0, 0x0)
bind$inet6(r0, &(0x7f0000000000)={0xa, 0x14e24}, 0x1c)
recvmmsg(r0, &(0x7f00000004c0), 0x2bd, 0x0, 0x0)
connect$inet6(r0, &(0x7f0000000080)={0xa, 0x1000000000004e24, 0x0, @ipv4={'\x00', '\xff\xff', @loopback}}, 0x1c)
sendmmsg(r0, &(0x7f00000092c0), 0x4ff, 0x0)
listen(r0, 0x4)
getsockopt$EBT_SO_GET_INFO(r0, 0x0, 0x80, &(0x7f00000000c0)={'broute\x00'}, &(0x7f0000000140)=0x78)
setsockopt$inet6_IPV6_ADDRFORM(r0, 0x29, 0x1, &(0x7f0000000040), 0x4)
syz_emit_ethernet(0x3c, &(0x7f0000000000)={@multicast, @multicast, @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x13, 0x0, 0x0, 0x0, 0x1, 0x0, @private, @private}, @echo={0x8, 0x0, 0x0, 0x0, 0x0, "b2beb7f704caf60167d9d629f5ec6f3df8fa"}}}}}, 0x0)
syz_emit_ethernet(0x4e, &(0x7f0000000100)={@local, @dev, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "801d00", 0x18, 0x3a, 0x0, @empty, @mcast2, {[], @mld={0x83, 0x0, 0x0, 0x0, 0x0, @mcast2}}}}}}, 0x0)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = epoll_create1(0x0)
epoll_ctl$EPOLL_CTL_MOD(0xffffffffffffffff, 0x3, 0xffffffffffffffff, &(0x7f0000000040)={0x80000000})
ioctl$sock_netdev_private(r0, 0x8901, &(0x7f0000000040))
r1 = gettid()
tkill(r1, 0x2c)
syz_emit_ethernet(0x4e, &(0x7f0000000180)={@multicast, @broadcast, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, '\x00', 0x18, 0x3a, 0xff, @remote, @mcast2, {[], @ndisc_ra={0x86, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, [{0x1f, 0x1, "a78ce540b259"}]}}}}}}, 0x0)
r0 = socket$netlink(0x10, 0x3, 0xf)
sendmsg$netlink(r0, &(0x7f0000000100)={0x0, 0x0, &(0x7f00000000c0)=[{&(0x7f0000001280)={0x10}, 0x10}, {&(0x7f0000000900)={0x10, 0x29, 0x1}, 0x10}], 0x2}, 0x0)
mkdirat(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mount(&(0x7f0000002240)=ANY=[], &(0x7f0000002280)='./file0\x00', &(0x7f00000022c0)='devtmpfs\x00', 0x0, 0x0)
setxattr$security_evm(&(0x7f0000000040)='./file0/../file0\x00', &(0x7f00000000c0), 0x0, 0x0, 0x0)
syz_emit_ethernet(0x2e, &(0x7f0000000000)={@multicast, @local, @void, {@ipv4={0x800, @icmp={{0x6, 0x4, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x1, 0x0, @remote, @multicast1, {[@ra={0x94, 0x4}]}}, @address_reply}}}}, 0x0)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000001340)={<r0=>0xffffffffffffffff})
getsockopt(r0, 0x6, 0x5, &(0x7f0000000000)=""/28, &(0x7f0000000080)=0x1c)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
clone(0x5310d980, 0x0, 0x0, 0x0, 0x0)
exit(0x0)
syz_mount_image$tmpfs(0x0, 0x0, 0x0, 0x2, &(0x7f0000000880)=[{&(0x7f00000003c0)='/', 0x1, 0x195e}, {&(0x7f0000000200)="a0", 0x1}], 0x0, 0x0)
rt_sigreturn()
pipe(&(0x7f00000000c0)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
pipe(&(0x7f0000000000)={0xffffffffffffffff, <r2=>0xffffffffffffffff})
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
write$binfmt_misc(r1, &(0x7f0000000040)=ANY=[], 0xfffffc8f)
splice(r0, 0x0, r2, 0x0, 0x4df3, 0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
r1 = dup(r0)
ioctl$PERF_EVENT_IOC_ENABLE(r1, 0x8912, 0x400200)
r2 = socket$nl_route(0x10, 0x3, 0x0)
listen(r2, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f00000007c0)='./file0\x00', 0x0)
mount(&(0x7f0000000800)=ANY=[], &(0x7f0000000040)='./file0\x00', &(0x7f0000000240)='tmpfs\x00', 0x45, 0x0)
r0 = openat$dir(0xffffffffffffff9c, &(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
openat(r0, &(0x7f0000000000)='./file0\x00', 0x40, 0x0)
rt_sigreturn()
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet_icmp(0x2, 0x2, 0x1)
sendmsg$inet(r0, &(0x7f00000005c0)={&(0x7f0000000000)={0x2, 0x0, @dev}, 0x10, &(0x7f0000000500)=[{&(0x7f0000000040)="268295af489e62", 0x7}, {&(0x7f00000000c0)="b4", 0x1}], 0x2}, 0x0)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
setuid(0xee00)
getresuid(&(0x7f0000000040)=<r0=>0x0, &(0x7f0000000080), &(0x7f00000000c0))
setuid(r0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
pipe(&(0x7f0000000000)={<r0=>0xffffffffffffffff})
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
splice(r0, 0x0, r0, 0x0, 0x5, 0x0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rt_sigreturn()
rt_sigreturn()
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000080)={0x3, &(0x7f0000000040)=[{0x15, 0x0, 0x1, 0x8000}, {}, {0x6, 0x0, 0x0, 0x7fff7ffe}]})
openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rename(&(0x7f0000000000)='./bus\x00', &(0x7f0000000040)='./bus\x00')
rt_sigreturn()
preadv(0xffffffffffffffff, &(0x7f0000001400)=[{&(0x7f0000001440)=""/4088, 0xff8}], 0x1, 0x0, 0x0)
ioctl$BTRFS_IOC_GET_SUBVOL_ROOTREF(0xffffffffffffffff, 0xd000943d, 0x0)
clone(0x4000028806ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f0000000140)='status\x00')
socket(0x0, 0x0, 0x0)
exit(0x0)
preadv(r0, &(0x7f0000000500), 0x123, 0x0, 0x0)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f00000001c0), 0x0, 0x0)
ioctl$TCSETS(r0, 0x40045431, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x0, 0x0, "850e32db99baeb0600"})
r1 = syz_open_pts(r0, 0x0)
r2 = dup2(r1, r0)
fcntl$setstatus(r2, 0x4, 0x42c00)
utimensat(0xffffffffffffff9c, 0x0, &(0x7f0000000040)={{0x0, 0xffffffffffffffff}}, 0x0)
socketpair$unix(0x1, 0x0, 0x0, &(0x7f0000000080))
clone(0x0, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
symlink(&(0x7f0000000180)='./file1\x00', &(0x7f0000000040)='./file0\x00')
symlink(&(0x7f0000000100)='./file0/file0\x00', &(0x7f00000008c0)='./file1\x00')
dup(0xffffffffffffffff)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000000c0)={0x0, <r0=>0x0})
sendmmsg(r0, &(0x7f0000008600)=[{{0x0, 0x0, &(0x7f0000003140)}}, {{&(0x7f00000072c0)=@un=@file={0x1, './file0\x00'}, 0xa, &(0x7f0000007380), 0x0, &(0x7f0000000600)}}], 0x2, 0x0)
clone(0x4000002206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
rt_tgsigqueueinfo(r0, r0, 0x16, &(0x7f0000000000))
ptrace(0x10, r0)
ptrace$getregset(0x4205, r0, 0x1, &(0x7f0000000080)={&(0x7f0000002300)=""/4096, 0x1000})
munmap(&(0x7f0000000000/0x2000)=nil, 0x2000)
timer_create(0x0, &(0x7f0000000000), 0x0)
rt_sigprocmask(0x0, &(0x7f0000000100)={[0xfffffffffffe]}, 0x0, 0x8)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1000006, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = gettid()
rt_sigqueueinfo(r1, 0xa, &(0x7f0000000040))
r2 = openat$tun(0xffffffffffffff9c, &(0x7f0000000480), 0x0, 0x0)
ioctl$TUNSETIFF(r2, 0x400454ca, &(0x7f00000000c0))
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
symlinkat(&(0x7f0000000940)='./file0\x00', r3, &(0x7f0000000980)='./file0\x00')
ppoll(0x0, 0x0, 0x0, &(0x7f0000000540), 0x8)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000080)={0x2, &(0x7f0000000040)=[{0xf}, {0x6}]})
exit(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
creat(&(0x7f0000000180)='./file0\x00', 0x0)
rename(&(0x7f0000000080)='./file0\x00', &(0x7f00000000c0)='./file0\x00')
mknodat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x1300, 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x2, 0x0)
poll(&(0x7f0000000000)=[{r0}, {r0, 0x27}], 0x2, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x0, 0x0, 0x0)
rt_sigreturn()
clone(0x6900, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffffff, &(0x7f0000000040)='/', 0x208000, 0x0)
lseek(r0, 0x0, 0x0)
setrlimit(0x0, &(0x7f0000000080))
clone(0x4000c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket(0xa, 0x2, 0x0)
setsockopt$inet6_mreq(r0, 0x29, 0x14, 0x0, 0x0)
r1 = gettid()
tgkill(r1, r1, 0xf)
clone(0x4000000006ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f0000000140)='cgroup\x00')
exit(0x0)
preadv(r0, &(0x7f0000000100)=[{&(0x7f0000000040)=""/61, 0x3d}], 0x1, 0x0, 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
setsockopt$inet_opts(r0, 0x0, 0x1, &(0x7f0000000080)='^', 0x1)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000000)={0x2, &(0x7f00000000c0)=[{0x3c}, {0x6}]})
pipe(&(0x7f0000000140)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = dup2(r0, r1)
poll(&(0x7f0000000040)=[{r0, 0x40}, {r2}], 0x2, 0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
bind$inet6(r1, &(0x7f0000000500)={0xa, 0x2, 0x0, @empty}, 0x1c)
listen(r1, 0x0)
connect$inet(r0, &(0x7f00000001c0)={0x2, 0x2, @remote}, 0x10)
r2 = accept4$inet6(r1, 0x0, 0x0, 0x0)
sendto$inet(r2, 0x0, 0x0, 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f00000007c0)='./file0\x00', 0x0)
mount(&(0x7f0000000800)=ANY=[], &(0x7f0000000040)='./file0\x00', &(0x7f0000000240)='tmpfs\x00', 0x45, 0x0)
r0 = openat$dir(0xffffffffffffff9c, &(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
getdents(r0, 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigreturn()
io_setup(0x81, &(0x7f0000000040)=<r0=>0x0)
r1 = openat$fuse(0xffffffffffffff9c, &(0x7f0000000080), 0x2, 0x0)
io_submit(r0, 0x1, &(0x7f0000000500)=[&(0x7f0000000200)={0x0, 0x0, 0x0, 0x1, 0x0, r1, 0x0, 0x0, 0x0, 0x0, 0x2}])
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
r2 = creat(&(0x7f00000002c0)='./file0\x00', 0x0)
fgetxattr(r2, &(0x7f0000000480)=@known='system.posix_acl_access\x00', 0x0, 0x0)
exit_group(0x0)
r0 = openat$dir(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x80340, 0x0)
statx(r0, &(0x7f00000092c0)='\x00', 0x1000, 0x0, &(0x7f0000009300))
r0 = syz_open_procfs(0x0, &(0x7f0000000080)='net/route\x00')
preadv(r0, &(0x7f0000001340)=[{0x0}, {&(0x7f00000001c0)=""/4096, 0x1000}], 0x2, 0x0, 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
bind$inet6(r0, &(0x7f00000003c0)={0xa, 0x4e22}, 0x1c)
listen(r0, 0xfffffffd)
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
sendto$inet6(r1, 0x0, 0x0, 0x20000004, &(0x7f0000000040)={0xa, 0x4e22}, 0x6d)
close(0xffffffffffffffff)
r2 = socket$inet6_tcp(0xa, 0x1, 0x0)
connect$inet6(r2, &(0x7f0000000200)={0xa, 0x4e22, 0x0, @loopback}, 0x1c)
accept4(r0, 0x0, 0x0, 0x0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt$inet_tcp_int(r0, 0x6, 0xa, 0x0, &(0x7f0000000080))
exit_group(0x0)
r0 = perf_event_open(&(0x7f000001d000)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x41c0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
clone(0x22004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
recvmmsg(0xffffffffffffffff, &(0x7f0000002540)=[{{0x0, 0x0, &(0x7f00000000c0)=[{0x0}], 0x1}}], 0x1, 0x0, 0x0)
ftruncate(0xffffffffffffffff, 0x42)
exit(0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000040)='net/igmp6\x00')
perf_event_open(0x0, 0x0, 0x0, r0, 0x0)
preadv(r1, &(0x7f00000017c0), 0x1b4, 0x96000000, 0x0)
socket$packet(0x11, 0x2, 0x300)
ioctl$ifreq_SIOCGIFINDEX_batadv_hard(0xffffffffffffffff, 0x8933, 0x0)
read$FUSE(0xffffffffffffffff, 0x0, 0x0)
perf_event_open(&(0x7f0000000040)={0x2, 0x70, 0x80, 0x1}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = socket$unix(0x1, 0x1, 0x0)
r1 = socket$unix(0x1, 0x400040000000001, 0x0)
bind$unix(r1, &(0x7f0000003000)=@abs={0x1}, 0x8)
listen(r1, 0x0)
connect(r0, &(0x7f0000985ff8)=@un=@abs={0x1}, 0x8)
r2 = syz_open_procfs(0x0, &(0x7f0000000180)='net/unix\x00')
sendfile(r0, r2, 0x0, 0x800000bf)
r0 = memfd_create(&(0x7f0000000400)='J6\x8a\xa9\x16\x11Ou\x9b\x94a\xac\x00\x00\x00\x00\x00\x00\x00\xe6a\xe5\xa3v\x9d\xd2\xd0\xf4\xa7jq\xcc\xf5\xc8\xbd\xdd\xb5\x02bB]-\xc1\xc1\x10\x9a\xfd\xb1\xc4\xc6\x8a\xd0\xe9\xc7Q\xb6\xd5Y-\xaf\x93\x8cnw{\xe3*B\x85\xfdh\x18O\xc2\xd6XI\xa7W\x93\xe3Q\x1e\x10O$w\xae\xee\xe5\x01@Z\x8cb\x81N3<U\x8c\xc9\xc4U\xc1\xb3&Z\xf8\xac6\xa0\x14c\xd4D7\x1c^\xbb\x83\x91\xf4\xe6&\xb1#c\xfev\xacd\xe7\x13\x86g\x18\x8b\xb5q\xac\xc1\x03\xd1\xa1\x91\xeel\xd1.\x99\x93\xc5*\x9c\x13\xa1*\xeejb\xb4 \x93\xe6\xe8\x02Bg3\x00\xc7\xfeCh\xb5\xc9\xb9\xa4\xe5\xd5\xe7c\xb5v\xe0\xfdL\xa6\xc0\xebq96\xae\xdcNS\xad\xba5\xa3\xe2w\x15\xb36;\xdbN\x81\xa9)A\xad\xff7\xe1*\xedq\xa6\x05\x00\xc0Us\xb1\xf4\x1a\x9e\xedP\xb1\xa0a\x1d\xe1L\x83\xa7\xda\xbfJ]\x82\x95\xa4\x11\xc2;<>g{\xca\x80\xe9d\x98\x8c*\x16\xb4d\xf17(\xf2\v\xc3\b\xdbU\n\x89oj\x94\xc9\xf3\x82\x0fz\rC\xd4\x8a\x1b\xc77\xd1Cg\xb4\x8dAe\xaa\xa4a\x154\x12b\xad\xe6\xa9\x1eA\x1ce\xa0CJ\xe4\xd2\x93\x83\x9d\xec\xe3/v\xa9\x04L\xc5\xc5WG\x15t(\x1b-X\x0ec\xee05\xd6lZ\x0e\x9d\xb3\xd4\x98M\xdd)\xb9U\x94\xd7\xc0J;\xcc\x8f\"7*\x1a\xf0\xa21\xb4I\xe4\'_>\x19\x94\x81\xadp\x00)8\x03\x97\x89\xd09h\xf5w\fC7\x83d\xf6\x0e\xd8\x11(tX\x11\xdf\x14\x87\x84>\xc5[ 6\x05\x0f\f\x84\x0fl\xe5\x17s\x01\xd1\xa2\xcc\xf7\xc2\xac\x04\xe8\x15\x9dt\xbc\x03\x16\xa9a\x9e\xd7\x01GPr\xf1\xa7Xr\x1c\r\x0f;\x00\x00\xb9\xca\x81\xbay\x83\xda\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00g\xb1\xabI\xde\x87\x16\xc3I\xc5w(\xd8\xd6\xbd\xba\xffSq\xed\xe0\xdf\xc3\x03\x95\x11\x01\xd1\xb9\xa2e\xc5`\x15\xf7\xf3Cp\xdc\x84)\x02\xb3\xbae\xcd\xb5\x84>\x12\xf5\xf9\x98a\xf9LPkn\xb8\xdd\xfe\x05B\xec\xefz\xb9\x11\xd8O\xda\xb2$\x95y+\xef\xddL\x1dg\xb6N\xbb\x1e\xa8\xd5}\xdcA\x13\xa1\xaa\xb7\xe3\xfa\x92jNf\x82\xb7\xc1\x87QF6\x19\x97o\x1d\xc3{.A\xf3\x96\xc6e_ \x0f\xe4!\xdbnA\xbe\x84\x17\x15\xa5\xb8\x03\x12Z!o\x89\x9d\xcc?W\x00\xb2\xa5\xcdUk\xe6\xb5\"\xb2\x82q\x9c[}\x16\x14\x97s\xba\xae\xcc\x11\t\xe0\xd5\x01\x99\xd9\xb7\xda#\xc3\x927\xa2\x1e\x06\xb8c\x0e\xacIK\xf2eJ\xf6\v\x19x\x98\xfcO\xc0\xb1<\x0e,\xe3%\xfa\xfchv\x83\xf0\x94=\xcb\xd6\v\x85\x81Ic\x17\xad\xef\xbaxo\xc3\x04A\x99@\x9c9\x12z\xcc\xbd\xd0\xfa\xe7\xa3y\x92\xea~MwGn\xf2\x98\xd0\x97\xaf\x966t\x97\xa0\xf1\xba\xbbw\xc3\xe3\x14\xdc\x9b\xd9W\xd3\xf2\nu\x96\xad\xa2\xdb\x826\xc8aJ\x89\x19\x81\x1b\xc1\"/:w;p\b\\\n\xc5\x8d', 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r2, &(0x7f0000000280), 0x1, 0x0, 0x0)
perf_event_open(&(0x7f00000002c0)={0x2, 0x70, 0x16}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
fcntl$addseals(r0, 0x40a, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
sched_setaffinity(0x0, 0x17, 0x0)
r0 = gettid()
tkill(r0, 0x24)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
setsockopt$inet6_MRT6_DEL_MFC(r0, 0x29, 0x4a, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000080)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = dup(r1)
ioctl$PERF_EVENT_IOC_ENABLE(r2, 0x8912, 0x400200)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
setresuid(0xee01, 0x0, 0x0)
faccessat(r0, &(0x7f0000000100)='./file0\x00', 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
setsockopt$inet_int(r0, 0x0, 0x3, &(0x7f0000000000), 0x4)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
socket$unix(0x1, 0xbc77224527d1f3c3, 0x0)
rt_sigreturn()
r0 = epoll_create1(0x0)
r1 = epoll_create(0x7)
epoll_ctl$EPOLL_CTL_ADD(r0, 0x1, r1, &(0x7f0000000080))
r2 = epoll_create1(0x0)
epoll_ctl$EPOLL_CTL_ADD(r2, 0x1, r0, &(0x7f0000000000))
epoll_ctl$EPOLL_CTL_ADD(r1, 0x1, r2, &(0x7f0000000040))
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
getsockopt$sock_int(r0, 0x1, 0x1e, 0x0, &(0x7f0000001200))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = syz_open_procfs$userns(0xffffffffffffffff, &(0x7f00000004c0))
close(r1)
r0 = memfd_create(&(0x7f00000006c0)='K\xde\xeb\xc2}]b\x95m|^\xee0z\x85\xb5\xb5S\xf4P\x1bG\x8c$\xee#9\xda\xf7\x9e-\xb7[\x96h\xdb\xd0+\x9f\r\x1c\xfd\x958w\x1e\xcf\xf7\xf4\x1b\tQ\x9f\x82\xd7\xc7Rd\xab\xc7\v\xe6\xc7\x87^\xdf\xdd\xc7s\xaf\x9f\xc6\x9f\x06,f\x9f\xba\x1c\x90\xd2w\xb6\xaand\x85N\xaf\xd0!\xcd\xce4R\xad\xd1\xaa>!\xea\x00\x00\xc3\x9e\xef\a\xff\x00\x00\x00\x00\x00\x00\xa1\xb3\xfa\x81\xb5\x00\x00\x00\x00#\\\x94\x91\x04\xaf7\x9b\xaf\xec\x9d\xa9\f\xa5\x16\x12&\b-\x93`\xfe\xde3\x94\xef\xaa_\xec\xe1+\xcd\x00\x1d\xd2:q\xd6\xdd\x82\xc9\xc1\x8b{\xf5\xa8qBNlde8\xec\x00\xcf\x06o\xa6\xd3kv\xa7i*\x87\xb8W\xd3\xa4\xb5\xed\x867rk\xea\x85g\xb1\xa1\xe3R\xf0\v<\xf1\xa7\xa2\x84\x95:dhi\x1dOk\xab\x8b9\xb0[\xeedl#Xy#\xca9\xc3\xf5W\xbf\xcd\xaf3bM\xf9\x96>\xdbw\xabS\x7f\xad\xd6\x84]\xdc\xc3#\rW*\xcb\xa8\x8b\xe2\xa1\x17\x93\x02d\x1c\xd9Vm[0<P\xee\xedN\xdaEqPD\xbf\xc9\xff3\xcf\xc6U\xa5m4o\x8f\xd48\xa5#\xbcG\xa4\x19&\xb9\xf7\x94\xa2\xc0\xe5@\xdb\xb2=I\x05\x122(\xc7\x9e\x16W\xaf\x15{q\xfb\xf0\x82', 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x0, 0x11, r0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = gettid()
rt_sigaction(0x0, &(0x7f0000000080)={0x0, 0x0, 0x0}, 0x0, 0x8, &(0x7f0000000180))
r2 = gettid()
tgkill(r1, r2, 0x24)
syz_emit_ethernet(0x4e, &(0x7f00000000c0)={@local, @remote, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "275b1a", 0x18, 0x3a, 0x0, @empty, @local, {[], @mld={0x84, 0x0, 0x0, 0x0, 0x0, @loopback}}}}}}, 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
clone(0x6900, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = fork()
ptrace(0x10, r1)
ptrace$setregs(0xd, r1, 0x0, &(0x7f00000015c0))
r2 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r3 = gettid()
r4 = gettid()
tgkill(r3, r4, 0x24)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
setsockopt$sock_int(r0, 0x1, 0xf, 0x0, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = memfd_create(&(0x7f0000000100)='\v\x8b\x8a\xa9\x9fca\x16\x11O\xdd\xdfk(F\x99\xdf\x92\xd5>oJ\x02u\x9b', 0x3)
write$binfmt_misc(r0, &(0x7f0000000040)=ANY=[], 0x5)
fcntl$addseals(r0, 0x409, 0xb)
ftruncate(r0, 0x0)
exit(0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f0000000080)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x1, 0x0)
chdir(&(0x7f0000000200)='./file0\x00')
rename(&(0x7f00000000c0)='./file0\x00', &(0x7f00000001c0)='./file0\x00')
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = socket$unix(0x1, 0x2, 0x0)
getsockopt$sock_cred(r0, 0x1, 0xd, 0x0, &(0x7f0000000080))
clone(0x79346100, 0x0, 0x0, 0x0, 0x0)
setxattr$incfs_metadata(&(0x7f0000000040)='./file0\x00', 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
clone(0xd00c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000000c0)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
bind$unix(r1, &(0x7f0000000240)=@file={0x1, './file0\x00'}, 0x6e)
connect$unix(r0, &(0x7f0000000000)=@file={0x1, './file0\x00'}, 0x6e)
rt_sigreturn()
timer_create(0x3, 0x0, &(0x7f0000000100))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
prctl$PR_SET_SECCOMP(0x16, 0x1, 0x0)
timer_delete(0x0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mknodat(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x1000, 0x0)
open$dir(&(0x7f0000000040)='./file0\x00', 0x981, 0x0)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='gid_map\x00')
pwrite64(r0, 0x0, 0x0, 0x100100000001)
syz_emit_ethernet(0x82, &(0x7f00000005c0)={@multicast, @multicast, @void, {@ipv6={0x86dd, @gre_packet={0x0, 0x6, '\x00', 0x4c, 0x2c, 0x0, @private0, @local, {[@routing={0x62}]}}}}}, 0x0)
io_submit(0x0, 0x1, &(0x7f0000000540)=[&(0x7f00000000c0)={0xffff7f0800000004, 0x0, 0x0, 0x1, 0x0, 0xffffffffffffffff, 0x0}])
r0 = openat$tun(0xffffffffffffff9c, &(0x7f0000000480), 0x0, 0x0)
ioctl$TUNSETIFF(r0, 0x400454ca, &(0x7f00000000c0))
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000040)={<r1=>0xffffffffffffffff})
r2 = dup(r1)
ioctl$PERF_EVENT_IOC_ENABLE(r2, 0x8912, 0xc00200)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
mount$overlay(0x40000a, &(0x7f0000000000)='./file0\x00', &(0x7f00000000c0), 0x0, &(0x7f0000000100)={[{@lowerdir={'lowerdir', 0x3d, './file0'}}], [], 0xf603000000000000})
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setrlimit(0x0, &(0x7f0000000080))
rt_sigreturn()
r0 = semget$private(0x0, 0x7, 0x0)
semtimedop(r0, &(0x7f0000000200)=[{0x0, 0xf001}], 0x1, 0x0)
semtimedop(r0, &(0x7f00000003c0)=[{0x0, 0xff4b}], 0x1, 0x0)
semtimedop(r0, &(0x7f0000000240)=[{0x0, 0xfff}, {0x3}], 0x2, 0x0)
perf_event_open(&(0x7f0000000000)={0x2, 0x70, 0x26, 0x1}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = socket$unix(0x1, 0x2, 0x0)
getsockopt$sock_cred(r0, 0x1, 0x11, &(0x7f0000caaffb)={0x0, <r1=>0x0}, &(0x7f0000cab000)=0xc)
setresuid(0x0, r1, 0x0)
clone(0x30802000, 0x0, 0x0, 0x0, 0x0)
prctl$PR_SET_SECCOMP(0x17, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
lseek(r1, 0x0, 0x4)
r0 = syz_open_procfs(0x0, &(0x7f0000000040)='ns\x00')
fchdir(r0)
r1 = open(&(0x7f0000000080)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000031c0)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', r1, &(0x7f0000000000)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00')
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
getrandom(&(0x7f0000000080)=""/119, 0xffffffffffffffd6, 0x2)
unshare(0x28000400)
r0 = openat$urandom(0xffffffffffffff9c, &(0x7f0000000080), 0x208041, 0x0)
flistxattr(r0, 0x0, 0x0)
r0 = socket$inet6(0xa, 0x3, 0x1)
setsockopt$inet6_mreq(r0, 0x29, 0x13, 0x0, 0x0)
capset(&(0x7f0000000000)={0x20080522}, &(0x7f0000000080))
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$IP6T_SO_GET_INFO(r0, 0x29, 0x40, 0x0, &(0x7f00000000c0))
syz_emit_ethernet(0x4f, &(0x7f0000000800)={@random="09479b2ce681", @random="ff070000ad79", @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "36e646", 0x19, 0x3a, 0xff, @private0, @mcast2, {[], @ndisc_ns={0x87, 0x0, 0x0, @initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, [{0x0, 0x0, "e7"}, {}]}}}}}}, 0x0)
syz_emit_ethernet(0x5a, &(0x7f00000001c0)={@local, @multicast, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "b9deee", 0xb, 0x2c, 0x0, @private1, @mcast2, {[@hopopts={0x2c}], @mlv2_query={0x82, 0x0, 0x0, 0x0, 0x0, @private2}}}}}}, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
r1 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r2 = openat(0xffffffffffffffff, &(0x7f0000000080)='/proc/self/exe\x00', 0x0, 0x0)
open(&(0x7f0000000040)='./file0\x00', 0x4000, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r2, 0x0)
preadv(r1, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
rt_sigreturn()
r0 = socket$unix(0x1, 0x5, 0x0)
getsockopt$sock_timeval(r0, 0x1, 0x15, &(0x7f0000000100), &(0x7f0000000140)=0x10)
poll(0x0, 0x0, 0x7fff)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f00000027c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
r2 = open(&(0x7f0000000000)='./bus\x00', 0x141042, 0x0)
mmap(&(0x7f0000001000/0xa000)=nil, 0xa000, 0x0, 0x4052, r2, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
timer_settime(0x0, 0x0, &(0x7f0000000200)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x16, 0x0, @thr={0x0, 0x0}}, &(0x7f00000003c0)=<r3=>0x0)
timer_settime(r3, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
utimes(0x0, 0x0)
exit_group(0x0)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f00000000c0)='uid_map\x00')
r1 = creat(&(0x7f0000000100)='./file0\x00', 0x0)
fallocate(r1, 0x0, 0x100000002, 0x6)
r2 = syz_open_procfs(0x0, &(0x7f0000000080)='mountinfo\x00')
sendfile(r0, r2, 0x0, 0xa)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
r1 = memfd_create(&(0x7f0000000280)='\x00\xc2\xea\x99\xbb\x1c\xf8jw\x97\x05\xa3\xa2\'\xdd\xe4q\xbf\t\x8c\xe0Y\xe5\xbcJ6\xfc\xa8\xcdj\xc1F\x02\xd0[\x97\x1a\xf7#Y?Q\x84P4\"\x99\x93\xca\xd4S\x00\x00\x00\x00\x00\x00\x00\x00\x00\b \xff\x13\x96\xe3?\xdfH\x8c\xe4V\xe2\xfe\v8\x04\xa5\xb9\xc4%\xf3\xf6y_w\xc5\xcf\x95k\x05\x00\xf9\x1e\xf8*<\x03\x85m5\x12\x04\xb7\x9d\xddbNO\f8\xeb\x18\xfa\xbf\xf3x\x86\xc2\xa3\t\x008\xb6XA]\xdc\xad\xbb!13g\xe6\x01\x00\x00\x80\x00\x00\x00\x00\x92\x01\xe6\xae\xb1\xecb\xf7\xe2\x80\xe2Xt\x94\x02\x9b\xfa\xb1\xd04q\x01!d\xce\x88\xc1\x03\xd5\xf8\x1f\xd3j\xe9w\x04[E\xdcg!\xb9\t\x8b\x9e+T\xb7\x9f \xa2\xa6q\xfe`,\xd5\xf8\xa8Ab\xd2\xf1_\xe0r\x9f\xd91\xd7e\xe6\xea\x1a\x8a\xd3\xfdR\x97\xf1\xea\x92\x8d)n/\xfd]\x8d>A\xf9 V\xd0\xf5\x9ep85\xce?\x9d\x95\x93\xa6)\xd9\xc9\x12\xb9\xd0\xcb\x89Q\x8bb\xa2e\xe7D\xa0\xcaL\xb4RS\x9f\xb5\xdc\'@\xc9\x91&\xeb!\x1d\xe1\x7f\x8a\xfeI\x8a\x01\x7f[&\x02T\x01^\r\xb9\xd5\x8eL\xbf\xed\xd3\x17\xdd\xf0\xba\xec\x03\x12\xec#Y\x886\xf3C\xf8y\x93<\r\xbd\x9bkyx6\xfbP\xf2Lib\xd9\xefb\xbc`\"\xf9\"A\xf9Y+:_\x97\xb3y\xa9N#\xae\xf1T\xc5\xfa\nt\x89\xebK', 0x0)
write(r1, &(0x7f00000000c0)='i', 0x1)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x1, 0x11, r1, 0x0)
r2 = inotify_init1(0x0)
inotify_add_watch(r2, &(0x7f0000000240)='./file0/file0\x00', 0x22000607)
rt_sigqueueinfo(r0, 0x39, &(0x7f0000000000))
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r1 = socket$unix(0x1, 0x5, 0x0)
connect$unix(r1, &(0x7f0000000100)=@abs={0x1}, 0x6e)
rt_sigreturn()
clone(0x80844240, &(0x7f0000000040), 0x0, 0x0, 0x0)
symlink(&(0x7f0000000580)='\x00', &(0x7f00000005c0)='./file0\x00')
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
memfd_create(0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x14244100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
semget$private(0x0, 0x0, 0x0)
semctl$SEM_STAT(0x0, 0x0, 0x12, 0x0)
exit(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x3000002, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setitimer(0x1, &(0x7f0000000000)={{0x77359400}, {0x77359400}}, 0x0)
getitimer(0x1, &(0x7f0000000180))
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$bt_hci(r0, 0x0, 0x2, &(0x7f00000000c0)=""/9, &(0x7f0000000100)=0x9)
mount$9p_rdma(&(0x7f0000000080), &(0x7f00000000c0)='./file0\x00', &(0x7f0000000100), 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0xb635773f06ebbeee, 0x8031, 0xffffffffffffffff, 0x0)
mkdirat(0xffffffffffffff9c, &(0x7f0000000080)='./file0\x00', 0x0)
mkdirat$cgroup_root(0xffffffffffffff9c, &(0x7f0000000000)='./cgroup.net/syz1\x00', 0x1ff)
mount$fuse(0x20000000, &(0x7f00000004c0)='./file0\x00', 0x0, 0x7a04, 0x0)
chdir(&(0x7f0000000100)='./file0\x00')
creat(&(0x7f0000000300)='./bus\x00', 0x0)
rename(&(0x7f0000000180)='./bus\x00', &(0x7f0000000380)='./file0/file0\x00')
mremap(&(0x7f0000ffe000/0x2000)=nil, 0x2000, 0x1000, 0xf, &(0x7f0000fff000/0x1000)=nil)
clone(0x7310d980, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet_icmp(0x2, 0x2, 0x1)
shutdown(r0, 0x0)
exit(0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
getsockopt$IPT_SO_GET_REVISION_TARGET(r0, 0x0, 0x43, &(0x7f00000000c0)={'IDLETIMER\x00'}, &(0x7f0000000080)=0x1e)
clone(0xd00c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
symlink(&(0x7f0000000040)='.\x00', &(0x7f0000000000)='./file0\x00')
rmdir(&(0x7f0000000180)='./file0/file0/..\x00')
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
sendmmsg$inet6(r0, &(0x7f0000003b00)=[{{&(0x7f0000000000)={0xa, 0x4e20, 0x0, @dev}, 0x1c, &(0x7f00000023c0)=[{&(0x7f0000000100)="4a983ace43a57f71e8c9dd40ece85f2ce467754a56043a50b81e56de9ec1d3817c51fc975630044dd79832cb704023fbffd5f88ea8e1aa81debf5246282bccb39a0c13c016183a8a5c43d32cac932bb62846cc153b15b9b07f80bbf9551014624cfe62e47c9cd80ee41c1c5adf7e223f4826f33daa77a84d9fae2e91155735a7473fd297377b120109306e541a90465376802611523164902debd33b8e4de7676be31c8874bd59d08886c3399d59db6d919635dd95cdbe473edac45a87781f9342f3d8752dab72ee91af3f983c1c196631f05aef393245b272bc3603d064b54a67b6eab62f23098adb2abb9c04c141cc1247a9c10e673495dd5381729e27021a0c520743795eef74e888f0897a0310d83e63f4c838d61a2046f31312e6f038231583731e2c5ead7d2a83b7a5ae3a209e4c5ae53225dcb08b6d68911f9802d68b5d18786d0e309b95dd2a34b534c8eb522b798bea9c88f03d50f82168e501b04ff8d0304aef395f4f56dd319c7346edb2aa3c083e2b84bed1ad4579404392d68970a6c98e4a2823a397ae05d389e4636eb99836968d67a2103b2594d502febbab0c45ce8d14283161d48c9143fca5934e0bb2db9632d323c6e2a168e540c513389f8969891721bedb9fb7a6c0bd0523fd0a80c32e95ed6e17c01cea081f18ce74fab5297eba2474c8fcce710504a30d430144f528301f042eee861530fbbf617e8f2936e0a226f0bf217555cbd3fc5b2c6effcd42979f4849d62962d53399d97d737b5b81025e549933ff10a01f5882bb430547f1b21db9811ed8046fe1a143de0f4621ca9ef349779d499cf35b19b23bf367e94a891840f2c97a125bee126336ea6548ce8f1cfad9072d380d51442e94b66d1a864f2989176dacbbcf5e47c86a03a7d35e6e2fd19569df10e981668ed55e6935726ff5aa0162e9f42efe5697fe7bcd594cafc8f1ff540e5141692eb4e3f1d85b09a3cdd8b0e28f85e6531007e04d6d9658330ae207e2fffc486362045b5e92c71401c06f5e89ce48bc17f3ad6a3ddd17fbaaa7ef77472f710598666b44f421c77bb01eb4a2b3d009ff52eb0a78261ecc69e58af08cb5b1bfb289d62df315c407cbae606d2f99e5cfdc43f2909e1f4775fbee0a8ddaf815c19ff24d000a6ecd6b94b34810c6566a03e104d4a2ea722dbb7615bca895abda750931236e5625b8082f31d38927cb1fbb3a1b5d33be8efbab62a9798c297bba038d5effeba199df2e80a5a93d40cc1a401efe8a684318298b46e5fca6b39b25516a097fe615c4e9daf93b1fbb3ffefbffa9ec57f2ebeed7d643a20b8a872c969e29f6e9a8c37807b7f6d49f0642f775773642197dacb421234a42021d2e8d132cf92d41c2b21c55acc2dcf2db8f4d5c4040db3aeb74022820113c2464060d8fa1d2a07b6166037374a5634db82a9ddc0574035412788ac2aedba4e180892e451305a4e63a152b02b5fe3d0850d32e1d97ff789c0fcc7c7e4d1fed6d3b0de740a58ce266f09b1ee3750ae95c214ef6e31ee1b7c0c1fbc0974c25b927100d95c76f07841117a42e6756ec9619eb961bc00a70c992a2fd7f5c99aa562292aa9c20ea8e37768aa1be5b3d4302f1c26352de903726387c5e1df56031def9427140ddba5f778146ab485f1b3e649f7dc58ca094330b4d5df65be5b378ec859759889bee5ac29e92e552d7225f5be2fd54023ff18d3b5a1f9328c0d54612cf780d703083c97b71ae341acea87274cc1a8bdb49e28a41f6abc65af0b41bf1ae88456f5d45c2a669104add389a9007e726c7f002be9f28387add345dbf246e554c0c39ee1d6ea3b8380351e5cc51a97c926219b42c6616abc78783c25be9081f1033d32a66fa627cf86d5f44924ee05cccc5db30fcbd7f8fd12d7936df20b2d8d4687e68e8b7482305b61c8713fd446ecac4dd9ec6c2f28f893ec3fbd13c7787a6a048db739447f686b7c28728480bc3512c8f6bfd2e81c714a9e933572c4d8fccbb79147e82842743ea2c898a1379efc098dc44bfbe331a6cb9d904d630fb0c3428a002a9c5567aa382af6315e44c75ac627844d50ab8f55d21e0fe3acbd8e80c790e05cc508d27fbb97257407bf050104528b04f8153e4b4c65ea6158e6f1b80b5b02cb3ec09280b6fb034e9ef3aaced87406e0632199954fe21b47c3d9b916e525e8e2bb589d0fbf2e4eb5394c374974482b12560882165ec25742f0f54c7ab207545ae59a8d95ee3dc308437dbe4ae5d8f136a41354ef1fb968055cf734463da557c029d39e1f6d5a0eddb426d6a03714583dcec0b5a9b8f79cb62b7dfc5f7f809f0a46aa92da5b44179f90e45c75ad4bedfdaeb3056c3033610a615b646d1386d35f8e0957263338d5f3e40886edec29f97f535434623038279ae433b8afc555f4b4b43d04bbf9b2130f16d4ae6a350e9aa727258f87823df490e6b06df20488363d30d8042971b7d1e413cbd9b4b5f579e3df388098619aed3881f7432c15023e30fc6039a8ffd2679e57a2dece00c554560bf39671e675563add675d20676222fc9c9d0ac7ea7d54413d81f99e571914f1c670fa96424b3ba3266633c820774067aa0d136557fbbcde7a3db6f52866957d0ef50539fb8d4914b75e663291ebbd5d511aa4598cf3330105cb6990b53dc131c2bd5b3be2a3260114a97ba773c6063f27fa4435dec74461d0c34712711ad3556ba1670b429779c62022e55fb2dff24da3d6828c42247c85f8672830cac686af3786f669fdc696c7ff0195dd75c2a874e3e5948fd3f8a9576f3b1054ed9f3260e3ed3598a8296f26c68f4f086ec2b5d3eba69deb03aaa0791192f580bb0b4819255d495ec814fd44a8fcc8957c473d521c11acbf4b2a239e7020f34cb9d95e2a243e06f536901fca9e0876215800eff0bb63addeacdcb6a47f94c21347fbaf83981bf2dcd5ea51010ecc02c12f4765da0aa8e0b63d77bf43522b67382abb8c388ce8486fdb325d15b56be5c2503b31d44da175bc34fedbd818622d3ea6a8cc207efaafe43ba83af6dacee25ec69e107c8a6fcd0a5a21685f5d5be2ffef9ed85c47aa009308361a35671d3df919d575b8fff9375549159fbd43ee7dcb28b416a650f8b59235aa29d98d2cbd55d62296b072c59663b40c945afed5a1cce1de007735998f4d7760aed4c55b8a9fe57ca9790b86b780ecbfc8f01b1304ec544cc06486f102ce2e64b501c4860009079e5ad6127c738c97ae08d15d2fdaa4cf65c690b64083dcf762b517c238c0c892be9e126728fa00a7499f36af4c62bb2ff2b817164afb200c916d9d64ed86c6ab8e38d05bac457494a7ce74cce880c8b6a93b45029b4142411aa0413a95578c4a68ec8898b681483232d0a4b89429f687b4913dd7e4236c218b39a93f65828490d99568bd89b29a619ff97888d46d4967db9fe6ed84f60f5595c48ec5812413772ff672596b1f689eb50484731e12665fbf5274bdf52322e2822a471baa111fb6d60167c0f552429578bb3fa4f6cd72839754596dce018916ea1db8f47a7d1582e80d286189df6e14bbc27f980c7e0baac8d2151a72920a186685f912f062b7bc568a743b198d54f29aeaad78fcb6dacf9e3d925d18e671432e7607d2e2fd506777daaa764ac057d590ca2452d29dae57c51f81ff9cc1922a067055b5ee44d18dd5c6a3cda435fedabd0a8f205a6e41c1f3975e2da50b0d8981fc7761858dfd1ff40b4756bf5a8c6c74b910623dd20165989300b6afabc7d8520d7ebd3eb5820777a54ca175b2c038d83fb94df188da70405d3d74a0569884b290f8348ba58c90ddb5aa9fde3b237704d926ba2cc95ecef368736aa1cafaaafbcac9e068e062a19193902623a16821be6a32b938e8c7b872f6805270e98deac07f860e87458400a413eeb92ad86baa7f396b14e76a5cb46b4444d2722cd7942ef1299cd60597d11071c20e41c7db1fa813c9d790cb9b6bc2fceb7cf0bbc2ff2c6477da26428cfd565eb067f37caa7dd77964dd84afef2e67ef1043f59837c884759f7fed72732b014e2b08b71dbe51f3068b78cacaa91e4e4a5da4c07254538fcedf05305375d02", 0xb48}, {&(0x7f0000001200)="83", 0x1}], 0x2}}], 0x1, 0x0)
prctl$PR_GET_SPECULATION_CTRL(0x3, 0x0, 0x0)
syz_mount_image$tmpfs(0x0, &(0x7f0000000100)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
lsetxattr$trusted_overlay_nlink(&(0x7f0000000000)='./file0\x00', &(0x7f00000001c0), &(0x7f00000000c0)={'L+'}, 0x16, 0x0)
getxattr(&(0x7f00000004c0)='./file0\x00', &(0x7f0000000500)=@known='trusted.overlay.nlink\x00', 0x0, 0xfffffffffffffca0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x3, 0x32, 0xffffffffffffffff, 0x0)
r0 = openat$urandom(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
read(r0, &(0x7f0000000000), 0x2000)
close(r0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000080)='/proc/self/exe\x00', 0x105000, 0x0)
read(r1, &(0x7f0000000000), 0x2000)
close(r1)
munmap(&(0x7f0000000000/0x2000)=nil, 0x2000)
clone(0x200ce804bfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = memfd_create(&(0x7f0000000900)='#\'%nodev\x00\x7f\xe5\xd0ql\x86\xcd\xe6\x14\x93\xb0\x7f_,y<', 0x0)
write(r0, &(0x7f0000002000)='/', 0x1)
mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x4, 0x11, r0, 0x0)
execveat(0xffffffffffffffff, &(0x7f0000000000)='\x00', 0x0, 0x0, 0x0)
exit_group(0x0)
r0 = epoll_create1(0x0)
r1 = epoll_create(0x5db1)
epoll_pwait(r1, &(0x7f0000000180)=[{}], 0x1, 0x101, 0x0, 0x0)
epoll_ctl$EPOLL_CTL_ADD(r1, 0x1, r0, &(0x7f0000000080)={0xa25b88c46e9fda5f})
pipe2(&(0x7f0000000040)={0xffffffffffffffff, <r2=>0xffffffffffffffff}, 0x0)
epoll_ctl$EPOLL_CTL_ADD(r0, 0x1, r2, &(0x7f0000000140))
epoll_ctl$EPOLL_CTL_MOD(r0, 0x3, r2, &(0x7f0000000000)={0x200c})
epoll_wait(r1, &(0x7f00000000c0)=[{}], 0x1, 0xab)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000000)={0x2, 0x4e21, @local}, 0x10)
connect$inet(r0, &(0x7f0000000180)={0x2, 0x4e21, @local}, 0x10)
recvmmsg(r0, &(0x7f00000021c0)=[{{0x0, 0x0, 0x0}}], 0x1, 0x0, 0x0)
setsockopt$inet_tcp_int(0xffffffffffffffff, 0x6, 0x0, 0x0, 0x0)
recvmmsg(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
connect$inet(0xffffffffffffffff, 0x0, 0x0)
sendto$inet(r0, &(0x7f0000000480)="fbbf0b5044e308cb7bd572aa2b42e9678bcf30eff9f3aed14dc94a114bd2b45956aebe2b108a87e865501a5f9e0383611afdd3f8bac3d5cfd7772a3ab48d0ba4b600731e357e38716c449fae7c28548a4f2105f44b8fd9b33041270ae01f1a405e3f650fc3b0926d481c364fca00000000000000006d3a3ede9fc738b8d86209c060161d5ddb5fcf3d09001117cdb9d055aa2d89fe3458720724853a876448d4a1fe9ef0569ad98a05ab5df763923b4e2c576e00000000000000000000000000000000002090666159e3075f7244cf4ec3d7814c0c934f44e200219e6dd7bc23397d5f2f2c76a5baddd0fd8c340362691ef226f7a0ac51b74b6be5ed6737948514cd466943d08eeb3895b80499da2b209da4f3ec5e3744ce3e863b0e04d0ec2f39edf50b6e08c4b47e448a35414763d687fbe3792ee15c5b9791310a346472723c100bf77a310b0ced8004b5ac6d48c40439f512e8ef34a53d65f55563f68136a577736ca5f6f66e01ef4ec2cdc8db34f6de50713adaa3f70189958263fddc1314f8a28ccdef6e1390c5fbaeadc3035d019f0dc75de307de6c0d010000000000000027083d1d5b4b013c503b863b560688d94de886b6dc73d5da2dfeff4bed1a49a975a6c8dbb480e4415ddca5657a5a8e3b111015499e952bb5e8d8f60de3d688df7802c6e8b27b31fac4e199038b79a3999920e634a5af162a9581b0e6647e410700246548234acacf9cb43ab332a37bbc926c39897395c974fda31536be523bf4260300730ae6136fecae5f0fa6ab2df8d98128b24589e3bbe5230e07dc5e0d65cc397e3f8204d48e59e8e294a6d7008ba8fba28cd5009fe1a7c569ce740078bf1c7389a6ba0f89257f0eac417aac0d2d89b05ee5dafa2f1d936c87264d077b2c0d5abdbc64ce943f895dd4c2e9dd7393543d89b00dc6b3a25045d4ec932366c67dfad087fa8dc104644828440bdf67dd97ebccb3bd", 0xfffffea5, 0xc000, 0x0, 0xfffffcef)
r0 = memfd_create(&(0x7f0000000180)='-B\xd5NI\xc5j\x9appp\xf0\b\x84\xa2m\x00\v\x18\x004\xa6Ey\xdb\xd1\xa7\xb1S\xf1:)\x00\xca\xd7Uw\x00\xbc\xfa2\xb3\xbb\x8d\xac\xacva}knh#\xcf)\x0f\xc8\xc0:\x9cc\x10d\xee\xa9\x8b\x066\xb8G\xd1c\xe1$\xff\x97k\xde\xc5\xe96\xddU)\xc98M\xcd\xfb\xcc\x82n=\x7f=\xcdJx\xaa\x8f~\xb90a\xa9\xb2\x04K\x98\x93=\xabQ\xf7\x05\x1d\xa1\xce\x8b\x19\xea\xef\xe3', 0x0)
r1 = dup(r0)
write$cgroup_pid(r1, &(0x7f0000000040), 0x12)
mmap(&(0x7f0000000000/0x1000)=nil, 0x1000, 0x4, 0x11, r0, 0x0)
r2 = syz_open_procfs(0x0, &(0x7f0000000080)='fd\x00')
fchdir(r2)
sendfile(r0, r1, &(0x7f0000000100), 0x220)
mkdir(&(0x7f0000000100)='\x00', 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x580000b, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
mkdir(&(0x7f00000003c0)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x0)
symlink(&(0x7f0000000140)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38//../file0\x00', &(0x7f00000002c0)='./file0\x00')
unlink(&(0x7f0000000300)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38//../file0\x00')
r0 = syz_open_procfs$namespace(0xffffffffffffffff, &(0x7f0000000000)='ns/pid\x00')
fstat(r0, &(0x7f0000000140)={0x0, 0x0, 0x0, 0x0, <r1=>0x0})
setuid(r1)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r2 = openat$tun(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
sendfile(0xffffffffffffffff, r2, 0x0, 0x0)
rt_sigreturn()
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
perf_event_open(&(0x7f000025c000)={0x2, 0x70, 0x15}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
lseek(r1, 0x3, 0x1)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
openat$full(0xffffffffffffff9c, &(0x7f0000000380), 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
fallocate(r1, 0x0, 0x102000006, 0x6)
prctl$PR_SET_MM(0x23, 0x2, &(0x7f0000ff5000/0xb000)=nil)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
clone(0x6900, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = fork()
ptrace(0x4207, r1)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r3 = gettid()
r4 = gettid()
tgkill(r3, r4, 0x24)
r0 = socket$netlink(0x10, 0x3, 0x0)
fstatfs(r0, &(0x7f0000000000)=""/240)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
r1 = socket$packet(0x11, 0x3, 0x300)
ioctl$sock_SIOCGIFINDEX(r1, 0x8933, &(0x7f0000000100)={'lo\x00', <r2=>0x0})
setsockopt$inet_mreqn(r0, 0x0, 0x20, &(0x7f0000000140)={@multicast1, @multicast1, r2}, 0xc)
openat$cgroup_ro(0xffffffffffffff9c, &(0x7f00000000c0)='memory.events\x00', 0x26e1, 0x0)
r0 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000140)='memory.events\x00', 0x7a05, 0x1700)
r1 = creat(&(0x7f0000000040)='./bus\x00', 0x0)
fcntl$setstatus(r1, 0x4, 0x6900)
ftruncate(r1, 0x800)
lseek(r1, 0x0, 0x2)
r2 = open(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
sendfile(r1, r2, 0x0, 0x8400fffffffa)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
write$cgroup_type(r0, &(0x7f0000000000), 0x248800)
syz_emit_ethernet(0xce, &(0x7f0000000300)={@broadcast, @local, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "e85a01", 0x98, 0x3a, 0x0, @private0, @local, {[@hopopts={0x0, 0xa, '\x00', [@generic={0x0, 0x44, "15356ecc8aa456f4158be29b514122392e798b78062c3c2ee7ec9d4ef86644fc3a0d0ca0fbb4bb1157329d95288e8bf7dcf65a0122dc6200b41b38b6c6d6f7f26525ee9d"}, @pad1, @padn={0x1, 0x6, [0x0, 0x0, 0x0, 0x0, 0x0, 0x0]}]}], @param_prob={0x4, 0x0, 0x0, 0x0, {0x0, 0x6, "e17f46", 0x0, 0x0, 0x0, @mcast1, @mcast1, [@fragment]}}}}}}}, 0x0)
io_submit(0x0, 0x1, &(0x7f0000000540)=[&(0x7f00000000c0)={0xffff7f0800000004, 0x0, 0x0, 0x1, 0x0, 0xffffffffffffffff, 0x0}])
io_setup(0x1, &(0x7f0000000300)=<r0=>0x0)
r1 = openat$tun(0xffffffffffffff9c, &(0x7f0000000480), 0x103802, 0x0)
ioctl$TUNSETIFF(r1, 0x400454ca, &(0x7f00000000c0))
io_submit(r0, 0x1, &(0x7f0000000500)=[&(0x7f0000000000)={0x0, 0x0, 0x0, 0x1, 0x0, r1, 0x0}])
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$packet(0x11, 0x3, 0x300)
r1 = dup2(r0, r0)
sendto$packet(r1, 0x0, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f00000002c0)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x0, 0x0)
chdir(&(0x7f0000000200)='./file0\x00')
rename(&(0x7f0000000000)='./bus\x00', &(0x7f0000000300)='.\x00')
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
set_mempolicy(0x4004, 0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
setresuid(0xee01, 0xee00, 0xee01)
getresuid(&(0x7f00000000c0)=<r1=>0x0, &(0x7f0000000100)=<r2=>0x0, &(0x7f0000000140))
setresuid(r1, r2, 0x0)
tkill(r0, 0x4)
exit(0x0)
getgid()
read$FUSE(0xffffffffffffffff, &(0x7f0000000000)={0x2020}, 0x2020)
sync()
write$FUSE_INTERRUPT(0xffffffffffffffff, 0x0, 0x0)
clone(0x54041bc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
rt_tgsigqueueinfo(r0, r0, 0x16, &(0x7f00000000c0))
ptrace(0x4206, r0)
ptrace$cont(0x7, r0, 0x0, 0x0)
r1 = gettid()
rt_sigqueueinfo(r1, 0xa, &(0x7f0000000040))
exit(0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmmsg(r0, &(0x7f0000001700)=[{{0x0, 0x0, 0x0}}, {{&(0x7f0000001580)=@xdp, 0x80, 0x0}}], 0x2, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet(0x2, 0x4000000000000001, 0x0)
r1 = dup(r0)
setsockopt$sock_int(r1, 0x1, 0x0, &(0x7f0000000000), 0xfffffe25)
rt_sigreturn()
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = socket$inet(0x2, 0x3, 0x6)
bind$inet(r1, &(0x7f0000000000)={0x2, 0x0, @rand_addr=0x64010101}, 0x10)
tkill(r0, 0x6)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setpriority(0x2, 0x0, 0x0)
r2 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r2, 0x6, 0x210000000013, &(0x7f00000000c0)=0x100000001, 0x4)
bind$inet(r2, &(0x7f0000000080)={0x2, 0x4e21, @local}, 0x10)
pwritev(r1, 0x0, 0x0, 0x0, 0x0)
setsockopt$inet_tcp_TCP_MD5SIG(0xffffffffffffffff, 0x6, 0xe, 0x0, 0x0)
connect$inet(r2, &(0x7f0000000180)={0x2, 0x4e21, @local}, 0x10)
setsockopt$inet_tcp_TCP_REPAIR_OPTIONS(0xffffffffffffffff, 0x6, 0x16, 0x0, 0x0)
setsockopt$inet_tcp_TCP_REPAIR(r2, 0x6, 0x13, &(0x7f00000001c0), 0xc7)
socket$inet(0x2, 0x0, 0x0)
setsockopt$inet_mreqn(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
setsockopt$inet_msfilter(0xffffffffffffffff, 0x0, 0x29, 0x0, 0x14)
setsockopt$inet_opts(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
sendto$inet(r2, &(0x7f0000000340)="081ca8251afe10b08de8a04095e752d1dfa9182ae2a3ff5101ef83ecb4651da4ecced7f3903f73503be897f4d5bfba51fc5401614736728c36a5063c15cf20ac7f755878674b00cc2c8cf01ad115af", 0x1, 0x4000081, 0x0, 0x59)
sendto$inet(r2, &(0x7f0000000000), 0xffffffffffffff94, 0x0, 0x0, 0x0)
recvfrom$inet(r2, &(0x7f0000000080)=""/8, 0xfffffffffffffd0b, 0x0, 0x0, 0xfffffffffffffd25)
r0 = openat(0xffffffffffffffff, &(0x7f0000000440)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000000)={0x1, &(0x7f0000000140)=[{0x6, 0x0, 0x0, 0x7ffffff6}]})
getrusage(0x0, &(0x7f0000000180))
r0 = open(&(0x7f0000000240)='./file0\x00', 0x102e0, 0x0)
fcntl$setsig(r0, 0xa, 0x11)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
fcntl$setlease(r0, 0x400, 0x0)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
timer_settime(0x0, 0x0, &(0x7f000006b000)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x16, 0x0, @thr={0x0, 0x0}}, &(0x7f00000003c0)=<r2=>0x0)
timer_settime(r2, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
creat(&(0x7f0000000280)='./file0\x00', 0x0)
sendmsg$netlink(0xffffffffffffffff, &(0x7f00000006c0)={0x0, 0x0, &(0x7f0000000640)=[{&(0x7f0000000280)={0x10}, 0x10}], 0x1}, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = epoll_create1(0x0)
r1 = gettid()
epoll_ctl$EPOLL_CTL_ADD(r0, 0x1, 0xffffffffffffffff, 0x0)
r2 = gettid()
tgkill(r1, r2, 0x24)
r0 = socket$unix(0x1, 0x1, 0x0)
getsockopt$sock_int(r0, 0x1, 0x3, &(0x7f0000000180), &(0x7f00000001c0)=0x4)
r0 = inotify_init1(0x0)
fcntl$setown(r0, 0x8, 0xffffffffffffffff)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
fcntl$getownex(r0, 0x10, &(0x7f00000001c0)={0x0, <r2=>0x0})
r3 = getpid()
setpgid(r3, r2)
clone(0x38004100, 0x0, 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f00000000c0)='./file0\x00', 0x103041, 0x0)
fremovexattr(r1, &(0x7f00000001c0)=@known='user.syz\x00')
exit_group(0x0)
rt_sigreturn()
r0 = socket$netlink(0x10, 0x3, 0x0)
sendmsg$netlink(r0, &(0x7f0000001ac0)={0x0, 0x0, &(0x7f0000001a40)=[{&(0x7f0000001980)={0x14, 0x14, 0x1, 0x0, 0x0, "", [@typed={0x4, 0x0, 0x0, 0x0, @binary}]}, 0x14}], 0x1}, 0x0)
clone(0x2006d380, 0x0, 0x0, 0x0, 0x0)
io_getevents(0x0, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
mount(&(0x7f00000001c0)=ANY=[], &(0x7f0000000080)='.\x00', &(0x7f00000000c0)='sysfs\x00', 0x0, 0x0)
mount(&(0x7f0000000100)=ANY=[], &(0x7f0000000080)='.\x00', &(0x7f00000000c0)='sysfs\x00', 0x0, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x11, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r2 = fork()
ptrace(0x10, r2)
ptrace$getregs(0x2, r2, 0x0, 0x0)
exit(0x0)
pipe(&(0x7f00000000c0)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = socket$packet(0x11, 0x3, 0x300)
setsockopt$packet_fanout(r2, 0x107, 0x12, &(0x7f0000000000)={0x0, 0x8000}, 0x4)
setsockopt$SO_ATTACH_FILTER(r2, 0x1, 0x1a, &(0x7f0000000080)={0x1, &(0x7f0000000340)=[{0x6, 0x0, 0x0, 0x9}]}, 0x10)
r3 = socket$inet_udp(0x2, 0x2, 0x0)
fcntl$setpipe(r0, 0x407, 0x0)
write$binfmt_misc(r1, &(0x7f0000000140)=ANY=[], 0x4240a2a0)
bind$inet(r3, &(0x7f00000002c0)={0x2, 0x0, @local}, 0x10)
connect$inet(r3, &(0x7f0000000040)={0x2, 0x0, @multicast1}, 0x10)
splice(r0, 0x0, r3, 0x0, 0x2ffff, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x10012, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
setxattr$security_smack_transmute(&(0x7f0000000040)='./file0\x00', &(0x7f0000000080), 0x0, 0x0, 0x0)
rt_sigreturn()
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mremap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x4000, 0x0, &(0x7f0000000000/0x4000)=nil)
rt_sigreturn()
clone(0x9106300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$unix(0x1, 0x5, 0x0)
setsockopt$SO_ATTACH_FILTER(r0, 0x1, 0x14, &(0x7f00000002c0)={0x0, &(0x7f0000000240)}, 0x10)
rt_sigreturn()
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
recvmsg(0xffffffffffffffff, 0x0, 0x0)
rt_sigqueueinfo(r0, 0xc, &(0x7f0000000040))
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
clone(0x30005100, 0x0, 0x0, 0x0, 0x0)
r1 = memfd_create(&(0x7f0000000000)='.\x00', 0x0)
sendfile(r0, r1, 0x0, 0xfffffffffffffff7)
r2 = gettid()
tgkill(r2, r2, 0x10)
r0 = socket$packet(0x11, 0x2, 0x300)
r1 = dup(r0)
ioctl$PERF_EVENT_IOC_ENABLE(r1, 0x8912, 0x400200)
r2 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
getsockopt$IP6T_SO_GET_INFO(r2, 0x29, 0x40, &(0x7f0000000100)={'filter\x00'}, &(0x7f0000000180)=0x54)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x10012, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mkdir(&(0x7f00000001c0)='./file0\x00', 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
setrlimit(0x0, &(0x7f0000000080))
truncate(&(0x7f0000000200)='./file0\x00', 0x0)
rt_sigreturn()
mknod$loop(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
mount(&(0x7f0000000280)=ANY=[], &(0x7f00000001c0)='./file0\x00', &(0x7f0000000180)='cgroup\x00', 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x0, 0x0)
r1 = openat$cgroup_procs(r0, &(0x7f0000000080)='cgroup.procs\x00', 0x2, 0x0)
write$binfmt_script(r1, 0x0, 0x0)
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
epoll_pwait(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
r0 = open(&(0x7f0000000040)='./file0\x00', 0xc68c2, 0x0)
r1 = open$dir(&(0x7f0000002280)='./file0\x00', 0x0, 0x0)
r2 = creat(&(0x7f0000000240)='./file0\x00', 0x0)
close(r2)
write(r0, &(0x7f0000001400)="bb8f9f640903127a53527c6fbfe65d43b0e0586f2d40c7e7df58cac83420e83662d6e39bb6d5430622431454eedeeaee423d8f210bc3525fa7927c18d5fbc950d59ba94d02e39a6796d6d81ab13c2041136047d8da9375934d00f325499bfe7712208d387d41c31821c2a22d1325b556528e9b790b74053d1ed631c6ec8126d37c87216173138c00cef396868029af5b76bebac5e38b74d8bbc6ae66b6e202b6d505710377710ea7d43edf00e1a4c1c670bb4c263ce777da81abdd5ba5a5c82f67232f9b4d6f21b2e6afa8b38e4642b0daa2acbc0478d89b88e8b2094d4248855e5e81992e60be3afff0f3c3799350615489a901a659abdeca0c615a622ebf64175f990320e0356d4a11ed62eff72b709c23dd65942e8b534d7d775d370c1e435654a2634e6ee3649c4c3bcbe39e866f1eb9972af1a9cef42e701798a53dc92a242aadbac343e1765cf8ea5665e22deda69777e52b1e9e5d3edc022256939e1eee013448294911139d5b9c6241049fecdf9b31f4cdf6cff65d71b5071ef70e1798347c8846aa5b905e83050e3e606986ed3f603d18c5cdeb11cda1ce3abddea9376231af130e19fc7601ec1ab8cb5a7de9e2174547f18225b09a54fc8720dbb91eb69c1be88e601c3c9df4f2758f39a5151951b9c6dbb419ffd783a34c9fad10f201b8724d31865e2fc1fcf48db2be713053b43a0e3943c1e2b08e8a26e946c439d891db2a80b5ffa054bc8d0177d3214056250c61a537d2ec7630fb90395df2cd6aa9c5e573365db580520bdc3f1c6b2be992e1516a4b810a11935dc780699e461d9715f70c6d103ff49bf168f3cff4c0d0d6ba5671fcd2975450b0e1a3474139cb0f2d3476fcd87a8bccbeb2f5f8a821017b879aa5b9d1fafa9f4c429a74cd2da782114d97bf031746a817dd0293c4eddc3a9ecb5afe496f4971474ccd827449409f07cf94dda2e7dbe8520b5afff10a7e3b0b4289a167fc692635b5b7426d9e771b95860b09a3e752c867213c772e48ba30a78feecc7ff26e63e1749c62b52d377ac5cc52db830c965e04ccfcfb24ceacc69cfdc2094124dd27b2d68f699476a2562bab1de117ffc2b0702ee8b0b77f85fecec3a25b37ab7eb06232c9a73b4470f1727c82581d81942abf42d3ea37957927ad3dbd0ebe46678a9f4d25c47acaac83c14210a54b71fed40df017e2be27f01e3e6ea3ea381ca14efd202858e6535ad4f79a8cdf47e385b740a70c14e3651731a45cc0ca4a56f2e903cad0384efcc364caf67e09551d35c682ebc90a9286c4d274ba1b8ced742ede4e55a62db7ef9a96fde2add05b41599fd72a79ba280cc125cb266ef21b7eeafe3ee6aa78eef33fc66454549c3ec298cb683a55a32208cf6ce88b560166159d59e65f30540f62c5437652ea7f2a02f87ca242dd2250f58e75938a9a3ded51ae6598c2aba53287ff5cdaab17be7f4310f8e07e310d52778de79e1c2e4391b254be38910aa1cf6e1ca0e75d24be97d59f7025f16c6fbd549529b74e319c58f50438765ef0e3dc394eeb75dec41b3d80435b17520c97f5cd7ee692fb86d188fb0089fbf737dc1b96b9cc49be23a38a8cd92e5075f349993d7866369dd25eccbe1481477a05fee14e0edd1617921d7fcaa8fdc642e63b64388fa5b63442ff1466b1938d02546750b01ed9c980282ccbe10f204741249aed88c555ed6fdf7f68af9209807a71bbf4f31bef5a4223466da74ba8b034df529aef6ec6610a0d25973961e50e02af22d0ca8be1b9804a5918acbdb536e8f2f441ec9d640ed15133ee747440c86fc4526c9195954528673d25c8390170f3c19cb0b0c30b9e634c7ebae96946ae97c19eabca92226da925d22be37abbe0740938ea899ec42fd529a3b1063981e4c154219df5cf5af60a29b5a8ee530905725a14b28900eae937e705401ba8f632a7bca00d9724a992afdc9ed14aac71b8e3a7ee5ca095888feb195b4e083c3b611a1c2f8d092febe3b9f5f0df61e8d3c31a643c935b76bc1ad4265164e755484beb06610510bc51e8c6da8d71123bbe83a5e4128f41cf5c486d6a60496c300c406f990bee485cbdef794f2663ee66d2b18d8e55210c25c04b1a0c6d9c5f904e72806d2d4f5e5439bcdf146343cdffe4d0d70d42a9959cff9bd50c37cd478b0b0add16aae4dc839b46bca2ac7547144b6a422aed5e2db661bcb31a82bc0fc678e71a6cb090ef772860f3008b4152b5d281033be4a77b367baec3ab8ce7c83c601b11c8199bafcd15161a5454a6a982dbc3c2e3a5172b6a63e4904817075754eccbb0188c9cb2e5da9600f567485014887463b40f189b7ec3c5c0f36d502509e402c285765c78417ce6b3aab130ee79622dcd8ada842233e73a14554ee5e4995e32b3fe4075e247eb9bdeef64d1a7436c9b5782fb2f84f74e4c6e976289729c37b5bb8200a9480d181a6b11f5ec229b818134b8334967ae935ac1d81be4776fae4cb68b6fa330e93b
d8de388b38455d569159bcd166df030a6dfedf28caf4608de7243f5df2c76f6680c301d819dc67d24d2f780432a931700a253b0a5b075195c6dbfd1fe17a1c11a3dffa872a07b877adc66d069fffb6d8326e1998c5a337c3d530250cae335ceffcf81dc438a47e73490d050a053813dbe6674e42c91ae94b4a88144f36adc1b08b4bcd6ddf4058c4e08d4dc83d5fbd843ee27eaf50b297c220350260d9abbeeb6deb921e50cae0ea590cfb6f00fb3c71520f565bb769705e2481ef27cf537d29f163c9fe3d39ed9fd18dc8b0c976cd302283e430807a9d751357f89092532d89fe280c69ad36e3541b5da9dea13fd19d0434c760fe295dfd9b9e63453c7853914c50b1b77ccd4b33c8b1f31fcb1aec040ffe2f9d728d8ea84297bac2e22230ebd1488c503b05b2e433cc37ce9fc123b7d3eb244b4549e9a841e73b664c8f6621ea5a4c9ff9c1da032255311f2c063a682baf4c97e7ba552bf71af4bd64f43872b846d15c65c487089be272cfa24a33f8c50930ea0bc4b089505fb8e97ded0e7e64a5ce897da6b940df4c3dddb8b4871d773f6ebf02058518c55c19aadb0f266caf18777ae68d2bfcfb2225961fcd10538ba664fd053a443320072707533ab761b9397bd0559126b84fe9196463ae50633017ea2d80940311d9c867102b1017f34af1965d8eb61be616a0d40656e2bbb750aee6f74f788c8acb2578e8686f5f8da6a19e979c152b7ee7c7f16902120588e2ff630144b5f929ffc593e946d9717c5968aa16c2d73d689fb5cce117acda3e23e5e0116de1cb6bddfa1a84cefb22c1e2c3753017696f27b9aec5d44f15411247643b84dd4410e784b4eb5b9c68fea671976f9b51c6526e2ddb40659611b0b3bcc7e249c77396fdb8c864ea9318f9de7fd3936fcbdc732c2f8b9556ec9afeb15d5e2df890351d66074d53dddc6e8dba8c91d733623ac95a49eb69c7de37ff2364ccddb01f6500750a012c2acf32a9f6bbd9e92d17ef858fbc34575db414ea42ad87a65b11ae5506469db256c421328f45aec73bdf18562447dc2840bbe9fc84dedd0fe6276fa174f21210d40193530ec7a70a9e60bfd6b00cba4d483be59950f16fa0dbd089b5fd0bb078badfcf42270cc62fe37be22b0d81f755263d74636fa466d2574ca62b58c649936d21e949de73ea45df3acade6609413f56fc218d6f11947bf1fd629d38d8acce90ee3c1a51117601ae126bec537e3e76ce7ecc53cfeefee8aaa104aa853a65aadf27bac9849ee0122793b11cd8e4f32fb07afa39e439cb738c30a6df958160aac15c26032a61aaf614b2e6601581fb0ab8d115e045005dee3df8ea42643cbcbb0b111130c42c94b7e874cfee50d5c2eb13b3a38c417fce9d740b7d43120431a7d44bc3934ee87b112401cbee3553837b6a0b4b3faaceb46eca4064301c8060870350e256ac9d5557f674536b9d9717a82fa211e7dfb52ca410d8a4f2f6b733c2a082f247538a6c40f56cf64204e62515db692f32733ff6f4b0787ef305d5e80881375467ae603ac3600e688c42f89a4fea4fdda09a8d59e19457c654a98c8129a8c65bae3310af2b170729e18400c915d0a2f4c4fc51747846630b95ec181228bda7ef48ad389815aa82de6c4a3b0746c28c01f9ec697ff17f095e1cf9d2fe78fdab9cbb1ca7aefc8af6a0cd98444735fad79e394a8f9c2fd358c3cde4ca6d57a620d0bb52dc6ff93f034b596f407c21511cc2c5ec8b0ede7f5c0acca61939ac7f2cad820c15133e69507eaa739e9a88936f4a74f0ad1574a1591f31f582a76157d89805cb3ba5e8d10509bf7a08e928653b4f05ba797a06765b74c8759fc34178624c08a2b99e59bcff5d33af2b0f9176b56c35f4da31c751a4c2fd88a1997cd9fc9bbf78220731d4c8cea23be1fd29c36b34d8458b7021ebcfafdc7e54096e517da3eb684298f742532d776164d9c9658e5faca0a3b08afca1bc27ed357884f49fe51bd0c38057f4288f1917e36e3865310b5eed140aef6150ae363293cf7467abd5e06cd7af5e2c49e7c5253a1155741e380bccb023a0faed93d9a64512d72436f1ef4ac0904a413e45164c23413bd57274a0a951c41a9a43aed094d4ea5c480ce64663cc9d36723179e2b19a48e9277a6591bbd888a06e0f2f142cd4495be4ba7274d69ba32a2788b935b2f18c5f336cb9de062829e2e0bb476efb36c3f53a766c14314f31637a464ac59d378ad7f51bef8d88715613653a427038e0d3e4dc3948bb1d70eb55c91c92f7510b1c0387253b458d2c90d17238f9eb239c680179a6c1e0759be367963e3b4d7395fd3911626582a094e6a8d0e746242f94267a4c57d5b2360ce3a6f7f3e3c10e124a54ed24d1585bc7b731cf731c94ee00ebf070b999b9bf28d76bfec9dcf12fcce2b9c4444c706ec6b943b5f39e9151a764ba1cd0cd6c1c7cdc3aa824cf17da705be27a18fbee41be39d6ae4dd4312f5f4bfee2c5bb21d941666f9d79b0f80c9b1bace84a05d2b0e3b
e1c3fd04d72b4b0124595c435813969d413960fddc858730a433383f3bc0472cb7683ea569e001f00000000", 0xe00)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
sendfile(r0, r2, 0x0, 0x7)
sendfile(r0, r1, 0x0, 0x11f06)
r0 = socket$netlink(0x10, 0x3, 0x0)
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
r2 = fcntl$dupfd(r0, 0x0, r1)
ioctl$PERF_EVENT_IOC_ENABLE(r2, 0x8912, 0x400200)
socket$unix(0x1, 0x0, 0x0)
bind$unix(0xffffffffffffffff, &(0x7f0000003000)=@file={0x0, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0xc)
r3 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000080), 0x80000000000a01, 0x0)
ioctl$TCSETSW(r3, 0x5403, &(0x7f0000000240)={0x0, 0x0, 0x0, 0x0, 0x0, "0000000000000000000000000300"})
write$binfmt_aout(r3, &(0x7f00000000c0)=ANY=[], 0xffffff78)
ioctl$TCSETS(r3, 0x40045431, &(0x7f0000000200))
r4 = syz_open_pts(r3, 0x0)
readv(r4, &(0x7f00000001c0)=[{&(0x7f0000000280)=""/4101, 0x1005}], 0x1)
ioctl$TCSETSW(r4, 0x5403, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x357e9682})
read(r4, &(0x7f00000000c0)=""/19, 0x8)
mlock2(&(0x7f0000800000/0x800000)=nil, 0x800000, 0x0)
msync(&(0x7f0000ffa000/0x3000)=nil, 0x3000, 0x2)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$inet6_tcp_int(r0, 0x6, 0x1, &(0x7f00000009c0), &(0x7f0000000a00)=0x4)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
shmget(0x1, 0xc00000, 0x0, &(0x7f0000000000/0xc00000)=nil)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
fcntl$lock(r1, 0x6, &(0x7f0000002000))
fcntl$lock(r1, 0x7, &(0x7f0000000100)={0x2, 0x0, 0x0, 0xb5e})
r0 = semget$private(0x0, 0x20000000102, 0x0)
semop(r0, &(0x7f0000000000)=[{0x0, 0xffff}, {}], 0x2)
semctl$GETNCNT(r0, 0x0, 0xf, 0x0)
r0 = socket$inet6(0xa, 0x400000000001, 0x0)
close(r0)
r1 = open(&(0x7f0000002000)='./bus\x00', 0x6042, 0x0)
ftruncate(r1, 0x2008002)
sendfile(r0, r1, 0x0, 0x200fff)
r2 = open(&(0x7f0000000400)='./bus\x00', 0x18183e, 0x0)
mmap(&(0x7f0000000000/0x600000)=nil, 0x600000, 0x7ffffe, 0x11, r2, 0x0)
read(r2, &(0x7f0000000180)=""/19, 0xfffffe47)
r0 = socket$inet(0x2, 0x4000000000000001, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x80000000000002, &(0x7f0000000040)=0x76, 0x4)
bind$inet(r0, &(0x7f0000000280)={0x2, 0x4e23, @multicast2}, 0x10)
ioctl$INCFS_IOC_PERMIT_FILL(0xffffffffffffffff, 0x40046721, 0x0)
r2 = socket(0x11, 0x800000003, 0x8)
bind(r2, &(0x7f0000000280)=@generic={0x11, "0000010000000000080044944eeba71a4976e252922cb18f6e2e2aba000000012e0b3836005404b0e0301a4ce875f2e3ff5f163ee340b7679500800000000000000101013c5811039e15775027ecce66fd792bbf0e5bf5ff1b0816f3f6db1c00010000000000000049740000000000000006ad8e5ecc326d3a0dffc2c654"}, 0x80)
setsockopt$SO_ATTACH_FILTER(r0, 0x1, 0x1a, &(0x7f0000000480)={0x1, &(0x7f0000000100)=[{0x6, 0x0, 0x0, 0xe7}]}, 0x10)
sendto$inet(r0, 0x0, 0x0, 0x200007fd, &(0x7f0000e68000)={0x2, 0x4e23, @local}, 0x10)
sendto$inet(r0, &(0x7f00000012c0)="0c268a927f1f6588b967480941ba7860ac5cf65ac618ded8974895abeaf4b4836af922b3f1e0b02bd60da03059bcecc7a95425a3a07e758044ab4ea6f7ae0ed88fecf90b1a7511bf746bec66ba", 0xfe6a, 0x20c49a, 0x0, 0x27)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
fcntl$lock(r0, 0x7, &(0x7f0000000040)={0x0, 0x0, 0x81, 0xfffffffffffffffa})
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0, 0x12, r0, 0x0)
clone(0x0, &(0x7f0000000300), 0x0, 0x0, 0x0)
r1 = gettid()
r2 = gettid()
tgkill(r1, r2, 0x24)
r0 = socket(0x10, 0x2, 0x0)
write(r0, &(0x7f0000000280)="1c0000001a009b8a140000003b9b301f00"/28, 0x32)
recvmmsg(r0, &(0x7f0000003800)=[{{0x0, 0x0, 0x0}}, {{0x0, 0x0, &(0x7f0000000600)=[{&(0x7f0000000100)=""/24, 0x18}, {&(0x7f0000001380)=""/4096, 0x1000}, {&(0x7f0000000480)=""/168, 0xa8}, {&(0x7f0000000540)=""/130, 0x82}], 0x4}}], 0x2, 0x0, 0x0)
unlink(&(0x7f0000000500)='.\x00')
getitimer(0x946a67b2a657f4cc, &(0x7f0000000180))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000180)={0x1, &(0x7f0000000140)=[{0x6, 0x0, 0x0, 0x7fff0000}]})
fchown(r1, 0xffffffffffffffff, 0xffffffffffffffff)
mremap(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0x3000, 0x0, &(0x7f0000ffc000/0x3000)=nil)
madvise(&(0x7f0000ffa000/0x2000)=nil, 0x2000, 0x64)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = socket$nl_route(0x10, 0x3, 0x0)
getpeername(r1, &(0x7f0000000100)=@pppol2tpv3in6={0x18, 0x1, {0x0, <r2=>0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, {0xa, 0x0, 0x0, @local}}}, &(0x7f0000000040)=0x76)
mmap(&(0x7f0000005000/0x3000)=nil, 0x200000, 0x0, 0x12, r2, 0x0)
syz_emit_ethernet(0x3a, &(0x7f0000000080)={@broadcast, @broadcast, @void, {@ipv4={0x800, @udp={{0x8, 0x4, 0x0, 0x0, 0x2c, 0x0, 0x0, 0x0, 0x11, 0x0, @rand_addr, @broadcast, {[@rr={0x7, 0xb, 0x9, [@remote, @multicast1]}]}}, {0x0, 0x0, 0xc, 0x0, @gue={{0x1, 0x0, 0x0, 0x0, 0x0, @void}}}}}}}, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f00000023c0)={0x2, &(0x7f0000002340)=[{0xb1}, {0x6}]})
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = memfd_create(&(0x7f0000000900)='#\'%nodev\x00\x7f\xe5\xd0ql\x86\xcd\xe6\x14\x93\xb0\x7f_,y<', 0x0)
mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x0, 0x11, r0, 0x0)
socketpair$nbd(0x1, 0x1, 0x0, 0x0)
rt_sigreturn()
setuid(0xee01)
r0 = socket$netlink(0x10, 0x3, 0x0)
getsockopt$sock_cred(r0, 0x1, 0x11, &(0x7f0000caaffb)={0x0, 0x0, <r1=>0x0}, &(0x7f0000cab000)=0xc)
setregid(0x0, r1)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
rt_tgsigqueueinfo(r0, r0, 0x15, &(0x7f00000000c0))
ptrace(0x10, r0)
ptrace$cont(0x9, r0, 0x0, 0x5)
set_mempolicy(0x0, &(0x7f0000000280)=0x1, 0x0)
set_mempolicy(0x0, &(0x7f0000000000), 0x8001)
capget(&(0x7f0000001a80)={0x20071026}, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
readahead(0xffffffffffffffff, 0x0, 0x0)
rt_sigreturn()
r0 = signalfd(0xffffffffffffffff, &(0x7f00000001c0), 0x8)
mkdir(&(0x7f0000000140)='./control\x00', 0x0)
close(r0)
r1 = inotify_init1(0x0)
fcntl$setstatus(r0, 0x4, 0x2c00)
r2 = gettid()
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000140)={<r3=>0xffffffffffffffff})
getsockopt$sock_cred(r3, 0x1, 0x11, &(0x7f0000caaffb)={0x0, <r4=>0x0}, &(0x7f0000cab000)=0x1)
r5 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x180000f, 0x12, r5, 0x0)
preadv(r5, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setresuid(0x0, r4, 0x0)
fcntl$setown(r0, 0x8, r2)
rt_sigprocmask(0x0, &(0x7f0000000000)={[0xfffffffffffffffd]}, 0x0, 0x8)
rt_sigtimedwait(&(0x7f0000000040)={[0xfffffffffffffff8]}, 0x0, 0x0, 0x8)
inotify_add_watch(r1, &(0x7f0000000180)='./control\x00', 0xa4000960)
rmdir(&(0x7f0000000100)='./control\x00')
clone(0x2e380, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mknod$loop(&(0x7f0000000200)='./file0\x00', 0x0, 0x1)
mount(&(0x7f0000000280)=ANY=[], &(0x7f00000000c0)='./file0\x00', &(0x7f0000000100)='cgroup\x00', 0x0, 0x0)
utime(&(0x7f0000000000)='./file0\x00', 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
tkill(r0, 0x3a)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x4000000206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
ioctl$TCSETSW(r0, 0x5403, &(0x7f0000000040)={0x0, 0xffffffff, 0x0, 0x0, 0x0, "00020000000000000000001100"})
r1 = syz_open_pts(r0, 0x2)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
write$binfmt_aout(0xffffffffffffffff, &(0x7f00000002c0)={{}, "d7358de3b6c58c0d"}, 0x28)
write$binfmt_elf32(r1, &(0x7f0000000080)=ANY=[], 0x43a)
exit_group(0x0)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$fuse(0xffffffffffffff9c, &(0x7f0000002000), 0x2, 0x0)
fchown(r0, 0x0, 0x0)
r1 = gettid()
rt_sigqueueinfo(r1, 0x8, &(0x7f0000000040))
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setitimer(0x0, 0xfffffffffffffffd, 0x0)
rt_sigreturn()
mkdir(&(0x7f0000000180)='./file0\x00', 0x0)
mount(0x0, &(0x7f0000000140)='./file0\x00', &(0x7f0000000000)='configfs\x00', 0x0, 0x0)
mount$overlay(0x0, &(0x7f0000000200)='./file0\x00', &(0x7f00000000c0), 0x0, &(0x7f0000000240)=ANY=[@ANYBLOB='lowerdir=.:file0'])
r0 = open(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
lseek(r0, 0x0, 0x4)
r0 = syz_open_procfs(0x0, &(0x7f0000000940)='ns\x00')
symlinkat(&(0x7f0000000040)='./file0\x00', r0, &(0x7f00000000c0)='.\x00')
clone(0x54041bc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
r1 = gettid()
tkill(r1, 0x14)
ptrace(0x4206, r0)
ptrace$cont(0x18, r1, 0x0, 0x0)
mkdir(&(0x7f0000000000)='./file0\x00', 0x204)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setxattr$incfs_id(&(0x7f0000000140)='./file0\x00', &(0x7f0000000180), 0x0, 0x0, 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
bind(r0, &(0x7f0000000080)=@in={0x2, 0x4e20}, 0x7c)
sendto$inet(r0, 0x0, 0x0, 0x0, &(0x7f0000000280)={0x2, 0x8004e20}, 0x10)
recvmmsg(r0, &(0x7f00000004c0)=[{{0x0, 0x0, 0x0}}], 0x1, 0x0, 0x0)
ioctl$SIOCGSTAMP(r0, 0x8906, &(0x7f0000000040))
clock_getres(0xfffffefffffffff2, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='gid_map\x00')
pwrite64(r0, 0x0, 0x0, 0x100000001)
readv(r0, &(0x7f0000002940)=[{&(0x7f0000002300)=""/161, 0xa1}, {&(0x7f00000023c0)=""/215, 0xd7}, {&(0x7f00000024c0)=""/110, 0x7a}, {&(0x7f0000002540)=""/195, 0xc3}, {&(0x7f0000002640)=""/74, 0x4a}, {&(0x7f0000002ac0)=""/113, 0x71}, {&(0x7f0000002b40)=""/247, 0xfffffffffffffe0c}, {&(0x7f0000002840)=""/24, 0x18}, {&(0x7f0000002880)=""/190, 0xbe}], 0x9)
exit(0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800013, 0x11, r0, 0x0)
rt_sigreturn()
clone(0x6900, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffffff, &(0x7f0000000080)='/', 0x0, 0x0)
fcntl$addseals(r0, 0x409, 0x0)
rt_sigreturn()
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
readlinkat(r1, &(0x7f0000000080)='\xe9\x1fq\x89Y\x1e\x923aK\x00', &(0x7f00000002c0)=""/4096, 0x1000)
syz_emit_ethernet(0x46, &(0x7f0000000080)={@broadcast, @local, @val, {@ipv4}}, 0x0)
r0 = semget$private(0x0, 0x2, 0x614)
semop(r0, &(0x7f0000000180)=[{0x1, 0xb07d, 0x1000}, {0x0, 0x2, 0x7259d1fe4de7b5d7}, {0x3, 0x6, 0x1000}, {0x4, 0x7, 0x800}, {0x4, 0x1}, {0x0, 0x8000, 0x800}, {0x4, 0x1f, 0x1000}, {0x4, 0x8, 0x1000}, {0x4, 0xffff, 0x1800}], 0x9)
semop(r0, &(0x7f0000000380)=[{0x0, 0xffffffffffffffff, 0x1800}, {0x2, 0x20}, {0x3, 0x0, 0x800}, {0x4, 0x5}, {0x0, 0x7, 0x1800}], 0x5)
semctl$SETALL(r0, 0x0, 0x9, &(0x7f00000001c0)=[0x7, 0x1ff, 0x1])
semctl$GETZCNT(r0, 0x5, 0x7, &(0x7f0000000480)=""/246)
semctl$SETVAL(r0, 0x0, 0x8, &(0x7f0000000040)=0x800)
semctl$GETNCNT(r0, 0x3, 0x3, &(0x7f0000000200)=""/21)
semctl$SETVAL(r0, 0x4, 0x8, &(0x7f00000000c0)=0x1000)
r1 = socket(0x2, 0x2, 0x0)
connect$unix(r1, &(0x7f0000000000)=@file={0xbd5699bc1ec0282, './file0\x00'}, 0x10)
r2 = socket(0x10000000002, 0x2, 0x0)
r3 = shmget$private(0x0, 0x600000, 0x0, &(0x7f0000a00000/0x600000)=nil)
shmat(r3, &(0x7f0000e80000/0x2000)=nil, 0x0)
shmctl$SHM_UNLOCK(r3, 0x4)
setsockopt$sock_int(r2, 0xffff, 0x1023, &(0x7f00000000c0)=0x3, 0x4)
connect$unix(r1, &(0x7f0000000240)=ANY=[@ANYBLOB="0000660d0000000000af1a93c764d45b3025029eb44018634fc88feac43539fb127a6565c891ff67b16a3985fdadfb67327c97fa2628a033ec9bf06b5deb90e64550522057059d24f4a483341b547a2347b4b8d221a6daaab9a5e58944795ea1b4ad969e32d0321db4cc43602e53e505d4e94c0ab911ac89c449fbb370cde92d8f40e8acf462357bf804ca2b18a006cb4a430aee5bd12fb7ebab42b01e589d02d1f83069d4012bef250f0e316dd9cb61a1c016cfd82090c9e8817c0ab9bb28661a5cfb7611aac229da28f5f879e780ccd51e9a6b249d47698bf28828deb6a7"], 0xa)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
capset(&(0x7f0000000040)={0x20080522}, &(0x7f0000000080))
capset(&(0x7f0000000000)={0x19980330}, &(0x7f00000000c0)={0x0, 0x0, 0xc61})
exit_group(0x0)
pipe(&(0x7f00000000c0)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ioctl$sock_inet6_udp_SIOCINQ(r0, 0x541b, &(0x7f0000000200))
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f00000007c0)='./file0\x00', 0x0)
mount(&(0x7f0000000800)=ANY=[], &(0x7f0000000040)='./file0\x00', &(0x7f0000000240)='tmpfs\x00', 0x0, 0x0)
open(&(0x7f0000000000)='./file0\x00', 0x141241, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x800006, 0x12, r0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000000)={0x1, &(0x7f00000000c0)=[{0x6, 0x0, 0x0, 0x7fffffdf}]})
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
flock(r2, 0x6)
r0 = socket(0x1, 0x1, 0x0)
getsockopt$inet6_tcp_buf(r0, 0x6, 0x12, &(0x7f0000000300)=""/207, &(0x7f0000000040)=0xcf)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
r1 = eventfd2(0x0, 0x0)
read$eventfd(r1, &(0x7f0000000000), 0x8)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
write$eventfd(r1, &(0x7f0000000080), 0x8)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_TCP_MD5SIG(r0, 0x6, 0xe, &(0x7f0000000240)={@in={{0x2, 0x0, @empty}}, 0x0, 0x0, 0x46, 0x0, "9861ecce9996c73a405abc66a60a7daa1273df60b84ce14cf290cba85046546022501c18525a742b5667a5021aac9a312771700126c8cd899ea5f21ff3a3ead2c59866394a8c76bd03b0140c460f187b"}, 0xd8)
r1 = openat(0xffffffffffffff9c, &(0x7f00000001c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x210000000013, &(0x7f00000000c0)=0x100000001, 0x4)
connect$inet(r0, &(0x7f0000000180)={0x2, 0x0, @dev={0xac, 0x14, 0x14, 0xb}}, 0x10)
setsockopt$inet_tcp_TCP_REPAIR(r0, 0x6, 0x13, &(0x7f0000000200), 0x88)
madvise(&(0x7f0000000000/0x4000)=nil, 0xfffffffffffffff4, 0x14)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
setsockopt$inet_mreqn(r0, 0x0, 0x23, &(0x7f0000000000)={@multicast1, @loopback}, 0xc)
setsockopt$inet_mreqn(r0, 0x0, 0x24, &(0x7f0000000440)={@multicast1, @local}, 0xc)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x5, &(0x7f0000000000)=0x9, 0x4)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000480), 0x1, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ioctl$TIOCSETD(r0, 0x5423, &(0x7f0000000080)=0xd)
write(r0, &(0x7f0000000040)='8', 0xfdef)
r0 = memfd_create(&(0x7f00000002c0)='#\'%nod%v\x00\x7f\xe5\xd0ql\x86\xc9\xe6\x14\x93\xb0\x7f_,y<~\xab\x84\x00\x00\x00\x00\x00\x00\x14}\n\x81\xc7\x85|oC\xca\v\xe3\xba]fn\r\xdf!\x94\x0f\xaf\xb7\x93\xe8\xb6\xc3N\x16&\xab\xf9{\xaf;\xcf\x8c\xa8\xb9\x06\xaf\xd0\xfb:\x90LNF\x13\x9f\xc2\xb7/1\xb9V\xf0*\xcb\xdc\x05n<\xcfi\x02=1\xda\"\xb3\xfe\xf3\x97\xd9\xa5b\xd4\x00Q$\xb2v\\\xa9\xcf*tw\x8a\n_)\x89A\x8f`R\x12zM\a\xc43\xd0d\xee\x13Q', 0x0)
write(r0, &(0x7f0000002000)='/', 0x1)
sendfile(r0, r0, &(0x7f0000000200), 0x87)
sendfile(r0, r0, &(0x7f00000001c0), 0xfec)
perf_event_open(&(0x7f000001d000)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x400, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000240)=[{&(0x7f0000000580)=""/212, 0xd4}, {&(0x7f0000000680)=""/249, 0xf9}, {&(0x7f0000000780)=""/4096, 0x1000}, {&(0x7f0000000180)}], 0x4, 0x8, 0xffff)
ioctl$BTRFS_IOC_QGROUP_ASSIGN(r1, 0x40189429, &(0x7f0000000180)={0x1, 0x0, 0x5})
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ioctl$F2FS_IOC_GET_PIN_FILE(r1, 0x8004f50e, &(0x7f0000000140))
mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x5, 0x11, r0, 0x0)
syz_open_procfs(0x0, 0x0)
mount(&(0x7f0000000480)=ANY=[@ANYBLOB="377a4f760700f5fe000000"], &(0x7f0000000080)='.', 0x0, 0x5010, 0x0)
r2 = syz_open_dev$tty20(0xc, 0x4, 0x0)
pwrite64(r2, &(0x7f0000000480), 0x0, 0xdcad8c4)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000440)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000e8c000/0x14000)=nil, 0x14000, 0x0, 0x12, r3, 0x0)
getsockopt$inet_IP_IPSEC_POLICY(r3, 0x0, 0x10, &(0x7f0000000380)={{{@in=@broadcast, @in6=@loopback, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, <r4=>0x0}}, {{@in6=@private2}, 0x0, @in=@initdev}}, &(0x7f0000000100)=0xe8)
mount$9p_tcp(&(0x7f0000000000), &(0x7f0000000040)='./file0\x00', &(0x7f00000000c0), 0x0, &(0x7f00000018c0)=ANY=[@ANYBLOB="7472616e733d7463702c706f72743d6d9883227a531b9a303030303030346532332c756e616d653d2327256e6f642576007fe5d0716c86c9e61493b07f5f2c793c7eab84000000000000147d0a81c7857c6f43ca0be3ba5d666e0ddf21940fafb793e8b6c34e1626ab21514e3935deb93b03fcf97baf3bcf8ca8b906afd0fb3a904c4e46139fc2b72f31b956f02acbdc056e3ccf69029f11110300000000000000e2b2ce07a972c0677d3d31da22b3fef3d549f035e0d7b24a97d9a562d4005124b2765ca9cf2a74778a0a5f2989418f6052127a4d07c433d064ee13512c6e6f88e7e5486e642c6d73697a653d3078303030303030303030303030303062352c666f776e65723c3af7d4f1", @ANYRESDEC=r4, @ANYBLOB=',\x00'])
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
r1 = fork()
ptrace(0x10, r1)
ptrace$cont(0x18, r1, 0x0, 0x7797000)
tkill(r0, 0x40)
poll(&(0x7f0000000280)=[{}, {}, {}], 0x20000061, 0x0)
clone(0x90126500, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
r1 = memfd_create(&(0x7f0000000080)='y\x105%\xfa,\x1f\x99\xa2\xc9\x8e\xcd\xfc\xfa\xf6\x12\x95^\xdfT\xe2=\x0e~F\xcds', 0x2)
ftruncate(r1, 0xffff)
fcntl$addseals(r1, 0x409, 0xb)
ftruncate(r1, 0x0)
r2 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_tgsigqueueinfo(r0, r0, 0xe, &(0x7f0000000200))
execveat(0xffffffffffffff9c, &(0x7f0000000040)='./file1\x00', &(0x7f0000000080)=[0x0], &(0x7f00000000c0)=[0x0], 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000100)='./file1\x00', 0x42, 0x1ff)
close(r0)
execveat(0xffffffffffffff9c, &(0x7f0000000140)='./file1\x00', &(0x7f0000000180)=[0x0], &(0x7f00000001c0)=[0x0], 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000200)='./file1\x00', 0x2, 0x0)
write(r1, &(0x7f0000000240)="01010101", 0x4)
close(r1)
execveat(0xffffffffffffff9c, &(0x7f0000000280)='./file1\x00', &(0x7f00000002c0)=[0x0], &(0x7f0000000300)=[0x0], 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000340)='./file0\x00', 0x42, 0x0)
close(r2)
execveat(0xffffffffffffff9c, &(0x7f0000000380)='./file1\x00', &(0x7f00000003c0)=[0x0], &(0x7f0000000400)=[0x0], 0x0)
fchmodat(0xffffffffffffff9c, &(0x7f0000000440)='./file0\x00', 0x1ff)
execveat(0xffffffffffffff9c, &(0x7f0000000480)='./file1\x00', &(0x7f00000004c0)=[0x0], &(0x7f0000000500)=[0x0], 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000080)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
fcntl$lock(r0, 0x7, &(0x7f0000000400)={0x0, 0x0, 0x40})
fcntl$lock(r0, 0x7, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x1})
fcntl$lock(r0, 0x7, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff})
r0 = syz_open_procfs(0x0, &(0x7f0000000040)='net/fib_triestat\x00')
perf_event_open(&(0x7f0000000100)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3c43}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
recvmmsg(r0, &(0x7f0000003640)=[{{0x0, 0x0, &(0x7f0000001940)=[{&(0x7f0000001cc0)=""/231, 0x6}], 0x1}}], 0x1, 0x12161, 0x0)
exit_group(0x0)
preadv(r0, &(0x7f00000017c0), 0x47, 0x0, 0x0)
r0 = socket$unix(0x1, 0x2, 0x0)
bind$unix(r0, &(0x7f0000000000)=@file={0x1, './file0\x00'}, 0x6e)
connect$unix(r0, &(0x7f0000000080)=@file={0x1, './file0\x00'}, 0x6e)
setsockopt(r0, 0x1, 0x7, &(0x7f0000000280)="1fd7cc29", 0x4)
clone(0x54041bc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
tkill(r0, 0x14)
ptrace(0x4206, r0)
ptrace$cont(0x1f, r0, 0x0, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f00000001c0)='cgroup.controllers\x00', 0x275a, 0x0)
preadv2(r0, 0x0, 0x0, 0x0, 0x0, 0x10)
exit(0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f00000002c0)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x0, 0x0)
chdir(&(0x7f0000000200)='./file0\x00')
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = openat$null(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
fstat(r0, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, <r1=>0x0})
setreuid(0xee01, r1)
rename(&(0x7f0000000000)='./bus\x00', &(0x7f0000000040)='./file0\x00')
exit(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r2 = getpid()
r3 = getpid()
rt_tgsigqueueinfo(r3, r2, 0x15, &(0x7f00000000c0))
ptrace(0x10, r3)
openat$cgroup_ro(0xffffffffffffffff, &(0x7f0000000080)='cgroup.controllers\x00', 0x275a, 0x0)
ptrace$setregs(0xd, r2, 0x0, &(0x7f0000000000))
ptrace$getregset(0x4204, r3, 0x200, &(0x7f0000000400)={&(0x7f0000001800)=""/4096, 0x1000})
epoll_create(0x0)
getrusage(0x1, &(0x7f0000000000))
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0, 0xfffffffffffffd2f}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r2 = socket(0x10, 0x803, 0x0)
sendto(r2, &(0x7f0000cfefee)="120000001200e7ef007b00000000000000a1", 0x12, 0x0, 0x0, 0x0)
recvmsg(0xffffffffffffffff, &(0x7f0000000b40)={&(0x7f00000000c0)=@pppol2tpv3={0x18, 0x1, {0x0, 0xffffffffffffffff, {0x2, 0x0, @initdev}}}, 0x80, &(0x7f00000006c0)=[{&(0x7f00000002c0)=""/144, 0x90}, {&(0x7f0000000540)=""/163, 0xa3}, {&(0x7f0000000880)=""/204, 0xcc}, {&(0x7f0000000980)=""/145, 0x91}], 0x4, &(0x7f0000000a40)=""/248, 0xf8}, 0x0)
recvmmsg(r2, &(0x7f00000037c0)=[{{&(0x7f00000004c0)=@ethernet={0x0, @random}, 0x374, &(0x7f0000000380)=[{&(0x7f0000000040)=""/95, 0x1c5}, {&(0x7f0000000140)=""/85, 0xe75}, {&(0x7f0000000fc0)=""/4096, 0xf2}, {&(0x7f0000000400)=""/106, 0x65e}, {&(0x7f0000000740)=""/73, 0x3b3}, {&(0x7f0000000200)=""/77, 0x4d}, {&(0x7f00000007c0)=""/154, 0x40d}, {&(0x7f0000000000)=""/22, 0xa}], 0x81, &(0x7f0000000600)=""/191, 0x41}}], 0x4000000000003b4, 0x0, &(0x7f0000003700)={0x77359400})
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
prlimit64(0x0, 0x4, &(0x7f0000000000)={0x0, 0xa46}, 0x0)
r0 = gettid()
rt_sigqueueinfo(r0, 0x8, &(0x7f0000000040))
r0 = socket$netlink(0x10, 0x3, 0x0)
sendmsg$netlink(r0, &(0x7f0000000c40)={0x0, 0x0, &(0x7f0000000c00)=[{&(0x7f0000000100)={0x14, 0x12, 0x5, 0x0, 0x0, "", [@generic="c4"]}, 0x14}], 0x1}, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='gid_map\x00')
pwrite64(r0, 0x0, 0x0, 0x100000001)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
write$tcp_mem(r0, &(0x7f0000000000)={0x0, 0x20, 0xe26, 0x20, 0xffffffff}, 0x48)
exit(0x0)
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
waitid(0x0, 0x0, 0x0, 0x0, 0x0)
exit_group(0x0)
perf_event_open(&(0x7f000001d000)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x41c1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={0x0}}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
mmap(&(0x7f000000a000/0x1000)=nil, 0x1000, 0x2000009, 0x8004400b871, 0xffffffffffffffff, 0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000080)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
shutdown(r1, 0x20000000000001)
recvmmsg(r0, &(0x7f0000001f4c), 0x209a6b90bb7b17, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = getpid()
openat$null(0xffffffffffffff9c, &(0x7f00000001c0), 0x1c8c0, 0x0)
rt_sigqueueinfo(r1, 0x39, &(0x7f0000000000))
r2 = gettid()
rt_sigqueueinfo(r2, 0x2b, &(0x7f0000000100))
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
get_mempolicy(0x0, &(0x7f0000000040), 0xffffffffffff64b3, &(0x7f0000ffa000/0x4000)=nil, 0x3)
r0 = getpid()
tkill(r0, 0x39)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000001580)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
ioctl$BTRFS_IOC_GET_SUBVOL_INFO(r0, 0x541b, &(0x7f00000015c0))
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
sendto$packet(0xffffffffffffffff, 0x0, 0xffffffffffffff02, 0x0, 0x0, 0x0)
tgkill(r0, r0, 0x2d)
splice(0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0, 0x0, 0x0)
r0 = signalfd4(0xffffffffffffffff, &(0x7f0000004340), 0x8, 0x80800)
fcntl$getflags(r0, 0x1)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f00000007c0), 0x0, 0x0)
r1 = syz_open_pts(r0, 0x0)
dup2(r1, r0)
syz_open_pts(r0, 0x0)
clone(0x4380, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
rt_tgsigqueueinfo(r0, r0, 0x4000000000000016, &(0x7f0000000640))
ptrace(0x10, r0)
ptrace$poke(0x10, r0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r1 = socket(0x10, 0x803, 0x0)
getsockname$packet(r1, 0x0, &(0x7f0000000200))
mmap(&(0x7f00009fd000/0x600000)=nil, 0x600000, 0x0, 0x6031, 0xffffffffffffffff, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mremap(&(0x7f0000a01000/0x4000)=nil, 0x4000, 0x800000, 0x3, &(0x7f0000130000/0x800000)=nil)
munlock(&(0x7f0000e8d000/0x2000)=nil, 0x2000)
fork()
r0 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
getsockopt$IP6T_SO_GET_INFO(r0, 0x29, 0x40, &(0x7f0000000000)={'security\x00'}, &(0x7f0000000080)=0x54)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
fstatfs(r0, &(0x7f0000000140)=""/75)
r0 = openat$fuse(0xffffff9c, &(0x7f0000000040), 0x2, 0x0)
io_setup(0x1, &(0x7f0000000100)=<r1=>0x0)
pipe2(&(0x7f0000002640)={<r2=>0xffffffffffffffff}, 0x0)
io_submit(r1, 0x2, &(0x7f0000000240)=[&(0x7f0000000140)={0x0, 0x0, 0x0, 0x0, 0x0, r0, 0x0}, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x6, 0x0, r2, 0x0}])
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
fcntl$setownex(r0, 0xf, &(0x7f0000000000)={0x0, 0xffffffffffffffff})
clone(0x2006d380, 0x0, 0x0, 0x0, 0x0)
r0 = socket$packet(0x11, 0x2, 0x300)
connect(r0, &(0x7f0000000300)=@nl=@unspec, 0x80)
rt_sigreturn()
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f00000001c0), 0x0, 0x0)
ioctl$TIOCSCTTY(r0, 0x540e, 0x0)
exit_group(0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = shmget$private(0x0, 0x2000, 0x0, &(0x7f0000ffc000/0x2000)=nil)
shmctl$SHM_STAT_ANY(r1, 0xf, 0x0)
r2 = getpid()
tgkill(r2, r0, 0x38)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = gettid()
r2 = openat$fuse(0xffffffffffffff9c, &(0x7f0000000080), 0x2, 0x0)
read$FUSE(r2, 0xffffffffffffffff, 0x0)
tgkill(r0, r1, 0x8)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800013, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
execveat(0xffffffffffffffff, &(0x7f00000000c0)='./file0\x00', 0x0, &(0x7f0000000300)=[&(0x7f0000000240)='\x00'], 0x0)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
semctl$SETVAL(0x0, 0x0, 0x10, 0xfffffffffffffffe)
r0 = gettid()
r1 = getpid()
tgkill(r1, r0, 0x2b)
syz_emit_ethernet(0x36, &(0x7f0000000100)={@broadcast, @random="00801000", @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x28, 0x0, 0x0, 0x0, 0x1, 0x0, @initdev={0xac, 0x1e, 0x0, 0x0}, @local}, @timestamp}}}}, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
pipe(&(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
splice(r0, 0x0, r1, 0x0, 0x737, 0x0)
rt_sigreturn()
clone(0x100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
wait4(0x0, 0x0, 0x80000002, 0x0)
getpid()
r0 = getpid()
r1 = getpid()
rt_tgsigqueueinfo(r1, r0, 0x16, &(0x7f0000000100))
ptrace(0x10, r0)
read$FUSE(0xffffffffffffffff, 0x0, 0x0)
ptrace$pokeuser(0x6, r0, 0x388, 0xb8)
ioctl$FS_IOC_GET_ENCRYPTION_POLICY(0xffffffffffffffff, 0x400c6615, &(0x7f00000000c0)={0x0, @adiantum})
perf_event_open(0x0, 0x0, 0x0, 0xffffffffffffffff, 0x8)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, 0x0)
perf_event_open(&(0x7f0000000040)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000180)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = dup3(r1, r0, 0x0)
ioctl$PERF_EVENT_IOC_ENABLE(r2, 0x8912, 0x400200)
exit_group(0x0)
r3 = socket$inet(0x2, 0xa, 0x0)
bind$inet(r3, &(0x7f0000deb000)={0x2, 0x0, @multicast1}, 0x10)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt$inet_tcp_int(r0, 0x6, 0x1, 0x0, &(0x7f00000000c0))
rt_sigreturn()
r0 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000ffc000/0x1000)=nil)
shmat(r0, &(0x7f0000ffb000/0x2000)=nil, 0x0)
shmctl$IPC_RMID(r0, 0x0)
shmctl$IPC_STAT(r0, 0x2, &(0x7f0000000000)=""/136)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = memfd_create(&(0x7f0000000380)='s\xf5\x89\x1c\x9e\xddF\x9c\x1drity\a\x00\b\x00\x00\x00\x00\x00\x00\xe02\xf9\xde\x7fD6\a\x93\xbf}\xdc-\x05\x8c\xa5\x80\xfcH\x0fi\x000\xf6\x1f~\x9e\xb4\xa8\x14\x93\xa3\xf3^\xfd.\xd1\xe8\xf0\xf8\x83I\x8b\xc7\x10\xd1g\x9fd$\x839\x1e\x88\xe3\x86\x19\x11\xabXK\xc4D\x8fZx\xe7\xe4\x98\x9bx\xfa\'0\xc9[\x9b=2\xfa\xe1\x8at\xd1I2\x14B\xb2\xe7;\xcau\xa7<E\x01U@\xb1n\x00\x00\x00\x00\x00\xd56\xa7\\\x91\x03\xcd;\xb3\x1aiO6\xe9\f\xfcH\xfd\x94\xe8\x1e2\x86W\xd1\x02\n\x10\xa4BE\xfe\x15]\xeb y\x99\xd6\xf7\xa0\xf5\x9b\x01\x00\x00\x00\x00\x00\x00\x00\xfd4\\\f\xb6MC\x80f+\xc4\xf6\x93\x87P@`{\xf9\xff;`\x89:w\xbe\xf3*\xbb/:\x9e?\x06\xdaF\x93@1riK\xc7/\xb3\xd9wT\xf0\xc5\xff\xcdQ\x12\xc9\x95\x95\xd8\x1e@g\x0fa\xd5\xd2\xa3<\xd0\x84,', 0x0)
pwrite64(r0, &(0x7f000003bfff)='/', 0x1, 0x0)
mmap(&(0x7f0000001000/0x1000)=nil, 0x1000, 0x4, 0x11, r0, 0x0)
symlink(&(0x7f0000001000)='./file0\x00', &(0x7f00000000c0)='./file0\x00')
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
creat(&(0x7f0000000000)='./file0/../file0\x00', 0x0)
r2 = socket$inet6_tcp(0xa, 0x1, 0x0)
fallocate(r2, 0x0, 0x102000006, 0x6)
get_mempolicy(&(0x7f0000000140), &(0x7f0000000180), 0x1, &(0x7f0000ffc000/0x4000)=nil, 0x3)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
r1 = fcntl$dupfd(r0, 0x0, r0)
sync_file_range(r1, 0x0, 0x0, 0x4)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
pipe(&(0x7f0000000100)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
close(r1)
r2 = socket$inet6_tcp(0xa, 0x1, 0x0)
setsockopt$inet6_tcp_int(r2, 0x6, 0x1e, &(0x7f00000000c0)=0x800000100000001, 0x4)
ioctl$int_in(r1, 0x5421, &(0x7f00000001c0)=0x1000)
connect$inet6(r2, &(0x7f0000000000)={0xa, 0x0, 0x0, @loopback}, 0x1c)
close(r2)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
clock_settime(0x0, 0x0)
rt_sigreturn()
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
chdir(&(0x7f00000001c0)='./file0\x00')
rmdir(&(0x7f0000000100)='./file0\x00')
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x800006, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000080)={0x1, &(0x7f0000000040)=[{0x6, 0x0, 0x0, 0x50000}]})
syz_mount_image$tmpfs(&(0x7f0000000000), 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f0000002440))
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
getgroups(0x13fbb995ebf5c232, 0x0)
rt_sigreturn()
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000100)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r1, &(0x7f0000d06ff8)='./file0\x00')
r2 = gettid()
statfs(&(0x7f00000003c0)='./file0\x00', 0x0)
rt_sigqueueinfo(r2, 0x10, &(0x7f0000000040))
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = creat(&(0x7f0000000100)='./file0\x00', 0x0)
sched_setaffinity(0x0, 0x0, 0x0)
fallocate(r0, 0x0, 0x100000006, 0x6)
clone(0x200800059fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
readv(0xffffffffffffffff, 0x0, 0x0)
exit_group(0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt(r0, 0x1, 0x4, 0x0, &(0x7f00000001c0))
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet6(0xa, 0x1, 0x0)
bind$inet6(r0, &(0x7f0000000000), 0x1c)
setsockopt$inet6_int(r0, 0x29, 0x1a, &(0x7f0000000080), 0x4)
rt_sigreturn()
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = signalfd4(0xffffffffffffffff, &(0x7f0000004340), 0x8, 0x0)
r1 = signalfd4(0xffffffffffffffff, &(0x7f0000004340), 0x8, 0x0)
r2 = signalfd4(r0, &(0x7f0000000400), 0x8, 0x0)
sendfile(r1, r2, &(0x7f0000000440), 0x0)
rt_sigreturn()
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
mount(&(0x7f0000000300)=ANY=[], &(0x7f0000000140)='./file0\x00', &(0x7f0000000180)='proc\x00', 0x0, 0x0)
truncate(&(0x7f0000000200)='./file0\x00', 0x0)
rt_sigqueueinfo(r0, 0x39, &(0x7f0000000000))
mkdir(&(0x7f0000fd5ff8)='./file0\x00', 0x0)
r0 = creat(&(0x7f0000df1000)='./file0/bus\x00', 0x0)
fcntl$lock(r0, 0x7, &(0x7f0000027000)={0x1})
unshare(0x40600)
fcntl$lock(r0, 0x6, &(0x7f0000000100)={0x1})
setsockopt$inet_tcp_TCP_REPAIR_OPTIONS(0xffffffffffffffff, 0x6, 0x16, 0x0, 0x0)
set_mempolicy(0x3, &(0x7f0000000000)=0x5, 0x9)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
r1 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r1, 0x6, 0x21000000000b, &(0x7f0000000280)=0x3, 0x4)
bind$inet(r0, &(0x7f0000000080)={0x2, 0x4e21, @local}, 0x10)
setsockopt$inet_tcp_TCP_REPAIR_QUEUE(r0, 0x6, 0x14, &(0x7f0000000140)=0x2, 0x4)
connect$inet(r0, &(0x7f0000000180)={0x2, 0x4e21, @local}, 0x10)
setsockopt$inet_tcp_TCP_CONGESTION(r0, 0x6, 0xd, &(0x7f0000000100)='reno\x00', 0x5)
sendto$inet(r0, &(0x7f0000000000)="92", 0x1, 0x90, 0x0, 0x0)
sendto$inet(r0, &(0x7f0000000380)='d', 0x1, 0x0, 0x0, 0x0)
setsockopt$inet_tcp_TCP_REPAIR_OPTIONS(r0, 0x6, 0x16, &(0x7f0000000240)=[@window, @window, @window, @window, @window, @window, @sack_perm, @timestamp], 0x8)
setsockopt$inet_tcp_TCP_REPAIR(r0, 0x6, 0x13, &(0x7f0000000200), 0x88)
sendto$inet(r0, &(0x7f00000004c0)="34e2de4d8d957a8de4e490b6cd20b988d4edef164bd3377aa381b5f50b7ca40a516489f78cd7208982e9bde22b2b7c1c7606d565477f3db9d2b077283644c0f27ab52a863a42863e06944e40a0b3c5d21c8cbe052e7f726263f28aef1bc12a069063d4c30e8f329fdb36859be727fbef4314161e5fb5f01ae00a2634d5cdecca2089c62e32f4c919886b2b88d237e287318739bec0364caf15889f38a312ef6621c0f21709a4bf2b16274cf933f6ad8fcc9c2024bc1b4713f650e860f93ae93b2361956b3e80c38c5fd29b5c1b5d7ce67edc856a8dc0ba54cee53de9a48c131389426bd06ec7c695add357934fc0321f0d3d7982e4fe5a0039decc491a663afd02facb08dd9695f854c7b031d9af8bd7350897996b5208b23030cc0feb84570730eaf24b9f2ac05d0feb3be07a29f887095f36f3c8f0e77e45509acd14a5be4a1572dd4cd1231087b830fa03e071571d4abd694710ef140469cf6df8a59839aafe046a5bffb97e5247be901789eafd726ba090337a2c49207e6b900c7e982472e6aac70e5d52ca2c1bab47b1f6d00f9601e2281686c21f770ae96e0ffec4b30496d012fa00958f794cdbd721bd155cae87", 0x20000671, 0x805, 0x0, 0x0)
ioctl$sock_ipv4_tunnel_SIOCCHGTUNNEL(0xffffffffffffffff, 0x89f3, &(0x7f0000000140)={'syztnl1\x00', &(0x7f00000000c0)={'gre0\x00', <r2=>0x0, 0x20, 0x7800, 0x0, 0x0, {{0xe, 0x4, 0x0, 0x9, 0x38, 0x66, 0x0, 0x80, 0x29, 0x0, @broadcast, @empty, {[@ssrr={0x89, 0x23, 0x3b, [@empty, @local, @multicast2, @loopback, @broadcast, @initdev={0xac, 0x1e, 0x1, 0x0}, @loopback, @local]}]}}}}})
setsockopt$inet_mreqn(r0, 0x0, 0x24, &(0x7f0000000180)={@initdev={0xac, 0x1e, 0x0, 0x0}, @remote, r2}, 0xc)
sendto$inet(r0, &(0x7f0000000680)="e65d8df65c4563b580438fb8e8fb59888aa3635b16f29aebed3b86f7b6edaf9a41ef3b0ae247cf31c220fc9681f490ae751010858abf67580db794f9ac9738507dcf68c3de4e0d31e5b80a36b4dbbc8b50c6abce4468cc68369fe3ecab845ed97c9703ae4a1aa69ca3371d632a1fba8533f8df6bfa8265f2bd05c1522bf6e72e51de42a435ab30a83fc56144c2d0543c8b1200b97bda47f631749d1a4bebc43dc40712739ec437e126f502357d09a6dc27c0ce5c86ffcca3cf7f599c396a51501bab6454f24d431d10e018dce5c5d0ef95dd8e88d5328d7d348cb512efe79ff186cf23a7dcf2cb93d7e012e4728b66b89058ccc1e96829829f021f6b8de1a1e2cb556eb94c9ec4e06d07acc41ee55ad776e324ced33c0c513e87ea24c827a3af6d0b39dd8f442a7a95fce45032a09bcb4c5b8c12feec6a19eacc7329cdce75c3a97beb50b5da5fcd4d350724231e987ac6ef129528845298da78e61b08811fd0b9c351bdf0012097636ffc769d3408cb1732ade78fa84bd5a9e7efa028ddd8fb0cf54b8642a6a64510618a2b746f02c0c364bf202744050947aa27d74764290ce4cea92b8d8b55a40b4c3d1b2c78edf3069a55f92fff8fb67d9083c30bef2ed403980c9e550aee9f0ee2d074e8e0b96d3923d3cc72a16e3350f617bb14fd9d249c86b9fe32865a7f5fabc19287e2c6566288e569b1c3b1f64283803ef634e40e3348ef2c74d3cebe08a895abd1594cdf40c05751fd252ee9ffc875c1293b79b8e9f559f79a5a596e73d593dd5e5235ac9ce189e4b2fa4dc6221ae83d141ced2e16853ed22df48fff78525ff63645becb97c2bfc1fa27060629f74c378218857ed753e56fdb8b3ad2dc6a31ce166209a9741d1f68b08aed0ed3e967b3644dce4aba533c902f5fda0df6a57e7194b98c4e0b1c7bd645aafeb4fda55b36c2acc080902c279e6e3475ebc151e1dbb203ef79d2909ded9359f38c43ef39e8a08a52698a0a49a013d59b32ee9bf4ade3c57562cde2040720bafcaca5cf35fc6bcf0f0a4b7cc80977a249adf8b13bd98b683a29379d4035d9797d8bf3fcc60c7d4b1f8b5e75fb842de9a5d7ec45b9fe479d110bce01a4752b41769d3d6244ce1522aa688aa4144a9f40423fc917a55b86fa2b0f95777e2f2a27d88efb28e882c4fd44bb7bc08965d77d8d24a635fe5ac97bd57d86e246283a57772e0198bdedd8d8288a20067b0b60be08740dd6bb6b1cdcf0fb62fc22b9abe8dd785ac68ffaf6ba1276b5adfaaeacdb05830bb8399954ff191aa547653b7c9979c4da69ce4b3f0a7c35c9f76c30b1a5774ce904c97f555b27866fa2c15022c0cc4f64a76dea5cd1d4a93c244e8defb4f156fc4065d71ca956699c37f2d8485faa441add3c1d8bbbc18158196846adc7ff0d0e2ffba96c6e8fd26f0b9cef4d7cab91582089823db5751ef7e1c8cdc02f1d89d83118f2846b3e2625b46e2b83bda023686f08c5ee767214a4dba0f9801ae185800ed72cafce8fe48c5d7086eb0d749235e0d3b958241fc7954e76b0b584a73739bcbb6f658b54a74361293b298874d0a72773f58fad8908c6760fbbfc091580d796438e4725b7684a785779308fa7073d8297e5a79694d9f1bbf3bc140557504eeda4ed4d2b66ae583e92aca8faa1289df4f54eb6412e2fa0e41a92300c98f1173a5f0591fed514f788b96758b7aba143363c4b97dad9feda55129ddb5c93a9ba9576834c667b48c65b8cfe18dcc3c53f63f3c924ebf310bcd9c72138226da1ea984fd1d61709cc9dd3efaef0ac7cfcb26966be562cee1ffe96a90b23c2686de239fb3c862d2f48af94ce5b27b3bb61781219b87dd6d5c9306e5482001fef8ebe5c1deb8a63d840f7a8e2c1b37dc78654b50b43fbbed9db605466e82935bb6c4471ba5e22934d5664d5834f2933109a34ffc939ba8416b31c3fe55725ce5f60156125bb92a0b8580a6bec3bfd72a653e31b6558b1f18d410c5fc2f7ba73ce711fe2de1edfd2a629506feb1ee4ed935279367069eb91e7e26fe2a46929361e485d9b4ebec98bbfb412e70c54e9a83b9a89d52837deb989f61a73941925602bb877ffa1582888d63cfafc0ebd85b9d5ed6f49906c3d5edd30f6c10a39d91e60ce7d79fd47c93d069958221444bab2e7470ab9e38fdb524cd3c30a03a05368f865017ce8bc197db25ec2c0e7ccccf012cc154f23580e5bd66b6dfd22be101909b7a1dde93562d340710122479ce2ab640a6cfb79a8a74ce4305960ef8f89b38bd5b6505f77b28b9048e6fd43b465d5c6138dcef3f81497b244372da3447f47bce9cc4cf24da4f9ab3c678b6a6914c5f1c7d01e26143104b55c33ab656d56def72762ba48142741d6199f7de6ae042e060452f29b3642fe9a3b78c720a1a38b24cc0f34a6d5f43f392b44981dcdf19e06c66fd091de6c5394852b21a2095eff281230cd636f5c9521d46f72ecc5a9cd68683b0ee449efbc61c7
f7eeb922c3dd0923db1436dd98a40d366f886eb2337a46fdc273c8b389f76da67e72b5445053a3fe816abf748db167445c7d715c58ff093b9b3dc88096f37c8db3d45ec2296099cc618bd370be89a49dc7f9fb3b9bd3de491b95a60e2056e4fd4ac94480bf4f4c4fa2bbcc4fbfb1188204480fe03c05b03e3f1c0eb6a4a0ab8fdf8f096677cd42a94d880a7a2543df61d90689caf73aa834fce5056f247ac2dbabbc0e6b890454b024ed09b1770a44ac05166d19eb1d644cdd6efe44d2001c366d066510e4c09b4b760c51bdca8f55ff11fc1649d4b32659ccd86e605a0bf3556a29f0429c51fe1c28958b237e24ab9ff1185da07b04530b2e96261561ead72d333269a639783f874a192cc41b95e6b237e30c9d10562f82594b7d216f80867c1813f197e852c2a4991132923e49ac808d8bc79fec8bed2dd3a9a29d2d16a882b7f19697bd83b8803b3d96f0086d2f778ac292e47bfe4e0916f2f810ce9cb99a5fd133d559fba25cc318dd6f4c26ef3c1f5f0dff32a7965708864aa8a20c85dc8c07e96348346bb2ba27d8273457ff6e50c65bd556899958f871e018d5fcaf4c5ffbc499e301f24912e51b674364ebc4fef259f016ee0ec0ab9d95b1fd5c3a9e0df4a6f7cc76f31a9c3ce6b0cf63490334b1bacae5aad428196a0565557cd38c2cb5409591919df2a5bf26125a05475be1d59304088e19ce3a4ea9bd283481df4a5685b37e732cb6c676ab4d56370476aa1ae06ed43f7d9179e6506ec3b6d15f795784b02834233eb7ff217161a8abca1535e1e335ef1932742aef9ee5d3ced7a23094259e159e1efb3719ff334fea17f4a3b63f6425ecf38f208685ad6935276643e555c18521291d4dcf90b5dbd22922cd846750a0f864cf8ccb6667bacd6eaed5a45e52d6a618c1c8fbd8590a50e159fe644162517007ef83a76f1ab0e4b243ae898535f056fbe3cce19fa2799a76465b375f6cc24b0ddfb8f43cbe2761368d4272183f633db17c77ed556216de3fe61cd4e2fc7db43237aaf38a507bf329923d173197285321b023e42aa834dead92b57f38d039c599d7ab2f4b8d1de0f217ef401d085c3e3dbc43ff20c76de086f5b86f70d1e3c00924c4ba7e4e736e727c102ac42f6f8af39ecba31afd18b5e25b90a159644e8ef01725607b7d55fe15f405787d35340b41c7004d6ec39172bfefffd0102af221ccb4214d7bea0a7c1e495e4aefb7a3ac360dbbe4a21b941c6c0a73e1f5abf1087b749a6f3e113b57eb3c14656dba13449176021e0aaf133a5bf9757491ea40037ac1646529ee028c07d2f09521c51ad1ab448573b0dd49276e75d592058384f96deddcddaf3c5c14f9487a30905a0e1e0f25bc284162ae6381969b41a5c0d6bb4637c61f9061e7f36b47d07f1c16682f6e5a10d227e150e944efee136eba31814c93913cba2758813da8574f633ac283d87ff21372af78e1dfe570e0570b161deadb51f5fc7d020bee7a9baf3ce6c4fe5d460a748ae19876a22336da106f0397a45da331f997d1a5e1989c2632ea014033af3d38bb81c5b0dfea8809800c4e03458f71bd76a24470bd1675b34f8356a8e7435f6fa4b2ee8027382acd1e2f3435261d5e2aa858ecf395c48f23092acf30c07b960bd1e6bc066c211ef89b5cd445b9bc65a003dd7eeadc8c8d9b13a6885effb2db639878d91981ab8930fa47b10e61cbcb0b211a12718eb25a2b548aa5ad16babc6d928b0f8536c6cde281315373ee9bf12390fb301c11563af9947b66ee7d0e329e799ab2c94d69ab11e16e866f1ff10180edd7c5032cf27973495f112c64187431de34c1cec898dfb6bb2585d06291742ed19911af6e2158891fbb48c89962c8a758125b41f20a4e7dacb99bf736083994c506d5c109e03b687239c32548c742ced92e757e389717d4f66cb09b18dfde88c5d861b8c8cf2ba511dd4b8bbf898f9185e383ca657585e68432445fccabfedebe172161fd3863effc8eb928d4f1c17928c10dcf4f2566e88758b8bf47e6dc118826e4d1fb72999b2dcf67150db909d364954f3edadb5d9dc505c7551320ff564da2f4f863e0f578e89ef3e81b1cb06d26b6367f7b01934fd745932022b029fa7ecee3fac14b2b3a9c9475e4e9f0b9fbd3a3358c1587cbcce3d47fe1b0e1735b8d7b26ba9b3dffcdd7fb4032b4a565b25d37ff347802857ef7cd587e1941f5246d3a48ba1742c87010ca4494f51fbab5917ffc93450963fd49f6768eff5f8159083f06c788c2896b9414e90a5146da607615ef19b1f8f3aa23a3ba10b8da04fcf631afb2278fe67b0eb9f0881a0af250383eb9419d38af82b028fa3566117e8b0e87ee71440124c1cd4930a1a5d71f89bf568ec4853413735eff265d857823ba10fa67ef67eb4b99d38898ec6e851f97924ed7c4bc837b1aa87bd2ada57e777f75a8050d54508fac211baeaf98738fa3093c231c308dad14a71cd31a057a4758c613a9cb440a249d79a675539cb8bcfbe13ad4501bdf4
4c69670da2fa04f91bdd9528e1218a322de726b327e938b450cd46795a865246ef914880aa1fd80b087cc70a88de7d22eda099c7a6e39d6ae88b07cbf7e81e4254aa1643e322fe7434ab0c29b0eea8003f778b0e4f39c32fefbf3ba09533134d46bd91b8408fb53c7c7383c181c0c2a6c3c0934d01f292d0be230b700784b0746a2a1948953198e39c3e94aee2862f5c1ce3df9f094972fb8d69e6ac9e07b937fd07b684167a9de6ce48f951897a4c09c85ac22256e25c846fce886799f867fc042058323b0cb4d0ee56ff4069f93e7298df8f30792363410fc63f8726b82da3614df667b7a5a96514ebb2a9d2154947ca9a26145a80cc6f3701ad4e2f8f0aaf3abf53166bc3a78ca4521c60ccfc2b57b7196fe86e8e8aa22f552299169ee7541e8e43ac1459f92e192afc85ff8885b9d5697087026beadedd73d02ef238f7a21051f9cbd9c1f63d12c56937781caac1fc1de17482723e9e216ede64eb196b54ce6cbf79194d5108e8c771a74a5913001ab671a4d38ad610525fe3870901e92a8630962b3ea601f52791c787218bb1478576de6297f72d343ca43e6dd3432c6f30bc5b388f0f41988c016a53251a305ff13e5647292ad843fe2f97eafc98f976bbf296732de4929c43c2e9839a02f6dfacffbf50c3ebbbd9d46232935a3d74269dab6f3286da3b5c01f845169b66ca79b035d10072052791f11a449ef22556707daa8c9bfca202f80046a81c349e43c43c6348e1dd44259105d93f88ac8533ed023ed69ab2b66602a989e5836e5b9", 0x1000, 0x4004000, &(0x7f00000001c0)={0x2, 0x1000, @broadcast}, 0x10)
clone(0x2000100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
set_mempolicy(0x8003, &(0x7f0000000080)=0x6, 0x8f)
request_key(&(0x7f0000000040)='asymmetric\x00', &(0x7f0000001ffb)={'syz', 0x1, 0x14}, &(0x7f0000001fee)='R\trust\xe3c*sgrVid:De', 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f00000000c0)='/proc/self/exe\x00', 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
perf_event_open(&(0x7f000025c000)={0x2, 0x80, 0x15, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = socket$nl_route(0x10, 0x3, 0x0)
fremovexattr(r2, &(0x7f0000000040)=@known='user.incfs.metadata\x00')
socketpair(0xa, 0x1, 0x0, 0x0)
ppoll(0x0, 0x0, &(0x7f0000000080)={0x0, 0xfffffe01}, 0x0, 0x0)
perf_event_open(&(0x7f0000000200)={0x2, 0x70, 0x42, 0x8001}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
getpeername(r0, &(0x7f00000003c0)=@pppol2tpv3in6={0x18, 0x1, {0x0, <r1=>0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, {0xa, 0x0, 0x0, @local}}}, &(0x7f00000000c0)=0x80)
mmap(&(0x7f0000005000/0x3000)=nil, 0x200002, 0x0, 0x12, r1, 0x0)
syz_emit_ethernet(0x66, &(0x7f0000000100)={@link_local, @empty, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "e60013", 0x30, 0x3a, 0xff, @private2, @local, {[], @ndisc_na={0x88, 0x0, 0x0, 0x0, '\x00', @dev, [{0x19, 0x3, "3a172b70e3faa4f4a51e60ece11cdb3a3d65726952a4"}]}}}}}}, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
openat$zero(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800013, 0x12, r0, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000140)='fdinfo/4\x00')
fchmodat(r1, &(0x7f0000000180)='./file0\x00', 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigreturn()
clone(0xcba66300, 0x0, 0x0, 0x0, 0x0)
r0 = getpid()
r1 = openat$tun(0xffffffffffffff9c, &(0x7f0000000080), 0x102e02, 0x0)
ioctl$TUNSETPERSIST(r1, 0x400454ca, 0x400000)
pread64(r1, 0x0, 0x0, 0x0)
tkill(r0, 0x1)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000a, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
sendmmsg$inet(r0, &(0x7f00000014c0)=[{{&(0x7f0000000000)={0x2, 0x4e23}, 0x10, 0x0}}, {{&(0x7f0000000040)={0x2, 0x0, @loopback}, 0x10, 0x0, 0x0, &(0x7f0000000580)=[@ip_pktinfo={{0x18, 0x0, 0x8, {0x0, @private, @private}}}], 0x18}}], 0x2, 0x0)
mkdirat(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x0)
mount(&(0x7f0000000040)=ANY=[], &(0x7f0000002280)='./file0\x00', &(0x7f00000022c0)='devtmpfs\x00', 0x400, 0x0)
r0 = open$dir(&(0x7f0000000080)='./file0\x00', 0x0, 0x0)
getdents64(r0, &(0x7f0000000340)=""/185, 0xfffffffffffffd0d)
clone(0x4000c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffffff, &(0x7f0000000080)='/', 0x0, 0x0)
r1 = gettid()
fchown(r0, 0x0, 0x0)
tgkill(r1, r1, 0x24)
symlinkat(&(0x7f0000000080)='./file0/file0\x00', 0xffffffffffffff9c, &(0x7f0000000100)='./file0\x00')
readlink(&(0x7f00000000c0)='./file0\x00', &(0x7f0000000140)=""/200, 0xc8)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000000)={0x2, &(0x7f00000000c0)=[{0xa4}, {0x6, 0x0, 0x0, 0x7fffffff}]})
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
creat(&(0x7f00000002c0)='./file0\x00', 0x0)
setxattr$incfs_metadata(&(0x7f0000000300)='./file0\x00', &(0x7f0000000340), 0x0, 0x0, 0x0)
setxattr$trusted_overlay_nlink(&(0x7f0000000000)='./file0\x00', &(0x7f0000000040), 0x0, 0x0, 0x3)
r0 = socket$inet6(0x2, 0x3, 0xff)
getsockopt(r0, 0x0, 0x40, 0x0, &(0x7f00000001c0)=0x54)
syz_emit_ethernet(0x0, 0x0, 0x0)
syz_emit_ethernet(0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
mincore(&(0x7f0000f0c000/0x3000)=nil, 0x0, &(0x7f0000afaf0a)=""/246)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
setsockopt$inet_mreqn(r0, 0x0, 0xc, &(0x7f0000000a80)={@multicast2, @loopback}, 0xc)
getsockopt$inet_mreqn(r0, 0x0, 0xb, 0x0, &(0x7f0000000040))
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
connect$inet(r0, &(0x7f0000000040)={0x10, 0x2}, 0x10)
shutdown(r0, 0x0)
recvfrom(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
timer_settime(0x0, 0x0, &(0x7f00000000c0)={{0x0, 0x989680}}, 0x0)
exit(0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = open$dir(&(0x7f0000000040)='.\x00', 0x0, 0x0)
getdents(r1, &(0x7f0000000280)=""/212, 0x7d)
lseek(r1, 0x2, 0x1)
pipe(&(0x7f0000000040)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = socket$inet_udp(0x2, 0x2, 0x0)
close(r2)
r3 = socket$inet6(0xa, 0x2, 0x0)
connect$inet6(r3, &(0x7f0000000000)={0xa, 0x0, 0x0, @mcast1, 0x2}, 0x1c)
connect$inet6(r3, &(0x7f0000003a40)={0xa, 0x0, 0x0, @ipv4={'\x00', '\xff\xff', @loopback}}, 0x1c)
setsockopt$inet6_IPV6_ADDRFORM(r3, 0x29, 0x1, &(0x7f00000001c0), 0x4)
write$binfmt_misc(r1, &(0x7f0000000200)=ANY=[], 0xfffffecc)
setsockopt$inet_int(r2, 0x0, 0xb, &(0x7f0000000180)=0x6, 0x4)
splice(r0, 0x0, r2, 0x0, 0x4ffe0, 0x0)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
setsockopt$inet_icmp_ICMP_FILTER(r0, 0x1, 0x9, &(0x7f0000000040), 0x4)
clone(0x200800059fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$netlink(0x10, 0x3, 0x0)
setsockopt$SO_ATTACH_FILTER(r0, 0x1, 0x1a, 0x0, 0x0)
exit_group(0x0)
syz_emit_ethernet(0x2e, &(0x7f00000001c0)={@random="e7b8d203b11e", @multicast, @void, {@ipv4={0x800, @igmp={{0x6, 0x4, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x2, 0x0, @empty, @multicast1, {[@ra={0x94, 0x4}]}}, {0x11, 0xff, 0x0, @empty}}}}}, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
unlinkat(0xffffffffffffffff, &(0x7f00000003c0)='./file0\x00', 0x0)
exit(0x0)
io_setup(0x81, &(0x7f0000000080)=<r0=>0x0)
r1 = openat$pidfd(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
io_submit(r0, 0x1, &(0x7f00000004c0)=[&(0x7f0000000140)={0x0, 0x0, 0x0, 0x0, 0x0, r1, 0x0}])
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x10012, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
clone(0x53109200, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
creat(&(0x7f0000000040)='./file0\x00', 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
preadv2(r0, 0x0, 0x660d1472115cb2d3, 0x0, 0x0, 0x0)
epoll_create1(0xc1a46fdaddf14141)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
clone(0x404000, 0x0, 0x0, 0x0, 0x0)
ptrace(0x10, r0)
waitid(0x0, 0x0, 0x0, 0x80000003, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit_group(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = open(&(0x7f0000000080)='.\x00', 0x0, 0x0)
lseek(r1, 0x3, 0x0)
getdents(r1, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = syz_open_procfs(0x0, &(0x7f0000000000)='gid_map\x00')
read$FUSE(r1, &(0x7f0000000080)={0x2020, 0x0, 0x0, 0x0, 0x0, <r2=>0x0}, 0x2020)
sched_getscheduler(r2)
rt_tgsigqueueinfo(r0, r0, 0x38, &(0x7f0000000000))
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x210000000013, &(0x7f00000000c0)=0x100000001, 0x4)
bind$inet(r0, &(0x7f0000000080)={0x2, 0x4e21, @local}, 0x10)
setsockopt$inet_tcp_TCP_REPAIR_QUEUE(r0, 0x6, 0x14, &(0x7f0000000140)=0x2, 0x4)
connect$inet(r0, &(0x7f0000000180)={0x2, 0x4e21, @local}, 0x10)
sendto$inet(r0, &(0x7f0000001e00)='r', 0x1, 0x0, 0x0, 0x0)
setsockopt$inet_tcp_TCP_REPAIR(r0, 0x6, 0x13, &(0x7f0000000200), 0x88)
sendto$inet(r0, &(0x7f00000004c0)='4', 0x1, 0x80d, 0x0, 0x0)
close(r0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = creat(&(0x7f0000000000)='./file0\x00', 0x0)
openat(r0, &(0x7f0000000080)='./file0\x00', 0x0, 0x0)
rt_sigreturn()
add_key$keyring(&(0x7f00000001c0), 0x0, 0x0, 0x0, 0xfffffffffffffffb)
perf_event_open(&(0x7f000001d000)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x7f}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
add_key$keyring(&(0x7f0000000100), &(0x7f0000000140)={'syz', 0x1}, 0x0, 0x0, 0xfffffffffffffffc)
exit_group(0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000180)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
setsockopt$SO_ATTACH_FILTER(r1, 0x1, 0x1a, &(0x7f0000ab9ff0)={0x2, &(0x7f0000000000)=[{0xb1}, {0x6}]}, 0x10)
sendmmsg(r0, &(0x7f0000001e00), 0x3fffffffffffe36, 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$inet6_int(r0, 0x29, 0x50, 0x0, &(0x7f0000000040)=0x76)
mount(&(0x7f0000000640)=@nbd={'/dev/nbd', 0x0}, &(0x7f00000017c0)='./file0\x00', &(0x7f0000001800)='tmpfs\x00', 0x0, &(0x7f0000001840)='/dev/null\x00')
write$P9_RREADLINK(0xffffffffffffffff, &(0x7f0000000300)=ANY=[], 0x116)
sendfile(0xffffffffffffffff, 0xffffffffffffffff, 0x0, 0x0)
r0 = creat(&(0x7f0000000280)='./file0\x00', 0xd931d3864d39dcd7)
write$binfmt_script(r0, &(0x7f0000000340)=ANY=[@ANYBLOB="2321202e2f66696c653020806f0600000e672400459f000000007d16651e000072dc000092290a7e4555a763c12ca9cfa45b9deb6116cb5ccda085e2da58f729fa6680d7b0ec89b551c642a09112a3b4aa0e74467700000000ee9f99ba46c604a8bf8893cc71495953142417d1b8785800a3044b2e8092657768f0771b65d33e129933dd93f99f00cd6b3e5903fcddb592a67f706e0000000000000052564b27ab02abd4f2964a0e00000000000096f5131fd16e111e9d85a914031b0eaf7392fde9073fd902ff944b7dd129dc18a2dd004ddd3f005d9467590271d8ed07d8e363bdd3fde4fef23005d54194d33c9cf0f1579f5fe90c8f78ad0fb55b80c7bc5b04ada2d1f3c69de1c8f88fd96431bf2d39b1afdff27bad3ea6a2c5861acd0359b88fe4f30f831ec3f16f207dd495191b966bdb24ec090901d1f6b4f8b80e849493667287362c3ca6f44859f2592f7a0e48dccf7b552178bb1496b209dd45bc8a07b5c179a4af932c37d863b131"], 0x80)
perf_event_open(&(0x7f00000002c0)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
close(r0)
ptrace$setopts(0x4206, 0x0, 0x0, 0x0)
execve(&(0x7f0000000180)='./file0\x00', 0x0, 0x0)
r1 = timerfd_create(0x1, 0x0)
timerfd_gettime(r1, &(0x7f0000000040))
r2 = timerfd_create(0x0, 0x0)
timerfd_gettime(r2, &(0x7f0000000040))
sendfile(r1, r2, 0x0, 0x0)
socket$inet6_udp(0xa, 0x2, 0x0)
rt_sigprocmask(0x2, &(0x7f0000000140)={[0x88b1]}, &(0x7f0000000180), 0x8)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet(0x2, 0x2, 0x0)
connect$inet(r0, &(0x7f00000004c0)={0x2, 0x4e23, @loopback}, 0x10)
sendto$inet(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
recvmmsg(r0, &(0x7f0000001840)=[{{0x0, 0x2b, 0x0}}], 0x1, 0x0, 0x0)
r1 = getpid()
rt_sigqueueinfo(r1, 0x39, &(0x7f0000000000))
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = getpgid(0x0)
ptrace$setopts(0x4206, r1, 0x0, 0x0)
rt_sigqueueinfo(r0, 0x8, &(0x7f0000000040))
creat(&(0x7f0000000040)='./bus\x00', 0x0)
syz_mount_image$tmpfs(&(0x7f00000002c0), &(0x7f0000000100)='./bus\x00', 0x0, 0x0, 0x0, 0x0, &(0x7f0000000000)=ANY=[])
r0 = open(&(0x7f0000000200)='./bus\x00', 0x10103e, 0x0)
mmap(&(0x7f0000000000/0x600000)=nil, 0x600000, 0x7ffffe, 0x4002011, r0, 0x0)
truncate(&(0x7f0000000040)='./bus\x00', 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
lseek(0xffffffffffffffff, 0x0, 0x0)
open(&(0x7f0000000000)='./file0\x00', 0x400a82, 0x0)
rt_sigreturn()
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER(0x1, 0x0, &(0x7f0000000040)={0x1, &(0x7f0000000000)=[{0x6, 0x0, 0x0, 0x7ffffffc}]})
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
rename(&(0x7f0000000440)='./file0\x00', &(0x7f0000000480)='./file0\x00')
r0 = socket$inet(0x2, 0x3, 0x2)
syz_emit_ethernet(0x52, &(0x7f0000000140)={@link_local, @remote, @void, {@ipv4={0x800, @icmp={{0xc, 0x4, 0x0, 0x0, 0x44, 0x0, 0x0, 0x0, 0x2, 0x0, @rand_addr, @multicast1, {[@timestamp_addr={0x44, 0x1c, 0x13, 0x1, 0x0, [{@initdev={0xac, 0x1e, 0x0, 0x0}}, {@private}, {@multicast1}]}]}}, @timestamp_reply}}}}, 0x0)
recvmsg(r0, 0x0, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
ioctl$FS_IOC_READ_VERITY_METADATA(r0, 0x5450, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
io_setup(0x0, &(0x7f0000000000)=<r0=>0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
io_destroy(r0)
r2 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000ffc000/0x1000)=nil)
shmat(r2, &(0x7f0000ffc000/0x2000)=nil, 0x0)
io_destroy(r0)
rt_sigreturn()
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000700)={0x2, &(0x7f0000000240)=[{0x4c}, {0x6, 0x0, 0x0, 0xffffffff}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r0 = socket$packet(0x11, 0x2, 0x300)
sendto$packet(r0, 0x0, 0x0, 0x0, &(0x7f0000000300)={0x11, 0x0, 0x0, 0x1, 0x0, 0x9, @local}, 0x14)
r0 = getpid()
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000080)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff})
ioctl$int_in(r1, 0x5452, &(0x7f0000000100)=0x29efd6cf)
fcntl$setsig(r1, 0xa, 0x12)
r3 = getpgid(0x0)
fcntl$setownex(r1, 0xf, &(0x7f0000000180)={0x2, r3})
recvmsg(r2, &(0x7f000095cfc8)={0x0, 0x0, 0x0}, 0x0)
dup2(r1, r2)
tkill(r0, 0x15)
prctl$PR_SET_NAME(0xf, &(0x7f00000000c0)='(0\xbe\xbf\x97`b\xdaB\x0f\xacI\xd6\x7f\x9a\xff\n\xd2\xcf\xac\xf5\x11\x11C(\xbaa\x1fJ\xbf\x13t\x1e\x83]\x84v\xde\xcfCX\b,\xd4+\x88uT\xc7\x91\xb5S\xc9\xddvs\xa8e\x9d\x94A\xe8\xc6,p`\x83`\xab\xedr\xeb\x97\xc2\xa6\xa8\x96A\xb6\xd1\xcd-wj\xe3\x91D\x03\xdd}2\'\x7f\xf4\xed\bB\xe9\xfe\xf7X\xfb1q\x12\xc1\x04\x8a(\x99\xe6\xd0U\xe5\x8d~\x9a\xec8\\\xa0\x8e\xc1(\xd2-\xc2\xf5\x1b!\xbc\xcazq\xea\x9d\xbf\xca~\xae\x92\x18t\x8d\'\xb7c-\x9f/\xc0l\x93\x8b\x00\x1a}\xaf\x0e\x10*b\xd3\xa8M\xdcz\xc6W\xe9\xd6\x12\f\xab\xe1\"\x02\xda\vWT\r\xccK\x1d\xc4\x7f\x99#\xab\x10\x0eD\xce\xcb`#>\xe4\'\xea\xc3;W\xb7\xaf')
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
recvfrom$inet6(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
sendto$inet6(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
shutdown(r0, 0x0)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
timer_create(0x0, 0x0, &(0x7f0000001380))
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x11, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
timer_settime(0x0, 0x1, &(0x7f0000000040)={{0x0, 0x989680}}, 0x0)
timer_create(0x0, &(0x7f0000001300)={0x0, 0x0, 0x0, @tid=0xffffffffffffffff}, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit_group(0x0)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$TCSETS(r0, 0x541b, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x0, 0x0, "f5aea01ac36f4b420fc6c9121f630c8b5b39cb"})
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$unix(0x1, 0x1, 0x0)
sendmsg$unix(r0, &(0x7f0000002680)={0x0, 0x0, 0x0, 0x0, &(0x7f0000002580)=[@cred={{0x1c, 0x1, 0x2, {0x0, 0x0, 0xffffffffffffffff}}}], 0x20}, 0x0)
rt_sigreturn()
syz_emit_ethernet(0x66, &(0x7f0000000000)={@local, @remote, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "d1e838", 0x30, 0x3a, 0x0, @private1, @mcast2, {[], @dest_unreach={0x1, 0x0, 0x0, 0x0, '\x00', {0x0, 0x6, "2e1e4e", 0x0, 0x6, 0x0, @local, @remote}}}}}}}, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000080)='net/dev_snmp6\x00')
r2 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000040), 0x5, 0x0)
ioctl$TCSETS(r2, 0x5402, &(0x7f0000000180)={0x0, 0x15, 0x0, 0x8, 0x0, "02e1000000060e008946e3c600fd8400"})
ioctl$TCSETS(r2, 0x40045431, &(0x7f00000000c0))
r3 = syz_open_pts(r2, 0x4000000000000002)
r4 = dup3(r3, r1, 0x0)
ioctl$TCXONC(r4, 0x540a, 0x0)
write$binfmt_script(r2, &(0x7f0000000000)=ANY=[@ANYBLOB="23010b6f00000000000000fd0d"], 0x17)
write$P9_RATTACH(r1, &(0x7f00000001c0)={0x14}, 0xffffffde)
fcntl$setstatus(r1, 0x4, 0x40800)
ioctl$TCXONC(r4, 0x540a, 0x1)
mkdir(&(0x7f00000003c0)='./file0\x00', 0x0)
setresuid(0xee01, 0x0, 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f00000002c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = getuid()
setresuid(0xee01, r1, 0x0)
link(&(0x7f0000000100)='./file0\x00', &(0x7f00000000c0)='./file1\x00')
r0 = openat$dir(0xffffffffffffff9c, &(0x7f0000000000)='.\x00', 0x0, 0x0)
getdents64(r0, 0x0, 0xffffffffffffffb4)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat$zero(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
mmap$perf(&(0x7f0000ffe000/0x2000)=nil, 0x2000, 0x0, 0x9811, r1, 0x0)
r2 = openat$zero(0xffffffffffffff9c, &(0x7f0000000100), 0x2, 0x0)
mmap$perf(&(0x7f0000ffe000/0x2000)=nil, 0x2000, 0x0, 0x9811, r2, 0x0)
syz_emit_ethernet(0x36, &(0x7f0000001500)={@broadcast, @dev, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "9061a9", 0x0, 0x3b, 0x0, @remote, @empty}}}}, 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
connect$inet(r0, &(0x7f00000003c0)={0x2, 0x4e24, @empty}, 0x10)
getpeername$inet(r0, 0x0, &(0x7f0000000740))
perf_event_open(0x0, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000180), 0x2f606557d6081b8a, 0x0)
read$FUSE(0xffffffffffffffff, 0x0, 0x0)
write$FUSE_DIRENTPLUS(0xffffffffffffffff, &(0x7f0000002340)=ANY=[@ANYBLOB="070000000004", @ANYRES32], 0xd0)
ioctl$TCSETS(r0, 0x40045431, &(0x7f0000000240)={0x0, 0x0, 0x0, 0x0, 0x0, "8000"})
write$binfmt_aout(r0, &(0x7f0000000240)=ANY=[], 0xfe8d)
shmat(0x0, &(0x7f0000ffc000/0x4000)=nil, 0x0)
r1 = syz_open_pts(r0, 0x0)
ioctl$TCSETSF(r1, 0x5412, &(0x7f0000000000)={0x10017})
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x4040, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000100)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
rt_sigreturn()
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
mount(&(0x7f0000000300)=ANY=[], &(0x7f0000000140)='./file0\x00', &(0x7f0000000180)='proc\x00', 0x0, 0x0)
readlinkat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', &(0x7f00000001c0)=""/208, 0xd0)
rt_sigreturn()
poll(0x0, 0x0, 0x7fff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f00000027c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
timer_settime(0x0, 0x0, &(0x7f0000000200)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
clone(0x2008321cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x16, 0x0, @thr={0x0, 0x0}}, &(0x7f00000003c0)=<r2=>0x0)
timer_settime(r2, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
mlock2(&(0x7f0000276000/0x2000)=nil, 0x2000, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER(0x1, 0x0, &(0x7f00000002c0)={0x2, &(0x7f0000000000)=[{0x7c}, {0x6, 0x0, 0x0, 0xfffffffd}]})
prctl$PR_SET_DUMPABLE(0x4, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
prctl$PR_SET_PTRACER(0x27, 0xffffffffffffffff)
r0 = creat(&(0x7f0000000000)='./file0\x00', 0x0)
fallocate(r0, 0x0, 0x0, 0x4000000e)
r0 = openat$tcp_congestion(0xffffffffffffff9c, &(0x7f0000000000), 0x500, 0x0)
lseek(r0, 0x2, 0x1)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = semget$private(0x0, 0x3, 0x0)
semop(r0, &(0x7f0000000000)=[{0x2, 0x7}, {0x2, 0x7fff}], 0x2)
exit(0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
mmap(&(0x7f0000ffe000/0x1000)=nil, 0x1000, 0x4, 0x11, r0, 0x0)
mlock(&(0x7f0000ffd000/0x2000)=nil, 0x2000)
mprotect(&(0x7f0000ffc000/0x4000)=nil, 0x4000, 0x2)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff})
getsockopt$sock_int(r0, 0x1, 0xb, &(0x7f0000000040), &(0x7f0000000080)=0x4)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
execve(&(0x7f0000000040)='./file0\x00', &(0x7f00000001c0)=[0x0], 0x0)
clone(0xa0064700, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
removexattr(&(0x7f0000000000)='./file0\x00', &(0x7f0000000040)=@known='user.incfs.size\x00')
prlimit64(0x0, 0x0, &(0x7f0000000080), 0x0)
r0 = socket(0x2, 0x3, 0x1)
connect$inet6(r0, &(0x7f0000000000)={0xa, 0x0, 0x0, @empty}, 0x1c)
unshare(0x20600)
r0 = inotify_init1(0x0)
fchown(r0, 0x0, 0xee00)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0, 0x10, 0xffffffffffffffff, 0x0)
ioctl$TIOCSLCKTRMIOS(r0, 0x5401, &(0x7f0000000080))
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
listen(r0, 0x0)
r1 = fcntl$dupfd(r0, 0x0, r0)
listen(r1, 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000001100)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setreuid(0x0, 0xee00)
r1 = geteuid()
setreuid(r1, 0x0)
r2 = getpid()
prlimit64(r2, 0x0, 0x0, 0x0)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
open$dir(&(0x7f0000000040)='./file1\x00', 0x2140, 0x0)
r0 = open$dir(&(0x7f00000001c0)='./file1\x00', 0x200800, 0x0)
getdents64(r0, 0x0, 0x0)
exit(0x0)
clone(0x200800059fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
sendto$inet6(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
bind$inet6(r0, &(0x7f0000000180)={0xa, 0x0, 0x0, @dev}, 0x1c)
exit_group(0x0)
syz_mount_image$fuse(0x0, &(0x7f0000000080)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
r0 = openat$dir(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', 0x0, 0x0)
mknodat(r0, &(0x7f0000000200)='./file0\x00', 0x0, 0x0)
futimesat(r0, 0x0, 0x0)
execveat(r0, &(0x7f0000000040)='./file0\x00', &(0x7f00000002c0)=[&(0x7f00000000c0)='&\x00', &(0x7f0000000100)='-..[&]\x00', &(0x7f0000000140)='\x00', &(0x7f0000000180)='^:[}(\\-//*-+\x00', &(0x7f00000001c0)='$\x00', &(0x7f0000000240)=']:[--]\x00', &(0x7f0000000280)='(-\x00'], &(0x7f0000000380)=[&(0x7f0000000300)='\x00', &(0x7f0000000340)='\xd9&\\\'\x00'], 0x1000)
umount2(&(0x7f0000000480)='./file0\x00', 0xa)
setxattr$trusted_overlay_nlink(&(0x7f00000003c0)='./file0\x00', &(0x7f0000000400), &(0x7f0000000440)={'L+', 0x5738}, 0x16, 0x3)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = open(&(0x7f00000001c0)='./bus\x00', 0x140042, 0x0)
munlock(&(0x7f0000009000/0x1000)=nil, 0xffffffffdfff6fff)
fallocate(r0, 0x0, 0x0, 0x7fffffff)
r0 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
setsockopt$inet6_buf(r0, 0x29, 0x15, 0x0, 0x0)
getrlimit(0x539c711a53038ef1, 0x0)
clock_nanosleep(0x4b1259eb839ac536, 0x0, &(0x7f0000000000)={0x0, 0x3938700}, 0x0)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = epoll_create1(0x0)
epoll_pwait(r1, &(0x7f0000000140)=[{}], 0x1, 0x40100, 0x0, 0x0)
epoll_wait(r1, &(0x7f0000000500)=[{}], 0x1, 0x10001)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
epoll_ctl$EPOLL_CTL_ADD(r1, 0x1, r0, &(0x7f0000000040))
ioctl$TIOCSETD(r0, 0x5423, &(0x7f0000000080))
exit(0x0)
clone(0x12106100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = gettid()
r2 = gettid()
rt_tgsigqueueinfo(r1, r2, 0x2c, &(0x7f00000005c0))
r3 = open$dir(&(0x7f0000000040)='.\x00', 0x0, 0x0)
fstat(r3, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, <r4=>0x0})
setuid(r4)
tgkill(r0, r1, 0x0)
pipe(&(0x7f0000000180))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat(r0, &(0x7f0000000000)='/proc/self/exe\x00', 0x21c140, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x280000f, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = socket$inet_udp(0x2, 0x2, 0x0)
close(r2)
socket$nl_route(0x10, 0x3, 0x0)
socket$inet6_tcp(0xa, 0x1, 0x0)
mkdir(&(0x7f00000000c0)='./file0\x00', 0x0)
mount(0x0, &(0x7f00000004c0)='./file0\x00', &(0x7f0000000000)='cgroup\x00', 0x0, &(0x7f0000000540)='pids')
chdir(&(0x7f0000000080)='./file0\x00')
perf_event_open(&(0x7f0000000000)={0x1000000002, 0x80, 0x800000000000013, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={0x0}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3ff}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
fstat(0xffffffffffffffff, 0x0)
syz_fuse_handle_req(0xffffffffffffffff, 0x0, 0x0, 0x0)
fstat(0xffffffffffffffff, 0x0)
rmdir(&(0x7f0000000100)='./file0\x00')
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000000)={<r1=>0xffffffffffffffff})
connect(r1, &(0x7f0000000040)=@rxrpc=@in4={0x21, 0x0, 0x2, 0x10, {0x2, 0x0, @loopback}}, 0x80)
fchmodat(0xffffffffffffffff, &(0x7f00000004c0)='./file0\x00', 0x0)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
io_setup(0x5, &(0x7f0000000840)=<r0=>0x0)
r1 = openat$tcp_mem(0xffffffffffffff9c, &(0x7f0000000000)='/proc/sys/net/ipv4/tcp_wmem\x00', 0x1, 0x0)
io_submit(r0, 0x1, &(0x7f0000000100)=[&(0x7f00000000c0)={0x0, 0x0, 0x0, 0x1, 0x0, r1, &(0x7f0000000040)="e0", 0x1, 0x9}])
perf_event_open(&(0x7f0000000180)={0x2, 0x70, 0xfffffffffffffffd, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5010, 0x4000000000000}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000200)={0x0, <r0=>0x0})
r1 = syz_open_procfs(0x0, &(0x7f0000000240)='net\x00')
fchdir(r1)
sendmmsg(r0, &(0x7f0000008600)=[{{0x0, 0x0, &(0x7f0000003140)}}, {{&(0x7f00000072c0)=@un=@file={0x1, './file0\x00'}, 0xa, &(0x7f0000007380), 0x0, &(0x7f0000000600)}}], 0x2, 0x0)
syz_open_procfs(0x0, 0x0)
fchdir(0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = semget$private(0x0, 0x8, 0x0)
semop(r1, &(0x7f00000000c0)=[{0x0, 0xff7d}], 0x1)
semtimedop(r1, &(0x7f0000000080)=[{0x0, 0xffff}, {0x0, 0x100}], 0x2, 0x0)
semop(r1, &(0x7f0000000040)=[{0x0, 0x0, 0x1800}, {0x0, 0x200}], 0x2)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000700)={0x2, &(0x7f0000000100)=[{0x10}, {0x6}]})
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
io_setup(0x5, &(0x7f0000000680)=<r1=>0x0)
io_submit(r1, 0x0, 0x0)
perf_event_open(&(0x7f0000000000)={0x2, 0x80, 0x83, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
perf_event_open(&(0x7f0000000000)={0x2, 0x80, 0x83, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
mremap(&(0x7f0000146000/0x3000)=nil, 0x3000, 0x3000, 0x0, &(0x7f0000573000/0x3000)=nil)
perf_event_open(&(0x7f0000000000)={0x2, 0x80, 0x83, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
io_submit(r1, 0x3, &(0x7f0000001a40)=[&(0x7f0000000100)={0x0, 0x0, 0x0, 0x2, 0x0, r0, &(0x7f0000000080)}, &(0x7f0000000180)={0x0, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff, 0x0}, 0x0])
clone(0x0, &(0x7f0000000000), 0x0, 0x0, 0x0)
ioctl$PERF_EVENT_IOC_DISABLE(0xffffffffffffffff, 0x2401, 0x0)
clock_nanosleep(0x0, 0x0, &(0x7f0000000000)={0x0, 0x14a9218000000000}, 0x0)
r0 = signalfd(0xffffffffffffffff, &(0x7f0000000040), 0x8)
linkat(0xffffffffffffff9c, &(0x7f0000000000)='\x00', r0, &(0x7f0000000080)='.\x00', 0x1400)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
io_setup(0x6, &(0x7f0000000000)=<r0=>0x0)
r1 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000180)='maps\x00')
io_submit(r0, 0x1, &(0x7f00000002c0)=[&(0x7f0000000080)={0x0, 0x0, 0x0, 0x0, 0x0, r1, 0x0}])
r2 = getpid()
rt_tgsigqueueinfo(r2, r2, 0x4, &(0x7f00000000c0))
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
getsockopt$inet6_int(r0, 0x29, 0x50, 0x0, &(0x7f0000000000)=0x2c)
clone(0x24100, 0x0, 0x0, 0x0, 0x0)
r0 = openat$fuse(0xffffffffffffff9c, &(0x7f0000002080), 0x2, 0x0)
fsetxattr$trusted_overlay_opaque(r0, &(0x7f0000000200), 0x0, 0x0, 0x3)
rt_sigreturn()
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000300)={0x2, &(0x7f0000000040)=[{0x14}, {0x6, 0x0, 0x0, 0x7ffffdbf}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x3800009, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
perf_event_open(&(0x7f000001d000)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x41c1}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
syz_mount_image$tmpfs(&(0x7f00000002c0), &(0x7f0000000100)='./bus\x00', 0x0, 0x0, 0x0, 0x0, &(0x7f0000000000)=ANY=[@ANYBLOB='huge=always'])
chdir(&(0x7f0000000380)='./bus\x00')
listxattr(0x0, 0x0, 0x0)
r0 = creat(&(0x7f0000000040)='./bus\x00', 0x0)
ftruncate(r0, 0x800)
lseek(r0, 0x1200, 0x0)
open(0x0, 0x0, 0x0)
r1 = open(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
sendfile(r0, r1, 0x0, 0x8400fffffffa)
creat(&(0x7f0000000200)='./bus\x00', 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
bind$inet6(r0, &(0x7f0000d84000)={0xa, 0x2}, 0x1c)
sendto$inet6(r0, &(0x7f0000f6f000), 0xfffffffffffffea7, 0x20000004, &(0x7f0000b63fe4)={0xa, 0x2}, 0x1c)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = syz_open_procfs(0x0, &(0x7f0000000200)='net/tcp6\x00')
r3 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f00000001c0)='cgroup.controllers\x00', 0x275a, 0x0)
write$binfmt_script(r3, &(0x7f0000000400)=ANY=[], 0x208e24b)
preadv(r2, &(0x7f0000000180)=[{&(0x7f0000000380)=""/155, 0x9b}], 0x1, 0x200000000000004, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
fstat(r0, &(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, <r1=>0x0})
setuid(r1)
setreuid(0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000080), 0x0)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$netlink(0x10, 0x3, 0x0)
getsockopt$sock_int(r0, 0x1, 0x10, 0x0, &(0x7f0000000040))
r1 = gettid()
rt_sigqueueinfo(r1, 0xa, &(0x7f0000000040))
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x0, 0x0, @thr={0x0, 0x0}}, 0x0)
timer_settime(0x0, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
r2 = gettid()
tkill(r2, 0x18)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000040)='comm\x00')
read$FUSE(r1, &(0x7f0000002900)={0x2020}, 0x15b)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt$inet_tcp_buf(r0, 0x6, 0x5, &(0x7f0000000000)=""/212, &(0x7f0000000100)=0xd4)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
pwrite64(r1, 0x0, 0x0, 0x0)
rt_sigqueueinfo(r0, 0x8, &(0x7f0000000040))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f0000000300)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
statfs(&(0x7f0000000000)='./file0/file0\x00', &(0x7f0000000180)=""/219)
rt_sigprocmask(0x0, &(0x7f0000000100)={[0xfffffffffffe]}, 0x0, 0x8)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
ppoll(0x0, 0x0, &(0x7f0000000440), &(0x7f0000000480)={[0x1]}, 0x8)
rt_sigqueueinfo(r0, 0x24, &(0x7f0000000040))
set_robust_list(&(0x7f00000003c0)={&(0x7f0000000380), 0x101}, 0x18)
ppoll(0x0, 0x0, 0x0, &(0x7f00000000c0), 0x8)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = gettid()
mount$fuse(0xf0ffff, &(0x7f00000020c0)='./file0\x00', &(0x7f0000002100), 0x0, &(0x7f0000000040)=ANY=[@ANYBLOB='fd=', @ANYRESOCT=r1])
tkill(r0, 0xe)
pipe2(&(0x7f0000000140)={0xffffffffffffffff, <r0=>0xffffffffffffffff}, 0x0)
pread64(r0, 0x0, 0x0, 0xfffffffffffffffc)
r0 = socket$unix(0x1, 0x3, 0x0)
listen(r0, 0x0)
creat(&(0x7f0000000040)='./file0\x00', 0x0)
removexattr(&(0x7f0000000300)='./file0\x00', &(0x7f0000000340)=@known='trusted.overlay.nlink\x00')
syz_emit_ethernet(0x50, &(0x7f0000002480)={@multicast, @empty, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "a0f009", 0x1a, 0x3a, 0xff, @remote, @mcast2, {[], @ndisc_na={0x88, 0x0, 0x0, 0x0, '\x00', @empty, [{}]}}}}}}, 0x0)
syz_emit_ethernet(0x56, &(0x7f0000000080)={@multicast, @random="d57f62fbe364", @void, {@ipv6={0x86dd, @tipc_packet={0x0, 0x6, "4585f8", 0x20, 0x6, 0x0, @private2, @loopback, {[], @payload_direct={{{{0x20, 0x0, 0x0, 0x0, 0x0, 0x8}}}}}}}}}, 0x0)
unshare(0x40000400)
mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0x3, 0x8031, 0xffffffffffffffff, 0x0)
ioctl$TUNSETIFF(0xffffffffffffffff, 0x400454ca, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x800006, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
getsockopt$IPT_SO_GET_INFO(r1, 0x0, 0x40, &(0x7f0000000180)={'raw\x00'}, &(0x7f00000000c0)=0x54)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
r2 = syz_open_pts(r1, 0x0)
ioctl$BTRFS_IOC_QGROUP_ASSIGN(r2, 0x40189429, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000001180), 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000240))
timer_settime(0x0, 0x0, &(0x7f0000000340)={{0x0, 0x989680}, {0x0, 0x989680}}, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f00000001c0)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
unshare(0x20020400)
fcntl$lock(r1, 0x6, &(0x7f0000002000)={0x1})
fcntl$lock(r1, 0x26, &(0x7f0000000080))
fcntl$lock(r1, 0x7, &(0x7f0000000100))
timer_create(0x0, &(0x7f0000000000)={0x0, 0x14, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000200)=<r2=>0x0)
fcntl$lock(r1, 0x26, &(0x7f0000000040)={0x1})
timer_settime(r2, 0x0, &(0x7f0000000140)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
chdir(&(0x7f0000000080)='./bus\x00')
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
r1 = dup(r0)
ioctl$PERF_EVENT_IOC_ENABLE(r1, 0x8912, 0x400200)
setsockopt$IPT_SO_SET_ADD_COUNTERS(r0, 0x0, 0x41, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
get_mempolicy(0x0, &(0x7f0000000080), 0x5, &(0x7f0000304000/0x1000)=nil, 0x2)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
socket(0xa, 0x80002, 0x1)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000080)={0x2, &(0x7f00000000c0)=[{0x3d}, {0x6, 0x0, 0x0, 0x7ffffff8}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
syz_emit_ethernet(0x66, &(0x7f0000000380)={@local, @broadcast, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "b61d18", 0x30, 0x0, 0x0, @dev, @local, {[@dstopts={0x3a, 0x4, '\x00', [@hao={0xc9, 0x10, @private2}, @hao={0xc9, 0x10, @private2}]}]}}}}}, 0x0)
r0 = open(&(0x7f00000000c0)='.\x00', 0x0, 0x0)
lseek(r0, 0x438b2ecd, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = socket(0x10, 0x80002, 0x0)
bind$netlink(r1, &(0x7f0000177ff4)={0x10, 0x0, 0x1}, 0xc)
write(r1, &(0x7f0000000140)="2600000022004701050000000000000005006d20002b1f000a4a51f1ee839cd53400b017ca5b", 0x26)
connect$netlink(r1, &(0x7f00000014c0)=@proc={0x10, 0x0, 0x1}, 0xc)
setsockopt$sock_int(r1, 0x1, 0x8, &(0x7f0000b4bffc), 0x4)
sendto(r1, 0x0, 0x0, 0x0, 0x0, 0x0)
recvmmsg(r1, &(0x7f0000000780), 0x3ffffffffffff81, 0x2, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
fsetxattr$security_evm(r0, &(0x7f0000000480), &(0x7f0000000640)=ANY=[], 0x101, 0x0)
symlink(0x0, 0x0)
openat(0xffffffffffffff9c, 0x0, 0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
fallocate(r1, 0x0, 0x102000006, 0x6)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
clock_getres(0xb, 0x0)
rt_sigreturn()
sync_file_range(0xffffffffffffffff, 0xffffffffffffffe1, 0x0, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
chdir(&(0x7f00000001c0)='./file0\x00')
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setxattr$smack_xattr_label(&(0x7f0000000140)='./file0\x00', &(0x7f0000000100)='security.SMACK64IPIN\x00', 0x0, 0x0, 0x0)
rt_sigreturn()
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
connect(r0, &(0x7f00000004c0)=@in={0x2, 0x0, @dev={0xac, 0x14, 0x14, 0x20}}, 0x80)
sendmmsg$inet6(r0, &(0x7f0000002d40)=[{{&(0x7f0000000000)={0x2, 0x4e20, 0x0, @empty}, 0x1c, 0x0}}, {{&(0x7f0000000040)={0xa, 0x4e20, 0x0, @dev}, 0x1c, 0x0}}], 0x2, 0x0)
r0 = socket$unix(0x1, 0x2, 0x0)
setreuid(0x0, 0xee00)
sendmsg$unix(r0, &(0x7f0000000340)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000300)=[@cred={{0x1c, 0x1, 0x2, {0xffffffffffffffff}}}], 0x20}, 0x0)
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
fcntl$notify(r0, 0x8, 0x80000000)
rt_sigreturn()
clone(0x4000c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
getresgid(0x0, 0x0, 0x0)
mount$fuse(0x0, 0x0, &(0x7f0000000040), 0x0, 0x0)
r0 = gettid()
tkill(r0, 0x25)
r0 = creat(&(0x7f0000000080)='./bus\x00', 0x0)
rt_sigprocmask(0x0, &(0x7f0000000240)={[0xfffffffffffffffd]}, 0x0, 0x8)
setrlimit(0x1, &(0x7f0000000080))
ftruncate(r0, 0xff)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = open(&(0x7f0000000300)='./bus\x00', 0x142042, 0x0)
fallocate(r0, 0x0, 0x0, 0x7fffffff)
clone(0x2fa4d1c0, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
lremovexattr(0x0, 0x0)
r1 = gettid()
tkill(r1, 0x18)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
ioctl$TCSETSW(r0, 0x5403, &(0x7f0000000040)={0x0, 0xffffffff, 0x0, 0x0, 0x0, "00000100"})
ioctl$TCSETS(r0, 0x40045431, &(0x7f00003b9fdc))
r1 = syz_open_pts(r0, 0x2)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x1, 0x0, 0x0)
write(r1, &(0x7f0000002340)="d17b76f4a4d725f070b5511c7aa017f8230882dc6b1fd2f99aeb13c4b51ebaba58f00777b2d5a56278ecf61c818f6e3954b87f4fb1e1f15c7d5ecf2bcd59643fea88974b8f9bdd1849f00b8cba7bbefa58347d9c0b248df4ef2e902e021863ad6d66d7b83aacff26280b4d4c5ebffb6a7cdfa0da5cbfae21655c3a1b785fd079203ecca06e3074cbb12fb48142dcf7b99611fbe01cfc345917e6eb7e5a673edb1c67769e3219550d8898e6e4f6815e571082da6b6509091f8560af4fc5759f215b46a60b2eb6457ee216763c298031b97bbf5a018069a49d556faa8138918a07a3c3dd30a90c249a8764e49ddc84726240d54c94f0941061464d6e877444c8065e26b71a7c119a3642907d8e999133cc7442b77acfee6d574b48698f1d84e03d3f7ebe86c8f3215e84443e748b89de7e5f7e098fa461d510d6d8f246a2e04f5032837f6a177e310cfc2cb145eb5c7bf2542f72da141610718f4f3ff7c2428c3c266f192ae7", 0x165)
r0 = semget$private(0x0, 0x2, 0x35)
semctl$IPC_STAT(r0, 0x0, 0x2, &(0x7f0000000080)=""/172)
r0 = socket$inet6(0xa, 0x1, 0x0)
setsockopt$inet6_tcp_int(r0, 0x6, 0xa, 0x0, 0x0)
clone(0x200800059fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
connect(r0, 0x0, 0x0)
exit_group(0x0)
syz_emit_ethernet(0x3e, &(0x7f0000001580)={@link_local, @remote, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "03ce02", 0x8, 0x3a, 0xff, @remote, @mcast2, {[], @ndisc_rs}}}}}, 0x0)
prctl$PR_GET_NAME(0x10, &(0x7f0000000040)=""/183)
clone(0x106300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setrlimit(0x7, &(0x7f0000000040))
socketpair$nbd(0x1, 0x1, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
rt_sigreturn()
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mknodat(0xffffffffffffffff, &(0x7f0000000000)='./file0\x00', 0x0, 0x0)
exit_group(0x0)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mknod$loop(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
mount(&(0x7f0000000280)=ANY=[], &(0x7f00000001c0)='./file0\x00', &(0x7f0000000180)='cgroup\x00', 0x0, 0x0)
statfs(&(0x7f0000000640)='./file0\x00', 0x0)
exit(0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt$sock_timeval(r0, 0x1, 0xf, 0x0, &(0x7f0000000240))
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = gettid()
r2 = fork()
r3 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
tkill(r2, 0x23)
ptrace(0x10, r2)
tgkill(r0, r1, 0x24)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
chdir(&(0x7f00000001c0)='./file0\x00')
open$dir(&(0x7f0000000040)='.\x00', 0x1670c2, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigreturn()
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000240)='./file0\x00', 0x0)
chmod(&(0x7f0000000180)='./file0\x00', 0x1d)
setuid(0xee01)
mkdir(&(0x7f0000000340)='./file0/file0\x00', 0x0)
rename(&(0x7f0000000140)='./file0/file0\x00', &(0x7f0000000200)='./file0\x00')
r0 = gettid()
rt_sigqueueinfo(r0, 0xa, &(0x7f0000000180))
syz_emit_ethernet(0x56, &(0x7f00000008c0)={@multicast, @broadcast, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "cd8d24", 0x20, 0x3a, 0xff, @initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @mcast2, {[], @ndisc_na={0x88, 0x0, 0x0, 0x0, '\x00', @empty, [{0x19, 0x1, "afd6a34df3b0"}]}}}}}}, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
socket$packet(0x11, 0x0, 0x300)
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800013, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000100)='status\x00')
preadv(r1, 0x0, 0x0, 0x0, 0x0)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
r0 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
bind$inet6(r0, &(0x7f0000000080)={0xa, 0x0, 0x0, @local, 0x2000002}, 0x1c)
connect$inet6(r0, &(0x7f00000000c0)={0xa, 0x0, 0x0, @remote, 0x7}, 0x1c)
clone(0x10024100, 0x0, 0x0, 0x0, 0x0)
getsockopt$inet_IP_IPSEC_POLICY(0xffffffffffffffff, 0x0, 0x10, 0x0, 0x0)
setresgid(0xffffffffffffffff, 0x0, 0x0)
rt_sigreturn()
r0 = creat(&(0x7f0000000280)='./file0\x00', 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x1, 0x0, 0x0)
fremovexattr(r0, &(0x7f0000000040)=@random={'security.', '$@%\x00'})
fork()
shmget$private(0x0, 0x3000, 0x40, &(0x7f0000ffc000/0x3000)=nil)
getuid()
getuid()
fork()
shmctl$IPC_SET(0x0, 0x1, 0x0)
r0 = shmget$private(0x0, 0x1000, 0x0, &(0x7f0000ffc000/0x1000)=nil)
shmctl$IPC_RMID(r0, 0x0)
r0 = epoll_create(0x101)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r1 = socket$unix(0x1, 0x2, 0x0)
epoll_ctl$EPOLL_CTL_MOD(r0, 0x3, r1, 0x0)
rt_sigreturn()
r0 = open(&(0x7f00000000c0)='./file0\x00', 0x2042, 0x0)
pipe(&(0x7f0000000100)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = openat(0xffffffffffffff9c, &(0x7f0000000080)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r2, 0x0)
splice(r0, &(0x7f0000000040), r1, 0x0, 0x8000, 0x0)
perf_event_open(&(0x7f0000000440)={0x2, 0x80, 0xbb, 0x2, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={0x0}}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x2200c0, 0x0)
readlinkat(r0, &(0x7f0000002300)='\x00', &(0x7f0000002340)=""/55, 0x37)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x4000000206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit(0x0)
r0 = gettid()
tkill(r0, 0x18)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(0x0, 0x0)
rename(&(0x7f00000000c0)='./file0/file1\x00', 0x0)
exit_group(0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
pipe(&(0x7f0000000100)={<r2=>0xffffffffffffffff, <r3=>0xffffffffffffffff})
splice(r0, 0x0, r3, 0x0, 0x8ec0, 0x0)
fcntl$setpipe(r3, 0x407, 0x8000000000)
dup3(r1, r2, 0x0)
r4 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r4, 0x0)
preadv(r4, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r5 = dup2(r2, r3)
write$FUSE_POLL(r5, &(0x7f0000000180)={0x18}, 0xffa0)
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
getsockopt(r0, 0x0, 0x2, &(0x7f0000000100)=""/87, &(0x7f0000000040)=0x57)
getitimer(0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000100)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x180000f, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setgroups(0x2, &(0x7f00000003c0)=[0x0, 0x0])
getgroups(0x2, &(0x7f0000000000)=[0x0, 0x0])
clone(0x3004100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
setsockopt$sock_int(r0, 0x1, 0x9, 0x0, 0x0)
rt_sigreturn()
syz_emit_ethernet(0x46, &(0x7f0000000540)={@link_local, @random="c710c0f1b80c", @void, {@ipv4={0x800, @tipc={{0x8, 0x4, 0x0, 0x0, 0x38, 0x0, 0x0, 0x0, 0x6, 0x0, @rand_addr=0x64010102, @multicast1, {[@rr={0x7, 0xb, 0x6, [@remote, @initdev={0xac, 0x1e, 0x0, 0x0}]}]}}, @payload_conn={{{0x18, 0x0, 0x0, 0x0, 0x0, 0x6}}}}}}}, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000100)={0x3, &(0x7f0000000080)=[{0x1d}, {}, {0x6, 0x0, 0x0, 0xffffff00}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mkdir(&(0x7f00000000c0)='./bus\x00', 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mount$overlay(0x400002, &(0x7f0000000000)='./bus\x00', &(0x7f0000000100), 0x0, &(0x7f0000000080)={[{@index_on}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = semget$private(0x0, 0x20000000102, 0x0)
semop(r1, &(0x7f0000000140)=[{0x0, 0x4}], 0x1)
semop(r1, &(0x7f0000000000)=[{0x0, 0xe34c}], 0x1)
semop(r1, &(0x7f0000000080)=[{}, {}], 0x2)
semctl$IPC_RMID(r1, 0x0, 0x10)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000080), 0x230140, 0x0)
fstat(r0, &(0x7f00000000c0))
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$unix(0x1, 0x2, 0x0)
dup3(r0, 0xffffffffffffffff, 0x0)
accept4(0xffffffffffffffff, 0x0, 0x0, 0x0)
rt_sigreturn()
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt$IPT_SO_GET_ENTRIES(r0, 0x0, 0x41, 0x0, &(0x7f0000000980)=0x28)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000000c0)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
recvmmsg(r0, &(0x7f0000002840)=[{{0x0, 0x0, 0x0}}], 0x1, 0x0, 0x0)
r2 = memfd_create(&(0x7f00005f8ffe)='#}\x00', 0x0)
ftruncate(r2, 0x40001)
setsockopt$sock_int(r0, 0x1, 0x10, &(0x7f0000000140)=0x4, 0x4)
r3 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
sendfile(r1, r2, 0x0, 0x100000000002)
r0 = openat$full(0xffffffffffffff9c, &(0x7f00000004c0), 0x0, 0x0)
linkat(r0, &(0x7f0000000000)='\x00', 0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x1000)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x44000, 0x0)
preadv(r0, 0x0, 0x0, 0x0, 0x0)
mknodat(0xffffffffffffff9c, &(0x7f0000000100)='./file1\x00', 0x0, 0x0)
mount(&(0x7f00000000c0)=@sg0, &(0x7f0000000240)='./file1\x00', &(0x7f0000000280)='proc\x00', 0x0, 0x0)
umount2(&(0x7f0000000000)='./file1\x00', 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000580)={0x2, &(0x7f00000000c0)=[{0xc}, {0x6, 0x0, 0x0, 0x7fffffff}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4000c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt$inet_tcp_int(r1, 0x6, 0xc, 0x0, &(0x7f00000000c0))
tkill(r0, 0x18)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$inet6_tcp_int(r0, 0x6, 0x3, &(0x7f0000000100), &(0x7f0000000140)=0x4)
clone(0x2100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
wait4(0x0, 0x0, 0x40000000, 0x0)
exit_group(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = gettid()
recvmmsg(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
ptrace$setopts(0x4206, r1, 0xfff, 0x0)
preadv(0xffffffffffffffff, 0x0, 0x0, 0x7, 0x0)
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r0 = openat$full(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
timerfd_settime(r0, 0x0, 0x0, 0x0)
exit_group(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = memfd_create(&(0x7f0000000340)='-vboxnet0\x00]\xea\xb0\xe2N\xc6c\x05\x8d\xb5\xc0\n\xad\x0f#+\x17\xd6A\xf4\xdf\x1b\xf9~\x8e\"\r\xff\xbb\xb0\xc3\x86\x97\xaf\xf8\x7f*\xfc\xfd\xe7\xcc\xbc\xddI.\xc3\x9aQ\xc8\x8e8U\xdaX\x06\x92\x1d\xbd\x10\xf9\xe8q\x00\xddr>\a\x00\x00\x00\xeeE\xa83K\xf60 \x7f\xcd\xac\x86\x9eT*\xf7\a^L,\x98\xa2(2,\x8c*\xff\x00\x00\xa1\x15<g8\xddGdF(\xeaN\xbcSoz\xe5.c\xdf\x8b\xd2mu\xc7D\x98|\x04\xeb\xe0\xc6\x96\xd4p.e\x050,c9:\x95z\xe2\xa9\xcf\xd2\x90*\xaa\xc0W.\xcb|.F_\xf8(\xa4\x91\xc8\v\x9b>3\x15\b\xefw\xa6\xc7c@\xb5\x9b\xdc$`\x00\x00', 0x0)
ftruncate(r1, 0x1000000)
r2 = openat$null(0xffffffffffffff9c, &(0x7f00000001c0), 0x101202, 0x0)
sendfile(r2, r1, 0x0, 0xeefffdef)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
getresuid(&(0x7f00000000c0), &(0x7f0000000140), &(0x7f0000000180))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat$tun(0xffffffffffffff9c, &(0x7f00000000c0), 0x6, 0x0)
ioctl$TUNSETIFF(r1, 0x400454ca, &(0x7f0000000000))
write$cgroup_subtree(r1, &(0x7f0000000240)=ANY=[], 0xfd45)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = inotify_init1(0x0)
fcntl$setown(r0, 0x8, 0xffffffffffffffff)
fcntl$getownex(r0, 0x10, &(0x7f0000000080)={0x0, <r1=>0x0})
ptrace$setopts(0x4206, r1, 0x0, 0x0)
ptrace(0x4207, r1)
pwritev2(0xffffffffffffffff, &(0x7f0000000040)=[{&(0x7f0000000140)="7a9f572884b013ddaba73a0fc7db1a20556cbba49056e71cea33a5a8df784781724960a62999e21f0976000890e66c6b2b378a1b36bb7968dd", 0x39}], 0x1, 0x0, 0x0, 0x0)
ptrace$setregs(0xd, r1, 0x0, &(0x7f00000000c0))
rt_sigreturn()
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
timer_create(0x0, &(0x7f0000000140)={0x0, 0x0, 0x4}, 0x0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit_group(0x0)
exit(0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
readlinkat(0xffffffffffffffff, &(0x7f00000002c0)='./file0\x00', &(0x7f0000000340)=""/248, 0xf8)
exit(0x0)
r0 = socket$nl_route(0x10, 0x3, 0x0)
sendmmsg$inet6(r0, &(0x7f0000004100)=[{{0x0, 0x0, 0x0}}, {{&(0x7f0000001480)={0xa, 0x0, 0x0, @loopback}, 0x1c, 0x0}}], 0x2, 0x0)
r0 = openat$tun(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
ioctl$TUNGETIFF(r0, 0x800454d2, &(0x7f00000000c0)={'macvlan1\x00'})
syz_emit_ethernet(0x22, &(0x7f0000000580)={@remote, @random='\x00\x00 \x00\x00@', @void, {@ipv4={0x800, @generic={{0x5, 0x4, 0x0, 0x0, 0x14, 0x0, 0x0, 0x0, 0x2, 0x0, @dev, @multicast1}}}}}, 0x0)
mremap(&(0x7f0000ffb000/0x3000)=nil, 0x3000, 0x1000, 0x3, &(0x7f0000ffe000/0x1000)=nil)
r0 = shmget$private(0x0, 0x2000, 0x0, &(0x7f0000ffb000/0x2000)=nil)
shmat(r0, &(0x7f0000ffb000/0x2000)=nil, 0x0)
mremap(&(0x7f0000ffb000/0x1000)=nil, 0x1000, 0x2000, 0x3, &(0x7f0000ffe000/0x2000)=nil)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
semget(0x0, 0x0, 0x0)
tkill(r0, 0x25)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
setsockopt$inet_buf(r0, 0x0, 0x40, &(0x7f00000000c0)="453c141d3a421aa49db50f9ba71007c9a2b36c78504ed695177660e5ee6db4c50938e538740277d27dbffc7cd26668f394790b653a845d3614893ae7bdcd0b7571cea390f5dbfd051804adba482917d1146132c2e7f0d0d63e936ca2880df2b1", 0x60)
r1 = gettid()
tkill(r1, 0x18)
r0 = socket(0x11, 0x3, 0x8)
getpeername$netlink(r0, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f00000008c0)='/proc/self/exe\x00', 0x0, 0x0)
setreuid(0xee00, 0xee00)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x2012, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
getsockopt$inet_int(r0, 0x0, 0x1, 0x0, &(0x7f0000000180))
r0 = socket$netlink(0x10, 0x3, 0x0)
recvmmsg(r0, &(0x7f0000000c00)=[{{0x0, 0x0, 0x0}}, {{0x0, 0x0, &(0x7f0000000480)=[{&(0x7f00000000c0)=""/248, 0xf8}], 0x1}}], 0x2, 0x0, &(0x7f0000003d80)={0x0, 0x3938700})
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
newfstatat(0xffffffffffffff9c, 0x0, 0x0, 0x2000)
r0 = memfd_create(&(0x7f0000000280)='-B\xd5NI\xc5j\xbappp\xf0\b\x84\xa2m\x00:)\x00\xbb\x8d\xac\xacva}knh#\b\x00\x00\x00\xc0:\x9cc\x10d\xee\xa9\x8bCc\xad\x89\x9ck\xde\xc5\xe9\xaa\x9b\xc3\x14\xd2\xd1y\x1f\x9e\x856\xddU\xa9=\xcdJx\xaa\x8f~\xb90a\xa9\xb2\x04K\x98\x93?\x88Q\xf7\xd6\x1d\xa1\xce\x8b\x19\xea\xef\xe3\xab\xb6\xa5$4\xd6\xfe7\x0f\xe7\xd9$\xce\x00\x00\x00\x00\xc9\xad\xd3g@\xe1\'s\x0e\x90\xf2\xcdr\xb8(\xb8\xd9\xa3\xc4p\xf4\\>A\x11U\x99\x8d\xa3\x86\xb7\x1d\x87j\xd3\xc4\xdf\x13/\x97Yy\x8b{\x1df\x8d/\x90\xd3<\xf8\x18\xa4\x88\xcf\x048\xb4\xbe\x00\x00\xb7\xd6\xa5&);\x1br\xd2\xa4\xba\x8b\xa7\x15\xbe\x95\xeb\x1bB\xacoyP\xbb\x1c\xb9S-\xe0oK\xac\x00;S\x8a\x01\xd2\xca\xa3\x1c]<\x04\xaf\x04\x9a\x9d\x84\xa5\x94J>F\xc5V\xc6\xfa\x8e\v\xe1\x82\x03`\xf8\xca\xf4\x89\r^Z44\x91\xeb\xf4$\xf3\x1d\xd5\xbd\xb6ZZ\xd8\xfdS\r\x98\x06/\x9a%m\xcf\xab u\xa6Fw\xde\xb4?\r\xbdK\xfb\xf2\x13\xb3\xfa\x00\xaaP\xc9t\x7f;A Y\x84\x17\x14\xa8\xb5\x0f\xc3i\x9a\x87W\x90h.\x8b\xf5\xf9\xc1\xf04\x9a\xf9DB|L\xbc^n\xd5\x85\xd7\xaf-}\xce\x0e\xcc{\xb1\x9d_\xb2BmU\xc2\xad2q\xd5t&v\x89O\xf0+Q?\xf5\x1eV\x8d[\x98\x11\f#\x13\xc7\xd9\x92\xcc\xf7\xfb\xd3\bGy\x98\x1b\xe7\x86i\xe1.\x1f\x9e\x8cPFYi\x94\x13\xddm\x9c\xbfV\xe7^@\xe0\xa3\xa5(\f\x18>94\xedZ\xa7\xe4\xb2\xb6.\bY\xa9\xff\xbb', 0x0)
pwrite64(r0, &(0x7f00000006c0)='/', 0x1, 0xffffffff)
semctl$SETALL(0xffffffffffffffff, 0x0, 0x11, &(0x7f0000000000))
r0 = semget(0x0, 0x4, 0x40)
r1 = semget$private(0x0, 0x0, 0x4)
semctl$SETVAL(r1, 0x2, 0x10, &(0x7f0000000140)=0x80)
r2 = semget$private(0x0, 0x3, 0x0)
semctl$SETVAL(r2, 0x1, 0x10, &(0x7f0000000100)=0x2)
semctl$SEM_STAT_ANY(r0, 0x2, 0x14, &(0x7f0000000000)=""/45)
semctl$GETVAL(r0, 0x1, 0xc, &(0x7f0000000040)=""/179)
chdir(0x0)
setrlimit(0x1, &(0x7f0000000100)={0xffffffffffffffff, 0xffffffffffffffff})
r0 = open(&(0x7f00000000c0)='./bus\x00', 0x141042, 0x0)
pwrite64(r0, &(0x7f0000000080)="9c", 0xfdf6, 0xfffffffefff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
unshare(0x4000400)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
fchdir(r1)
clone(0xc0006300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800013, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = getpid()
rt_tgsigqueueinfo(r1, r1, 0x4000000000000016, &(0x7f0000000640))
ptrace(0x4206, r1)
r2 = getpid()
wait4(r2, 0x0, 0x0, 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
setsockopt$inet6_tcp_int(r0, 0x6, 0x24, &(0x7f0000000080), 0x4)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f00000000c0)='uid_map\x00')
r1 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='gid_map\x00')
pwrite64(r1, 0x0, 0x0, 0x100000001)
write$tcp_mem(r0, &(0x7f0000000000)={0x0, 0x20, 0xe26, 0x20, 0xffffffff}, 0x48)
exit(0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
pipe(&(0x7f0000000040)={<r2=>0xffffffffffffffff, <r3=>0xffffffffffffffff})
splice(r0, 0x0, r3, 0x0, 0x8ec1, 0x0)
readv(r2, &(0x7f0000000580)=[{&(0x7f0000000100)=""/102, 0x66}], 0x1)
r4 = syz_open_procfs(0x0, &(0x7f0000000080)='net/anycast6\x00')
dup3(r4, r3, 0x0)
pipe(&(0x7f00000000c0)={0xffffffffffffffff, <r5=>0xffffffffffffffff})
splice(r2, 0x0, r5, 0x0, 0x7, 0x0)
write$binfmt_elf64(r1, 0x0, 0x0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = eventfd2(0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0, 0x12, r2, 0x0)
write$eventfd(r0, &(0x7f0000000000), 0x8)
exit_group(0x0)
capget(0x0, &(0x7f0000000280))
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = syz_open_pts(r0, 0x0)
fchown(r1, 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0xfbe000)=nil, 0xfbe000, 0x7, 0x31, 0xffffffffffffffff, 0x0)
set_mempolicy(0x0, &(0x7f00000000c0), 0xc2)
r0 = socket$inet6(0xa, 0x400000000001, 0x0)
close(r0)
r1 = socket$inet6(0xa, 0x801, 0x0)
setsockopt$sock_int(r1, 0x1, 0x4000000000000002, &(0x7f00000001c0)=0xfc, 0x4)
bind$inet6(r1, &(0x7f0000000140)={0xa, 0x4e20}, 0x1c)
sendto$inet6(r1, 0x0, 0x0, 0xfffffeffffffffbe, &(0x7f0000000040)={0xa, 0x4e20, 0x0, @loopback}, 0x1c)
r2 = openat(0xffffffffffffffff, &(0x7f0000000180)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r3 = open(&(0x7f0000000100)='./file0\x00', 0x143042, 0x0)
ftruncate(r3, 0x2007fff)
sendfile(r0, r3, 0x0, 0x209000)
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
read$FUSE(0xffffffffffffffff, 0x0, 0x0)
rt_sigreturn()
r0 = epoll_create(0x4)
r1 = socket$packet(0x11, 0x3, 0x300)
epoll_ctl$EPOLL_CTL_MOD(r0, 0x3, r1, &(0x7f0000000000))
r0 = semget$private(0x0, 0x20000000102, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x180000f, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
semctl$GETALL(r0, 0x0, 0xd, &(0x7f0000000080)=""/143)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
shmget(0x1, 0x1000, 0x1f303283734672f6, &(0x7f0000b76000/0x1000)=nil)
shmat(0x0, &(0x7f0000ffe000/0x2000)=nil, 0x0)
shmget(0x1, 0x1000, 0x1f303283734672f6, &(0x7f0000b76000/0x1000)=nil)
shmat(0x0, &(0x7f0000ffe000/0x2000)=nil, 0x0)
shmctl$IPC_RMID(0x0, 0x0)
shmctl$IPC_RMID(0x0, 0x0)
syz_emit_ethernet(0x3a, &(0x7f00000001c0)={@local, @empty, @void, {@ipv4={0x800, @tcp={{0x6, 0x4, 0x0, 0x0, 0x2c, 0x0, 0x0, 0x0, 0x6, 0x0, @empty, @local, {[@ra={0x94, 0x4, 0x1}]}}, {{0x0, 0x0, 0x41424344, 0x41424344, 0x0, 0x0, 0x5}}}}}}, 0x0)
setrlimit(0x7, &(0x7f0000000000))
epoll_create(0x8)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000003d80)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff})
recvmmsg(r2, &(0x7f0000004f80)=[{{0x0, 0x0, &(0x7f00000002c0)=[{&(0x7f00000000c0)=""/214, 0xd6}], 0x1, &(0x7f0000000300)=""/28, 0x1c}}], 0x1, 0x0, &(0x7f0000003d40)={0x77359400})
gettid()
r3 = syz_open_procfs$namespace(0x0, &(0x7f00000005c0)='ns/user\x00')
sendmsg$unix(r1, &(0x7f00000006c0)={0x0, 0x0, &(0x7f0000000580)=[{&(0x7f0000000400)="19", 0x1}], 0x1, &(0x7f0000000000)=[@rights={{0x18, 0x1, 0x1, [r0, r3]}}, @rights={{0x14, 0x1, 0x1, [r0]}}, @rights={{0x14, 0x1, 0x1, [r0]}}], 0x48}, 0x0)
clone(0x4000c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
creat(0x0, 0x0)
clone(0x247ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = syz_open_procfs(0x0, &(0x7f0000000040)='task\x00')
exit(0x0)
fallocate(0xffffffffffffffff, 0x0, 0x0, 0x0)
fcntl$lock(r1, 0x7, &(0x7f0000000080)={0x0, 0x2})
tkill(r0, 0x18)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='/proc/self/exe\x00', 0x0, 0x0)
lseek(r0, 0x409ec4e2, 0x3)
rt_sigreturn()
exit_group(0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
io_setup(0x0, &(0x7f0000000000)=<r0=>0x0)
io_destroy(r0)
r1 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffc000/0x3000)=nil)
shmat(r1, &(0x7f0000ff6000/0xa000)=nil, 0x0)
io_destroy(r0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f00000002c0)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x0, 0x0)
chdir(&(0x7f0000000200)='./file0\x00')
link(&(0x7f0000000080)='./bus\x00', &(0x7f0000000040)='./bus\x00')
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
rt_tgsigqueueinfo(r0, r0, 0x13, &(0x7f0000000640))
ptrace(0x4206, r0)
ptrace(0x4208, r0)
ptrace$cont(0xffffffffffffffff, r0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f000032d000/0x2000)=nil, 0x2000, 0x0, 0x12, r1, 0x0)
mmap(&(0x7f0000307000/0x4000)=nil, 0x4000, 0x0, 0x11, r1, 0x0)
mremap(&(0x7f0000a96000/0x1000)=nil, 0x1000, 0x800000, 0x3, &(0x7f0000130000/0x800000)=nil)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000001240)={<r0=>0xffffffffffffffff})
getsockopt$sock_timeval(r0, 0x1, 0x14, 0x0, &(0x7f00000012c0))
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f00000001c0), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x100000000000008d, 0x4, 0x0)
fstatfs(r0, &(0x7f0000000080)=""/84)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000080)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
sendmmsg(r0, &(0x7f00000001c0)=[{{0x0, 0x0, 0x0}}, {{0x0, 0x0, 0x0, 0x0, 0x0, 0x20}}], 0x2, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000140)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = dup2(r1, r0)
r3 = openat(0xffffffffffffff9c, &(0x7f0000002600)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x300000e, 0x12, r3, 0x77d51000)
write$FUSE_IOCTL(r2, &(0x7f0000000440)={0x20}, 0x20)
r4 = gettid()
r5 = getpid()
tgkill(r5, r4, 0x2b)
r0 = socket$netlink(0x10, 0x3, 0x0)
sendmsg$netlink(r0, &(0x7f0000000280)={0x0, 0x0, &(0x7f0000000200)=[{&(0x7f0000002180)={0x24, 0x12, 0x1, 0x0, 0x0, "", [@typed={0x14, 0x0, 0x0, 0x0, @ipv6=@initdev={0xfe, 0x88, '\x00', 0x0, 0x0}}]}, 0x24}], 0x1}, 0x0)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000480), 0x1, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ioctl$TIOCSETD(r0, 0x5423, &(0x7f0000000080)=0xd)
writev(r0, &(0x7f00000082c0)=[{&(0x7f0000005180)="f58aa3f2ccd98b95f21e7eab655ea634d52cc72093a8e2b2cba40a3c6c93ff86b49c60c43c99d4413bb9f0fb1afed0a2132af04770048ac5dea153d998e7d875cace850b19644b4a00b77c0ba27b541a7454a39835c842c4827b9b2b506888e8c96453b1d0920819f3b0989f81b923b8e924dec343cee1451dd87d2d964703daa79b58e76ca4dbadb11289619a417bf1b7219f1589eb623eb3bf41d7f04f9c313e245df50bb51923f56e28c7ce4ffa5c0c36d6ef613290c5b8c4c20dac41caaf4617e117214b8d5a85f621762fb6fd6d1195bd9349caef99f3f013dc313d1171a90a21c03984587b6c8459dd0a50f3c60d54fd44ba7ed768687badb779698949aafe59130d833c1f4845b7b0e8c6f748240737dce619e3846f0a9559aae52bcb42b3cf47432c9b525ebd282ea893c00a033316a59b818ff9c0c3525600afd4fa4aee3efb6320fa765686ef0bd6356ae903f7661cf3b1a296e548713e4a67a5dcd6d6f7a0f943c8219084378b8117fb42b8afa49e61a29d03861b01d3fc2d5381ed02bac92a2904b7dfd9874ef44911de582d244858db208ddf232f96e43125b6ee44dfca2c906c8526db102fd16dda1ce1800dbd9e0d7dd4f35a4f74b27d26a2e4078ece6c8ced7bbd4bb5ea6a2beb18783f7b52c2f7165ca1084840342d3c4270c2c2b9b21fb1c8dc6968fcce6a66de8d0c1f7db56e7a2ce502db97bb72f0c8abdd669713a41a8c1c953fe12e701d05e722038554dff2f49371e8ac4e3ff4ab5ae3cf80145baf6dd5a4389bb46741fe0638987ec00051697697315d4c05234b1a4d0f1df9460dd6fc01d5b61566361090e356a84b5b4b6c6fef879b238b045f73a6e932f67f4794b463f3d79cf19b7b099df68a49dbf693c69bc1a41f809db0de1874e13836e3d8bd76171bd2ccd067abdd492aceb5efe25933b31f35ccbfdcacdd393dada32620455b2fd112394c990a92dd14936e55ef01c10660985955a3813432dd50a62770848cfad02dd63610c6c3cbd19c4dd20a543a794b326e0593b3d026e8f79ea4c76f2a69e281b01ef36410d74ca5155e65871d41584c6b9ba402eb022a33cf92709b0dc9aae6b5cffdfe82ccc8caa46ad250dea3eb23f91f631cc93a25d0e23a77eab5e818634cad05401776194a03f5a35de9816ad8c6d8eb0927f8d203187cbeeecf743bae3fcb31a01891c99cd0af7552a9b94f1f9eafe00eeaf6632c887b94b227a070611bd42867ab83d04cd903b69e4abd9dbdc8bcdd2ed8e439b7e42d5550eec96a37e08ba0e3f4be0f63c730a49b0fcc18cba6c61b55ead9ad4bc92c4568493f8a55a0b6c2d313fa2cee4f390b68b69a7938b715226f0233d6585b90e574384d8777949c0e5226d39480edaaad8c865ff6fd81572312808cf6fba105b1e4db886d95444c5dc1d7197fc3172b84eb9ec14e14a6660d3843da03dc129a8440974d12c526877c250fa8a038b0a385cbea817c270d57b4058d503954b71f271e06c90d160f1198a3981cfd1ce63edb7f32f51c459f8e1dc803d12938fcccd10cc6e54fd43fa466f729a5ae407e4561c0c96630ae9f07770e06b0a56eb65ccc0b918ae7bf3ca49703126f4b421e7c0f069ad4eeb056c62e0c372c972cfc47d85bd6e57105ffc7c216c657a53a084ea4ec824c156f5b2ba5eef312b756709d2ca671c6e17e02b757feac93250996359c35f0e4631559002969bd46f0ab35e3ee6f5c8e376ad4321ebf668c7cb24fee568135dc6c0815447b27c6070bcb64c66723a3a231c3cee903015cab5babea835ee597fabe3a5421f83bfa14c622e93870e01b70d387604071a015586e7231019b4646f8e66c1bc909b158edf9475f202811c5baaf525d1e1e77d05eed8726c173619851fc7b1ca3c569ca80139bc1aead06e91577cc6492f32d9f4f763e966153ef4e1e31bb6fe4a751f953d87ed3eee06f8a734de57e9d06f9dbcf7a49d889adaaaf7daee0dea0c0603e373f2d7d9666c4eeb3373b5d5c07458250769b219fa4c1dd40be37ddead0e90e985abeb7c73c7361befb553a8ccbbe7fbcae8806e6f0d55fc0e5802ee3c642f510bfad842be7c3d561467ee98267d5ed8e92c9afec82382cfa4bff27b260f7a49f4461e0a43f454a9f144d93b6286fe5d66ad2fb086177efbb6dd88ae93eeeda0af624d72feca724214f5ef017f50c362703155cf5a1277b95a98b9c82af73d3ecb1c9021776d493ce707a09291c2f84ec6de7f10cadae70bcb830aa80319f04c65a8e8dd8a64eea06e641c87a649c8249442bc646994cc837291d3371cb3708a32c89d91ba1eb3c61e45843ec09b6fa9a8c892158b9caf0079b5b3d32a393491524316800a8b373bd94bef27744587c3b53266c581599b90af8d6ca7a240a16ac897367f2ac6aa72d15fddc43e1f98fbdd9d803198806da916c9b5db19ad26e252b65e5d2f28160477f2224c5e6d8b24053de2a94318297c3119c45188e9e0184ba8f108848591e9eff4ac7e9c8274e49da4daf8491c192600e707939a41b829d2ddb7027aed33dd13a6f8fe2e8eb81ff138d5a7365dd8c8737a95e1a80b35226d238f94a64fdb6b2007dbdbb4216004f9e01b85998abb100d686be8450e3378fb7d1ad59305050a27f861aae818d4b5f724412ebbd3d49a9403b62c0b65fb1486c2d633726b5bc9cbf895da512e08f76155d8a7461e8e322ceff214c9e62a4f4e1ecaa4a6a7f84a059f8795ca973f2379c4bf3dc6240057039b061b7e00ed990d615d4195fffae30d75c69bda8c4598deb20d4bbe1fa89d9e884e18395db4d673badc7e388127a2877ea3963aa76a2e86e04ef927da1fe0f275ec2e526250ff26dd07491581f35cffe6ef94d5d11ef7820db2eb54a269adcd6dc3598d3d6c0a9089114ecf8f37f8eb648fc45f724d8312534c161fd0e94225269535524ddfb41ad98b842373a3ed206212bc6ef6d41e5f081257b8391f702b2272f1eb2e6b544b17fa6f79ee34ac64d6a3ce6f436b9a262b2da0fd0200cbe7a0a70599c5c9841844a74ff93e25f9807315258cdec86b45dc823045fa14a66275c0c0d18e5cd413f17750149f4b6549dd7293e7c224e12dbd3f857f8da3c3dd10479aafbad3a4cc52a239a2a4c46ecb1012bc65362581dbb9d5b6e662737d84e0bce34f4b1678516c43b76a61e5c6d312b71eb3a8f75daf15d0581ad791a1583b41ddcf7f48bc61ad6049746f15e93cd1ad5073da3616318d9298a2a122a2ea5c8e91e5774be79eb88cdda20f153afdab320169262949a820baa9cf3d285ec2913fa9717654903970b07b3e25d551344772e392dd61d4f20d72ce9c4cffe4f6f4b39ba65f0f2fb74010a6dd4a130f74c7911f74501308f91610337b931445936c6a3e00176d4e77dd3da88e20ef2472e0aec59441e5baccedfbed9dd1f699742664cd30b1f1258ed5613880216534ae4e4174e2319356b42f8700eacfaf58dda5b5b7921b3b206e4892d29a8497dbcc3ebb2fb38460dac91d816e1d774f5872713c1e12505aac44b1fd5822322c3f662acb0e01584277ce3a1399c5e8a412a6c53780f4ddf8357c69ca76f013219730efd10c750ea2b28110eb366d84b6197432ed1b61616343779ecf3888ad943018a0f8691e2b102a999c043afd5a9cb3c34bc36b6f2dfcacf474aee33af6dfbfcd76377a40dba4eb707869d8455e16d7d72075c555f3dc88b1f95698509fa9efdbe689997cb546813d6a2b754936e7e06d48211e4ce06aa3821a64dfc2b38bffaa2320b9a05d9a8fa1b3f34d897763044b0815f1036405f08448721f6141bf170b86d2d3b404c0403da12d2971afa1806711303ca35d95de6ebe8ef0ef1f7c043f8174150b0db5615b9be98fe49b507f7c634444f57d08892f706d8a9c3370dfdb9796b8fcdabb9088c6e2434236e88eb526ebac8c6e2c5679359a2829115dfde029592f4329cf15f2abff3a04a3d6feb7788a80c758a37f05235a8ddb31f3712986ff425186e33efd5d45b59b0c1b1920d59437a424937196ed437eacd3217fd68a5a8714668babc2d4dcb9cadc9837fb1f43f6e989980698e0c1ee375b3743a97480f9af8c0a1e06548209397332844c6a786f10c89cd549ba47dd746e71d59b9f499a34762a02d43cc4240ceb025e80d9cc923004edd3f0522f31736ec8a75bcdfbb8de7548f80e07171e6c5136b60bcf227922c33477904e480a6384af1657f820b49cb6569ef47ee20a8403650f6099bf743c784c2d3e993477b0c3bfc3501b28d886db37c5771b5e0b5c92dce4f64054c76aee004a75fa0f6274b77b86cd06abcdde2fb4e085f04674f7d86a2ba395e0fb3a8e4a2d5e858e89ce9c7f933d62a84e0538237c8c969168052a9a65f265a501a2ac2327e0faa577f7c73b7143ebd3a58fe6ac51bbaa87095750e5d8451e0b5f650cabf8058d4bccccec73d917cc2bc8f58c3c292784173f62fd64bb86868fe25b12cf81acd58d1aa6ddb35b3534c3868a30f336714df53538111245115f0ab8b831132e87732fff2aecaacfd4c8440d7de4f06e4c6a2a748f9a369fd7d18341acbafdfaf1549f67ae064102ac95589f3e795471d1cbb391415b751c5e79167da4f769e8a9694ea4be1ff30f2281536fc1d6f7fde131e6bdef23d7fadb661c3c4c9cdbcc1de92abf4d0bb9d86bc91418dc3b3e4dfba31371021637e2a80d334f2b73fe139f14f1359463dc99ad17baec08efefa5b9381563152f0bfbf671a79dcb1db152ecea112dab85fa65fd892e91ec6e4cbbfed33df7aed055f0a82f886003bc4d259755d2a01ab54ed5f1ae3d5e7924129ea3f4c7507093db2b41a25ce311a8fb415c7201f261566e9e3285d8e1693e51ccdef29b2816b1f6d420c96c77bb2da28a13b4cc92185980f8cf7be62f1f33dce5f3543ceacbb31495ef28bf3d9715eb895f1e4861cfb6edd01458cb32ddb6c9045f735dd8eae08a8d6d14f46967cb5eae2c299937ffc32100aba8035ae3df0a3e256d8d5b9fc06e83e8ff239374bfaaf84010635a2ca7db23c8c71eeab19584d9d324f7871073c1c5a78e4246dc10a2ce9c43bd7705b13c8fdad8c1cff58202f292811f753909633d336c9be754551d2e3efdafc626d4f25c172f5a3e68f2e3387e4af56dc0786ae0b65250998ffca8c2a60e45a2af2a5722f743fd668fc89cc315dddaca7884e0e26be2a977ecc3517124228e85ecf037088de9e0522b452587d7edda172cfbcc50f3e7878b9d9c6642b7846a82ef52233acff47efc5e678f6e349ace13e2fe21d606dadb4b2bbb2be2634f10f9c73f9c3c620ae1b4b9d33f031d664dbb5f34727466944b16459d3875e9dc8df4a69e4bbfc230ab8e9a5b01cffc115ec1491208426fb650cb9c070d81b1456cb8484a90334c3d0ca178651df7527a84e2f45347a3054c08cba5bf90763e628ca59d51866e5af02d70c03082848a5fd50ed28acd9af8f14676b76f6aac5be135dd852c0895ce96dc6dbe5c430e5015bb9ef9fecfcdc2236ba6cb44cef11198fa42797a50d6a9ca6b3cc4ce4aa465b97f57c8e70d0f202a8318ea8dec3bd817037f3e551e8dfa26cea267cc691aa93985a3b4e8ec3d8a3577ab6fdc980271f91b759d1761c9ea128973e9da485515f58c19534542cb5a586482b89b8d16c4174ae8d931d423744429447f4e59dfe72a9f919da26aeddebefdf36db5148b851f68bf11cdc2b4fa3f85a454976a17787504a4a69d726c893ea866bc4cd51584b8b3672ada1ad7ce45d57ff", 0xfef}, {&(0x7f0000006180)="5aaec54a9e2235c0bd08ed6827eb3eff98c4b779617eef3835f69854c1edebeae2387d4c2038b79430c4c3a45c19d9019e93cd5adf3aff184f23a411d6944831eb37fd846c5229894e11e1d3c501e921619200ab64b3ff864fc11474d678cd6e8233085090566e7d7f1ea889aed707ebcf78c3c78918251fa861e22f774faa808590fac78a745036ce473e0ae4305064d6756dbfe07f8181c4aac197082acfe535ba83b43ba683a1f57bee44b455ea2f49ed1c36dba1430061dbc0617c0bf7df265b07", 0xc3}, {&(0x7f0000000400)="5bade2ec69c81bb1340bada11abe307f3f6d9c83260f82b06ce3bbb19b042dc96655ca021a29991012b2b5b404ff6e7f7680799428a3db02476e6770bd353664b891c5689f046f6c473caf4adbc8780e215d1d7a1242eee03e04deecacffcf4d0e6da45fb2fa3c7d", 0x68}, {&(0x7f0000006280)="35ee5769175103e36ec2e9193b2b63b5096fda3a6643b8e6827d4adbcc3ccccd64d04a2cecd6c4ca98af8926cd177dfec1726bd22030fd6906fd92e68a1e80599dc5ff2d51158813750e8c6e3dc7f3a835abdc3f7f9ab48651736c89b3f256d4426af47a9d0464edea1e2777760d7bb2712a7d2a3611250b9985492609962278136ba080e00188c481e66d4b013d280db68a7881965657e02452e4f6a80f5c085aef584d4d3ad1ca818d9260594fd991ff01727fc5b3939a5228ad58ad3924b527f20fb2ba5c839802465cbee004f84fb327adf9885a9c2e567f188311dd99b71fc2b4c1278095bc150c975fb4b5bddd26d22c12c378f884e60dfdc859a50c563638383641fbaf1a214410513c7412e44294d15af1cd1f0bd92d789254b16a4985641e9f840465eea640853843f3280acdfd9402a54d54b303cea6227b8f0568e8a4abe81e4069b4fd36c2558be638595a3c59cc873150872cc4313180135b3cd920e2dac3ebecdd109f844e5d79f440f6a4fdf52601b7647439df3cec4d864921f35e857bc767938918a4f1b0b86e2a64e3e464476a19ade329474f7b798dd28e3c1a724821cba716f29923008d40ad6d3165c50d2db9534b453853f86279a87634bbf5340159163b47a49b9c628f0ee570e02be895fec79963221d205c10ef2fda83b5c1a5a54bbbd2702a169a11687eb5b51d1cd5638126dd284993bf635ff8e402ab5130157cabf690ad343d60bdf13a65019c97c6b2344bdd2064ee29c7e9420351d17146b3347c097adac319db07bb9a825816cace8dabaff27f7fa273e69575a323026bcfc645bc86607f6ef215e873f8f78ce5c867f87340d9a01b6d47b2a5711e8b14be4915de6c5ce3c865acb45b491d7bd16f5b5a7b508b2946581041d1e9cbeacc5014fd60ad54694c48bc8cc57eb578306e911bff2e1030110bef6310357acf7e8fbb709884176e21c27e6b7b9a0dfca71c1a837c4d165761831762ab02c9721ba37173bdaf607415ead3f45ff207590df29b856eae31bfdaa37985f1c12c70597fad06af245b486ac67aa75c190c1985c99da0c9193973a36b58d7030257ee9fe40fb83e82986c5162616b63c85fe4cb67a5584e3de659f288b000ed1f5e202fc8f89109be255c3eddee973b8bab6e9f9d334c001b57737a399dbc7cc202edd2dfda250a63a83d5dc84513a8381b1b8751dcc30e152b6cd5a42a4d1eb5f4a7a6070d7849afb4eb855e02180d973e3d97edecc1a8321ec72d4ecaca53df4d836a78afe192be44b4341456a5437637209ecf1eba1fdb3d5ad234ec6775e4bf1e2c2ea554708b15488a6c4749309866ca56ab54eca1189772abfbe360453e85fadab8bc9fd1d9c60a9757281557d10481d78fdd51afa9fce7bd36dd317adab96eed960b3f771f67a0ea3e865ee9382c461d3da9f4562a9de91bf903951aa43ad8c09bbc37c3caa67a21109cbd9c10301041eb2a470c36e51215c7971c6198cab8fb06af4af3eaa07667d7f8902cd5b55ce8c82b1baf2a1a354c6eb8714cfd3151b95b70a47616b569dcf9ef53d99897666ef8bd036f9d230b9d55ad48bef969309832476c7143528c2ac6df904fbb4cba59db52eb98ae9edeb26fa0056ade0885f1cb6af53c83f637a23409644cabf61851f8235fd3123de6271ba9cb486679a2f06515da407f5aab4ce71d60606d70a5e794261894492c30b6d21fedba4e247b889fc13259fefb60d857af3ea963ba5769351237f05d030d63a898f9fe23bcd0d1eb8c3dff0313492f79c3a3b02f52eadd7eb10a6e874f79abd9a297936308aad6d8ad41796e99bfab99647e2f27a1e2f5db1f3884d92107fe387248c066834bb4b25ee8f76daafafde8bcf60f68f1d7ab968c1df925aa7b19e15dea5159460fdc95cfda3f0b7285fcc615148b5916b61a57fb39bdb08a58d434d0f2563086364b7eb1fb85080a746efec011111dde34983d8a95aa589c4514dac7ccb0addc7c82526d5ca9bd0addbca106dc91154ca82224176a1194f6707140f086264f98c8f72c1e2ec27b4f9baf6d2e5104648200f4cd2824f06cec2f5077343c15541417acf50c70c84935b18fb8ea2c84f98cbb6875616a6477c529aea222021ef7e624cf1916a4f634113721b94ddc6f35445aa0efa5ac183c5de57e1be139876f4e7f1982d3229583cfdae5f322d22ffa2e5e552ac54f5a11e044ba8ef9aabdb4217b9ab17af0114be3b95f264eca19c4e75ada91040e92f4546731db1936d0c8d7de6db6d6033c60c594b99f3f47227da0634c858a2605dcf48e7eff5c30b9c631d752d68ebf04632bab6971f16d038641c82176c5bd316a1b203c051a21a9dbd6d93b47877e804bce041d0d6bf3a2650ac445ae6d1d86c4f3564b751353dac266060392c88e81c6fdd26a6093ca1fbeb607c1b42f8dcce687edf05db6ed4b6d941a333b3a9a6742f817f6300883f8620f6ef3fee0a51e6628e8212fbe550b8d7e7e87d36b9ded5aaf675eb8076eae1410371cf378e9b7341b3953cb859542e9d834bfb5c9e62c72e8aacd9b7b41a6f8153613409b676bc1b486ac04e1dd215376b7fe175c1f4d5671b99c69cda30e770b8bb0ade96b4e15594761f81f4cedd694b84567ce0312a53d83e1f57ec96623b2b45d1ec1458b8cd9716e5c37d3ab2a28763cfac604949e9b9f27e37e8b575aa926b2710f1398256b29e4dcb9b1d387d1f57a805ebd2adb651808f9fb298a9ab28170c76480a32bc6e8e67af7bc4de875e65a1488ac28f949ef1d608a9eacf079815dffcc863e0b1ed2073a53eafaa6f5018572afb495d75bf5ebb16ec70b29a22d57bdd1e4f204880cf1bdfdb831d4df91c0615a8a622943e489270c506a5835a1a29e3d7c31e3cb20a84f3f2ebe47dc07633a213b6507cfcba7811d89fac5eacc01e8d9ba5218a2392202d79196ba10c12c20b063b1a173433f973166d6ae0fdf2c6081c196127f205bbe6ceb9bf49cdba5c652e0a0604d7a00aed48e82323034a8eb13e73f708bbb9fc22e0639b5a2cc37304057c95b66be65912f66ced9897ea0024a211575e2552e218e3b4079d544d82d62215e7cdeca4f32a6d376bae1ac877e8ef0b17f4397f4222fccf26178203847cba1e169bf808825c2193aed4bc26be53a8d891d87c8ee79914364dec8cc58ecbed0e969b68843f9a857703c6e17774b30e01fb0f797fb06d217a3e102dd74df3c5230c108a5195cb19f1e3b99243f2e7ee04e55808f5a9c7ae613d15e39346b2cbe422eb536813b195eec744e9ebb93c8ee21edf4c0285d6f68f78fdf1d083b8241fd4b2f99692a85cff4a62903c00cc5cfcad9899473fdec3cf3552c96b297fd6e1c467addd9622c1a00ab2cb8cf1db00914bb99a046854417785a66f0ea72d23f59281a59571b04d4fe4fecb60cd298ac197ee113c3466e50ba44908895fd219777872e7ebffbc59cb24c0586f1c299b136b7c340c26e7f94a90a7327135682a096ef03248c72eb96efa515d63ff98c562dbada2edcbff96d44636388bc32e9ce2c25bdaeac6c05e89d0d047a3e67ff57f654939a64fdab7bde4231e897d6095abc84c9ff91f7305cb2566373a5d960df7df6a9ee201019ff889322f438c8dba7da98449df0e43b26d68d53bc3900f16bca6100bc71d4261404e175d4106b475dc57de2df7196247a9199bc37dbda2075458557215767e2b7a4d337ea641b9628f80ebe13bc7f4e579429f6cf35f0132f51e767222c1991ea61ddb390f3f1591610e4874022630f3edc622768d8c42c2558fc613d5b2ead9b8f0f1018d6bc36494e8cc120aaf5ad45ffca23bce816beb7aaf6a78a3c2ce761914e27aa4d43a239118332c4ee97fd01c6da3f1c12263dc21022aaa2e7ba79254c92714d06805055b194793fbf0b26c847cee1048a0753ff430f7bebcf3aa745627351dca8a1895dd3d51fe2cd0ad12136bfeeb86726c7badaff1bbd21c1c322be2b769bba707a41493d763114e32a54ecd4558ed9735665c46160331781279f049e5aaba8f7d1dbd580e944b5192b965073f8ed20404c660ebc15dee644aec85b94773017172c50364d2b717e090700f23ef77719e3fce57d799d18b036666129f3632a3a62ae555e071ee85e46777d5eb07b6f23bbdf7a0305a4b09e91cb06cdbc7f30a45b0529708795cb67e1fea141b912d1259c8c0023d821351e9732a19a2b60d654e685dafe25d481ed6946e70dbd07f7c7fd4db7f5ab24784a32ae755c2dca298ee56697ea2b7b01beeb8f787438497d1f9f2d69d493a73d862e5ce00dd7567701ab9ecb86728d3ae794d65a9167f9439f4152c0159715e2514c9f7b2baae6010eeac65daadd23a8300d57b3c8d401b59446c7a8fb236b53f10e506fb7505c12c1c342838a810f0ca01c6ac5ad09199813a74fc830681b4d42609409d9815b90c3299b08ebd8bba6d516a1601ebf071c0977f7aa47df3c4c8298088f9315323040f48ad67dc2168dfdeb5d3def7caf9c2b2cce79b75fc48fbd845a7c5fd4848bda6b19a4f8e6e41547c30b5da089f45a0ac4d04bafc8a9ed324a98a37479a377ecbcc4840fe1ebdeb56ceec7ca91fb96c66716d6c26648019a7e53a8c5c04dfae820145e9b7cec46dc5a718bd78495a3eebb0abe934c68d7306c871f878551203a55e87bf695b3523234745baa30943f8ec06349c12efcca67cff667a9b9114687417a6343475c04794f4e65556bc6f9284cacd78055800b63a933a34639210836e59c793c75632b5e17d6983e9955e71915c28592056ddf474861cc38ffe6c2399c05239e4f4be9581381cac9c73ac3b68be475070795dd6798395f8b955ac18d84b840e7be11512fadecf563e9a30ba3975a4e37ddf62e0f248b62db49275794fd08a3f9a0a47aa8e91baaef57c1a5cc27d127610dc4cb59f58b67f117ba0b7237150799da3927f6394b8be16cf1bc5ae8a141fdacfa9b141bce17ba2ebb36f875bd75dee04ea64712fbb259403071b6c89b1763402c8ad7a7603e4857b439fff4e57a452917d0b82f0dc9d89a5676bcc6560b5318cbd2ea65d50e870b06e826139a626e95b0e934adb60325b1b0d944d4f9700e68c0a73728c5ab92209555de8ee39603fc573ff7f321a8dc8fb740a7634e700ce4b2ea59bbf7e36f951157", 0xe46}, {0x0}, {&(0x7f00000072c0)="f7", 0x1}], 0x6)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
r1 = socket$inet_tcp(0x2, 0x1, 0x0)
splice(r0, 0x0, r1, 0x0, 0x3ff, 0x0)
rt_sigreturn()
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mremap(&(0x7f0000a94000/0x2000)=nil, 0x2000, 0x800000, 0x3, &(0x7f0000130000/0x800000)=nil)
mlockall(0x3)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0, 0x12, r1, 0x0)
clone(0x0, 0x0, 0x0, 0x0, 0x0)
mmap(&(0x7f0000000000/0xfbe000)=nil, 0xfbe000, 0x0, 0x31, 0xffffffffffffffff, 0x0)
openat$pidfd(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r0 = memfd_create(&(0x7f0000000100)='\xfb\xf2\x96\xbb\a\x87\xe1\f7\xa1\xd7\xf3\xdaD\x1b\f\x17T\xc3\xe2%\xec\xf4*4\xf2\xf4h\x8a\b\xc5\xe4,\xdb\x9c\a\xe0\xce]b\xf509', 0x0)
write$FUSE_DIRENT(r0, &(0x7f0000000080)=ANY=[], 0x29)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x81, 0x11, r0, 0x0)
readlinkat(0xffffffffffffffff, &(0x7f0000000000)='./file0\x00', &(0x7f0000007140)=""/122, 0x7a)
mkdir(&(0x7f0000000300)='./file0\x00', 0x0)
mount(0x0, &(0x7f0000000080)='./file0\x00', &(0x7f00000000c0)='ramfs\x00', 0x0, 0x0)
r0 = open(&(0x7f0000021000)='./file0\x00', 0x0, 0x0)
fchdir(r0)
r1 = creat(&(0x7f0000000040)='./bus\x00', 0x0)
ftruncate(r1, 0x1)
lseek(r1, 0x1200, 0x0)
r2 = open(&(0x7f0000000000)='./bus\x00', 0x0, 0x0)
sendfile(r1, r2, 0x0, 0x8400fffffffa)
ptrace$setopts(0xffffffffffffffff, 0x0, 0x0, 0x0)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
creat(&(0x7f0000000140)='./bus\x00', 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
openat(0xffffffffffffffff, &(0x7f00000002c0)='/proc/self/exe\x00', 0x14040, 0x0)
tkill(r0, 0x40)
clone(0x2009214d5fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
tee(0xffffffffffffffff, 0xffffffffffffffff, 0xfffffffffffffffc, 0x0)
exit_group(0x0)
open(&(0x7f0000000240)='./file0\x00', 0x41, 0x0)
stat(&(0x7f0000007f00)='./file0\x00', &(0x7f0000007f40))
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
linkat(0xffffffffffffffff, 0x0, 0xffffffffffffff9c, 0x0, 0x0)
exit_group(0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
link(&(0x7f0000000200)='./file0\x00', &(0x7f0000000480)='./file1\x00')
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = syz_open_procfs(0x0, &(0x7f00000000c0)='task\x00')
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
fstat(r0, &(0x7f0000000040))
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
semctl$IPC_SET(0x0, 0x0, 0x1, &(0x7f0000000080)={{0x1, 0xee01}})
exit_group(0x0)
unshare(0x20600)
r0 = inotify_init1(0x0)
fadvise64(r0, 0x0, 0x0, 0x0)
r0 = socket$inet(0x2, 0x801, 0x0)
r1 = dup(r0)
connect$inet(r1, &(0x7f0000000240)={0x2, 0x0, @loopback}, 0x10)
getsockopt$sock_int(r1, 0x1, 0x4, &(0x7f0000000000), &(0x7f0000000040)=0x4)
r0 = socket$unix(0x1, 0x2, 0x0)
setsockopt$sock_timeval(r0, 0x1, 0x14, &(0x7f0000002240)={0x0, 0xea60}, 0x10)
r1 = dup2(r0, r0)
syz_fuse_handle_req(r1, &(0x7f0000000180)="ca7a79f773928bdff98a121f279836b015de47a8b543a20d97a066d2f224f0f6010bb51fdf58b6cace82788937f9c03847a3048d9eef624742ac900bbaa81282a355635fed8d835559abfbfc7465f9cc7090fd6eeb0d4a8eba76cb9cd22b24de3f0fd9cdc31d8de69f4b92d8d9eda360741e052dbaa79897c8b89fd75fc6d0c2a2325492682a5153c7a4ff602d4a8fed4f5e0bb715fadb278eecd93ff568381b48bd4384963a783170845bcc8138862309caafa02619cea514b1be4d03098f1ec9a588d336da5a231736ec6743b016aeceb9e6644a2855c2e7c259decaad59077134cdd62a6f8904d96db49f4c73babf29709a0e6a8bee6ff30dbb39a6b9d464573ada910ec013e3a195843aff9bc9e9130d371ff7ab503fb0ff27defd76c4f6fdaac1064e0afb5aff1927ee6f0d03e604fbf25988e30bd3abbc5079ce5aa161a4b739a76614f0789ea4d5561d9d30f74c7cf31e104b07f595b8125687a33634a1a4f08280366822b2f8baed2c09fbe8849ffe4a58c2f25d2198a0168a0d500a3a08c7f08e5a12f9b518b9510e56c6112198abc98b34de34e7a643c302912092fde5fdefe518e782a5617e25714e7f97dfe2877a9cab64d59856f0f455647ff24d03228dfd3bd19092f46c1150aa2a497dd3e0d20a50f8e8f5bbe4dec7200244163820380432fd6d2bae0f2d3c565e443d8e2c3f2c486daf534b63c6b671ec143e9d61a4a5f42023610d83d2e54c265d86a11ac32c9d70d7aabcccdf583fe51b2e4a34ed077082e569a9738f7532d568391eb2ceda24ac883b963a5b3ee37e1d5aea6b4b9aefd1305f9013cb6c579d930d217fcd633fc25b254424ffecd67efb71c57466b19c2a017fa8d73aefbd544b13e94da6c6224300c318caac121dfffcf33abfdea30f8cc1e09a21d8104a4697ae3f9665dff1a255a6462373fa21d9696784f798560e8a38e0e2a2b729fb5d9023fb54e9fc2fa7a1f970fd5050952dbfa6b8b04a982a9104212b2ec9e1f8c5f3105331dc3496d2994831cbccd8216dcd7bd16649fc5ca4c08df51c1229560d52a321ef3532e0598b5a05ed04967472a7028013c27aa579dd34af17e4ff3c95f5dfed3e88be749a35d41f5a86dc746af0030e74519e87c1880e9624108f6eedb4d415ae7f155d3269a7c3883d3ccf39469c06f4dd3e632886f5f03687cff9a2a88df4f9d6d6885fc43e1acaba8440bbf5cee8b38d8d762263f1aa7e249ab5bf9208a9e56b1201dcda00a686d5b0b3b451f5d4ef9cb1c17d4219eb073aba150303f8e69347c1aee2387e3113f033b1acea4ee7e6fc7ad06a40f97c9c40112b1825e65aa308e35e0320f0f3392a5b5f2fff35137ed41f6452599ab35e2a5a4c7db15ea503090c8b88b39cc141b8d73203fffbd844c037479088b10283da229e338e65269af17edb908b66d0b91ea349e90b37a010d44e1963feecd92a5f52150b518989fbb06ede7a123424a36d497e2fe2f9017a3bc38f042c294130ab3107bee8c4e4e17fa29770b743f375ec3092b304ecf44db3b112cb5e18b7f1c4949b1477d5c9462125ab137cd159b7ecf8bb0de045ef999fcaf7acc1d2c1edaf07aa9d4196e9acdf2afe4ecd20bd89d7637fd1eedc3d96ecdc94e646847336fc08861bee70a8a2c9c15d49b9d95347451d39325eee64312e3793de42c992fc7f47d045ba0cceb54a3c3c2c8f841e7b891790268aeb5ac154a40d5f83c8c82d28e8b0ab42e883b944f76264167265607148594971eb1e83a618994c684dbe9a51031ec9b13f923e24b1d99c37cfd5767dfbfb089b135dbff1000813716e0988e21900da0c068b890d04f5bb5c6fea2a7bf480d5b10a3ad494daffe39c3d62ee85650e713f362addafd296294d5df50d09cd9fd439b92a2ddedb4757fdac5f059e423ee45625db2a80930b9e266305bb8738064216f8e88cc0c69f89472ce93db6288eb1eccd84785713652d80e02d48ca24719ba1500b3b4a67eb865c6344ce19465133a5038bbcac4a0ffe8e090cb173cb42ccb7f56585e12deae43fa90fdb163ec0e1c053e83a08b797571d5d934c80bf02c513a2bf5b43c05fa01d390342153fd6222f34d0c5d35680e85f2a4e9ddddb9d02a117d71f6a0575a47e01799f1b864c5d42587f654555fbd2ca9a60aecd03af081899258eb8f41b1a96b36c1687a91421eef5b766a24f04280364a734e44249ff0f1f845fa872564f5610ce1584c7723133570d43e11f62b3805d7f1bc5b7690d6d177aea15f7631fdb50e1cf30a7012d3a60541babb96cd3bece657da8e82c491bf736b07999f60a8bff8d640e9d4358cca0f499fa9f41b2be6bcf5686355770abee48bf3ce7189156c211d87ad1e25a5f85328adde1d7a2ca50d930feabf955209a513e6ebf166d1745fad0450edfb9a2803b87aa8d4066b69861fbd4d24ca027b0ec8705ff63582ade4b42532b5d445cfc4e9f86410f2cc1dbc3daa69510c5ff6ccbbda1683dc76cd3604ba564f68b7349b9a0e0c84f2e7ed249bfbd54b9aa4ba68aae9dd8333e9b5c96dff59d486a15e64e6fcbf421ab9eb42ef52e17cf9e0c48b70c0bb0ae87f0cc91d01c94ed8ce792d9c9570bd66dc1bb713f5edd11e41d51dbb45c7b6bcb003ef7d11fe3801a4d00569e9cee939aeeb0c65eca2bb09c41981f451cb4ccdd41c8574897a9c23950a68043b56e55537a762909598e72f56206758873f982ddef205954bc95a697a7d6cbf892d31066ef4bcf1aa619a61c138ab210652539d54f2a5577733c90507afbc5d1d27a5e3cd01c0182094c51f6b28b0cfc4d4a1687929fd617246284396eab29449a6a2ce43857261566c5c6fe956738990772c86b9005a9dd21f8c9b6c98a297707b067cbd07985d316e831ae81d8ce9acecaac8fd88dad5ffe7d357afd33e192663430509b2fa9179ff8635c0cd54291302a953fe4a2f744340439fed2dc0e8a628b980eb13f6a6fabb9978fa3beb243ceccb0556f3b5b8cc6aa3a533c9db72109584e366561edc5d4cfec99b4728e0e93b7082a4d5bc74e07de4d9e92f8fec29420dcd041e9fc3b8ae1a9d159d40629b60cbdab63498cadd1ccbfa64d704a57a8d6ad023b6f3c1f0c25a314af22c0e0eff5f3ebb6d01bffc2a746e17c67a65ec9d2f8f7a3b5a8fe60a086ab47b1ac3e6bc885ea3b5e810f204da41d97a624069b8efcd4f362f65d4d7cd33027b76b652bb2fb83a74a2fa0e7481f2684c7f2157bde1593bc6936a8881b4de9501ac898371d1e7e11cb4acbe1f6a1d32ebd5f79280f316227398981324a1e5d2aeabbe692c98c5a2fe671fd43fd3cbf595b75c8d33f56561aa87d9eda44ca396b9d99aff39f5205e1118b0fd3a2450ad85d5f930c1efa2e27b8fb19e07f5586fac74b55fa3d96ea92fb41e2621d39027f02e25903b462a7df7c50f565d77d59e74904097242a7cd9cd3b918d0786e9164329d5d358c797f66d588ea8ae2b0ddf82825a6d314a1ad4132405b038d3ff01405c15ccf8390e29c7d2fe52b724488799cadc21396af3abffafe40a822d9d652c8f561adc01c00616467db9d924307240c2f98001e6becf54cb7b2f2f3147a06f2affe393c19ebcf7664d6b98e6533b037d7583ab4b464bd11e1aae5196bc8e516ec9b139659ee9a124d541732ed545f3bb9adc33e96c402fe51aee3b7838c09532e5550bfaec5b155fcd8c20afbd12691f47c3b9cabe65cc4859e96a3fa93c39515a1605a6b237f67e5a2f80451c6e3a6f82e5db9add2bbd518f99716f953badb5ab1892a8e31193c13e0dbc57945d7c294138498f4474ad7c084fa9fd55036f5d367950b51580d064365d16707125016766c29e5e2e8bbc2614f94c536ed7ab0492eff214b060e2ef8152bef26f462a3135414d8addd3d64cb37c47911d680785e41e2ef67f123bc36892151f432ab58f2ab6d6b6c7f49ed1239a75d3990786f64d86ead20a75c326cc3d79508c39cc6d39ef42c28cfbcea76277e006fe19b5695a02a33ac571a6510414a6bc4b379915458639880954d1c216da1bc5084fb8342553bc0138a26f85b9e2ab6e7aa4375b95a084c42550fe1f5e321e2307f9818a6705b502f6750936c0eb9e48d4bad8c2d4dbf7fd87deec932c69899d88083431f203d8f1cd50c45564a44b574243d75c1ed7b1685bd188a3956dd1785d2a14c8b6852daef0ef0c7759bc8c4d841eed0a5c706e811336a981616081771e06fafa9828a24130aa0b6db42ceea4c30b4c605446c950bdd024f105a3264f2a6f645fd170977808d5a8d5a300e41d3f91f8582d08489bd03c5ddd406018fe3586548b62738e6ead254d2d79d1b573a31b99d1637ed105b61d99d8d6f5117851dc95d23a321840ec54cebd0a299c288e97d4819a837da6d6c5218a5b8632442177b6ef301197f99e1fab5f7bf00e5c475902b1c4373a7ac09d6deaa4e923f31ed9df273f296223712cc73b8af718616e1b67070e1240fb5ec79313d522725ac73619a2740e3186d73903c1d4af985ee37bb3df0e35fd3531ed35df33a02c8427d786ccb12ac2adb37131463fc60d9e9ab6396c8c604fdc14cd375660c3ec3579cd604720621720c909a4f3a0e1b59dd558668dd073c38f7cfdfb9af7258c83d070b0f788f6ffac6241a1530ec20acaac9688b5881affdbfca641a1154d0e4920aeff3ec17dcd5e2cb0a0843a66d09a1336fdbb5e00c1acfa595b6216c625245b5e203cc155838166a8fd2b2b95f780be8c1ddb2387bcf624b291823941cab980861db89fb989ecb912888a1c70a5756ed6f6e1a616c85ffe3ce0cee9e1c39753ce4f4038af0934e093b42a282e39b02ceb47191671ebff1f4fc439b36afe167b1374c119a14d067d052897388beecd82605b6f28bd964aebb8fb1a9bf074a09e9bbfd922ca4735e7a172ffc1ef1e97285f231c6d45eee7feb1909f884de905982c40a062317336512df408320f0cc0f052903513185f9f0fa5acc16d3a8f3da6198a6891a97580d03143604f327ccec7b7dcdfcf3a5f4b8dfaf536c0e03a57e4b16f3f3ab5c9f0caf91dab6da19a92580da94baf300ed084d64211f4cbddfe911e3195e40042024705b70f8f056af82fa7be74738aa641619baa02de51d511e022ebef4bcb0fb5cbcd8834832d6efa5b88de4ac0e8556a2916efc8ea357edb0bb3626955247575854b5a6a90af6576b2fd9fa2cf19b4cf528a6c9b2f39a69b8cc2bd814b2ec2d843edfcce635f938cae5f951f54875823cb9e8f842d6ac00035495f8b80b0b7345a8708771aaa4cbbf9a14d74006d4517d3458c27d8f5d22d7107866235df696004c5fd1b86a0af47210f234ec2a33cd20de431d92ab459d3f71a05044354af36d803677170f59962a959e888ae9eeea77e161087006bb694a64e849dfa81e812ccbf4124fcce8ed51e2e1debaf8a81debe024f8e6f49e7145fb844d9bdc650c7da29c9e303aa69b34707ac1219a78fbef03290be250b894b68e09f477b6337189302eea8c4d91c81a878e8640a4ba6fc6c0d7ead8191768f994ac7afc88e36821ac99b2a3fcacf9211f97eb06a26c17fded0ad7120c5c3b0d4e32a60f8529b1b685fb5882116ba5527aa9de1090824e78ad56036102e092412cc1834c8ee6411364428ac04c2e03ed73c3af5924900b5614b45f268ee41a07a48f1862319a235ab977fa44612a60f08b8209ce421de88afd90fa08684b72a6aa5d03cf00fb9a9d7367b7a550971ec3442792dc1e99f8affd32e0371e5a6380e69331b42e5c4c658e105e2661ef2eef3d96af1d3e979e6c46d4ec972e5d416f15ece702d22b0e30ca20a818d7c1ad776effd67e21d55507d76fb272d92d80fccc1603dcb8f386fe043a2dff091da510335b05e71cabeb5038839af990f31013307b81319c43c71e589b8dc5f2c23ce9d428666dfc75ecb585ceb8fe60db960a0439f62b0ac29187595d03c769bbdf3589367adfbae93fca7026283ea0628aa880389e9bc08cde0b51319f3537f1854acf106d91d2e588d28261d12d173adfbf23995801a17ce85f0cbb5d6a1ce65049e19ebc09fd7e8f4b008229eedecbe32c1c6396bcba6fa1276ac9762f01a015589ffb534c44833754866caa8d52519a73593ddf9de792dfa02084a38adbe91afd95bdd437887fd981bd508ded8384c63f67a3ba0d14d498eacb188c56a223cf52624ad812359f4cec73e7b3535e75011e17bf440bc81443d63d4f87c2cc6f58169b55a1edcba26a67600caa33bb73680af87c3bae069dd8b6444bd367e17d45f2f50f357adaba2fa743508282636408386d21c442d209f38f5124fef3a5fb100685c65a91a406b195b1643f5380fe726d3e7dad31122b7270f95bb275d0687d11360779425530529fe23c0ab84ca0eb19f6490cf5753b06356c3404cdda105cfe560fa0fb6eb879d66ae4ce6ac657f39e7c88ae4133af34154659053b59841d5dd6fb74273f08e84f7d6d2b781093e242192348e13286969c96112cf7bc553e0d360897042ef2e9f4547fab326ff5f1be8dad1f06d1acd992f9b895ef17e37452c17171fc2afd5f967ac520f5a115c979af6619d90dd03dbf542a8d04ea7c394645a99b63a4c0c340081e86a51d2ff03ea9cc09cf491807f877be320c23e962a5a0a79a7946a40a1e8b31e301c422ed5adedd1208a5e8eaf9c8ec68594410db7738449fe5182810caff0497d72125f8dc1c89a212842a05b6cfde634971260ea6c94465cadcfe7cabf394544369e07cae8cb83816861c4faa3a7001fd7b8989bf425db6096b62f08c03243365a9701b61374d3aabbdff40d2336dac3825a68d7cd4f767682b727aa53b6f97a69de32c9d96749635b31d1339801da3626a95deb15386b51ac6bb5818fe683e43319b1462a78f0a38801752f5afb2eac718bf3fd35c49b71d8461f8da1523a774bafa1090b6582e2b6ccb24c95ff1f42296b76d697402b25e5c4524c42bfeb8df5585f09283254783f58360e4c5aba9628f372d85fc7e04924e239ac54905abb690ea29ca485ab4af4ede8ed53617878524947450a4b7ef48a8f2d55ee17c40c6cf452ff3367d4f67fa5cfe4e1a86c76347793b9331bd766db5ee9265c27a121fbcd930072204f36b67b5b4c0f7c6fc35c1aed271bec23213d1b5c493fbbdf2646183c2f7e007a089135ef142b10218d861b976a02792dedfb16f94f13673c12d369713cbf9822879939ee37300eee4680e1af563a40fb6eb6605bdf6758bd6f00355afd5fa607c2a55cdf0c44d033619548e9a6b71478a78f1faedbcda7d1cb11f0b5dbc5b0a8aeaccb9763e331b5e00f8d758e68b2b7f6d34541ba2378b6904b678f9868cbce4e5e4f3e2bad558f575359492269b1a2a7676de78b56370dd185881f5bb331b84d63e28b6937edff5c39a043deb3cba57deacad74099b570d299aed0d24b7be90fd18e5b229896f802b43088b749cd2e0f328be9c076ca12be49238b8e5603b8dc086bde70a24f58636081f7fd0afbd7d390f72d340f2dac8c157af09b14fd0cad8fcbcaf8c7f631736753cfafb3b80ecb9b22329f04748da39fe5a969329caa4620ab72563d09abc3353453edc105f4c05770f49d60854314e488ac8811bdd263c08b5b0f8667688517a8d15f233b169e88c4659f4e98069df896fefd2b8acf2487732cc212a86b54257ac8832b3c372b65d465eb1aecbc1fbe42575ee5a2160338b488e9bbfdbeed49a94294d0d41d7aba13d0be023149b27104af639b5a553ab8ec8d427684e38f6d797555104eb3909373602ccb62553afbeaa95f3446ae6c4aad177d4fa8f7aa3e0085782266a1ded423daade28424fd6b0339a0f9ae455d67789a262b31503bb1bc0bdb46364cedce32082310fd0ab172132e9c72b646ccb189d470a577960ec55ccffae693e1973cad11a9108c02ae67ca43031788be8e615b17144770eab22b33aedc851a03759509c1ce8dde3b9790d834d60e103cab4ba2cd0419d0489f2fd1bfc86444df1f7f2cc48abb559dbe9d53d85f5f9d33b7c477e880bf70159ca9900101764968ba4b80c240394ecb3ecda6a7882b4db3c065440ba5ea09233d016d9881afe110dc738493e032796ccf12a4a6a24ad8a6c2d78920158e4592df0b6abb66881401d44a57b85afa584ff57013553b15c8fd605c2ae7536e715264c871690211fc4d81a18c573af606494301bda8fe33f0b4f02e025f880361eb7856a425a352ac55bcbb5ef3e06d2902e5b3c107bf1d0e83a0d059ab301100593451c5f1d2582834319798da7e26164a2fbda63f0b76dfa3c32587cfba40089eb6eee0719cc03597e10a5b2b8dee5a3f23e5a6b1f4d03de60bd1638a641dd641ac2988c54fc1e5e3dd9268d6cc21c65c5e1a206cc4b570f26edb3a159c80fcc905126d038c176e68ceda0716fd053d13a572530933ae109f9965bde6ff49e501f0732184788e100e0b4169e17c3b3d1e17b5bdf5ba4f8244e167ce941a57aafe8477e3f3d9dd61672f973d286ebd94d38b760a57d6f1473899577729db0365bc756048a670674cf45344c205f2301a9110b71fd05eb4280f76363a661338b4e382fc544279c2517b6f03fec051b95d245c8cf617c10db7d70e9938a17a6540b8705f392617fdd0af1724407676a2763b551efb15362906847edeaf062a172db08830331e617218be77e6f035b24e64f5f9ba18916b4b15830f69947146bf4e95e0dff182082b2d2bb44dd9969b611887b3cf582537f608b2a55776d98c43c7c521baea1b5358a7082f0bfce3261883ab30cf55601b14c5ecda29e12b5e6defff3ed9212f8e4da18624a3278c64dd858ca2c8a7f79288cabb49be1e6dfb5c83dc4de9758669ff2e91c40e3c595f7ff5699bd84043436711193ba78fcdd7eec3218d2b770bce205a01841a19b75319cea47a1c17c0c1ce2b1d59a75067fffb6347927e5fa1804fc504730ada2aa88c9573f62792f69d42791442f824144e974c2c2fb7385c9fe50f9c216651939bd2b5362883b4743be0fb30fb320ba3e4f7f4eaf3c6880d27226688360b35ea0c206070c235e92142523718dc8a3678391429ed5cadefbbe40790d23e6e6d74de7fc22449b1e7a09f16aede0c0de919a7abba5ce989b13a7d603a3a5a3e21e50f929df782c7e9f59222fc6a2db6dbb7232eb73a8a1130f23040bca6e725457f19a89d03e4e59b160fcc33ad67982d9b9a3b1f31b62b8de7b9887ca85db284ea9fc1b8e1e66e564a3bdae18d6cfe13c8ea5c48fa7c08dbdc5ab1dfefe332be7cc314f278a3d024aaa9debff36731f11901482bbf17ab90905ee1dc365eb388cc933d198b0d506409f10284b67c319ef472438b19ebea2b6cd3feb9a8bb802092c1e36ad7a22a1f1f4de72d98e398802c96f18f2aa6613a34eb78d55e75bd480de3ff89159213a6e6c15f21dcce60614912b103f36d168852f3a3dd962cdb4a98218638866a42b5b0b9abee9f7b770fe7ea1f28328382d7ffae6fb5c5f24c845fcc2dd70f7bd4aee75383e4a918f9444c0c3d531637c5d65b67aad174512dec8df59824b0f4a2a2ac3d55c3cc108b969c3e583e8de3d66793413ce7f7eb412ec8fcb2c2d2f10e40c5edc23b11ffd49c012e6796da9cd089a9a2e2e3008969d139e16df779b9d168e6f1109eaa09d46e0cdab3043141c4ffb2f85b64d4969ebe65e696c4ce87bf93331929ad25e47d2184292709677482d6de091f9458b4a89afe6467ad26f23c2d7ed3e507f5403b5e08719504f7f31935ed3fa7e8a1b30d4fe34655960936be31fbf306963ef5c625480957472b83e1c52726966394e7988b19d5b834fa2af01d15a56cc8209e50ff993f6db2684da2af5595c9380ff5cd1aabba9f5550c637f713e099a0a0f80de06947c108dad5f3e1d609ca0c1aa44ed3406c7cae155c10e728bd9eb3be0e979625922f75715ed45e96b3ce28022a7e00750bf08c883fcb7706b9f00ae3f5d46ad664b70fac6ca248ce26280c855021c9aba3cc5f199286c61bdcaf5d10c76511046898a8819afac3a927adb785cf5f4377b577c6e926eceb0f7f3c08cc6cb43a589105c87aea05a75cbb95c1fad87957b145d208ffc5cacd12a8fa9d0146d861754abd62292fce087820a3c4fb27fea8211d740db8c827c3efbbef9b5fe3b416512b6214bda4d75e80dddf3ce4e28e92c87512d2547bf7fbc639bb4cdfd6868d0d5d60c210e4466ce1146f0573a0cef3c18caa3554825cb1fd3cb3622ba885f9a1ad49b9fbb1f06cd28cb5f8266e400bd468d0d69afe8ceaa3253b96ca8b1597f60170600b2bd473eadc8b34e099593fb7bbdaa5dfc414851832a26d93892a55ea843433100b17cb15766fb5fd307a3426f44bc67b810b999c521ac1158ea47dd5d906e286df78d23dc6ae13be3ea3e3169c9d49ad243da700fefb79d80165eb81cfe3f5aacf8e702c3c6723e41d9b7d371b63e8b9893adb951ee6b7b1df748a8618ecac95c05f746ff59c0b267b280252e332372bfa98b4f988bf4ddecf960b82d421b55f8db7dacc3b10b421ee921a0ce7e0209c6d480a67ba7af521521402cbef8e8a46d752c5f4d5de03f1787174f1750a87c44d01b530d62c5455bf0ec765568192387dc839e19525551528f86930073352f0b9f3fc6fe922f4edfc8388052bb00c5a8cc3b4013998d08e63062f7dae366a8b0b48fa194794abd33198e26f256e32e382636a1ad17ab26360a3d61089e9850948022babbf2710be0b20e38be80d16ac7f72d193565e35755451f4ea42771c3ea7c6a18fdc8a6c2245c4c98536020ea87172c46adf270d944601daf7825c7c357636953a1c375b29c87861e1ee1b00456bc20901bd1b7c17e399103e4277f64f2c967b410f52acc28b8c2c7aeb9bb3c216ce31dca740351d3e130db10a9b1adc3c68c759f833ed20e628fabb02a06b1a921494c60d7c75ddaab9d5589de87e401194657eef72df4ac9e9e6bfaed733b7825589d5e49b178c54673fe4e9644a5a6041a7f5c1307538c94f4984921be0f7c80884da3f2607d77c9f5c0d27fe0488527889edcc8668b704e0038cde70575ec1682a0e785d185c92255a6f096ac2c5403450a6ce33965ae96a45735a33d546f9b4e4b861a74d8efe06f54c5cbe9815296a1606defd28b09afc57f65aa28248031ba18e1b116b057b858c856fdbe1d2bd069b070d3cca96ceabfc5f2bb56c874d315c0ba9d2348b93fc9f662db4969150f56e5b1c8e30fabc516ee935350ed5c80ec431ff2300db667e92b9083538e7ffcdcf30d8111846731994dda4f045da6aac94f3b12c280abc6350061efb69f4557da9b3bcb53e4542253ffe353a8e4eaf96046b0aea994a72dc3cc9f1e5f33e40374f02852279e7125ffa0416aeb3d8123b0de316756263dae34c2174c2a85e4fd2703f45345769fb7f5b3b8ba244ebdb286d5fbbc077e12879e188976634a45076a8f02acca2c76b699ab0f0aba3408d8a15f5cc2358cf43429a5ca6ad348f3af624dddb56953a12861c10fc691914d8dd6b5f9faf7b9bb6ca69f62cc52d9d80fd34f912b6e4ac9e5de4b1f392cf643627116c71bfa03", 0x2000, &(0x7f0000004a80)={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})
clone(0x6006f00, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = memfd_create(&(0x7f0000001fc1)='\x00\xac=\x9d\xd2\xdb\xe6\xbf\xb4\b\xedcJ\x8e\x84\xd4N\x12\x9b\x1f\t\xbd\x11+\x86T\x16\xa3\xb3\xae0\x9f9?\xefo\xa4k\x012>\xa1\x9c\x86x\x1c\x9f\x84\x195\xde\x97_\t~\xf3Y\x12\"p^\xc1\x0f', 0x0)
write(r0, &(0x7f0000002000)='/', 0x1)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x4, 0x11, r0, 0x0)
rename(&(0x7f0000fdbff8)='./file0\x00', &(0x7f0000000000)='./file1\x00')
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000080)={0x2, &(0x7f0000000040)=[{0x7c}, {0x6, 0x0, 0x0, 0x7fffffff}]})
pipe2(&(0x7f0000001940), 0x0)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
sendmsg(r0, &(0x7f0000000440)={0x0, 0x0, 0x0}, 0x0)
connect(r0, &(0x7f0000000800)=@generic={0x0, "127a3d4f9d6270dc3e8ab691f4222a90c030758eb574e45bc38bb35578a8b598cfc90f58d939e9673079ce17dba07103c0ae455abe7a3e968de2c17f401b1bd8077fd99f3560cba907064863e24969a68dbbd8affa1085a7b2cca7c9419503a2d2b2ec759c9ddd63b4ead69815afa1aa508f3e921ae265d9a96655aa2e69"}, 0x80)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
statx(0xffffffffffffffff, 0x0, 0x2000, 0x0, 0x0)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
syz_mount_image$fuse(0x0, &(0x7f0000000040)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
mount(&(0x7f00000000c0)=ANY=[], &(0x7f0000000080)='./file0\x00', &(0x7f0000000000)='proc\x00', 0x0, 0x0)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
getdents64(r0, &(0x7f0000000180)=""/4076, 0x18b)
getdents(r0, &(0x7f0000000280)=""/87, 0x57)
getdents64(r0, 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000440)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x1, 0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000080), 0x0)
syz_emit_ethernet(0x4e, &(0x7f0000007580)={@local, @local, @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x40, 0x0, 0x0, 0x0, 0x11, 0x0, @initdev={0xac, 0x1e, 0x0, 0x0}, @local}, @source_quench={0xb, 0x0, 0x0, 0x2c00, {0x9, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @dev, @remote, {[@rr={0x7, 0xf, 0x0, [@local, @empty, @broadcast]}]}}}}}}}, 0x0)
syz_emit_ethernet(0x3a, &(0x7f0000000000)={@link_local, @link_local, @void, {@ipv4={0x800, @icmp={{0x6, 0x4, 0x0, 0x0, 0x2c, 0x0, 0x0, 0x0, 0x1, 0x0, @private, @broadcast, {[@noop]}}, @timestamp_reply}}}}, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000200)={0x2, &(0x7f0000000040)=[{0x24}, {0x6, 0x0, 0x0, 0x7fff7ffe}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = inotify_init()
read$FUSE(r0, 0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000080), 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
bind$inet(r0, &(0x7f0000000000)={0x2, 0x4e22, @broadcast}, 0x10)
connect$inet(r0, &(0x7f0000000080)={0x2, 0x4a22, @local}, 0x10)
syz_emit_ethernet(0xbe, &(0x7f0000000200)={@link_local, @dev, @void, {@ipv4={0x800, @udp={{0x5, 0x4, 0x0, 0x0, 0xb0, 0x0, 0x0, 0x0, 0x11, 0x0, @local, @broadcast=0xe0000001}, {0x0, 0x4e22, 0x9c, 0x0, @wg=@initiation={0x1, 0x0, "3df599c871a50ecf7a950f0c9036cad94d3dbfdf36988854a45ed41faac89c62", "e3f972174ce11672964cf6a93d3af8b4ce05a21d9e4a0d0e8043694ced9e94a3ea6c5f0ede2be1a972d6a931780650ed", "9aa18498e257c3cc65e15d25cbccb6f85dc892427aeacae0d14bb05a", {"73476aa6246eceb8b83e70a943dcecaa", "a16f0354b849e50e566b21c0d48e0d00"}}}}}}}, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
wait4(0x0, 0x0, 0x40000004, 0x0)
rt_sigreturn()
clone(0x320e100, 0x0, 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
tkill(r0, 0x25)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000380)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
fcntl$lock(r1, 0x7, &(0x7f0000000180))
fcntl$lock(r1, 0x7, &(0x7f0000000000)={0x0, 0x0, 0x4, 0x1})
fcntl$lock(r1, 0x7, &(0x7f00000011c0)={0x0, 0x0, 0x108800001, 0xc})
fcntl$lock(r1, 0x6, &(0x7f0000000040)={0x0, 0x0, 0x8000, 0x0, 0xffffffffffffffff})
rt_sigreturn()
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x210000000013, &(0x7f00000000c0)=0x100000001, 0x4)
bind$inet(r0, &(0x7f0000000080)={0x2, 0x4e21, @local}, 0x10)
setsockopt$inet_tcp_TCP_REPAIR_QUEUE(r0, 0x6, 0x14, &(0x7f0000000140)=0x2, 0x4)
connect$inet(r0, &(0x7f0000000180)={0x2, 0x4e21, @local}, 0x10)
sendto$inet(r0, &(0x7f0000000380)='d', 0x1, 0x0, 0x0, 0x0)
setsockopt$SO_TIMESTAMPING(r0, 0x1, 0x25, &(0x7f0000000300)=0x2b1, 0x4)
sendto$inet(r0, &(0x7f0000000680)='T', 0x1, 0x0, 0x0, 0x0)
fcntl$dupfd(0xffffffffffffffff, 0x0, 0xffffffffffffffff)
setsockopt$inet_tcp_TCP_REPAIR_OPTIONS(r0, 0x6, 0x16, &(0x7f0000000240)=[@window, @window, @window={0x3, 0x0, 0x4c6}, @window, @timestamp, @window, @sack_perm, @timestamp], 0x8)
setsockopt$inet_tcp_TCP_REPAIR(r0, 0x6, 0x13, &(0x7f0000000200), 0x88)
sendto$inet(r0, &(0x7f00000004c0)="34e2de4d8d957a8de4e490b6cd20b988d4edef164bd3377aa381b5f50b7ca40a516489f78cd7208982e9bde22b2b7c1c7606d565477f3db9d2b077283644c0f27ab52a863a42863e06944e40a0b3c5d21c8cbe052e7f726263f28aef1bc12a069063d4c30e8f329fdb36859be727fbef4314161e5fb5f01ae00a2634d5cdecca2089c62e32f4c919886b2b88d237e287318739bec0364caf15889f38a312ef6621c0f21709a4bf2b16274cf933f6ad8fcc9c2024bc1b4713f650e860f93ae93b2361956b3e80c38c5fd29b5c1b5d7ce67edc856a8dc0ba54cee53de9a48c131389426bd06ec7c695add357934fc0321f0d3d7982e4fe5a0039decc491a663afd02facb08dd9695f854c7b031d9af8bd7350897996b5208b23030cc0feb84570730eaf24b9f2ac05d0feb3be07a29f887095f36f3c8f0e77e45509acd14a5be4a1572dd4cd1231087b830fa03e071571d4abd694710ef140469cf6df8a59839aafe046a5bffb97e5247be901789eafd726ba090337a2c49207e6b900c7e982472e6aac70e5d52ca2c1bab47b1f6d00f9601e2281686c21f770ae96e0ffec4b30496d012fa00958f794cdbd721bd155cae87", 0x3aa0, 0x805, 0x0, 0x0)
r0 = socket$inet(0x2, 0x3, 0x6)
bind(r0, &(0x7f0000000000)=@l2tp={0x2, 0x0, @remote}, 0x80)
connect$inet(r0, &(0x7f00000000c0)={0x2, 0x0, @dev}, 0x10)
sendto$inet(r0, 0x0, 0x0, 0x0, &(0x7f0000000080)={0x2, 0x0, @empty}, 0x10)
r0 = openat(0xffffffffffffffff, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = socket$inet6_udp(0xa, 0x2, 0x0)
sendmmsg$inet6(r1, &(0x7f0000006b80)=[{{&(0x7f0000000040)={0xa, 0x4e24, 0x0, @local}, 0x1c, 0x0}}, {{&(0x7f0000000340)={0xa, 0x4e24, 0x0, @remote}, 0x1c, 0x0}}], 0x2, 0x0)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
r1 = dup(r0)
ioctl$PERF_EVENT_IOC_ENABLE(r1, 0x8912, 0x400200)
getsockopt$bt_hci(r1, 0x0, 0xd, &(0x7f0000000280)=""/4096, &(0x7f0000001280)=0x1000)
r0 = socket$unix(0x1, 0x2, 0x0)
r1 = dup(r0)
ioctl$PERF_EVENT_IOC_ENABLE(r1, 0x8912, 0x400200)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(0x0, &(0x7f0000000080)='./file0\x00', &(0x7f0000000100)='cgroup2\x00', 0x0, 0x0)
setuid(0xee01)
mkdir(&(0x7f0000000180)='./file0//ile0\x00', 0x0)
rmdir(&(0x7f0000000340)='./file0//ile0\x00')
r0 = semget$private(0x0, 0x7, 0x0)
semtimedop(r0, &(0x7f0000000240)=[{0x0, 0x8001}], 0x1, &(0x7f0000000280))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000040), 0x41, 0x0)
write$binfmt_aout(r1, &(0x7f00000000c0)=ANY=[], 0xff2e)
ioctl$TCSETS(r1, 0x40045431, &(0x7f0000000000)={0x0, 0x0, 0x0, 0x0, 0x0, "00e4d3f26c00000173d5e822a7632200"})
r2 = syz_open_pts(r1, 0x0)
r3 = dup3(r2, r1, 0x0)
ioctl$TCSETS(r3, 0x5402, &(0x7f00000000c0)={0x0, 0x1, 0x0, 0x0, 0x0, "15aa4668c7bb10a48e7633f7a647be5ffc916f"})
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f0000000040)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x0, 0x0)
exit(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mount(&(0x7f0000000000)=ANY=[], &(0x7f0000000080)='./file0/../file0\x00', &(0x7f00000000c0)='sysfs\x00', 0x0, 0x0)
chroot(&(0x7f0000000000)='./file0/../file0\x00')
r1 = gettid()
rt_sigqueueinfo(r1, 0x9, &(0x7f0000000100))
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000100)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r1, &(0x7f0000d06ff8)='./file0\x00')
r2 = syz_open_procfs(0x0, &(0x7f0000000040)='task\x00')
linkat(r1, &(0x7f0000000080)='./file0\x00', r2, &(0x7f0000000000)='./file0\x00', 0x0)
r3 = gettid()
rt_sigqueueinfo(r3, 0xa, &(0x7f0000000040))
shmget$private(0x0, 0x1000, 0x0, &(0x7f0000fff000/0x1000)=nil)
r0 = shmget(0x0, 0x4000, 0x0, &(0x7f0000ffb000/0x4000)=nil)
shmctl$SHM_INFO(0x0, 0xe, &(0x7f00000000c0)=""/245)
shmctl$IPC_RMID(r0, 0x0)
shmget(0x2, 0x4000, 0x200, &(0x7f0000ffc000/0x4000)=nil)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
chmod(&(0x7f0000000180)='./file0\x00', 0x23f)
mkdir(&(0x7f0000000040)='./file0/file1\x00', 0x0)
newfstatat(0xffffffffffffff9c, &(0x7f0000000400)='./file0/file1\x00', &(0x7f0000000440)={0x0, 0x0, 0x0, 0x0, <r0=>0x0}, 0x0)
setreuid(0x0, r0)
setxattr$incfs_id(&(0x7f0000000000)='./file0\x00', &(0x7f00000000c0), 0x0, 0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000080), 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
syz_mount_image$fuse(0x0, &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
mount(&(0x7f0000000140)=ANY=[], &(0x7f0000000040)='./file0\x00', &(0x7f0000000080)='proc\x00', 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000140)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800013, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
stat(&(0x7f00000001c0)='./file0\x00', 0x0)
rt_sigreturn()
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
bind$inet6(r0, &(0x7f0000000040)={0xa, 0x0, 0x0, @local}, 0x1c)
sendto$inet6(r0, 0x0, 0x0, 0x0, &(0x7f00000000c0)={0xa, 0x4e21, 0x0, @mcast2}, 0x1c)
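# The next program appears to exercise pty line-discipline handling: TCSETS on the ptmx master, then TIOCSETD switches and a read on the slave side.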
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$TCSETS(r0, 0x40045431, &(0x7f0000000240)={0x0, 0x0, 0x0, 0x0, 0x0, "00000000000000000000008000"})
r1 = syz_open_pts(r0, 0x0)
ioctl$TIOCSETD(r1, 0x5423, &(0x7f0000000200)=0x2)
read(r1, 0x0, 0x2000)
ioctl$TIOCSETD(r1, 0x5423, &(0x7f0000000040))
ioctl$TCSETS(r1, 0x5402, &(0x7f0000000100)={0x0, 0x0, 0x0, 0x0, 0x0, "a620631c4c0454fbf021d00153cd9c6cd337d8"})
timer_create(0x0, &(0x7f0000000080)={0x0, 0x12}, &(0x7f00000001c0))
r2 = gettid()
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
timer_settime(0x0, 0x0, &(0x7f0000000000)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
tkill(r2, 0x1000000000013)
utimensat(0xffffffffffffff9c, 0x0, &(0x7f0000000040), 0x6)
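# The next program likely targets partition-table parsing: syz_read_part_table is fed a crafted 0x40-byte sector tail ending in the 0x55aa MBR boot signature.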
perf_event_open(&(0x7f00000012c0)={0x0, 0x70}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
syz_read_part_table(0x0, 0x1bf, &(0x7f0000000080)=[{&(0x7f0000000000)="02010500000001000000ff07000000fffffffd000800000000000000004000ffffff8500000000000000887700720030b5829237c300000000000080000055aa", 0x40, 0x1c0}])
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
getxattr(&(0x7f00000001c0)='./file0\x00', &(0x7f0000000200)=@random={'security.', '/proc/sys/net/ipv4/tcp_wmem\x00'}, 0x0, 0x0)
r1 = gettid()
r2 = socket$unix(0x1, 0x5, 0x0)
getsockopt$sock_timeval(r2, 0x1, 0x14, &(0x7f0000000000), &(0x7f0000000040)=0x10)
tkill(r1, 0x37)
r0 = memfd_create(&(0x7f0000000340)='-B\xd5NI\xc5j\xbappp\xf0\b\x84\xa2m\x00:)\x00\xbb\x8d\xac\xacva}knh#\xcb)\x0f\xc8\xc0:\x9cc\x10d\xee\xa9\x8bCc\xad\x89\x9ck\xde\xc5\xe96\xddU\xa9=\xcdJx\xaa\x8f~\xb90a\xa9\xb2\x04K\x98\x93?\x88Q\xf7\xd6\x1d\xa1\xce\x8b\x19\xea\xef\xe3\xab\xb6\xa5$4\xd6\xfe7\x0f\xe7\xd9$\xce \xabN\xae\xc9\xbd\xd3g@\xe1\'s\x0e\x90\xf2\xcdr\xb8(', 0x0)
write(r0, &(0x7f00000004c0)='1', 0x1)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x4, 0x11, r0, 0x0)
sendfile(0xffffffffffffffff, 0xffffffffffffffff, 0x0, 0x0)
prlimit64(0x0, 0x0, 0x0, &(0x7f00000000c0))
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setitimer(0x2, &(0x7f0000000180)={{0x0, 0xea60}, {0x0, 0xea60}}, 0x0)
clone(0x0, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
timer_create(0x0, &(0x7f0000044000)={0x0, 0x12}, &(0x7f0000000400))
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000080)={<r1=>0xffffffffffffffff})
pipe(&(0x7f00000001c0)={<r2=>0xffffffffffffffff, <r3=>0xffffffffffffffff})
splice(r1, 0x0, r3, 0x0, 0x8ec0, 0x0)
r4 = dup(r2)
fcntl$setpipe(r4, 0x408, 0x0)
timer_settime(0x0, 0x0, &(0x7f000006b000)={{0x0, 0x8}, {0x0, 0x1c9c380}}, 0x0)
r5 = gettid()
tkill(r5, 0x16)
setreuid(0xee00, 0xee01)
setregid(0xee01, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$unix(0x1, 0x2, 0x0)
bind$unix(r0, &(0x7f00000006c0)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000001c0)={<r1=>0xffffffffffffffff})
sendmsg$unix(r1, &(0x7f0000001440)={&(0x7f0000000100)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e, &(0x7f0000000180)=[{&(0x7f0000001280)="af", 0x1}], 0x1}, 0x0)
rt_sigreturn()
sched_getparam(0x0, &(0x7f0000000440))
r0 = eventfd2(0x0, 0x0)
write$eventfd(r0, &(0x7f0000000000)=0xffffffffffffffff, 0x3a)
r0 = syz_open_procfs(0x0, &(0x7f0000000040)='ns\x00')
fchdir(r0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
clone(0x6900, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
creat(&(0x7f0000000180)='./file0\x00', 0x0)
r3 = gettid()
r4 = gettid()
tgkill(r3, r4, 0x24)
openat(0xffffffffffffff9c, &(0x7f000000c380)='./file0\x00', 0x40, 0x0)
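# The next program appears to drive a FUSE session end to end: open /dev/fuse, mount with fd=..., read the INIT request id, answer FUSE_INIT, then feed a large canned reply through syz_fuse_handle_req before reopening ./file0 and calling fadvise64.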
r0 = openat$fuse(0xffffffffffffff9c, &(0x7f0000002080), 0x42, 0x0)
mount$fuse(0x0, &(0x7f00000020c0)='./file0\x00', &(0x7f0000002100), 0x0, &(0x7f0000002140)=ANY=[@ANYBLOB='fd=', @ANYRESHEX=r0, @ANYBLOB=',rootmode=00000000000000000100000,user_id=', @ANYRESDEC=0x0, @ANYBLOB=',group_id=', @ANYRESDEC=0x0])
read$FUSE(r0, &(0x7f00000021c0)={0x2020, 0x0, <r1=>0x0}, 0x2020)
write$FUSE_INIT(r0, &(0x7f0000000040)={0x50, 0x0, r1, {0x7, 0x1f}}, 0x50)
syz_fuse_handle_req(r0, &(0x7f0000008380)="000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000080000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000dc4e0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ba045abcd5dfc67d0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000a000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000008000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000209bfd66eea2105600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000200000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001354c4b600", 0x2000, &(0x7f00000062c0)={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f0000006340)={0x20}, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})
r2 = openat(0xffffffffffffff9c, &(0x7f000000c380)='./file0\x00', 0x0, 0x0)
fadvise64(r2, 0x0, 0x0, 0x3)
clone(0x4000c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = socket$inet_udp(0x2, 0x2, 0x0)
accept$inet(r1, 0x0, 0x0)
tkill(r0, 0x18)
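# The next program appears to stream a freshly truncated ./bus file into a connected non-blocking raw IPv4 socket via sendfile.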
perf_event_open(&(0x7f0000000700)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x50d, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={0x0}}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = open(&(0x7f0000002000)='./bus\x00', 0x141042, 0x0)
ftruncate(r0, 0x88801)
r1 = socket(0x2, 0x803, 0xff)
connect$inet(r1, &(0x7f0000000040)={0x2, 0x0, @dev={0xac, 0x14, 0x14, 0xc}}, 0x10)
r2 = dup(r1)
r3 = open(&(0x7f0000000440)='./bus\x00', 0x0, 0x0)
sendfile(r2, r3, 0x0, 0x8000fffffffe)
r0 = openat$fuse(0xffffffffffffff9c, &(0x7f0000000400), 0x2, 0x0)
lseek(r0, 0x0, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000000280)='net\x00')
read$FUSE(r0, 0x0, 0x0)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = signalfd4(0xffffffffffffffff, &(0x7f0000000000), 0x8, 0x0)
unlinkat(r1, &(0x7f0000001100)='./file0\x00', 0x200)
tkill(r0, 0x25)
mkdir(&(0x7f0000000180)='./file0\x00', 0x0)
mkdir(0x0, 0x0)
mount(0x0, &(0x7f0000000140)='./file0\x00', &(0x7f0000000000)='configfs\x00', 0x0, 0x0)
syz_mount_image$tmpfs(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0)
mount$overlay(0x0, &(0x7f0000000200)='./file0\x00', &(0x7f00000000c0), 0x0, &(0x7f0000000240)=ANY=[@ANYBLOB='lowerdir=.:file0'])
perf_event_open(&(0x7f000025c000)={0x2, 0x70, 0x15}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
r1 = dup(r0)
ioctl$PERF_EVENT_IOC_ENABLE(r1, 0x8912, 0x400200)
mount$overlay(0x0, 0x0, 0x0, 0x0, &(0x7f0000000340)=ANY=[])
chdir(0x0)
r2 = open(&(0x7f0000000200)='./file0\x00', 0x0, 0x0)
lseek(r2, 0x0, 0x0)
pipe(&(0x7f0000000080)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
fcntl$setstatus(r0, 0x4, 0x42000)
read$FUSE(r0, &(0x7f0000000440)={0x2020}, 0x2020)
write$binfmt_misc(r1, &(0x7f0000000000)=ANY=[], 0xfffffecc)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f00000027c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x400000)=nil, 0x400000, 0x6)
r2 = openat$fuse(0xffffffffffffff9c, &(0x7f0000000040), 0x2, 0x0)
read$FUSE(r2, 0x0, 0x0)
setitimer(0x0, &(0x7f0000000000)={{0x77359400}, {0x0, 0xea60}}, 0x0)
clone(0x0, 0x0, 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setxattr$smack_xattr_label(0x0, 0x0, 0x0, 0x0, 0x0)
r0 = gettid()
rt_sigqueueinfo(r0, 0xc, &(0x7f0000000040))
setreuid(0x0, 0xee00)
syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='fd\x00')
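# The next program appears to exercise TCP repair mode: TCP_REPAIR_QUEUE and TCP_REPAIR_OPTIONS bracket a self-connect on port 0x4e21, followed by large sends through the repaired socket.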
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x210000000013, &(0x7f00000000c0)=0x100000001, 0x4)
bind$inet(r0, &(0x7f0000000100)={0x2, 0x4e21, @local}, 0x10)
setsockopt$inet_tcp_int(r0, 0x6, 0x2, &(0x7f0000000080)=0x1375, 0x4)
setsockopt$inet_tcp_TCP_REPAIR_QUEUE(r0, 0x6, 0x14, &(0x7f0000000040)=0x2, 0x4)
connect$inet(r0, &(0x7f0000000180)={0x2, 0x4e21, @local}, 0x10)
writev(r0, &(0x7f0000000240)=[{&(0x7f0000000280)='A', 0x1}], 0x1)
setsockopt$inet_tcp_TCP_REPAIR_OPTIONS(r0, 0x6, 0x16, &(0x7f0000000300)=[@window, @mss, @window, @mss, @window, @mss, @timestamp, @sack_perm], 0x8)
setsockopt$sock_int(r0, 0x1, 0x8, &(0x7f0000000000), 0x4)
setsockopt$inet_tcp_TCP_REPAIR(r0, 0x6, 0x13, &(0x7f0000000200), 0x88)
sendto$inet(r0, &(0x7f0000000700)='s', 0x1, 0x861, 0x0, 0x0)
sendto$inet(r0, &(0x7f00000004c0)="34e2de4d8d957a8de4e490b6cd20b988d4edef164bd3377aa381b5f50b7ca40a516489f78cd7208982e9bde22b2b7c1c7606d565477f3db9d2b077283644c0f27ab52a863a42863e06944e40a0b3c5d21c8cbe052e7f726263f28aef1bc12a069063d4c30e8f329fdb36859be727fbef4314161e5fb5f01ae00a2634d5cdecca2089c62e32f4c919886b2b88d237e287318739bec0364caf15889f38a312ef6621c0f21709a4bf2b16274cf933f6ad8fcc9c2024bc1b4713f650e860f93ae93b2361956b3e80c38c5fd29b5c1b5d7ce67edc856a8dc0ba54cee53de9a48c131389426bd06ec7c695add357934fc0321f0d3d7982e4fe5a0039decc491a663afd02facb08dd9695f854c7b031d9af8bd7350897996b5208b23030cc0feb84570730eaf24b9f2ac05d0feb3be07a29f887095f36f3c8f0e77e45509acd14a5be4a1572dd4cd1231087b830fa03e071571d4abd694710ef140469cf6df8a59839aafe046a5bffb97e5247be901789eafd726ba090337a2c49207e6b900c7e982472e6aac70e5d52ca2c1bab47b1f6d00f9601e2281686c21f770ae96e0ffec4b30496d012fa00958f794cdbd721bd155cae87", 0x109e8, 0x805, 0x0, 0x6)
clone(0x28006380, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = socket$inet(0x2, 0x3, 0xff)
sendmsg$inet(r1, &(0x7f00000002c0)={&(0x7f0000000000)={0x2, 0x0, @local}, 0x10, &(0x7f0000000280)=[{&(0x7f0000000040)="453100736f45658789b2f643d3b3314486294cc325ed593c7305ece2bbd5c81758ce9a1a0f7d407e6812abe78b14457de557e8078e1263810e815105f543b6bcbe33cd0b9d955b7750b44009a71a418f0f7c0c8013203d258df996a83dd1986c065dc2bb866a9cf0de60", 0x6a}, {&(0x7f00000000c0)="01874daf0da1b0eca8", 0x9}], 0x2}, 0x0)
tkill(r0, 0x25)
rt_sigreturn()
get_mempolicy(0x0, &(0x7f0000000040), 0xd9, &(0x7f0000004000/0x4000)=nil, 0x5)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r2 = syz_open_pts(r1, 0x0)
ioctl$TIOCSLCKTRMIOS(r2, 0x5403, &(0x7f0000000280))
syz_emit_ethernet(0x46, &(0x7f0000000280)={@dev, @local, @void, {@ipv6={0x86dd, @dccp_packet={0x0, 0x6, "1693eb", 0xfffffc70, 0x21, 0x0, @private0, @dev, {[], {{0x0, 0x0, 0x4, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, "1152c4", 0x0, "c66823"}}}}}}}, 0x0)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
getsockopt$inet6_int(r0, 0x29, 0x42, 0x0, &(0x7f00000003c0))
syz_emit_ethernet(0x150, &(0x7f0000000040)={@local, @broadcast, @void, {@ipv4={0x800, @tcp={{0xd, 0x4, 0x0, 0x0, 0x142, 0x0, 0x0, 0x0, 0x6, 0x0, @empty, @multicast1, {[@end, @lsrr={0x83, 0x13, 0x0, [@private, @local, @private, @rand_addr]}, @generic={0x0, 0x9, "453734fcf9e703"}]}}, {{0x0, 0x0, 0x41424344, 0x41424344, 0x0, 0x0, 0x6, 0x0, 0x0, 0x0, 0x0, {[@window={0x3, 0x3}]}}, {"7537f6ea5e7f36be6b3a5f8aa989e39ffbd72c2b12445b39dc40325dc674e6c50495289061997898cf7e572800991510110f06fcf4d15cc87d42339d46a96e2ba6e92d0879f2b3547ebf90faf9f4d9086b300fcc9cbbb19ba5356401ad53d31ef4df81ba127e6978955d0c33faa36bf07b4b8faaf8e3ac4a4bd534526948fdb60a4b23b292d8707acebd99db88d0e2ff506b163ddffdb0b4cb94bf8776997fda0942552e81dc22a6a372a06d58eba463d0af27f175c8936b61e117fa93bdb8e1e63c55055c8a021e967a28d210d1afa2f5eb2ffd03735fdc6cb1602c73fc72505131971a048b24d71512edf39d795089202ea8925ff5"}}}}}}, 0x0)
r0 = epoll_create1(0x0)
r1 = openat$thread_pidfd(0xffffffffffffff9c, &(0x7f0000000340), 0x0, 0x0)
epoll_ctl$EPOLL_CTL_ADD(r0, 0x1, r1, &(0x7f0000000380))
epoll_ctl$EPOLL_CTL_ADD(r0, 0x1, r1, &(0x7f0000000000))
r0 = gettid()
timer_create(0x0, &(0x7f0000000000)={0x0, 0xb, 0x4, @tid=r0}, &(0x7f0000000240))
r1 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r1, &(0x7f0000000300)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x4c)
listen(r1, 0x0)
accept4(r1, 0x0, 0x0, 0x0)
clock_gettime(0x0, &(0x7f0000000480)={0x0, <r2=>0x0})
timer_settime(0x0, 0x1, &(0x7f00000004c0)={{}, {0x0, r2+10000000}}, 0x0)
syz_open_dev$tty1(0xc, 0x4, 0x1)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r1 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000040), 0x81, 0x0)
r2 = syz_open_procfs(0x0, &(0x7f0000000100)='fdinfo/3\x00')
sendfile(r1, r2, 0x0, 0x7ffff000)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
r2 = socket(0x20000000000000a, 0x2, 0x0)
sendmsg$inet(r2, &(0x7f0000000680)={&(0x7f0000000440)={0x2, 0x4e24, @local}, 0x10, 0x0, 0x0, &(0x7f0000000180)=[@ip_pktinfo={{0x1c, 0x0, 0x8, {0x0, @dev={0xac, 0x14, 0x14, 0x42}, @local}}}], 0x20}, 0x0)
r0 = socket$packet(0x11, 0x3, 0x300)
getsockopt$sock_int(r0, 0x1, 0x4, &(0x7f00000053c0), &(0x7f0000005400)=0x4)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f00000002c0)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x0, 0x0)
chdir(&(0x7f0000000200)='./file0\x00')
mount(&(0x7f0000000680)=ANY=[], &(0x7f00000005c0)='./bus\x00', &(0x7f0000000600)='sysfs\x00', 0x0, 0x0)
mknod(&(0x7f0000000040)='./bus\x00', 0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
waitid(0x1fc9b88108f02f17, 0x0, 0x0, 0x80000002, 0x0)
chroot(0xffffffffffffffff)
r0 = inotify_init1(0x0)
fcntl$setownex(r0, 0xf, &(0x7f0000000080))
fcntl$getown(r0, 0x9)
r0 = socket$unix(0x1, 0x2, 0x0)
sendmmsg(r0, &(0x7f0000000100)=[{{0x0, 0x0, 0x0}}, {{&(0x7f00000001c0)=@un=@file={0x1, './file0\x00'}, 0x80, 0x0}}], 0x2, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = gettid()
r2 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt$inet_mreq(r2, 0x0, 0x20, &(0x7f0000000140)={@empty, @empty}, &(0x7f0000000180)=0x8)
tgkill(r0, r1, 0x24)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000180)='./file0\x00', 0x0)
mkdirat$cgroup_root(0xffffffffffffff9c, &(0x7f0000000000)='./cgroup/syz1\x00', 0x1ff)
mount$fuse(0x20000000, &(0x7f00000000c0)='./file0\x00', 0x0, 0x10f008, 0x0)
r0 = open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
r1 = openat$cgroup_procs(r0, &(0x7f0000000140)='cgroup.procs\x00', 0x2, 0x0)
r2 = gettid()
tkill(r2, 0xd)
write$cgroup_pid(r1, &(0x7f0000000080), 0x12)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
sendmmsg$inet6(r0, &(0x7f00000017c0)=[{{&(0x7f0000000240)={0xa, 0x4e22, 0x0, @dev}, 0x1c, 0x0}}, {{&(0x7f0000000080)={0xa, 0x4e20, 0x0, @dev}, 0x1c, &(0x7f00000015c0)=[{&(0x7f0000000140)="f7ea8109435bc14e120b0d0182f4dca9f88d51ae0c5f77bb910de148e108e45c2b7c13eaaaf138151ab2db0a05fe329d82153eca8e3e5cfc46fa97cc64802e9969db043881de8976211c565ff387fe374c8ef39201f4c67d566e57baef4cc64b926e00a0f37f72fa4598f5", 0x6b}, {&(0x7f00000001c0)="7b34fcec", 0x4}, {&(0x7f00000002c0)="b3f13b455519e4687d1ba59f2a5a3e15ac43ee690a8cfcb57eb4a945ba2968a1945ea3cf7e", 0x25}, {&(0x7f0000000280)="229ec53534d6a1a0cc9cf236b9ac788fbf54987431d8cd1f8343bc02479fff705064", 0x22}, {&(0x7f0000000340)="429fd7288ae45ff81c631bda8520120a116904c460933e951d4bde4def68a20fad1daad4c2746a42a3a62f04f840a99c32b8b7af2bb84e775819cf2724f120b00d50b318839663a31a73695344551a746091e633eae11a5ee58dbd3ba137efe70388b86c2b0a7643d17b2767cd515ea08367c028793a9c7f82955781181f6e8bf95990baf0c68ab7b37a0fb14e3a347100d28d2fa0bfd85e1f6ea5002331b2be8ad65cce6a62acc62bd152f299ac52aee2feb5f5bb0aa6194ce7696dbf484a361cca8364f410d592364350068e81bb072626e5767700de9d6e30895173b4e74e7ffc072349544313c4a91e3112d288cf32df9cf6651393a06e98669e3a2f038b7c729f3fa20b1675ae8a8bf8485675abd21e9cc7104bcd65af77048c664cc67b3a26b5718bedc8e8496689cdce1601b7eab243019c328d98a8329a2ac22c63acf6252eabd7adf13e7f6de109342fc869da28fd82fafc25c66736fdb7be846993a736ce5ea0e6a9234a043f4377b1d5eed13ba6413a4fc5ea4ce393732cecb97112a76a20d167d26642be97bc94620c680d42fdd84dac01073e207d701d4524b67a442d4e7db2cae560a16281aa1bbe036d9635d389fb6f8e0e0cd9374ec52865988cf0ee0ad6bd822351e69b0a425e8901ab09e0603b0a6d5f9bc47e9ff75cbf40a8056eec991d34fd62b78889bf43dbe180f1cda19fc29bceb4f6e47da5418870265e0ef0abab89200f9ce2d3361cee7ea125917ae7612f5a1607a69de781d0eae8cbe0b67a555a6264f5b936ab07efe4dcbb14155a6c8a976b892793034ad8588b7edff3d3ec391137f1efc2375d3c7a15a97167daf4903defbe4507f8700a09b250876659f4422a831182d8d18e7afdabfa2ef2df7bc66e29512eab1ac0830c7b0f69b6a59268907be88cb4d689eb29ca5695d81fdf2b8011e6bbadb51219b8c15a806fb14f1c7aa773c2c637ba8447607718309811b0731ddd03b99902c28c250361dd003e40093673a788ad65accc58b56928a6563f177ad2feba79cd57e39165f9a6eaf2bde76f4c12b9d4695d91cdd7a52dc4f83ca3a263daba3b7ecdb13d4e5b304e65ee6c20a78f91cf532386261b1a1997c5f7718a7d5c2a1e6d39b902061aa14f799773aa1d962759eea8f7cf5e43e19418c48aecb5696e1efa86ecec000468586d3133b59d9cfa1cd37db7f581e4f387735bd278d62b1ec5c2d9af1a544b7d7d50a21ac8e8069c7ca34329a67961d334bfa3b92f1077cf9e4cc9a1b21d851cbac7c52df7f71f44add1ab35507dac9c443d3a9a2aa6aab817827f1158752e43ca5d5e99611b1beb751eed42b543557c9578b653775f32472fcb61ba16bf18fed9bb5cb4b26154b48f88d3fd33673db1a2ea8b249ca9a094421f7753367986d1fb0c98feaf376760c6e1db99e06ab3a6a8f231de65e06958dd8be3a1bf1c5458f8aa58de05240df6793feb63e710d8579fdc74fa4658f654b934ebaf771425f0d719e8485725889e88a76ee4c2e26756abbcd8323729bd8e84182e50fc3d4adcfec9152642743f28b1f9db06ddbaa1a42bf33be5299a1a413ba544a3150471909e10946a6f5aa9613bb4407bd5abcf33deadaab9226e86ee1e4d8e2bf4862fdb298e2207afe84623d68e31a55cc632df88455f30b71f8b238f852a36b430707f897e4e718c1969e11554fd95bacadc08375a9b4e57a722135b80279f923daaaf70d7a23a65fa3704d5a9c80db544b1f77aa0298506e00c6e7512944f0d9c1ee3430b7a070c295d761d2e4841a9cc52a5b0268daae46ae41c73e09a9cf460d965f5c56cd95228a6a01362d34419d74cec6aa91d76bb09dea7f83bcb116749eed896dfe5aa6eede0a3b965014464e03dcb461631631b56e0016cdf8aff4abc89524869a4477e4ae292f7e1a9fc27a16a0fd43b6cb7891d2f7aa67eaa195539eaef0a195f5c8cb92be9945e2c6abe2f1e085596811ee7cb84c7cf746e4381ef4c1ead762055d6b896dfdcc6445d8a3768c4f3ed190ebd1856641e5d7b1da5
612f8a4dba7814ce5319a7f70c3d20b79923f8f3e450c648b8382fa4ef0317ea06475958d97463365e439fb79fa9cd19f310df6e1491d14adcc9d6509f4a7a2a3a616cbe3fcd217d2625ba402048e5ccb93962207da6a5adb7e175f89d3fa1b7d80f1c030fbfeec600565a3b602250e049dd6eddac19e338846fd56b741de90431d525a7c65dd0d710bd6d2fcc39fe78a4fa91c2210482530e8122c231b938bdeab138d8df14e09a82b6a6e247796a69fe6232447a470a27e8a3c01dda8369197cfa63ba2d1a71dc9713a8ba92e2922a2d9089cc7f14a6b4820de51e800da37fe320f5223eded8436f690d6e3d593025e846f8db85a8587f94461af70b16409bd514678d08145c83fc41075dfa6459d747065a7e88a0a2afe894554c3baf52242929c208874fbf21f11a73e4e56bd8c2b93a2607039296d2bc4c62ec45ca9d5449cf3a6e901b41b99000a0658245a7415c1ae9b21e7085400849b4406a57fd153d39c2e06fef32bf1799f7c9f632fa6b93e047769e5d1696e813b2da6300b58d8bf94e467fe77ede7088457e3481d22db62850c6d14c64b0e47f337fc41b479b97873d9c3d2810abb0849f500b7363041090625f6589f94b7f9f2a877de46ecb436358f7b0539932cd88b622f01f9c406669640cd7a4d67aba3e0ce385a449db18062668b22982939426c74b1fc47283bdae670b53a1fa2e2471d5edf93ca3edaf3a8e6180cd27a362263efca1083ee205e4e9a2f9c09e7ab277e8860813b992c1371174dee52c5bdad229b378a66d8242317e5889a5378aa6b9e9e75228ecc80fd8e79c2859694f689b292830a354d5bd0dc227879fd9590ff51ffde09feb0970e768a42a323ed85ab040c07d8a0e9639352ca7303637eaf7463de7cd0d1d01b3ed3920eb019375d988cb51833119a99e6a6762d751315ba847d9e4c9aeac0d8e23e78f93f2c98ad69fc054c8aae800d4f9a8bb43e2d6c78d267d3c90ad2a3b142c93711e5450faa1dada8baacdc5b7a8b2400511948b1ea2a3f42b40d923ce168c112a6aa77be738276ed81c841deed5aac9486d357d9015b7e9f90457825c34f6874620343c7ce5cb541d6763a6a150e17f08a98af50f462344197f0a21bbdc699f16bd6b9d358d5a317a689a3de7224a0490888499af69c0f1a9c11b2650a33754bc3762e9a7f4b3272640834ed50e07b46c4580d8fe968392343f713e29e9249acb3be422d59a175219613976320fca65f144cce1276daffc7c70d0af37bffb3eb3f6f05729056a2a2352a6d0d73bd9f53db382d450c13c75111b21866278d8d9f87e180ac15b48ac5b2a71f0082878d505ff1933d2d38bdf2babc5bf253997d917ab75a1ed6084ef736f8a18be1c5ca331cf35ea380d53b43d8b5f5b51b0f6f8c0ec8f72fca6e72228ad1307d9f76652e277ce5860666ca732f4631fb476134706707de02e14b508f56c685c92ed9755edd01a7fbb1b1052d102d52eea05a9d92f5c43bdc398fd4635381c4b8724417576ba93e657739a3eb71eb27556ed49b3b6b90a182ed6300b106d339ee3ff0a43a75e31910e18caa12063e470fe109d69343a6c243b25d47f343c7d53440ed2c7948cd95ed44d829d5b2d1bcb386a213f43d67127bcf996637523dd53f2693e54c7592e7afde2ee527c4c3e6edfaf22761d961dd13cebea64a6b470f6436f3b00609757721ab67a468cfb157a60f0901dff383df18d82144c5db5b95bf95b663e0a4ee26099349e6bbd818c8bcbac567c8e3363aac4c1d9a5386ea508f9bb1858cf111eea1b4380ccecd7592096fddff5e6d540582eff26c06fdcdc64f4ad1ecf8ae946456e5491f198a297a371bf8008734120142196f69bfd9ec78dd79c46aeeac4e8615f668c28860e5ceb6113356cdf1e367a1e7fe1725b3c3ba04b00b79120eb379b2de6bf291068ef90d3d9875164dc9aae76d96ce708840a86c0df8fa613be307c6b5e69edff3e5e3830adade98400290eec2f7ffd06f56b02ee826eb1c7c77021907dea3f2490492ef62abefa8b2a43d56fb1091493009a147de7638ebb9d6f1b68c07e1fa317f29ae0670a063b45ba2d0c320994a8c671488a2b558161cc1a655f3c7e91bc2e65f11bf4991935faa50b65045fc2bd635164c7a5a193301007b36c0036eac159f50881d29411d7e81cdf4564efdcf65dc762490fd2e5f5ff08132cbade516b349c029e8c87367aff8a34a6f244056123ba55bfcd62b2d8a92c5922754a34275cf9193ece94069b7937e4084788b5d5991636e2c86518e3ad7796e18a261466bf5cef175e48bd019ecc65afe3eeb49678c5827d6a2e74217e5b56ca872b3deadeb3f48b6ab9bf02567cf31468a49e261c89e55c712224a418e68182866d371cb96bc1f03836256bd2df9ff7c189fbad582f2c2e016d6559bdbb51ee39532a78787098251b5197c928c2aa403c3fb25b216a36a9b32e9f9315a38f7a8fd064c1c491a9b2ab6a9957d3f4d75ef59ff3b49a81585b594
861736475a3ed5456dc0efcd13113e91bbe2a4de2449f5665dc9f31503877bcd3f5a82e5a3796c0aa7cd70bd82852d92cd11d33f1959541063c072590955e04cb92845c602468445e0186eeaf8602154af1e63b54f11bfb1a82a3cc65f3d456276418b96452278813758c2fbaad33c2be4bed2406a695084b95f14323509d352dd2a648e33ab941d487572d1c3641fe73a2b642927decc12229dcb26b8a90d119a384eab7dc5aa0912aad8207ebf992de0243da93050064f8c873585ab6c2a8c95710a26d0d85b7eb99649de80306b552d142f36c322b5b205cae82fe3178b5c16556f9a42d09aa293ff1b9599be4569cb39fbc248fbefa8dd65a2cc7802c215d3261c8f484544935c484fdc878821954d48cb739e555f769eadca9114f197778d75c99694a041d7508f392cc309f82c30f3a635b6769a054534248546353a75a8e9e389850d214d2c26a6d17ab8af81e008d2a68c628c7071c64d852762ba3a97a02edefc0f151e6dbdc98d4e7c56967fe4c3599aab6176409d2549103bc05a7b34af53833607459b6ca8e45c2dd246ca7d2317c892dfd4636ab1183dc04a4b04c4ce29cb5f237f54986d5cba95825d8b41dbbcfc023e35667c4c4b8eab35faac1b0ba71749026785f87cf4871150b65cbf5dd071df1746c8112291f666e352974509d4a36b957990b3a61778a9b3175c4bbd3edb896ad49505b7a9fb60c067b40cc853ea7152c31a990c5eeb7b86346c8054067ec46dd45e21cb42af679f07fad924709281ea822e1624c42376a6b0e41001e44aa8644d0d12a5ec975e3ce215624def2d6f73a051617d4f2070ebe41b261920ff7467208875cecc57204d359926402cd561a7fa31519360a65604761d3c8e4328e16572e850d2cd04ca3985f102ca76c306a61933c15ab7ce3dce9d7c5e63b8acd7466b061dc5a23a4c7a8b6f0043256e4ce8cfecc245b9885f14e969787a0d3b7b00cb99816ebc8a0007f89f16e820df7483758e3967e71ecac71aa97e6b660067cdbbc6aa2ab20e5a743dd0c14d917c05ec5fcc3af2f606164e2d7dfb6e31a36f7dc8075bda6ec075cb5d5f9ae73af475927d8f6d262669949eb09be1b5f8a3e0b0ef5a502ea7bb16082ba576a724eeca43177a97f05b730abccc2c1a1fe7a34833f05d163607ebbec2cd0653ba44b4d54b084c18b84f9adcc8082c113496db1d084115f43de541f286bf0a27b561764417b368d842565", 0x1000}, {&(0x7f0000001340)="a7e5499ee7224d83697fbf2a8524dcd81fbe6c322f37ac8c5f2218d95c6fce66a79a74880ed6d0bfb4d685690ecf8b377875fd7f8c5de247291bcff306ae9ce6a8431a6cf97eacd6a963a576d2c3299bc998857661c50997e8", 0xfe06}, {&(0x7f00000013c0)="d38e0a687dd626082cc40ead412b3728812df15425f58d1eeac07bd69286f8e86efd0a13ebaf3e906245bea4fba7d5330b6406152e24c253980d6eed25064a884f442d3569f3b47e33d32baf8f9cba8b4d19f8f4acb9788cd31a64846d7c754a5b90bcc54daf34728f3f6110b38ad39f6b2566d540125feb35348b1c4a3f33daeb1aed61f08b38285de2fa9d00df3c2a3a06b447c14e4651f083de022ea12432960d826f34f5e3e2425d42875e5341af1bde370b063a2d52c606cb7065cbcc0261f08feb191fdcb0dcb57b7c5d8a81e14f403ef0bccc79675e0611d6d5ccaa77701d8ecceada080d77d2", 0xea}, {&(0x7f00000014c0)="850894660a81f5780d46e8c314fd4fecc299afdc320b8b6e927cb705781c371df6c2003676276978a97d93389725a2e956d6b40c5e17c9190b3117ec5e262c386ee11c8c35a5c104efe2947531d1bd7120036db8fdb39cacb5c5960a481a27853bf213284fc4b726d875c2757c0a8392261b2b77f98aa28c936fd89350f620ee1028953003d49ad261894c32ee055dd463b93301d0073db4f5f57a222d100b9107e51fe247cfc35a9fffe86fb60ad294d342a2c384f1bd06dcadd18ffb91d3a7308ba0e3f68f3cc79e6832cfe68f", 0xce}, {&(0x7f0000001840)="58f8dfc7abf9cca738a8864945d041c92c4abd4e1c5f5154c328fd3166ff3ee80e7a8e1bc0c0b1d015e7182f8a223518aaff9799887e6e65301f72b84d067853aa6ddcf26329dbc2ac2e0a119867cfabed1fd3b80a2a49dbc4aceeb1047c5935d01f7005ddcf04ed328c6a0956da786fc0e48497968f3e178bbd74cea54f470762e7ca281c4e051f5cdb1321077a201696d630e2987d34312c265c13e003bdb526c5a132b7418d4de0a286842a4baeca4f9d59694daf9819b259990f2688a7e6f996933a16bf4911f2ec92f1920105a996c59707e53662774a8d87901a35", 0xde}], 0x9}}], 0x2, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
openat$zero(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800013, 0x12, r0, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000140)='fdinfo/4\x00')
openat(r1, &(0x7f0000000080)='./file0\x00', 0x109140, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigreturn()
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r0 = fork()
ptrace(0x10, r0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ptrace$setregs(0xf, r0, 0x0, 0x0)
exit_group(0x0)
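# The next program appears to mix a single AIO pwrite (io_setup/io_submit) with overlapping sendfile calls on ./bus, including a self-sendfile on the same descriptor.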
r0 = open(&(0x7f0000000200)='./bus\x00', 0x141042, 0x0)
r1 = creat(&(0x7f00000001c0)='./bus\x00', 0x0)
fcntl$setstatus(0xffffffffffffffff, 0x4, 0x46000)
lseek(r1, 0x4200, 0x0)
r2 = creat(&(0x7f0000000440)='./bus\x00', 0x0)
io_setup(0x1ff, &(0x7f0000000400)=<r3=>0x0)
io_submit(r3, 0x1, &(0x7f0000000540)=[&(0x7f00000000c0)={0x0, 0x0, 0x0, 0x1, 0x0, r2, &(0x7f00000002c0)="da455864eeb9167c1ec76189e57e50c68e739ea84e4043521453c701ff8f874b7856a5a7434f6091814e0c7f4f80d24902643c3d4076752752ef6d2b215fa2a85b69f7e26f3abbc9a62fa52d09fd1f96d8ef9bb84710faa5b92bf6c168c6c46c66ee6c8e3fe567dd749027291261984989f79cee47a0d6f29044bbb70ef418a052f1dfe9aab165af7d3b7f7cd5e21cd5b2b3e62b7223290a2aaa95a4045fc2e6247ff36502396ca21c18a60fe6f4699dc563d8b40c84269a24486af086b392ce38e2c573d683282f3a9a345c8dcc7483eda3a26183108d8b16b0c4c651625c833462f9b5ad5e6b276ef0f6b2dadc5e48bc2f736a1e97e4496b770d515271e0c1a32713988e3e6086ca4694973913d27316b6e1d324b155314b217f991a23e3c04cdc00b1e6bf2a912d88", 0xa9f9, 0x7}])
r4 = open(&(0x7f0000000140)='./bus\x00', 0x0, 0x0)
r5 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r5, 0x0)
preadv(r5, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
sendfile(r1, r4, 0x0, 0x8400fffffffb)
sendfile(r0, r0, 0x0, 0x8080ffffff80)
r0 = socket$netlink(0x10, 0x3, 0x0)
recvmsg(r0, &(0x7f00000011c0)={0x0, 0xfffffe53, 0x0}, 0x21)
write$FUSE_ATTR(r0, &(0x7f0000002180)={0x78, 0x2f, 0x0, {0x0, 0x0, 0x0, {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xee01}}}, 0x78)
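# The next program appears to ptrace itself: it queues SIGSTOP (0x13) to its own pid, then issues PTRACE_SEIZE (0x4206), a set-siginfo, and PTRACE_LISTEN (0x4208).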
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = getpid()
rt_tgsigqueueinfo(r1, r1, 0x13, &(0x7f0000000080))
ptrace(0x4206, r1)
ptrace$setsig(0x4203, r1, 0x0, &(0x7f0000000100))
ptrace(0x4208, r1)
pipe(&(0x7f0000000040)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
r1 = syz_open_procfs(0x0, &(0x7f0000dec000)='smaps\x00')
r2 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x3800004, 0x12, r3, 0x0)
preadv(r2, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mlockall(0x7)
sendfile(r0, r1, 0x0, 0x320f)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='fd\x00')
r1 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='fd\x00')
linkat(r0, &(0x7f0000000180)='.\x00', r1, &(0x7f00000001c0)='./file0\x00', 0x0)
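# The next program likely probes fd-type confusion: a TCP connect on port 2 is followed by dup3() replacing the TCP descriptor with an ICMP datagram ("ping") socket.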
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
bind$inet6(r1, &(0x7f0000000500)={0xa, 0x2, 0x0, @empty}, 0x1c)
listen(r1, 0x0)
connect$inet(r0, &(0x7f00000001c0)={0x2, 0x2, @local}, 0x10)
r2 = socket$inet_icmp(0x2, 0x2, 0x1)
dup3(r2, r0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r1 = eventfd(0x0)
sendfile(r1, r0, 0x0, 0x81)
perf_event_open(&(0x7f000001d000)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0xa, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
preadv(0xffffffffffffffff, &(0x7f0000001400)=[{&(0x7f0000001440)=""/4088, 0xff8}], 0x1, 0x0, 0x0)
clone(0x4000008006ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f0000000080)='stat\x00')
r1 = socket$netlink(0x10, 0x3, 0x0)
getsockopt$sock_cred(r1, 0x1, 0x11, &(0x7f0000caaffb), 0x0)
exit(0x0)
listen(0xffffffffffffffff, 0x5)
preadv(r0, &(0x7f0000000500), 0x37d, 0x0, 0x0)
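# The next program injects one crafted ethernet frame: an oversized ICMPv6 NDP router advertisement (type 0x86) carrying apparently malformed options.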
syz_emit_ethernet(0x79e, &(0x7f0000000100)={@random="c757cdd5bf05", @broadcast, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "cd8d24", 0x768, 0x3a, 0xff, @initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @mcast2, {[], @ndisc_ra={0x86, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, [{0x3, 0x7, "9d78aa840638ae9e976fb2b5eb2cb112aeac5da27acdc80ce28a1a574e97297852b37585e42a9e209963bfea8bbf1a3abf3e3c04805c2ee4579138e760"}, {0x0, 0x8, "826b4b0bab4a6ae5b5ec6a936712e5c89fac4115b32ec7dd464eed92a7722fd3a506be668ba1515d992701ed7e1f730f8936a8ea928ca8265540fe2a2914ba2b6b8f8aed4e"}, {0x0, 0xda, "39932e77f7901a2ae9099fba2394e5fe135a4dc82cb47d2ee8a2cd085b628a83ba55fbdb2dbc61109c9533ff1c28e5e7bdd032a8840f797ec9444be9ad3228764f286aa659b900557f4755262d6772cde4f692121b7a71d4ddebe9efacd4ab154d7d034979d50af84e6e25c4785d163d62077a522e9711c9d2b9af31be9557e449af1967c71e56f134e09581432f8b17e7856efeb0067a1ab52ae3fcb992f90b76e9a24978fae84ca77f552856b2a7e662b3042f5ac2a39f337693285728f35739b0f2b314f096522e7f1b2c4fbdf6c22ac2def7ea6f8440aeaf911e2bb6579f681d8ffc8c2543a1413ff29d6931457efed2b5cef6a90ffb579539027d867e0883a02031812c1beda83a51b29086d461fd0356466344640ef4a3897197c4fb3135c28d524eab4db014fdfbec70c65c7f378035d10a3b06be51a8f29f88892350718c5c21f361088e11c833afcb4a02e2993dfb778ee05490f1a8d440d53a468dfe3305c171c3a3c0114fedf43500afd6df0be141f832733f09f21445ac9bf5d3166430d948eddf460a9c26e34a50483b8ba43262b9d841d0620be2acdea0bc74538c16294f5b6841d2e34d06d4dac79ec463f57eb76d49d170705bb82228d48a9ea965e9d57db5c041de1bc7dd11aaa894d6299d381a2945f1c5efcde13be9a7d5d96f3c9a643c4c4d9039ac76794e8018bad00581704fe2c2dba3b287f4f160b9be10f2d276053de0af2771eadd21040fc52665b4ed772698f2a536687df167c80fe81ee510867ac1226b66b7e5b28ed8a7f160f75569b43cb54f588697c849fe56f4455a40e3f051dd5019f5c8884d7a8547878c82ecf5dd3e9eb648413c3c75c8b5f208a80b3e9a9062f8d4bb395ae74cfb3176eef28c62de584856bf836a2bdbe70efd5a4310d53df2316d9183b2a4490cb4e9c26ba98b5586d31673c69864f64a8c1bba2dc118b7320f14516da06be59e32a3c12f452cbcd2fada7ff0141bc7ad6e54c166973f9634d4163965369219fb9380e7468779f00fda5d34f5f458ae3df0bc8f0b5d25874732fd6d52d12c44d71f43147ff277318df33e6f84c46aba0449ff6abdf6611803f9642fdb5314beed6bb5761538b373d77e9add355ab401793a564a3fa5a6b31ff4f45f401b2aa607d38d695034295241961c0e851aac68a47d863af7abf32e53b88acc16c61cb324eaee693c3b868ae0cea073d10bed9ab7c8b3a86f94302ef12f3c1ca23a3fcb0f18376db6ffe12302fabdd0ec6f1c759e4addc49f07895dcbc80fb33ebbeb61acf149031994b9c8dd7abb90e2f43f006a0016f1507d71be05511e195660d55f6ed96c704b1f69ecf8084a5ade6cd470a710f64681419519b007fa8a55d01ffd09a5a1c801708a53dce16f0c0d7c9bc2f34380602d5124a0d45a2aa67081988b10f7c9fe606e69073325d2b9e1a55d436d277bab5652d5c53761ce050e49285f39bbb7493b2d66bdd0262139ba79fd7aa7c3e86b365617cf3bb58f4486ca30dd9eb1557f8f306b95810030de0616f499e2a238cab69ec562feb7106b8f9dd47d6d26eed51cb98797e0dbad7b2cd0852da6f761f3a71cebadfccaacc178e0d014ff6bb52bcf0887d56559ac2d846a4928eaf429bde778214c89b9df6f5fc5170ed5969f4778dbbcd8b65260d7fc3c11dfa607312df46a720ae1d24343de5c798688f78202e72dbb10744e925a55cb298a51778277aee5afcce5cdc383b5c6b9cfa21dfee8d2e420ee6207c77ef481ac070b0700e4d9d6ad7c854972f2bfb1c5a877dc05ec0ab8a7aa74eacdf13ed478e3172cb2f6bf78269c164776405a4d8bfc7611e9abf1820ea8bf12e0be1a2910d5cd8d7c1de17ac49b91e660876f4c93f80d28aa2ff3b04726a999e98539de796ece5b4061ddb4b8b2c87f6d83af335accb926b7b7bc2dcf52563d279430f958d8c66d90abcc7ed18a91450b88cf418e72985b6d762c5b90c22b96a7e11b2de2494e56b6f5d175bca6ca59b8ed49ae1051fb775e79f210ca62e3a6c865890b2dbb8c5bd096f36c091091411e779e18f036ca94716e697af552d88146144eccf6314f3ca0f297175cad3fb16b3494949738a388e439779008b2c6a29eaff2d13b571a80e29def5c41ecb672ba0539e0e259f8550b1978929e8af740da55e7feb230a88ece56697b77acfc920324521334e7bc849cfd1529651ea460a255ad2ebd320c6c576bec869cbeb0b99c368f39261a7b6e557d8e6bcda07c01f36e030b0f863c8e89d88117501b81d33f5a06d16989669759a1cf47bcebd89b384f290d77c7f434d10d0992f207b4244235de585891937db7cbd9333ca2b1b37ec502ccbb363580c7a3cd432acc77bbdb2dab4c0b81fba3d645263c0b3ea7fb38bf033d48f3f334411418b773c91af31157035e889ae17a31ed507a04e9ec3fbc4b472345972e2b88188a5428f11c8442bef51f"}]}}}}}}, 0x0)
clone(0x38004100, 0x0, 0x0, 0x0, 0x0)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000080)='uid_map\x00')
write$tcp_mem(r0, &(0x7f00000000c0)={0x0, 0x20, 0x0, 0x20, 0x1, 0xa}, 0x48)
r1 = inotify_init1(0x0)
fstat(r1, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x0, <r2=>0x0})
setuid(r2)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigreturn()
r0 = socket$inet_udp(0x2, 0x2, 0x0)
setsockopt(r0, 0x1, 0x8, &(0x7f0000000200)="0149e488", 0x4)
setresuid(0x0, 0xee00, 0x0)
mlockall(0x5)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000100)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
fcntl$lock(r1, 0x7, &(0x7f0000001b40))
fcntl$lock(r1, 0x7, &(0x7f0000000040)={0x1, 0x0, 0x1})
fcntl$lock(r1, 0x6, &(0x7f0000000000)={0x1, 0x0, 0x0, 0x0, 0xffffffffffffffff})
r0 = shmget$private(0x0, 0x2000, 0x0, &(0x7f0000ffe000/0x2000)=nil)
shmat(r0, &(0x7f0000ffe000/0x1000)=nil, 0x0)
socketpair$unix(0x1, 0x0, 0x0, 0x0)
accept$unix(0xffffffffffffffff, 0x0, 0x0)
flock(0xffffffffffffffff, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
getitimer(0x0, &(0x7f0000000700))
syz_emit_ethernet(0x2a, &(0x7f0000000040)={@remote, @broadcast, @void, {@ipv4={0x800, @igmp={{0x5, 0x4, 0x0, 0x0, 0x1c, 0x0, 0x0, 0x0, 0x2, 0x0, @rand_addr, @local}, {0x17, 0x0, 0x0, @loopback}}}}}, 0x0)
clone(0xe5006100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = gettid()
mount$fuse(0xf0ffff, &(0x7f00000020c0)='./file0\x00', &(0x7f0000002100), 0x0, &(0x7f0000000040)=ANY=[@ANYBLOB='fd=', @ANYRESOCT=r1])
tkill(r0, 0x18)
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
capset(&(0x7f0000000480)={0x20071026}, &(0x7f0000000080))
unshare(0x4040000)
rt_sigreturn()
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
bind$inet(r0, 0x0, 0x0)
rt_sigreturn()
open$dir(&(0x7f0000000000)='./file0\x00', 0x842, 0x0)
link(&(0x7f0000000080)='./file0\x00', &(0x7f00000000c0)='./file1\x00')
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
fadvise64(0xffffffffffffffff, 0x0, 0x0, 0x0)
exit_group(0x0)
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000000100)='uid_map\x00')
write$tcp_mem(r0, &(0x7f0000000000)={0x228a, 0x20, 0x0, 0x20, 0x1}, 0x48)
r1 = gettid()
setreuid(0x0, 0x0)
tgkill(r1, r1, 0x10)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
setsockopt$inet_int(r0, 0x0, 0x2, &(0x7f00000003c0), 0x4)
rt_sigreturn()
r0 = signalfd4(0xffffffffffffffff, &(0x7f0000000140), 0x8, 0x0)
unlinkat(r0, &(0x7f0000000180)='.\x00', 0x0)
syz_emit_ethernet(0x22, &(0x7f0000000000)={@remote, @broadcast, @void, {@ipv4={0x800, @generic={{0x5, 0x4, 0x0, 0x0, 0x14, 0x0, 0xfffe, 0x0, 0x0, 0x0, @dev, @multicast1}}}}}, 0x0)
clone(0x200800059fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
getsockopt$IP_VS_SO_GET_VERSION(r0, 0x0, 0x480, 0x0, &(0x7f0000000240))
exit_group(0x0)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
accept(r0, 0x0, 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x3800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
socketpair(0x1, 0x1, 0x0, &(0x7f0000000040)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff})
sendmsg$netlink(r1, &(0x7f0000001340)={0x0, 0x0, &(0x7f0000001280)=[{&(0x7f0000001e80)=ANY=[], 0x20001290}], 0x1}, 0x0)
recvmmsg(r2, &(0x7f0000000880)=[{{0x0, 0x0, 0x0, 0x0, &(0x7f0000000200)=""/71, 0x47}}, {{0x0, 0x0, &(0x7f0000000780)=[{&(0x7f0000001ec0)=""/4096, 0x20002ec0}], 0x1}}], 0x2, 0x2100, 0x0)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
sendmmsg$sock(r0, &(0x7f0000001980)=[{{&(0x7f0000000000)=@in={0x2, 0x0, @dev={0xac, 0x14, 0x14, 0x19}}, 0x80, &(0x7f0000000600)=[{&(0x7f0000001a00)="08781a220df0253e", 0x8}], 0x1}}], 0x1, 0x0)
recvmsg(r0, &(0x7f0000000300)={0x0, 0x0, &(0x7f00000002c0)=[{&(0x7f0000000100)=""/18, 0x12}, {&(0x7f0000000140)=""/148, 0x94}], 0x2}, 0x0)
clone(0x41fe, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_tgsigqueueinfo(r0, r0, 0x10000000016, &(0x7f00000001c0))
ptrace(0x10, r0)
ptrace$poke(0x5, r0, &(0x7f0000000040), 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setreuid(0xee00, 0xee01)
semctl$SEM_STAT(0x0, 0x0, 0x12, 0x0)
syz_emit_ethernet(0x5f, &(0x7f0000000000)={@multicast, @empty, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "a0f009", 0x29, 0x3a, 0xff, @remote, @mcast2, {[], @ndisc_ra={0x86, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, [{0x0, 0x3, "a78ce540cd4f791153d5dea6b259fe8000000000000023"}]}}}}}}, 0x0)
semctl$GETNCNT(0x0, 0x0, 0xd, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
prctl$PR_CAPBSET_DROP(0x18, 0xfffffffffffffffd)
rt_sigreturn()
io_submit(0x0, 0x1, &(0x7f0000000540)=[&(0x7f00000000c0)={0x54c000, 0x0, 0x0, 0x1, 0x0, 0xffffffffffffffff, 0x0}])
io_setup(0x1, &(0x7f0000000300)=<r0=>0x0)
r1 = openat$tun(0xffffffffffffff9c, &(0x7f0000000480), 0x802, 0x0)
ioctl$TUNSETIFF(r1, 0x400454ca, &(0x7f00000000c0))
ppoll(&(0x7f00000002c0)=[{r1}], 0x1, &(0x7f0000000340)={0x77359400}, 0x0, 0x0)
io_submit(r0, 0xa, &(0x7f0000000600)=[&(0x7f0000000180)={0x0, 0x0, 0x0, 0x800000000001, 0x0, r1, &(0x7f0000000040), 0xff66}])
syz_emit_ethernet(0x32, &(0x7f0000000100)={@broadcast, @link_local, @void, {@ipv4={0x800, @dccp={{0x5, 0x4, 0x0, 0x0, 0x24, 0x0, 0x0, 0x0, 0x2, 0x0, @empty, @multicast1}, {{0x0, 0x0, 0x4, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, ' i\"', 0x0, "f51b64"}}}}}}, 0x0)
clone(0x4000000206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
ioctl$TCSETSW(r0, 0x5403, &(0x7f0000000040)={0x0, 0xffffffff, 0x0, 0x0, 0x0, "00020000000000000000001100"})
r1 = syz_open_pts(r0, 0x2)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
sendmsg$unix(0xffffffffffffffff, &(0x7f0000001980)={0x0, 0x0, &(0x7f0000001740)=[{&(0x7f0000000280)="1131c06b799d38928bc5b29a3f4cfd285daae15385a2cf13659abf1444c1245f1be18425acf9e5f70b926f21971e2509", 0x30}], 0x1}, 0x0)
write$binfmt_elf32(r1, &(0x7f0000000080)=ANY=[], 0x43a)
exit_group(0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
bind$inet(r0, &(0x7f0000000040)={0x2, 0x4e22, @empty}, 0x10)
syz_emit_ethernet(0x107, &(0x7f0000000480)={@link_local, @remote, @void, {@ipv4={0x800, @udp={{0x5, 0x4, 0x0, 0x0, 0xb0, 0x0, 0x0, 0x0, 0x11, 0x0, @private, @broadcast=0xe0000001}, {0x0, 0x4e22, 0x9c, 0x0, @opaque="8174b9193f286b3694a6d2dab8860c3187b14e476b3fc791dd683f23da29134646368eb023362bd91f67d60b479de11a32b6392be503d04524a59c543780b27e3f61fb3cd61140da8463194ec24c3903380b119e042667f642c06223285f65a0e288897bf9f0e6389673085b6246d182863eef4a937d7d29c17cff6d3fb06aad9d285571165328714482ddc31e0f66d51754a31c54bc94b5b6bb19da9f35fbf0fd81ba6efb7147085ea52d8371ba870bc1a8276d263a4d7c14c06d456762ba623709a184dacfc4b2f15ffc17688b280f384080d8cc34e6440c5227b731a076c03d97867995b74282c939b437eff2f75c11119b1a0e5d2aaaa5698d47563c0108b26faa15a57b9b36dfaf4718168a37efcdee6253f6e6669abae6d6d894f731166fb676e71c0a7b7143c1c97f4b67fbd8dafa840b4ec81bd7ddd5ff2a23804b285987750ed6ef22e1538e386438b7a8f7e8a5f40c0d6306db9dce627c7ccf15ffe6a402576ca1dc009be797e9f49d8bf579ec1c47c9ad53361db3669b96046a3317833109a2ba1bdc1f61e054a3e66890f3076a4c097b2fe0aeab47d048e643b699a4f8d34e17172bc52eb74a5f1966243361be032b676e0d1e80678d90731d988c7293199f2d1784427fdb89051d3150439fc4e62948b692fab64b24f46ac5a87ecfea676cd281c7a3ee97a92bdcfa31d3d95e0513d55e49556b1b2f6ad837ee56f62d0dfb9946af4f25c6379d2a38ffb66260af6d32672fe245ff08bdef329b41ef2b9a59df5b1d4a989954330b704bbd45d08daf36bcb3886359dc9bcf43f3d033ef5300a74d642fee3814087abcd89cfbe6da7b9c7f9aeb4bc90f35c853296ba2be3434789a9a8170a91b0288019db035ec6d973432b969db0a721208e4d12b079c3f7ef586739ef4bead38781e9ce950ae170c73ff4081bbb617d45564a8f2ef44405d4a55b443c7fa091d81e97bc1aa43fcf9b60d5aec2752dfa9c5ba49adbcdcb8f4129f058126dbc9d362f5a4d67cbd26961c56d80142ec1a3a6fcbce799811da3951b9fb14d87969335a3fdcfd21ec3e2400883782bcc836de26ccf7b28e4759765addbdef82ee8ce9f2e4f8967d63534748165a6fbc6b2d99588d6f0858e63a7f32e4598fde5c6793e5fe6bdcc1c7f7cf4023676f714fcbf5a9ccaee244be6544f1b885ed3d9b48cf922c54056c4efeb02b51f3cebc358e5009836f4746108b5616d25e493363df996cb72118cef6b51a9a13548157483d13339e7054271c097812a5767b335a55d90ee0beccb012eb0799cd64489d1fe12e84a077245f5f84072e5452ceba20eecf7c4c4d4de7fb51a46325f383a64376d93380cf20c402af9dd6b6a9418411477d34bb3335c903a6654748bdb8e7c271ce3e950c27088560e1b2f9763ae1969001a300bfd5e3e41b55693168a75ca8c14f7710d880f5f53a563b2b4d6639072850454bff58719173c0e568fb3d56107fff078a716b2a9702805fcc59da4b45704bcaa9d00fcc3f0f8336c4de0fda0251d2588e7d3bc62934ca5007c756071b7ba7f0da9f874149d13aa33b0ef1086c05200680824a32794998f6daa01995f4d3b9880839d1a7f67dbdbd45b5a4493b21cecc08f8114369509475b75eec1aac7b49837e5cb524b4cb3f34c52dd9d40ff5241c02b1f2bc03139255fefc5280b2275cea5c3c720b3ada2186c9409e97c955b722a6b8f00c1b30ca3efea6642cc146ffdbd030040d329ae3cf9020844bf6244f6106e63bd109829cecbae7b17695fbeaecdb7f5dd61f92d3e0f27a32f0f96be5654bf2ae19244265ab1e7430a166c54339f2d283c3f2eff163951874b819b1ee0a8ef9bb3529868560ebe9221820e3e1f5ac61ed28512e1ed737cd9bc7657fe01699226921a778603d32d9b6894436ab11348b9f27054d3452888a831f3efaa274e78d2ece9ac02c18602a91e09556f804fe1700ad85c191ae514a5c84c38d2c47f81192b7e276afa216d59301b1ce06cd37e4bad93b3890b8dd27d03ce6a225153261a0748af9bdf8bab36b741dde885170071e40bab43ee348cdb9880f0231df59a383829790f0f3e41373027763c26e44bf87e6c135cadd937ffead6ad4799d4c25572016abad88460b53dfa1cf323020d3974ebd101c69510384d058c37f8a64ffdebc5b26c02833c03552dfc8e832b398d9aa6ac8a68a9fa11b741840f9c308139786728fc5a80dd79c86ff97e6aa3dc4c2bfdb8fcdce1da3d1e58aee227702d369738832e0174d72af32576623e0d56e064e3cfce292a5d9365f4b93aebceab58194625b89609fefc48b488e3fee7340fec0425ba695149e80a160955e8275694198b006205e082c95a2f40e58d51808ded1c665ac738026b096528327954a72aae30d0588987f332eebb88d9200b5b9a2ea46c59cf906abbbd77a3ee408d3ba39988953aa18846162ad5cc98b39182a4b79ff544e36a9895cae23452fca675502f03656aa823c7f9aaa28b358f8fa01490cec55df73b620e062d5c6eb34e2128f9a624f651b66a38336f8b4aedbc7d70b19977cad8968d258fea896eb8eeba3a5f5da3406a53103cedcba171bc9f30d20ca20565f56c8912292aa4ff8ed917996cb42777c88b5cf1a67e075dbfdf660aa89055c13d3927f63a99d3dcc6ed283882b91a2a70107d57b9c258ecca08f6d79738353be1f7814a7de3c3698cee72f752846c7bb917f36b7a185418a0ae4639292c6cfdd582ca3f3fa22d424f1f88129854a27a2de4d37aff4cdfd955c20e999af304157112203526ce55b01798de3883daa72ddfd23988b512e7243e31fd468bf591b02fc62e1e5253a8810844baae22defaff6650b3c8a0e7d291367ae212e3e28c70f88aa0cbc804d4ec88a5a9b76d305aa4e99262194fe405d175b3c00524a4fa033aee81f2c92392993077f4a59a2aa8770929ddbd4cf616a05304d7289662aa512992430b038b134b567ee4b2e3775cddfa5611b70c461dc7f18c028c853dc5ee1cd8b39a0732a5fff3ffdd51c05b0c6844794c8d24272856f7f07ac2ef3f3372d47a2224a93dc1e51d5b426f3eb0349fbcbf6149aa77523e0d693aaef51b81495d934501c428fa7b98cde4dd71bc4a6e72bcbdbd54a6145b4f960fed065812cba48b88397fa877bba7f11a27efb0bbaa2f700e68f9afb254975ba8ca4ca985007f19ba439ebe68df5b94d616c18c67a790a28f883d934935e288c297ca9939c22778bbbd3dc91a4be5ed959043c43b1608fbbd9f7750edf4e7e44fc85bc98a2a35545e1f90f3c2566eb167598dc9efab9b70f4bcd895cc563ac32cac459f6f8be191bf98af71591356c855063c0bdd5274f19bfdbd9eb0a2caf6938571b8d544095c5427d468a4d11612b50c32aa3427947710f709828d83bc61608efb3825d50ed19ecc67bd95d145b23aca20828fadc5f4a70ff04aaf7e845e3d5f01342629fc35d19a744f66e10714f297631c9359c4aa94fb7fcb76ba5b0ec88d7dc77d87a79ab2e8573882db4877de92a6108292ec11e331fbc89541c49ed8a16024fc9d91a0ca8c5cbd193f331a52fb5ea9154e187285345be2d7198da1de0f332e8a4d2a5ad1cf2a6ae588456a9b5afc6d1357b70af3ab0a0da2700df487309357e9bc282920e46ddd00f59dc62cc39302f769fc7edd1332882800985bf7f35f2f7a0f61c98f47108116c49ae5714f7b408ed5dc956a507072128f8e38db48e8b88cc5ed15bd6402828b495864bdf0c5cc5dbee67b75a40bea3a1a84d617e9b12cbd6f062679ea2aac1bce0ba1170aa993bf3efedc8edbc28e6df8d61cdad13cdad3b94018ffba8b138605fd690110c65e1210801decedd08217587d19a177c5390ab02288449c3e25bb6aba285098e87a3fc1f0e427e70f7f8f8adbb34abdc1d993570d0e468c7fcdbda2ad06fe076194c4a95d02fa6503aeeb26986dc94816efe272d16d74495d565e7fe7728f4fbb8d9cfcf9d81836653ec6352e455791b30f38b7e33f34635371c94a71282c66f14ef2092c36921a941021a8513690696d28136683b6b26bc19b6783957e62c98fb6c71ab1f141ccea59b7094d6d8d3bd8a502b65e885298199ace086d23d92f1d23de85b1aab534b4e077632da83d09ba4fe69313349b16a7de4098cbfc3fa5a3a0c848e00ad3427fe7024681284d5caa191f90c0ae652e4d1ba1b4b13b1316dd0a2d58aadc16ad594bc6510c80fe82d99ba22b7ce445c14685532d7b2225d41209609f27437f5e1ebbb967dc2ab74bd10e4f3e938fa08f94a550c53d067d6b404177694bb41e36d72f4c3200da660f4b87cec0ef3e9d6eb47a8ff24b9b052c64aa39b72d6668f414c410b97eb00ffd1729053497bcc06f6ef81a0eacfcd7fdb235a214d9d671303bf8c38dd949d98c9bedec93a9f6b2eed7bd4f64c2a431ef1282662be908eb1e5ea8b4e525737da5e809f3f17593b07c27cfd1fc63e8b879a8ec0cb9c142335ca5bc1adbfcf4546c76f6d006bee28f1301c5e605394a6bd01ca1b9e301e047cdaedfeee8f4c3ef9ec52da03a126f41e3c396ab9d3c0267c8ad28fefc363b0ee5a06fab2ffd9c840bdf69c01a7e5840192fd4851ec5ba71b44f275a8896f28851bb809fe803830bd700df6876c5a12d9a0a2a79b518e32560df803d4e0b226b631df29e26e217d7ba3623fab7724b6ea95a5d2539ab01eb2c6d6da84eeecc2c7df0dbc2fd0bed12da656f5a5c3f71f29729d02ecad1df48e86746fb0618bf93cfeefe72e5c1f7baa9d7459cbf232052f10de99b6094d7513b2150874996a45bc69dc2d3e7d28bec48cef4f537995d523874bd189c67a4b3292384f6fdd99dca25aafdf50041cb8a7b88d193fff905b596de6f068bf4295b55ab78f00c3d50fd2323237e374bd9b3d8f820183da9b45c196906538457678d7f459c48522b543c2c4d7a9fe90ab94e306dafef5d8488d328807e681853c6bba12fadef71b5238cf509a747b2b6f2c43bf435ba6dcd2209302ffc196be44338af77c918587198e3ad710c710aed564fcf79ff210dfc7050b88ebfb75f62052eb4b4638c959398e8c7e510be21ba9984b4c80f991523249d62f354094e2f31c134d3c60e81afbec01142d3380ae5992b29f6b6758e95c93f38753fb3afec4f62fafb4eaee86331672a104a2bac3afae6acc2e0117678340310312dc51f72d7b840f308bc22adf251670c8566974c67a1da5238b98eba9a0d391dc4fca1fd29c93ca13ae67dc67a35e18fa1fcdfd5cdb5f733980ca9f2f04fd4d0728830eb7dc8c65b5e4bf14694d2aac21dfb88f7d50d7579498dbde5ebd063fc2f7703d1c322e72c7a69a0afb7e578e4ab4a650ca8333297ffa49e6444469ecaefbc810d22d2d0edbe28bc5e14ffb8000ed50e3fd3d1fddd27f32ab7c38cb7342ca9ad0d451efcf878eb5748fb65f11829efa4ebf600fe1e747dfafdd3a2089c6a3f73ec3c4dd3ec12982017bb7648e3934c0cd12b18c2044c3cc422dd58ca4bcc8850fe1afa0aba340470a0f13c5d3b1366a5c7c3f9a92ef6062123d1dab1873573d6ae2627441a6b3f003294b203fc041ad10c95a21db04d54fb94a0897b372b503866955009114d3ed23e4e69ec2790aabe99a7c390724dea099e296b43db2b1281b73708d757cad1a34e5eb6dd97a45a6accc7b687cb5095e55e98cd5757eaba533dbb0e310a7047675bb0018b51e12fc8df5c3ebbb8c07a62a6555ec3e7eb274e8e67fdeeab880be1ef516769c23926f85ab0fbcaf0f5ebc270aa3e58be53833c75bd431"}}}}}, 0x0)
shutdown(r0, 0x0)
recvmsg(r0, &(0x7f00000017c0)={0x0, 0x0, 0x0}, 0x7)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f00000000c0)={0x1, &(0x7f0000000080)=[{0x6, 0x0, 0x0, 0x7fff0000}]})
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000001400)={0x1, &(0x7f00000013c0)=[{0x6, 0x0, 0x0, 0x7fffffff}]})
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0xe, &(0x7f0000000140)={0x1, &(0x7f0000000100)=[{0x6}]})
semctl$SETVAL(0xffffffffffffffff, 0x0, 0x10, &(0x7f0000000000))
semctl$SEM_STAT(0x0, 0x0, 0x12, &(0x7f0000000100)=""/115)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setrlimit(0x0, &(0x7f0000000080))
r0 = memfd_create(&(0x7f0000000780)='[\xdbX\xae[\x1a\xa9\xfd\xfe\xad\xd1md\xc8\x85HX\xa9%\f\x1ae\xe0\x90\x93\x12l\xb6Z\x95\xab\x00{\xe9\xc2Y\xd1c\x81\x9eG\xf9,\xe2\xc6a\x8b\xe8\xf1\xb3\x86\xe2+Op\xd0\xa2\x8a\x1eb;(\xb5\xe1jS\x96\x91%||\xa0\x8ez\xadT\xc8\f\xe5\x89\xbf3:\x99\x1e\xac`\xc3\xcf\xd3\xae\xd2\a\x11\xa9\x95\xd2q#\xc6\xca\x97!*\x886Ka\x13\xf9\x0fSe\x9c.\xf1\xcd\xd7\xdf< K\\\xb7\xa0\xfbf\xd1\xa9\xa8P\xe0w\x8d\xc1\xa7\x03\xc6\xd1\x94\xf3\x90Y}\xa7c\xdeN\x06\xc8\x91\\\xde\xf2\xea$\xdc=\"\xb2\xfdR}\x84\v\x8a\xde\x92\xc9$\xc7\x833\xc8\x839g\xed\xd5\x1f\xd5R\x80\xba\xae\xd94\xfc,\xebD\xd2\n*\xea\xc3\xee\xad\x15\xb6~u\x8fO\xefZ\xae\xf6\x7f\xbd \xd1\xff>\xcd\x181$d\n\xb5/\"]\t,\xd3\xe3\x05\xb5K\x01\xaa\xf0\xadGp\xafB\fr%\aom\xb8\xa2t\xbeh\x99\xa4m\xcf\xaeYs', 0x3)
fcntl$addseals(r0, 0x409, 0x4)
write(r0, &(0x7f0000000000)="ef", 0x1)
rt_sigreturn()
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x0, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit(0x0)
rt_sigprocmask(0x0, &(0x7f000078b000)={[0xfffffffffffffffd]}, 0x0, 0x8)
r0 = gettid()
timer_create(0x0, &(0x7f0000533fa0)={0x0, 0x5, 0x4, @tid=r0}, &(0x7f0000000300))
r1 = signalfd(0xffffffffffffffff, &(0x7f00007aeff8)={[0x3d]}, 0x8)
timer_settime(0x0, 0x3, &(0x7f000004a000)={{0x0, 0x1}, {0x7, 0xe4c}}, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r3, &(0x7f0000000280), 0x1, 0x0, 0x0)
readv(r1, 0x0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000080)={0x1, &(0x7f0000000040)=[{0x6, 0x0, 0x0, 0x7ffffff4}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x10000000000000d2, 0xd9f, 0x0)
lseek(r1, 0x0, 0x0)
clone(0x2000004100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
rt_sigsuspend(&(0x7f0000000280), 0x8)
ptrace$setopts(0x4206, r0, 0x0, 0x0)
tkill(r0, 0xb)
ptrace$cont(0x11, r0, 0x0, 0x400001)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
accept(r0, 0x0, 0x0)
exit(0x0)
open(&(0x7f0000000000)='.\x00', 0x10, 0x0)
r0 = open(&(0x7f0000000000)='.\x00', 0x10, 0x0)
fcntl$lock(r0, 0x7, &(0x7f00000000c0)={0x3})
clone(0x4000c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = openat$tcp_mem(0xffffffffffffff9c, &(0x7f0000000000)='/proc/sys/net/ipv4/tcp_wmem\x00', 0x1, 0x0)
write$tcp_mem(r1, &(0x7f0000000140), 0x24)
tkill(r0, 0x18)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
syz_mount_image$fuse(0x0, &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
mount(&(0x7f0000000140)=ANY=[], &(0x7f0000000040)='./file0\x00', &(0x7f0000000080)='proc\x00', 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800013, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigreturn()
readlink(&(0x7f0000000040)='./file0\x00', &(0x7f0000000080)=""/174, 0xae)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = syz_open_procfs(0xffffffffffffffff, &(0x7f00000000c0)='gid_map\x00')
rt_sigreturn()
write$tcp_mem(r1, &(0x7f0000001200)={0x0, 0x20, 0x0, 0x20, 0x1f}, 0x48)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
membarrier(0x0, 0x0)
clone(0x38004100, 0x0, 0x0, 0x0, 0x0)
clone(0x2000204d5fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = semget$private(0x0, 0x6, 0x0)
semctl$GETVAL(r0, 0x0, 0xc, 0x0)
exit_group(0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
exit_group(0x0)
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x20012, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
creat(&(0x7f0000000300)='./bus\x00', 0x0)
utimes(&(0x7f00000000c0)='./bus/file0\x00', 0x0)
r0 = syz_open_procfs$namespace(0xffffffffffffffff, &(0x7f0000000000)='ns/pid\x00')
fstat(r0, &(0x7f0000000140)={0x0, 0x0, 0x0, 0x0, <r1=>0x0})
setuid(r1)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
mount$9p_rdma(&(0x7f0000000040), &(0x7f0000000080)='./file0\x00', &(0x7f0000000100), 0x0, 0x0)
rt_sigreturn()
mbind(&(0x7f0000ffc000/0x1000)=nil, 0x1000, 0xbec4be7c5f483d54, 0x0, 0x0, 0x0)
capset(&(0x7f0000000000)={0x20080522}, &(0x7f0000002000))
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setgid(0xee00)
prlimit64(0x0, 0x0, &(0x7f0000000080), 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
mmap(&(0x7f0000ff8000/0x2000)=nil, 0x2000, 0x0, 0x39012, r0, 0x0)
clone(0x0, &(0x7f0000000080), 0x0, 0x0, 0x0)
rt_sigprocmask(0xc20d56580c0cee4, &(0x7f0000000080), 0x0, 0x8)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
prlimit64(0xffffffffffffffff, 0x0, 0x0, 0x0)
fallocate(r0, 0x0, 0x102000006, 0x6)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f00000002c0)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x0, 0x0)
chdir(&(0x7f0000000200)='./file0\x00')
rename(&(0x7f0000000080)='./bus\x00', &(0x7f00000000c0)='./bus\x00')
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000080)={0x1, &(0x7f0000000000)=[{0x6, 0x0, 0x0, 0x7fffffff}]})
r0 = gettid()
r1 = getpid()
perf_event_open(0x0, 0x0, 0x0, 0xffffffffffffffff, 0x0)
prctl$PR_SET_PTRACER(0x59616d61, r1)
clone(0x3102002dfe, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rt_sigtimedwait(&(0x7f00000000c0), 0x0, &(0x7f0000000040)={0x0, 0x1c9c380}, 0x8)
ptrace$setopts(0x4206, r0, 0x0, 0x200044)
wait4(0x0, 0x0, 0x0, 0x0)
timer_create(0x0, 0x0, &(0x7f0000000240))
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
pipe(&(0x7f00000000c0)={0xffffffffffffffff, <r2=>0xffffffffffffffff})
splice(r1, 0x0, r2, 0x0, 0xf3a, 0x0)
fcntl$setpipe(r2, 0x407, 0x8000000000)
r3 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
fork()
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
write$binfmt_elf64(r0, &(0x7f0000001800)=ANY=[], 0x78)
inotify_init1(0x0)
pipe2(&(0x7f0000000180), 0x0)
r0 = socket$netlink(0x10, 0x3, 0x0)
r1 = dup(r0)
write$FUSE_IOCTL(r1, &(0x7f0000000000)={0x20, 0xffffffffffffffda}, 0x20)
select(0x40, &(0x7f0000000ac0)={0xfe}, 0x0, 0x0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER(0x1, 0x0, &(0x7f0000000140)={0x1, &(0x7f00000000c0)=[{0x6, 0x0, 0x0, 0xfffffffe}]})
statfs(0x0, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
r2 = syz_open_procfs(0x0, &(0x7f00000000c0)='net/route\x00')
fchmod(r2, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
pipe(&(0x7f0000000140)={<r1=>0xffffffffffffffff})
fchmod(r1, 0x0)
shmdt(0x0)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
r2 = socket(0x10, 0x803, 0x0)
getsockname$packet(r2, 0x0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000040)={0x2, &(0x7f0000000140)=[{0x82}, {0x6}]})
r0 = socket(0xa, 0x1, 0x0)
setsockopt$sock_timeval(r0, 0x1, 0xa, &(0x7f0000000080)={0x0, 0x2710}, 0x10)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
setsockopt$inet6_MRT6_DEL_MFC(r0, 0x29, 0x19, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setuid(0xee01)
unshare(0x42000000)
exit(0x0)
r0 = socket$inet6(0xa, 0x2, 0x0)
bind$inet6(r0, &(0x7f00000001c0)={0xa, 0x10010000004e20}, 0x1c)
shutdown(r0, 0x0)
sendto$inet6(r0, 0x0, 0x0, 0x0, &(0x7f0000000200)={0xa, 0x4e20, 0x0, @empty}, 0x1c)
setsockopt$sock_int(r0, 0x1, 0x21, &(0x7f0000000240), 0x4)
recvmmsg(r0, &(0x7f0000000000)=[{{0x0, 0xffffffffffffff92, 0x0}}], 0x267, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x1, 0x0, 0x0)
membarrier(0x10, 0x0)
membarrier(0x8, 0x0)
sched_setscheduler(0x0, 0x5, &(0x7f0000000180))
socketpair$nbd(0x1, 0x1, 0x0, 0x0)
r0 = gettid()
setpgid(r0, 0x0)
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
setsockopt$sock_int(r0, 0x1, 0xb, 0x0, 0x0)
rt_sigreturn()
clone(0x100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mknodat(0xffffffffffffff9c, &(0x7f00000000c0)='./file0\x00', 0x1000, 0x0)
open(&(0x7f00000001c0)='./file0\x00', 0x1, 0x0)
open(&(0x7f0000000340)='./file0\x00', 0x0, 0x0)
perf_event_open(&(0x7f00000002c0)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x41c1}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = openat$full(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
pipe2$9p(&(0x7f0000000080)={0xffffffffffffffff, <r1=>0xffffffffffffffff}, 0x0)
fcntl$setpipe(r1, 0x407, 0x80005)
splice(r0, 0x0, r1, 0x0, 0x62ce9f1, 0x0)
r0 = openat$random(0xffffffffffffff9c, &(0x7f0000000000), 0x80201, 0x0)
pwritev(r0, 0x0, 0x0, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = memfd_create(&(0x7f00000015c0)='/proc/sys/net/ipv4/vs/ignonneled\x00\x00\x00\x00\x00\x00\x93O\x17\xc8\xc2\x9c\xb7\x95\xd4\xb1$*f\xb4\x86\xa0\x17\v\xa6wxb\x97_2\x18\x953\xde\x93A\xcc\xa1\x0f\xc3\x93\x8a~\xf0#|\xbb\xc0\x85\xc4\xca\xb0n8\xb71\x16\x85\x87w\x95\x7f\xe6\b~\xfa\xb81?\x0eG9n\xf9w\x81\x9e=\xb2ln\xe6Nd\xb5\x97\xc1e{\x86h\xc3\x00\x7f]\x871C\xae\x9f\x00\x00', 0x0)
mmap(&(0x7f0000000000/0xaa2000)=nil, 0xaa2000, 0x0, 0x11, r0, 0x0)
epoll_pwait(0xffffffffffffffff, 0x0, 0x0, 0x0, &(0x7f0000001580), 0x8)
rt_sigreturn()
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
fchownat(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x1d00)
rt_sigreturn()
clone(0xc0006300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$inet6_tcp_int(r0, 0x6, 0x0, 0x0, &(0x7f0000000200))
r1 = gettid()
tkill(r1, 0x25)
syz_emit_ethernet(0x3e, &(0x7f0000000040)={@link_local, @broadcast, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "c1bd74", 0x8, 0x0, 0x0, @remote, @mcast2, {[@routing={0x0, 0x0, 0x0, 0x5}]}}}}}, 0x0)
r0 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r0, &(0x7f0000003000)=@file={0x1, './file0\x00'}, 0x6e)
r1 = syz_open_procfs(0x0, &(0x7f0000000180)='net/unix\x00')
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r3 = open(&(0x7f0000002000)='./bus\x00', 0x141042, 0x0)
sendfile(r3, r1, 0x0, 0x800000bf)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
r1 = eventfd(0x0)
r2 = fcntl$dupfd(r0, 0x0, r1)
read$eventfd(r2, 0x0, 0x0)
socketpair(0x10, 0x2, 0x3, &(0x7f0000000840))
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
rt_sigqueueinfo(r0, 0xa, &(0x7f0000000040))
r1 = open$dir(&(0x7f00000000c0)='.\x00', 0x0, 0x0)
fstat(r1, &(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, <r2=>0x0})
setuid(r2)
tkill(r0, 0x0)
r0 = openat$null(0xffffffffffffff9c, &(0x7f0000000ec0), 0x0, 0x0)
recvmmsg(r0, 0x0, 0x0, 0x0, 0x0)
clone(0x10045be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = timerfd_create(0x0, 0x0)
timerfd_settime(r1, 0x0, &(0x7f0000000300)={{0x77359400}}, 0xffffffffffffffff)
rt_tgsigqueueinfo(r0, r0, 0x25, &(0x7f0000000000))
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
semctl$SEM_STAT_ANY(0xffffffffffffffff, 0x0, 0x14, 0x0)
r0 = gettid()
tkill(r0, 0x25)
r0 = socket$inet6(0xa, 0x401000000001, 0x0)
perf_event_open(&(0x7f0000000040)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xfffffffffffffffd}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
close(r0)
syz_open_procfs(0x0, &(0x7f00000002c0)='comm\x00')
open(&(0x7f0000000100)='./bus\x00', 0x1141042, 0x0)
r1 = creat(&(0x7f0000000040)='./bus\x00', 0x0)
ftruncate(r1, 0x208200)
perf_event_open(&(0x7f0000000200)={0x2, 0x70, 0x41, 0x8001}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
ioctl$sock_SIOCSIFVLAN_SET_VLAN_FLAG_CMD(0xffffffffffffffff, 0x8983, &(0x7f0000000080)={0x9, 'vlan0\x00'})
flistxattr(0xffffffffffffffff, &(0x7f0000000440)=""/138, 0x8a)
r2 = socket$inet(0x2, 0x4000000000000001, 0x0)
setsockopt$inet_tcp_int(r2, 0x6, 0x80000000000002, &(0x7f0000000000)=0x200, 0x4)
bind$inet(r2, &(0x7f0000000380)={0x2, 0x200000000004e23, @local}, 0x10)
sendto$inet(r2, 0x0, 0x0, 0x200007fd, &(0x7f00000008c0)={0x2, 0x4e23, @local}, 0x10)
perf_event_open(&(0x7f0000000500)={0x5, 0x70, 0x4, 0x6, 0x3, 0x81, 0x0, 0x3, 0x20020, 0x9, 0x0, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x0, 0x1, 0x0, 0x0, 0x0, 0x2, 0x0, 0x1, 0x0, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x1108, 0x1000, 0xaf52, 0x9, 0x8, 0x6, 0x2}, 0x0, 0x9, 0xffffffffffffffff, 0x0)
creat(&(0x7f00000000c0)='./bus\x00', 0x180)
setsockopt$sock_int(r2, 0x1, 0x0, &(0x7f0000000100), 0x4)
sendto$inet(r2, 0x0, 0x0, 0x0, 0x0, 0x0)
recvmsg(r2, &(0x7f0000000240)={&(0x7f0000000040)=@nfc, 0x199a, &(0x7f0000000180)=[{&(0x7f0000003ac0)=""/4096, 0xdc00}], 0x1, &(0x7f0000000200)=""/20, 0x14}, 0x100)
write$binfmt_elf64(r2, &(0x7f0000000300)=ANY=[@ANYRESDEC, @ANYRESDEC, @ANYRESOCT, @ANYRESDEC], 0x100000530)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6(0xa, 0x803, 0x6)
bind$inet6(r0, &(0x7f0000000000)={0xa, 0x0, 0x0, @local, 0x1}, 0x1c)
connect$inet6(r0, &(0x7f00000000c0)={0xa, 0x0, 0x0, @mcast1}, 0x1c)
exit_group(0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
syz_mount_image$fuse(0x0, &(0x7f0000000180)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
mount(&(0x7f0000000140)=ANY=[], &(0x7f0000000040)='./file0\x00', &(0x7f0000000080)='proc\x00', 0x0, 0x0)
statfs(&(0x7f00000002c0)='./file0/../file0\x00', 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800013, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigreturn()
setpriority(0x0, 0x0, 0x80000001)
perf_event_open(&(0x7f0000000200)={0x2, 0x70, 0x41, 0x8001}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000780)='.\x00', 0x0, 0x0)
mknodat(r0, &(0x7f0000000000)='./file0\x00', 0x1000, 0x0)
openat(r0, &(0x7f0000000040)='./file0\x00', 0x46081, 0x0)
openat(r0, &(0x7f0000000080)='./file0\x00', 0x0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000000)={0x2, &(0x7f0000000080)=[{0x4d}, {0x6, 0x0, 0x0, 0x7fff0000}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
syz_emit_ethernet(0x2e, &(0x7f0000000000)={@multicast, @remote, @void, {@ipv4={0x800, @udp={{0x5, 0x4, 0x0, 0x0, 0x20, 0x0, 0xf000, 0x0, 0x11, 0x0, @remote, @local}, {0x0, 0x0, 0xc, 0x0, @gue={{0x1, 0x0, 0x0, 0x0, 0x0, @void}}}}}}}, 0x0)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
setsockopt$inet_icmp_ICMP_FILTER(r0, 0x1, 0x2, &(0x7f0000000040), 0x4)
r0 = perf_event_open(&(0x7f0000000100)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3c43, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={0x0}}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
fcntl$setown(r0, 0x8, 0xffffffffffffffff)
r1 = inotify_init1(0x0)
open_by_handle_at(0xffffffffffffffff, &(0x7f00000001c0)=ANY=[@ANYRESOCT], 0x220081)
fcntl$setown(r1, 0x8, 0xffffffffffffffff)
fcntl$getownex(r1, 0x10, &(0x7f0000000080)={0x0, <r2=>0x0})
ptrace$setopts(0x4206, r2, 0x0, 0x0)
perf_event_open(0x0, 0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0)
ptrace(0x4207, r2)
openat$cgroup_procs(0xffffffffffffffff, 0x0, 0x2, 0x0)
r3 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
ioctl$sock_inet_SIOCSARP(r3, 0x8955, &(0x7f0000000300)={{0x2, 0x0, @loopback}, {0x0, @remote}, 0x4, {0x2, 0x0, @empty}, 'vlan1\x00'})
ioctl$sock_SIOCSIFVLAN_DEL_VLAN_CMD(r3, 0x8983, &(0x7f0000000000)={0x1, 'vlan1\x00'})
r4 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r4, 0x0)
preadv(r4, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
sendmmsg(r4, &(0x7f0000000040), 0x0, 0x4000081)
fcntl$setown(r3, 0x8, r2)
ptrace$cont(0x20, r2, 0x0, 0x0)
r0 = getpgrp(0xffffffffffffffff)
sched_getaffinity(r0, 0xfffffffffffffff8, &(0x7f0000000000))
syz_emit_ethernet(0x4e, &(0x7f0000000140)={@broadcast, @dev, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "131110", 0x18, 0x3a, 0xff, @empty, @local, {[], @ndisc_na={0x88, 0x0, 0x0, 0xff, '\x00', @local}}}}}}, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000002240), 0x0, 0x0)
fallocate(r0, 0x0, 0xffffffffffffff63, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r1 = openat$fuse(0xffffffffffffff9c, &(0x7f0000000000), 0x2, 0x0)
mount$fuse(0x400000, &(0x7f0000000040)='.\x00', &(0x7f0000000080), 0x0, &(0x7f0000000100)={{'fd', 0x3d, r1}, 0x2c, {'rootmode', 0x3d, 0x4300}})
write$FUSE_NOTIFY_STORE(r1, &(0x7f0000000080)={0x29, 0x4, 0x0, {0x1, 0x0, 0x1, 0x0, [0x0]}}, 0x29)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6806300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setuid(0xee01)
r0 = gettid()
tkill(r0, 0x25)
lstat(&(0x7f0000000140)='.\x00', &(0x7f0000000380)={0x0, 0x0, 0x0, 0x0, <r1=>0x0})
setresuid(0x0, r1, 0x0)
rt_sigqueueinfo(r0, 0x0, &(0x7f00000002c0)={0x0, 0x0, 0xfffffffb})
rt_sigreturn()
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = creat(&(0x7f0000000300)='./control\x00', 0x0)
fsetxattr(r0, &(0x7f00000000c0)=ANY=[@ANYBLOB="757365722e5280a6b758ec14b2d5"], 0x0, 0x0, 0x0)
flistxattr(r0, 0x0, 0xe)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit(0x0)
exit(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = socket$inet6_udp(0xa, 0x2, 0x0)
fgetxattr(r1, &(0x7f0000000000)=@known='security.selinux\x00', &(0x7f00000000c0)=""/229, 0xe5)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = memfd_create(&(0x7f0000000900)='#\'%nodev\x00\x7f\xe5\xd0ql\x86\xcd\xe6\x14\x93\xb0\x7f_,y<', 0x0)
r1 = memfd_create(&(0x7f0000000900)='#\'%nodev\x00\x7f\xe5\xd0ql\x86\xcd\xe6\x14\x93\xb0\x7f_,y<', 0x0)
mmap(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x0, 0x11, r1, 0x0)
sendfile(r0, r1, &(0x7f0000000000), 0x0)
rt_sigreturn()
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000000100)='uid_map\x00')
r1 = dup(r0)
write$tcp_mem(r1, &(0x7f0000000000), 0x48)
r2 = gettid()
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x1, 0x0, 0x0)
tgkill(r2, r2, 0xe)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000000080)='fd\x00')
fcntl$lock(r0, 0x7, &(0x7f0000000180)={0x0, 0x0, 0xfffffffffffffffc})
r1 = gettid()
r2 = gettid()
rt_tgsigqueueinfo(r1, r2, 0x2c, &(0x7f00000005c0))
r0 = openat(0xffffffffffffffff, &(0x7f00000002c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000000)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = dup(r1)
setsockopt$inet_buf(r2, 0x0, 0x2c, &(0x7f00000001c0)="814372b19a45da325c757dbaf18057b8939c505e231f69c5ec81e39fec83fa2c9b8b889c18b2d45e3e554e79cb68556353d39b2b124723d3ac977dd644a999d2ee55f522a3be49a5a345fdb90bf330b47a", 0x51)
rt_sigprocmask(0x0, &(0x7f000078b000)={[0xfffffffffffffffd]}, 0x0, 0x8)
r0 = signalfd(0xffffffffffffffff, &(0x7f00007aeff8)={[0xfffffffffffffff8]}, 0x8)
r1 = gettid()
tkill(r1, 0x40)
r2 = gettid()
tkill(r2, 0x40)
read(r0, &(0x7f0000000740)=""/384, 0x200008c0)
r0 = socket$netlink(0x10, 0x3, 0x0)
sendmsg$netlink(r0, &(0x7f0000001740)={0x0, 0x0, &(0x7f0000001700)=[{&(0x7f0000000080)={0x34, 0x12, 0xcdda8773e1e93189, 0x0, 0x0, "", [@nested={0x24, 0x0, 0x0, 0x1, [@typed={0x4}, @typed={0x8, 0x0, 0x0, 0x0, @fd}, @typed={0x4, 0x3, 0x0, 0x0, @ipv6=@local}]}]}, 0x34}], 0x1}, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = getpid()
timer_getoverrun(0x0)
rt_sigqueueinfo(r1, 0x39, &(0x7f0000000000))
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = socket$inet6(0xa, 0x1, 0x0)
bind$inet6(r1, &(0x7f0000000140)={0xa, 0x0, 0x0, @ipv4={'\x00', '\xff\xff', @local}}, 0x1c)
r2 = dup2(r1, r1)
connect$inet6(r2, &(0x7f0000000040)={0xa, 0x0, 0x0, @private2}, 0x1c)
r3 = gettid()
tgkill(r0, r3, 0x24)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
getxattr(&(0x7f0000000040)='./file0\x00', &(0x7f0000000080)=@known='user.syz\x00', 0x0, 0x0)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = syz_open_pts(r0, 0x0)
fcntl$lock(r1, 0x5, &(0x7f0000000100)={0x1, 0x3})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
openat(0xffffffffffffffff, &(0x7f0000000000)='./file0\x00', 0x0, 0x0)
r1 = socket$inet_udp(0x2, 0x2, 0x0)
setsockopt$inet6_icmp_ICMP_FILTER(r1, 0x1, 0x15, &(0x7f0000000000), 0x4f)
r0 = semget$private(0x0, 0x2, 0x0)
semtimedop(r0, &(0x7f00000001c0)=[{0x0, 0x0, 0x1800}, {0x0, 0x7ff}, {}], 0x3, 0x0)
semtimedop(r0, &(0x7f0000000000)=[{0x0, 0x1}], 0x1, 0x0)
syz_emit_ethernet(0x3e, &(0x7f0000000000)={@link_local, @local, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "1c3400", 0x8, 0x3c, 0x0, @initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @mcast2, {[@dstopts]}}}}}, 0x0)
r0 = socket$nl_route(0x10, 0x3, 0x0)
recvmmsg(0xffffffffffffffff, &(0x7f0000000280)=[{{0x0, 0x0, &(0x7f0000000180)=[{0x0}, {&(0x7f0000000100)=""/94, 0x5e}], 0x2}}, {{0x0, 0x0, 0x0}, 0xffff0000}], 0x2, 0x0, 0x0)
sendmmsg(r0, &(0x7f0000000000), 0x4000000000001b5, 0x0)
syz_emit_ethernet(0x76, &(0x7f0000000100)={@local, @remote, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "55165b", 0x40, 0x0, 0x0, @ipv4, @mcast2, {[@dstopts={0x2c}, @srh={0x0, 0x6, 0x4, 0x3, 0x0, 0x0, 0x0, [@local, @remote, @remote]}]}}}}}, 0x0)
prctl$PR_SET_NAME(0xf, &(0x7f0000000000)='//selinux\x00\x00\x01\x10')
r0 = openat$cgroup_root(0xffffffffffffff9c, &(0x7f0000000000), 0x200002, 0x0)
fchdir(r0)
r1 = open(&(0x7f00000000c0)='.\x00', 0x0, 0x0)
getdents64(r1, &(0x7f0000000000)=""/112, 0x70)
lseek(r1, 0xfffffffffffffffe, 0x1)
perf_event_open(&(0x7f0000000200)={0x2, 0x70, 0x42, 0x8001}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
syz_emit_ethernet(0x3e, &(0x7f00000000c0)={@broadcast, @random="33defd2be73a", @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "8c5e49", 0x8, 0x3a, 0x0, @rand_addr=' \x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02', @mcast2, {[], @echo_request={0x2}}}}}}, 0x0)
r0 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffa000/0x3000)=nil)
r1 = shmget$private(0x0, 0x3000, 0x0, &(0x7f0000ffa000/0x3000)=nil)
shmctl$IPC_SET(r1, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0xffffffffffffffff, 0xffffffffffffffff}, 0x0, 0x0, 0x0, 0xffffffffffffffff})
shmctl$IPC_SET(r1, 0x1, &(0x7f0000000080)={{0x0, 0x0, 0xffffffffffffffff, 0xffffffffffffffff}, 0x0, 0x0, 0x0, 0xffffffffffffffff, 0x0, 0x2e})
shmat(r0, &(0x7f0000ffc000/0x1000)=nil, 0x1000)
pipe(&(0x7f0000000800)={<r2=>0xffffffffffffffff})
r3 = getpgid(0x0)
fcntl$setown(r2, 0x6, r3)
setpgid(r3, 0xffffffffffffffff)
semget(0x0, 0x3, 0x441)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = fork()
setpgid(r2, 0x0)
r3 = fork()
setpgid(r3, r2)
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x30005100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
r1 = gettid()
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
tgkill(r1, r1, 0x35)
syz_open_procfs(0x0, &(0x7f0000000080)='fdinfo/4\x00')
rt_sigreturn()
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
semop(0x0, &(0x7f0000000140)=[{}, {}, {}], 0x3)
semctl$IPC_RMID(0x0, 0x0, 0x0)
sched_yield()
socket$inet6(0xa, 0x3, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r0, 0x0)
preadv(r0, &(0x7f00000001c0)=[{0x0}], 0x1, 0x9, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
syz_emit_ethernet(0x4a, &(0x7f0000000300)={@local, @local, @void, {@ipv6={0x86dd, @tcp={0x0, 0x6, '\x00', 0x14, 0x6, 0x0, @remote, @local, {[], {{0x0, 0x0, 0x41424344, 0x41424344, 0x0, 0x0, 0x5}}}}}}}, 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
setsockopt$inet_buf(r0, 0x0, 0x20, &(0x7f0000000080)="7e17fa78", 0x4)
clone(0x5100, 0x0, 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x4000000206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
openat$ptmx(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
mprotect(&(0x7f0000000000/0x3000)=nil, 0x3000, 0x1)
utimes(&(0x7f0000000000)='./file0\x00', 0x0)
exit(0x0)
exit_group(0x0)
exit(0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x100000a, 0x12, r0, 0xc2bd7000)
mknodat(0xffffffffffffffff, &(0x7f0000000080)='./file0\x00', 0x0, 0x0)
rt_sigreturn()
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
sendmmsg$sock(r0, &(0x7f0000001d80)=[{{&(0x7f0000000100)=@l2tp6={0xa, 0x0, 0x0, @remote, 0x20}, 0x80, 0x0}}], 0x1, 0x0)
mkdir(&(0x7f0000000280)='./file0\x00', 0x0)
mount(0x0, &(0x7f0000000080)='./file0\x00', &(0x7f0000000940)='tmpfs\x00', 0x0, 0x0)
chdir(&(0x7f0000000300)='./file0\x00')
mkdir(&(0x7f0000000240)='./file1\x00', 0x0)
mkdir(&(0x7f0000000300)='./bus\x00', 0x0)
mknod$loop(&(0x7f0000000180)='./bus/file1\x00', 0x0, 0x1)
setxattr$security_capability(&(0x7f00000001c0)='./bus/file1\x00', &(0x7f0000000200), &(0x7f0000000100)=@v2, 0x14, 0x0)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mount$overlay(0x400000, &(0x7f0000000000)='./bus\x00', &(0x7f0000000400), 0x0, &(0x7f0000000300)=ANY=[@ANYBLOB='lowerdir=./bus,workdir=./file1,upperdir=./file0'])
mkdir(&(0x7f00000000c0)='./bus/file1\x00', 0x0)
chown(&(0x7f0000000600)='./bus/file1\x00', 0x0, 0x0)
mkdir(&(0x7f00000001c0)='./file1\x00', 0x0)
mkdir(&(0x7f00000002c0)='./bus\x00', 0x0)
mkdir(&(0x7f0000000280)='./file0\x00', 0x0)
mkdir(&(0x7f0000000240)='./file0/file0\x00', 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
creat(&(0x7f0000000080)='./bus/file0\x00', 0x0)
mount$overlay(0x400000, &(0x7f0000000000)='./bus\x00', &(0x7f0000000440), 0x0, &(0x7f0000000400)=ANY=[@ANYBLOB='lowerdir=./bus,workdir=./file1,upperdir=./file0'])
rmdir(&(0x7f0000000300)='./bus/file0\x00')
utimes(&(0x7f0000000180)='./bus/file0\x00', 0x0)
unlink(&(0x7f0000000100)='./file0/file0\x00')
rename(&(0x7f0000000040)='./file1\x00', &(0x7f0000000140)='./file0/file0\x00')
mkdir(&(0x7f0000000480)='./bus/file0\x00', 0x0)
openat(0xffffffffffffff9c, &(0x7f000000c380)='./file0\x00', 0x40, 0x0)
r0 = openat$fuse(0xffffffffffffff9c, &(0x7f0000002080), 0x42, 0x0)
mount$fuse(0x0, &(0x7f00000020c0)='./file0\x00', &(0x7f0000002100), 0x0, &(0x7f0000002140)=ANY=[@ANYBLOB='fd=', @ANYRESHEX=r0, @ANYBLOB=',rootmode=00000000000000000100000,user_id=', @ANYRESDEC=0x0, @ANYBLOB=',group_id=', @ANYRESDEC=0x0])
read$FUSE(r0, &(0x7f00000103c0)={0x2020, 0x0, <r1=>0x0}, 0x2020)
write$FUSE_INIT(r0, &(0x7f0000000040)={0x50, 0x0, r1, {0x7, 0x1f, 0x0, 0x19d4d945fb67b165}}, 0x50)
syz_fuse_handle_req(r0, &(0x7f000000e3c0)="000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000080000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000dc4e00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ba045abcd5dfc67d0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000a000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000008000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000209bfd66eea2105600000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000200000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001354c4b600", 0x2000, &(0x7f00000062c0)={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0})
syz_fuse_handle_req(r0, &(0x7f0000004200)="a28096c80abf3543ecde7564abff5085d2227ebcb0f164ae92706ad0b083a3f469a3efd15b4921e9c3063b98b3082068e7c31950dde842eac55df0f991453cad62a6956b0b6f7b8cf49b506a3060fe1127eca99663ade8efa89ee189acb5f3b92f6bc4c46621c803eed0d0bb5f32384870ed08f89d4f74445762fb99715e083c4c92a8878be19ffacc30d0f2da64f971cd40563163adc15670ecf25cd3ad96138967c4b53ad9d04b5193ab5fb674aa0030a9d703d1baf810ce897f969121f142161919e583c275671b999e7f363891dfdfdf3556d01b86ee29eca8fccbfeaf1771395148706cc6e6be7ce29fc9ffef061b5420950c1a525bf75ad06edec51538d1c5bbc77da72dc90fd9998936fffdda2427e5a68966c7e2208f76304680182ec73007e482f034195712af922db2726195d997708734db9e7825a864be00b2a4f800881fc0363f5e618398454f35b148b4ccb88d418269fac868a8ba4a2d5b4f06a1ac01b5ad158b842e05adca22c7372585bf4ce95560b6c1e021a3ed2ff7bd3b6b3c7734c3b66d7e4c460096312082f89b16baa6e73814aa60925780cd92cd65087e260ec046fc363264366a9df2c849c0644911303946adad544521ceb469a3e193ecc9a7876403fac461a4a70d6193b2451189a5c5120b3535e9edf619108af7f517b58abd3fa7fb1ab832213430d2e6901076fba9c9e1acc6c6f48ff0e419bbc45589745a176f52a7407ad5e3dd49acb31b47862806f47077dda04905e45a80a12cbcd4d2dd9fe66c2d1f99394fed8ec60961cd2dc7115a96ece432fac86d51bebb08b95f447a83792fe80291fca7b298c9043ef2c26f0f7e42798d3f54c84b94c24c76c555d83ecc53b99bb22d71845e5cf21a5ba7fbeffeb6306e1730db14561b950a3f24bcfd78d4ab0d97de8054bb1a6077ae7cca6e45d846d3df82298d07212922742cb0facac3b77edfbab90e9ee2d4f7b0ee9b17bb11ec5e5721340d84cb6bd93428167e69b47759172557acda313c3decdfc6fe9336bfade459f43b39d0f2289f9142db280f4ee668e650e12858c577e12e2b9a57ee66c834be97979bcbe94747fa5d8d0b7d3a9f8f218df1bf960f828429a1efe838616b18faf6629236ddbded43a093efae163228e5c38fd7714743c2fcca47e3382bcfb1ab893fd7377527b4ec43f3fa60ebd338161d8de7cad65b15579e4af258f5fe3a63c2637a15703207029b0899b5427767647baef11e291358e6e54f6f13d3d2ca7a5e7969e04d2733b3b9ab822c69a3cfac097384de5071a9b74a656136d55eb190df08747b509fd610ff62b4950ef71c934fe21a48a4931d3d9458b415f112cee65c660f5490e982341da1c58634b3967ca6f3596d20cc90f508382156e36f16539093240ef5f2aa6a2c0dff2a67df30dcf50bf6e0b82a3d49f2d532a8dde1b3ceefcf0837190b74186090d1c18b59917d7efce1adfb238ef4a7b1d22c4cef09320221de883e97e6882466508de06fcdabad3b741bdca2cff879d57ddda52f42b3dcb8a78cfc05826af7e4ff155960ff8491194f4d321ef195990abaeeefdcb852d1e1e3703f317385a9458b6c2dd9db830f757ec29c9939fc7313e639fe485bc1e41ddaaef3fbf1f7cc527c8fad0d21b8082482caad7bee440e5097665f636c3dfec82f8c98afb6243bc3944939675a594277d278ba4361461f7da52e224e4ce5dee4a467bf6ae9f67b61ac6eb0a440406abac2016eec907e241c57f5f44be47290fd0fef785ff04df3810ccd637b4d97a84bae8486a36f75d872e645fe46625969fc2d1f032c56ed44bd98ea27bd9b6ddc8eb2dc2ec9f90f2f1ca1bd20e37ac58b03c84c872f4ba47310654986641460dfdd531ac62a76ad87b89c103ac5c9c2e7e70c66447b3412d4a1e5cbc30e16939505116c04de33ae054ed366de8d1f971c2de439957a194e22a488f58d7efd46439177f3f3c45a1475927eecd846d3d2e6a2ab5c7f8addd99062c2fc6b272d1f51bb8f22f1b6f8bb3faf8aa85e5eb9abf7df5cf8f26267323808b0833a987989cbe59205e7ad06556e2d1b8a4873ca1cbcbc8d43abc145fd4eb832e7a58ab2c793d003ce7b1850ce45eb7480417a1e9eb9d39a1028a2a04a2aa649c098c4f8eee514db5f6021173bb254b8e22b150b2ca01dc7ff235db46ed78d07f43d1adab13b8445d1b32069eb45f9d389fcf5a3f7d3ebe243c5b1fe17b1f5a3d571b65f21b9e471e818172554dc956749b99cb7a5f303ec480d7194a2ba86e204f06aa1becdddc8c49082c527e7064ac2ad77dc05639d3d2a7778f6943ed6105ebf6f0b9e94fddbe05c236ec000f4d1d4e496b10068211ab68ada4c7f7ac61f5f5ba5f1810d5bbe87ff4f8356af0d3f682baedb0ad8f8488b277421f0a03fc5e3095ee34bc4472d8f17e3f7013cf2f79f5ff3ea4b6bae56d1365a33b09bfa9a496323f7da923b7e29dce4beb81035f131
30004c96e56d7ef6ca6c101d20c27a218e623227c33c9e488b17e7ae9ac20da8240501f7b614a1730f164553fe479ef149866e4ea47296814284a3d3eb7cbb294289ffb996e0eb053b9c16e54cf267832e3d360eb196ed51305630223309ea97215628f01ec9d3ea48096418d5e962cac5063460f0a18772ec7ce66d14a1cce14b52c40bbbfafccbf1e76f09e57ff0718048e5b993157a6cf4718826b1e09430413a3596a15c4a620fa8c8e1d1663e5739f9f790ddbb3be0e00187d43717d659242467d8681ac10303346157f894d9037641417010e9654c6a5b22263e73a5a37128f50078a980c30930321aa5c5e7851d5d392ddce3a14a96916fa8421ae6728f37f5de7c3e98feb4babd4e1bd2315d595e209d52748f70adc2284fcdaa6ad880470d2a071f3490aaf3491fb64b4547419e8eccdc491a8921156cb4811ad1e66514a32b0b31b641438881f28c1e6461b4f451938999af671e8c6a5cd0c072a9fe4cdbefe24ca616f3d0a15ac97cca835b1a440e04fa28340c6044176c8ecc8ee0d033d47db8a0aacfa0eabdfa1c9509fc2604008f01cbafeb5bd2b503b809ed672340b9a576593f1ef388391b54b605e7a15bef7b1345627a34fca57738b0f8f4f19eea93c903495274a4425a1a1cc6c4c6e335b631df5185c95b485e4257867b5347a40e4e14dcc560f061fd4fd265137dc68afd548adde778f1330f769acb1ccf5da14ff6992c24e210ea6e6179421881b803393bc6974e37106c5b5b3b5d0b3469f8969bffb7e4ceb2c98e928e74366492d27235ae4c74a2f48511aeeaa53a2beafa7a331b50e454c507af1b63350a5cef35668a5b9325014192277e509561008b3601088f79d42eaa8b1e4ae2000b31749e2b8094312ddb7f3c1cd625ef885c11fa22a66e374b52b3425e0b8016154e1fd8471339e32e7373d63ab646d893fbe09ae07b06074c01401ea76b3c382a9d32f24f93c789964e16bc4206ecd75c10917ab84ffd8d6cdf4cd28fd90375ff28518f8c1a3befc538e1b9e427fb671988d29f2fb2fcd039f4d341c84eb4d7cf600ddaba88bb094e4d87a1419180149f491368e648b69985b05ac39a4ecdd3c5135f3a5c8ad7792dacb6470144bb9e67805a211efb3ec9ccaf8e0901345fb19e4da579e1fbe86a1207f4f13c3436009c2c640b7cf3f8b77ca7bd994bf93308027359c6dd1b7db1e153fc0821968ef36c003b6c73fe890f4de24f5c6458dbaaf3819edeaa91783c3cfc7e773689236248195c7bbd60113f2476fa3687621d668d1728ee433d2f8f4db707345d30f1e52ab87a2a0afd547c6bb06500f59f17facde48f693490e22494b75d11df1a143b85068d143ef6a9bb5937a9df380c8948f1a01e9675e18409edb0f6b9605b68e34632fcce472dc50b90b0f6dcd57931f78e1e8861a0fb62e72b0baad6f9d23c1cfb0f19b25013c8d9fcd786a2f6f79768b5fb398f7b2baa31ce8156d1fc4a46c1c463fdf30360d42aeed2ef11611d0b7f654bb51052fd4dc39328f8ec4c58bbda05e6f1b3c8f6d8adca0268f2410e9a4a7d63b6616006d0e02f6edacc10e5c54fd85f15a8bd7648a293f23d6a699bd9a675250475a73a96d7475e4fabb89fb5e7de5d7a3479aa485c0befc60d0ac4fd5ac6dbecceb06cad86e219fc0ce4720758917811a3215f8d13e413bfb64fc065fc421aede0b56691797dac428c7e463479fa591b9072c309b7533e427c5cc11a1f6cf9a5b995d328d796d874c5b55dfc12a5039b413ce319cf5ba1f355c4e0717d32650b43e18010f37f048731931c52c4f36eb969dda702afe96c2a5241350a67ba2d026946189c5e281293c9a8e2cff3784776f1de78b917101b54e5ab00c045ea15f28a0e3f509962cf8bd3385d85250737eae5c34ece86b86669c13b00308a3b13c0ac3c83ff26fb52a4aa83c1233a9490cb9ca917a056908931751bddb88a62379a713395f0764e4a393faf253a4026d0472270e6036287d56850df1751543484d65b3062155b6300e0024241c59a862ae769c1a9232a2d9fb24705177a09cceb3eefbf9f106f67e01be14cdeb4d2fc7d8661df3e75de5ccd09a7e559f028fb9837c621ea0045b4d1b679067f246339c974631aa7134d4e910efb28d3c48929cef1df7e6c73668762d55086b6c59c36ac90154135fd7ca4e4047dd0aa161fa982d8edf9c0cb9666477e096c55718f6e4742415fefd4f696d1f1ccd6322bc19496ddebd36282a7c707d5b44113e30678e6e33ab7d34be04a59ac614d6a54134490998be02636fa91633d6294781c2b9a54c611c0045cfcfe81f49aa21b29d835cd2047c854486fd8e65a2ebf629f7ced602b9dd107bfde483e5c9b5cbba4a08cdce09920bda9978b7fc2b4a89bf1573a26389e52090fdf5dccf22111dc8c42fd3c8c477092895398086cc22cca665269e193fc650742a361a44b857d258429f701f22e9b7615bc3dab78c1479a41cf8575cdb17169470b347adfc03e03daea3e2
69725cfc72df5664b9df36d2f2b55013b71133e0b80577a47182511ebb308b6248d457bd2af7b28e77182c305241178c4124ab102771fd5a8c3dacb8775de881301d71587c76bcf0a97a72ad244d0c42fd71aceec32dd48bb5c9a95b391166c832ac5bac8c7cae4d18b3f7d9f2e4782fdf97732e3d51f67bbb57f989ee0d7589dbd0c2a5c63840e914b9d7d720fa120acbffebf816b588b2ccc052e7fa78992e0ea39dd21a122add41195f8e2e1acd777c1a4e8ef4362fef441feb4d9252c6bfbd2742152300a32027776e3341620d3c8d9365e10e81adcca7d87a0e555c98a0353c692557d90ee9be3fbaab766abf93e2462149fd99c92a5fc58d899ee75535cd1fe1386c5ab0b157c2102039d6015258f59cef3f15b951893a30ae839f740402a30b34e7be73796286403c5beb0853d856d83f1b00b48328f56dcb32e1faab08a3435b1482bf18b21c95aefeaafa7fd761c7f28d416fcde06bf7aee5c6e9eb50e55874253ba3f1d0ce2505b4fc7c3fc996bfbb8446bafe84f5bea94bfd7ca5aeaf237fe793b66e5c521d4092e4e1f9bde1dfcfe53fa55005d21cfa833a338fd9792614129336060e10d1911862070761aa20c2902eb7c5a355eff4cf6253d7102a2ca1fead4c53b57d576d104c081310d92797e4e2e8c269d19910d0d4cedf30fa28ba680c00137f83de940624229b6a125ce5233c6cf4a3640b74f58f288dad8451fbe37641c5559a5f3caf1299c8bfb230723652278fe378efd8e459b9da26cffeb58468a6301dbc06d713ba2d8d43d9038f5f2dc8b831ba58a88eeb5b1786b21e398aeeeb7c1f3d6f01d82b3947862fb9e7cbd7da5d04c5fcd34da28d53e2246e3ac1e3a619ad174efa6435eaa0fc94d610799ce0158421dce046306eb5042143daa336d52206b12610ea6389cdda49bf5af1d4ee42ac090a94ae7b7612073f3a5c36a2205eda887f41478f7d20f18667f941f71eebcfa76c1ab28f2a49a3bd56bd3f4e6bd079ab3fe2d94782236e83585a03e52907abaef7456a95d5d3f3d37efdc035dbfd7c41b8ba0af2df8adf1cf24f7ff0beccd3d26bc91caf42314ef7e466f74e19ae0df2e2298fc2f694a7ec134632035585d530e7e19f65c256f001d75382d9825ef741bc213af186377d9ca10d3722354e1897ca5c23ac6a52c9ad0e6b686e1776f7ec65df033e8f4d5db80c1bc354093b319cb70df93d610667675816328c99322f14e636b95f04e6497f139d508b453f53ddb5c289d849fd5407c9bdcefd1642abd46e28cb4e94371bdc606eeb67c9fe17747c68f2d50e82711da4d3edb0eda06f41b7f93fa8fb4d83cf21c79da67000bac2275508217ade1659fa8d24e5f8efb9f4bd21073ebef3d06368eb03fa3cf0d638448bd055ed20d292033ffdba538559c8ff9a2a5c8f83b5c393643d6585d1df994c3be43e72b8f3f53114d2a5f6bcedb573842b23b6a3eb7fca8495bf03bd03fde7b19bd39a16cec49e01f38e671af33cae082d9788e3202799bc466babec2080528d0609c0b731964719093735b4c1e73bd0705637c47516922197c552baeaf3516b5e3bbc2cd1afa3ef8215196ed580d9561092f620b897e98e786a0c7cbb0eedda8063292ba6482497f5f6bb62fb5ab4c97cb7658dc6579718eb97b547fcf47ced1426561af93a15fb4dc6d3d93b868644943c2c94b23b0570bbb81df2666c24f5abccfcdd71e209f3bb43c01d17f9bc8b9af2c26762fc6a741a150b7d1186e4f35175f3c315243e1c11e92c43a1fc492eef5a13c77a81fcf514ebfd0f8e645dae15a07e86b2f01fda065db4505a5eea83cb616f744f6bee731be191c65449c02603556d5a51422cf9c2f19f8d6843e0c1091e0708aa271e91f71c8602b9fa72189e036b7cb6af1569f21269283de94a6d7fe5849fd433d5b719c80419873db0587fc29786cc598d896fb16360bddd2ce12e54d05418f4f5e5f2d7aafe9fcd6268cbe2e9e6329ffb6c67fab8f3ce673028cc06aaa6b857556bba3b44d3fab5b6e875e70a2f3ad4b2ff76f31ead3462d3801ba373b3c2f545e94f57021575e2947f81f53283fc0a5137fd44fa3d074c92de54a0a3465c858f5a7ef08313faddbc3663e4e0167f3cba39612057a7518fbfb031f5ad0f9f75831973ebd733b82e554bf3fdec84e51f65dab6028c6c51366d9d4700fdf255e4c7bd70766e7f2281b3f2a5363f85ce49f9135904d14bcb117ad754c2594dcdca2d30e40ff265b5accfb116f64ed99aad570c4c5a91efdbb984ac651d8721405a0342cf77f448c17a152eabf29e88950558a86d0074e1cefab1eb7c366682f686ee1338737e675ea58eb8b4c86b9f28a6f6e96459f29e3b4dc59ff044c61a0dcc5c31d803e6e98420e446229ccdec3d0f705e92ffe016bb3696373eadab7f35ccf65ab4d9be09a085ce21bbd7c0555376e4d7fe68b5e7a64f48b5127825fb2be598d991f9c1a54bf52713417dcc599e812d85513a537e6eafa738edc972b67e0
65595d11678449bce6cd3d69800a649b560d0e057c502ca3e72e97820829ecfea801192c3f4e2c8763c095a43ee6fe45fe8730130937668df1d4ee577ada28238be03286481f2d2a004cc4d48856e71fbd64f1a0043a4520ecbbf1b3abdc96b87a27be8495a20542967aa4cd3a44a11502419a083d84e97abfde0901b66dde48388649a0ed6d93b9f20c530e990c7c52370a114d800d6ab3f6687d6bbc105b63738fe05fa6cac98ad6663936bb18cb923264e44312c24c2ce8e642bb73c921012b68a26a70977446b8f15f9d62467d8b356560c183a6bd6cd76ec868c3bd94a595cd7bf996755a508a814980c5e588b275200c45afd900c8c2de329ec2484b0e3ecd7b0960e5e3425881d1ff7f8bd8b20f5cc98ffc3acb77f5e88775a4bd3ab9f9eb027e27d3af55ebdf4eebab48ea911128d668d00fc3f5b5480aa0d9a4af563ba577384448e5425157133d59e1cef3c722f33700bd372825046b1fa5824e405154a3af1440bc2b75acfbd07cf92e8c162587e74b5ab66b1c6aeab3ad5fa3ee91da4900ef30ad04baea326df912517dd96e1696b4a91faa66675978a375e81f25464a1073dc6737af08d7e25956bb31d438548a7da38662d49db812a8cf1d6cc65f5c63879fd9ee7fd2a66ca3fc1a768cb239aab88c87206470b4c60592afeb6d69ed97a8f990155862ba4e22b64804142c131a23792937aa8a8696e165c24d7692a04bb4471b0f0d2507fe7c8618421428fc7a0acc984ca5cc6bacb772e8a717bbaa646f9643275910a6037afaf5a80678d18edda138a4e13d06d04a5d06431eab48738225cf1567e960e765728dc12e91b91c6f2b33dfb6e033aa68c1c2334d24335abc4a7a1df5636dec29091da54d5f5a1fff41e4a35a0c2f04f968f7d78e2f51c73577e2192bb20f289aaba5a175c2ed533855bd9ed9a842ad482136dd5e0cf45eb5e2d31ff62a3be1cf8a94a58316e74f4ab9fc54f3a0bb83beef0f355993bdea2c83e61cdc796bf2564ae51fae616799e8711998cd88d35cd9824452fdd65226174b46792cb87f4dd282e4e6f67eb66da413ad877ed6ce775f7e19bc93f48bb9e5ec04009de3c042aeacf7f4b25ad6b30e017303f64fe07ac79e8744aab6926d117f13513d0469cef335fe1d0d787c2d0b2c031a9521786ac10e9f8b768271680337f2c3262abdccb5d3107c632bf1f74c83ee91f49988222fb080cc8faa9b1a02526d8b6087e0b2354173d29016b3309587c16f057dd812aa63c3169150de81f3af97d082a8f8da4ce4f909ff649821d7f96d97613552e8cc4902e046ecfa329b1d980ff5ece69b8f1615fdff5244f41cec0af924624ae1641ecae5fa26c5fb9006e57100ee71377ced7c255ae17a0845e2ee0287c62c1852f93877f9f86157ca9675d383fff5cd6f2b001ec0136c07cf37f5ace1853122c2baa1092d418e2a490c4a5c8f56b828ce1bafeef4e77f095d6b4ed99d56f66812cb19be540ebe5d52e7eff2d69cbb8477e11514f7e3604bf9999f78c2f1ca6f60a2216b87fa0f25269c425b7d50709b200912b3b7899c95e12d6e9c4dacc19e327721860e0477a53e6793fbb7fb9704a848f395f48c24a6e79b9e1358cc3497251de88b8d3a7b22c6d8af1a7fab81530d9f0cc98f62debb222b54780d89794238532717b447d71b46a60ed481c21db85b590b31720009695ecffd4ef029964e5d5149622233ac013e960a005c924f73ea82c318455546c53d74aa3f7e2ff26aa074c40a55aba8b08027fc19b596eec6c4f89bae39e74b9aad88344f7cc5ad3eefa5095f2ab47222e9a357ecd71c6700ac576025201490d9e446603dfd4bda7617dd500981b2d2ab8c43882a5208494cb3f8ebc720bca8a7cf6c80bd7aaaf89507bb3412ea490a78973f12cc30413e9df1458917ea3d68b438d424c1314bc8d01939c5a5a842438281e62d0c800dee704b2a6cd3e1e4b885a6b26b894a98765fa3308c9e4b87f93625faecdb17c29a27cd243bf6030a67874ec9f2443cf8154261ac2a834c01cbe1f314ee7aa3ca552e1648cf8b42a63f249e3538026e09e44d69dc259adb0d1a0cbccb5a5dd5d0dccc90d023da79d5634188ff060f7e35a5f9d7ad99546824d63975d4452de876093f4e997dc46eedcd80a9eebf5e4f077fbb10c7d9e19a3419e7b845972a3b62613c5404a209b16fa88e0ff49d7b4f21fecc1f773c5b4be61021e0cab8602c6e8257649303aaeafcbb178e7a460ff07f219c46eb6fe5bf8113723e454003bd707767c107daf4255751daaf8decf35262640058924eb6587868b2c08230b317e97396ebc928ba8d274ca0eed0bfcb637676003c64e8c1e1a0420b6c96a44226061ced41b8448382abd2f3d0c472afcde231fbc9ee90c2f1132f8e2391246f95ad93354c7460e20de996ad0f61b13b27646887a637cede90b94b7d8c3130f0fe060e8d955c711a2700b302a75bdeb32a0a6802ea795cb114f5f82a1a381a86bbff88b299e47728b74
6dff964c94c52b661b9429376b1320b46081426b7c340206dc0da151bf84be2a49e78b6b5938753d2b1be8d9e67c43c5d70e72519f5f90d9f95e84ee38f82b191ac4d968b0a37901fd923cb289d585693ac3c3f8a94fca6df45e694e199a9cd0b1bc1fa7394bcc96aae670dca6605a998793b7e067ac410ba631057b8b76fcbe9524df820c02efef1608b743cd2aa6d60d3d8e476fa12d3acc329f8272b087d89471177ed531fec1f9c24a975ca2fcd8c246a33e291a3f00b7f234052067a0059c86762475256bb5e7dac6f121a0925506b18933c6e314915d4b3b2130aafc2483ef22ff8bb7b887565b1bd22fabca22037d8fc9437f675c5313526266f60bb7c7c47f30c7d567ed142ea5ec367c4298328d20e5344f01c0c90cf8a6302f4d84b6ba7495fba314a05ba29b63bb6d458fdb05a4411136958309f418fb178e19aa09ff9e62b29732fb2986c96e738f7a688cb2122dbb8f2ad9a5f28bc49ec0c462413552afee8e403259b55ad6dc334dde7f2d306929dd01f2aa6036cafd41874522689301b81c9e50e86828894140356db0a3317b081ed9d8148c41e77e6bda6287762532b86eb91f5480915680deb8a91fb8656b7f0109064865d2b846af0861f67d3f720d6e306540cd7b68f095ef3690b88ea93fb6a402ff5697597cda83171f159e85307d1a8c01611189bd4eb4f0453ab88d43ae181a562a76902a67c687514079d6f4304d9a7c0fa24b6e86074ea0a9fd8187c120312078f5ebfa674adc0303734bf8f6b5585943706594192ad24c9f7d9794fb83758924f862855ddd50bff58b522c43d73c03289baec628cd693cab93101b1e473b76532510e10f03e86812fea6f2d6f5467dcf29e6d7cf8524f383a0ded3f0951c3ffb171a6b8a6d97b5fa8899a19f1a3d0e934a1d4741076e4394ba225158f697bf7d5651717c6950229a0be22e8120d76a414edbcd03d505264b7ede8272ccbd6dbdcebaf11daf6a652f6f9eb74ba7a3ecc942892891388005ae5d971e4e79d696564906dffd44845b704a9abc2fa5ba1bb69a548423a08044ad6d0e365db7e6bea0f3844a452759716cb98dcf326001ec90c1c343174098cdf47ea2e13341058ca014d2a30e9ba3c526de72a6e387181bf76a278c9cbc518d8c374a3f1d9802a39464a100903dbec16f8f095f5d82d9d09507281e4f7fe0ce4fbeced193902a5f658af2a4c1d0952dabdc6ae5830b6b5a2c3f5b8d33a73665990822e5f4a7ce5366755a1615543bdf78299c71e890e0bedb6ec277b10a389d6a3ba9c037221421279e51ab50fb115de2076cc99444202e88ebd9d0fbe4e60234b7b761495ac6c9e615ddac8176164a88fb6d6cc2b52672c8949afe3efc1e87a598896bc93e421423844fcaafe65af898a015b3bcaf623ebeef9a57155af5278ceb52b995f7ca466d9e18b05e86380679e0257cff6d0c6750078462f2ee4701d6d8289ed848b877cf5918625b7937060d667c11119881c30809056892352c6c53c01e395af6866ea350e6f21fa3db772c1177c759999973b51e11ffc5908", 0x2000, &(0x7f0000000c80)={0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f0000000280)={0x78, 0x0, 0x0, {0x0, 0x0, 0x0, {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x8000}}}, 0x0, 0x0, 0x0, 0x0, 0x0})
openat(0xffffffffffffff9c, &(0x7f000000c380)='./file0\x00', 0x0, 0x0)
utime(&(0x7f0000000000)='./file0\x00', &(0x7f00000000c0))
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x247ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f0000000040)='task\x00')
exit(0x0)
r1 = dup(r0)
getdents64(r1, 0x0, 0x0)
exit(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
mount(0x0, &(0x7f0000000200)='./file0\x00', &(0x7f0000000140)='ramfs\x00', 0x0, 0x0)
r1 = memfd_create(&(0x7f0000000380)='security.selinux\x00', 0x0)
pwrite64(r1, &(0x7f000003bfff)='/', 0x1, 0x0)
mmap(&(0x7f0000001000/0x1000)=nil, 0x1000, 0x4, 0x11, r1, 0x0)
chdir(&(0x7f0000000180)='./file0\x00')
symlink(&(0x7f0000001000)='./file0\x00', &(0x7f00000000c0)='./file0\x00')
llistxattr(&(0x7f00000001c0)='./file0/../file0\x00', 0x0, 0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000380)={0x2, 0x4e22}, 0x10)
r1 = fcntl$dupfd(r0, 0x0, r0)
setsockopt$inet_tcp_int(r1, 0x6, 0x2, &(0x7f0000000080)=0x800, 0x4)
listen(r0, 0x12)
setsockopt$inet_tcp_int(r0, 0x6, 0x22, &(0x7f0000000040)=0x1, 0x4)
syz_emit_ethernet(0x76, &(0x7f0000000140)={@local, @dev, @void, {@ipv4={0x800, @tcp={{0x5, 0x4, 0x0, 0x0, 0x68, 0x0, 0x0, 0x0, 0x6, 0x0, @remote, @local}, {{0x0, 0x4e22, 0x41424344, 0x41424344, 0x0, 0x6, 0x15, 0x2, 0x0, 0x0, 0x0, {[@timestamp={0x8, 0xa}, @sack={0x5, 0x16, [0x0, 0x0, 0x0, 0x0, 0x0]}, @sack={0x5, 0xe, [0x0, 0x0, 0x0]}, @md5sig={0x13, 0x12, "d36d43b30ea346f217a6079d942d51e5"}]}}}}}}}, 0x0)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$nl_route(0x10, 0x3, 0x0)
getsockopt$netlink(r0, 0x10e, 0x5, 0x0, &(0x7f0000000080))
exit(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
poll(0x0, 0x0, 0xc3d7)
timer_create(0x0, &(0x7f0000000500)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
timer_settime(0x0, 0x0, &(0x7f000006b000)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x14, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000040)=<r1=>0x0)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
umount2(0x0, 0x5)
timer_settime(r1, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
prlimit64(0x0, 0xf, &(0x7f0000000440), 0x0)
rt_sigqueueinfo(r0, 0x8, &(0x7f0000000040))
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
umount2(&(0x7f00000003c0)='./file0\x00', 0x8)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x38004100, 0x0, 0x0, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
r0 = semget$private(0x0, 0x3, 0x0)
semctl$GETPID(r0, 0x0, 0xb, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rt_sigreturn()
rt_sigreturn()
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$IP6T_SO_GET_REVISION_TARGET(r0, 0x29, 0x45, &(0x7f00000001c0)={'IDLETIMER\x00'}, &(0x7f0000000240)=0x1e)
prctl$PR_SET_MM_AUXV(0x23, 0xd, &(0x7f0000000000)='j', 0x1)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = dup2(r1, r1)
r3 = dup(r2)
write$FUSE_POLL(r3, &(0x7f00000000c0)={0x18}, 0x18)
read$FUSE(r3, &(0x7f00000018c0)={0x2020}, 0x2020)
dup3(r1, r0, 0x0)
r0 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
getsockopt$sock_int(r0, 0x1, 0x4, &(0x7f0000000140), &(0x7f0000000180)=0x4)
syz_emit_ethernet(0x6a, &(0x7f00000001c0)={@broadcast, @random="c26a9f246194", @void, {@ipv4={0x800, @igmp={{0x15, 0x4, 0x0, 0x0, 0x5c, 0x0, 0x0, 0x0, 0x2, 0x0, @rand_addr, @loopback, {[@ra={0x94, 0x4}, @timestamp_addr={0x44, 0x3c, 0x0, 0x1, 0x0, [{@private}, {@broadcast}, {@dev}, {@private}, {}, {@remote}, {@loopback}]}]}}, {0x0, 0x0, 0x0, @initdev={0xac, 0x1e, 0x0, 0x0}}}}}}, 0x0)
setuid(0xee01)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000002040), 0x0, 0x0)
fchown(r0, 0xffffffffffffffff, 0x0)
r0 = signalfd(0xffffffffffffffff, &(0x7f0000000180), 0x8)
symlinkat(&(0x7f0000000080)='./file0\x00', r0, &(0x7f00000000c0)='.\x00')
setrlimit(0x7, &(0x7f0000000000))
clone(0xd194cd00, 0x0, 0x0, 0x0, 0x0)
inotify_init1(0x0)
rt_sigreturn()
clone(0x9106300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
connect$inet6(r0, &(0x7f0000000140)={0xa, 0x0, 0x0, @private0}, 0x1c)
rt_sigreturn()
r0 = syz_open_procfs(0x0, &(0x7f0000000040)='smaps\x00')
mlockall(0x5)
read$FUSE(r0, &(0x7f0000004140)={0x2020}, 0x2020)
read$FUSE(r0, &(0x7f0000002100)={0x2020}, 0x2020)
r0 = shmget$private(0x0, 0x4000, 0x0, &(0x7f0000ffb000/0x4000)=nil)
setreuid(0x0, 0xee00)
shmctl$IPC_STAT(r0, 0x2, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff})
sendmmsg$unix(r0, &(0x7f0000002540)=[{0x0, 0x0, &(0x7f0000002200)=[{&(0x7f0000003580)="7a14269bc9bea939c9a7a24d86ba395b6c1248bc5e76e81d1e7db70d7cab7f28810c3a2ada3ad76a3a35c55bf3169fae276b51ae8f33a3d881026a1fe863bfe32afdac89abeb96e12a9c6672c9cf388b1b0274f6d1dd0f9ffb4fd07996692727d612e64ad06e7fb2be6d11960d6979921e6b6a57066323b6c9faa76c40e8f3ec00b0f1c0b882829ffa1d928bbbe1639d84495c6fd7e779ce95acd1ab816e325a46f550c99c9e6633de569a2ebae3165f82286f4bd8239a10f135aee2d578543ded5e89c105560e76f261e90d1cad6c850f3216b944813796f6d402bffc4ac6bf8aeaeeace76b4eb5606f8734d6aa250094af0273ab48f5a0a902ee4a0496e04042f5a58fe2818759420a26c0ce9dec0d405da595598ba578f3ebe5d9046dacc74bbc8c470a7f8f74065e5a91276aed1eef2362abeee41db8a8de74a67f2a0145064105f079adc1da425b43d2d804f045d02aa83475b0ba87f52d5429301ffc31c8308ed0db7c82d778d552bdddacb3ed3968d8190329fa63c5d4b9ed9d62c9f8942f639988cbba98dfe7b0cf93d0f59ab8be61e9fefe30df584cf46d8146d49b4f3c1c1b718da491c08dcdb6653e55300c7783da0e80e37c371aff23e665ac723a38ccd3b652418950a56e1576a461895be1be01cff7942764581b59e998d76255cb43fbb24274582fc7b0f0d739cce7267e2b05fd8e2d8d401ad61add0a4dec611a4d6cbecef02841a5ff7acc3c118166fe04ae58c35d8d44d2627492bf7b213fa3aa69b69ff0bed5fbf60ed8a61103669eaabdbf5fc461f258fc457c85ea0a11feca5779cf8780a22abec559122da9a6ddcbff416f423320cf9e800523f312f36a59f7c572f21fea832f977bdae4697b694410edec47c49aa0552fe2cf695dd434fa7492b263d2ceff0a7e48917968a7bc9ab54817ff5d7160ea1aef7b34b47de2bb0d73a9e9c7c183e95ae04b160c276a3f9012436ab431bbdc9d2ffba77d57e28a6a25deee622ff58710b9f3cf250558a3362abcb6644019a01682b68c54755c92afc1877916c75f76ade4a4793a00da159f6b8929ed26c574829c58a7fe426d747ccf9a2d93c01f71704997ee644be442bcb896f28b3832dbad5da2b466bf535d8d6f5d196b0336334c82af6de7a64ed5c022fb202755cbd0f8ced4f0d7c2f0889869a6cb121753c7837a522f0f0ba5adcdcb25187d204b2dac0f5da90de9e8d2718e23f1ca8c63dac541d3c0df2033c8ff8c1ffe9898de6e14717f489030c625d058f86bd638a1feafc00a39567bc4585c208196818a76de7e25c8bb7baeaefd9fa5c1c5c038b3bc2dedc78024f94e0c6441e95aa34fb6b13d84106b133eef9bcd61eec0ec8ba9902e5b0da85a6a914b62931f42b436e8e2257383ec635468f12c02e555954bf1e934e777514412d74691aef266d263bcccf8f4b840c8771cf6872d0f1be97af47bd9c0f189b376c64783fcb21764adbf4257d6f50a289c17b7f80ff9204230a7485252be2b10487d542c8404857dc41e8f1df45911c2b3ea4a8803965350ef05bca8329abb1756edf6862797330b2ded6ea58ec73f902958126c9bbc4d592220c6fae820de0f0ced972a34ebb0359cefd86d84bcf40083ca9c7f9e7dddd3940000000000000000db83e52f1d5a5e2e35e67b34ab154b55461cd12ca70239ab483b3790527161587aad3e0fe7f4e6900854299bea2a1b8bddd634c151e0da738f5345d2d7340845277e4e2c176979e67c820088baa8b25a8de0d873fdc56a7c8f6506dd14f7f110b340662bb6edeb941fee1d39440142c6f6360cd8af1b2ccf27cee4b93b98b928564c17ac430ac56733b7ad08cb97c34ca55792abd5686ee09600306d61eb15465a35d0fc4ba61a0f80404ee2b5c55e615cd1bdca08de12a16cda2d16b3fe1a4cdd3acc74c15e94129dc88ab5112d847a1563885800d999fe84ca69c521552af42b9e5eb73c7c2b8b756c15cb028e47858bfece239c9d869e62697a07884d57b46bd03b5a1cf2a39b4f598bbf7abfa4339f6587bb1bab5a459fb774ed8bfb3071b2f6b961ce63ba20f10213e7f644242284d9f44b9539605a4a2aeb3b6200b1814a6a9b759511c0faa65852d2f1d6cca1239593c0ac60faaaada2525981267eebfb8c90987e371ce641e7c1265fab787b8792be8bf371689a421d8ec3fa0ac27a7ab363f9a03040261d333326a3953c09de07899bbcae1d0357ce840ef78b844ef53585eeb87dc3d972e163c48ca39f4b2e3d5b447f38096e78716d50c70a5baa524505a4425a232579b26bdb081ad17e08a932c02d01270b2e934fb7c42c17ff6e913d484a898c0fc4cc4d657838bad8b36df635ac47a4c8a22c6524c523d0782dd35020e2c8adce1bb795ff0b0dcb1cd2fbed20b32db5578699e18e56ce1094395b0db955d7c2b033963e9c84bf065d5f209b4728
0df4e479036fafa4449d47434bd4dd6c74486c47e2551f2712354e7122d82031891144c443d8f14fcfa9602f10dc4e432f0f298530878276db85ad20e207c4449382712c5db9eecd61e424c4c2ca4fde64d3c114ef5b9daaeff05351ae74b4da7ba3d07962050b5aabdb63882d34a10536949a8658573fb6a9e5122fcda983b323fa523e78bf78bb7eb4538c956911316bcc295d95abb1520ad051e05f1eb5c06e294af45e6ca673c24227ca1f5575cbf3695ac2db9c1a4f8779bc56013d12b1b96a2310f439af508d7f140e846a41dbc7562509f10f1bea23842d49ea9e59fff56a1aa949ce49fd2659a8569f70af7f4740521f104ff3bbdb4926f7c1b40e92eed5477b9c5b5ff1aeb331fcbed8d8c4549869508ecf713eebc92945116f707af312abd5c2e9dd6d7fc7a11ead7cf412d525609d500353b1f40248c6b7cbf8fa9d75afe1fa1e127da64c37a4da43d628feb473734044af32f28469feb80e5f893ebf1eecff2bbd9e5c703bbdec2309c69a754a29cc35f7e1264cfff81bede001ffe1413cd8b1d8653595e19ebf6627bacf093b3a009e1f2434fa3be54b215c839a060e46eb59a6bb15854a044458bfe50094467e27840f3cafcb212f53bb1223e9f6c02847ea94a6d4d66ef2dceff5907493dd0474b8c653f0d9ae5269c8ea79e536ce967642d72545cdd7734e8fe0b3231a41ea1424d663cfa4625289609db1a6526ee66fc71320d2dab8b7340aff981ffe9914c802d498ab87679ff2f4851f691d3b96b6b394950f64f7fd9631513927c22d636bc1f5ea399b999c0ccd2f22c72cabfa7ede5922c1883ba4d37021cfbd4ea8451f96a2d28404ab583f216ce31ce22d77e3f0e9f3d039c7b5d2957ad3d2ab0700411f52d0d3abe15d960d5593d8de3a8e01be84c68b877c10d56a311010296772c35b28b3eae2b6c4a584aabb3c403698f3ee6c800858befeaf9affc1565c1bfea2890395ac68d6f979f8323a88d00ea98433debb1712534f63f4c049b8847c197f86a03d8ce6063b9b5fa2198ac7d255c4ec744507436543bb7afa55450f9464422795e1f697ae3691034f4fdc6d0e4139bb9898a6f66f81a8fe2c901a00154a838a179b998f31b9f447bbb902420f8e4d77536117bf95eb0934d04c758e262da6d19db8193cade29bef3046a4416b7631b4bf1c44e1f1390cbb4c77621a4cbb49719a346be880fc58e10310a504c945a07e875185b601693017e6c17c8633bb46542ed5bdfa7fd824430f6de57d80c9ea1f8cd29775c98eda8a1f2ab8397b8a91547078c8d34e34dbce65ce85bf0a6191d6d8cfff228690077caca86fafb476dda3215e047256770c0e243e8074481b295f276006e826c31d83bfcdafd2f9dbde7b140508160b6f3c44f6c02afac055ca57ebd6d36d84c5a2c2197122c601d3f4e310a83f4228b16418fe955c54557fbde8e3aa66419cf28f51e950e423ec36c649f74863c826c23dba221587db285d351730eb7207810da8ead8304ce047834614cb433bc512feb4f4adb58344249c6f7df7e386f1affc717ca25361870e809b08188ec0a6337b99aa6cf2c9d100bb835d916d0f249fb4075879230e3f189147741732d9ec55f35eb5374fdfb81cea6fdbea3d39b5f7150a757c372f13e1ec8be00ac56d3112d65c0aace7576334fa68eb773201037821a4fd9e221c2d042af733a85cd4ab7a65c6b2745fc98b9d269767c697f298cc27290d86e2f6e988758a3a7c30fb0be845bcd0721798acea60fb996341a08d8526e1da6e5406ea926d84b0f30ad6b889322a4a4c7d56b2585ff4c260641901fe40d84c64d955f17753f809c17e90133d3a15ebc18b7120f2cc706aa4040f5bb7178ab7fe04db9a04f4a8e3d555bcd178431d6d4a0363f71f360c70bb59d287a009cd79ae49f3c417a867c4045fee7a34284100c1cb7fc170c9483b34a0be852f28dc79c53b87f3df1b83459af92afc41dd94fcc2117c661ecd66099bd62ef26ed85f282e02addd05e5c3bc0c93047ae79792043bbc20799a8a12a04afc4ea75f6a1f91a9810f01a31f7d6b0320e31a8100d103a80d4327e672a5a97f852d12d515abde9836dc4fbf596dec959b55ec17a44b6c1df4c7746a525fa9c74fc771d5772047f09dcfd22774f25009dc0a74dbb902b03f3cd9c6f4f3fe5a89c30af83f1a3aa6f6f5ed0566cf3c98e3bf688c57754ce670a25aca39927d57302184783a005799aa8e3b441acdabec7faa0d26662654ce6cdecd7c1a52bfadbf2fd65a5abf00cfac282a563ee5df86bc2629f8f22af081639914af526de57743a5471a96664fa8afc62f1626b792e911b0a2aea69091337c0487abf265969cdbc2621c3c509a75866b327e4f867ed4f25e5daea2147bd4df11f4056d8c53d5e385f102a25440b3f01af529e4080fa78e1da8871b261bb8d8bdf5dbda03cafe9c4be0025fa3dbfd702f07b93607297dcb1c824077ba1ac0b25e1a68c9736115e715946ca9c848d6d3d50ae
32cafc1e85423db5556dd3fb48dbcf883abd32af3d7f4330622ddddc73cdd0eecfc5dff4e625907cb1feb3203e597b57306ccae06c0818a2cfe37032fd55735b38083d3c9668ce33fc4ccfa872c2c6cbef6d1cab0433a25a515670e2d1f37924c4801c222b3a9c143df0ab20ba664b15c8089315ddd6f7f88edb48b549b649755879ed2bfcd351d9b12a027cd38c2036a2c12781e5b32a26e1f11349642d7bc9e35a9649b7875c756c42026ef3258e4759bbcd18237ea7d93ba13cfd5a766b2fdd2e73a9dc8a89c9a696758a0a16a4e1d6be4cadb265094a6d8585abc775ded1db754c905858f2afc85a43dd4286c71e54e32ca3bfa323a50a68156f0efa5cdf2c7f1a163c3c41b7a2f65d4dcda4dbca17631ea7d09cebe6568f6273c02351c63a33d7dab02d99853838de0fdaa565f0ebd00060d92893f7970c86bb830f86ccb1ee464e5e21040e399eacc9f84c26edaffeda0c791daf6359f6e94494f9d01e4de3dc1678015629a976571eb3793134b4ff840fefc7a9eb4413a9475159ea4cc962780274be2d54fc585974abd27bc0da26e0e741a976faf186ad14952b0fc04c2c5bc07599f4c70190b90dfa3683ee3f2045723ca4ed02115c5f0d2385dae225a7b438da861e36fbdab94336ebc16046bb85f7e4310c33301b80f0766d18175d5fecedb8bc8fe34026289d6c0fe09df834ef1a4c7e8ab674d201ac43f5aeac06bcd07bd9957e0dd6422563502e3aa3a7f1bc685e9b4c6bdce7019c4f6d077a0aa86d8789bb256d98b83affff9f710f6676d3b19f7e3dbf7f65f8b502cb1109516e67dd22915c9b7348a891d174b60363e7f27a8ad93e0e40d26d884e2c9c61beb88e0b0391dc182b2469ab87e68ca209df7cf69518abab5156bcd5ba6e0b06686bd32e58c8d664fdc8645915b34ab3e266aa767aec7f25b61d7e195298ecb4923312aa3ea49201720fc84198ea0ac8e7f808df316", 0x1000}, {&(0x7f0000000f40)="a69b89c0d5f5eb6c2f4c12df2221054821adb70931995cb31b8477742cadc8467f10a94f47a2580345bda0743d7a0dcb12aa98452d40a8d50fd486c8b46da2f2db3d9075377401fdd6337b12f2fd76bf6ef3d58c7500427f4461d82838af1735b587e4b4270acc8b92b02cc8d09637c9b92fab8cd920fd28e9d02349ff309705119da23d782cb360acd6d4298647ca6a3bad874e1c290d2e371d4537f8ccae7d9c3e89e8e9691a8f3e7088740f4d1c23cf867c3c4f3ea716ea9bf033f7fe9cb4e07d22b9b98f2e864f30252710bb2f45eee2857461ffc929242630e63f275de615c84b3072c1a921df619c3433b3bd9e965ec8e0217704646570398a7b78c3e43e2827e0e3bd1116f2ac5080bf9ced312f1ccd733edfc3aad19843391b245621abc4ea35b43472f5079f125771d70428543c10af5dd92c987db6850b2c6e727997308ffd2b1050e8b9a9bf5268393010ccda2a313dc84ff73375a1105b419b78a34d3096f4f515a419dbc7491370422a80058024b323856a3fd3eac03de635dc9650562bb32403d277af56bb94bd575412b30cb0b292c3230fc440a572d9f8c66efd73445576c95a9e5128d6169b35ed608f43e00bb4b8ebb05336784727aacb462a01bfcaf89f8a86a846846665bab2b84b358138fca0f1ba50e8342f768cb1bac3e15350ecc8c85e2788babc002566c599173cd53aa954892c81b6f8720da85e012bcf9dbbcee541c9928fc07d5b6d1fbd44ee65cd72d8d4c01c6a45b66c65d1daa3a6ecda911a845d1bf239151c67181232264eab174eecfdfe9da89e8814bd6b238e8286eff54fb1400f2843b9d3b0d6b0894a2588cdd3648b80f3dcc7d6ac569b002729357f434555c331618abc255b56622f50a93129904ff4f7dcb6c23a2a54c7c70b27ca28df48273a4ab247872e96846c5bf912a7fd780b2936b03292a3294425b39099568bd633a3e89866b9438ae94497f60dbc767961852dd1ea7444436aebd73f4a9bdc1b060e7ff164faf2b6dae0c720895baaf146b5fe6bcc505b89aeadec5d4cc78e796f7cfae0c75e4211fd7d5e1affe6a45e80866b7dc2a54f8101a6efc9e8c511a091031596dd2edb3e7b0e45d2b98031fc8a8d5c323cea4177fd7e9353c208f9e82c2574fb5a47574e04a5b503b540b4435b1997fb32306343a4e7f7cdf7af88313151653c1f73854b450587eb5466883cbc87eb28f7fcd7b693475b3c7a7923e44b1ce10ef7f379c9d9dfebd30717867f70e8888335810f900b96a245b0d03ce8c1cbfb040bfe4adab2a1f3a3a480aeeb7495adc44977172cc195bc44b2bb6650e04459477d0dbdd4660d75aac30ff89c32743e603bb4eada7ede5a3c0fe983c08dd2c85751f0f469f68e70427f1e859c59ed435b88b4511e6afee39a9739b16586040c537e7f5d0c3b40700f6d110833ec4874c04fc4d286677446b8b9500f7a751cbc58dd52e82f8cb55eda7a3cfb89e7b9ade0dd082f9b0874adcf5861c7b663
ef480fea8820c2affa1f850e12f12d18a3b125b61e28aa9ff06d7e61ddc96e005b3ffdc1cb47a018df914c5723303f2aa45e0d9a219f54e901d46c2c1ef918fc3ae25947b541c61c61b933103b968d29018946dfd1961fb5b3700557643279b88504f09b48b335dfb7aef032aecfda905f5410c27622d85c3890e9f255c1bd13b84e80741f27a076e71e8a171747e743bd0c30f3d3a8bdfc9042f4ee0965bbcd93fefb85970a4d0acf888b66a03978b769898a927c45bc3001b53b6a5f66acd6a4a08ae3b64bd68bc5f8f64195b1bc8fbcb43fe521a9a132efd10a65f9e73bd6d4e0db7a924fbbcbff6b54752fca71ffa3eda6cd2295521a64f4a9765658029866ab11dad0be0a7bcf95372b02530e6b625b145f4611ce6f9b586b4d9c8c60aa12847884f9302628fe0b6d64053b4af56cd902a2104d494608a9791268c4c1423f3b3c3c755f1b622fd38eed5e1b83a6596c1f44892c3e44afb57c3b0924c2b39efde48d37209ac56bb726460ca88d9350813720ff8b581dd684a59fbbdf012bf69b738347c268dee324e4507b917e44ffe0467d4cd6bbe8d12399033bfc920424cb03236d9c819de763490b7588d6a45617347c952715e4ac14395ca389fa43b26e9589dd56f75d24a559bc184adb418369e902fae30c6c4809abb1a8e59201e903324ca2f610afa799951a5e951ab015341c8f656dcd9a253c7a3df9f05743c8fb3f97c28b8ce08ba773c7fbf2b1f0dbc233c62cef28095f692d70dc68d91278b5d994bc524d0fe0aac57c55d079a91fd3d33d577f989b6fb7c4bd7380b166b1c5b7548a3fdafea5e323b50ef2ac7bf02019446b10678f10fe2befb4147e73ee6857a7654bdb396335c38e2c373c6664c4b402b6e2c08b32e918040f7ba8e9d34d4203689fa437ae23dbc9d0fa0aa58a043695239cac69b34f500249948eb8275927fbbe13f21da20534d499c925948d6aabd5ba8c52350eecbbae3eef415ca846b37d30ca06cc939d47e72b7d195998c2300cabb6b230ed0e98af7dd55e939004de092a9b59f48ad221c3f98f284f0d19f705f460f2a0f29b889e7f1963aeaf10bab39ab3779c23b0a31b7771a7c1cd528dbbdc7ebb8e95c611d22ea7c9569066a5ea11027aee0155abc22299f795c7b79bfb3d5e843ce1ea10bf6491a74d8afa4e668c124b9706c32cf487e48054545892d9eadd5a3ef24e59790d108a132602fbe903768c3724482a2d43c380e0ce02c79db933bb7791cf82601b8de14febdc49ad33dd0abe969f4e1e40597014c2d5f05350b1e6ade12735448e923d4ae0c640acf43effdb3c413daf1ecc3ae724469db028fd58a2d21e413df95c7cc49171f8e2f55452beec3433c756cc8291dde067b20888e8dd51383d02c72ca810f50c4677cf14949654d25e1340fc1a5e73d7822adfe6832b1824d65cbf8c934d1a113c64a276fdda043ba6fe2655beb9c5e38d4fdbaf9724b15c0c46a471b5592f63db6a12d3dbe033ef822a1c09e94dbf7a699fcd1e6fc2f8ad0753a7b49be099f9033ef14e0d2bd676056f547f41bd82a68ab4d35aad70d4810125e4de28484f94ef546476bee5e66616556cbe0166d659c827d82c7feba151b906951cd5f895a79684ef06439b22574cd83194360123be1aa02dcb7879755b5dc19ef1c022d9792a7a932ab08b9004dc43363b0d2e6af90aacbd5cce10da6731f1c49c40284f7bee02fc0337bcd795ad6d7536c3e0884574aec4b5e084a805394242feb555de20e3d052adc096012d020efdd1d132142ca169147b7d48e5a5d5a7a8cfb441b79bd3403ed20326304cf87026e52020b7c6e6f7f1f4ab0bc7bfeae4ed68b167007860bd1ae27b2ac2f84338c3becb4d1c5f708720c403c9eecb9e00007b9d7a529d84933b2713fa95197000e3d004a9f990fff79ee4eeffc65e158489b72df1cd3d5a24674bf6d021779c78978ca0f69adc50cbd3200553abf73944fd361591cc7b44da2a531a80404d14dba08db29daf31a2d4e2744224863316b169d8e2c0fc7225a7b278637f3da250e5a61e45157b6c531558ed69a3daea7ee954c04e343065e30e224a93f258049574b5447d0e2b1719a736b87bc98b59f6c9fb5785aebdd1f2a78ccf747624f624799cb313772bcf733382385aa1be367191a03a1cb2f954d995a81357b98a2354f374e2b6d987554cfb3e9674d644718af2cca2a9b292b0a8dc4b43b39840fa8640b06ff98f47672b694603c0cd63fffb0b646ccec15a63d73f9bb55a579b4cdfdffc1ad91051d4c86479be522b1e6b762f8d761d1fba36f5a16d9cbef23b6d5c3cda4770f3d45b8c36d4fe6fa82c2ebe41ea4ce5cf597647629fae0038432d22564ee683d40c351b11f99915122e47a0bb903b6d4b97b36f9422f0a0189b8ba26a30ab9664678626f8015fe837013ec735c3ce6695a96bc1b78c16d24f09f03feaa85526248ef98dbc3aa1353544110b2b918b3c13b6c19c09ad7a539d20d832275d3258a96dd
87220730548ead282b8c26a5cd6edf15549065e32c3eba59a7aa7ccae6886ac030097f276233221bb15188fb40a0655765749ff75524c6bf1dfff28908332f7065e6d922bc85b9d793cc83f66c4d8af2a377d170699c1fa650ef8dd369fe2edfbccebbb18fd40fc70f387aeb15b90ac241406367de4837817db9d14ad97d4e5a02de696195bc8c241ca5f6d2fd2404405a886b1265249a48bae716c3518a719502931a1706681ffb59a022fcafd6827a277ef04e12e22d1eb60d67190c957380c93aa2c9e48838b94e467dc96ab0d7911f06677b1102b225e9c63a1c2ab8d44f8300aaf9db75fc164c50172f93794d8916a7e4742353d21bee7f3a0d2d4b80f5422768258ecabb84328ec4459967f7df09fdad037899b0040f3541b927f2498d684858c22675ee1b0715159446ba2f4c7d59dd62484cf681ae299ec901122e186687b37b72fca5c0dc3361e6e1d8e017a3edda561badb56ae3f4104eaaeeb3c06a2ec51c1f2d97b53deefa7235db9b572a857f76bf7ae33f9e99837c79174ff13d287635164686a658be3e5f920529caa5329f40209a840be5f2a848e7d792949b1e880386603b99e1bf5d8979f5faad91d8c319c7ccbeecd77f4650609991d82fa123be4c02a3caafcb2aa3af443158b38b116a3ced5faa8b39f9dd13819aa37032cd4d3edea72935171bf2ef9808cb3a05c386ddfa2aba7331f77b95cd642d9caadac69988423a592f329bce3cd51256750be23147ce7268fbf9fb3eeb86f1d093a6673a2ba4696a55bdb418b8184810d85c454572b2c48451bbd1afcf7b9c617de8e160e2d248f3f70927a4c4cc316bbad8d42e3c155992079f8ba78cb0855185fb6d789180dae57ac19760a21f085f0756e09845d355140cf45f12abceb795e4ef27213c35d9c80969c385e83ea2345f4437869f41c746ce0593d421fc8eef6bf1bd4023138c3f05731e3fb0fa5f1a44f62bd01a6697d77ca1f1015b665449a17f23f9559018db064ef32f2c5b21bc3f7832f6b6edc973ba8d7be402ac0fa0eb1c8f140e383927d983fcbb8cf3252bdb552271d92762004bea7d7d612d0b5e541cd9fefde5746a962e975218379bac8ce8eda8b696f847b0484f3141cdae5d094ef06a79f11cc8d6c564ac067c9608480eb769ec418e3fa471d87cfcf79b58bbc237cb3198ff249b715f4aed58823445e21c45330eb0c83e04266325ba0e68d3d067aee9c047d8664d38e39f5bca72fe9963f7389590003733e999ba3f7740b7d98d667660da7a39914b0941e83c117347277417cde3f356d26ae5b3f09539f7cac7343c601dfaa8a4f77cb439a", 0xec1}], 0x2}], 0x4924ba0, 0x0)
clone(0x106300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000040)='comm\x00')
r1 = fcntl$dupfd(r0, 0x0, r0)
fchdir(r1)
exit(0x0)
mkdir(&(0x7f0000000340)='./file0\x00', 0x0)
perf_event_open(&(0x7f0000000000)={0x1000000002, 0x70, 0x12, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
mount(0x0, &(0x7f0000000080)='./file0\x00', &(0x7f0000000000)='tmpfs\x00', 0x500, 0x0)
chdir(&(0x7f0000000100)='./file0\x00')
symlink(&(0x7f00000001c0)='.\x00', &(0x7f0000000200)='./file0\x00')
socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000000c0)={0x0, <r0=>0x0})
sendmmsg(r0, &(0x7f0000008600)=[{{0x0, 0x0, &(0x7f0000003140)}}, {{&(0x7f00000072c0)=@un=@file={0x1, './file0\x00'}, 0xa, &(0x7f0000007380), 0x0, &(0x7f0000000600)}}], 0x2, 0x0)
syz_emit_ethernet(0x22, &(0x7f0000000000)={@random="0344a0e080ad", @dev, @void, {@generic={0x800, "1b852c851a7df08e51715ce6e76455dad2076a5d"}}}, 0x0)
r0 = socket$inet6(0xa, 0x3, 0x6)
connect$inet6(r0, &(0x7f0000000080)={0xa, 0x0, 0x0, @loopback}, 0x1c)
sendmmsg$inet(r0, &(0x7f0000000300)=[{{0x0, 0x0, 0x0}}, {{&(0x7f0000000180)={0x2, 0x0, @dev}, 0x10, 0x0}}], 0x2, 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
bind$inet(r0, &(0x7f00000001c0)={0x2, 0x0, @local}, 0x10)
setsockopt$sock_int(r0, 0x1, 0x6, &(0x7f0000000140)=0x32, 0xfdca)
connect$inet(r0, &(0x7f0000000400)={0x2, 0x0, @broadcast}, 0x10)
sendmmsg(r0, &(0x7f0000007fc0), 0x400000000000695, 0x0)
mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0x3, 0x8031, 0xffffffffffffffff, 0x0)
sendmmsg(r0, &(0x7f00000080c0)=[{{0x0, 0x0, 0x0}}], 0x1, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f0000000000)='gid_map\x00')
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
r2 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
msync(&(0x7f0000ffe000/0x1000)=nil, 0x4000, 0x4)
pwrite64(r0, 0x0, 0x0, 0x100000800)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x0, 0x0, 0x0, 0x0, 0x0)
exit(0x0)
r0 = socket$nl_route(0x10, 0x3, 0x0)
r1 = openat$fuse(0xffffffffffffff9c, &(0x7f0000000080), 0x2, 0x0)
dup2(r0, r1)
read$FUSE(r1, 0x0, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = gettid()
r2 = fork()
r3 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ptrace(0x10, r2)
ptrace$setregset(0x4205, r2, 0x202, &(0x7f0000000080)={0x0})
tgkill(r0, r1, 0x24)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f00000001c0)='net/udp\x00')
read$FUSE(r0, &(0x7f0000000200)={0x2020}, 0xffffffffffffffa7)
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat$tun(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
ioctl$TUNSETIFF(r1, 0x400454ca, &(0x7f00000000c0)={'syzkaller1\x00'})
ioctl$TUNSETTXFILTER(r1, 0x400454dc, 0x0)
capset(&(0x7f00000023c0), 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
unlinkat(r0, &(0x7f00000000c0)='./file0\x00', 0x200)
rt_sigreturn()
ppoll(&(0x7f0000000000), 0x2, &(0x7f0000000080), &(0x7f00000000c0), 0xfffffe52)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x8, &(0x7f0000000000)=0xa, 0x4)
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
splice(0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0, 0x8, 0x1e)
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
timer_create(0x0, &(0x7f0000066000)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f00009b1ffc))
timer_settime(0x0, 0x0, &(0x7f0000000040)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x13}, &(0x7f0000000240)=<r1=>0x0)
timer_settime(r1, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
pipe(&(0x7f0000000300)={<r2=>0xffffffffffffffff, <r3=>0xffffffffffffffff})
write(r3, &(0x7f00000001c0), 0xfffffef3)
pipe(0x0)
write$binfmt_misc(r3, &(0x7f0000000380)={'syz1'}, 0x4)
read(r2, &(0x7f0000000200)=""/250, 0x50c7e3e3)
clone(0x2009214d5fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f00000000c0)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = fcntl$dupfd(r0, 0x0, r1)
setsockopt$inet6_MRT6_ADD_MIF(r2, 0x29, 0x66, 0x0, 0x0)
exit_group(0x0)
io_setup(0x6, &(0x7f0000000240)=<r0=>0x0)
creat(&(0x7f0000001800)='./bus\x00', 0x0)
r1 = open(&(0x7f0000000400)='./bus\x00', 0x14103e, 0x0)
io_submit(r0, 0x1, &(0x7f0000000100)=[&(0x7f0000000000)={0x0, 0x0, 0x0, 0x1, 0x0, r1, &(0x7f0000000600)="9b", 0x1, 0x7fffffffffffffff}])
clone(0x62044000, &(0x7f0000000000), 0x0, 0x0, 0x0)
clone(0x0, &(0x7f00000001c0)="248c264aff75df2a2d1390de3db0040a9264b926c79d86eadfcce94f2f52c22c057c0a7bc0dda49ee5ad806a3fed10c55ea5e687ec7d73d8cefb302870a75a53c6942c1466af98c40871", &(0x7f0000000240), &(0x7f0000000280), 0x0)
lstat(&(0x7f0000000100)='./file0\x00', 0x0)
mknod(&(0x7f0000000040)='./file0\x00', 0x10e9, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
timer_create(0x0, &(0x7f0000000280)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f00009b1ffc))
creat(&(0x7f00000001c0)='./file0\x00', 0x0)
timer_settime(0x0, 0x0, &(0x7f0000000000)={{0x0, 0x989680}, {0x0, 0x989680}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x14}, &(0x7f0000000040)=<r1=>0x0)
unlink(&(0x7f00000000c0)='./file0\x00')
timer_settime(r1, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
r1 = dup2(r0, r0)
ioctl$PERF_EVENT_IOC_ENABLE(r1, 0x8912, 0x400200)
r2 = socket$inet6(0xa, 0x2, 0x0)
setsockopt$sock_int(r2, 0x1, 0x2, &(0x7f0000000000)=0x20, 0x4)
bind$inet6(r2, &(0x7f0000f67fe4)={0xa, 0x4e20, 0x0, @empty}, 0x1c)
r3 = socket$inet6(0xa, 0x2, 0x0)
setsockopt$sock_int(r3, 0x1, 0x2, &(0x7f0000000000)=0x20, 0x4)
bind$inet6(r3, &(0x7f0000f67fe4)={0xa, 0x4e20, 0x0, @empty}, 0x1c)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f00000002c0)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x9, 0x0)
chdir(&(0x7f0000000200)='./file0\x00')
symlink(&(0x7f0000000080)='./file0\x00', &(0x7f00000000c0)='./file0\x00')
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
rt_sigprocmask(0x0, &(0x7f0000000100)={[0xfffffffffffe]}, 0x0, 0x8)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = creat(&(0x7f0000002680)='./file0\x00', 0x108)
write$binfmt_elf64(r0, &(0x7f0000000140)=ANY=[@ANYBLOB="7f454c4602010100000000800000000803003e0000000000200000e1ff0c000040000000000000000000000100000000040000200000380001000000000000e2030000000000000000000000000000000040ff01a5f4c2302dfd6d69ee544907ff66"], 0x78)
execveat(0xffffffffffffff9c, &(0x7f0000000280)='./file0\x00', 0x0, 0x0, 0x0)
r1 = gettid()
rt_sigqueueinfo(r1, 0xa, &(0x7f0000000040))
ppoll(0x0, 0x0, 0x0, &(0x7f00000000c0), 0x8)
syz_emit_ethernet(0x52, &(0x7f0000000180)={@multicast, @local, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "7f3900", 0x1c, 0x3a, 0x0, @private0, @local, {[], @mlv2_query={0x82, 0x0, 0x0, 0x0, 0x0, @private1}}}}}}, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
semop(0x0, 0x0, 0x0)
rt_sigqueueinfo(r0, 0xa, &(0x7f0000000180))
r0 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
setsockopt$inet6_group_source_req(r0, 0x29, 0x2e, &(0x7f0000000100)={0x0, {{0xa, 0x0, 0x0, @mcast2}}, {{0xa, 0x0, 0x0, @private0}}}, 0x108)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000000)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = dup(r1)
ioctl$PERF_EVENT_IOC_ENABLE(r2, 0x8912, 0x400200)
syz_emit_ethernet(0x3e, &(0x7f0000000040)={@multicast, @empty, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "a0f000", 0x8, 0x3a, 0x0, @private0, @mcast2, {[], @echo_request}}}}}, 0x0)
r0 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r0, &(0x7f0000366000)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0xc)
listen(r0, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000080)='net/unix\x00')
preadv(r1, 0x0, 0x0, 0x0, 0x0)
clone(0x38004100, 0x0, 0x0, 0x0, 0x0)
semget(0x2, 0x3, 0x300)
semget(0x2, 0x0, 0x72c)
semctl$GETZCNT(0x0, 0x0, 0xf, 0x0)
rt_sigreturn()
r0 = openat$full(0xffffffffffffff9c, &(0x7f0000002940), 0x0, 0x0)
read$FUSE(r0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
faccessat(r0, &(0x7f0000000040)='./file0\x00', 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0xc100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0xfffffffff8000000)
preadv(r0, &(0x7f0000000140)=[{0x0}], 0x1, 0x0, 0x0)
rt_sigreturn()
msync(&(0x7f0000fff000/0x1000)=nil, 0x1006, 0x0)
membarrier(0xc, 0x0)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
r1 = syz_open_pts(r0, 0x0)
syz_open_pts(r1, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
r1 = socket(0x2, 0x803, 0xff)
setsockopt$SO_TIMESTAMPING(r1, 0x1, 0x41, &(0x7f0000000080)=0x4b6, 0x4)
connect$inet(r1, &(0x7f0000000040)={0x2, 0x0, @dev={0xac, 0x14, 0x14, 0x21}}, 0x10)
r2 = dup(r1)
sendfile(r2, r0, 0x0, 0x4000000000000081)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x5100, 0x0, 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f00000002c0)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x0, 0x0)
chdir(&(0x7f0000000200)='./file0\x00')
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
rename(&(0x7f0000000440)='./bus\x00', &(0x7f0000000480)='./file0\x00')
exit(0x0)
linkat(0xffffffffffffff9c, &(0x7f0000000000)='.\x00', 0xffffffffffffffff, &(0x7f0000000080)='./file0\x00', 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
chdir(&(0x7f00000001c0)='./file0\x00')
open$dir(&(0x7f0000000040)='.\x00', 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigreturn()
r0 = openat(0xffffffffffffffff, &(0x7f00000002c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
timer_create(0x0, &(0x7f0000000300)={0x0, 0x12}, &(0x7f0000000140))
r1 = eventfd2(0x0, 0x0)
r2 = openat$full(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
read$eventfd(r1, &(0x7f00000000c0), 0x250ce47f)
dup3(r2, r1, 0x0)
timer_settime(0x0, 0x0, &(0x7f0000000080)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x14, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000040)=<r3=>0x0)
timer_settime(r3, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
mkdirat(0xffffffffffffff9c, &(0x7f0000002040)='./file0\x00', 0x0)
r4 = openat$fuse(0xffffffffffffff9c, &(0x7f0000000440), 0x42, 0x0)
mount$fuse(0x0, &(0x7f00000020c0)='./file0\x00', &(0x7f0000002100), 0x0, &(0x7f0000002140)=ANY=[@ANYBLOB='fd=', @ANYRESHEX=r4, @ANYBLOB=',rootmode=00000000000000000040000,user_id=', @ANYRESDEC=0x0, @ANYBLOB, @ANYRESDEC=0x0, @ANYBLOB=',\x00'])
read$FUSE(r4, &(0x7f00000021c0)={0x2020}, 0x2020)
readlink(&(0x7f0000000000)='./file0/file0\x00', &(0x7f00000001c0)=""/205, 0xcd)
r0 = timerfd_create(0x0, 0x800)
read$FUSE(r0, &(0x7f0000000100)={0x2020}, 0x2020)
perf_event_open(&(0x7f0000000000)={0x2, 0x70, 0xfd, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = socket$unix(0x1, 0x5, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
prctl$PR_SET_MM_MAP(0x23, 0xe, &(0x7f0000000280)={&(0x7f0000724000/0x2000)=nil, &(0x7f00006f2000/0x1000)=nil, &(0x7f000008d000/0x3000)=nil, &(0x7f0000ffa000/0x4000)=nil, &(0x7f0000270000/0x4000)=nil, &(0x7f00004b2000/0x3000)=nil, &(0x7f00001dc000/0x3000)=nil, &(0x7f000018e000/0x4000)=nil, &(0x7f0000480000/0x4000)=nil, &(0x7f0000ffa000/0x4000)=nil, &(0x7f00002db000/0x2000)=nil, 0x0, 0x0, r0}, 0x68)
preadv(r1, &(0x7f0000000280), 0x18, 0x0, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff})
bind$unix(r0, &(0x7f0000000040)=@file={0x1, './file0\x00'}, 0x6e)
lgetxattr(&(0x7f0000000200)='./file0\x00', &(0x7f0000001440)=@known='user.syz\x00', 0x0, 0x0)
r0 = fork()
rt_tgsigqueueinfo(r0, r0, 0x15, &(0x7f0000000000)={0x0, 0x0, 0xfffff800})
waitid(0x0, 0x0, 0x0, 0x2, 0x0)
mkdirat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x0)
r0 = open$dir(&(0x7f00000000c0)='./file0\x00', 0x0, 0x0)
mkdirat(r0, &(0x7f0000000000)='./file0\x00', 0x0)
r1 = open$dir(&(0x7f0000000100)='.\x00', 0x0, 0x0)
r2 = openat(r0, &(0x7f00000001c0)='./file0\x00', 0x0, 0x0)
renameat(r1, &(0x7f0000000080)='./file0\x00', r2, &(0x7f0000000200)='./file0\x00')
munmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000)
mprotect(&(0x7f0000ffd000/0x1000)=nil, 0x1000, 0x0)
r0 = socket$inet(0x2, 0x2, 0x0)
bind$inet(r0, &(0x7f0000000480)={0x2, 0x1004e20, @dev}, 0x10)
connect$inet(r0, &(0x7f00000002c0)={0x2, 0x4e20, @empty}, 0x10)
writev(r0, 0x0, 0x0)
write(r0, 0x0, 0x0)
r0 = openat$pidfd(0xffffffffffffff9c, &(0x7f0000002680), 0x0, 0x0)
lseek(r0, 0xfffffffffffffffe, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r1 = gettid()
prctl$PR_SET_MM_EXE_FILE(0x23, 0xd, 0xffffffffffffffff)
tkill(r1, 0xf)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
setsockopt$sock_timeval(r0, 0x1, 0x1b, &(0x7f0000000000), 0x10)
timer_create(0x0, &(0x7f0000066000)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f00009b1ffc))
timer_settime(0x0, 0x0, &(0x7f0000000040)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
clock_nanosleep(0xfffffffffffffff0, 0xffc99a3b, &(0x7f0000000100)={0x77359400}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x14, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000200)=<r0=>0x0)
timer_settime(r0, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x3938700}}, 0x0)
setsockopt$sock_int(0xffffffffffffffff, 0x1, 0x0, &(0x7f0000000100), 0x4)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
sendmmsg$inet6(r0, &(0x7f00000075c0)=[{{&(0x7f0000000240)={0xa, 0x4e22, 0x0, @dev}, 0x1c, 0x0}}, {{&(0x7f0000000380)={0xa, 0x0, 0x0, @initdev={0xfe, 0x88, '\x00', 0x0, 0x0}}, 0x17, 0x0}}], 0x2, 0x0)
timer_create(0x0, 0x0, &(0x7f0000000080))
r0 = openat(0xffffffffffffffff, &(0x7f0000000080)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
timer_getoverrun(0x0)
syz_emit_ethernet(0xa2, &(0x7f0000001180)={@local, @link_local, @void, {@ipv6={0x86dd, @tcp={0x0, 0x6, "ba3673", 0x6c, 0x6, 0x0, @private2, @local, {[], {{0x0, 0x0, 0x41424344, 0x41424344, 0x0, 0x0, 0x1b, 0x0, 0x0, 0x0, 0x0, {[@timestamp={0x8, 0xa}, @nop, @sack={0x5, 0xa, [0x0, 0x0]}, @mptcp=@remove_addr={0x1e, 0x3c, 0x0, 0x0, "531d03136ed08009e58626699f61b33b727c457422cba7205fb0112ebfa9679a79bf4614d143c32e6ad7fce93ddddfacc49da90d81d2561359"}, @mss={0x2, 0x4}]}}}}}}}}, 0x0)
io_setup(0x0, &(0x7f0000001100))
msync(&(0x7f0000952000/0x2000)=nil, 0x87abbe8d1cc6ad9, 0x4)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
fgetxattr(r0, 0x0, 0x0, 0x0)
rt_sigreturn()
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
unshare(0x8000800)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
r1 = inotify_init1(0x0)
inotify_add_watch(r1, &(0x7f0000000040)='./file0\x00', 0xd40001e1)
rt_sigqueueinfo(r0, 0x39, &(0x7f0000000000))
clone(0x30005100, 0x0, 0x0, 0x0, 0x0)
getresgid(&(0x7f0000000100), 0x0, 0x0)
r0 = gettid()
tgkill(r0, r0, 0x10)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000180)={<r0=>0xffffffffffffffff})
sendmsg$inet(r0, &(0x7f0000000880)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000740)=[@ip_tos_int={{0x14}}], 0x18}, 0x0)
clone(0x54041bc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
rt_tgsigqueueinfo(r0, r0, 0x16, &(0x7f00000000c0))
ptrace(0x4206, r0)
ptrace$cont(0x7, r0, 0x0, 0x0)
r1 = gettid()
r2 = gettid()
tgkill(r1, r2, 0x24)
ptrace$peekuser(0x3, r2, 0x2)
clone(0x4000c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000005fc0)={<r1=>0xffffffffffffffff})
getsockopt(r1, 0x6, 0x9, &(0x7f0000000000)=""/173, &(0x7f0000000100)=0xad)
tkill(r0, 0x18)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
prlimit64(0x0, 0x7, &(0x7f0000000000), 0x0)
pipe2(0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000080), 0x0)
get_mempolicy(0x0, 0x0, 0x0, &(0x7f0000ffa000/0x2000)=nil, 0xe)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
setsockopt$inet_buf(r0, 0x0, 0x40, &(0x7f00000000c0)="453c141d3a421aa49db50f9ba71007c9a2b36c78504ed695177660e5ee6db4c50938e538740277d27dbffc7cd26668f394790b653a845d3614893ae7bdcd0b7571cea390f5dbfd051804adba482917d1146132c2e7f0d0d63e936ca2880df2b1", 0x60)
r1 = gettid()
tkill(r1, 0x18)
r0 = socket$inet(0x2, 0x3, 0x1)
sendmmsg$inet(r0, &(0x7f0000001b40)=[{{&(0x7f0000000040)={0x2, 0x0, @remote}, 0x10, &(0x7f0000000280)=[{&(0x7f0000000080)="181e", 0x2}], 0x1}}, {{&(0x7f00000002c0)={0x2, 0x0, @broadcast}, 0x10, &(0x7f0000000380)=[{&(0x7f0000000300)="ad9b", 0x2}], 0x1}}], 0x2, 0x0)
openat$ptmx(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
pselect6(0x40, &(0x7f0000000000), 0x0, &(0x7f0000000080)={0x9}, &(0x7f00000000c0)={0x0, 0x3938700}, 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
connect$inet(r0, &(0x7f0000000040)={0x2, 0x0, @empty}, 0x10)
setsockopt$inet_int(r0, 0x0, 0xb, &(0x7f00000000c0)=0x2, 0x4)
write$binfmt_elf32(r0, &(0x7f0000000880)=ANY=[], 0x483)
recvmmsg(r0, &(0x7f0000003f80)=[{{0x0, 0x0, 0x0, 0x0, &(0x7f00000003c0)=""/145, 0x91}}], 0x1, 0x2000, 0x0)
sched_getparam(0x0, 0xfffffffffffffffc)
poll(0x0, 0x0, 0x7fff)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f00000027c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
timer_settime(0x0, 0x0, &(0x7f0000000200)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
r2 = socket$inet_udp(0x2, 0x2, 0x0)
r3 = socket$packet(0x11, 0x3, 0x300)
ioctl$sock_SIOCGIFINDEX(r3, 0x8933, &(0x7f0000000080)={'syz_tun\x00', <r4=>0x0})
setsockopt$inet_mreqn(r2, 0x0, 0x24, &(0x7f0000000000)={@multicast1, @remote, r4}, 0xc)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x16}, &(0x7f00000003c0)=<r5=>0x0)
execveat(0xffffffffffffffff, &(0x7f0000001400)='./file0\x00', &(0x7f0000001680), 0x0, 0x0)
timer_settime(r5, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x3938700}}, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
getpid()
rt_sigqueueinfo(0x0, 0x0, 0x0)
r1 = openat$null(0xffffffffffffff9c, &(0x7f00000001c0), 0x101202, 0x0)
ftruncate(r1, 0x0)
prctl$PR_SET_SECCOMP(0x27, 0x0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000700)={0x2, &(0x7f0000000240)=[{0x50}, {0x6, 0x0, 0x0, 0xffffffff}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
socketpair(0x1, 0x1, 0x0, &(0x7f00000000c0)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
getsockopt$inet_tcp_buf(r1, 0x6, 0xb, 0x0, &(0x7f0000000100))
rt_sigqueueinfo(r0, 0x3a, &(0x7f0000000000))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat$urandom(0xffffffffffffff9c, &(0x7f0000001600), 0x1, 0x0)
writev(r1, &(0x7f0000001b00)=[{0x0}, {&(0x7f0000001740)="b63d1fc11905ab8fa430ca2766ed6206000a2ada7f5017a798483c5b52381d9d6807da68ba81b9a600aec089f3", 0x2d}], 0x2)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
poll(0x0, 0x0, 0xc3d7)
clone(0x2008321cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
timer_create(0x0, &(0x7f0000000500)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
timer_settime(0x0, 0x0, &(0x7f000006b000)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x14, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000040)=<r1=>0x0)
r2 = socket$inet(0x2, 0x3, 0x6)
getsockopt$IPT_SO_GET_ENTRIES(r2, 0x0, 0x41, &(0x7f0000000000)=ANY=[], &(0x7f0000001040)=0x338)
timer_settime(r1, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat$ptmx(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$TIOCGPGRP(r1, 0x540f, 0x0)
rt_sigreturn()
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
sendto$inet6(r0, 0x0, 0x0, 0x0, &(0x7f0000000000)={0xa, 0x4e23, 0x0, @dev, 0x2}, 0x1c)
openat$thread_pidfd(0xffffff9c, &(0x7f0000000040), 0xc8040, 0x0)
clone(0x20081004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = memfd_create(&(0x7f0000000300)='c\x9c\\\xd4\xa4-K\x98.U\xb4#\xe1)\x94:\x04\xff\x96\x13\xae\x83\x1d##\xe8A\x0fB\x13\xb7\xc9\xcc\x8c\xacn(sN\xe4\xfb?\xc5\xd0\"`\n)\xf0\xfc\xfb\x8fY\xa8\xc8\xa6\xe7\x97\xac3\'\x02m\xc1\xbfBR\xbe\xef\xb16\xe57\xb9\x13\xc4\x81j\x10\xaf\x95e|\x90\xf7\x99V\xfa\xc0&\xf2\xb8N\xb6\x1d\x8cG\xb9\xe7\xa5\x1d&\xc1\b\x8b\x88\x144r?3\xb6\x01#\xe2\x8c`QV\x9eA\xe4\x88C\x81\xc5\x01P\xd1^=', 0x0)
write$cgroup_pid(r1, &(0x7f0000000040), 0x12)
mmap(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x4, 0x11, r1, 0x0)
sendfile(r1, r1, &(0x7f0000000100), 0xa5ff)
tkill(r0, 0x25)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = gettid()
semctl$SEM_INFO(0x0, 0x0, 0x13, 0x0)
tgkill(r0, r1, 0x24)
r0 = open(&(0x7f0000000300)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f0000000040)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', r0, &(0x7f00000000c0)='./file0\x00')
mkdirat(r0, &(0x7f0000000200)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x0)
creat(&(0x7f0000000000)='./file0/file0\x00', 0x0)
r1 = open(&(0x7f0000000300)='.\x00', 0x0, 0x0)
renameat2(r1, &(0x7f0000000400)='./file0/file0\x00', r1, &(0x7f0000000140)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x4000000206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f00000001c0)='cmdline\x00')
exit(0x0)
r1 = gettid()
tkill(r1, 0x18)
read$FUSE(r0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x3d004500, 0x0, 0x0, 0x0, 0x0)
r1 = fork()
ptrace(0x10, r1)
ptrace$setregset(0x4205, r1, 0x2, &(0x7f0000000640)={0x0})
r2 = gettid()
r3 = gettid()
tgkill(r3, r2, 0x3b)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0, 0x12, r0, 0x0)
exit_group(0x0)
clone(0x106300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r1, &(0x7f0000000180)=@file={0x0, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
rt_sigreturn()
syz_emit_ethernet(0x2a, &(0x7f0000000000)={@local, @dev, @void, {@ipv4={0x800, @igmp={{0x5, 0x4, 0x0, 0x0, 0x1c, 0x0, 0x0, 0x0, 0x2, 0x0, @rand_addr, @broadcast}, {0x16}}}}}, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x3c01e900, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = signalfd4(0xffffffffffffffff, &(0x7f0000000300), 0x8, 0x0)
renameat2(r0, &(0x7f00000002c0)='./file0\x00', r1, &(0x7f0000000340)='./file0\x00', 0x0)
r0 = socket$unix(0x1, 0x5, 0x0)
bind$unix(r0, &(0x7f0000003000)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0xc)
listen(r0, 0x0)
ppoll(&(0x7f0000000140)=[{r0, 0xe222}], 0x1, &(0x7f0000000240)={0x0, 0x989680}, 0x0, 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
getsockopt$inet_mreqn(r0, 0x0, 0x2, &(0x7f0000000040)={@loopback, @remote}, &(0x7f0000000100)=0xc)
clone(0x40005380, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$packet(0x11, 0x3, 0x300)
listen(r0, 0x0)
r1 = gettid()
tkill(r1, 0x18)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000040)='fd\x00')
fremovexattr(r0, &(0x7f0000000000)=@known='user.syz\x00')
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setpgid(0xffffffffffffffff, 0x0)
r0 = gettid()
rt_sigqueueinfo(r0, 0xa, &(0x7f0000000080))
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = open$dir(&(0x7f00000000c0)='.\x00', 0x0, 0x0)
getdents64(r0, 0x0, 0x0)
rt_sigreturn()
r0 = socket$unix(0x1, 0x1, 0x0)
sendmmsg$inet(r0, &(0x7f0000000700)=[{{0x0, 0x0, 0x0}}, {{&(0x7f0000000200), 0x10, 0x0}}], 0x2, 0x0)
clone(0x1004e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x180000e, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = shmget$private(0x0, 0x4000, 0x0, &(0x7f000056f000/0x4000)=nil)
shmat(r2, &(0x7f0000420000/0x11000)=nil, 0x0)
rt_sigreturn()
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
r2 = dup3(r1, r0, 0x0)
r3 = dup3(r1, r2, 0x0)
setsockopt$inet_tcp_int(r3, 0x6, 0xc, 0x0, 0x0)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$packet(0x11, 0x3, 0x300)
bind$packet(r0, &(0x7f0000000040)={0x11, 0x0, 0x0, 0x1, 0x0, 0x6, @link_local}, 0x13)
r1 = gettid()
tkill(r1, 0x800000009)
clone(0x4000c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket(0xa, 0x2, 0x0)
setsockopt$inet6_mreq(r0, 0x29, 0x15, &(0x7f0000000140)={@dev}, 0x14)
r1 = gettid()
tgkill(r1, r1, 0xf)
r0 = syz_open_procfs(0x0, &(0x7f0000000040)='statm\x00')
pipe(&(0x7f0000002500)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
sendfile(r1, r0, &(0x7f0000000000), 0x9)
syz_emit_ethernet(0x22, &(0x7f0000000000)={@link_local, @local, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "1c3400", 0x0, 0x0, 0x0, @initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, @mcast2}}}}, 0x0)
clone(0x2e380, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = socket$inet6(0xa, 0x2, 0x0)
bind$inet6(r1, &(0x7f0000000280)={0xa, 0x0, 0x0, @local}, 0x1c)
connect$inet6(r1, &(0x7f0000000000)={0xa, 0x0, 0x0, @initdev={0xfe, 0x88, '\x00', 0x0, 0x0}, 0x800}, 0x1c)
tkill(r0, 0x3c)
sendmsg$sock(0xffffffffffffffff, &(0x7f0000000040)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000000)=[@timestamping={{0x14, 0x1, 0x25, 0x2}}], 0x18}, 0x0)
r0 = openat$tun(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
ioctl$TUNSETIFF(r0, 0x400454ca, &(0x7f0000000000))
ioctl$TUNATTACHFILTER(r0, 0x401054d5, &(0x7f0000000200)={0x1, &(0x7f00000001c0)=[{0x6}]})
ioctl$TUNGETIFF(r0, 0x800454d2, &(0x7f0000000080)={'dummy0\x00'})
clone(0x200800059fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
getsockopt$IP6T_SO_GET_ENTRIES(r0, 0x29, 0x41, 0x0, &(0x7f0000000040))
exit_group(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
fgetxattr(r0, &(0x7f0000000080)=@random={'security.', '*@\x00'}, 0x0, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
chdir(&(0x7f00000001c0)='./file0\x00')
setxattr$smack_xattr_label(&(0x7f0000000140)='./file0\x00', &(0x7f0000000100)='security.SMACK64IPIN\x00', 0x0, 0x0, 0x0)
rt_sigreturn()
r0 = openat(0xffffffffffffffff, &(0x7f00000002c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
capset(&(0x7f0000000280)={0x20071026}, &(0x7f0000000400)={0x1ff})
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
pipe(&(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
syz_open_pts(r0, 0x0)
r1 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt$bt_hci(r1, 0x0, 0x3, 0x0, &(0x7f0000000200))
ioctl$BTRFS_IOC_QUOTA_RESCAN_WAIT(r1, 0x942e, 0x0)
rt_sigreturn()
clone(0x100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
wait4(0x0, 0x0, 0x40000000, 0x0)
exit_group(0x0)
r0 = fork()
tkill(r0, 0x13)
wait4(0x0, 0x0, 0x8, 0x0)
tgkill(r0, r0, 0x12)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000200)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
connect$unix(r0, &(0x7f000057eff8)=@abs, 0x6e)
sendmmsg$unix(r1, &(0x7f00000bd000), 0x318, 0x0)
recvmmsg(r0, &(0x7f00000000c0), 0x10106, 0x0, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0, 0x12, r2, 0x0)
sendfile(0xffffffffffffffff, 0xffffffffffffffff, 0x0, 0x0)
perf_event_open(&(0x7f0000000280)={0x0, 0x70, 0x2a, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xffffffffffffffff}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
sync()
io_setup(0x6, &(0x7f0000000300)=<r0=>0x0)
r1 = openat$tun(0xffffffffffffff9c, &(0x7f0000000480), 0x2, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x1, 0x0, 0x0)
io_submit(r0, 0x4, &(0x7f0000000280)=[&(0x7f0000000240)={0x400000, 0x0, 0x0, 0x0, 0x0, r1, &(0x7f0000000340)="8081000000010000000000224dbdf68f4c5c1f32b3c480c5", 0x18}, &(0x7f0000000180)={0x0, 0x0, 0x0, 0x7, 0x0, r2, &(0x7f0000000140)="dd", 0x1}])
r0 = socket$inet_udp(0x2, 0x2, 0x0)
close(r0)
r1 = socket(0x840000000002, 0x3, 0xff)
setsockopt$SO_BINDTODEVICE(r1, 0x1, 0x19, &(0x7f0000000040)='syz_tun\x00', 0x10)
setsockopt$sock_int(r0, 0x1, 0x6, &(0x7f0000000080)=0x32, 0x4)
connect$inet(r0, &(0x7f0000593000)={0x2, 0x0, @broadcast}, 0x10)
sendmmsg$inet(r1, &(0x7f0000000a40)=[{{0x0, 0x0, &(0x7f0000000000)=[{&(0x7f00000001c0)="9061d4d40000000900ce00"/20, 0x14}], 0x1}}], 0x1, 0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000380)={0x2, 0x4e22}, 0x10)
listen(r0, 0x0)
syz_emit_ethernet(0x3e, &(0x7f0000000000)={@remote, @remote, @void, {@ipv4={0x800, @tcp={{0x5, 0x4, 0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x6, 0x0, @dev, @local}, {{0x0, 0x4e22, 0x41424344, 0x41424344, 0x0, 0x6, 0x7, 0x2, 0x0, 0x0, 0x0, {[@mptcp=@mp_join={0x1e, 0x3}, @mss={0x2, 0x4}]}}}}}}}, 0x0)
clone(0x38004100, 0x0, 0x0, 0x0, 0x0)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000080)='uid_map\x00')
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
write$tcp_mem(r0, &(0x7f00000001c0), 0x48)
rt_sigreturn()
clone(0x41fe, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = openat(0xffffffffffffff9c, &(0x7f0000000100)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f0000002140)='/', r1, &(0x7f0000d06ff8)='./file0\x00')
r2 = openat(0xffffffffffffffff, &(0x7f00000002c0)='/proc/self/exe\x00', 0x0, 0x0)
read$FUSE(r2, &(0x7f0000002f40)={0x2020, 0x0, 0x0, <r3=>0x0}, 0x2020)
setuid(r3)
getxattr(&(0x7f00000000c0)='./file0\x00', &(0x7f0000000140)=@known='trusted.overlay.metacopy\x00', 0x0, 0x0)
tkill(r0, 0x18)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x10012, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
preadv(r0, &(0x7f0000001580)=[{&(0x7f0000000040)=""/237, 0xed}, {&(0x7f0000000140)=""/162, 0xa2}], 0x2, 0x0, 0x0)
rt_sigreturn()
r0 = open(&(0x7f0000000300)='.\x00', 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
lseek(r0, 0x0, 0x0)
r0 = socket$packet(0x11, 0x2, 0x300)
ioctl$sock_ifreq(r0, 0x8917, &(0x7f0000000200)={'lo\x00', @ifru_names})
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
setsockopt$inet6_tcp_int(r0, 0x6, 0x3, &(0x7f00000001c0), 0x4)
r0 = epoll_create(0xa)
r1 = openat$null(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
renameat(r0, &(0x7f0000000100)='.\x00', r1, &(0x7f0000000140)='./file0\x00')
pipe(&(0x7f0000000040)={<r0=>0xffffffffffffffff})
listen(r0, 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
setsockopt$inet6_tcp_int(r0, 0x6, 0xa, &(0x7f00000001c0), 0x4)
r0 = socket$inet_icmp(0x2, 0x2, 0x1)
getsockopt$bt_hci(r0, 0x0, 0x1, &(0x7f0000000400)=""/202, &(0x7f0000000180)=0xca)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER(0x1, 0x0, &(0x7f0000000040)={0x2, &(0x7f0000000000)=[{0x61, 0x0, 0x0, 0xffffffff}, {0x6}]})
rt_sigreturn()
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
sendmmsg$unix(r0, 0x0, 0x0, 0x0)
rt_sigreturn()
perf_event_open(&(0x7f00000003c0)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3c40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3ff}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
set_mempolicy(0x4001, &(0x7f0000000140)=0x100000000000001, 0x9)
set_mempolicy(0x3, &(0x7f0000000280)=0x7, 0x7)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x0, &(0x7f0000000180)="b3074f6949263fef968bcd19771981676b0cc939d529dc0f2a6b89a3aefc1f692e5521a143c243b1d597521155ea9ec6a3dce5c3b571a2faee928e07593751a99aea2fbd388e76a5d08f7ad04b90280e9e069f0a6ebf887966bbab0321e4db8ecf4f2f23586a887fd514a79b60f75353ecc8bcef4dfd3fe94efbc47c531dde35cf9c233d874b9b245897444e6f0047c28ae5fac9", 0x0, 0x0, &(0x7f0000002440)="c5a93675e16b3c411ea199ce59b7b9592eac1cfff0de86b0f7e80a1ad238d482d480f52a22caf0d1563671544fa51ec0b487277318775afe2588e7ae863a6337a9c4a98fa3657233820b6ae04f33fcbb57863cba695ad30d6b1403bc1d095549236c499b264d9a354b278515dcce6028c4a1ae15774482c32541d6c9635d65b3f9a98125c7e0e0a443a0a572e94eb146c11da782fadb62ba4581fd709694ff35096b3aa67f4752caafca0369891646cbacd2b0c64b6013ca21d6e6847850421c8b4bea843049e0780338529033a14d8f58469c54af78aea71bb5a6dccf89480aec32b9d62c9dd8aec88258ef99741589dc044bb51e85339a78edf3a321b900021630f594741958d0fd9c1d9cc88f8644847476486533dbbfbd6360803b77bd80a91d0230c83a45ea544847cdf5c9bd2354194a91df4c54a09877aa393406cde44667ead41e6f7389c584b27c8c45d91fa9fd7cc6a44063cba9fd373b204720343cd26e5123e0b0123b6717bc0cffe3e0265e36e96046bd098cd1e3ae4de5a7682f5d1f7ad02398605558fe5b222fd3baa959b7853b94114166b5220e763f9d064f8f53000ab44f2134e519ea69e62930dba1df6865d1397e2487595abbf9733ae2023b477952b07c134716d61b041e5b94a4cd52ef3c13194d2c1461ba9f2bf13e54650682049a50eaddf958b267dc5a60d7d17f7a09829b63392553dd2c031de31456f55009b5f1d206649974b033d14ea65903327ab8633ebc249702843a8a65694f0517b562fe1727ece055501f59e1a070551c90f634375188311bf18e3a7771f87f607c6f9227e313ce92cc4b1b5347afad5f036a3dc4604234c2d2c79fc9df725e31e081816c43cb90f2daa04ee058d25f2faa91a7fb4582eaf5e081b8601c8858b35117143ffd2633536ca65bf057d137f12b22bee93d07859586677c7e52e85aeaadc7934cd196714172faf58d8b49bd5bdec1bb3638a136f4bf2f4c9c243c05b6000785a5a99c35291b2237db9018b28cfc15d6bdea11793380b8180416c99433cfe156baf29ffeb06ce5ec0ee5d760e2e5bee0d1e84d90ecb19ce6c59b7e50c6ccd3326666a3ae6924f8afef93daa15563d4d6c7b88bdceab6cdbe1c382a894d12f3bb9b914d5838501492d102bdff7ec65c7db4be04bc6a9e5a0b8c9a779df2f53ebfb7904122a6317a6a3ec32717059416e87f1b86348892cf614c48dc23c6b7d367d9e2fd0bc349174c134027d8910ca03ee5cdc30608b05a4a6ce3889a997594e5f0421278b4b204d6d2abad68a508f2b08bea375b392050ecc3db2cefa765b4e8429bb993ca6ea5107afb188c9b4d797191830cb6b862a636c7fbdc6d9d0bf1fd234df4ae36d89f56abd30d9d4b730993578958ea1ad0845195fa0be8d6245c6b2fbb4534a9073a6171bc9b05294511480cdc00163d4d73347b62e6e2143106f3c5ead2b77716e27787a403e322004af7babfc15fa8dc5edab12bdb53f406dfe71ddd8ea6e108116262ea8b1ac6602c40e8aff308f7634152e7370840031816475bae16b72346f8bae35d1af1a67591a2f8dde31f4ea94e5e7e95b87821bb86e8efe3b5fbe5bb631d08a54c0e9e7f1a1224fe79ff90fdd47db663453f8df40bb766d1ad1956270a3cebdd39a52a90a9cc3678742e6942ed7c48cb2b0ee4159a1a949eb9c927853ca33472888bf211a3794985cf7c66920d6ef0ed5a4a6d1ccfe5b55c14ff9059b0878e9a5b0c172eb3ec43a537e98eb8900073342cc1ee638749917dbcab56c02c74a96731fd13f9e345a83640daba1084b9b4591786b2f1d85d19a24d4288384200c0c52041c1d6ec3c2b6721c776a7627e9cc4aab9e32a86f0a156f4ea7bca143a6bcadd675009dc9cd22f57c2279a2116782c7fbfeaa7b7e63388aea374dd505038bf97ea25c2f9e82fa93f0369bf2dbe5661aee1be7aa8f7099c8c4d4346583ef159370edcf5914da17ac09fb1b27a8056332f9a21913ab6a37156b250a490a57f7462c306ea71c0756b8e17b7b13396d7627a2c9b694f615c779b1a5fe4ac2b3f70e231a3fbfd19351bd3415cf6e688ec10954f43c6787c5c469092c57ddf21af18c5ec570d5bb5b379631db0eba31d6b1734f481ab13a42f561deb9c938f63ab209cd1c9e0141333c8d8f6cca6adef4fd556d863c8e125e062c90a495f21feb4cce6b2d42c636ae1255905a42f6f0afc9c33bd4db2d40f8a201558a016fd0e51aa61f62dedbddd79f9410fbbff1ca93476b2b48cac750c58176301c961fb92d21431a804fbefc4c2339eecf1e711ccb31fa1ac1c2cb5bed30260b6194b37b4625945d1e2087162b6cafb342aa2210a69ccad2b14bbddaf2b3021a2a8c214dd6be14ef6fb4ed1236696794e1592f777dc88b466dc1d31cd6e8c22b15eb06c16f4f25421a12c0c483e4096e9c3ae422bc1f7e803ec29e3ecfdea16b8fcc014ef2e381f33ce5dd89d7ad546209099fa9585f45753e062c5c1876092ba44ba288ffaa4bd5f852b774a1da484872e890fd72ed62f81e664faa8b2ec92007c0849e261d86149ea06422aaaafde309d57ce12eb0f1754cea31218c6f625e95e55b7fda457a94aec019f17b5c20aa310301671427d6b505ab5374f5b36a598d0e774c5116673c6ad6ff21d4298f21d124d1aafecea9ae49adc5909661076272434e7d192d36162820969e125f5cad75d7cc9d3780d1329fb68f813b8abb576fe3edc8cd44734fd35f158afde83f2483cc9d98d0aeaa0389eb33404ac143aee2c66e330367472da28d5056076ae908b718b6ec278a2d3129b492315e5ca8dda301ced3385f805c4f97e778467bd8ac76b77406a235dd340c409137b0ed18797c5203f7a0ae8b85e3d52223b2d15774a6c7851fff275d058c9d48dd4c1b8bee46bdab54e481007d37ce47e30c3862deef6629036f3f628a40f63829e713b42a4ae7c787e2ab1fb6e622959c5922a13d5d502722215d94b2af600a5cf6dd3aea7379f94c291df4f35316fc96a17332d713e7ed6c49b683ed68e8e95d37f0ca33c7d334ccf7cd29c6e10fd463a2ebf151a56933fe46ec591af155719f3023a4d6aed5b8b1b7eda044116b846720dfaba71761ae8100287fbc0aaaafdb1c74184bcaa385caca23b93a5338e77f7384a002bd53d329750d8208f06de94d962d764aad7a940d11cbd74b61b0dabd9e0e0feb77c5126570032231a80d62da3b58960328a7f54ac4c7154ce8e952d791288209d29bd981c02bf4e957b6e1f91c750c6d6fad88e45f146967285d51dcb1558a18193ad0676463114f4fc60d07b1bd93e2aae0a52bf6e122af702e800fd73b85cd23ea812fe738204dbd027ebe02524e45890fd926dc7d338d1566e4bc273f25adc304677c298c16036992c177b57aa33a9798861389cfdb228ddd0b940835e3c40934c4537de93276a1222160d3154a974700e91e43b170f7eb1b56ecfb50cde5e9b04cccd99e92a5ff64d08a1a9b3928b59eae0f976764ea82f9dc74049b95f960ac5e748963d202d818d97dd07ae712560bd0fc46ab23ba3986fc0b9ce2cafc227a17db8de30e66f1f142064849657fdf658ddfb9e3430580cd9860c26a7d19cfc640d5e96eda9f381c39ad98e4c764c030deecf907599776b81cf963f58dc3861699b06e8c55931bba8ea0296a332d29e80e18c6f4bf50d7274e53839a5a1a9a1ef75751c4253ee0116630ecefb4f22f37d231ce62be0422ef36318f69d6f50d0e1fd84e2972b01a5e3430d7a32d6c6b7e695bfd2a094dd1a185cdb0285982a0c0abe542fc5a1c18b5f6f2b7bc502eabc0cca70e9da0289946ce8a837442a7e77e331a7bd8742067e5685dca364b415fd2fc65eace91f5bcec0b820d5d724d1092fae7c61b54ddc37c470474decf6dcca5695641b32ac312bc8c7c69d1dc9c8b99fa27e89fc6150fab262c3f41b979d1eb2bcfb93e371fc2866862e5f3f07cd18de69e0839af4db030a8a4d4eaa647cd45810b09a2a92dd81f9bbd2238e784c8a36a43fa7b1049f55ff5a326ea402b3e1a13ab8c45c7c753e6781cae2955aaf23c5d7e2bf926576e49f179ac199d72b29f69bc1e77110e2842e907697ef9979df8c9a26db34c15cb14d852b68bd736a295f27efcb7809b0d6c2af58651cb46a74be5ddbf0e794913eeb9377b540b047355fc92d6659bca4a98c5de3e3d7984093ce0a3df2edb6d3b17f4d0dcd1c9cbb7c0950f87af6589974ca19f9cefd40e45c75f15e200d1fb987e53c10ca08809e596a684b7a14de2aadf105f3a58a2d89290f8e74f8429dbf5441d6f91b46923ad74a83dd96997a1305688118d0d27a32a1ea7f55e5be42684a300730e1f1ca60b7f9d8e056d1432478cb4ad407ad416e41bca1a7edbb58146a19d6c38195b4d80a9afa0ff1e2549d0f16b9295f1d6729545e5a18e8aacb9b895fcb06603a92510700c07616efeb71506aac3e429336d48f375432ae906492a61f071ec30d3c9d86ce5b2a9b74ab2cdd80e4c58344f5d4d3c5660fb525aac559c1c445fb98ee52eb554acc27e6f406062323cc1d56ea5136aad5e66fb948beba0d9a2bde32fa8cbeb7ebf069d330e4c6149843aa3c6d6c9c7a8d3390a6f8fe7cb7d7942ba5c4f7cbf235c4ffb4eb39d1e4c13e9662338da918be97101cd56a8a1d1c80b9a775e995f380774efacc322b3ce90e3d45c5091d08f24064d99ff9698a44247c84e8776167492b3f041a10280350dd0706c2d39066763a95a4ae7ae9e91297c1fd8a0bc5d720bdd99aec83dc32e074e14b5e6d77ac19830b53334f4f72d9ad45f6ba85db1059edfc5aa6f301108d2b456808aa3071e144100bcbee7bdbb8fae8a60f0a2ca0e7ef5c416ee68b61f22bad3b5ac6d32b0d03dca014012c37b2526a85f8d6760d7e2a6b319ae82cdb8ca7da56471217c337095e8c5dd8a61dbee466f5821347d16cbc58809bccec4a84321b78acd91958ddc6242a2fe1324c3f01001da697ccc46959ac9a57cfc02c0f3cb1df0f38b628cd26ab93462925655c767e46a968ec734625f0cdc07ff6e700005851e809b89f31726504cd7eb27849b72ada0042126d006c37bd7214c383831322bd6945ed53c669395ebea2f664141ae588a51c3ef8b1f04e74ce5c6ceedcb822ee87b31b534b18ff775941c5b32bfbbb738535b8fb89af1d866639b7d18aa980bf285c6f5a663eac3ec2f12309406f1c0fc8bfa308513716533044bac6b0bfa72a1451c6ce47d5aab555d9ff622573c161146461cd3affe6303e2318e5611270d1ae4c34d02666d437cc2348f8e1ede26da74f13d8394ef94afead43525b00789cc9124b9c031b4dbc1241ecf369473f571532a87483793bd0570c611611c346f5f27a5c32d31e7e565fb09085720bd5a032fb8e00f4f40ca09a4c4c483179cd6897e013c15ab5df398b7a8d9113de3252dea6c0a3909a4595e994924abbf4f397479dbca0a091132fbbefcb9a17afb8e6b534372217c61d24989086b8a2aa7d19154e35fd90c20c24de5879886f966097d81ca79bcb9913cca9857dafc16941807da23eaa3cee88bc7fd0c4fcf5072d69368ac4b886a543c64feaf573e64875dfe359261943e7")
set_mempolicy(0x2, &(0x7f0000000380)=0x40, 0x0)
clone(0x1000800, &(0x7f0000000440)="f9547a6a7d36ef88c3be02b155b564d6a016f05150e3f6554814252a44f18585f2502f39a55728bff6eb8c38830b76d003bd005d843aa705b3fd7e4a054f076a37e690852055bc29f6d34dcee08e09b7bef7d302240ed9fb7a3e9abd0fa5ce237a1b82d73a371603e5d4a768925d789b0b2e2b4130fc6d3306723125b01e6beec3f9db", &(0x7f0000000040), &(0x7f00000002c0), &(0x7f0000000580)="f911091112126c17f435b67de3356026ca65f0ef4287f5662f904591b905ca457f4fee4a30afd9b832884e21d35fa689da91eb40501bca2d04ce2bff780b7c02a37c0fc7f6236746c77642f81370ea9ca1c9d59dca6c025f2f0b205d12dd5d3943b8c83d2adf416e7c39767c745e314cf306cc3b5c5cd1494c059880205e382023fa57dbbbae9c9410db40586ede8c40f7abdce4b2f8bcf3365f2b39ea4ef6ad6a75275527c5abfb741b9761d4c59f204f930539d8bd2c574b14985468364a4325deb79a41578e2bcf93cf4f")
exit_group(0x0)
syz_emit_ethernet(0x32, &(0x7f0000000680)=ANY=[@ANYBLOB="0180c200000ebbbbbbbbbbbb080045000024000000000011907800000000e000000100004e22c61d79001090780200000000000000a60be2814cc549bd2aebababa06c2435056f42077f49f7667e4ad4c132bf86873a6a219c6e5612e54d36c1bcbf282e808445fedb0d8a45c9e70f49e3873edb765d6d9923e26b819ead734c3ab392e0d79b91450740b7f45bfe43d3838d2c58cae282df7d1fdd2b22eb5bed83858d6fe3b6553eafc6"], 0x0)
syz_open_dev$ttys(0xc, 0x2, 0x0)
socketpair(0x23, 0x0, 0x7fff, &(0x7f0000000240))
io_setup(0x79d0, &(0x7f0000000000))
pwritev(0xffffffffffffffff, &(0x7f0000000080)=[{&(0x7f0000000340)="d4", 0x1}], 0x1, 0x40000c9, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$unix(0x1, 0x5, 0x0)
recvfrom$unix(r0, 0xfffffffffffffffe, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffffff, &(0x7f0000000240)='/', 0x0, 0x0)
r1 = gettid()
r2 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000001180)='auxv\x00')
r3 = dup2(r0, r2)
getdents(r3, &(0x7f00000000c0)=""/143, 0x8f)
getdents(r3, &(0x7f0000000000)=""/92, 0x9a)
r4 = gettid()
tgkill(r4, r1, 0x2d)
getitimer(0x1, &(0x7f0000000300))
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rt_sigaction(0x0, 0x0, 0xfffffffffffffffe, 0x0, 0x0)
rt_sigreturn()
rt_sigreturn()
r0 = socket(0xa, 0x2, 0x0)
ioctl$sock_inet_SIOCGIFADDR(r0, 0x8915, &(0x7f0000000080)={'syz_tun\x00'})
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
setsockopt$inet6_int(r0, 0x29, 0x43, &(0x7f0000000780), 0x4)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
getrandom(&(0x7f0000000180)=""/161, 0xa1, 0x2)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
getsockopt$IPT_SO_GET_REVISION_TARGET(r0, 0x0, 0x43, 0x0, &(0x7f00000000c0))
link(&(0x7f0000000380)='.\x00', &(0x7f00000003c0)='./file0\x00')
clone(0x2006300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = creat(&(0x7f0000000000)='./file0\x00', 0x0)
pwritev2(r0, 0x0, 0x0, 0x0, 0x0, 0x1f)
r1 = gettid()
r2 = gettid()
tgkill(r1, r2, 0x24)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
syz_mount_image$fuse(0x0, &(0x7f0000000040)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
mount(&(0x7f00000000c0)=ANY=[], &(0x7f0000000080)='./file0\x00', &(0x7f0000000000)='proc\x00', 0x0, 0x0)
r0 = open$dir(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
getdents64(r0, 0x0, 0x0)
r1 = creat(&(0x7f00000013c0)='./file1\x00', 0x0)
r2 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ftruncate(r1, 0x7fffffff)
r0 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
setsockopt$inet6_IPV6_PKTINFO(r0, 0x29, 0x43, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f00000000c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = socket$inet_udp(0x2, 0x2, 0x0)
getsockopt$inet_mreq(r1, 0x0, 0x20, &(0x7f000000cac0)={@rand_addr, @multicast2}, &(0x7f0000000040)=0x8)
syz_emit_ethernet(0x56, &(0x7f0000000100)={@link_local, @empty, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "e60013", 0x20, 0x3a, 0xff, @private2, @local, {[], @ndisc_na={0x88, 0x0, 0x0, 0x0, '\x00', @dev, [{0x2, 0x1, "3a172b70e3fa"}]}}}}}}, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000000080)='task\x00')
fchown(r0, 0x0, 0xee00)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
timer_create(0x0, 0x0, &(0x7f0000000040))
timer_getoverrun(0x0)
clone(0x5100, 0x0, 0x0, 0x0, 0x0)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
rt_tgsigqueueinfo(r0, r0, 0x4, &(0x7f00000000c0))
r1 = signalfd4(0xffffffffffffffff, &(0x7f0000000000), 0x8, 0x800)
read$char_usb(r1, 0x0, 0x0)
rt_sigreturn()
syz_emit_ethernet(0x0, 0xfffffffffffffffd, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
perf_event_open(&(0x7f000025c000)={0x2, 0x70, 0x15, 0x0, 0x0, 0x0, 0x0, 0x1}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = openat$null(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
preadv(r2, &(0x7f0000000400)=[{&(0x7f00000001c0)=""/148, 0x94}, {0x0}], 0x2, 0x0, 0x0)
timer_create(0x0, &(0x7f0000000300)={0x0, 0x12}, &(0x7f0000000000))
timer_settime(0x0, 0x0, &(0x7f0000000040)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x14}, &(0x7f0000000040)=<r0=>0x0)
rt_sigsuspend(&(0x7f0000000080), 0x8)
timer_settime(r0, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000002240), 0x0, 0x0)
timerfd_gettime(r0, 0x0)
r0 = socket(0x10, 0x3, 0x0)
connect$packet(r0, &(0x7f0000000000)={0x11, 0x0, 0x0, 0x1, 0x0, 0x6, @dev}, 0x14)
r0 = socket$inet(0x2, 0x4000000000000001, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x80000000000002, &(0x7f00000005c0)=0x169, 0x4)
bind$inet(r0, &(0x7f0000deb000)={0x2, 0x4e23, @multicast1}, 0x10)
sendto$inet(r0, 0x0, 0x5, 0x200007fd, &(0x7f0000000040)={0x2, 0x4e23, @local}, 0x10)
recvmsg(r0, &(0x7f00000003c0)={0x0, 0x0, 0x0}, 0x0)
setsockopt$sock_int(r0, 0x1, 0x8, 0x0, 0x0)
socket(0x1, 0x803, 0x0)
setsockopt$inet_tcp_TCP_REPAIR_WINDOW(0xffffffffffffffff, 0x6, 0x1d, &(0x7f0000000540)={0x0, 0x5, 0xe3e6, 0x9, 0x100}, 0x14)
recvmsg(r0, &(0x7f0000000240)={0x0, 0xfffffffffffffd83, &(0x7f0000000180)=[{&(0x7f0000003ac0)=""/4096, 0x5801}], 0x1, 0x0, 0xf080}, 0x100)
write$binfmt_elf64(r0, &(0x7f00000000c0)=ANY=[], 0x1000001bd)
r0 = creat(&(0x7f0000000000)='./bus\x00', 0x0)
r1 = creat(&(0x7f0000000040)='./bus\x00', 0x0)
write$binfmt_elf64(r1, &(0x7f0000000000)=ANY=[], 0xfd14)
ftruncate(r0, 0x3ff)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x1, 0x0, 0x0)
write$binfmt_elf64(r0, &(0x7f00000000c0)=ANY=[], 0xfd14)
r0 = openat(0xffffffffffffffff, &(0x7f00000001c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x180000f, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mmap(&(0x7f0000000000/0xfbe000)=nil, 0xfbe000, 0x7, 0x31, 0xffffffffffffffff, 0x0)
r1 = socket$inet6(0xa, 0x2, 0x0)
connect$inet6(r1, &(0x7f0000000640)={0xa, 0x0, 0x0, @empty}, 0x1c)
sendmmsg(r1, &(0x7f00000092c0), 0x4ff, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
prctl$PR_SET_KEEPCAPS(0x8, 0x20)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
fallocate(r0, 0x0, 0x102000006, 0x6)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
seccomp$SECCOMP_SET_MODE_FILTER(0x1, 0x0, &(0x7f0000000340)={0x2, &(0x7f0000000300)=[{0x8008}, {0x6}]})
exit(0x0)
r1 = getpid()
tkill(r1, 0x8)
mkdir(&(0x7f0000000040)='./file0\x00', 0x0)
mount(0x0, 0x0, 0x0, 0x0, 0x0)
chroot(&(0x7f0000000000)='./file0/../file0\x00')
perf_event_open(&(0x7f0000000000)={0x2, 0x70, 0xb5, 0x2}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
symlink(&(0x7f0000000080)='./file0/../file0\x00', &(0x7f0000000100)='./file0/../file0\x00')
umount2(&(0x7f00000005c0)='./file0/../file0\x00', 0x0)
socket$netlink(0x10, 0x3, 0xf)
r0 = syz_open_procfs(0x0, &(0x7f00000000c0)='net/udp\x00')
read$FUSE(r0, 0x0, 0x0)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000001340)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
getsockopt(r0, 0x6, 0x2, &(0x7f0000000080)=""/171, &(0x7f0000000140)=0xab)
socketpair$unix(0x1, 0x3, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
pipe(&(0x7f0000000100)={<r2=>0xffffffffffffffff, <r3=>0xffffffffffffffff})
splice(r2, 0x0, r0, 0x0, 0x100420000a7a, 0x0)
write$binfmt_elf64(r3, &(0x7f0000000080)=ANY=[], 0xfffffd88)
r4 = socket$unix(0x1, 0x2, 0x0)
bind$unix(r4, &(0x7f0000003000)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0xc)
connect$unix(r1, &(0x7f0000000200)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = signalfd(0xffffffffffffffff, &(0x7f0000000000), 0x8)
openat$cgroup(r0, &(0x7f0000000140)='syz0\x00', 0x200002, 0x0)
rt_sigreturn()
mmap(&(0x7f00009fd000/0x600000)=nil, 0x600000, 0x2000007, 0x6031, 0xffffffffffffffff, 0x0)
mremap(&(0x7f0000a94000/0x2000)=nil, 0x2000, 0x800000, 0x3, &(0x7f0000130000/0x800000)=nil)
perf_event_open(&(0x7f0000000040)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x41be}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
mlock(&(0x7f00001f4000/0x2000)=nil, 0x2000)
ftruncate(0xffffffffffffffff, 0x0)
mmap(&(0x7f000005d000/0x400000)=nil, 0x400000, 0x0, 0x392d6ad36ec2c8b2, 0xffffffffffffffff, 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$inet6_tcp_int(r0, 0x6, 0x9, &(0x7f00000003c0), &(0x7f0000000400)=0x4)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
sendmmsg$inet6(r0, &(0x7f0000005180)=[{{&(0x7f0000000040)={0xa, 0x4e20, 0x0, @empty}, 0x1c, &(0x7f0000000400)=[{&(0x7f0000000080)="a907ab1416beed4fdfd4d4e5088f14bdbe694a53f89e81dfef341200d131ad79900a8790c84417ea91b66e914854653ad9e7fbe97ff53fa0f400000000", 0x3d}, {&(0x7f0000000180)="c7", 0xffba}, {&(0x7f0000000300)="221f3e", 0x3}], 0x3}}, {{0x0, 0x16, 0x0}}], 0x2, 0x0)
r0 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000040)='memory.events\x00', 0x275a, 0x0)
writev(r0, &(0x7f0000000000)=[{&(0x7f0000000340)='7', 0x1}, {&(0x7f0000000380)="0c7ac1d5e4331b0e98e38658c5ae29d859f3584a376767754a42fb7b12b7e865d321a91f32e984e475e97b5564127bca51c236ecf0cce33a7b3c3bb553777dfce700e69e570c7a413a01203a0661525a3cc570fbe589052e295508096320450ec7eaf81c097d4fb5310a05af8e9daee3f8d5a70343c47361d5b42e04f8032288fd7764077da5e98d64bf5cb5f1846dd131739fb555de6805e2e3e4a5ee86c9f24ab699786eafb699e409aa8f1d8a6c63613c228780cc2760922ced6d5e6c1063b8f3", 0xc2}], 0x2)
mmap(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x1, 0x10012, r0, 0x0)
rt_sigprocmask(0x0, &(0x7f000078b000)={[0xfffffffffffffffd]}, 0x0, 0x8)
r1 = getpid()
r2 = gettid()
rt_tgsigqueueinfo(r1, r2, 0x11, &(0x7f0000000100))
rt_sigtimedwait(&(0x7f00000000c0), 0x0, &(0x7f0000000200), 0x8)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
timerfd_create(0x0, 0x81000)
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = socket$unix(0x1, 0x2, 0x0)
bind$unix(r1, &(0x7f00000001c0)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
r2 = socket$unix(0x1, 0x2, 0x0)
setsockopt$sock_int(r2, 0x1, 0x10, &(0x7f0000000180)=0x7fffffff, 0x4)
connect$unix(r2, &(0x7f0000000080)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
r1 = dup(r0)
setsockopt$inet_tcp_TLS_TX(r1, 0x6, 0x1, &(0x7f0000001200)=@gcm_128={{}, "cc63c08025b8adee", "5f549bc1dedcf2f5557bf5dabe0efcbd", "2953dbd1", "0df4cc49d2d6f03d"}, 0x28)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_int(r0, 0x0, 0x21, 0x0, 0x0)
exit(0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = epoll_create1(0x0)
fchdir(r0)
rt_sigreturn()
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
flock(r0, 0xc80001aa8de59d5d)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
getsockopt$inet_tcp_buf(r0, 0x6, 0x12, &(0x7f0000000180)=""/140, &(0x7f00000000c0)=0x8c)
r0 = openat$null(0xffffffffffffff9c, &(0x7f0000002240), 0x0, 0x0)
fadvise64(r0, 0x0, 0x0, 0x5)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
mkdir(&(0x7f0000000240)='./file1\x00', 0x0)
mkdir(&(0x7f00000000c0)='./bus\x00', 0x0)
mkdir(&(0x7f0000000040)='./file0\x00', 0x0)
mount$overlay(0x400002, &(0x7f0000000000)='./bus\x00', &(0x7f0000000100), 0x0, &(0x7f0000000680)={[{@upperdir={'upperdir', 0x3d, './file1'}}, {@lowerdir={'lowerdir', 0x3d, './bus'}}, {@workdir={'workdir', 0x3d, './file0'}}]})
r1 = openat$dir(0xffffffffffffff9c, &(0x7f0000000080)='./bus\x00', 0x0, 0x0)
dup3(r0, r1, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x0, 0x12, r0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = gettid()
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r2 = socket$packet(0x11, 0x2, 0x300)
fcntl$setownex(r2, 0xf, 0x0)
r3 = gettid()
tkill(r3, 0x18)
r4 = gettid()
tgkill(r1, r4, 0x24)
syz_emit_ethernet(0x6e, &(0x7f0000000100)={@multicast, @link_local, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "1db319", 0x38, 0x3a, 0x0, @private2, @mcast2, {[], @dest_unreach={0x1, 0x0, 0x0, 0x0, '\x00', {0x0, 0x6, "115f19", 0x0, 0x2c, 0x0, @local, @loopback, [@hopopts]}}}}}}}, 0x0)
mkdir(&(0x7f0000000040)='./file0\x00', 0x0)
mount(0x0, &(0x7f0000026ff8)='./file0\x00', &(0x7f000000c000)='ramfs\x00', 0x0, 0x0)
chdir(&(0x7f00000000c0)='./file0\x00')
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
creat(&(0x7f0000000380)='./bus\x00', 0x0)
r1 = open(&(0x7f00000003c0)='./bus\x00', 0x0, 0x0)
r2 = open(&(0x7f0000000080)='./bus\x00', 0x141042, 0x0)
ftruncate(r2, 0x200004)
mmap(&(0x7f0000001000/0x2000)=nil, 0x2000, 0x2, 0x12, r2, 0x0)
readv(r1, &(0x7f00000007c0)=[{&(0x7f0000002300)=""/4096, 0x1000}], 0x3b6)
clone(0xf38055be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff})
getsockopt$inet6_tcp_int(r0, 0x6, 0x2, 0x0, &(0x7f0000000000))
exit(0x0)
r0 = signalfd(0xffffffffffffffff, &(0x7f0000000000), 0x8)
flistxattr(r0, &(0x7f00000001c0)=""/163, 0xa3)
lsetxattr(&(0x7f0000000000)='./file0\x00', &(0x7f0000000400)=@random={'user.', '\xeb?\x89=:R\x15\x0e\xdb\xbd?g@?\xe3@\xa2\x98\x9b\xfc\xa4/eG\xb5\xba\v\x86\xea+\xc0r-\xd92\xdc\xe4\xb7\v*\xa2\x99Q\x10\xc2p\xc1\xb3\x10\x80)\xab\xd0\xfcil\x1e\xb0\x1e\xabjp\a!<\xaa\x85%\xaf~\xd4\v\x9aq\x8b\xe1@{-\x02Z\x03\xad<\xa4\xc4\x8f\x17\x0e\xd2\x92[g\xf4\xe8\x0e\x90\x84\x02W\x96\x80M\x1b\xde\x17S:C\xc0\x1a\xd2\x90\xcf\xb7W\xd5\xe0\xef\xaa@8EDf\xe3%\xc1\xa8\ay\xf1o\xcf\x8e\xc63\xad\x02\x8dR\x0f\x88:Eq\xc8o\x02b\x11i-\xed#\x05\x8d\xc8p\xbft\xcc\xc3y\xe1\xd2E\xfa\xca\a\xe0\x98z\xc6`\x13zZ\x1b\x87\x12`\xe8\xf1\xbe\xa0\x8d\x01:2\xec\x86\x9b\xc0\xc2\xa0\xdb\a\xad\xc2\x11\xe4\x89\xf0\xd2K\xd4*eu\xb2\xb7\xb7l\xd3\xb5\xc7\x1b\xfe\x9b.\xf1\xdb\x85\xe4\xbdP\xae\x86\xfb7\xb0\xd1M\xd2\xff\xb9/\xaes\xb0\x92T=\xd6o\xbbv\xd5TeX\v\xa4\xfd*\xd0\xe9\xadS\x95\xa7\x1e\xd2\xd2d\xcb=D\"\xc6:\xc8v\v~\xd2\x05%\xe8\xbeX/=#\xe7&Y\x14\xaa;\xbf\x9a\x03\xc5\xe8\xb2\x8c\xe5v!\xe0o[G\xc3\xbcA\xa1\xf2X\xba\xf8\xf6\xb0}c\xce\x81\x1aO9u\x19\x057OY\xcd\x1bd\x1a\x1et<\xa8X7\x1a\xc7[#i\x92\'f\xd0\xe3{\x15\x9c[\x1a\fv\xca]\x00\x00\xd5#1\f\x995\xc5\x8d\rd\x9c\xba\xdc\xa3\xcb\xe1\x0fJ\x17f;\x03}\x05\x7fQ\xac.\xf1kC\xa4\x1d\xa6\xf0\xa4\xcb7w7G\xf3\x9d\xd8\xae\x17\x05?^\xe5\xbaA\x8fXR\xaf\x9f\x95\xf0\xd13\x15\x81\xb1\xf5\x1e,\x8e\"v\xab\xde\x027\x03\x02\xff\xd0$\x83-\x1c\xe2\xdd\xdc\xcf\xf9?\xcf\xe2r\xc8\x82h\x94&\x86\x82\xaf\x1f\xc5\xfe!\xe7\xe8\xc6n\xc5\xd6\f-\xec\xd1b\xac7\xaf\x15\x90\xa2.\x14J\xac\xf5\xa9\xdc\x86\xe2\x9a\x02\xfb\x94\xa2=\x00wH\xe6Bs\xf9K\x89\xbe\x96\xc5\xf5\x14\x89\xc0\x9e\xa5C\xea'}, 0x0, 0x0, 0x0)
io_setup(0xbd0, &(0x7f0000000040)=<r0=>0x0)
io_getevents(r0, 0x2, 0x2, &(0x7f0000000400)=[{}, {}], 0x0)
r1 = memfd_create(&(0x7f0000000340)='-\x04\x00\x00\x00&\x01\x01roxn\x00', 0x0)
io_submit(r0, 0x1, &(0x7f0000000140)=[&(0x7f0000000080)={0x0, 0x0, 0x0, 0x0, 0x0, r1, 0x0}])
mmap(&(0x7f0000000000/0x1000)=nil, 0x1000, 0x0, 0x812, r1, 0x0)
io_submit(r0, 0x1, &(0x7f0000001280)=[&(0x7f0000001100)={0x0, 0x0, 0x0, 0x0, 0x0, r1, 0x0}])
r0 = fork()
setpgid(r0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setpgid(0x0, r0)
getpriority(0x1, r0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1000002, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$inet6_tcp_int(r1, 0x6, 0xb, 0x0, &(0x7f0000000180))
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000300)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
setsockopt$sock_timeval(r0, 0x1, 0x14, &(0x7f0000000480)={0x0, 0xea60}, 0x10)
r1 = dup(r0)
sendmsg$unix(r1, &(0x7f00000009c0)={0x0, 0x0, &(0x7f00000000c0)=[{&(0x7f0000000540)="38f53f4f89777bff1f1cd395d0fbf0a245032de5453ab4b72e67eaf50adff31eac37a3d9e9b7f2c1df5adea87039ac90d57e71f9978969e0ab40f1a7b5c308aa00e93413e712564ed3f5348a26d5f45fca5ddd2c2366f7895006454c50123ffb8a2f33a35b5d5fec15ebd84273cf0a9c73d0c79f8dba2ae6167ca5be2ee0ef078da95b57a0f2f24dffb2e2a815f7b42830282d82", 0x94}, {&(0x7f0000000600)="d2170a82a3d6bbfe9047c3cb78a4ea33dd200bb79df2e7a9bc910e48d747fc89", 0x20}, {&(0x7f0000002000)="4f0a3be0b12c4753c5baa729e0612a51417968aefbde50f4716b54283c00238a76461029d8445e76ef2c474c48ddf8cf838d54c4cd189e2b0a4a4f4dbde5de824da5b49a8141de66271d4f3b375087a8065f2f7de72527359b946292614bed668bc3d357609b36fdd5d1b5270b7a7f1eab59dec1b57da954c5ad9609da90ff0755ad8363b03c1c59dbc7904c86f20d9f7df61d2dec4f9a0db33c9766604fadec14891f7f602088dda653c44c5fdbd3e1eafa8eeef60be156d5233c131508b3b7bc958d049b33bf21435e711acff88a1c7a8c00f201d02c9f969a102bd2792e3b0c5f289bfff2461f5c826db8a3846a7262fc5b3b70452507a6ff607ad441f97315a25a0a531cb9e420b1d723c1ff797ef3c5f500d5732086cde076646133afa27a4bf679cc2a71062aa068987920053132cd61812823e33bdc6c5148a0bb29b97412db598277e263558accf9991a94936bb9cf3e085e4e8f1fd272951e795002c3cd2f3e84d27bcaaf48ed38ea7017f68fee4f5684ec0d82aa7f0a71458742f1eff6243e6d690c0af83b2bd654850c6ead1e9aeccd2bd45556518f25694dd1c161b59d43c22544da5d3db312667f6b8182e4850f22a523b5c3eb8d78038d43bdd7f4da810ca3428430beda2fd4f56391602dcf3a9470b2301adae8dbfb7b397cc3304f099c1db961b27dac6679956f62e533351dfbca50234b7fe86542f367b0bfb30e9a36fb7459ab025dc066e92dd1b6897164a7c7ec8084ad6b0209d77b1f3d22a5e408621d7cdd6ed329b161e6924b9043aaa2fd529ec96c29896b902a87d50988601170e6aeb480568526c1c6e05767fa851514852b3a9c8656c8682ee39da699806a4fb11349c7fc53f749ca917b2b7c2a1dd5c7ed3504702a380efb8e3d9f17279ae262fdaff1cb534e3d56c57e43bbf1560c9516544889e8bb32085a1362b81c2eac91fc72a58cc676f1f6d72190338568e95be06fbfd0145e54e6f645553ce5ab59a3f887781298f282abee16b5b68fffb6f70832c6c7b5bfd4fcb83ee4918077b1ca8df8ab266ece081ab672e05d3c58372a6d9d49e40db5278283ec04b3953d84a7f6760fcb474015d451aa4da105a61852497ce71572c819a1e967438e0c9866e830def5c83bfb94f3f5ac3e14a7b53402b416ee9aa6a2e09cf2f2732c429de81caf213624cd0dd5038d843044f9a537788cae14fa2ac62035dc82b2714ea1cff8a6dc12bf485542e721a992a76ff48bd67ff3c02b5e86a7ec45dd71a549558edcf18d9d9cea4a3e3f2e5b9f21404a7e1e3f69013a6c0e78f7cbd8599d3e69396feaa0214122034fc67314e3a0c07a7c3bb1cfe53bab3fc6f3baa551cfb757e3b40a295cf8ba85754a75e594abc0d1b8dc4d7892ea581f34749be53af324f90bc0ef445d9d1c3acc42a9d6d9d05eab4c0621df87d190bc9fca7b6b10241f619d1188aa515b4b8c21a804687114d53a60396ae20487cfd484d3ab4684ceaea8e99d82075716ea682bab0116377e237d4a97030ac7d0c3b73410b726fb47b1c46e96b431c354133c114282262790270ced0c682e099b66c07485c34192ca7bf41a91238f0bd46114f4f1eb66f41718168d4f7c534814c91fd06124444e77afd21207b3eec7e9ddbf6d3937d5201c4746a82da55eb726077a75c51437e3ff33b0302dd2bfbf87612dd5584f32d3deb352ee64a9862777098b3133631aae630b102c908578cc3833f509586ee6217cec8c1fd1ea0e9a8630c880038714fdbb46a334b58b3f44e5399676faf2dfa02c4a308e211537cd46594f4227560c8ff5ff0b3d4879c66b4d4781b277ec0c9f958d09bc7e81487a66ba9f7318ed5aa457bf2fde59c1e3f59c7a518072e961fd14cd64a0f2cf49cf0cd6be64e4e9e5a7b7f5d448d88fae31359eccd1ea83988338a0e8efdb34fd16d00a3bbe952f87d9bccdabb4e48278fa51dc7c95c63010e298719446630d0ba57e9c7ab6dd582ff401af7ffebd2e711b840f5d13c60d6258f52ce4d4c61ea1886d8cbd5c602baa0162ba3062574edb828434dd7e024a447bdca56a29bddbe7baea4e8162b2db01a42280824a9441bc466fcc4c7253dcf6c11d5896e6ac8345a42abfdae14f35e08b6ad17b68115e87ab84d16e08ae5f26363d06c5c2731e8d7ab1f4b18d42d34c7cde95839f78c13679f049a915b63c39ad4f707b47ee185f02178d0e756d1c3ddc56ec71bdc7a7dd4035eb2fe9e8fd81056529780c2bd253cd4dbcaec51b572455210dd8258f362de6eb47a3010ebbc2b6066341db69b7167fc1f13e964d5693bca39a3dfa77868559615d2be3c3fda09caff72ee51fc78700c926604f410cb5df91405a31c86b64218fe6c826b7f48fad177ff91602c8f542df3b436dcaec09a909ba9b7a88f8e2f69ca99d00de86523b286fa73ccb855877e945a8634143dd0ff7d1bffb4d4de6bb1bff42734e18cf18bc380ed2c7e88ad10f8e9dad96972b6d71f06a9e9a9a4b2f81e92d13192456fdd3b179a80e6f16aeb2ff46341de5432669ee37f29533de14dcdec909dc74a62b2f3084e849ffe8ae837b7adc79d0ea9db85f9c0bcc67e497350c9766e3533dbac8e30c42a7d5416471fe6b3371fdfa3d4b095d2d986a173906e433a9b8fbf8519f519cfa3ac704733f3b8aca571b4ab8b511a08a55302364c743a4b58359f52849ac57bc7bc000eb1300d044d01da1f85bded587a602bc247a1ba7d339a30ac9adf5e04d01113b71f84e4bad0f9d9b47a2ecc7d2a5b0fa2be1ab0d2ef06270424af2a976ae860eb300d768322f66600f08d3238cbf437fa95365db6e93522675818b17220958f2655896b520cb7d0dd0c73b37cc0139040fe06ca264bbf9f0219955c431bd3c23b8bb3e468d3fc426b8b70313d1f6e9f19ae2907e497e626120cbd317f4ce7cedc9625d417ce5de18c9abb27b192a0545587acfeecdb86f9208f54f45498f120b0f5710eee69cc4dfa88b444c1da02f80c636e0061a929833384310cbd14c25e5c82db771757c0477d7e623f115bb7141f5bce850400790c8992831588ea2b4333bf2d5b811d1c1364c15bb6a1e63580035d743eab94152230721d3ea015afad00ae129e0c32b5df82bd7851b70832ef826b89b4f5a68d253a2aae4c83189bc30bc1d1f425a84841f13dc88ec21fd34df6f102cbb2b30d0020f0737b94253464348d0b4b51d7b4831b7b5dcbde08e6e7a824b21bc6d145b2b254fc1523994f77be34a24da6d50c5a3f9db34d04d299f32bf61593f251492ff4e275e3fdf564e60dbe08bf3bf8c5b7158f18acf6442b53808c8a5d19094b4f6d501b9844201b4d654e6187d8565e4dc03f860ede0e3da428df98424070bf2c1b25905ec5a8cb6de33c57c1423ffd5be9d267a0361042a3ac1a7f7c14eea2330a6086b0b3b14a41fc47137c5c910f12d80f9c53d25f5874c6947893c9c650e7bfe3021fd11f4a5bd713442765cb73c8f6f107f01826c7133d7ee14d299be23ae977d737f553888a137614266a775b28a09b44a6e1d62ad21f7ffc2d4c5b479ac69c0334467e1fd20f270e1708c49d2c85eeedcbf5f440f6abe637787eec08612dead9529bef2c9df72d097cecb0ab0a2ce111793f7e98e225c8e5853cacfc107edeb84ab7fd338523397c98c6e8b74a108224bebff22d6f8b441bcfa8712b3d1f3e408add9bd341a909e7ec59d0da285be113885867b05c3b7c0897c387e55057658bd92aa1c0793fdcb3f1263cb31ab78b56dd6e2447e51c1c443c5938560f589713e6fd1ddf1f49d1fe873a62c02ecdc598384364059fd9a935f057d38a2fa23cf3f0ecd380196a3793570d96044b707427c3f9e40756bf3513a5d09f4eb75d280dc514da85139608e608a4e9e35fa95ba6e893bc6797d5e20b2836e373d4cb33a0cd911d296e813ccb8d4a586eac5c2242a52a7c0e133f7365b9bb2de6cd9ea32d9de1959151a4be7443ab24cb8b1e964c805adeceaf0980e20f1379e5e524db2d9491db41a4a407bd0b2d74fefd5e375d8a6a8b3b2a8636052f5bc3e8d83a83fdb8d3e45661fc968f6d976d0446b86cfa1e93b4737b665b25aebd9f839c7d58063721a7df7e80edc9bf6cdfe214fe0aa290abb4a4fc7f4560b6426f5cb80d3d7f54c2d31a559324cb80352eff7b9e71341a3b18615fabd42f078085b23d8ed95ddc685ea17aee4b210e46a6215a6f384bd1ff2a77a8e8f289d3d7bcb21f6110c2fd7cb551fa83170665e5cdfb2889b4c833d5578e212e05750ae7472cedc0200c6a99de0e53eee96e57b945f65bd45787832cf981eff309d085439b401e9f5e4213b464d7d53673aadb752ef6ddbe969335c4a8a8dec0860badef8d6fa502e5de41e3e368e2252db75c766ea2867069720f7accbd5114a024f956650fb599517b93f87a6d3a51845d0de3e8e07ec7185fe3bd24eaf248240f0e59afce223181bf960db0427a987fc8126f6899b07d9ec145b5ca9a05377662f66c68f050f3b6556a8def39d825ebee379ef9a97d3a21ec2f46e032a599bee473f1605c0ea9ef0c01a61e6a0a6983b078d86aae0e1e9261b64428baecf6f6d2f4963c0678bfeb116683bf93a775acf3a0109ac7799ac7ee3121b4aa6e06d7535fae4bdd53bf9dabc551a476c7f31c9cbf7db775eaa358685fceaf7dd7fd14e9b63b4375a898d8586456ad02fc179660de2b47e6b092c7223c07cbbb971a4a297b6ab3e64405cc5bb81049bc4ed7b69ea097c2ed0900b9de620e4b7b3ec42045fe3ffa337dba2137031c942b4e9de02d6f51c5116be7dbf6961ff68d9cbdc74e083bdab68f11e4a631795e768fb0cc77aa33153aecc87e121a64cc6f4c0b8a7eeb8c455dce8b9b235320f76f4e0cf471907b5b564e984130bff5b909b00e2b1b39fb87d1e3f137eab0039d522f3c291c8bd105a7147ba40d9fb62989d0658f327e186bf72b5624d9936e13974bcac0a63fd478410483627bbd1b28bc02fce653360f6d42c4a70a1cb1144c0a3d92d257988eadc77bc4f3a7e6ddc30412ac22142d352ff4f4c40134c0be702a5c132e23206c5c9dd4e0d4ba478ece45fbfe917a388add6747c165b49ef159b64eae20b00f3965dc7008ddf0311fae7aea545cb175e7b4b0ad391d2478301caf5a2832780f34364617d541b453fcae742e487e75710a91e3a0ca29ffbafae7dbb4105892b33722b87515b52a6aea222d4715bc6a573792ca4dd296cff60dcdbe8469c2d6e54a2edbde280daa9e1fde5873ee31fdd8058b00ff9003d5293a52f36b445b390f22ebf6a8fc511ba7e4ee9f9a534b27763baf27d8a63b3a0b5008a422403d9191e6b21c64ba276a8f5df828deb23dc70298c4d934bae8cd099c5500d1fedb1df23fb8ea8cf6933963a156e1a3fc87465908eabc615ac1055df547cd7da65caafc87e33e139735c778e9695d2142f612c95c72c54dcff23b3e072275cba8e2418ac35257021fbf96d6539bc7cd1601706e4b16b99c1db49351d586767e06e10c2f3d6d9f53f5916cf327bde4ac7f01e8b352a9dedef02da60b010faa9bb4a94c90c8e1527667fc55f786b67ef60e39c40d771d66be31716d4ab4a670f4fda46cf21a149bcaf7a0f8ce41790c82a4afb116ef8d9c960f6f10b7224b1cf6641ba14f9d16e6f8cb96a9f52fb7ffa122826374d9d96b088da29294f65b7e5056dae56a8b668be056b845aed7bf44afacbf192de73470a80fdd8a8b86bd3635d323ce325dff036185fba8c2b2a6151f0d0a15197a119cebb4d1533d00f3862dcd0913275eafff2", 0xffffff9a}, {&(0x7f0000003000)="0cb778f1832fcd8dac3588f0e943860c9259c80a527e1e925e731ad7758b04bbf5666b33572b1fc2840f816170e1c3c60bb126dd95a9f7daa0f0c02828e9fb59b7114e9a912902b0e415f326263653a622b63520a07da9232550fe08ebfb3aa6534d956fba93d2ee1d77e203923dc2c9b47414cf52d16ce390d63438d6f9c31a27ce92f80c551bf2a44b6cab1c4e81c2938daf0d324b52cbee54d259561d7b3b30d65a6696564d98cabe170c7b112cb5194dcfb7c0ff9b5459a97825e80cdb81d7cc713a2841cc1222cfd55a45e8716be537c8633ae6283a4635cc302a8b5aa7baa59df312467c10bc3bd6eeea3756b14d325b611596a460937f560ba893cd089382ba08dd968f47f4ec6768895535f63b0c41ba158709e7ab17574f3b8bb92e0171db56f6755b8ebcebc1ae222317d455bab6795244dcffe57f9d2931d91459002020401c5d1e5c6a2e9b873dd5f951390b8e2575f6f4b1796248d45f9992ca35b84c0735cfded6b8ec7d53e4eb1eee87c674f12948b7ea9b529c52248e37302eee3190a1b3e972c84242cef9b242f8388035f9d97b66dde67b2562eb766551f40f6cf9d42269de0019f2c04959f8dbec41d10322e1eb16a458d43f73fbd78c2d85b5be18a39cdc2bcb80e408ff22af2d2233818902879c34c319920ac0f8fe4ffcc7908e85e239a83e0f9590207c661ade576d59540091098daa457ba4a2f50449b155733ebddbae3115079383bcf757f9399f0e4b45e5f0ab55266ef4da8fe3b7008cb4e1789bc51f5e62cf396361e76d453fb329b23eb49bdbf8c681dda4007f1b076a22e62d9686986be57fef48290f58831dedee50d1552d8d09935b0508b1fc11554284a0e601533118e6a52524fdc39e8653623d49d3c3f2dee8a61454f3f9cc1d490b3bf91cb40c34e75442e254cbc639988d8963896a6ab7f50bf909871bc623db2b4bf3d9a14b993e1c8707292f0d767e458e2eff21debb83004c2881b08b96f4cf088d875c667d35c48c014154088f84872db2600357ddcb1a4a7dbf1b40d7a4f42ce7a16f36e08c828c8cff0af9dbe150d18e7b9866b30b5cad3d7445eec4a7edcb50219a2657a859bc43d30071550f9216852656fd136f0594c4c83361e0da5d67b5c5059b0ee05fbfc6ab1a3b73dd1cbed1914f89ed2ae11e07856158852f396e7a8a8ac62f1c332ad65be1319e0f8379aded9141ffb38782cae3183619ac35e33d39bdb7e33e8996114a6c11985e2930b73d518ebdd115a796ff53ba14e1e21e290a82e615f6bcfbce0b6d4b10f948f0bb9746e5e6484055630295e82457f4f8b7fe11a840b146450597ac90bd188ca5ad3bae00e42c09514054316454fe7bac5206c9da7187129d73dad44edf6a5b2ada80f5b465f27a32d2f5bc79f7506ff52efd0920c353cab3b9f4bb3d82daa70c045bac45dd779bf9f9de1c45bf51eb813ce1bface748ee0e7867cfafbd922424610d212e358641f9921815f2ebbde1050a5ffd361e32b6f84abd636776922bce57105b07deafbc8d3ca9c05517c4fde027e0a96a90fbfc5ff2bbac86c142fc25d0066ad212377387b0556f5bea31abc2f425c5bcabde7c038326815f92258e93602babbdce6d4d12d621f311faef4f169f21c82133fc14dc3e06e3c62097ba04024868f39abc7bea13d76ef627df7754022c64d5906a64bc86b2d513a8c64de58c1b3b9bb64ff701a801d341b873a7016dc6e789512d4f04f3b7b83f939ed23e9e31ee70eec67b0c699024df8ad98710370e6749f7484746f90b8e4a025ad9da60e6f3151acddae57addcfe78875b348579d99c8cce5a1ca06705d62d4ee500d60f864bff80fc99f9a0f5c9fb84b19808bde42e5ba36c89502229d9d209e1de55de82a18809cf2d238216158935d339b05389cde936baec687188b1127a6409546daec0517ce70414db66299521264964b27aec38e550b74f906640272705802c00894ca506fa7b0656401dfc748eb942e6f0b0b7d9418684d4afe0fd0cb809f9f8d4ba93f3b817634c4361c359b2144cb32659c1e71175c1bb1e7f270d4cc089502dfeb585b4e7df39f5f0af6e5000a9c6f59c532c89a63134160fc7270b0f88f2a98fc30e82ca2be07989b5dbca95424b428ab6bbdd1e8577a8a6cd04570cdf51d56a89e5b4adc2e1d638342ba58eb82a0973a5a1f64877637f55b8d36b161483bcaa21a0d39d22fe3c5b2614f9aa31329d2d6470bac3d7a6e5d3b0a1375f3008092a8d835699a8bbe9b94ed8f944ce36608ee6b4a50acc6f45fe4323b6a38f6b17e79552e64a7a5a45d4b6312277c760587d34d06dce9a91d66202b7636c5a36d55e34acf4f9d8d0ff850bf54766361bd3eb3fd0768f16dc9cc7772f31743a48976eabb392255120e9f15d4cdaca87f49f7dca53c9b7431e04bf85f31c0f14b350578d392f8c8b18b0dd1241c97c02e5bbe8bcad55fd2a374fff23084786036482d273280db718407525becc1865019584cccf845c46ba9a1ceb93217c1ac9d8da7c268e6a7e5c0c671b14b479973f95af3e7130317d08f04300b498f9fe506be3c4e8a52200d884979adce34371e282fc51bcf69112c8ae8225951338e39d5029ac19dc21816cd9e4ee7149223ac0168ca8af0207c62a61a089deec76af432d77b22bb4630ca8995dbe859e683c2d6907a4b5ef7f96b1703073239b31f8774dbc480e217a676f805bd1e6b44f6c2a3e653101a136551bf2aac18efdd3ad34b8bd5be50efd9a012cf2160d5a68e1c7b416709566affaa6d9d20c6bb1462f2836c4e0faa167ee92a94799d0ac4c80c18f9f893cddf06f2ce168d80427c1e57c8c3250efb7185c436a454c63a1c18a6cf4254aabc4be929bb831623276d6026de65028b077abbf830e0be0354545c8cdf0329c6348043c397a7b8e286174f2f75d9464c4362800a70ee0e27f14a993c5560836e1624b5344fe946eb1e7f2dc749c02cf603fc0ccf3423847f15fa88b3eefdad057e6eaa6f90580a1e2d855f19ca581169e53bfdaa800a9d4a5847b2ec95c911d64120cda6027ff1681e0a48ccfda032945898ca284eddb56882d45732b147b20c8b8f5520e55540c831e138f743368f2c4683aef10f5abf99a883291d39547a4108a09ca09471b3549b40de85cdf9babd37ba12a44fcbe883d9e391d9f70a3f9b1587ff270791b4a51341896c7a6edd43631c7c9301c7664dd4b8df7f15077723166362f28d021cc7ba515fbd30462c0fe3dfa0fb375b99ebf0d315e06fd541a7c8a3eeedd97bd91d351446a7c1e47bfc582c49f44fa479dcfe1ed87e8a33651021a234247f70e03fab6aaad0848288b5eef2963a3ba24532ada5d04620968c243bc868b57002dcb797d7a6254096c016ec26546a36010a5665c6f95974be40256dfc0c8caeb116e7d9f82db9aad1179b29622e56f07b2dc09d562970fcc3bc02fe69c454117137069c100e2c344b084fcea3a522be333d555a558d3a600ec0804dc2e6540b17f8ae6112785bf4b5763687396a253670590c3a1e02b415484419b430262aa756fd2237f86b06b5bc8a22b3bcfef72cc4af1fea0a042d52cd7ff856c86e3cbeaee0562a0e8b2b0beac5c713ef0475c1ae63f26d496ddbfe984f837ae3890e99572254ff53c5984cbee4ecbd468c19815319e4e2fd215fa1ba7748dc5e1f180adc25bb591c76a141b0426081a7666ac9d4371b5bc4117a12d82d4501d81b51bd52c56c9f0ecbaeac2221949ff7c098962f05300bf4638628695cf4962feaeb12efe3916eb5f56f33ba0b8387ee077d7bbdd807317334821a2e3f2ec0de977d3f36674b40ca1e733028b9bffe0a38f87795f14d0124c3fc2ae25187be7913367bc9526adbb
a6f58937277a6bb4a6c665fa96251f613ff0d25068cb85754e06ca527d9781783ea005548dd9db17f5975075dbe7ff78f2a1c4d67c916b108dce34d6d8ce00022024864e2fac57b04ba7d967e4889d9542cc57f862cce68af9099e77403fe324bce3bd5b955873e8c88d8d982dbfb38912dbda38c5afa73bc6e93aa35f82d256c699bce7ffdda6558f221df7129f973399c8ba8b9d358c3e499f36de59bf16dba8241c03fe81124a2d352ac563db53fb5ef3b84edc9ea378de49914704c3b44529c256aa366b421e02439908bcce2af463480a298ca2d25bb1286ed5cfc8472dc43b805a2e42e205619d31f846815f011113e09bd4df58867c0a9fa534c8824a84a0492eaf93dad12e44a38851fb731d413e317d397e4a3c8d71b1775903c4923cdaade3ca1b51b85aad0e8d2e876cf41f3e6c3114adb0676d36ae63e645a9005bc5f6856b797ba572b9b178bc3e6368e5d6356256247876259c6958f2da99c52af207bf6bcb307d3d5c60940813e1c3400e3831ad30ab864b62f2ed7a412f3a397e27a2dae0db772b49b585955e2382a99bc684d7754a3d395b1cfbfc75408ff65a7766e80e648c75a5ba2d0d8083adb9581d910c262f4aeb06caaceab3d09054e3cf969da8731c4a765ec2e542e3865b13ddfe2ec421340f82e15b54625fe26e87530e5f84cd48c26dc5a257757498357569033e7e5063ba8d625e4f10823bcbfbd612b51eced480057d80084577d882e726deedf906d97fdf6391b168e6d2743469d22571ed2aa4652d8f07dc2b07da6e7d81e6a6294b202ea06501d2941eeb4240e7dd048e63e6e4bffd78c657f5401e642b9dd7511ce8ea66469f59ea0a5e7f0d5cdcb1e91f4c8b86ff8e023bc9b5fd52607ae1307e7407aeee3b999c10734552b61b7848911d9be8ded579ba11ec0e54a4344601693560108e43a56053b72fe9e6212f3585e512c2f1748a1a834c4e42da23ce895d86141cfa6cf0a5408166b3a2f9700ee7c878aef86c9e3ce36c2f1c853644e449f57519be55b7a9aa2017fbcddbd529b0562d7a03faeba5f9ab1912f033ebc316d8094c2f616e511cbc4189578440c87da8e552d629a30bd12f744ab2b15897ad1e214c7253bc9aa19ad9816d4a2007091d18d8105fec5d4a1a7e6df684f7ddae53034afd574ef9ff435ac8abc4c8bb9ae85226e6f650d3a522cbde15c73", 0xe1b}, {&(0x7f0000000040)="343b3d3354e4894abd72efd30cac8457173482b9337502969fa8aad5613091615f370762dac5016be166e0e8ffae7bec0da4a026e49ae0a8518d6f20bcb79da1208fbcef432c494aacb57b4a4c219053b1e857100cb48b", 0x57}], 0x5}, 0x0)
write$P9_RFLUSH(r1, &(0x7f0000000f00)={0x7}, 0x7)
poll(0x0, 0x0, 0x7fff)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f00000027c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
munmap(&(0x7f0000570000/0x2000)=nil, 0x2000)
timer_settime(0x0, 0x0, &(0x7f0000000200)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x16}, &(0x7f00000003c0)=<r2=>0x0)
shmctl$IPC_RMID(0x0, 0x0)
timer_settime(r2, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x3938700}}, 0x0)
rt_sigprocmask(0x0, &(0x7f0000000100)={[0xfffffffffffe]}, 0x0, 0x8)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1000006, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = gettid()
rt_sigqueueinfo(r1, 0xa, &(0x7f0000000040))
r2 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000480), 0x201, 0x0)
writev(r2, &(0x7f0000000400)=[{&(0x7f0000000580)="4db51671657d1365e36710c22dcc3df65167a10de7", 0x15}], 0x1)
r3 = syz_open_pts(r2, 0x0)
r4 = dup(r2)
ioctl$TCSETS(r4, 0x5402, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x809f, 0x0, "6c1442038a844fce02da987bcd4537a0e7a344"})
readv(r3, &(0x7f00000001c0)=[{&(0x7f0000000100)=""/55, 0x37}], 0x1)
ppoll(0x0, 0x0, 0x0, &(0x7f0000000540), 0x8)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000000)={0x1, &(0x7f0000000180)=[{0x6, 0x0, 0x0, 0x7ffffffb}]})
sched_setscheduler(0xffffffffffffffff, 0x0, 0x0)
r0 = memfd_create(&(0x7f0000004fc0), 0x0)
r1 = getpid()
fcntl$setownex(r0, 0xf, &(0x7f0000000000)={0x0, r1})
fcntl$getownex(r0, 0x10, &(0x7f0000002280))
semtimedop(0x0, 0x0, 0x68, &(0x7f0000000080)={0x77359400})
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000200)={0x2, &(0x7f0000000040)=[{0x34, 0x0, 0x0, 0xfffffffc}, {0x6, 0x0, 0x0, 0x7fff7ffe}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mlock(&(0x7f0000ffa000/0x3000)=nil, 0x3000)
munlock(&(0x7f0000ffb000/0x3000)=nil, 0x3000)
mincore(&(0x7f0000ffa000/0x3000)=nil, 0x3000, &(0x7f00000000c0)=""/153)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$full(0xffffffffffffff9c, &(0x7f0000000540), 0x20d002, 0x0)
fremovexattr(r0, &(0x7f0000000040)=@known='trusted.overlay.opaque\x00')
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x4e100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = signalfd(0xffffffffffffffff, &(0x7f00000000c0), 0x8)
mkdirat$cgroup(r0, &(0x7f0000000000)='syz0\x00', 0x1ff)
rt_sigreturn()
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
listen(r0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
getrusage(0x0, &(0x7f0000000080))
r1 = openat(0xffffffffffffff9c, &(0x7f0000000180)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r1, &(0x7f0000d06ff8)='./file0\x00')
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r2 = getpid()
r3 = inotify_init1(0x0)
inotify_add_watch(r3, &(0x7f0000000040)='./file0\x00', 0x8100041a)
rt_sigqueueinfo(r2, 0x39, &(0x7f0000000000))
r4 = gettid()
rt_sigqueueinfo(r4, 0x2b, &(0x7f0000000100))
mremap(&(0x7f0000ffb000/0x1000)=nil, 0x1000, 0xffffe000, 0x3, &(0x7f0000ffc000/0x2000)=nil)
io_setup(0x2, &(0x7f0000000300)=<r0=>0x0)
r1 = creat(&(0x7f0000000000)='./file0\x00', 0x0)
io_submit(r0, 0x2, &(0x7f00000011c0)=[&(0x7f0000000080)={0x0, 0x0, 0x0, 0x1, 0x0, r1, 0x0}, &(0x7f0000001200)={0x0, 0x0, 0x0, 0x0, 0x0, r1, 0x0}])
r0 = openat(0xffffffffffffffff, &(0x7f00000002c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = gettid()
r2 = gettid()
pipe2(&(0x7f00000003c0)={<r3=>0xffffffffffffffff}, 0x0)
r4 = eventfd(0x0)
prlimit64(0x0, 0x7, &(0x7f0000000100), 0x0)
dup3(r3, r4, 0x0)
tgkill(r1, r2, 0x24)
clone(0xc810f100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rename(&(0x7f00000001c0)='./file1\x00', &(0x7f00000002c0)='.\x00')
r0 = gettid()
tkill(r0, 0x9)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000040)={0x1, &(0x7f0000000140)=[{0x6, 0x0, 0x0, 0x7fffff7a}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000040)={0x1, &(0x7f0000000140)=[{0x6}]})
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f0000000300)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
lgetxattr(&(0x7f0000000340)='./file0/file0\x00', &(0x7f0000000380)=@known='user.incfs.id\x00', 0x0, 0x0)
exit(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r0, 0x0)
semtimedop(0x0, &(0x7f0000001340)=[{}], 0x1, &(0x7f0000001380))
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000100)={<r0=>0xffffffffffffffff})
pipe(&(0x7f00000001c0)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
splice(r0, 0x0, r1, 0x0, 0x8ec3, 0x0)
write$binfmt_elf64(r1, 0x0, 0x643)
timer_create(0x0, &(0x7f0000000500)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
timer_settime(0x0, 0x0, &(0x7f000006b000)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x14}, &(0x7f0000000040)=<r3=>0x0)
fcntl$setpipe(r1, 0x407, 0x0)
timer_settime(r3, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
rt_sigprocmask(0x0, &(0x7f0000000100)={[0xfffffffffffe]}, 0x0, 0x8)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = creat(&(0x7f0000002680)='./file0\x00', 0x0)
write$binfmt_elf64(r0, 0x0, 0xfffffffffffffe12)
r1 = gettid()
rt_sigqueueinfo(r1, 0xa, &(0x7f0000000040))
ppoll(0x0, 0x0, 0x0, &(0x7f0000000340), 0x8)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
pipe(0x0)
epoll_wait(0xffffffffffffffff, &(0x7f0000000240)=[{}], 0x1, 0x0)
rt_sigreturn()
mkdir(&(0x7f0000000040)='./file0\x00', 0x0)
r0 = open(&(0x7f0000000380)='./file0\x00', 0x0, 0x0)
lsetxattr$trusted_overlay_nlink(&(0x7f0000004200)='./file0\x00', &(0x7f0000004240), 0x0, 0x0, 0x0)
fgetxattr(r0, &(0x7f0000004500)=@known='trusted.overlay.nlink\x00', 0x0, 0x0)
r0 = socket$packet(0x11, 0x3, 0x300)
setsockopt(r0, 0x0, 0x1, &(0x7f0000000000)="81", 0x1)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
munmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000)
mremap(&(0x7f0000ffd000/0x1000)=nil, 0x1000, 0x1000, 0x0, &(0x7f0000ffc000/0x1000)=nil)
rt_sigreturn()
rt_sigreturn()
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
accept4$unix(0xffffffffffffffff, 0x0, 0x0, 0x400)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000008740)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
getsockopt(r0, 0x6, 0x4, 0x0, &(0x7f0000000100)=0x1e)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet(0x2, 0x2, 0x1)
bind$inet(r0, &(0x7f0000000180)={0x2, 0x0, @private=0xa010101}, 0x10)
rt_sigreturn()
seccomp$SECCOMP_SET_MODE_FILTER(0x1, 0x0, &(0x7f0000000040)={0x1, &(0x7f0000000000)=[{0x6, 0x0, 0x0, 0xfffffffc}]})
r0 = openat(0xffffffffffffff9c, &(0x7f00000000c0)='/proc/self/exe\x00', 0x0, 0x0)
syncfs(r0)
semctl$SEM_STAT(0x0, 0x0, 0x12, 0x0)
semctl$IPC_INFO(0x0, 0x0, 0x3, &(0x7f0000000380)=""/130)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x210000000013, &(0x7f00000000c0)=0x100000001, 0x4)
bind$inet(r0, &(0x7f0000000080)={0x2, 0x4e21, @local}, 0x10)
setsockopt$inet_tcp_TCP_REPAIR_QUEUE(r0, 0x6, 0x14, &(0x7f0000000140)=0x2, 0x4)
connect$inet(r0, &(0x7f0000000180)={0x2, 0x4e21, @local}, 0x10)
sendto$inet(r0, &(0x7f0000001e00)='r', 0x1, 0x0, 0x0, 0x0)
setsockopt$inet_tcp_TCP_REPAIR(r0, 0x6, 0x13, &(0x7f0000000200), 0x88)
sendto$inet(r0, &(0x7f0000000240)="d9", 0x1, 0x40001, 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
sendto$inet(r0, &(0x7f00000004c0)='4', 0x1, 0x0, 0x0, 0x0)
close(r0)
poll(0x0, 0x0, 0x7fff)
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f00000027c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
timer_settime(0x0, 0x0, &(0x7f0000000200)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f00000001c0)={0x0, 0x16, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000100)=<r2=>0x0)
eventfd2(0x0, 0x401)
timer_settime(r2, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
mbind(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0x8001, 0x0, 0x0, 0x0)
r0 = signalfd4(0xffffffffffffffff, &(0x7f0000000b80), 0x8, 0x0)
r1 = fcntl$dupfd(r0, 0x0, r0)
faccessat(r1, &(0x7f0000000000)='./file0\x00', 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0xa810a00, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
r1 = socket$inet_udp(0x2, 0x2, 0x0)
getsockopt$inet_mreq(r1, 0x0, 0x20, &(0x7f0000000040)={@local, @initdev}, &(0x7f00000001c0)=0x8)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
exit_group(0x0)
clone(0x5100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
listen(r0, 0x0)
connect$inet6(r0, &(0x7f0000000040)={0xa, 0x0, 0x0, @local}, 0x1c)
rt_sigreturn()
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = syz_open_procfs(0x0, &(0x7f0000000080)='fd\x00')
fchdir(r1)
clone(0xd00c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000000c0)={0xffffffffffffffff, <r2=>0xffffffffffffffff})
bind$unix(r2, &(0x7f0000000240)=@file={0x1, './file0\x00'}, 0x6e)
rt_sigreturn()
exit(0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigreturn()
syz_emit_ethernet(0x3f, &(0x7f0000000140)={@local, @dev, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, 'U((', 0x9, 0x2c, 0x0, @remote, @local, {[@routing={0x0, 0x0, 0x0, 0x7}], "e1"}}}}}, 0x0)
r0 = socket$nl_route(0x10, 0x3, 0x0)
write$P9_RREADLINK(r0, &(0x7f0000000440)=ANY=[@ANYBLOB="1600000017"], 0x16)
recvmsg(r0, &(0x7f0000000400)={0x0, 0x0, 0x0}, 0x2020)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
ioctl$TIOCNOTTY(r0, 0x5422)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
exit(0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
setsockopt$inet6_tcp_buf(r0, 0x6, 0xc, &(0x7f0000000000)="708d7594", 0x4)
r0 = gettid()
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000029000)={<r1=>0xffffffffffffffff, <r2=>0xffffffffffffffff})
ioctl$int_in(r1, 0x5452, &(0x7f0000b28000)=0x3e)
fcntl$setsig(r1, 0xa, 0x12)
r3 = signalfd4(0xffffffffffffffff, &(0x7f0000006000)={[0xfffffffffffffffc]}, 0x8, 0x0)
r4 = epoll_create1(0x0)
epoll_ctl$EPOLL_CTL_ADD(r4, 0x1, r3, &(0x7f00000000c0)={0x20000005})
epoll_wait(r4, &(0x7f0000000040)=[{}], 0x1, 0x1000)
poll(&(0x7f0000000000)=[{r2}], 0x1, 0xfffffffffffffff8)
r5 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r5, 0x0)
preadv(r5, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
dup2(r1, r2)
fcntl$setown(r2, 0x8, r0)
tkill(r0, 0x14)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
setsockopt$inet_mreqn(r0, 0x0, 0x20, &(0x7f0000000080)={@dev, @dev}, 0xc)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff})
fcntl$getown(r0, 0x5)
r0 = openat$thread_pidfd(0xffffffffffffff9c, &(0x7f0000000000), 0x200601, 0x0)
fcntl$setpipe(r0, 0x407, 0x0)
r0 = socket$nl_route(0x10, 0x3, 0x0)
getsockopt$sock_int(r0, 0x1, 0x8, &(0x7f00000000c0), &(0x7f0000000000)=0x4)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet_icmp(0x2, 0x2, 0x1)
sendmsg$inet(r0, 0x0, 0x0)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f00000002c0)='ns\x00')
getdents(r0, &(0x7f0000000100)=""/111, 0x6f)
getdents(r0, &(0x7f0000000040)=""/179, 0xb3)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x12, &(0x7f0000000200), 0x4)
r0 = socket$unix(0x1, 0x2, 0x0)
recvmmsg(0xffffffffffffffff, &(0x7f0000000e40)=[{{0x0, 0x0, &(0x7f0000000640)=[{0x0}, {&(0x7f0000000340)=""/24, 0x18}], 0x2}}], 0x1, 0x0, 0x0)
bind$unix(r0, &(0x7f0000000100)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
connect$unix(r0, &(0x7f0000000180)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0x6e)
writev(r0, &(0x7f0000000040)=[{&(0x7f0000000000)="d2", 0x1}], 0x1)
recvmmsg(r0, &(0x7f0000000300), 0x40000000000049e, 0x1000000000fe, 0x0)
r0 = socket$packet(0x11, 0x2, 0x300)
ioctl$sock_ifreq(r0, 0x8970, &(0x7f0000000200)={'lo\x00', @ifru_names})
pipe(&(0x7f0000000080)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
flock(r0, 0x2)
flock(r0, 0x6)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
unlink(0x0)
fallocate(r0, 0x0, 0x102000006, 0x6)
pipe(&(0x7f00000000c0)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = socket$inet_udp(0x2, 0x2, 0x0)
fcntl$setpipe(r0, 0x407, 0x0)
write$binfmt_misc(r1, &(0x7f0000000200)=ANY=[], 0x4240a2a0)
connect$inet(r2, &(0x7f0000000040)={0x2, 0x0, @loopback}, 0x10)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
splice(r0, 0x0, r2, 0x0, 0x2ffff, 0x0)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000100)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
getsockopt(r0, 0x6, 0x7, 0x0, &(0x7f0000000040)=0xf)
r0 = syz_open_procfs(0x0, &(0x7f0000000040))
fcntl$notify(r0, 0x402, 0xb1c661d318500c51)
fcntl$setownex(r0, 0xf, &(0x7f0000000080))
r1 = syz_open_procfs(0x0, &(0x7f0000001100))
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
getdents64(r1, &(0x7f0000000080)=""/4082, 0xff2)
mknod(&(0x7f0000000000)='./file0\x00', 0x1120, 0x0)
timer_create(0x0, &(0x7f00000000c0)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000080))
r0 = openat(0xffffffffffffffff, &(0x7f00000000c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
open(&(0x7f0000000140)='./file0\x00', 0x0, 0x0)
timer_settime(0x0, 0x0, &(0x7f000006b000)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x14, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000040)=<r1=>0x0)
timer_settime(r1, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
open(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
creat(&(0x7f00000002c0)='./file0\x00', 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
mount$9p_rdma(&(0x7f0000000040), &(0x7f0000000080)='./file0\x00', &(0x7f0000000100), 0x0, 0x0)
rt_sigreturn()
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
pipe(&(0x7f0000000080)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
flock(r0, 0x5)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rt_sigreturn()
flock(r1, 0x1)
flock(r0, 0x6)
rt_sigreturn()
r0 = getpid()
r1 = getpid()
rt_tgsigqueueinfo(r1, r0, 0x0, &(0x7f0000000100))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000140)='ns\x00')
getdents(r1, &(0x7f0000000080)=""/111, 0x6f)
getdents64(r1, &(0x7f00000001c0)=""/188, 0xbc)
syz_emit_ethernet(0x2a, &(0x7f00000000c0)={@remote, @empty, @void, {@arp={0x806, @ether_ipv4={0x1, 0x800, 0x6, 0x4, 0x1, @random="a4886c9128ef", @broadcast, @remote, @multicast1}}}}, 0x0)
r0 = semget(0x2, 0x0, 0x260)
semctl$GETVAL(r0, 0x2, 0xc, &(0x7f0000000180)=""/19)
r1 = semget$private(0x0, 0x1, 0x12a)
semctl$GETPID(r1, 0x4, 0xb, &(0x7f0000000040)=""/42)
semctl$GETNCNT(r0, 0x1, 0xe, 0x0)
r2 = semget(0x3, 0x0, 0x408)
semctl$SEM_STAT_ANY(r2, 0x3, 0x14, &(0x7f0000000080)=""/250)
r3 = semget(0x3, 0x2, 0x4)
semctl$SEM_STAT(r2, 0x3, 0x12, &(0x7f0000000000)=""/13)
semctl$IPC_RMID(r3, 0x0, 0x0)
clone(0x0, &(0x7f0000001240), 0x0, 0x0, &(0x7f0000001340)="a6")
clone(0x0, 0x0, 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
rt_tgsigqueueinfo(r0, r0, 0x16, &(0x7f00000000c0))
ptrace(0x4206, r0)
ptrace$cont(0x7, r0, 0x0, 0x0)
exit(0x0)
waitid(0x0, 0x0, &(0x7f0000000040), 0x4, 0x0)
syz_emit_ethernet(0x3e, &(0x7f0000000100)={@link_local, @local, @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x1, 0x0, @initdev={0xac, 0x1e, 0x0, 0x0}, @local}, @dest_unreach={0xb, 0x0, 0x0, 0x0, 0x0, 0x0, {0x5, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x33, 0x0, @initdev={0xac, 0x1e, 0x0, 0x0}, @multicast1}}}}}}, 0x0)
creat(&(0x7f0000000540)='./file0\x00', 0x0)
mount(&(0x7f0000000000)=ANY=[], &(0x7f00000000c0)='./file0\x00', &(0x7f0000000040)='tmpfs\x00', 0x0, 0x0)
mount(&(0x7f0000000000)=ANY=[], &(0x7f00000000c0)='./file0\x00', &(0x7f0000000080)='sysfs\x00', 0x0, 0x0)
open(&(0x7f0000000000)='./file0\x00', 0x0, 0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000380)={0x2, 0x4e22}, 0x10)
r1 = perf_event_open(&(0x7f0000000700)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={0x0}}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
listen(r0, 0x12)
setsockopt$inet_tcp_int(r0, 0x6, 0x22, &(0x7f0000000040)=0x1, 0x4)
syz_emit_ethernet(0x37, &(0x7f0000000780)={@local, @link_local, @void, {@ipv4={0x800, @tcp={{0x5, 0x4, 0x0, 0x0, 0x29, 0x0, 0x0, 0x0, 0x6, 0x0, @remote, @local}, {{0x0, 0x4e22, 0x41424344, 0x41424344, 0x0, 0x6, 0x5, 0x2}, {"fc"}}}}}}, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
dup3(r1, r0, 0x0)
syz_emit_ethernet(0x36, &(0x7f00000000c0)={@local, @broadcast, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, '\x00', 0x0, 0x3c, 0x0, @local, @empty}}}}, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f0000000080)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x1, 0x0)
chdir(&(0x7f0000000200)='./file0\x00')
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mknod(&(0x7f0000000340)='./file0\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
syz_open_procfs(0x0, &(0x7f0000000080)='fdinfo/3\x00')
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
close(r0)
syz_open_procfs(0x0, &(0x7f0000000080)='fdinfo/3\x00')
r0 = shmget$private(0x0, 0x2000, 0x0, &(0x7f0000ffc000/0x2000)=nil)
munmap(&(0x7f0000ffd000/0x3000)=nil, 0x3000)
shmat(r0, &(0x7f0000ffd000/0x3000)=nil, 0x0)
mlock(&(0x7f0000ffa000/0x4000)=nil, 0x4000)
msync(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0x4)
setgroups(0x1, &(0x7f00000024c0)=[0x0])
getgroups(0x1, &(0x7f0000000240)=[0x0])
r0 = openat$tcp_mem(0xffffff9c, &(0x7f0000000000)='/proc/sys/net/ipv4/tcp_wmem\x00', 0x1, 0x0)
lseek(r0, 0x4, 0x2)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
pipe2$9p(&(0x7f0000000080)={0xffffffffffffffff, <r1=>0xffffffffffffffff}, 0x800)
r2 = openat$full(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
splice(r2, &(0x7f00000001c0), r1, 0x0, 0x9, 0x0)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000040)={<r0=>0xffffffffffffffff})
setsockopt(r0, 0x6, 0x3, &(0x7f0000000080)="e54c93e5", 0x4)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
unshare(0x8000000)
shmget$private(0x0, 0x4000, 0x861, &(0x7f000056d000/0x4000)=nil)
unshare(0x8000000)
r0 = syz_open_procfs(0x0, &(0x7f0000000000)='task\x00')
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
getdents64(r0, &(0x7f00000000c0)=""/95, 0x5f)
getdents(r0, 0x0, 0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
r1 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
mmap(&(0x7f0000000000/0xe7e000)=nil, 0xe7e000, 0x2000009, 0xc011, r1, 0x0)
getsockopt$bt_hci(r0, 0x0, 0x1, 0x0, &(0x7f00000000c0))
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = gettid()
timerfd_settime(0xffffffffffffffff, 0x0, 0x0, 0x0)
tgkill(r0, r1, 0x24)
r0 = socket$packet(0x11, 0x2, 0x300)
ioctl$sock_ifreq(r0, 0x8913, &(0x7f0000000200)={'lo\x00', @ifru_names})
r0 = semget$private(0x0, 0x2, 0x0)
semctl$GETPID(r0, 0x0, 0x10, 0x0)
clock_gettime(0x0, &(0x7f00000000c0))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
select(0x0, 0x0, 0x0, 0x0, &(0x7f0000000100))
fork()
setgid(0x0)
pipe2(&(0x7f0000000580), 0x100000)
openat$ptmx(0xffffffffffffff9c, &(0x7f0000000080), 0x10000, 0x0)
io_setup(0x100, &(0x7f0000000080)=<r0=>0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1000006, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000002100)={0xffffffffffffffff, <r2=>0xffffffffffffffff})
io_getevents(r0, 0x2, 0x2, &(0x7f0000000200)=[{}, {}], 0x0)
io_submit(r0, 0x1, &(0x7f00000023c0)=[&(0x7f00000021c0)={0x0, 0x0, 0x0, 0x0, 0x0, r2, 0x0}])
fork()
io_destroy(r0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
socketpair$unix(0x1, 0x2, 0x0, &(0x7f0000000200)={<r1=>0xffffffffffffffff})
getsockopt$sock_timeval(r1, 0x1, 0x14, &(0x7f0000000140), &(0x7f0000000200)=0x10)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f0000000000)='gid_map\x00')
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x0, 0x12, r1, 0x0)
semop(0x0, &(0x7f0000000180)=[{}], 0x1)
pwrite64(r0, 0x0, 0x0, 0x100000800)
r0 = socket$inet_icmp(0x2, 0x2, 0x1)
shutdown(r0, 0x1)
write$eventfd(r0, 0x0, 0x0)
r0 = openat$tcp_mem(0xffffffffffffff9c, &(0x7f00000003c0)='/proc/sys/net/ipv4/tcp_rmem\x00', 0x1, 0x0)
writev(r0, &(0x7f0000000140)=[{&(0x7f0000000040)='5', 0x1}], 0x1)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$inet6_buf(r0, 0x6, 0xb, 0x0, &(0x7f0000000040))
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$packet(0x11, 0x2, 0x300)
accept$packet(r0, 0x0, 0x0)
rt_sigreturn()
r0 = signalfd(0xffffffffffffffff, &(0x7f0000001cc0), 0x8)
renameat2(0xffffffffffffff9c, &(0x7f00000000c0)='./file0\x00', r0, &(0x7f0000000100)='.\x00', 0x0)
mkdir(&(0x7f0000000100)='./file0\x00', 0x0)
mount(&(0x7f0000000040)=ANY=[], &(0x7f0000000040)='./file0\x00', &(0x7f0000000480)='sysfs\x00', 0x0, 0x0)
newfstatat(0xffffffffffffff9c, &(0x7f0000000240)='./file0\x00', &(0x7f0000001500)={0x0, 0x0, 0x0, 0x0, <r0=>0x0}, 0x0)
setreuid(0x0, r0)
creat(&(0x7f0000000080)='./file0\x00', 0x0)
mlock2(&(0x7f0000ffa000/0x4000)=nil, 0x4000, 0x0)
mremap(&(0x7f0000ffd000/0x3000)=nil, 0x3000, 0x4000, 0x0, &(0x7f0000ff7000/0x4000)=nil)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000000)={<r0=>0xffffffffffffffff})
r1 = getpgid(0x0)
ioctl$sock_SIOCSPGRP(r0, 0x8902, &(0x7f0000000040)=r1)
ioctl$sock_FIOGETOWN(r0, 0x8903, &(0x7f0000000180))
getpriority(0x0, 0x0)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000080)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
shutdown(r0, 0x0)
recvmmsg(r0, &(0x7f0000006d80)=[{{0x0, 0x0, 0x0}}, {{0x0, 0x0, &(0x7f0000003380)=[{0x0}, {0x0, 0xfffffffffffffef4}, {0x0}, {0x0}, {0x0}, {0x0}, {0x0}, {0x0}, {0x0}], 0x9}}], 0x2, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
get_robust_list(0x0, &(0x7f0000000100)=0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
openat$zero(0xffffffffffffff9c, &(0x7f0000000180), 0x115441, 0x0)
exit_group(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x180000a, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000280)='fdinfo\x00')
getdents(r1, &(0x7f0000000000)=""/43, 0x2b)
getdents(r1, &(0x7f0000000080)=""/59, 0x3b)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
socketpair$unix(0x1, 0x0, 0x0, 0x0)
splice(0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0)
tkill(r0, 0x18)
r0 = socket$netlink(0x10, 0x3, 0x0)
sendmsg$netlink(r0, &(0x7f0000001ac0)={0x0, 0x0, &(0x7f0000001a40)=[{&(0x7f0000001980)={0x94, 0x14, 0x1, 0x0, 0x0, "", [@typed={0x82, 0x0, 0x0, 0x0, @binary="b8e962b6ea9cb644bc62b39d7eb0abb972681577b1664ca24190ea439d60534e222f2e1dc145089b4500786ee7e0f1d0b9a7d1089ea5f9cfa9026a297487429b866efaf0ab5c19355b5b79199936a93bb52badeeecf60aa33895a34572d529be45410e742eb9afdb50fc98375640203a7c902d8aa1d6ab65b263b106e790"}]}, 0x94}], 0x1}, 0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
bind$inet6(r1, &(0x7f0000000500)={0xa, 0x2}, 0x1c)
listen(r1, 0x0)
connect$inet(r0, &(0x7f00000001c0)={0x2, 0x2, @empty}, 0x10)
setsockopt$inet_tcp_int(r0, 0x6, 0x3, &(0x7f00000002c0)=0x3, 0x4)
sendto$inet(r0, &(0x7f0000000200)='\'', 0x1, 0x0, 0x0, 0x0)
sendto$inet(r0, &(0x7f0000000000)='\f', 0x1, 0x0, 0x0, 0x0)
r0 = eventfd(0x0)
io_setup(0x2e, &(0x7f0000000400)=<r1=>0x0)
r2 = socket$packet(0x11, 0x2, 0x300)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r3, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
io_submit(r1, 0x0, 0x0)
io_submit(r1, 0x2, &(0x7f0000000100)=[&(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, 0x0, r2, 0x0, 0x0, 0x0, 0x0, 0x3, r0}, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, 0x0, r3, 0x0}])
mmap(&(0x7f00009fd000/0x600000)=nil, 0x600000, 0x0, 0x6031, 0xffffffffffffffff, 0x0)
mremap(&(0x7f0000a01000/0x4000)=nil, 0x4000, 0x800000, 0x3, &(0x7f0000130000/0x800000)=nil)
mremap(&(0x7f0000aa2000/0x3000)=nil, 0x3000, 0x3000, 0x3, &(0x7f0000dd1000/0x3000)=nil)
mremap(&(0x7f0000800000/0x800000)=nil, 0x800000, 0x4000, 0x0, &(0x7f0000f03000/0x4000)=nil)
prctl$PR_SET_KEEPCAPS(0x8, 0x1)
prctl$PR_SET_KEEPCAPS(0x7, 0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x8, 0x0, 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x34006100, 0x0, 0x0, 0x0, 0x0)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
setresgid(0x0, 0x0, 0x0)
rt_sigreturn()
r0 = socket$netlink(0x10, 0x3, 0x0)
getpgrp(0x0)
sendmsg$netlink(r0, &(0x7f0000003bc0)={0x0, 0x0, &(0x7f0000001b00)=[{&(0x7f00000019c0)={0x18, 0x15, 0x1, 0x0, 0x0, "", [@typed={0x8, 0x0, 0x0, 0x0, @ipv4}]}, 0x18}], 0x1}, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
semget$private(0x0, 0x236f01f9a5f13a76, 0x0)
r1 = getpid()
rt_sigqueueinfo(r1, 0x39, &(0x7f0000000000))
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_int(r0, 0x0, 0x2, &(0x7f0000000100)=0xffffffff, 0x4)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
r1 = openat$fuse(0xffffffffffffff9c, &(0x7f00000000c0), 0x2, 0x0)
r2 = dup3(r0, r1, 0x0)
ioctl$int_in(r2, 0x5421, &(0x7f0000000040)=0x4)
read$FUSE(r1, &(0x7f0000000100)={0x2020}, 0x2020)
perf_event_open(&(0x7f0000940000)={0x2, 0x70, 0xfffffffffffffffd, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = memfd_create(&(0x7f0000000040)='\x00\x00\x00\x00\x00\x00z\x9b\xb6\xe8t%\xfc\x02\x00\x00\x009\xa0\x8b\x14d\xa2\xa1\xa8!\xe8\xd1\xa0\x8a\xce0\x1c\xb7\xf1\xccm\xce\xd4\xdb\x89\xe5\x8f\xe2\xb6\xd6\x9cF\xbd\xff\x14\xec\xd4D\x8a\x1f\x1b\xf6\x18\xf3\xdc\x91\'\x06\\8\r\xfc\xeeG\xbeQ\xee\xf0\x99\x1e|C\xd8\x01\xd0\xf5\xbb}\xeb\x86P=\xe51\x9d,\xb7\xe6_M\xbe\x19\xea#\xff[\xd1\xc3\x9a\xa3\x1b\xf9\xe9\x1d \xce1\xc9\x9f\xb0\x14\xc2\xeb\xf9\xceE\xad\xa4\x92\f\xef\x87g\xb6\xabW\xac\rP\xf42\xb7\xc8\xaajnW\n\r\x802\xd7\x1b$\x95tO*\xf4\xae\xb8\xb8m\xbf\r\xd5\xbf*\xfd\xc7\x85\x1b\x8b\xe5\x97j`c\xe0\x88?\xda\x8a#t>r\xae\xe8\xc9)', 0x0)
write$binfmt_elf64(r0, &(0x7f0000000540)=ANY=[@ANYBLOB="7f454c46020000002000000000c4a40003003e000039a59424cc0a00000020000000000000000000deeb7bba0028cc7ebd5d74dafc20380003"], 0x3c)
execveat(r0, &(0x7f0000000000)='\x00', 0x0, 0x0, 0x1100)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f00000000c0)={0x2, &(0x7f0000000040)=[{0x2, 0x0, 0x0, 0x8}, {0x6, 0x0, 0x0, 0x7fff7ffe}]})
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
r1 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f00000000c0)='memory.events\x00', 0x26e1, 0x0)
write$cgroup_int(r1, &(0x7f0000000200), 0x400086)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
semtimedop(0x0, &(0x7f0000000000)=[{}], 0x1, 0xfffffffffffffffd)
syz_emit_ethernet(0x3e, &(0x7f0000000180)={@random="5bc548d4b2c1", @broadcast, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "bf6176", 0x8, 0x3a, 0x0, @private0, @local, {[], @ndisc_rs}}}}}, 0x0)
r0 = socket$inet6(0xa, 0x3, 0xff)
getsockopt$IP6T_SO_GET_ENTRIES(r0, 0x29, 0x41, &(0x7f0000000000)={'nat\x00', 0x42c, "ffcdd3843ee2b59babeec38825513a69ab794f3d2460540826b8bacfb0ebe706b9c802dd6ce4595adb3e4ee91807e6afb1b7683d0c7d632d381b70d2c3b37993031f2ed6116792171e2c0b27e1a7ea95f83932a42567ce962e0eb3fa1df888be56ba97fd591348e44b684ef3f379d5948e0f680edd72461fad88c65a41d77ad99a87dac9fff84111e4f64560acdb2f1287beaee965fa9118f33cecd6ab14d5210befc958f4efdbe363cf84647b5180b25bed8af4850065c7746fabeb1dee5ebcd644acda5b346b830a7f4e675ba5c593c3c1a13c430d2d42397a1c490ad63e465cf179bb49d27eaf8ca85a04bade91a8143d28e3576e7146c94cc1a943e082bdc9ba6d60af2893e510e01c30be9484cd31ecd08ccd6daf67f513471bf5f9c26827c0909fa171a7e559995da08d959df9eda5113f9ec50b5894e1312444936ddafa6e9eb8183a6d0cd3faa0c27c4aec3e096220ee69d111c9ae5d90f23b3c9a72f0cd66a22000e0a35be0e9292c683902c1dde39e54aca788404b5bf2106f657ecc8871a3db93988eef628a76c857edbdffedf3e91bc081f663ec29b2e88f9b5ed078e412413d2fdf8c31f2b8a38f5e6dea59c07df16062c586fb7871988834da23cde39c9c5e3d757ed248a6fa75ceb38a6940286143e83777ee09e9e96bf97054b8962266769ffd560f87c67843e27f5d400babe5f2bcc2939781ee0a3d751959f58c6863ff84321e0f89560e2e839fb840e2cd071658cb27336f8608380590a65e3e72812f8459e663a566b2edec5aa61ff45ce63ffa42dd93e3b7a4bcec3ffab92e3032cb607d6b15bdb9b27d1486ab816dcbb10790ea6b66dbc9c270d0ec0bdde75ed11f6cbdb871e6b64475a645ec3f0a8b027408065c16b09914146ef2cac4ba5083ae0683c78f3573faf71f163634c57e203ec1d0a960344d2942eaf1dc2dc40551add89f236e130a59d3249397031c097a10d35eee3f2c10b02c4eb42882571a913945ae2ed4ee6beb6d1dc2ea8bdcc10ee27eeafe442c897d744e683a64324020a254cd2ebc4fb5c4197e812e02a2fc1cf88e923a42e632db4e64ee6f2799009ac0d6209117fbfd408befd107e3e91268c8c7f9a2c9faa7446dea59e3641b3d027299634b759acb2bcdccc851489a827faf71bdd303adaf3df034484b1fda3e2e65ef86ea37989d963665007c229c869ea4112aca2cbf0099343ade4019e11df445eca1cb8797995e463d02d8db69b4cc818762d0ff8cdcb007decd057ddeacfcaa5a61a0b9b73d2fb20e5c98dc0459168f069b64f9b4790767d21a84d4ecdeb88d4b94cb4a23b1981680f453fc1c2dd6627686fdb3614f46f4451560fea63c0a5ab42f3fd7695978e92f10920c21acc639c286fa8ac7ffdb50ece94b5c2fb755a6706e4c88bbf5e06b324e562d4e4d01a199b8b9576f374460774ef91215e33a4cd71ac42cfa8ac23abb4f482fa58a61fd865c1b0a8993867466139eeb7f8670a53d45cbea70f5"}, &(0x7f0000001040)=0x450)
utimensat(0xffffffffffffff9c, 0x0, 0xfffffffffffffffd, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
setgid(0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x3800009, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
r2 = syz_open_procfs(0x0, &(0x7f0000000040)='net/route\x00')
preadv(r2, &(0x7f0000003640)=[{&(0x7f0000000400)=""/4096, 0x1000}], 0x1, 0x0, 0x0)
perf_event_open(&(0x7f0000000200)={0x2, 0x70, 0x42, 0x8001}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = fork()
waitid(0x0, 0x0, 0x0, 0x1000004, 0x0)
rt_sigqueueinfo(r0, 0x7, &(0x7f0000000000)={0x0, 0x0, 0xffffff79})
r0 = socket$inet(0x2, 0x4000000000000001, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x80000000000002, &(0x7f00000005c0)=0x169, 0x4)
r1 = fcntl$dupfd(r0, 0x0, r0)
setsockopt$inet_tcp_int(r1, 0x6, 0xa, &(0x7f0000000000)=0x2, 0x4)
bind$inet(r0, &(0x7f0000deb000)={0x2, 0x4e23, @multicast1}, 0x10)
sendto$inet(r0, 0x0, 0x5, 0x200007fd, &(0x7f0000000040)={0x2, 0x4e23, @local}, 0x10)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
pipe(&(0x7f0000000100)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
r2 = timerfd_create(0x0, 0x0)
splice(r2, 0x0, r1, 0x0, 0x43, 0x0)
ioctl$sock_TIOCINQ(r1, 0x541b, &(0x7f00000000c0))
timer_create(0x0, &(0x7f0000000040)={0x0, 0x12, 0x0, @thr={0x0, 0x0}}, &(0x7f0000000180))
timer_settime(0x0, 0x0, &(0x7f0000000000)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x16}, &(0x7f0000000100)=<r3=>0x0)
timer_settime(r3, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r0 = getpid()
get_robust_list(r0, &(0x7f00000000c0)=0x0, &(0x7f0000000300))
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
mknod(&(0x7f0000000200)='./bus\x00', 0x1000, 0x0)
r2 = open(&(0x7f0000000080)='./bus\x00', 0x42202, 0x0)
r3 = open$dir(&(0x7f0000000180)='./file0\x00', 0x7e, 0x0)
dup2(r3, r2)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x8, &(0x7f0000000040)=0xffffffff, 0x4)
getsockopt$inet_tcp_int(r0, 0x6, 0x8, &(0x7f0000000000), &(0x7f00000000c0)=0x4)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
poll(0x0, 0x0, 0x6cd)
clone(0x2008321cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
timer_create(0x0, &(0x7f0000000500)={0x0, 0x12}, &(0x7f00000000c0))
timer_settime(0x0, 0x0, &(0x7f000006b000)={{0x0, 0x989680}, {0x0, 0x3938700}}, 0x0)
timer_create(0x0, &(0x7f0000000000)={0x0, 0x16}, &(0x7f00000003c0)=<r1=>0x0)
timer_settime(r1, 0x0, &(0x7f0000000180)={{0x0, 0x989680}, {0x0, 0x1c9c380}}, 0x0)
recvfrom(0xffffffffffffffff, 0x0, 0x0, 0x2004, 0x0, 0x0)
syz_emit_ethernet(0x36, &(0x7f00000001c0)={@broadcast, @broadcast, @void, {@ipv4={0x800, @dccp={{0x6, 0x4, 0x0, 0x0, 0x28, 0x0, 0x0, 0x0, 0x21, 0x0, @rand_addr=0x64010100, @local, {[@end={0x10}]}}, {{0x0, 0x0, 0x4, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, "53db76", 0x0, "4238e3"}}}}}}, 0x0)
mkdir(&(0x7f0000000100)='./file0\x00', 0x0)
mkdir(&(0x7f0000000040)='./bus\x00', 0x0)
r0 = creat(&(0x7f0000000040)='./bus/file0\x00', 0x0)
r1 = openat$dir(0xffffffffffffff9c, &(0x7f0000000380)='./file0\x00', 0x0, 0x0)
unlink(&(0x7f0000000180)='./bus/file0\x00')
linkat(r0, &(0x7f00000000c0)='\x00', r1, &(0x7f0000000080)='./file1\x00', 0x1400)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x1, 0x0, 0x0)
lseek(r0, 0x0, 0x2)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
ioctl$sock_SIOCDELRT(r0, 0x8906, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x3000002, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setitimer(0x1, 0x0, &(0x7f0000000140))
pipe(&(0x7f00000000c0)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
read(r0, &(0x7f0000000100)=""/167, 0xa7)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r3 = socket$inet_udp(0x2, 0x2, 0x0)
write$binfmt_misc(r1, &(0x7f0000000140)=ANY=[], 0x4240a2a0)
bind$inet(r3, &(0x7f00000002c0)={0x2, 0x0, @local}, 0x10)
connect$inet(r3, &(0x7f0000000040)={0x2, 0x0, @multicast1}, 0x10)
splice(r0, 0x0, r3, 0x0, 0x2ffff, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r1 = socket$unix(0x1, 0x1, 0x0)
r2 = socket$unix(0x1, 0x1, 0x0)
setrlimit(0x7, &(0x7f0000000080))
dup3(r1, r2, 0x0)
r3 = getpid()
rt_sigqueueinfo(r3, 0x39, &(0x7f0000000000))
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
ptrace(0xffffffffffffffff, 0xffffffffffffffff)
pipe2(&(0x7f0000004440)={0xffffffffffffffff, <r0=>0xffffffffffffffff}, 0x0)
unlinkat(r0, &(0x7f0000000240)='./file0\x00', 0x200)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
r0 = socket$unix(0x1, 0x5, 0x0)
setsockopt(r0, 0x1, 0xd, &(0x7f0000000140)="fb000088", 0x4)
clone(0xa912d700, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
sync_file_range(0xffffffffffffffff, 0x0, 0x0, 0x4951e656620bb816)
setitimer(0x0, &(0x7f0000000000)={{0x0, 0x2710}, {0x0, 0x2710}}, 0x0)
clone(0x0, 0x0, 0x0, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
mount$overlay(0x40000a, &(0x7f0000000000)='./file0\x00', &(0x7f00000000c0), 0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="6c6f7765726469723d2fca"])
r0 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setrlimit(0x0, &(0x7f0000000080))
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x3800001, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000200)=[{0x0}, {0x0}], 0x2, 0x0, 0x0)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
mbind(&(0x7f0000ffa000/0x4000)=nil, 0x4000, 0x3, &(0x7f0000000300)=0x1, 0xfff, 0x0)
mlock2(&(0x7f0000ffb000/0x4000)=nil, 0x4000, 0x0)
mlock2(&(0x7f0000ffa000/0x3000)=nil, 0x3000, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = creat(&(0x7f0000000300)='./control\x00', 0x0)
fsetxattr(r0, &(0x7f0000000000)=ANY=[@ANYBLOB='user.'], 0x0, 0x0, 0x0)
flistxattr(r0, 0x0, 0xe)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit(0x0)
exit(0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
umount2(&(0x7f00000005c0)='./file0/../file0\x00', 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
fallocate(r0, 0x0, 0x102000006, 0x6)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
pipe(&(0x7f0000000200)={0xffffffffffffffff, <r2=>0xffffffffffffffff})
ioctl$TUNATTACHFILTER(r2, 0x401054d5, 0x0)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000000140)={<r0=>0xffffffffffffffff})
getsockopt$sock_linger(r0, 0x1, 0xd, &(0x7f0000000180), &(0x7f00000001c0)=0x8)
perf_event_open(&(0x7f0000000040)={0x1, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x50e}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
openat$pidfd(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r0 = memfd_create(&(0x7f0000000180)='\xb3', 0x0)
write$FUSE_DIRENT(r0, &(0x7f0000000080)=ANY=[], 0x29)
mmap(&(0x7f0000000000/0x2000)=nil, 0x2000, 0x81, 0x11, r0, 0x0)
readlinkat(0xffffffffffffffff, &(0x7f0000000000)='./file0\x00', &(0x7f0000007140)=""/122, 0x7a)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
syz_fuse_handle_req(0xffffffffffffffff, &(0x7f0000000000)="9eda438838743bd4e9720bee57093515dc189a5ea685e9556c1c2c3cfc4df50d66d31a48aa312663b68d18c5826b5b55fb738208863dac0f10f423aee7a5d8ddc45ebdfeb7424bae859d7c37ecfc4b63914d5a56d91017dd22bc84f759a15969951aef9d5c88c96560896988fa18cd946cfcc3a0f1c993348377904eac32c980bdf7976ebca2b499cab63c4e841514277fc71d4620e29a92523402485de0e82896484c0ae497a4d686df23ca7b68c3fd5e624d3510d7f94838e54af877ca58a00c5a672bba11f5aa1ed1980dfef47b9973d0bf456ded5e72f1702b3dc5197fce39cba53a038d8dc0ec783ce70577107dc5e8b299e64a0b7f1191f0926bd25762370191710bab2f44e9069f55f8a3f87e4cb488a2fb3348c0bf3b3874291f83e4776b160ea73aafa3919c7c069c73c0052173a63158db8b65541d161f9c964926ad7f06bdd6cb6a32135b04e35701c2e13c49c1f75dc7a25d623378860692d172ec3f1e1f2d9dc77c015c13721efcb101c2390abb847e871132f472a37cc0163b39b1d575a5444e246a08a1afb1a696cabab29498a314429a3b9f44c43ba29f71fac1fbe0d01c3c16d22730932704bcfb0c1b7a432bc51dd3f5dd5afc3b342cbe6a6ff899039e28f9a51881b1d46fdcf31767cb6f5c5c69ab3c80615d77c4d1664fc4ec831b8cea2e752bbb7a9ce79df875b29f1e232751daf32a1a0c4ff8bd0688e2b8e2d668b8a77e20a9eb6ec2e2c23b94e507baeacbcfa31fb6e1ca3343668f43e3aa6d85e7c29bf0bb4dbdabddc92be7f4a6f5d21b19e6da17bfb6cc926e3847532fae29c7b62fb909130ec372d3c16cfe6aaf3ce2af0fe7610fde7aad61bc80d2f96b999c8ccf6d22cf903ca8ae8b879ec4a416f334982e9810c0140a18d4dc81b5edaae23e9f4abaf40ed71512aebbba5bb251545e188db789558a845a2877b14bdaeec3c738b7d730c0860531bf5517d4f0e8f95ed3571f8a35816d5116fcb8d7cbf42b7d5d5e65541508c898bb2e0fe96297d2ab7135662de39df099ebaed5871111f5346278cee5728cec512e6c0a0d65b51e3d627873195b84103341c2bc83b6c8fdd8ba17f5957413f61c69d618c9b9d0b1f08dc81921b6c662ee1da3bfa019b095e9a03c2db4d645ccb7364e895098cbf7d932c72d80663c7a1694d122f7348393079223c11d36c64a5856eae0397ab9a9d948204b74e56525a9d552dd0916de81cbb5af3c59b3d7f8f9154423ce2cb45a5bc808e24bef13212019a19545fe54ba84d01534358380192b8c7b0eda907810375bb66a578a58fec392b47991271c8367b91d710e8a176bc1a4e96f0e137d4c25fbb03eddc392f9f170dd744472b864fbbae7c93d86e682308b21b73c5652065d72cf02e1152b44024a90a3b52eb0bb3cb412e518d37a68aa4c7f46789c54ab30d3a73d0a8712fde612294cda2aa1ccf164930b9b1d17801d4fbb06e849d39bf2b5141330caa0d2618b616f1c67e1ca57080e79ed9092ba7a55e8121cfc825cd26a0199a479a7ab1b7b23d2a4dd82fa6d04ee41ca680435efc934f0451e865e8632ac2f1115f4cdd33b0fccb7a2326127faf20cba37c828613dba5a98f4e1ad25eb6b91078cf73d873df9ef91531476f64b83559ff7ccdc4c070d478b18196ea05fe8d4ea0216ee5273dfabbd04582f40f064c9781afd2cbf30901f28cd09cc934f1b2d50883778274177e3dba8af0a1b931d80ce1a6c4085780ea2195b65ecfd2953f78a5290fe560d0cd6a5e73890a5a82dc410b92a3ef2be05ec5607820fd4ca6b9c3aa258d59022fdcb21665f1ce4e8aad8fd918c43bd3c2afe3dc223ff9f48831d401c8b6996190793d1dd7551f8511b69283992398d8f9b4bd2b3398d3b8c6f3c5d8b802ca5282b702df2b7be4b38e70c3065f8da888631375afcc05ce578089c4f783776b286b7a60d1b5e189e2742a3240c1036a953d886885422eef01413c38099b64505fd5a73488acb4e611820674c58ae74d6c64a885d4beda9bd7903bcdc71e3711e2a057c0eab2100c321050ab14c6e453c53182577ad3178603cd9afde40a701120e9a36074fd582428c74e02781318e6c65450f8f020bd22475696fe13b8c59260e53a06d16eabd135e887a0a6bbc8ad21be7661df76fec5b13844f68b8eed1a7379713738beac9f23c7a26520e19797a910cde9fb285179526889b908b7eb49bb06f70f6271fba8712c1a4269ebcf4b7d043e924e3d2c4c753fd7e547d95841e335179836f76424e728810d7f32b78256ea30c79d9238a6588426e1f2d4c0b03d5605bd826ed24f0f11326b4cf958632b86e017aa80e142db1580c44f76d9c98196f3f6852ab2bfc6a01a3553a130c2d171957f5a45c3550fbbc990ef8742a98a86b280a57b9f198ff436bc01161ada50e6f23026c3254adf2321bff7e20aa54080bbb57d8d52c6a6df6107706a2e5bc6da68f17
b474c0edd39401d765086e885cf7992405f856557915603cbe8894676e996bbadbb649a5e7498b91f9bd2f697dd9ebbe4d386050258b9f4c94781e61c660651c3f1e3ae51f8c035eca365bf1c23dcacbd6e225490d7e9c133525f5c9018d752b21b4897bf18b64b6a9936f538a0a8958fc934440aeeaad2b68ac844d76f0900a6c95bd0b353d85d4fb62eb88360112237fd8c636a80e3130b21d66ae8ec58a4b76cba0602f96da919f7e84fd37e3ec2379f58e389a39c78d2482e03c379e3c4649ad63a76e3707ecff07d2fcb0c9dfc524cab49e69a09c92e4f88714335cb57d3f6184d07bef9657280fb5c9fd2d8f940f7ac6c5407e3077aa2e4ba8e217e0ee19e302d6d90e3be05a86dade35d2e454e511afb5cf5936f1d11f2fa6be6ceaa817dbdc7a6aabf2fad8ff3efa8382a25099f0c5989d2ad56ae0f4968b2cfcfc67b4f1c161c75900b4848f59a3c0376dfcb7997bf28e9e85d6dd942a360516de38e1c1a038a796f9a77ff2b0c7e5e8f4932391a0e58e76dacc6f9764178a211dfde3e75d367d2911ff398126ffdf83cf2fbdf1ad5232bed9155f7a168638a572094a9e934d4969b358cf6e121d7fd2aeae2f499068b42c152f0e3403a230885d6f92f038ddaa23499f804ffb06abdbabb51f6c38c92fb1a6271a4b13d6d11125b8ec12efa5907dc65062797fb9cca15e2f254e76b182d3fcdb4e96ac4de36d6df7e7bba5c32f422286b1be3b79bffb6fd693761952d195a84ad9ceb07287a0fbefab9e0347b513c5f60233ccd4b52d90ec144a2f896d9dc7f279f8aa93038f3efa286e1c3006933a4d7183d952f8d28b141b28b2af355b5bd8198dfde1ffb8d09202aff0d16ca3fec194662892a49f829813970a4520f1228aa03d211a45bed3b2e05bf1f10b1a152761e7b6c6ddea863a3c02224256092c70ca70dc185c4c385dd98b09e2682661e1e66f71d9c4037048eb70e8a1cbe57de87ec43713abf5fdcf63b9c482f318e3bec37e878dadbae15a02d731e6c8574eb14c059d72f73be5174add786d06b585a28a06d349d8e434a491b34897b3c1ad786ec8280d7f57edd4fbc6aea5485d659b59d393e331cf91e6ed76f340fcf7cf460892fa7318fc42b883f61d888ad982a751accb613c66661fba5f3d6de751a6a9ef8a4700316aaad04e991aab7903f4ef012ec2a8c092234e74ef335daf360ae47bbd2bbc6ad8c1a4f81efe8bbd703cb55ef36b32b4e30cb5a3b165c02ba295d0e1c40ce6ff8f479a74f01275f113ebfa8ade37a59ce70e6ca2a6f48f1be085f61bf772e2c2da523a2cfe63e99c57bdb1ff23139d4fca49eff7547e9880eefd3f7511a677efa23b52098ba89037c48dfcda2e8c1cfb9f892161049e53f8cee55256279512aecab8c441600dae0fd957883273047cf5c66ba209f830aa2ce0cbe41ca08c0cef4aed7f4324009200661a7ce680e5a8df2d051c1d8b2f63d25d8d74d05c75c46c8f3f24d625539e63459650960498a54ec3b16225bbbf4d3930009df265839d72611f5332a904cdebada108236e4414a2909ad01ec44b9d7f75de4385ad7ca5152e890a0919b3639fd1bcbca3b737ebb8d9ae541b1271cf2166ba15830e66f3d3afd3b754a7f81ad4f0999704ae99c114907c5be4a4797f13b80564f234723a34dbe137dabfd7fa23562df679f54a6ab54def6d63deae9844f72fd73efd0413551f5c4b9ee826eb3b7faf92a59ea34a16723b4fea14d1c8815a4e2d39fc48d1dbce526a7c53f5a96d0ef6463a0cee73fd3505f5c764a264b83c4a21f80e8b61c82d24442d13da99d18dc1b2538e7a510f6093d9ef2bc5cc777d4f98411e93919eddfd69d6e20d227cb61c50f358ea227f4de941fb080c1cf6b1f6e25533768fe133dbfc3f9d29c603bed38aa3c5af5b81a706b0067b40b88f992610d04c7cc36b8f649697cd6a93fae51138161891ae75a7147780fc59af5a6e18c54f9d2a4fe7fa92314b399afba9a40d0cc24f70a2593acf8d179215e06b7a9a88224bafcb2cbf60caf5fe4ff38208a70793b5dc33cd572956260e1c86312d3ba9b3a4b2b44376f2e78c616a6c0880ac8dcbaa30b9f761d500fd03a8518dd0509157b184a2d95e0caf3ffc8ac2db6c54d80c71a1e5b9ea3bf51071e2118af204123daceeb04e4f6f31f32a4d3fbb76ee49440cabda2c121c1b99acab5b87cecc37c3f9066af34ab29d6598bbfd91047a2ac7ce3a8f3027ff5e6d743506f161087278896a98ed37122ba208b61cf54d3929555ab06b564cd5e4f46f4755a6cfa2ef2b30d29ea66f2749d4060d411fa9160c91b6f55cf071ac8222c6313df18759e2958cddfe3db4cbeb9cd39abcf5f0beaecae8437813995cb7ed0b87d42ca942ff7245ece204798d01361c5f008e0d82bdf76660515bc78f7f8f409ccf68614b2cb50f5af2615661326fd971bc57eeeade60ea906b8df1cb0dfafd318cd2c396309c329d0469ca192aa8f51d7c4227685440f073983255baf054b
97b9d7be1d1470d7eabd5c09b2116b4e86b0567b7e97e088717a4fe3dbdd310a1c39136ea4d2c47492001f9885dba03bf97e7da376171d666441cdc2f999db137603d57df32b4260fa0165e82917bb1631ea314e7a7437e66fc68cef22cda8f456d6e583f6e3237e0bc79987a9103f7cf0918e26881f67ea582e1ff3a49177599d385bf6e42572a2547933aeddb826530e9adf30dd84c3a7fae5c4c26f6c6f3a9f0906decd314e2407825abef959c5416d18a92ff34e6c521a16e8a0a29937c77d4ee99b41d530a732acbe0bf5d274df9d496b47a9a624546bdcf9976cde12ec989cb2a70b33a7c8a3a77652023164695f9db30dfcf587f0cd4f73e385730bcbdd688f6dcb08ba0efbb9f579220afefa4acfea522e864fce9b1782ce9f14824d16e9d33a2609c23ba3c5a1af02549357a0dcc12e37819d778021762cf895abeac1125b744c8b8225a091e7be9ded9993cfa3ca9abb83e25c8f559009977a2ed9374a89619fae5ef6d164bb73d242004dc8428e44689b33ee3bbe88bb4962ab0a32a90e7aea044f08410752cb2d7aeaf3196648a3a99092665b478bb394b48f79b36db0efc7f50d6a5179c945f5298cfaac5e5dea715296f92abce7281d48a0c9c6b785a35ef5f1697c047ddb254fe9a8ab9f498b0c1ae09ffd01a3d8d427fee7e36c51e0e5c2fee2245fb8464626ab5c9857ebce91f7d22bf024d10c2d71021cd69268472de419e6cefd970ccc5858659be6496799aa7f100411766e712aff08b731460f14f9d7356db12cf8e1c6121968dc68b1d81c086b325ca4ce6fe1f476707e08fa913144b757c6be17cf93150db29544d207f09a896f33b7335d9339215da751e7af2c6bdd19db6f521af2c8a5998dc607f97026d07111488741134c1c86eba123273d1fd5ee4b471e86f9ae9478a04c7482076ab34a1eca5c64f89e5106eed44bceec019c67c12fb4db4fdac153f4ac3b63ffeb6d30de58ec039e2dd3c181e254cd94d0a2b0b44490384cc5915b54ee1db2b6d059879bf8126c9ca976d0f7862da07ecd350930a081810a7afd72b2ad3f65b96ae9c7f91227a2b5513a559f36b90fe01be9ae5ad3ca65e2c26f358fc26b858a3633fda7ae49a5fb705220a5819b3cca41b1ccc21d7c40f5fa9c422288efa5394e4312675899d704a2aab62b8363f58fd4bc12a8bea6ffc45b4414237bf5f019321206dbba439acb5ef26641f30fdac20f964354bce94e4c9d73e137f9806deefaf6f4acaa0e76ad4fef9f6cb7fc01bbabda9612c05adbe46afcf94819e8a4b4b49ff764784fa432d47fb6d4230900043d1b4521cd6839fe8c5df4d1899fdfb13880e207cac73f0a29020bdd563bd9c2f6bcd1ec523b3e03ebf6164fc65af001830c51396f9df2d346f83a59cfc82201cf1150ea57259d579fc2ed199b3fbe42d5188c84e4354610743e5b23a265246313c80b96d936969572e11316bc8926cb23115186f3b2387b82c3898fa41bf16a308da62d5a3eb3609af1943fddde08a4036eb2a41b7292caad9eb082614b02a1fa255bc7abd4d0e3b4ec1801e131e68c7aa9da1a0ff10f9de87dec8fad1ad8bfa99caa49e203a7b9c33e044d4544a537471e7a452468b821959bc488c6b8cbf81e90081a26de273ad1203cc06adb6af242ab19f96c1c66b58c37e2c9309704fba63af99a8d9c5efc651afb631fe9f546b938cc3b8e526c4159e5c9f7afb29fd1d55fabf09367ce2a63a35e7a2062d1c772ed981fd77157a847f687a177cf9886ce41df8cc509302b46bc1e2ba896b1c1656a1bbfdf4cd9ac39cf8510d1c823075f16550fd044aacc8d42a56f03718f7b18475cdc3999faeb25ab3dd8a807ee04d8e5d831d08b4e309dff50330685138797e10c6362636f53f22bfc1f3d5090a5d369282d9de36bb4e2505411ccc6ea395afa1567b15a2fb4be2adeea7126b1a8e80034105e0d98bdd78e796ce1cdc06a4ae666fc0baec5c52614340ed997673e26ec47c88846c000bb7c9075937cd44f5c041fdcc64986e5e1c0f488148f0ee6f842c44c0b72e82109270341bba6e9080b70fcf930d0f10be5a36798ef6051fed72727b72282ff164fc08319d74f1f57cde71b57cb397a9e753f87b97729bafba017a24cbfdee5dfe7fc296c112e93bb8fce560ca80a3afd8370baaa79ad783b51352b5440b144a47378c9ae22eda5794328e95bcca220fd07bb56915529b155c61858efe89ad36a79288e74c0e251addcfaf797432175a5562b46eff5e3aebeb74623e18beef85389383c604d8884431b07dc4bea0174aadc337ff41f558a63f16690feae47efa2a5d1318b7397e1e4ba398727d286791b71610e1d78d32800e7e113c12abf0f60b6ca4401ecd23b7aacd990633b2b017daf6bfef1b2361ece74b7dbcbb1a73d4bc1f9d2e5c9fb0b7980d25cc44d1b10c09ef5a6a05c84669294a5cadf0cd88ab449f9f0bcdd8c48590d416c5c1feaa494a2145949c2a3373df7c6014225f2745bbeb20ff
294d22c0d96ca111e6926946207cab56a03162a49e68968e398f70690188ee3ca847ef421742d60b9a6ad029e8a3d607950b2bf8ad8ff297cb39acc94905635770436e134435e28205140331b5100d9f64469792fffac87bca0835cbc617446ff86a7b50418c305f32e658b32130e491e38709fd3697017ac8084cdf1ed81a28375aed092ab4e32ca88a933154dd3a9e99351acbada926b67b310c7070ac1a414a28c5abfe1f45476249a12f18ca2d981528d881ed3c5072e46a6eff3cdf37dcbc89c7f79c88a1f8d15d15beb66a0e4440c7b93e379c4e2bac1d5c8e85f1852887e2cfeb178fba1c67dc2adb0c87df8ca4444ca7f455509f492effb5001328b8cc696e2933207a2d78bbce8562ca34a248193c914406b161c8141479d891b0c6110ec1e25cad38299b489f2ec437017cadba67dcb58abd4933c95b3526f1d4747b8701a7d71e446e4b62e2941d4281faca0cf22914be5aad80f47100000000ceb24e82508fe55a92fb6db70d03d1c1ec09cfee31639341756a4630a0eaaecac7bfbddf9d30c42cbd45eb181d5bd341307ad26f496bb042e2b655c03ac3dcc587acbf50f79b5c239be9938b62d3251b199f8413b020605d5d0552cfd9c39c9132719d6d0a326b000e12fcb51bc274df79d11430060d05978cdd50583f1bca82c57dbee605e2d00fcb5414af13a596d35cb5ba62de6a28cbccc857d23547b1c7fd5ac8fbf6758d5b8451fa46d9acc00344dc2e565674b1dd3547eb8f8aa5fff99042f8d1d59e6ad2f53379211e6832fcb68f5777eb2db85b28f724f4e4ce6342cf55713ff7b0cb4f7f47dd12a6566b86709eaefae024373267ce72a89e7f3e42ab48edcccc96b5d0403fe93a927e5ccf470014f220b8257393226cd7b996f20e6a34f81206733a9fdce03b701943c1b560d3eab68c2c225cf7f7f2b56123be2bb173e9e5b37f4d3348f6b987764ad07c2acd44514ff264d7eda31e5e517a179414841ad4553d51c08f435e05f10aa82d74b97a9ba3a133e6c9175fdcd4f3dc9c16d3be1d5bbaf13240177081ac1d56681bfa988a93af09868afd608520c0bfd71d857a6661fdaf6f2e166987eb007449dd26334ae932c5003fefc0f983b9e49cbfcea325f2de16a9ae935caa46f5b3433957fb370971ed957f138f08a60fed5b84995e428e7ae7d5c22021ff016baef0e713a118344c016a99ad469313ba7f2452da0dd82e019f64aa229cf80a69b3e08ac5847f10d247179855546313232f23e055c2f74ecef14e0fdcc29a9bf0976fbb249bd5c7903183d2a53c70960a183630e7d4928daa7091a85ad987d2a4a5b8f6be6612fa72d9fbb33c67bb38eff19f2e784f94e0354cf6d35a5b2c62233c039de3734b38e97ec72bd673fef09fd56fec329818cc68cdf12cb52f7d37a8350c16e94208880bfcd3e895d7aa4489e3dd15db4a9026f0d2a46f1e89c35845dbd976a1992b87c15a0c7580e6424b8792a7bb7b933d7c5433d4133ba4dbbcf7995d6ed3feaa32f876a287feeb9cc6107778c1f83e0119d980b9e994c2a3ae3de24a103efb3cacb746b49d1ad85746b233ab4aaf0e988ec2a786bc93f32040d3bdc3008031634cdfded5ac95b2279e096243228296591e7ba53c4a127772cc4620e6b238ccad250629194533d0a669ff3366c52d64928693e0b0cbb0b8e2c6029089d4dfe2b4b6c5dcd85f1a02770611e65001e48a32a8b0431a3b9d77fa3a95be38a0436a704c05a8e0183f3214c25531a63796f679bf72885aa766468d42b2543542d7e82544efc5c5e81e6a91a0f5d4e68000cff687d63e45c9a11d4ef515050daa592c9a828ac7c0488e7cdb3d6fdaef5e9176ee68d981ea50d386d74df3b40660351736deb03bfceb721878cf9894b0302df15964242ab6b9f77f98ba1c7993735983d2b022600ab74a19e3636e1400d08ba45d3a5c2774cb06a1c358bbfc11d27efaf7ca53c2e7757c8c76da24707d91a4a5244262898d68083ff91c514d9b9b1ebaa0cb0b10254fda1b1e82b9a1a47f117b5b280ddbec1f6732d11117ef1a7a674699df87fe795d1243cb9c4527e364e2b711b6562a87fafc130ce0baf1701686639b05f0c8dc708f008b1e6ab89e8d623bb83f3d54b7bcdbdacd055ac4eccbd36bbe0af0f65a00e3d6dd985ae8851d176976cfb5816d1fc2a63d3546aecaa4e712ca6961d1f181315d553de6b53485faed0dcfcf819a1ba3badffe797377d3d1ddaed8e7a0acc0c3d277762262a139f94de49faca167b11bf04f2104a5ab9a73367a6461f7124c91a2c4229ef98e6ebde9aac283c7d029400d71293f488ba169b62c1e94689cf5b248ed4aea62b88d65bb764cfe27d5231a58486e7381df518f4ed81cb905108c54a5050a94ca0e94da20d3794bc5fab9127dc95b6404b1e27b4e28136fc27806f7be798444c33aca88ffd45b860eba0d5033839f5a092863954604f1952bd61dad23b11643fe14f3ade08116aa2c13eee701ccd13e506bd65a1
060bf69579aea8c8143cd38c0891a3065f251eba0c20ab9c69ddf28e3bd6400cc203bac8de1882239ad4e1b97b0ae2f1abb7bac7c0d8ef82b97ebfb1f5577f06a3a1377b09ada4db87d342f20ab0eca4b9c206042471307511429cb57a578211f92d3647189861cad9145f5eb26ab696abe50a2a6c1b4af61040528aba4e79b586c348a430f5ea61c4be1032fa61d18581f05a07fb8707c8996e0fff1c3eda59b992687fa12483b9327e10224b20d42e8b3fc4670bf070ced602283273d6818acd1f6da567c44d3f5e1377065d43d87d889843ae48e7fa8ba1634815695b8c480ca271e6e833799c70da80fd79acc09b989667a2294de5da73f0363df9a33ad4dab8d27cf7bed0a06838672e3d07d52b6396e9b5576021d5e925abd533bf161c944795065fdd44e8462e3070c479f1c118276653488dd9b2f1a673f8cad3612ca1fab4388ec9c8f834a01a499adb7b3a9a977672f6d75b41bbdd7f91ceb7e7a88568d17bb432be9e4e96e115075bce197ef4754d2914c2c59e2d7f4c08f0dbe34d31f229428f211bf1d7e8f5c319ed4a8273cb6255eb318851ac4557b0278fac63107a54d407c42f300b843a12abd3b893b46c7efac2e388ab42b87aebe2543bd4c15f459bc50aad10ffe1c1196fb52c26e54bdaa7fbd52451f207ffb073ef4b3f71eedd7da40c89505019739e3fa733bcdc84ff4919e8fe2358129ef28291be1d6426b8bafe88463b1d3cd7273745381c7f65221898e6ad361e88b24c54ccc7ac9a830145b6dc096e2d71ef71ec4f03524cb870b724e08d223bdec2f6fdde6200217a13b5136004d455d66547f5a1793e0cad85677d49e5c558852107007c8136812cf021afaf6f7e8f59883371be46cda412dd9c6fcf187c31252ceb5758901d39cd5355ab386d9a7fe6ea46ebf277aaf809c3023211ea9aa189de4d422080ebb9fec50ffab6b95ba4ae5018accc497e79149ed6047ce561ccc10e9194cdccd5c9fb75175c8dbc9d0a916ad59288f010defbbb50d263041ab37aac0f93253bef6f898cd0825d99d27224f26181f9713b8979da64756c95e7505f25a2688960d6155c3613dcc31b6c337a6dbfc6b12cfde1db22b93bbd5e48534fb0bda8b212577a14dcf665c834b0bd24e5f624d2455fe048dbe930328d7cb632db3b0e244bb5d43390b420b15157a339487fc78976f867d3a361aafdd3f50a93c01882da7c220089a544381db22e2c86b228dc2be01820468460437588952a549d37498e529e62aa62bad1580546bcb1e9a6ed1870b7838d05d12f6e3a041e78b1bdb80894626f20889ccb3a468aa4fb24b9c87cbb28623ce59c6b3c6286db366d08004551a25fe4d8d194a2bb7c52e1c85a5fbe4cb15b171489da121bea1c469a6bb185d63213084e3a81ee54dc03a94dc5ecdda7bfaad1df613f51f56627c9d529f13e5c81b5ee4dd228949ca16b9a61d186211d153294470907557e5e14ae665013f285fe4d3766e7b3d8ce5e2a14692072d4d8f79354bcc8db8a2a36c8bcd00", 0x2000, 0x0)
r2 = openat$tun(0xffffffffffffff9c, &(0x7f0000002000), 0x0, 0x0)
r3 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
ioctl$TUNSETIFF(r2, 0x400454ca, &(0x7f0000002040)={'syzkaller1\x00'})
pread64(r2, 0x0, 0x2, 0x0)
r4 = dup2(r3, r2)
ioctl$sock_inet_SIOCSIFADDR(r4, 0x8914, &(0x7f00000000c0)={'syzkaller1\x00', {0x2, 0x0, @initdev}})
r0 = socket$unix(0x1, 0x5, 0x0)
recvmmsg(r0, &(0x7f0000002f00)=[{{0x0, 0x0, 0x0}}, {{0x0, 0x0, &(0x7f0000002d80)=[{&(0x7f0000000940)=""/50, 0x32}], 0x1}}], 0x2, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x54041bc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = getpid()
r2 = gettid()
tkill(r2, 0x14)
ptrace(0x4206, r1)
ptrace(0x4208, r1)
wait4(0xffffffffffffffff, 0x0, 0x80000001, 0x0)
creat(&(0x7f0000000080)='./file0\x00', 0x0)
mount(&(0x7f0000000380)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000040)='proc\x00', 0x0, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000000000)='mounts\x00')
chroot(&(0x7f00000000c0)='./file0\x00')
preadv(r0, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
close(r0)
r1 = socket$inet_tcp(0x2, 0x1, 0x0)
r2 = socket$inet6_tcp(0xa, 0x1, 0x0)
bind$inet6(r2, &(0x7f0000000500)={0xa, 0x2}, 0x1c)
listen(r2, 0x0)
connect$inet(r1, &(0x7f00000001c0)={0x2, 0x2, @empty}, 0x10)
dup3(r1, r2, 0x0)
write$P9_RSETATTR(r0, 0x0, 0x0)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
fcntl$lock(r0, 0x7, &(0x7f0000000040)={0x1})
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
perf_event_open(&(0x7f0000000bc0)={0x2, 0x70, 0xc7}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
ioctl$TCSETS(r0, 0x5401, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, 0x0, "e19e4e78dc2661c00409fa3716707ee4022eee"})
sched_getaffinity(0x0, 0xffffffffffffffd0, &(0x7f0000000000))
rt_sigprocmask(0x0, &(0x7f0000000100)={[0xfffffffffffe]}, 0x0, 0x8)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1000006, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mknodat$loop(r0, &(0x7f0000000140)='./file0\x00', 0x0, 0x1)
r1 = gettid()
rt_sigqueueinfo(r1, 0xa, &(0x7f0000000040))
ppoll(0x0, 0x0, 0x0, &(0x7f0000000540), 0x8)
io_setup(0x0, &(0x7f0000000000)=<r0=>0x0)
io_setup(0x0, &(0x7f0000000140))
io_setup(0x0, &(0x7f0000000040)=<r1=>0x0)
io_getevents(r1, 0x3, 0x3, &(0x7f0000000240)=[{}, {}, {}], 0x0)
io_destroy(r1)
io_destroy(r0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdirat(0xffffffffffffffff, &(0x7f0000000200)='\x13\x13w\xc5\xfc5\xd4\x14T\xd5\xd4\x1d)\xad\x1a`)Y\x81F\xe6\xbe\x16nA\xad\r\xbd@T\x03<\x9f3\xbb\xda\x82$\xa2\xf3\xd7r\xe7cnH\xb3<\xbfp\x83r\xe8\xf1\xb9\x93>\xc5\x12wC\xbe\"\x06 \x9e\xf0-\xf9\xcb\xf2\xf6\xe8\x80\xd38/\x00', 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = getpid()
r1 = getpid()
rt_tgsigqueueinfo(r1, r0, 0x15, &(0x7f00000000c0))
ptrace(0x10, r1)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800013, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ptrace$peeksig(0x420a, r1, 0x0, 0x0)
r0 = socket(0x1, 0x5, 0x0)
getsockopt$inet_tcp_TCP_ZEROCOPY_RECEIVE(r0, 0x6, 0x23, 0x0, &(0x7f00000000c0))
syz_emit_ethernet(0x2a, &(0x7f0000000000)={@multicast, @local, @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x1c, 0x0, 0x0, 0x0, 0x1, 0x0, @remote, @local}, @info_reply}}}}, 0x0)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
setresuid(0x0, 0xee01, 0x0)
setresgid(0x0, 0x0, 0xee01)
r0 = gettid()
tkill(r0, 0x25)
perf_event_open(&(0x7f000025c000)={0x2, 0x70, 0x15}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r1 = openat$tcp_mem(0xffffffffffffff9c, &(0x7f0000000080)='/proc/sys/net/ipv4/tcp_wmem\x00', 0x1, 0x0)
lseek(r1, 0x5e, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
openat$tun(0xffffffffffffff9c, &(0x7f0000000080), 0x50000, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
exit_group(0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
mkdirat(0xffffffffffffff9c, &(0x7f0000000080)='./file0\x00', 0x0)
mkdirat$cgroup_root(0xffffffffffffff9c, &(0x7f0000000000)='./cgroup.cpu/syz1\x00', 0x1ff)
mount$fuse(0x20000000, &(0x7f00000004c0)='./file0\x00', 0x0, 0x7a04, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
futimesat(0xffffffffffffff9c, &(0x7f0000000080)='./file0\x00', &(0x7f0000000100)={{0x0, 0xea60}})
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
timerfd_create(0x3, 0x0)
rt_sigreturn()
symlinkat(&(0x7f0000003f00)='./file0\x00', 0xffffffffffffff9c, &(0x7f0000003f40)='./file0\x00')
mount(&(0x7f0000001300)=ANY=[], &(0x7f00000001c0)='./file0\x00', &(0x7f0000000000)='devtmpfs\x00', 0x0, 0x0)
mount(&(0x7f0000000080)=ANY=[], &(0x7f0000000000)='./file0\x00', &(0x7f00000000c0)='devtmpfs\x00', 0x0, 0x0)
open(&(0x7f0000000600)='./file0/../file0\x00', 0x0, 0x0)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = openat(0xffffffffffffff9c, &(0x7f0000000100)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r1, &(0x7f0000d06ff8)='./file0\x00')
r2 = openat(0xffffffffffffff9c, &(0x7f0000000200)='./file0\x00', 0x0, 0x0)
linkat(r1, &(0x7f0000000140)='./file0\x00', r2, &(0x7f0000000240)='.\x00', 0x0)
rt_tgsigqueueinfo(r0, r0, 0x3e, &(0x7f0000000140))
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0xb635773f06ebbeee, 0x8031, 0xffffffffffffffff, 0x0)
r2 = getpid()
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000080)={<r3=>0xffffffffffffffff, <r4=>0xffffffffffffffff})
ioctl$int_in(r3, 0x5452, &(0x7f0000000100)=0x3)
fcntl$setsig(r3, 0xa, 0x12)
r5 = getpgid(0x0)
fcntl$setownex(r3, 0xf, &(0x7f0000000180)={0x0, r5})
recvmsg(r4, &(0x7f000095cfc8)={0x0, 0x0, 0x0}, 0x0)
dup2(r3, r4)
tkill(r2, 0x15)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
timer_getoverrun(0x0)
r1 = gettid()
tkill(r1, 0x37)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = inotify_init1(0x0)
mknodat(r0, &(0x7f0000000080)='.\x00', 0x0, 0x0)
exit(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000300)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r1 = fork()
ptrace(0x10, r1)
clone(0xa912d700, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rt_sigreturn()
ptrace$pokeuser(0x6, r1, 0xffff, 0x0)
r2 = getpid()
tkill(r2, 0x22)
mkdir(&(0x7f00000001c0)='./file0\x00', 0x0)
mount(0x0, &(0x7f0000000000)='./file0\x00', &(0x7f0000000140)='cpuset\x00', 0x0, 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f00000001c0)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x180000f, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f00000001c0)='./file0\x00', 0x0, 0x0)
fsetxattr$trusted_overlay_nlink(r1, &(0x7f0000000180), 0x0, 0x0, 0x0)
clone(0x40006300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = gettid()
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
creat(&(0x7f0000000000)='./file0/file1\x00', 0x0)
rename(&(0x7f0000000240)='./file0/file1\x00', &(0x7f0000000280)='./file0\x00')
tgkill(r0, r1, 0x24)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r1 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000080), 0x80000000008a01, 0x0)
ioctl$KDFONTOP_SET_DEF(0xffffffffffffffff, 0x4b72, &(0x7f0000000040)={0x2, 0x0, 0x0, 0x0, 0x0, &(0x7f0000000140)="8da49567bd6994381e4572e4512c2aa3f40a5142465134daae69738c2c4a48bfa040df495e5b45f45feac7f4cbfc6bc0beec368e04b081a442b8d3f19230e96bb0244e5f2061238b2599e3663ac2ec14fbaa629672e20497d10f12934f5dba72d219b31a1780d56b42acfd57247fa857610f1fa71a4a081c0b7e69fc764fa335c8ab6674bc3468776ef6dfeaed7208e237b46f32dc93f659d2392ec35746a71d686c67ed87332197e1b97ef15b889526af3e1fc4eaed969623c2c5f675d9a58a0164334f98230f8bd7ff3a90c27e68063a6789f85e6ed5d4964d7c4d229160187b2def353ad99c6945cc0c802507ad771695f022998f710c01cb78941e3b0fb3996adcc401c8f64617f4853d7ca470bd05ee8073764dde15ffcd19bb6c9d8133f9b3e37ab3a7dbe0cc27dec34effe1ef312ef188e3a0796a0ae5aa790a20659d2063a3a0c35da238c2012ec99faabe9c2254f55a2776524d80bf315cf19bcc2d1b714c343796ca111ee34b860bcf8a3d4b827f4cb334e4f14744bc2e392023512c50d08f8368665b2ddbbbb727a924b0c99c3a4957fc583df1425867b1bea30d5495eadb56e4a9b5773511a17826a01e512bb2cf3bf2dffb15082ead2889ed316253ad9f6a4c9ac169894cdae52ecc6608f30ab22bc03a7111cdb4f70558649419296d4018a7f7217f54936217610b651b7c9b1203232e2725be92c357edf97f82620d7af1393753fc975c41392cc4c52d454c4044f77fdaae30732eb26374bdd4e4d8a8e5e1f4a444eeb537080e5fa68dc20c28445b0d91e4e7b7b4a958c6a0157ee1404fe7c18532c78aee1c3362f5816ff1f8f9a433bb3b2c87530654c04b41b7bbb3c4bbaf7192f2a482891b3ec1376db392866c71797a926f78d1548f2a7e2bccdea8f82844dc0e38872aa5e1e23d62ded440040ca91400cc4de81015d5d517fc5bab7a877ed7d5a893299dd29fb7b72b5cbe5b7c7bc589c961fe339872380446ca50115d22a7135f522dd1e04f4082fac8ea7269a53fd285c329886395116e68c7e79437a867f1b7a1690d3b9c48885b49d30e00ca5bad17eaa8be7e74d28369224525ba0fe1a4afed4ced14a8bea126f76788cf5b15c28dd21adeb6c5296194e464548357dae9c136ac1b6aaf245ddd3716bab68bf1c13cd8c96926dad37df84601e1c4535df068ddbc56cfdd3505a07acfc1f4c351aa5ba92a2ce6fc605d34f2c0efeb4bb0c16a04f87612c639c12b3200a2d1ae3fabad11200b4f38741be7673a624c3da15956be77b243c1596fb3af7f22633329868842a790430145e103a232e23cceaa42c96cf8a6ad3729ca7224d2fec7073e08329a2ef6cbee1003ec63f3886c9a5e110fd5318970e4f5b751f9fe3403a5676341987351220ca011117e3051ce9681fa8aad6c4e382d314bb8bfc440d229eac05869600d1b8f"})
write$binfmt_aout(r1, &(0x7f00000000c0)=ANY=[], 0xffffff78)
ioctl$TCSETS(r1, 0x5402, &(0x7f0000000100)={0xfffffff8, 0x0, 0x0, 0x0, 0x0, "4e51b4639791a72f7b56ae4cce34c00fc30138"})
mkdir(&(0x7f0000000000)='./file0\x00', 0x0)
symlink(&(0x7f0000000080)='./file2\x00', &(0x7f00000000c0)='./file0/file0\x00')
link(&(0x7f0000000100)='./file0/file0\x00', 0x0)
rmdir(&(0x7f0000000040)='./file0/../file0\x00')
utimensat(0xffffffffffffff9c, &(0x7f0000000000)='.\x00', &(0x7f0000000140)={{}, {0x0, 0x3ffffffe}}, 0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000800)={<r0=>0xffffffffffffffff})
setsockopt$sock_int(r0, 0x1, 0x7, &(0x7f0000000000), 0x4)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_mreqn(r0, 0x0, 0x20, 0x0, 0x0)
exit(0x0)
clone(0x2006d380, 0x0, 0x0, 0x0, 0x0)
sendmsg$sock(0xffffffffffffffff, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x0, &(0x7f0000000000)=[@timestamping={{0x14, 0x1, 0x25, 0x2}}], 0x18}, 0x0)
r0 = openat$tun(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$TUNSETIFF(r0, 0x400454ca, &(0x7f0000000000)={'lo\x00'})
rt_sigreturn()
r0 = socket$inet_udp(0x2, 0x2, 0x0)
setsockopt$inet_mreq(r0, 0x0, 0x20, &(0x7f0000000040)={@local, @empty}, 0x8)
r0 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
getsockopt$IP6T_SO_GET_ENTRIES(r0, 0x29, 0x41, &(0x7f0000000040)={'raw\x00', 0x4, "6d1987cb"}, &(0x7f00000000c0)=0x28)
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
connect(r0, &(0x7f00000000c0)=@in={0x2, 0x0, @empty}, 0x80)
getpeername(r0, 0x0, &(0x7f0000000080))
r0 = socket$inet6(0xa, 0x2, 0x0)
perf_event_open(&(0x7f000025c000)={0x2, 0x70, 0x15, 0x0, 0x0, 0x0, 0x0, 0x1}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000000)={<r1=>0xffffffffffffffff})
r2 = dup(r1)
ioctl$PERF_EVENT_IOC_ENABLE(r2, 0x8912, 0x400200)
setsockopt$inet6_opts(r0, 0x29, 0x19, &(0x7f00000013c0)=@routing, 0x8)
r0 = socket$netlink(0x10, 0x3, 0x0)
bind$netlink(r0, &(0x7f0000000000)={0x10, 0x0, 0x25dfdbfc}, 0xc)
bind$netlink(r0, &(0x7f0000000040)={0x10, 0x0, 0x25dfdbfc}, 0xc)
setrlimit(0x7, &(0x7f00000000c0))
epoll_create1(0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = timerfd_create(0x0, 0x0)
timerfd_settime(r1, 0x1, &(0x7f0000000180)={{}, {0x0, 0x3938700}}, 0x0)
timerfd_settime(r1, 0x0, &(0x7f0000000200)={{0x77359400}, {0x0, 0x989680}}, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000000040)='task\x00')
getdents(r0, &(0x7f00000000c0)=""/48, 0x30)
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r0 = fork()
ptrace(0x10, r0)
ptrace$setregs(0xd, r0, 0x0, &(0x7f0000000000))
exit_group(0x0)
clone(0x0, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
getpid()
getpid()
rt_tgsigqueueinfo(0x0, 0x0, 0x0, 0x0)
r0 = getpid()
waitid(0x2, r0, 0x0, 0x60000003, 0x0)
r0 = open$dir(&(0x7f0000000200)='.\x00', 0x0, 0x0)
openat$dir(0xffffffffffffff9c, &(0x7f0000000100)='./file0\x00', 0x40, 0x0)
pipe2(&(0x7f0000000000)={0xffffffffffffffff, <r1=>0xffffffffffffffff}, 0x0)
linkat(r0, &(0x7f0000000000)='./file0\x00', r1, &(0x7f0000000040)='./file0/file0\x00', 0x0)
clone(0x2006d380, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
getsockopt$inet_pktinfo(r0, 0x0, 0x8, 0x0, &(0x7f0000000040))
rt_sigreturn()
clone(0x6900, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0x0, &(0x7f0000000040)='environ\x00')
lseek(r0, 0xfffffffffffffffc, 0x0)
setrlimit(0x0, &(0x7f0000000080))
r0 = syz_open_procfs$namespace(0xffffffffffffffff, &(0x7f0000000000)='ns/pid\x00')
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
fsetxattr$trusted_overlay_origin(r0, &(0x7f0000001800), 0x0, 0x0, 0x0)
rt_sigreturn()
clone(0x2000411cf7c, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mprotect(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x0)
times(&(0x7f00000013c0))
exit(0x0)
r0 = socket$inet(0x2, 0x1, 0x0)
r1 = dup(r0)
bind$inet(r0, &(0x7f0000000040)={0x2, 0x4e23, @broadcast}, 0x10)
connect$inet(r0, &(0x7f0000000780)={0x2, 0x4e23}, 0x10)
write$cgroup_type(r1, &(0x7f0000000000), 0x9)
recvmmsg(r1, &(0x7f00000024c0)=[{{0x0, 0x0, &(0x7f0000000980)=[{&(0x7f0000000140)=""/167, 0xa7}], 0x1}}, {{0x0, 0x0, 0x0}}], 0x2, 0x140, 0x0)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x210000000013, &(0x7f00000000c0)=0x100000001, 0x4)
bind$inet(r0, &(0x7f0000000080)={0x2, 0x4e21, @local}, 0x10)
setsockopt$inet_tcp_TCP_REPAIR_QUEUE(r0, 0x6, 0x14, &(0x7f0000000140)=0x2, 0x4)
connect$inet(r0, &(0x7f0000000180)={0x2, 0x4e21, @local}, 0x10)
writev(r0, &(0x7f0000000240)=[{&(0x7f0000001c40)='Q', 0x1}, {&(0x7f0000001c80)="ffd6ec4fe03d617d31bbc507dd987afbdc601aafa375501579d1ac6e20683db35af44d973d615108d9eb4b2874a437b41de79a93e88e3ffaea8916a965f3ba15dea70b0ef6ba1ccb5a48df6dfd694f4999ebb8f07ccf62bcab19f898ad89aafacfa7d0efc7e9afba5c797726f5c7b2245cb97b176cf5e6b5ca02e8e5301a92a14b722860c2a9a1ba3906941ad104ee55811e8387bb6a9de61a06ad510904e3257efc02c4b03778f055d5b510113ece227b4a030165d86ca3e7a7c01d844c9bac53f8466424a21d147b83a3d2835516f3dd70", 0xd2}, {&(0x7f0000001d80)="c5ddd4117c1b10eff3d7db2717b4e4a48ae3d3d8af6560225b1209d4b9599e40bd138f0224cd4adb3159cbeaa72db0743dad3064ece9da5f1e823c400be22006216f9a0c382ca12f062b409808efc98c05d231608972c8d5c90d9ca236342f155c2d8b996274de70086cf6ae884271efa5a246c423ceae1cbdaa0c96a55c18614d84a4043852da52444077749492", 0x8e}, {&(0x7f0000001e40)="69d18e7f061a3fed97fdd599175efa11d44ffeb51033b5ac696759d3ba21b431b9961caf818f7a9fdfc5abf2e03b92e6e49e31af3bba1a788230da547641b440590fe92185c183d66040b8674d9f0749f67965474bfd676d549f7e09b80565c5622e8b05523bf18bfe54b55d3c9038b2496e9ee03660cfa88329a2906dcbeab802e9ef98ffab3936650f46b6a55d7bc0e787c24e5746359f97111946f647393a9fbf6d829248a9a92866c3bab1dcd65e20374537c1f6ed91", 0xb8}], 0x4)
sendto$inet(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0, 0x0)
setsockopt$inet_tcp_TCP_REPAIR_OPTIONS(r0, 0x6, 0x16, &(0x7f0000000300)=[@window, @window, @window={0x3, 0x200}, @mss, @window, @mss={0x2, 0x53be}, @timestamp, @sack_perm], 0x8)
setsockopt$inet_tcp_TCP_REPAIR(r0, 0x6, 0x13, &(0x7f0000000200), 0x88)
sendto$inet(r0, &(0x7f00000004c0)="34e2de4d8d957a8de4e490b6cd20b988d4edef164bd3377aa381b5f50b7ca40a516489f78cd7208982e9bde22b2b7c1c7606d565477f3db9d2b077283644c0f27ab52a863a42863e06944e40a0b3c5d21c8cbe052e7f726263f28aef1bc12a069063d4c30e8f329fdb36859be727fbef4314161e5fb5f01ae00a2634d5cdecca2089c62e32f4c919886b2b88d237e287318739bec0364caf15889f38a312ef6621c0f21709a4bf2b16274cf933f6ad8fcc9c2024bc1b4713f650e860f93ae93b2361956b3e80c38c5fd29b5c1b5d7ce67edc856a8dc0ba54cee53de9a48c131389426bd06ec7c695add357934fc0321f0d3d7982e4fe5a0039decc491a663afd02facb08dd9695f854c7b031d9af8bd7350897996b5208b23030cc0feb84570730eaf24b9f2ac05d0feb3be07a29f887095f36f3c8f0e77e45509acd14a5be4a1572dd4cd1231087b830fa03e071571d4abd694710ef140469cf6df8a59839aafe046a5bffb97e5247be901789eafd726ba090337a2c49207e6b900c7e982472e6aac70e5d52ca2c1bab47b1f6d00f9601e2281686c21f770ae96e0ffec4b30496d012fa00958f794cdbd721bd155cae87", 0x109e8, 0x805, 0x0, 0x6)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
mount$9p_rdma(&(0x7f0000000040), &(0x7f0000000080)='./file0\x00', &(0x7f0000000100), 0x80011, 0x0)
rt_sigreturn()
io_setup(0x0, &(0x7f0000000140)=<r0=>0x0)
io_setup(0x0, &(0x7f0000000180))
io_destroy(r0)
io_setup(0x0, &(0x7f0000000000))
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f0000000040)={0x2, &(0x7f0000000140)=[{0xfdff}, {0x6}]})
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
sendto$inet6(r0, &(0x7f0000000000)="8000000000000000", 0x8, 0x0, &(0x7f0000000140)={0xa, 0x0, 0x0, @local}, 0x1c)
sendto$inet6(r0, &(0x7f0000000180)="800000007e76d214", 0x8, 0x0, &(0x7f0000000140)={0xa, 0x0, 0x0, @loopback}, 0x1c)
recvfrom(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
syz_emit_ethernet(0x36, &(0x7f0000000100)={@link_local, @multicast, @void, {@ipv4={0x800, @tcp={{0x5, 0x4, 0x0, 0x0, 0x28, 0x0, 0x300, 0x0, 0x6, 0x0, @dev, @local}, {{0x0, 0x0, 0x41424344, 0x41424344, 0x0, 0x0, 0x5}}}}}}, 0x0)
mkdir(&(0x7f0000000280)='./file0\x00', 0x0)
mount(0x0, &(0x7f0000000080)='./file0\x00', &(0x7f0000000940)='tmpfs\x00', 0x0, 0x0)
setxattr$system_posix_acl(&(0x7f0000000040)='./file0\x00', &(0x7f0000000140)='system.posix_acl_default\x00', &(0x7f0000000440), 0x24, 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
chdir(&(0x7f0000000300)='./file0\x00')
mkdir(&(0x7f00000001c0)='./file0\x00', 0x0)
mkdir(&(0x7f00000009c0)='./file1\x00', 0x0)
mount$overlay(0x40000d, &(0x7f0000000000)='./file0\x00', &(0x7f00000000c0), 0x0, &(0x7f0000000100)={[{@upperdir={'upperdir', 0x3d, './file0'}}, {@lowerdir={'lowerdir', 0x3d, './file0'}}, {@workdir={'workdir', 0x3d, './file1'}, 0x5c}], [], 0xf603000000000000})
getcwd(0xfffffffffffffffe, 0x8f11daa2dba34ceb)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
r1 = dup(r0)
getsockopt$inet6_IPV6_IPSEC_POLICY(r1, 0x29, 0x22, 0x0, &(0x7f00000002c0))
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
io_submit(0x0, 0xfffffffffffffdc6, 0x0)
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = socket$inet6(0xa, 0x800000000000002, 0x0)
connect$inet6(r1, &(0x7f0000000000)={0xa, 0x0, 0x0, @ipv4={'\x00', '\xff\xff', @local}}, 0x1c)
sendmmsg$inet6(r1, &(0x7f000000bd40)=[{{&(0x7f0000003a80)={0xa, 0x4e23, 0x0, @ipv4={'\x00', '\xff\xff', @multicast2}}, 0x1c, 0x0}}], 0x1, 0x8004)
sendmmsg(r1, &(0x7f0000001a80)=[{{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xffffffe0}}, {{&(0x7f0000000040)=@in={0x2, 0x0, @local}, 0x80, 0x0, 0x0, &(0x7f0000002240)=ANY=[], 0x90}}], 0x2, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
setitimer(0x1, 0x0, &(0x7f00000004c0))
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ioctl$TIOCSETD(r0, 0x5423, &(0x7f0000000080)=0x3)
ioctl$TIOCVHANGUP(r0, 0x541b, 0xc04a01)
r0 = fork()
ptrace$setsig(0x4203, 0x0, 0x0, &(0x7f0000000080))
waitid(0x1, r0, &(0x7f0000000180), 0x3, &(0x7f0000000000))
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$unix(0x1, 0x2, 0x0)
ioctl$sock_TIOCOUTQ(r0, 0x5411, 0x0)
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0xa912d700, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = openat$full(0xffffffffffffff9c, &(0x7f00000000c0), 0x41, 0x0)
pwritev(r1, 0x0, 0x0, 0x0, 0x0)
exit(0x0)
pipe2(&(0x7f0000000000)={<r0=>0xffffffffffffffff}, 0x0)
io_setup(0x9, &(0x7f0000000100)=<r1=>0x0)
r2 = socket$unix(0x1, 0x2, 0x0)
io_submit(r1, 0x2, &(0x7f00000014c0)=[&(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, 0x0, r0, 0x0}, &(0x7f0000000300)={0x0, 0x0, 0x0, 0x0, 0x0, r2, 0x0, 0x0, 0xfffffffffffffff8}])
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mknod(&(0x7f0000000000)='./file1\x00', 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
mkdir(&(0x7f00000007c0)='./file0\x00', 0x0)
rename(&(0x7f0000000340)='./file0\x00', &(0x7f0000000380)='./file1\x00')
r1 = gettid()
tkill(r1, 0x18)
syz_open_procfs(0x0, 0x0)
r0 = syz_open_procfs(0x0, &(0x7f0000000040)='net/dev\x00')
read$FUSE(r0, 0x0, 0x0)
mkdir(&(0x7f0000000180)='./bus\x00', 0x0)
chdir(&(0x7f00000002c0)='./bus\x00')
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
capset(&(0x7f00000000c0)={0x20071026}, &(0x7f0000000100))
open(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
setsockopt$inet6_IPV6_ADDRFORM(r0, 0x6, 0x1, 0x0, 0x0)
r0 = openat$tun(0xffffffffffffff9c, &(0x7f00000000c0), 0x208200, 0x0)
ioctl$TUNSETOWNER(r0, 0x400454cc, 0xee00)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
bind$inet6(r1, &(0x7f0000000500)={0xa, 0x2, 0x0, @empty}, 0x1c)
listen(r1, 0x0)
r2 = accept$inet(r1, 0x0, 0x0)
connect$inet(r0, &(0x7f00000001c0)={0x2, 0x2, @local}, 0x10)
setsockopt$inet_mreq(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
getsockopt$inet_tcp_buf(r2, 0x6, 0xb, 0x0, &(0x7f0000000080))
r0 = socket$inet_udp(0x2, 0x2, 0x0)
connect$inet(r0, &(0x7f0000000040)={0x2, 0x0, @empty}, 0x10)
setsockopt$inet_int(r0, 0x0, 0xb, &(0x7f00000000c0)=0x2, 0x4)
write$binfmt_elf32(r0, &(0x7f0000000880)=ANY=[], 0x483)
sendto$inet(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
write$binfmt_misc(r0, 0x0, 0x0)
recvmmsg(r0, &(0x7f0000003f80)=[{{0x0, 0x0, 0x0}}], 0x1, 0x2000, 0x0)
r0 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = dup(r0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r2, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ioctl$TIOCGWINSZ(r1, 0x5413, &(0x7f0000000000))
rt_sigaction(0x12, 0x0, 0xfffffffffffffffd, 0x8, &(0x7f0000000100))
prlimit64(0x0, 0x0, 0x0, 0x0)
getsockopt$sock_cred(0xffffffffffffffff, 0x1, 0x11, 0x0, 0x0)
mmap(&(0x7f00009fd000/0x600000)=nil, 0x600000, 0x2, 0x6031, 0xffffffffffffffff, 0x0)
syz_open_dev$tty20(0xc, 0x4, 0x0)
mremap(&(0x7f0000a01000/0x4000)=nil, 0x4000, 0x800000, 0x2, &(0x7f0000130000/0x800000)=nil)
ioctl$TIOCSLCKTRMIOS(0xffffffffffffffff, 0x5457, 0x0)
mremap(&(0x7f0000f08000/0x3000)=nil, 0x3000, 0x4000, 0x3, &(0x7f0000ff0000/0x4000)=nil)
munmap(&(0x7f0000851000/0x6000)=nil, 0x6000)
munlockall()
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r0 = fork()
ptrace(0x10, r0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000a, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
setitimer(0x0, 0x0, 0x0)
ptrace$cont(0x20, r0, 0x0, 0x0)
exit_group(0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
tee(r0, r0, 0xfff, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
fallocate(r1, 0x0, 0x102000006, 0x6)
syz_emit_ethernet(0x72, &(0x7f0000000080)={@multicast, @random="c2c53fedbfca", @void, {@ipv4={0x800, @udp={{0x7, 0x4, 0x0, 0x0, 0x64, 0x0, 0x0, 0x0, 0x11, 0x0, @rand_addr, @broadcast, {[@timestamp_prespec={0x44, 0x4, 0xb0}, @end]}}, {0x0, 0x0, 0x48, 0x0, @wg=@cookie={0x3, 0x0, "e7e943e9481b0e995019028fdcc541b2c70d0e3dea45fedf", "ded66c725441547c44fccf81effe968208eb17b211f9ff5cee002677090d23a6"}}}}}}, 0x0)
clone(0x106300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$tcp_mem(0xffffff9c, &(0x7f0000000000)='/proc/sys/net/ipv4/tcp_rmem\x00', 0x1, 0x0)
write$tcp_mem(r0, &(0x7f00000000c0), 0x11)
prlimit64(0x0, 0x0, &(0x7f0000000140), 0x0)
times(0x0)
times(&(0x7f0000000080))
mkdirat$cgroup_root(0xffffffffffffff9c, &(0x7f0000000040)='./cgroup.net/syz0\x00', 0x1ff)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = syz_open_procfs$userns(0xffffffffffffffff, &(0x7f0000000080))
fchmod(r1, 0x0)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
getsockopt$inet_int(r0, 0x0, 0x2, &(0x7f00000000c0), &(0x7f0000000100)=0x4)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
signalfd4(0xffffffffffffffff, &(0x7f0000000040), 0x8, 0x81000)
rt_sigreturn()
clone(0x5100, 0x0, 0x0, 0x0, 0x0)
openat$full(0xffffffffffffff9c, &(0x7f0000000240), 0x21c0, 0x0)
rt_sigreturn()
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
r1 = dup(r0)
getsockopt$inet6_int(r1, 0x29, 0x43, &(0x7f0000000000), &(0x7f00000000c0)=0x4)
exit_group(0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$nl_route(0x10, 0x3, 0x0)
setsockopt$sock_int(r0, 0x1, 0x8, 0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000080), 0x0)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = syz_open_procfs(0xffffffffffffffff, &(0x7f00000000c0)='uid_map\x00')
ppoll(0x0, 0x0, 0x0, 0x0, 0x0)
pread64(r0, 0x0, 0x0, 0x5e5)
r0 = socket$inet(0x2, 0x3, 0xff)
sendto$inet(r0, 0x0, 0x0, 0x0, &(0x7f0000000100)={0x2, 0x0, @local}, 0x10)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat$fuse(0xffffffffffffff9c, &(0x7f0000000280), 0x2, 0x0)
lseek(r0, 0x0, 0x0)
r1 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r2, 0x0)
preadv(r1, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
rt_sigreturn()
openat(0xffffffffffffffff, &(0x7f0000000000)='/', 0x1, 0x400)
perf_event_open(&(0x7f0000000100)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3c43, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={0x0}}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
socket$unix(0x1, 0x2, 0x0)
r0 = getpid()
clone(0x20026045dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
sched_setscheduler(r0, 0x0, &(0x7f0000000240)=0x5)
exit_group(0x0)
syz_emit_ethernet(0xfdef, &(0x7f0000000a80)={@broadcast, @multicast, @void, {@ipv6={0x86dd, @icmpv6={0x0, 0x6, "65db07", 0x28, 0x29, 0x0, @remote, @mcast2, {[], @ndisc_redir={0x89, 0x0, 0x0, '\x00', @empty, @mcast2}}}}}}, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = memfd_create(&(0x7f0000000200)='H)\xd4\x98#\'%nody\xed\xb0/\xa5\x7f\xfb\xd0ql\x86\xcd\xf6\x14\x93\xb0\x7f\x0eK@.\xc8\xa5\xb31\x10\x0f/;7\xce\xc7\xe3)L\x83\x1c\x06\xb7+&\x88i/\xdb6\xd6\xe26\xdd\xbd\xf9\x0e\xc1*\xbf\xe8,\xc3\xcb\xac\xfeq\x91\x17%M\xf6\x1d\xc6\xa7\xaf\xa0\xb0\xfc\xff\x13u\x98\xd7\xf5\x81\x12\xf4d\xc5\x94A\x03\xa8g\x18\xf5\xa5\x84\xb33H.\xce\xcd|\xf9\x86\xb7s\xf4\xb3)~\x83\xd6\xd7\x03\xcdz\xa6\x9b\x176\xb9\x90\xe3\xfb', 0x0)
fallocate(r0, 0x26, 0x0, 0x0)
rt_sigreturn()
r0 = socket$inet6(0xa, 0x802, 0x0)
r1 = openat(0xffffffffffffffff, &(0x7f0000000080)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
r2 = dup3(r0, r1, 0x0)
connect$inet6(r2, &(0x7f00000000c0)={0xa, 0x0, 0x0, @loopback}, 0x1c)
r3 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r3, &(0x7f0000000280), 0x1, 0x0, 0x0)
sendto$inet6(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
getgroups(0x1, &(0x7f0000000080)=[<r0=>0xffffffffffffffff])
setgid(r0)
pipe(&(0x7f0000000d00)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
fstat(r0, &(0x7f0000000440))
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f00000002c0)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x9, 0x0)
chdir(&(0x7f0000000200)='./file0\x00')
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
unlink(&(0x7f00000000c0)='./bus\x00')
preadv(r1, &(0x7f0000000280), 0x1, 0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = openat$null(0xffffffffffffff9c, &(0x7f0000000100), 0x22ad81, 0x0)
fsetxattr$security_evm(r1, &(0x7f0000000000), 0x0, 0x0, 0x0)
rt_sigqueueinfo(r0, 0x24, &(0x7f0000000040))
r0 = socket$unix(0x1, 0x2, 0x0)
shutdown(r0, 0x0)
recvmsg(r0, &(0x7f0000001740)={&(0x7f0000000000)=@in6, 0xc, 0x0, 0x0, &(0x7f0000001680)=""/187, 0xbb}, 0x0)
syz_emit_ethernet(0x4e, &(0x7f0000000140)={@dev, @local, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "1e32b3", 0x18, 0x0, 0x0, @rand_addr=' \x01\x00', @local, {[@dstopts={0x2b}, @fragment, @fragment]}}}}}, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f0000000040)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x0, 0x0)
rename(&(0x7f00000000c0)='./file0/file0\x00', &(0x7f00000001c0)='./file0\x00')
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit(0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
pselect6(0x40, &(0x7f0000000080), 0xffffffffffffffff, 0x0, &(0x7f0000000100), 0x0)
syz_emit_ethernet(0x4a, &(0x7f0000000000)={@local, @empty, @void, {@ipv4={0x800, @tcp={{0xa, 0x4, 0x0, 0x0, 0x3c, 0x0, 0x0, 0x0, 0x6, 0x0, @remote, @local, {[@timestamp_prespec={0x44, 0x14, 0x5, 0x3, 0x0, [{@local}, {@loopback}]}]}}, {{0x0, 0x0, 0x41424344, 0x41424344, 0x0, 0x6, 0x5}}}}}}, 0x0)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f00000000c0)={0x2, &(0x7f0000000000)=[{0x20}, {0x6, 0x0, 0x0, 0x7ffffff7}]})
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mlockall(0x1)
mknod$loop(&(0x7f0000000740)='./file0\x00', 0x0, 0x0)
newfstatat(0xffffffffffffff9c, &(0x7f00000001c0)='./file0\x00', &(0x7f0000000240)={0x0, 0x0, 0x0, 0x0, <r0=>0x0}, 0x0)
setreuid(0x0, r0)
mremap(&(0x7f000067c000/0x2000)=nil, 0x2000, 0x3000, 0x0, &(0x7f00005bb000/0x3000)=nil)
r1 = gettid()
tkill(r1, 0x18)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
munmap(&(0x7f0000ffb000/0x4000)=nil, 0x4000)
mremap(&(0x7f0000ffa000/0x4000)=nil, 0x4000, 0x2000, 0x3, &(0x7f0000ffe000/0x2000)=nil)
rt_sigreturn()
clone(0x4000c300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = signalfd(0xffffffffffffffff, &(0x7f0000000100), 0x8)
r1 = signalfd(0xffffffffffffffff, &(0x7f0000000100), 0x8)
renameat2(r0, &(0x7f0000000000)='./file0\x00', r1, &(0x7f0000000040)='./file0\x00', 0x0)
r2 = gettid()
tgkill(r2, r2, 0x24)
mlockall(0x3)
setuid(0xee01)
io_setup(0x8, &(0x7f0000000040))
mmap(&(0x7f0000000000/0xa000)=nil, 0xa000, 0x0, 0x2172, 0xffffffffffffffff, 0x0)
mremap(&(0x7f0000005000/0x4000)=nil, 0x4000, 0x1000, 0x0, &(0x7f0000003000/0x1000)=nil)
mremap(&(0x7f0000005000/0x1000)=nil, 0x1000, 0x3000, 0x0, &(0x7f0000ffb000/0x3000)=nil)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
prlimit64(0xffffffffffffffff, 0x6499a19dd6f15892, 0x0, 0x0)
prlimit64(0x0, 0x0, &(0x7f0000000080), 0x0)
syz_emit_ethernet(0x4e, &(0x7f0000000080)={@local, @dev, @void, {@ipv6={0x86dd, @generic={0x0, 0x6, '\x00', 0x18, 0x3c, 0x0, @dev, @local, {[@dstopts={0x0, 0x1, '\x00', [@jumbo, @jumbo]}]}}}}}, 0x0)
munmap(&(0x7f0000ffd000/0x3000)=nil, 0x3000)
get_mempolicy(0x0, 0x0, 0x0, &(0x7f0000ffd000/0x2000)=nil, 0x3)
clone(0x4300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = socket$inet6_icmp_raw(0xa, 0x3, 0x3a)
sendmmsg(r1, &(0x7f0000000b40)=[{{0x0, 0x0, 0x0, 0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="c8000000000000000100000002"], 0xc8}}], 0x1, 0x0)
tkill(r0, 0x18)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0xe0ae1, 0x0)
r0 = gettid()
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
setsockopt$sock_timeval(r1, 0x1, 0x14, &(0x7f0000000100)={0x77359400}, 0x10)
listen(r1, 0x0)
timer_create(0x0, &(0x7f0000066000)={0x0, 0x12}, &(0x7f00009b1ffc))
r2 = dup2(r1, r1)
accept4$packet(r2, 0x0, 0x0, 0x0)
timer_settime(0x0, 0x0, &(0x7f0000000040)={{0x0, 0x989680}, {0x0, 0x9}}, 0x0)
socketpair$unix(0x1, 0x5, 0x0, &(0x7f0000000040)={0xffffffffffffffff, <r3=>0xffffffffffffffff})
r4 = dup(r3)
ioctl$PERF_EVENT_IOC_ENABLE(r4, 0x8912, 0x400200)
tkill(r0, 0x1000000000014)
prctl$PR_SET_SECCOMP(0x16, 0x2, &(0x7f00000001c0)={0x3, &(0x7f0000000140)=[{0x1, 0x0, 0x0, 0x4007}, {0x3c}, {0x200000000006, 0x0, 0x0, 0xfffffff8}]})
socket$inet_icmp_raw(0x2, 0x3, 0x1)
r0 = socket$nl_route(0x10, 0x3, 0x0)
bind$netlink(r0, &(0x7f0000000000), 0xc)
r1 = socket$nl_route(0x10, 0x3, 0x0)
bind$netlink(r1, &(0x7f0000000040), 0xc)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r1 = socket$netlink(0x10, 0x3, 0x0)
fchown(r1, 0x0, 0x0)
clone(0x844640, &(0x7f0000000040), 0x0, 0x0, 0x0)
r0 = getpid()
sched_setaffinity(r0, 0x8, &(0x7f0000000000)=0x9)
r1 = creat(&(0x7f00000000c0)='./file0\x00', 0x0)
write$cgroup_type(r1, &(0x7f0000000180), 0x2d1ee37)
r2 = socket$unix(0x1, 0x4000000001, 0x0)
r3 = socket$unix(0x1, 0x1, 0x0)
bind$unix(r3, &(0x7f0000003000)=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0xc)
listen(r3, 0x0)
r4 = socket$unix(0x1, 0x1, 0x0)
connect(r4, &(0x7f0000931ff4)=@un=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0xc)
connect(r2, &(0x7f0000987ff4)=@un=@file={0x1, '\xe9\x1fq\x89Y\x1e\x923aK\x00'}, 0xc)
listen(r3, 0x5)
r0 = socket$inet_icmp(0x2, 0x2, 0x1)
connect(r0, &(0x7f0000000080)=@l2tp={0x2, 0x0, @empty}, 0x80)
shutdown(r0, 0x0)
recvfrom(r0, 0x0, 0x0, 0x40, 0x0, 0x0)
r0 = socket$inet_udp(0x2, 0x2, 0x0)
bind(r0, &(0x7f00000000c0)=@un=@abs, 0x80)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
setitimer(0x8, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
setrlimit(0x7, &(0x7f0000000100))
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800002, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
signalfd4(0xffffffffffffffff, &(0x7f0000000180), 0x8, 0x0)
r0 = creat(&(0x7f0000000080)='./bus\x00', 0x0)
r1 = creat(&(0x7f0000000140)='./bus\x00', 0x0)
fcntl$lock(r1, 0x7, &(0x7f0000000000)={0x1})
fcntl$lock(r0, 0x5, &(0x7f0000000040))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x100000000000008d, 0x4, 0x0)
capset(&(0x7f0000000540)={0x20071026}, &(0x7f0000000040))
capset(&(0x7f0000000080)={0x20080522}, &(0x7f00000000c0)={0x0, 0x0, 0x0, 0x0, 0xffff})
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r1 = fork()
ptrace(0x10, r1)
clone(0xa912d700, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rt_sigreturn()
r2 = getpid()
ptrace$setsig(0x4203, r1, 0x0, 0x0)
rt_sigqueueinfo(r2, 0x39, &(0x7f0000000000))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r1 = socket$inet6(0xa, 0x2, 0x0)
bind$inet6(r1, &(0x7f0000000000)={0xa, 0x14e24}, 0x1c)
recvfrom$inet6(r1, 0x0, 0x0, 0x0, 0x0, 0x0)
connect$inet6(r1, &(0x7f0000000080)={0xa, 0x1000000000004e24, 0x0, @ipv4={'\x00', '\xff\xff', @local}}, 0x1c)
setsockopt$inet_int(r1, 0x0, 0x14, &(0x7f0000000040)=0x6, 0x4)
write$binfmt_misc(r1, &(0x7f0000000040)={'syz1'}, 0x4)
rt_sigaction(0x17, &(0x7f00000000c0)={0x0, 0x0, 0x0}, &(0x7f00000001c0)={0x0, 0x0, 0x0}, 0x8, &(0x7f0000000200))
clone(0x0, &(0x7f0000000040), 0x0, 0x0, 0x0)
mprotect(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x6)
waitid(0x0, 0x0, &(0x7f0000003ff8), 0xa0000004, 0x0)
inotify_init1(0xba3ded289b0a1180)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000280)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
r1 = openat(0xffffffffffffff9c, &(0x7f0000000100)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r1, 0x0)
setxattr$incfs_metadata(&(0x7f0000000040)='./file0/file0/file0\x00', &(0x7f0000000180), 0x0, 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x1, 0x0, 0x0)
rt_sigreturn()
mprotect(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mlock2(&(0x7f0000000000/0x1000)=nil, 0x1000, 0x0)
exit_group(0x0)
socketpair$nbd(0x1, 0x1, 0x0, &(0x7f0000001340)={<r0=>0xffffffffffffffff})
getsockopt(r0, 0x1, 0x7, &(0x7f0000000180)=""/172, &(0x7f0000000000)=0xac)
r0 = syz_open_procfs(0x0, &(0x7f0000000200)='stat\x00')
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
preadv(r0, &(0x7f0000000500)=[{&(0x7f0000000440)=""/129, 0x81}], 0x1, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = syz_open_procfs(0x0, &(0x7f0000000100)='status\x00')
r2 = getpid()
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_tgsigqueueinfo(r2, r2, 0x13, &(0x7f0000000000))
socketpair$unix(0x1, 0x2, 0x0, &(0x7f00000000c0)={0xffffffffffffffff, <r4=>0xffffffffffffffff})
sendfile(r4, r1, 0x0, 0x10001ff)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$packet(0x11, 0x3, 0x300)
r1 = socket$packet(0x11, 0x3, 0x300)
ioctl$sock_SIOCGIFINDEX(r1, 0x8933, &(0x7f0000000080)={'syz_tun\x00', <r2=>0x0})
bind$packet(r0, &(0x7f00000000c0)={0x11, 0x3, r2, 0x1, 0x0, 0x6, @broadcast}, 0x14)
exit(0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
getgroups(0x1, &(0x7f0000000140)=[<r1=>0xffffffffffffffff])
setgid(r1)
r2 = getpid()
rt_sigqueueinfo(r2, 0x39, &(0x7f0000000000))
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
capget(&(0x7f0000000200)={0x20071026}, &(0x7f0000000240))
clone(0x5100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
bind$inet6(r0, &(0x7f0000000000)={0xa, 0x0, 0x0, @remote, 0x4}, 0x1c)
rt_sigreturn()
syz_emit_ethernet(0xf0, &(0x7f0000000900)={@link_local, @link_local, @void, {@ipv4={0x800, @icmp={{0x8, 0x4, 0x0, 0x0, 0xa2, 0x0, 0x0, 0x0, 0x1, 0x0, @private, @broadcast, {[@rr={0x7, 0xb, 0x0, [@rand_addr, @remote]}]}}, @dest_unreach={0x3, 0x0, 0x0, 0x0, 0x0, 0x0, {0x1d, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @local, @multicast2, {[@timestamp={0x44, 0xc, 0x0, 0x0, 0x0, [0x0, 0x0]}, @lsrr={0x83, 0xf, 0x0, [@remote, @multicast1, @multicast2]}, @ssrr={0x89, 0x1f, 0x0, [@private, @remote, @dev, @local, @multicast2, @multicast2, @private]}, @lsrr={0x83, 0x23, 0x0, [@initdev={0xac, 0x1e, 0x0, 0x0}, @private, @private, @initdev={0xac, 0x1e, 0x0, 0x0}, @multicast2, @multicast1, @broadcast, @private]}]}}, "41cabf997a5f"}}}}}, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
syz_mount_image$fuse(0x0, &(0x7f0000002080)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
newfstatat(0xffffffffffffff9c, &(0x7f0000000000)='./file0\x00', &(0x7f00000001c0)={0x0, 0x0, 0x0, 0x0, <r0=>0x0}, 0x0)
setreuid(0x0, r0)
truncate(&(0x7f0000000380)='./file0\x00', 0x0)
rt_sigreturn()
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r0 = fork()
ptrace(0x10, r0)
ptrace$setregs(0xd, r0, 0x0, 0x0)
exit_group(0x0)
rt_sigprocmask(0x0, &(0x7f0000000100)={[0xfffffffffffe]}, 0x0, 0x8)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
rt_sigqueueinfo(r0, 0x24, &(0x7f0000000040))
rt_sigtimedwait(&(0x7f0000001680), 0x0, &(0x7f0000001740), 0x8)
r1 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ppoll(0x0, 0x0, 0x0, &(0x7f0000000640), 0x8)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$unix(0x1, 0x2, 0x0)
mprotect(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x0)
pselect6(0x0, 0x0, 0x0, 0x0, 0x0, &(0x7f0000000180)={0x0})
ftruncate(r0, 0x20007ffefffc)
pipe(&(0x7f0000000580)={0xffffffffffffffff, <r0=>0xffffffffffffffff})
mkdirat$cgroup(r0, &(0x7f0000000180)='syz0\x00', 0x1ff)
r0 = socket$nl_route(0x10, 0x3, 0x0)
sendmmsg(r0, &(0x7f0000000d00)=[{{0x0, 0x0, 0x0}}, {{&(0x7f0000000400)=@in={0x2, 0x0, @loopback}, 0xb, 0x0}}], 0x2, 0x0)
creat(&(0x7f0000000540)='./file0\x00', 0x0)
mount(&(0x7f0000000e80)=ANY=[], &(0x7f00000000c0)='./file0\x00', &(0x7f0000000080)='sysfs\x00', 0x0, 0x0)
setxattr$incfs_metadata(&(0x7f0000000300)='./file0/../file0\x00', &(0x7f00000004c0), 0x0, 0x0, 0x0)
syz_emit_ethernet(0x2a, &(0x7f0000000080)={@local, @local, @void, {@arp={0x806, @ether_ipv4={0x1, 0x800, 0x6, 0x4, 0x2, @local, @local, @dev, @broadcast}}}}, 0x0)
r0 = socket$unix(0x1, 0x2, 0x0)
bind$unix(r0, &(0x7f0000000080)=@file={0x1, './file0\x00'}, 0x6e)
bind$unix(r0, &(0x7f0000000000)=@file={0x1, './file0\x00'}, 0x6e)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
capset(&(0x7f0000002ffa)={0x20080522}, &(0x7f0000002000))
setregid(0x0, 0xee01)
rt_sigreturn()
semctl$SEM_STAT_ANY(0x0, 0x0, 0x14, 0xfffffffffffffffe)
clone(0x4000000206ffd, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
openat$cgroup_ro(0xffffffffffffffff, &(0x7f0000000700)='devices.list\x00', 0x0, 0x0)
pipe(&(0x7f0000003400)={<r0=>0xffffffffffffffff})
splice(r0, 0x0, 0xffffffffffffffff, 0x0, 0x88000cc, 0x0)
exit_group(0x0)
clone(0x6900, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
unshare(0x34000200)
clone(0x60000000, 0x0, 0x0, 0x0, 0x0)
setrlimit(0x0, &(0x7f0000000000))
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = syz_open_procfs(0x0, &(0x7f0000000040)='gid_map\x00')
pread64(r1, &(0x7f0000001740)=""/154, 0x9a, 0x0)
r0 = socket$inet_icmp(0x2, 0x2, 0x1)
sendmsg$inet(r0, &(0x7f00000005c0)={&(0x7f0000000000)={0x2, 0x0, @dev}, 0x10, &(0x7f0000000500)=[{&(0x7f0000000040)="268295af489e62", 0x7}, {&(0x7f00000000c0)="b4", 0xfffffe00}], 0x2}, 0x0)
msync(&(0x7f0000400000/0xc00000)=nil, 0xc00000, 0x5)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='/proc/self/exe\x00', 0x0, 0x0)
preadv2(r0, &(0x7f0000000640)=[{&(0x7f0000000300)=""/106, 0x6a}], 0x1, 0x0, 0x0, 0xd)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000000)={0x2, 0x4e21}, 0x10)
connect$inet(r0, &(0x7f0000000180)={0x2, 0x4e21}, 0x10)
sendto$inet(r0, &(0x7f0000000280)="fb", 0x1, 0x0, 0x0, 0x0)
ppoll(&(0x7f0000000040)=[{0xffffffffffffffff, 0x40da}, {0xffffffffffffffff, 0x82d0}, {r0, 0xa64f}, {r0, 0x1}, {0xffffffffffffffff, 0xa004}, {0xffffffffffffffff, 0x1020}, {0xffffffffffffffff, 0x300}, {0xffffffffffffffff, 0x10}, {0xffffffffffffffff, 0x57c4}, {r0}], 0xa, &(0x7f0000000200), &(0x7f0000000140), 0x8)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
bind$inet(r0, &(0x7f0000000380)={0x2, 0x4e22}, 0x10)
listen(r0, 0x12)
syz_emit_ethernet(0x3e, &(0x7f0000000140)={@local, @link_local, @void, {@ipv4={0x800, @tcp={{0x5, 0x4, 0x0, 0x0, 0x30, 0x0, 0x0, 0x0, 0x6, 0x0, @remote, @local}, {{0x0, 0x4e22, 0x41424344, 0x41424344, 0x0, 0x6, 0x7, 0x2, 0x0, 0x0, 0x0, {[@sack_perm={0x4, 0x2}, @generic={0x2, 0x4, "14"}]}}}}}}}, 0x0)
clone(0x5100, 0x0, 0x0, 0x0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
prlimit64(0x0, 0x0, &(0x7f0000000180), 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000480), 0x1, 0x0)
write(r1, &(0x7f0000c34fff), 0xffffff0b)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000040)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
rt_sigreturn()
r0 = socket$inet6(0xa, 0x2, 0x0)
bind$inet6(r0, &(0x7f0000000000)={0xa, 0x14e24}, 0x1c)
recvmmsg(r0, &(0x7f00000019c0)=[{{0x0, 0x0, 0x0}}], 0x1, 0x0, 0x0)
setsockopt$inet6_int(r0, 0x29, 0x4a, &(0x7f0000000080)=0xe0000000, 0x4)
connect$inet6(r0, &(0x7f0000000640)={0xa, 0x1000000000004e24, 0x0, @empty}, 0x1c)
sendmmsg(r0, &(0x7f0000001800)=[{{&(0x7f00000000c0)=@l2tp={0x2, 0x0, @private}, 0x0, &(0x7f0000000400)=[{&(0x7f0000003a00)="fb4b3c6d88168559cd8d23aeee2c15034bee29f15b796267d8504d4754d2d5dbf67ee0fb6dec5ecaf2d604928dcdb462c22a56f104dec90d169401fe1711067dbcd5872b518002b8ded96dac6f6dd4a26ba761194788df29d4be6d3b0bb47fe6419d5ee8459cda55c3b1176dcd30517a0a92f408b698e0dfa6ff3dbc6b268de443c543fe118772d8f5e596c868fc32ebef89f4cb350d3f36c5cef3ca7e6e444f1df0d6e6a08d144494f8a26e2f412f25168df24996c296d1177f4a1e955b1f1544ffba796c7988a1ee3a42827f28ad84a233b9de99435c9dc1080a17e923e1be8dca9754f4a4f0f817bac31cd099d86c3a2a65e417d98e28816cc55f27332045a4562499821274e993c30f8f85ddea1a60940e871bceb73d13fdcfe874e025e0fde3475698642e2c757c6e1126e44b16a618d07c1e229b453453ecd6065a53eb2f569336cea005fb2d8c024d122a1da9474ad2f82a98e54cb68e03afd299b5240ee8854e95d00857628d0017324640f49ac54bded67f6487385f314cc2fc40e1674668d0f5c581b9df28456079510bdfe7d36c7dacb9c449892a3f441b1b75972ee8000637ea9909fee9a91dd3b33397ac2dc99c57abeb945fbf2d8ade833c16c497bd31f6c401f1974e05c5aff7f3385874d6becf0116c9847d4e8eea6eee3dae98b1d601aa6024e6f25139653a94701ee3d4f8a02d0bb51d2187059dbe7e1837baa366891756cf125dadecde8c93896d8583be2b226356e37e8cfd8bb5bead16b1e6446043e63a9c6beb477e2007f46d76b4243b2c9e7e4ecc09339689b9c3e9de0d06c8aaba409e736354b9c89c5690684edf633582fc440eb92c9c1d2d18f584b75a0eaff1b2d9d93df1e1a7afd454e9082237727bc0b8ca354a575d3cf1bf1a775f030cbd376dafbfd3974de0265c717fcec8b56efebdfeae71a223f7d7afb2efbde13f9002488e17136ec86b25b5d1539974350a160e393200ae46762cfbbb30fd47a9b8a074cad7a4a8f0d40b70eb3cdb36fd9c72e84542d90baf1a45b68a1c5addc7a363e7e441e125f3d9bcd0a7a98ab260f4c97585b4729fadbf90b746cde394a62a0339291fe86d4e5d7b4248fda1d78928ffef38b3c4529026000c26393a60e1d0908088bde0a743a782d56b60de94bf05caa8c82c25b53811e9dacd6ff6f4ed4ec225df56d23bad64d28b88d90e41adb501d0f2d81f16c0539d3f0a86388dad040bd329c5ed04b0a9c2b1d202c99ae45b8e7245f724ad5e16c1fb39351ec84f42f8dfdb667b7eaa4913f81a6d628f599ff3050eb4bb0123502e39868e29161288662956e4fa273126a7a9486714cccac5df9818976e75fbc88257fc1147343a80e8ed8e9eb9380e77c3eef8fb12648d1189963a4b9083d0353484c2f71cc3096a1a1fa10a8c6335ee5f61d200b488c6c8a99bd17e4bdf0e9df68f83db4136e02a9120207149d15edb496f3a83e3a72c645cfafe2b107fbc53232a3a3b36fa4c60e6cff14e7d48cfdeb4eb7a1577e9f992a7912be8e2c79a94e0e81000d0fc7793fd6bd6d4e142d7c01786984b872eaf91b54b3d3625e5985b51b1171b637cc631732cf3552e02954d5129f5c8344f3f93486c160f008a709cf140d1d9f73610997fca9ee3767379674a5885f3f808a65d49c08995a71af10e9eccc0b6c4f3f7a77f573b91b0ff10dd7cca57b4a71f9bcae375d94797d4016e188492e4682805a4a5a79150781625d01d6c601e48baaffb39b322d8f61775ff37dd6658fe1e21c57f38560ee2de64f792a60490c0d131072818cd12adaf5b0c9d11518fe2f419c81038b2e5ec134a3352c7e31c507640f639ca53201f8f488bd7049f37ca8c60ccd826d1a2d30154c7c40e6615713397bf482c11cb592c6315953f08d4f23e8c5d7a8bd30d0ed5a0566488919ab330f14108831f48748f0267775d537cdf2e58b2cadf6b25a37443139e9aaeb9f2eca4c520ace0b8a8be29965977623029151fcf14d129f7b74331f3715ca0b06eb63cdb4148f6c399ae85634118c77be7e657698329175cd0547fc49926a6cef871a8f97871305ba77153ae1c86508f425423a69d0c3bc64c487dc31fea865b7f4d839316ea9fb6fdd72de6cb56a17fe798fda7da14199e56a06088f7514bc8e758be15cb8eec110f062e923019352da6d25a98317efcc83aded502b6e679665badb9f86128cf215be8dccd8b4fd92a518af729e3c7690d592d070b79cf00b29c589eea6313726126a436884b860683d97a9db1dee9379d2e8f1bc2776740bd5db80636f94c0a1eec1d7296e6e763d2d0935b826228f46919d26a4cd6b2e137240c92b5adc90a7ba55fc7663ed8d5e4b43e4c8b9412fb5e00789dde81d38dd91923a8c76140ba0bbe51d812db815d70ade5493946181cd805706e9ce9da634fa996780cb5c0603d4169afd175b141094abea21c266cc75e5e3cbea4d7124f85df82e86c13b41c3c50bde46e03f1c37d928fe6025eb5ce1529773e773e50499a878ef5466864aac0b948afff55b8080fbcdf003d6c061f9d101fe4aa5c88cddc4d419d3d0e0ac6939d3b4cee8648cae28e60515fa0f278483780b8f939eb6d22e4008630dd069f5f24ce2d3ee9df27ffd1bd1d1e22b921553f8a9cc92f4f16c6540a82c5c9acfce44c899cb9d4c8f722b17681166eac31f6313750c55ec904b3491189944db54914f8615e415176eaabc384d86adc7e6bdd7df9268c74a5ee47184bfe95b7a6dd558b648d331d89a87f18a5802253344dc8e15ea83c9b6d1b10a19e2752bd6661906708d7e6f93d57437d024943ec20e503b1847c4d6b246b34f6ef84c3c906246f1d87d2171c3a42799bff51fd3d3cabd48082b055cba4a23133ce6b284b557867c4aeda808f06cc3181def7797080c07a15380c4925fb87008065d878af9d61c70e63ce0031ad17103170e83a4a4ad012dc1afe6cd80ca4493011f468493fa8761f64678450db6a54bae2a86145f6a5f6008557258f34a80f588fd3e8ac0fca315b6e1dcce2647dbd98afa3b344c67537c2228b539f4e2e52cf6b6fe23f3dcd8ddf0d04dc1438f4563e403a6d5cb6c2a241006dd71152298604616f3c7e4ee09282b48272127b28c7c1b6fddc7fee440be4a626049b5d4706381fdb7a6e5d5239e553f58e24ff3feca4fd21897bb2cf7a63d155ae99da747b4f0fb7f60cf69c8b6379ebbb2640091f73437a19d5c46524e7bb6554eba9145b9c43c6b33df748692658948cd728154f177d08b141672d7ed7324ac45bf35575c8f9eaac39d0be02e4c50982fa6705d1bf2381ffe477fbbf04cd08a5128132f6931adcfbef678621cfc089025e9c8a36d91614531d1755d1ed5edca6d59a75f2e2b02810c5efb75e3fb084acdd974464ee58cd40efccaddb513f187954b041caff36ecb2bd0a2157ac0e648d98cf30bcb321ca0fd5b885040d98073ec8815bd956c4711774fb16e6716f3e56b73852fa44227f91f59fa3201c1e7db1c129417dd2447cc6887cd39095815ffbcd9280f87cba354a26a05484713618545829b389fd2d926e3ec4c6c8dcc113a62484863800287a947578d419848a0560601487216b71d831c0b319f08875ee83de93292c011d6619e78ac029c8ce9530f9f239b7ee3ecfc518a2f79586af194402b237d378123cd6cd1fd46f87611374a6659bfcc044e64b0509e61947d6e424e055facdf6e5181a389a3b07a3801654a700cebd04443f5fa8967f1980c8de6caca2cbf7d0da2a9991856acd964bf9a9a86498c5c9c7d17ddd5193bb3f1f404a276cc0733ae77137fc8e1d362d717be81f40b360cea78a144f7d2ea469751d571a33d615bbff25b70e19f2176bc994ce8b6048aa8a180b2ca0bfd0ff8e39eb5f163fe6da2c75f7c08dfeb396b321d4ff49fbe4aef934ee6403110b73730d20f9824f64eb9d7587f32d339276122d8d4c7ff1ca379bee25016707c84a9924f84d0b8b996f64f8e2550a56cd06fd18db89db49d8684e582004befe97af7a89e02599614bdd98d5763ac4a57dbe5dba2f39dfddb40fc84b77e71d5b67db5db31f772d86e3300948cd6aaa38e9721c39b5af850f403b5ebc0f6b2dced5ac2bd0cef4b09711bc9855128104615f380ced7dbd4fe63b0df680c7d614564ccb8d395f021794f0bb87636a4f874e93a0cef310ae3657e84bc6b6f4d911978847c553cb5a63548e17e3609e83afedf6b674d985a9768d77e2faa72ca44f1c02451bf3f29025b9c6e410eba6c67731c7f0767166d0c436035858faf84e9a213fa7b3400f5709e5101927b5cbe71026a5c33370687ab898712efb722d0df83bcc9ab4dbaee825d6b63c4e0fea3ce8d97606a1505d4a484fad19005998867527744bb8bf411c06ad8d542fd7d4f395547ff7ab471c611e161b8127be75cbbf35db9613b5e3e2c2dc0b467e06103eaff91c7b828237a6af2daf5dc47ec1a2842fc3efd27d4e3cd1f080186c1e6f295cb644c42f0963f721b2b577d57fe0d3abcd30f56805e0b7743e250bd20e23be6b9b0666108feb96d590a62c9254833fbf0fafa7147459acabf79bcf96ccbfdbf3586e6d6ca5a8c6275939a6321bfbda8a526d0d0b1cda53667bd1b742a33dabbed0e5e1b1c795be87f593f07da17111940d7dc0991fd6fe958efed4d6354c8b09da4180f1daacd0cb43cd81502343e81e3d9821ac53b01cda8640fb434af05cb261b0e9f1cf0e85fee1d5dcf35d28dc5354a30bd42cb0a6f5d267b845a119688603605faacffe28f633238515bcd58789487553f6f54300b08202b8f6aeb2a3fe51aceec3a00263d48c13a634a30778457843b59f7bcf4231489d1356a6d4cbf85236c008ad062a86a958cf718d222e27f466bf4e102ffd596c8f4d4c4cbaa578288b8eaf983ebf94294ded564af6401ad37763192f06191db8c0af755fbf2d77a981cce1c924debbeb32f98ca637edaac8049a3d3b99595e3916d1fc449dff3cea8b79fa89d92939a188c838097952020a9b721b11caad5277b01b54e8f0f95f27ad4bb4aace0c557ed9c48bca0e5d23922106d3bd341024595bb0e1efa7f3d515f30dfa31fc26aa51693ba36fef3f78b5a6ccdce45f343fc5c5e6c89a23a00e1a44197c3bfc75ddf82ac0b33defb48ac5aa14a0b927070e064c4a3be0a4e5ae2f0ca9c35f6ffb9921b1fd2b23bcaadecaeb60c97288283a8ac862b61edfef51bc7ee406fdc61792df619e6dee2d49f26d9dee7f49ca0dd713fe87a64581b782a7eb12066a714d1474eeb60cc21779db54bf9f44937a019b9f6fcbf36688a63a364ef9e550d6b547cc16842d3701435d3a9853492f06f5cf3c049f18fbd094a4ee8c34a3a132ba0b13b1eb51dd1748cd6cdff47fdb404d685eac5bd75bc8e5ed0870ab0107a8a03df8a44370d727b0080fcec2c00e577904f14d70914aa601b592008eab7c835c6a5d008dccdcca31464322ff29d48563a27c09abdee11bae0f5e96589371a2e629d3b5438739abb7ad8bdbde3853399203dbcc01f5ffc9f98ecb459cef4c8af24bdb889c6e7ec9c2224650b8fc747c4981d91df5481b7f6e0b8fb4dd30d8f887091067bcbfdffb3243f4c75311b7a3bbeafa98a6408657b34d53cd47be7d72c140fb9913447bcd0b00f7fae2074a6a22d0172de465be6516cb8743823f46f74e8daf46a46ffbdece1e7eecee74b59308fee4e06c2b0102fa9593faa00150598c9f270b752c3a2af8880aadda45d39f8b279a61fd380b83907159fa8f8cbe9ad476b64b03a5ce9be2b9646c8adb0820c658fea81a00"/4110, 0x52}, {&(0x7f0000000040)="9e5989dbb0"}, {&(0x7f0000000140)="136436ebda17c83850caba2c3ec945d6e196802ffcd269dc061bd1bd1d8802febe052a4441b558331db8abd0b26a5e1ef45edaee0a2b78320bdf5bd7908806608f10dca0588b6a8c16719ed716544beaf89a269beb14c064385bf823b25b90c18f0c374ccff62f56e37f094d880e448e0e2c80b315ba4be4303be179f8686d01e58480c88d46d4f4e8607b3a0ca0be4276450344821d5908216843e1b7e80df5c86f31d95611fdc34eeb308b9d63f2eb3acd26e75f824d5b594bf568328279eacc5e31a09dafb2baaef2ac9f741cb444b720b40015516dc050855ee530f800"/236}, {&(0x7f0000002a00)="bf3574e6813f6a2e74d519075d51dbb1bd68926f83d651a47262d9f2dd25df93dabb5069d503d5e75cdc3f56efd0dffc00e1f5476c80daeab2a973fbc53cf8c950661db2851b9c7e3e20262957d2b912fe5ced031d5ee153283a1bad1e989ee08064815e4d31a27206127f8c3f7d96dc80c80c4c3458475647b7e2fa157d1f6f5b3ca2a496f2a180e66a56ba0bef026476360a01127d68019a2d9529664698c8a0305c8016b04bae29caacbe9457ba8bd7d05b502ab553bdab83c28ac59d32bfc0c7dd61775e74889074d2959518e5ed4f78414112d1dcbf812f2553f174aa8ff5b7207f06c2d4b20c9949357a40f75c8d9f2c3ab1f32d87b9f3a4995723feafd9f1cb2efab62b256650e628b3378519554de5bb8bf13b49fd935405f5d2066eb9bb4c73dcafd8549f08f5d392df7639207ae35cde023effdfe5e7ed3134a37f88287c5644a58c5d7b23026e8a51fb929ec61c69b52be93861f78592b333dadad8575a951ec08de6f282c4f75dab7d8bf3e59a6c3864fadac2d0ccaccb09c26cbe16483b4807c2235d63899f179e7f1e2e6167e7e64c4bd59d9f16276c754d2158d410f90c3ef69b0d79896cb9ee4df857b10dfcd8e2b4a89bb67a7ed8ee7b183a2377e7bab6cbc55c839916d5af95cde65cb2f122db96cf80b3262d2657a7f2249b48377169b45bc8cfb733988082d710a9ccdacbebf30f938592b1a37c257cdd552c720ea5dc8d03f4ff4558999a44b31e73e08702d894694808ce35a8fff44a2fa4f7b84f4026798931b7f8e10dd395a11bcfc03708234a073a774d7dabd56ac018920e99b031ac7e5904a275ebc333883dcec305f511bb6054fbadac58bac812ad9ea81f595e0e7afee3e48fe498426bc6dd6b70650049cb74a11c3d3e5847444e70700e4b57b70226bb7cab05c34db35db5bc23f274cdd05f4fdc0f67cf854b13ec0382cb89f6c4c86df926874b7d45d82e57ed921fd52827a70457cfcd5cd90f5a2d7b54baba1a43bafb6ecbfbe04770ac40010d137d3e7daf94f51ff10a880049e4c35b7d3ad76e311c28bb1e282f9d5ab5da6b4f895ef81d7a304823076c29e38b17991cd9404426085d08fd4c39a5562762628711c67fc5e28b585a0b9080ea3efc8b68083bdd9de71b954f6d8c70b7c8950fc1559fdc222ba270038bb521229294e915bd933db5b844da397497fbc9863b216a245018b027a125c5b5ffb73785ef2a5c62a189083232c6ae08adce3990e36957f2627ff4b5ebfd9e548fdea28c43c405083e9e078193775ed22aa02d73ae0e2b3c9ef057f01f2a8debca09a25a48806d98d03ee9a0d7a20596a2b4c917ecba2c817b567ca0d40060a3d7daf6efe5d484a1991f855381e3c2b2f74548bf2756178b87c70ec3beda89122883b9b302256a7606dd954e13144e527170e7952f99f91c50fedba66a4913f23928a999ab7578c912cab20cc828a0d4156802c53fbf34d354bf1d1583754221c321600df327be0babdd547578583fb6bd82a2c5d5cb0c1c2f6904f48f0dbaebb5c4e2207df1d7910cd275a2ae7a80ac5890ed0d85a47c9b4f45e032ddc0b78ea10a096e62c9255e2495714ac2e5d8e328df3f3a5d8393a46a70beb0322711ddaca48789d00ae2e0231978d5c3be933f705e9fda94670f0c43b6ec842d68e21feabdaec624dfbaa995a292cddce35e162898cda74f5a3bceedcbeb96f9926a1844415de8714bc9a67f5e3b0b856d43cc6ab4466655e51d4f163b3cc391d92a060168b5560b7e0d19b24343b9c881283c6fde3fdca677f33ad655baa4b5e39999655e736960b98fbee5bef8012eb6c659cb687f972bcb4ba1f0bacf09db958579e36f967d19fa7a4f160fba8be07f568a917061621b06a4fd885a714a946963c7b7066666215206e38b60862dafad35e9881079bc8bcc7a3589294da5ead281edd63b1b04a6a27ebddf764e3d3ada0df134f8326cd0eadaebdde978fca31cf06964121e3bc5c68bcf820757da4682959d4e636d1c98925f81bd45451b46c64a486192f134c3a924a249f96e315e66dd791ab443eae88e040ac246ca33f2a1985dcd601ebcd2abbcdd57f5fe1e5ca79a18e77ed14152c93d2b267414260924c2d2ab0d408a2b5aed5cee1dbdd03aefac55a72edb646436bbab09093363e4566df3461238885c1310c479bff69147bb7a3c4e2f7660248d3a0a15625fcd028f599e34c157646cb06de5c0760d51ed9e487cbf2b2fbdd33085011625d279666a917f0d4a246488857daadb1ea9855ee2bcd5379e9c95ff1b6e57e26607f177c29f6f1132aa253104958c003b3515ce8ba69755d8005bc4db75d37cfc8bd1b37dfbd112950625938ff586c8325ef055e2d3aeeb06d8f307d1e3bb3986fbe7e7006d4b613084eb0d70236c5432e8f495c672a2d94ea077d83f4573e8292b4cf116f57e842422a6b40ded28a033908c18cb85c03c0d9b7b40654f274e16a630456d8446bd6e07a24b34a960459f49d7e59c6432314ae7d5f13a663113daa2ea381c0f971255f5ff441ee0504d9bd70f47426c968c13ba88e298e1fe8f6456ef24b2d4bf7dac17e8abed33ba69506ee53a455cc3527a28ad4a89ff8320fd399e2b17a54ae475823a3ce14a37455ade9afb8ea6015d4d26cadc7bc0530095c35f68f87c2233aa08fd4f8ed95416db6836a73e1b6bd791c9d6f07ca58646d04511ab53048a06eb561a553e2a8ce237886c510a078d4174f7212b378c7ee0397df25ab8a98ab4516df9460db56e1cc0b4ab941de9f55cdbec0fb787a6ad95a2d8d3c617b448def452471957fd25e936676527ff072977b690f3918ff9cb11182f33a90935e9d25b486142cc35d22ea6f0ecb3db0bd2aa2b8b9f34abb666a68aba909d2fe5c5c5237faa7f16ddb02a810b01501696294b5640de8e4829a29718cfcab1df9f3af7aad08c339f7539f38d82b296becabb3d4fed6ab1e9195a2e7e76780afc1808d19304540738e6eda0decd738ed76c4c79c0d9e670f78c534178bdc2f16882ed1495d583ddec3f3912f6270a1ae334cb51aa0a2c8a8999eeb676a86ac600dc502c2ff635865d6166fc532132aff4d0c3b635a92a4adb4040b1b4b0eed6e28914d765d980a7a8e2679c8abedcd5168de12951e76c15b56598d524b12b6d348cfb2706001e144d14a6e8cbb7a65e6dae0921de4f17228f2a792175081be222d01abf7ddc23838615c1fe6f0985157cb720f0ab4e503325876e582a8dd8ed20266d032e16990f57170b998e7e505710d2be14669a57eac489c5da55cefd1746941a4e6efdf4eb7b70cad627a4e89537a0dc47922610f283c6c44f99bec8201b04882a45f1399e5b4a5ef978555b146e2f4a53b868f68e6e69f9c843c806909d6f7fa16fbf1d055148daef188804045dfe0e00ce1db81bbfebcf9e68e64bbcefa3b2776993b61abe43023f245be3b023c42fca55dc40bf86e1826336f24c4a543eb5383870e60b9412859b904fac7d7de4e661fe2fb5fb67fbe2fd5fe927d45139c18c314e36fbef097b733d5a5869899f648cf1ddf19ab9d45a772edd63fa63f7b5f578f9a5554c1b0cb0aae15925e9b0128eb69e085b0f05f6fb7b62633dbf076c725da4945ee894a32e0cd521f5d6158248ef3ece21b604adae716145670b59b0b366adb3aee7e973fc42baf5fc1f6f0bd1de618e83e54782e3065b4df1cb2173378ff368dc2755f56a408671a32ceeafb3b5e525e455c2f0e8b408f38afa34f56dd262e70c6c1e947bd444191c26c7bad6e24d4e662ff4dff2bee33ae4a497f5b8d05a515890f9de5d211b7d0987aae10165d3449f33197280b59c65e0c92b96c2cfa9e379a573816130aa8024b7a0b875121fe0dd98e854fe58524014ff6af87c37810da23ba10ed9246a040b091d8a6c975ed9c094dec0f11ba8215946d9cc938327634ffc1fd1ae98618c58cb27aeeb9fa24380968dccc16199b83caefe8778673646b03d77a280b958387acba130bd5acaba49c38069c7df38435582d23d89490d521c355ced786617bfb215147a8f037ecf42d9610be9045d9d18cd3a2f89ad02e41598a645f2ec8b70caa9eb121e85e0278e6396ae20f7187392c999f1ddc94ff47cfb615b7f67045f045cb8485b08294dac9632f9e949d15c5fb1401528d3d639c9d6f9ca0697a779728580de88dee04dd51f5448781ce93c980e0fd426f29f72a0ea9213ead610b1d40f147e4e2bc53852e29f3cc429363a1d93374227343025312672d86d865c579e130e6bc79aef9ccd8d0a4077c06f27f3aebb94f4ff4e799703971de54dfe59441e75a42170c3e6e9307f8207a0e5ace1978060e65b85add8931b285c110d163521955f86a81d32adac20740269e12d8ab38d2e620c91c5a9533ef3a72238a1cdf4e75f2f663aa80bf090d31c55c91223a9cb0af388a8d41c5ee512b5cb5e8386c8dfc8e528b55df41a8ea1c15550c930d2155e70c31a3e0f2206c5bbf5180d4aa74ce1e11b9770ab81a80fcce0ac6b6a5d0d6caf08bbd835950a2e3ba43ece7bcfbd11f38a62591e1159ddd061e5984493f8b76e7c47bd752a14bf88fe8e8d6987bdc28a0b1783c862da7bc7f3ca9bac08058150b112d2a470771a1009435360e44e0d24758296542505659750c141fc3508c63b22eb482dc76c6761923d79ae9e8fbcfac744eeadb07618d282b3de226fb43774cb543d8c5db2e0b3b1200963814ff572063c6345c1e71a81af5221169f19119607b5bf6d4fbbfa996c6dec36a73ef9c5c1e5515a8713300f4f4391e7a7c17ae256ca89a52bc13e6c7ee221dd4a57b6a995cc296aaa2aa7734f839a32854f57301c6e669b6cbc0e5b198dcb94d103d2b93a7d6401908ccea5f3396e901ce5d52aef6db3a6076e2175e4a05e817fd89c6f2dd8feb196a1f3339942569cb6fe46bff66e77f1c5564bca5cd2af4130a6173d4ea3d5200cd934b6a172205702789a8cd900947f2abd59e2cb52cef86550608a7afebf68d76a6164e265a304a1fa76e1faaf4c6d4ef00cfc3ec933c67c1490a51c6fc067820fe4834bcfd688bd73b6ae993ae4d83fc360eee23387ee37ed02623903a6202b9a770aeaec3e9d9ac4772aa22c6e89b89954a1de07f034240ae0ac86a3834d778f32ba0993619bf23c32f9133a7c55f1a4f87e0da888af5d4ce2eba63cb4ba24aec9df512ac3b85eaa3e112f43a5bc86d287b88065b8bbc7e4d6ff4d4f1b13bbe1bf783c46187e21ff65de4a0027b7408433f0de940763777c143b659879a436db8a773ff7c3084d8acb241801f45c97a857be911cc91f7b7ee46e74536392806618a11a250381578e16db578cb2f827d1942acfafbcc8a3e45ab6234aab1528398a3a8f37bd2b05d1ff69465233557206a506a2b1e173c1b3ed004354cd5ec62cc16ee1d60dc7bf34830ceddae4eb7c4198e311d8385d4580fdc2c61c0246b2ba8b53b97ee090f3b93eb3750ba8179746ab804e3d4d23eba8e8ecad3881948f1d30a5621f03263deb460297ced14ab9dff1c5f88241ff2bae36f93d01e72dcb09777a31eccbf76494c0ad3fe3c9116c05617bff5b8e4508531d454e928966a4a054360aa27c58d5d97bddb50dbd6bceb67a2be6297dbfbd1bc527c39c5baeca709383336f7cae24f25f6a06ef3da5667548095a61d40e60033361f0416b9e873e1ba6412b4f4e6fd3af01dcdbeed04f8ec2312fe5b9d48429485059bc8e4385f57787643e3fd9ad780f4924d7f0b3834aaf6fdbeeb93c04a1402c414404c9f665393e48f94d977207"}, {&(0x7f0000000240)}, {&(0x7f0000000280)="b512f61d4191183d6dd771d1b0c3faf0c4b1f8395531303bd6e00cd7c25a9c2c040c9d57a6877645e15443b8b234737f81f8aa8de5f242ee1d6289e39bfaa6cf849b196e6e96e57f3ace9163725835535a9d633c94cf0aff72e15dcf3321da26f12cf85aaeb9da02e566f7f9b1e503f4ace3104c673ef929097f7ebf71bc1c4100f841323dfbd7a163b6850181395a35fb00"}, {&(0x7f0000000340)="47c56ffbef14d96140f14c53c3eea12bbc1fd9fa07dd05a7baa962532f549a331d1932b89e1563a495fef2710ef79c0cfae7173245d3fc51f8eeaab10e06ff784764dbd4ea0893518073cec7b8e5551c6ba1b755c29bb474e19d7b268f0ec2ef669ef606c6cfc6592e35a19b724d24cb5a1119c9165776e737d34dc6f9cdb3114344136fcc84b2c95e63bb22c259dbc7e812f9d31f319630216e6fc5668f472c8e63b37b16e0bd890ae58ae68626fe4d3fc76603a10b63cd3869586cfc460a"}], 0x0, &(0x7f0000000480)=[{0x0, 0x0, 0x0, "f3817415cd2d507f70a423ec36688ce17c7fb1d88e4acebb231a6c0559fd9b3a57645e8ab462e5d1f26c2bf8f250ab5f309ea4514121158daab81dd052841314ecc569050146f01d31e951db496ebc56ca3e208df7b3846ebab41de970cb1668b5ab54f5c2632286d5e463f62efca293c544013f8853bcf0ae2289090e63bc612b3aed28f50721f69a11cc2fac2e3e3f5a3b19c5"}]}}, {{&(0x7f0000000540)=@pppol2tp={0x18, 0x1, {0x0, 0xffffffffffffffff, {0x2, 0x0, @local}}}, 0x0, &(0x7f0000000700)=[{&(0x7f00000005c0)}, {&(0x7f0000000600)="d50b85eca641173ae0400628eba74c2e7aba0711"}, {&(0x7f0000000680)}, {&(0x7f00000006c0)="1a489fb71c570dfece0eaf804cbf6547468c5ed80b14efd3887efc0d7ff2634b8d5d"}], 0x0, &(0x7f0000000240)=[{0x0, 0x0, 0x0, "d7a6450804308c47667e88626d997ed326394ed42b22fedccca179d97e94e4e54a51cca37f475ad507269ec2d5"}]}}], 0x400000000000318, 0x0)
mmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000, 0x0, 0x10, 0xffffffffffffffff, 0x0)
r0 = inotify_init()
inotify_add_watch(0xffffffffffffffff, 0x0, 0x0)
inotify_add_watch(r0, &(0x7f0000000080)='.\x00', 0x80000122)
openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000040)='cgroup.controllers\x00', 0x275a, 0x0)
read(r0, &(0x7f0000000000)=""/57, 0x39)
r0 = openat(0xffffffffffffffff, &(0x7f0000000100)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
clone(0x41be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = getpid()
r2 = getpid()
rt_tgsigqueueinfo(r1, r1, 0x15, &(0x7f0000000300))
ptrace(0x10, r2)
r3 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r3, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
openat$cgroup_ro(0xffffffffffffffff, &(0x7f0000000080)='cgroup.controllers\x00', 0x275a, 0x0)
ptrace$setregs(0xd, r1, 0x0, &(0x7f0000000000))
ptrace$getregset(0x4204, r2, 0x2, &(0x7f0000000400)={&(0x7f0000001800)=""/4096, 0x1000})
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0xb635773f06ebbeee, 0x8031, 0xffffffffffffffff, 0x0)
r2 = syz_open_procfs(0x0, &(0x7f0000000100)='environ\x00')
read$FUSE(r2, &(0x7f0000000140), 0xfffffefa)
r0 = socket$nl_route(0x10, 0x3, 0x0)
write$FUSE_INTERRUPT(r0, &(0x7f0000000140)={0x10, 0xfffffffffffffff5}, 0x10)
read$FUSE(r0, &(0x7f0000000180)={0x2020, 0x0, 0x0, 0x0, 0x0, <r1=>0x0}, 0x2020)
syz_open_procfs(r1, 0x0)
rt_sigprocmask(0x1, &(0x7f0000000500), 0x0, 0x8)
clone(0x5100, 0x0, 0x0, 0x0, 0x0)
r0 = inotify_init1(0x0)
fcntl$lock(r0, 0x7, &(0x7f0000002140))
rt_sigreturn()
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
syz_mount_image$tmpfs(0x0, 0x0, 0x0, 0x2, &(0x7f0000000280)=[{&(0x7f0000000100)='V', 0x1}, {&(0x7f0000000180)='q', 0x1, 0x7fffffff}], 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = getpid()
rt_sigqueueinfo(r1, 0x39, &(0x7f0000000000))
clone(0x4126300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = creat(&(0x7f0000000040)='./file1\x00', 0x0)
close(r0)
r1 = socket$inet_tcp(0x2, 0x1, 0x0)
connect$inet(r1, &(0x7f0000000180)={0x2, 0x0, @local}, 0x10)
listen(r0, 0x0)
setrlimit(0x0, &(0x7f0000000080))
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
r1 = gettid()
getsockopt$sock_cred(r0, 0x1, 0x11, 0x0, &(0x7f0000000040))
tkill(0x0, 0x0)
rt_sigqueueinfo(r1, 0x2b, &(0x7f0000000100))
socketpair(0x0, 0x405, 0x0, 0x0)
clone(0x30045100, 0x0, 0x0, 0x0, 0x0)
r0 = fork()
ptrace(0x10, r0)
ptrace$setregset(0x4205, r0, 0x1, &(0x7f0000000140)={0x0})
exit_group(0x0)
r0 = socket(0x2, 0x3, 0x6)
getpeername$packet(r0, 0x0, 0x0)
r0 = syz_open_procfs$namespace(0xffffffffffffffff, &(0x7f0000000240)='ns/user\x00')
fchmod(r0, 0x0)
syz_mount_image$fuse(0x0, &(0x7f0000000040)='./file0\x00', 0x0, 0x0, 0x0, 0x0, 0x0)
mount(&(0x7f00000000c0)=ANY=[], &(0x7f0000000080)='./file0\x00', &(0x7f0000000000)='proc\x00', 0x0, 0x0)
listxattr(&(0x7f0000000000)='./file0/../file0\x00', 0x0, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
r2 = timerfd_create(0x0, 0x0)
dup3(r1, r2, 0x0)
mbind(&(0x7f0000ffc000/0x4000)=nil, 0x4004, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = openat$ptmx(0xffffffffffffff9c, &(0x7f0000000080), 0x80000000000a01, 0x0)
write$binfmt_aout(r1, &(0x7f00000000c0)=ANY=[], 0xffffff78)
write$binfmt_misc(r1, &(0x7f0000000000)={'syz0'}, 0x4)
r0 = openat(0xffffffffffffffff, &(0x7f0000000080)='/', 0x0, 0x0)
fchdir(r0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
newfstatat(0xffffffffffffff9c, &(0x7f0000000040)='./file0\x00', 0x0, 0x0)
setrlimit(0x0, &(0x7f0000000080))
clone(0x200800059fc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
r1 = socket$inet6_tcp(0xa, 0x1, 0x0)
bind$inet6(r1, &(0x7f0000000500)={0xa, 0x2}, 0x1c)
r2 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
setsockopt$inet_int(r2, 0x0, 0xb, &(0x7f0000000040), 0x4)
signalfd4(r2, &(0x7f0000000080)={[0x7fffffff]}, 0x8, 0x800)
listen(r1, 0x0)
connect$inet(r0, &(0x7f0000000000)={0x2, 0x2, @loopback}, 0x10)
r3 = accept$inet6(r1, &(0x7f0000000040)={0xa, 0x0, 0x0, @private2}, &(0x7f0000000100)=0x1c)
bind$inet6(r3, &(0x7f00000000c0)={0xa, 0x4e22, 0xffffffff, @remote, 0x80000000}, 0x1c)
recvfrom$inet(r0, 0x0, 0x0, 0x41, 0x0, 0x0)
exit_group(0x0)
openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000240)='cgroup.controllers\x00', 0x26e1, 0x0)
r0 = openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000080)='cgroup.controllers\x00', 0x7a05, 0x1700)
write$cgroup_int(r0, &(0x7f0000000200), 0x43400)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x180000f, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
openat$cgroup_ro(0xffffffffffffff9c, &(0x7f0000000180)='cgroup.controllers\x00', 0x100002, 0x0)
getpid()
perf_event_open(&(0x7f0000000100)={0x1, 0x70, 0x8, 0x0, 0xff, 0x2, 0x0, 0x203, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x6}, 0x0, 0x9, 0xffffffffffffffff, 0x1)
r2 = openat$cgroup_ro(0xffffffffffffff9c, 0x0, 0x26e1, 0x0)
r3 = perf_event_open(&(0x7f00000001c0)={0x5, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2}, 0xffffffffffffffff, 0x0, 0xffffffffffffffff, 0x0)
ioctl$PERF_EVENT_IOC_DISABLE(r3, 0x2401, 0x0)
r4 = fork()
ptrace$setregs(0xd, r4, 0x0, &(0x7f0000000840))
r5 = fcntl$dupfd(r2, 0x0, r2)
perf_event_open(&(0x7f0000000240)={0x2, 0x70, 0x3, 0x1f, 0x7f, 0x80, 0x0, 0x5, 0x10000, 0x2, 0x0, 0x0, 0x1, 0x0, 0x0, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x1, 0x0, 0x2, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x2cc7, 0x1, 0x0, 0x1016c, 0xafe0, 0x401, 0x0, 0x0, 0x4, 0x7}, r4, 0xb, r5, 0xb)
ioctl$PERF_EVENT_IOC_MODIFY_ATTRIBUTES(r3, 0x4008240b, &(0x7f0000000040)={0x5, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3})
fork()
openat$full(0xffffffffffffff9c, &(0x7f0000000080), 0x54000, 0x0)
pipe2$9p(&(0x7f0000000100), 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
execveat(0xffffffffffffffff, 0x0, 0x0, 0x0, 0x0)
rt_sigreturn()
r0 = syz_open_procfs(0x0, &(0x7f0000000040)='oom_score_adj\x00')
write$tcp_mem(r0, &(0x7f0000000100)={0x3e73}, 0x48)
r0 = socket$unix(0x1, 0x2, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
fcntl$setownex(r0, 0xf, &(0x7f0000000600))
r0 = socket$inet6_icmp(0xa, 0x2, 0x3a)
connect(r0, &(0x7f00000001c0)=@l2tp6={0xa, 0x0, 0x0, @remote}, 0x80)
sendto(r0, 0x0, 0x0, 0x0, &(0x7f0000000140)=@l2tp={0x2, 0x0, @remote}, 0x80)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = fork()
ptrace(0x10, r0)
ptrace$getregset(0x4204, r0, 0x1, &(0x7f0000000080)={0x0})
rt_sigreturn()
r0 = socket$inet_udp(0x2, 0x2, 0x0)
bind$inet(r0, &(0x7f0000000280)={0x2, 0x0, @local}, 0x10)
setsockopt$sock_int(r0, 0x1, 0xb, &(0x7f0000000040)=0x9, 0x4)
connect$inet(r0, &(0x7f0000000200)={0x2, 0x0, @multicast2}, 0x10)
sendmmsg(r0, &(0x7f0000007fc0), 0x4000000000001a8, 0x0)
perf_event_open(&(0x7f0000000100)={0x1, 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x3c43, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext={0x0, 0x7fffffff}}, 0x0, 0xffffffffffffffff, 0xffffffffffffffff, 0x0)
rt_tgsigqueueinfo(0x0, 0x0, 0x0, 0x0)
unshare(0x40000000)
mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0x1000003, 0x8031, 0xffffffffffffffff, 0x6a97e000)
unshare(0x0)
getpid()
r0 = fork()
ptrace(0x10, r0)
ptrace(0xffffffffffffffff, r0)
ptrace$peeksig(0x4209, r0, &(0x7f00000000c0)={0xfffffffffffeffff, 0x0, 0x56}, &(0x7f0000000580)=[{}, {}, {}, {}])
perf_event_open(0x0, 0x0, 0x9, 0xffffffffffffffff, 0x0)
ioctl$PERF_EVENT_IOC_SET_OUTPUT(0xffffffffffffffff, 0x2405, 0xffffffffffffffff)
ptrace$getsig(0x4202, r0, 0x0, 0x0)
readv(0xffffffffffffffff, 0x0, 0x0)
madvise(&(0x7f0000300000/0x3000)=nil, 0x3000, 0x0)
open(&(0x7f0000000000)='./bus\x00', 0x141042, 0x0)
r0 = open(&(0x7f000000fffa)='./bus\x00', 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000440)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x1, 0x0, 0x0)
mmap(&(0x7f0000001000/0xa000)=nil, 0xa000, 0x800002, 0x12, r0, 0x0)
r2 = open(&(0x7f0000000000)='./bus\x00', 0x800000141042, 0x0)
mlock(&(0x7f0000005000/0x1000)=nil, 0x1000)
ftruncate(r2, 0x200006)
mlock(&(0x7f0000003000/0x3000)=nil, 0x3002)
setgid(0xee00)
prctl$PR_GET_DUMPABLE(0x3)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
setgroups(0x54, 0x0)
rt_sigreturn()
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
r1 = gettid()
chdir(&(0x7f0000000100)='./file0\x00')
getxattr(&(0x7f0000000000)='./file0\x00', &(0x7f0000000040)=@random={'os2.', '\x00'}, 0x0, 0x0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit_group(0x0)
tkill(r1, 0x25)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = gettid()
r1 = getpgid(0x0)
setpgid(r1, 0x0)
rt_sigqueueinfo(r0, 0x8, &(0x7f0000000040))
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
io_destroy(0x0)
exit(0x0)
r0 = openat$full(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
pipe(&(0x7f0000000240)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
splice(r0, 0xfffffffffffffffe, r1, 0x0, 0x800000000e7b, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
prlimit64(0x0, 0x9, &(0x7f00000000c0), 0x0)
r0 = creat(&(0x7f0000000000)='./file0\x00', 0x108)
write$binfmt_elf64(r0, &(0x7f00000002c0)=ANY=[@ANYBLOB="7f454c4602010100000000800000ffef02003e0000000000000000000000000100000000000000000000000000000010000000200000380001"], 0x78)
execve(&(0x7f0000000040)='./file0\x00', 0x0, 0x0)
rt_sigreturn()
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
r1 = socket$inet6_udp(0xa, 0x2, 0x0)
sendmmsg$inet(r1, &(0x7f0000000080)=[{{&(0x7f0000000040)={0x2, 0x4e22, @empty}, 0x10, 0x0}}, {{&(0x7f0000000100)={0x2, 0x4e20, @private}, 0x10, 0x0, 0x0, &(0x7f0000000000)=[@ip_ttl], 0x8}}], 0x400000000000104, 0x0)
clone(0x3c004100, 0x0, 0x0, 0x0, 0x0)
r0 = openat$sysfs(0xffffffffffffff9c, &(0x7f0000000100)='/sys/kernel/debug', 0x0, 0x0)
renameat2(r0, &(0x7f0000000000)='./file0\x00', r0, &(0x7f0000000300)='./file0\x00', 0x2)
r1 = getpid()
rt_sigqueueinfo(r1, 0x39, &(0x7f0000000000))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = socket$inet(0x2, 0x2, 0x0)
bind$inet(r1, &(0x7f0000000480)={0x2, 0x1004e20, @dev={0xac, 0x14, 0x14, 0x41}}, 0x10)
connect$inet(r1, &(0x7f00000002c0)={0x2, 0x4e20, @empty}, 0x10)
write(r1, 0x0, 0x0)
recvmmsg(r1, &(0x7f0000000b40)=[{{0x0, 0x0, &(0x7f0000000780)=[{&(0x7f00000000c0)=""/73, 0x49}], 0x1}}, {{0x0, 0x0, 0x0, 0x4}}], 0x2, 0x0, 0x0)
clone(0x20016406dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = signalfd(0xffffffffffffffff, &(0x7f00000001c0), 0x8)
mkdir(&(0x7f0000000140)='./control\x00', 0x0)
close(r0)
r1 = inotify_init1(0x0)
fcntl$setstatus(r0, 0x4, 0x2c00)
r2 = gettid()
fcntl$setown(r1, 0x8, r2)
rt_sigtimedwait(&(0x7f0000000080), 0x0, 0x0, 0x8)
r3 = openat$zero(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r4 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2800004, 0x12, r4, 0x0)
preadv(r3, &(0x7f00000001c0)=[{0x0}], 0x1, 0x0, 0x0)
mmap(&(0x7f0000000000/0xb36000)=nil, 0xb36000, 0x1000007, 0x800000000009031, 0xffffffffffffffff, 0x0)
inotify_add_watch(r1, &(0x7f0000000000)='./control\x00', 0xa4000000)
rmdir(&(0x7f0000000100)='./control\x00')
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000140)='./file0\x00', 0x0)
mount(&(0x7f00000002c0)=ANY=[], &(0x7f00000000c0)='./file0\x00', &(0x7f0000000240)='sysfs\x00', 0x0, 0x0)
chdir(&(0x7f0000000200)='./file0\x00')
mkdir(&(0x7f0000000080)='./bus/../file0\x00', 0x0)
clone(0x20002044dfc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
exit(0x0)
r0 = gettid()
r1 = gettid()
tgkill(r0, r1, 0x24)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x4000)=nil, 0x4000, 0x4, 0x12, r0, 0x0)
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r1 = gettid()
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r2 = socket$packet(0x11, 0x2, 0x300)
fcntl$setownex(r2, 0xf, &(0x7f0000000000)={0x0, 0xffffffffffffffff})
r3 = gettid()
tkill(r3, 0x18)
r4 = gettid()
tgkill(r1, r4, 0x24)
r0 = creat(&(0x7f0000002680)='./file0\x00', 0x108)
write$binfmt_elf64(r0, &(0x7f00000002c0)=ANY=[@ANYBLOB="7f454c46"], 0x78)
ftruncate(r0, 0x4)
execveat(0xffffffffffffff9c, &(0x7f0000000280)='./file0\x00', 0x0, 0x0, 0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
write$binfmt_aout(r0, 0x0, 0x352)
socketpair$unix(0x1, 0x1, 0x0, &(0x7f0000000080)={0xffffffffffffffff, <r1=>0xffffffffffffffff})
fsetxattr$trusted_overlay_opaque(r1, &(0x7f0000000100), 0x0, 0x20, 0x0)
rt_sigreturn()
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = socket$inet_icmp_raw(0x2, 0x3, 0x1)
setsockopt$inet_int(r0, 0x0, 0x2, 0x0, 0x0)
rt_sigreturn()
perf_event_open(&(0x7f0000000040)={0x2, 0x80, 0x7f, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_bp={0x0}}, 0x0, 0x0, 0xffffffffffffffff, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r1, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
syz_open_procfs(0x0, &(0x7f0000000100)='fd/3\x00')
syz_open_procfs(0x0, &(0x7f0000000100)='fd/3\x00')
syz_emit_ethernet(0x5e, &(0x7f0000000080)={@random="1d39359e5c80", @random="7bb824d57ab1", @void, {@ipv6={0x86dd, @generic={0x0, 0x6, "4cfcef", 0x28, 0x3c, 0x0, @remote, @mcast2, {[@dstopts={0x0, 0x4, '\x00', [@generic={0x1f}, @padn={0x1, 0x3, [0x0, 0x0, 0x0]}, @ra, @hao={0xc9, 0x10, @mcast2}, @pad1]}]}}}}}, 0x0)
mknod(&(0x7f0000000100)='./file0\x00', 0x0, 0x0)
mount(&(0x7f0000000600)=ANY=[], &(0x7f0000000140)='./file0\x00', &(0x7f0000000680)='sysfs\x00', 0x0, 0x0)
rename(&(0x7f0000000180)='./file0/file0\x00', &(0x7f00000001c0)='./file0\x00')
clone(0x6300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000080)='.\x00', 0x0, 0x0)
symlinkat(&(0x7f00000000c0)='/', r0, &(0x7f0000d06ff8)='./file0\x00')
openat(r0, &(0x7f0000000000)='./file0\x00', 0x4000, 0x0)
r1 = gettid()
tkill(r1, 0x25)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
clone(0x200300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
wait4(0x0, 0x0, 0x80000002, 0x0)
r1 = getpid()
rt_tgsigqueueinfo(r1, r1, 0x16, &(0x7f0000000000))
ptrace(0x10, r1)
r2 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
preadv(r2, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
ptrace$setregs(0xd, r1, 0x0, &(0x7f0000000080)="be9ff483111ec7c05a6e35766a9c5cd98ed812fee8ee677c468e2d01bb01fd560342c1891c9b259ef048c5ac173518e9cd261fa6cbe6a89b00bbcac9c7a8fc13d6d5661f30c63f72be485d2065e695187bb1482dff9c9d341184640629dc64bb37212a404898297b90eb535ba521052c06a3f59c8a96155e941ed41bc723c4062d6dc6418cd0808ff3")
ptrace$getregset(0x4205, r1, 0x2, &(0x7f00000005c0)={0x0, 0x7ffffffff000})
syz_emit_ethernet(0x46, &(0x7f0000000000)={@empty, @broadcast, @void, {@ipv4={0x800, @icmp={{0x7, 0x4, 0x0, 0x0, 0x38, 0x0, 0x0, 0x0, 0x1, 0x0, @initdev={0xac, 0x1e, 0x0, 0x0}, @broadcast, {[@rr={0x7, 0x7, 0x2a, [@broadcast]}]}}, @source_quench={0x4, 0x0, 0x0, 0x0, {0x5, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @local, @multicast1}}}}}}, 0x0)
pipe2$9p(&(0x7f0000000000)={0xffffffffffffffff, <r0=>0xffffffffffffffff}, 0x0)
r1 = openat$null(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
splice(r1, &(0x7f0000000080), r0, 0x0, 0xfff, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f00000000c0)='/proc/self/exe\x00', 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r2 = syz_open_procfs(0x0, &(0x7f0000000040)='net/dev\x00')
preadv(r2, &(0x7f0000000080)=[{&(0x7f0000000840)=""/4096, 0x1000}], 0x1, 0x4ff, 0x0)
clone(0x14244100, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat(0xffffffffffffffff, &(0x7f0000000080)='/', 0x0, 0x0)
openat$incfs(r0, &(0x7f0000000040)='.pending_reads\x00', 0x1118e0, 0x0)
r1 = gettid()
tkill(r1, 0x18)
seccomp$SECCOMP_SET_MODE_FILTER_LISTENER(0x1, 0x0, &(0x7f00000002c0)={0x1, &(0x7f0000000280)=[{0x6, 0x0, 0x0, 0x7fffffff}]})
timer_create(0x0, 0x0, &(0x7f00000000c0))
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x1, 0x0, 0x0)
timer_delete(0x0)
r0 = socket$unix(0x1, 0x5, 0x0)
getsockopt$IP_VS_SO_GET_TIMEOUT(r0, 0x0, 0x486, 0x0, &(0x7f0000000200))
statx(0xffffffffffffff9c, 0x0, 0x0, 0x6e458ac4a8b83c0e, 0x0)
r0 = openat$zero(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x380000d, 0x12, r1, 0x0)
preadv(r0, &(0x7f0000000280)=[{0x0}], 0x1, 0x0, 0x0)
fchdir(r0)
clone(0x7300, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
capget(&(0x7f0000001800)={0x0, 0xffffffffffffffff}, &(0x7f0000001840))
rt_sigreturn()
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
mkdir(&(0x7f0000000080)='./file0\x00', 0x0)
mount(&(0x7f0000000340)=ANY=[], &(0x7f0000000100)='./file0\x00', &(0x7f00000001c0)='sysfs\x00', 0x1, 0x0)
creat(&(0x7f0000000000)='./file0/file0\x00', 0x0)
exit_group(0x0)
r0 = openat(0xffffffffffffffff, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x2, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = socket$inet6_udp(0xa, 0x2, 0x0)
sendmsg$inet(r1, &(0x7f0000001580)={&(0x7f0000000100)={0x2, 0x4e24, @empty}, 0x10, 0x0, 0x0, &(0x7f00000003c0)=[@ip_tos_int={{0x14}}], 0x18}, 0x0)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x3800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
mknod$loop(&(0x7f0000000040)='./file0\x00', 0x0, 0x1)
utime(&(0x7f0000000140)='./file0\x00', &(0x7f00000001c0))
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_tcp_int(r0, 0x6, 0x12, 0x0, 0x0)
clone(0x20002004ffc, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = socket$inet_tcp(0x2, 0x1, 0x0)
setsockopt$inet_group_source_req(r0, 0x6, 0x6, &(0x7f0000000080)={0x0, {{0x2, 0x0, @multicast1}}}, 0x108)
exit(0x0)
clone(0x4100, 0x0, 0x0, 0x0, 0x0)
r0 = openat(0xffffffffffffffff, &(0x7f000060cff8)='/', 0x0, 0x0)
r1 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800007, 0x12, r1, 0x0)
preadv(r1, &(0x7f0000000280), 0x1, 0x0, 0x0)
symlinkat(&(0x7f0000000280)='./file0\x00', r0, &(0x7f00000002c0)='./file0\x00')
rt_sigreturn()
rt_sigprocmask(0x0, &(0x7f0000000100)={[0xfffffffffffa]}, 0x0, 0x8)
clone(0x20204780, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
rt_sigaction(0xa, &(0x7f0000000180)={&(0x7f0000000000)="64460f1d98b4000000c442791c9b322333330f52580136f341e13cc461b057d7d8dbc4c27d78df40ded46566470f3a44d103c4a1375f3b", 0x0, 0x0}, 0x0, 0x8, &(0x7f00000002c0))
r0 = gettid()
sched_getscheduler(0xffffffffffffffff)
rt_sigqueueinfo(r0, 0xa, &(0x7f0000000040))
ppoll(0x0, 0x0, 0x0, &(0x7f00000000c0), 0x8)
r1 = syz_open_procfs(0xffffffffffffffff, &(0x7f0000000000)='gid_map\x00')
pwrite64(r1, 0x0, 0x0, 0x100000001)
r0 = socket$inet6_udp(0xa, 0x2, 0x0)
setsockopt$inet6_mreq(r0, 0x29, 0x15, &(0x7f0000000040)={@mcast1}, 0x14)
r0 = openat(0xffffffffffffff9c, &(0x7f0000000000)='/proc/self/exe\x00', 0x0, 0x0)
mmap(&(0x7f0000000000/0x800000)=nil, 0x800000, 0x1800003, 0x12, r0, 0x0)
preadv(r0, &(0x7f0000000280), 0x18, 0xd9f, 0x0)
r1 = open(&(0x7f0000000000)='./bus\x00', 0x141042, 0x0)
write$P9_RREADLINK(r1, &(0x7f0000000100)=ANY=[], 0x44)
r2 = eventfd(0x0)
sendfile(r2, r1, 0x0, 0x0)
r3 = open(&(0x7f0000000000)='./bus\x00', 0x141042, 0x0)
r4 = eventfd(0x0)
sendfile(r4, r3, 0x0, 0x7fff)
r0 = openat$null(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
recvfrom$unix(r0, 0x0, 0x0, 0x0, 0x0, 0x0)
syz_emit_ethernet(0x46, &(0x7f0000000780)={@random="5b9045686eee", @local, @void, {@ipv4={0x800, @icmp={{0x5, 0x4, 0x0, 0x0, 0x38, 0x0, 0x0, 0x0, 0x1, 0x0, @initdev={0xac, 0x1e, 0x0, 0x0}, @local}, @time_exceeded={0x3, 0x4, 0x0, 0x3, 0x0, 0x0, {0x5, 0x4, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2f, 0x0, @empty, @local}, "000086ddffffa98b"}}}}}, 0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$inet6_tcp_int(r0, 0x6, 0x3, 0x0, &(0x7f00000000c0))
clone(0xf38055be, 0x0, 0xfffffffffffffffe, 0x0, 0xffffffffffffffff)
r0 = openat$thread_pidfd(0xffffffffffffff9c, &(0x7f00000001c0), 0x0, 0x0)
fcntl$setstatus(r0, 0x4, 0x4000)
exit(0x0)
r0 = socket$inet6_tcp(0xa, 0x1, 0x0)
getsockopt$inet6_tcp_buf(r0, 0x6, 0x8, 0x0, &(0x7f0000001000))