// Package cap provides all the Linux Capabilities userspace library API
// bindings in native Go.
//
// Capabilities are a feature of the Linux kernel that allows fine
// grain permissions to perform privileged operations. Privileged
// operations are required to do irregular system level operations
// from code. You can read more about how Capabilities are intended to
// work here:
//
//	https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/33528.pdf
//
// This package supports native Go bindings for all the features
// described in that paper as well as supporting subsequent changes to
// the kernel for other styles of inheritable Capability.
//
// Some simple things you can do with this package are:
//
//	// Read and display the capabilities of the running process
//	c := cap.GetProc()
//	log.Printf("this process has these caps: %q", c)
//
//	// Drop any privilege a process might have (including for root,
//	// but note root 'owns' a lot of system files so a cap-limited
//	// root can still do considerable damage to a running system).
//	old := cap.GetProc()
//	empty := cap.NewSet()
//	if err := empty.SetProc(); err != nil {
//		log.Fatalf("failed to drop privilege: %q -> %q: %v", old, empty, err)
//	}
//	now := cap.GetProc()
//	if cf, _ := now.Compare(empty); cf != 0 {
//		log.Fatalf("failed to fully drop privilege: have=%q, wanted=%q", now, empty)
//	}
//
// The "cap" package operates with POSIX semantics for security
// state. That is all OS threads are kept in sync at all times. The
// package "kernel.org/pub/linux/libs/security/libcap/psx" is used to
// implement POSIX semantics system calls that manipulate thread state
// uniformly over the whole Go (and any CGo linked) process runtime.
//
// Note, if the Go runtime syscall interface contains the Linux
// variant syscall.AllThreadsSyscall() API (it debuted in go1.16; see
// https://github.com/golang/go/issues/1435 for its history) then the
// "libcap/psx" package will use that to invoke Capability setting
// system calls in pure Go binaries. With such an enhanced Go runtime,
// to force this behavior, use the CGO_ENABLED=0 environment variable.
//
// POSIX semantics are more secure than trying to manage privilege at
// a thread level when those threads share a common memory image as
// they do under Linux: it is trivial to exploit a vulnerability in
// one thread of a process to cause execution on any other
// thread. So, any imbalance in security state, in such cases will
// readily create an opportunity for a privilege escalation
// vulnerability.
//
// POSIX semantics also work well with Go, which deliberately tries to
// insulate the user from worrying about the number of OS threads that
// are actually running in their program. Indeed, Go can efficiently
// launch and manage tens of thousands of concurrent goroutines
// without bogging the program or wider system down. It does this by
// aggressively migrating idle threads to make progress on unblocked
// goroutines. So, inconsistent security state across OS threads can
// also lead to program misbehavior.
//
// The only exception to this process-wide common security state is
// the cap.Launcher related functionality. This briefly locks an OS
// thread to a goroutine in order to launch another executable - the
// robust implementation of this kind of support is quite subtle, so
// please read its documentation carefully, if you find that you need
// it.
//
// See https://sites.google.com/site/fullycapable/ for recent updates,
// some more complete walk-through examples of ways of using
// 'cap.Set's etc and information on how to file bugs.
75// 76// Copyright (c) 2019-21 Andrew G. Morgan <morgan@kernel.org> 77// 78// The cap and psx packages are licensed with a (you choose) BSD 79// 3-clause or GPL2. See LICENSE file for details. 80package cap // import "kernel.org/pub/linux/libs/security/libcap/cap" 81 82import ( 83 "errors" 84 "sort" 85 "sync" 86 "syscall" 87 "unsafe" 88) 89 90// Value is the type of a single capability (or permission) bit. 91type Value uint 92 93// Flag is the type of one of the three Value dimensions held in a 94// Set. It is also used in the (*IAB).Fill() method for changing the 95// Bounding and Ambient Vectors. 96type Flag uint 97 98// Effective, Permitted, Inheritable are the three Flags of Values 99// held in a Set. 100const ( 101 Effective Flag = iota 102 Permitted 103 Inheritable 104) 105 106// String identifies a Flag value by its conventional "e", "p" or "i" 107// string abbreviation. 108func (f Flag) String() string { 109 switch f { 110 case Effective: 111 return "e" 112 case Permitted: 113 return "p" 114 case Inheritable: 115 return "i" 116 default: 117 return "<Error>" 118 } 119} 120 121// data holds a 32-bit slice of the compressed bitmaps of capability 122// sets as understood by the kernel. 123type data [Inheritable + 1]uint32 124 125// Set is an opaque capabilities container for a set of system 126// capbilities. It holds individually addressable capability Value's 127// for the three capability Flag's. See GetFlag() and SetFlag() for 128// how to adjust them individually, and Clear() and ClearFlag() for 129// how to do bulk operations. 130// 131// For admin tasks associated with managing namespace specific file 132// capabilities, Set can also support a namespace-root-UID value which 133// defaults to zero. See GetNSOwner() and SetNSOwner(). 134type Set struct { 135 // mu protects all other members of a Set. 136 mu sync.RWMutex 137 138 // flat holds Flag Value bitmaps for all capabilities 139 // associated with this Set. 
140 flat []data 141 142 // Linux specific 143 nsRoot int 144} 145 146// Various known kernel magic values. 147const ( 148 kv1 = 0x19980330 // First iteration of process capabilities (32 bits). 149 kv2 = 0x20071026 // First iteration of process and file capabilities (64 bits) - deprecated. 150 kv3 = 0x20080522 // Most recently supported process and file capabilities (64 bits). 151) 152 153var ( 154 // startUp protects setting of the following values: magic, 155 // words, maxValues. 156 startUp sync.Once 157 158 // magic holds the preferred magic number for the kernel ABI. 159 magic uint32 160 161 // words holds the number of uint32's associated with each 162 // capability Flag for this session. 163 words int 164 165 // maxValues holds the number of bit values that are named by 166 // the running kernel. This is generally expected to match 167 // ValueCount which is autogenerated at packaging time. 168 maxValues uint 169) 170 171type header struct { 172 magic uint32 173 pid int32 174} 175 176// syscaller is a type for abstracting syscalls. The r* variants are 177// for reading state, and can be parallelized, the w* variants need to 178// be serialized so all OS threads can share state. 179type syscaller struct { 180 r3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) 181 w3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) 182 r6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno) 183 w6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno) 184} 185 186// caprcall provides a pointer etc wrapper for the system calls 187// associated with getcap. 
188//go:uintptrescapes 189func (sc *syscaller) caprcall(call uintptr, h *header, d []data) error { 190 x := uintptr(0) 191 if d != nil { 192 x = uintptr(unsafe.Pointer(&d[0])) 193 } 194 _, _, err := sc.r3(call, uintptr(unsafe.Pointer(h)), x, 0) 195 if err != 0 { 196 return err 197 } 198 return nil 199} 200 201// capwcall provides a pointer etc wrapper for the system calls 202// associated with setcap. 203//go:uintptrescapes 204func (sc *syscaller) capwcall(call uintptr, h *header, d []data) error { 205 x := uintptr(0) 206 if d != nil { 207 x = uintptr(unsafe.Pointer(&d[0])) 208 } 209 _, _, err := sc.w3(call, uintptr(unsafe.Pointer(h)), x, 0) 210 if err != 0 { 211 return err 212 } 213 return nil 214} 215 216// prctlrcall provides a wrapper for the prctl systemcalls that only 217// read kernel state. There is a limited number of arguments needed 218// and the caller should use 0 for those not needed. 219func (sc *syscaller) prctlrcall(prVal, v1, v2 uintptr) (int, error) { 220 r, _, err := sc.r3(syscall.SYS_PRCTL, prVal, v1, v2) 221 if err != 0 { 222 return int(r), err 223 } 224 return int(r), nil 225} 226 227// prctlrcall6 provides a wrapper for the prctl systemcalls that only 228// read kernel state and require 6 arguments - ambient cap API, I'm 229// looking at you. There is a limited number of arguments needed and 230// the caller should use 0 for those not needed. 231func (sc *syscaller) prctlrcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) { 232 r, _, err := sc.r6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5) 233 if err != 0 { 234 return int(r), err 235 } 236 return int(r), nil 237} 238 239// prctlwcall provides a wrapper for the prctl systemcalls that 240// write/modify kernel state. Where available, these will use the 241// POSIX semantics fixup system calls. There is a limited number of 242// arguments needed and the caller should use 0 for those not needed. 
243func (sc *syscaller) prctlwcall(prVal, v1, v2 uintptr) (int, error) { 244 r, _, err := sc.w3(syscall.SYS_PRCTL, prVal, v1, v2) 245 if err != 0 { 246 return int(r), err 247 } 248 return int(r), nil 249} 250 251// prctlwcall6 provides a wrapper for the prctl systemcalls that 252// write/modify kernel state and require 6 arguments - ambient cap 253// API, I'm looking at you. (Where available, these will use the POSIX 254// semantics fixup system calls). There is a limited number of 255// arguments needed and the caller should use 0 for those not needed. 256func (sc *syscaller) prctlwcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) { 257 r, _, err := sc.w6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5) 258 if err != 0 { 259 return int(r), err 260 } 261 return int(r), nil 262} 263 264// cInit performs the lazy identification of the capability vintage of 265// the running system. 266func (sc *syscaller) cInit() { 267 h := &header{ 268 magic: kv3, 269 } 270 sc.caprcall(syscall.SYS_CAPGET, h, nil) 271 magic = h.magic 272 switch magic { 273 case kv1: 274 words = 1 275 case kv2, kv3: 276 words = 2 277 default: 278 // Fall back to a known good version. 279 magic = kv3 280 words = 2 281 } 282 // Use the bounding set to evaluate which capabilities exist. 283 maxValues = uint(sort.Search(32*words, func(n int) bool { 284 _, err := GetBound(Value(n)) 285 return err != nil 286 })) 287 if maxValues == 0 { 288 // Fall back to using the largest value defined at build time. 289 maxValues = NamedCount 290 } 291} 292 293// MaxBits returns the number of kernel-named capabilities discovered 294// at runtime in the current system. 295func MaxBits() Value { 296 startUp.Do(multisc.cInit) 297 return Value(maxValues) 298} 299 300// NewSet returns an empty capability set. 
301func NewSet() *Set { 302 startUp.Do(multisc.cInit) 303 return &Set{ 304 flat: make([]data, words), 305 } 306} 307 308// ErrBadSet indicates a nil pointer was used for a *Set, or the 309// request of the Set is invalid in some way. 310var ErrBadSet = errors.New("bad capability set") 311 312// Dup returns a copy of the specified capability set. 313func (c *Set) Dup() (*Set, error) { 314 if c == nil || len(c.flat) == 0 { 315 return nil, ErrBadSet 316 } 317 n := NewSet() 318 c.mu.RLock() 319 defer c.mu.RUnlock() 320 copy(n.flat, c.flat) 321 n.nsRoot = c.nsRoot 322 return n, nil 323} 324 325// GetPID returns the capability set associated with the target process 326// id; pid=0 is an alias for current. 327func GetPID(pid int) (*Set, error) { 328 v := NewSet() 329 if err := multisc.caprcall(syscall.SYS_CAPGET, &header{magic: magic, pid: int32(pid)}, v.flat); err != nil { 330 return nil, err 331 } 332 return v, nil 333} 334 335// GetProc returns the capability Set of the current process. If the 336// kernel is unable to determine the Set associated with the current 337// process, the function panic()s. 338func GetProc() *Set { 339 c, err := GetPID(0) 340 if err != nil { 341 panic(err) 342 } 343 return c 344} 345 346func (sc *syscaller) setProc(c *Set) error { 347 if c == nil || len(c.flat) == 0 { 348 return ErrBadSet 349 } 350 return sc.capwcall(syscall.SYS_CAPSET, &header{magic: magic}, c.flat) 351} 352 353// SetProc attempts to set the capability Set of the current 354// process. The kernel will perform permission checks and an error 355// will be returned if the attempt fails. Should the attempt fail 356// no process capabilities will have been modified. 357// 358// Note, the general behavior of this call is to set the 359// process-shared capabilities. However, when called from a callback 360// function as part of a (*Launcher).Launch(), the call only sets the 361// capabilities of the thread being used to perform the launch. 
362func (c *Set) SetProc() error { 363 state, sc := scwStateSC() 364 defer scwSetState(launchBlocked, state, -1) 365 return sc.setProc(c) 366} 367 368// defines from uapi/linux/prctl.h 369const ( 370 prCapBSetRead = 23 371 prCapBSetDrop = 24 372) 373 374// GetBound determines if a specific capability is currently part of 375// the local bounding set. On systems where the bounding set Value is 376// not present, this function returns an error. 377func GetBound(val Value) (bool, error) { 378 v, err := multisc.prctlrcall(prCapBSetRead, uintptr(val), 0) 379 if err != nil { 380 return false, err 381 } 382 return v > 0, nil 383} 384 385//go:uintptrescapes 386func (sc *syscaller) dropBound(val ...Value) error { 387 for _, v := range val { 388 if _, err := sc.prctlwcall(prCapBSetDrop, uintptr(v), 0); err != nil { 389 return err 390 } 391 } 392 return nil 393} 394 395// DropBound attempts to suppress bounding set Values. The kernel will 396// never allow a bounding set Value bit to be raised once successfully 397// dropped. However, dropping requires the current process is 398// sufficiently capable (usually via cap.SETPCAP being raised in the 399// Effective flag of the process' Set). Note, the drops are performed 400// in order and if one bounding value cannot be dropped, the function 401// returns immediately with an error which may leave the system in an 402// ill-defined state. The caller can determine where things went wrong 403// using GetBound(). 404func DropBound(val ...Value) error { 405 state, sc := scwStateSC() 406 defer scwSetState(launchBlocked, state, -1) 407 return sc.dropBound(val...) 408} 409 410// defines from uapi/linux/prctl.h 411const ( 412 prCapAmbient = 47 413 414 prCapAmbientIsSet = 1 415 prCapAmbientRaise = 2 416 prCapAmbientLower = 3 417 prCapAmbientClearAll = 4 418) 419 420// GetAmbient determines if a specific capability is currently part of 421// the local ambient set. 
On systems where the ambient set Value is 422// not present, this function returns an error. 423func GetAmbient(val Value) (bool, error) { 424 r, err := multisc.prctlrcall6(prCapAmbient, prCapAmbientIsSet, uintptr(val), 0, 0, 0) 425 return r > 0, err 426} 427 428//go:uintptrescapes 429func (sc *syscaller) setAmbient(enable bool, val ...Value) error { 430 dir := uintptr(prCapAmbientLower) 431 if enable { 432 dir = prCapAmbientRaise 433 } 434 for _, v := range val { 435 _, err := sc.prctlwcall6(prCapAmbient, dir, uintptr(v), 0, 0, 0) 436 if err != nil { 437 return err 438 } 439 } 440 return nil 441} 442 443// SetAmbient attempts to set a specific Value bit to the state, 444// enable. This function will return an error if insufficient 445// permission is available to perform this task. The settings are 446// performed in order and the function returns immediately an error is 447// detected. Use GetAmbient() to unravel where things went 448// wrong. Note, the cap package manages an abstraction IAB that 449// captures all three inheritable vectors in a single type. Consider 450// using that. 451func SetAmbient(enable bool, val ...Value) error { 452 state, sc := scwStateSC() 453 defer scwSetState(launchBlocked, state, -1) 454 return sc.setAmbient(enable, val...) 455} 456 457func (sc *syscaller) resetAmbient() error { 458 var v bool 459 var err error 460 461 for c := Value(0); !v; c++ { 462 if v, err = GetAmbient(c); err != nil { 463 // no non-zero values found. 464 return nil 465 } 466 } 467 _, err = sc.prctlwcall6(prCapAmbient, prCapAmbientClearAll, 0, 0, 0, 0) 468 return err 469} 470 471// ResetAmbient attempts to ensure the Ambient set is fully 472// cleared. It works by first reading the set and if it finds any bits 473// raised it will attempt a reset. The test before attempting a reset 474// behavior is a workaround for situations where the Ambient API is 475// locked, but a reset is not actually needed. 
No Ambient bit not 476// already raised in both the Permitted and Inheritable Set is allowed 477// to be raised by the kernel. 478func ResetAmbient() error { 479 state, sc := scwStateSC() 480 defer scwSetState(launchBlocked, state, -1) 481 return sc.resetAmbient() 482} 483