// Package cap provides all the Linux Capabilities userspace library API
// bindings in native Go.
//
// Capabilities are a feature of the Linux kernel that allow fine
// grain permissions to perform privileged operations. Privileged
// operations are required to do irregular system level operations
// from code. You can read more about how Capabilities are intended to
// work here:
//
//   https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/33528.pdf
//
// This package supports native Go bindings for all the features
// described in that paper as well as supporting subsequent changes to
// the kernel for other styles of inheritable Capability.
//
// Some simple things you can do with this package are:
//
//   // Read and display the capabilities of the running process
//   c := cap.GetProc()
//   log.Printf("this process has these caps: %q", c)
//
//   // Drop any privilege a process might have (including for root,
//   // but note root 'owns' a lot of system files so a cap-limited
//   // root can still do considerable damage to a running system).
//   old := cap.GetProc()
//   empty := cap.NewSet()
//   if err := empty.SetProc(); err != nil {
//       log.Fatalf("failed to drop privilege: %q -> %q: %v", old, empty, err)
//   }
//   now := cap.GetProc()
//   if cap.Differs(now.Compare(empty)) {
//       log.Fatalf("failed to fully drop privilege: have=%q, wanted=%q", now, empty)
//   }
//
// See https://sites.google.com/site/fullycapable/ for recent updates,
// some more complete walk-through examples of ways of using
// 'cap.Set's etc and information on how to file bugs.
//
// For CGo linked binaries, behind the scenes, the package
// "kernel.org/pub/linux/libs/security/libcap/psx" is used to perform
// POSIX semantics system calls that manipulate thread state
// uniformly over the whole Go (and CGo linked) process runtime.
43// 44// Note, if the Go runtime syscall interface contains the Linux 45// variant syscall.AllThreadsSyscall() API (it debuted in go1.16 see 46// https://github.com/golang/go/issues/1435 for its history) then 47// the "psx" package will use that to invoke Capability setting system 48// calls in pure Go binaries. In such an enhanced Go runtime, to force 49// this behavior, use the CGO_ENABLED=0 environment variable. 50// 51// 52// Copyright (c) 2019-21 Andrew G. Morgan <morgan@kernel.org> 53// 54// The cap and psx packages are licensed with a (you choose) BSD 55// 3-clause or GPL2. See LICENSE file for details. 56package cap // import "kernel.org/pub/linux/libs/security/libcap/cap" 57 58import ( 59 "errors" 60 "sort" 61 "sync" 62 "syscall" 63 "unsafe" 64) 65 66// Value is the type of a single capability (or permission) bit. 67type Value uint 68 69// Flag is the type of one of the three Value dimensions held in a 70// Set. It is also used in the (*IAB).Fill() method for changing the 71// Bounding and Ambient Vectors. 72type Flag uint 73 74// Effective, Permitted, Inheritable are the three Flags of Values 75// held in a Set. 76const ( 77 Effective Flag = iota 78 Permitted 79 Inheritable 80) 81 82// String identifies a Flag value by its conventional "e", "p" or "i" 83// string abbreviation. 84func (f Flag) String() string { 85 switch f { 86 case Effective: 87 return "e" 88 case Permitted: 89 return "p" 90 case Inheritable: 91 return "i" 92 default: 93 return "<Error>" 94 } 95} 96 97// data holds a 32-bit slice of the compressed bitmaps of capability 98// sets as understood by the kernel. 99type data [Inheritable + 1]uint32 100 101// Set is an opaque capabilities container for a set of system 102// capbilities. It holds individually addressable capability Value's 103// for the three capability Flag's. See GetFlag() and SetFlag() for 104// how to adjust them individually, and Clear() and ClearFlag() for 105// how to do bulk operations. 
//
// For admin tasks associated with managing namespace specific file
// capabilities, Set can also support a namespace-root-UID value which
// defaults to zero. See GetNSOwner() and SetNSOwner().
type Set struct {
	// mu protects all other members of a Set.
	mu sync.RWMutex

	// flat holds Flag Value bitmaps for all capabilities
	// associated with this Set.
	flat []data

	// Linux specific. NOTE(review): presumably this is the
	// namespace-root-UID mentioned in the type comment - confirm
	// against GetNSOwner()/SetNSOwner().
	nsRoot int
}

// Various known kernel magic values. One of these is placed in
// header.magic to tell the kernel which capability ABI is in use.
const (
	kv1 = 0x19980330 // First iteration of process capabilities (32 bits).
	kv2 = 0x20071026 // First iteration of process and file capabilities (64 bits) - deprecated.
	kv3 = 0x20080522 // Most recently supported process and file capabilities (64 bits).
)

var (
	// startUp protects setting of the following values: magic,
	// words, maxValues. They are all assigned by (*syscaller).cInit.
	startUp sync.Once

	// magic holds the preferred magic number for the kernel ABI.
	magic uint32

	// words holds the number of uint32's associated with each
	// capability Flag for this session.
	words int

	// maxValues holds the number of bit values that are named by
	// the running kernel. This is generally expected to match
	// ValueCount which is autogenerated at packaging time.
	maxValues uint
)

// header is the first argument handed to the capget/capset system
// calls by caprcall and capwcall: the ABI magic in use and the target
// process id (left at zero for the current process).
type header struct {
	magic uint32
	pid   int32
}

// scwMu is used to fully serialize the write system calls. Note, this
// is generally not necessary, but in the case of Launch we get into a
// situation where the launching thread is temporarily allowed to
// deviate from the kernel state of the rest of the runtime and
// allowing other threads to perform w* syscalls will potentially
// interfere with the launching process.
var scwMu sync.Mutex

// syscaller is a type for abstracting syscalls.
The r* variants are 161// for reading state, and can be parallelized, the w* variants need to 162// be serialized so all OS threads can share state. 163type syscaller struct { 164 r3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) 165 w3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) 166 r6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno) 167 w6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno) 168} 169 170// caprcall provides a pointer etc wrapper for the system calls 171// associated with getcap. 172//go:uintptrescapes 173func (sc *syscaller) caprcall(call uintptr, h *header, d []data) error { 174 x := uintptr(0) 175 if d != nil { 176 x = uintptr(unsafe.Pointer(&d[0])) 177 } 178 _, _, err := sc.r3(call, uintptr(unsafe.Pointer(h)), x, 0) 179 if err != 0 { 180 return err 181 } 182 return nil 183} 184 185// capwcall provides a pointer etc wrapper for the system calls 186// associated with setcap. 187//go:uintptrescapes 188func (sc *syscaller) capwcall(call uintptr, h *header, d []data) error { 189 x := uintptr(0) 190 if d != nil { 191 x = uintptr(unsafe.Pointer(&d[0])) 192 } 193 _, _, err := sc.w3(call, uintptr(unsafe.Pointer(h)), x, 0) 194 if err != 0 { 195 return err 196 } 197 return nil 198} 199 200// prctlrcall provides a wrapper for the prctl systemcalls that only 201// read kernel state. There is a limited number of arguments needed 202// and the caller should use 0 for those not needed. 203func (sc *syscaller) prctlrcall(prVal, v1, v2 uintptr) (int, error) { 204 r, _, err := sc.r3(syscall.SYS_PRCTL, prVal, v1, v2) 205 if err != 0 { 206 return int(r), err 207 } 208 return int(r), nil 209} 210 211// prctlrcall6 provides a wrapper for the prctl systemcalls that only 212// read kernel state and require 6 arguments - ambient cap API, I'm 213// looking at you. There is a limited number of arguments needed and 214// the caller should use 0 for those not needed. 
215func (sc *syscaller) prctlrcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) { 216 r, _, err := sc.r6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5) 217 if err != 0 { 218 return int(r), err 219 } 220 return int(r), nil 221} 222 223// prctlwcall provides a wrapper for the prctl systemcalls that 224// write/modify kernel state. Where available, these will use the 225// POSIX semantics fixup system calls. There is a limited number of 226// arguments needed and the caller should use 0 for those not needed. 227func (sc *syscaller) prctlwcall(prVal, v1, v2 uintptr) (int, error) { 228 r, _, err := sc.w3(syscall.SYS_PRCTL, prVal, v1, v2) 229 if err != 0 { 230 return int(r), err 231 } 232 return int(r), nil 233} 234 235// prctlwcall6 provides a wrapper for the prctl systemcalls that 236// write/modify kernel state and require 6 arguments - ambient cap 237// API, I'm looking at you. (Where available, these will use the POSIX 238// semantics fixup system calls). There is a limited number of 239// arguments needed and the caller should use 0 for those not needed. 240func (sc *syscaller) prctlwcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) { 241 r, _, err := sc.w6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5) 242 if err != 0 { 243 return int(r), err 244 } 245 return int(r), nil 246} 247 248// cInit perfoms the lazy identification of the capability vintage of 249// the running system. 250func (sc *syscaller) cInit() { 251 h := &header{ 252 magic: kv3, 253 } 254 sc.caprcall(syscall.SYS_CAPGET, h, nil) 255 magic = h.magic 256 switch magic { 257 case kv1: 258 words = 1 259 case kv2, kv3: 260 words = 2 261 default: 262 // Fall back to a known good version. 263 magic = kv3 264 words = 2 265 } 266 // Use the bounding set to evaluate which capabilities exist. 267 maxValues = uint(sort.Search(32*words, func(n int) bool { 268 _, err := GetBound(Value(n)) 269 return err != nil 270 })) 271 if maxValues == 0 { 272 // Fall back to using the largest value defined at build time. 
273 maxValues = NamedCount 274 } 275} 276 277// MaxBits returns the number of kernel-named capabilities discovered 278// at runtime in the current system. 279func MaxBits() Value { 280 startUp.Do(multisc.cInit) 281 return Value(maxValues) 282} 283 284// NewSet returns an empty capability set. 285func NewSet() *Set { 286 startUp.Do(multisc.cInit) 287 return &Set{ 288 flat: make([]data, words), 289 } 290} 291 292// ErrBadSet indicates a nil pointer was used for a *Set, or the 293// request of the Set is invalid in some way. 294var ErrBadSet = errors.New("bad capability set") 295 296// Dup returns a copy of the specified capability set. 297func (c *Set) Dup() (*Set, error) { 298 if c == nil || len(c.flat) == 0 { 299 return nil, ErrBadSet 300 } 301 n := NewSet() 302 c.mu.RLock() 303 defer c.mu.RUnlock() 304 copy(n.flat, c.flat) 305 n.nsRoot = c.nsRoot 306 return n, nil 307} 308 309// GetPID returns the capability set associated with the target process 310// id; pid=0 is an alias for current. 311func GetPID(pid int) (*Set, error) { 312 v := NewSet() 313 if err := multisc.caprcall(syscall.SYS_CAPGET, &header{magic: magic, pid: int32(pid)}, v.flat); err != nil { 314 return nil, err 315 } 316 return v, nil 317} 318 319// GetProc returns the capability Set of the current process. If the 320// kernel is unable to determine the Set associated with the current 321// process, the function panic()s. 322func GetProc() *Set { 323 c, err := GetPID(0) 324 if err != nil { 325 panic(err) 326 } 327 return c 328} 329 330func (sc *syscaller) setProc(c *Set) error { 331 if c == nil || len(c.flat) == 0 { 332 return ErrBadSet 333 } 334 return sc.capwcall(syscall.SYS_CAPSET, &header{magic: magic}, c.flat) 335} 336 337// SetProc attempts to set the capability Set of the current 338// process. The kernel will perform permission checks and an error 339// will be returned if the attempt fails. Should the attempt fail 340// no process capabilities will have been modified. 
341func (c *Set) SetProc() error { 342 scwMu.Lock() 343 defer scwMu.Unlock() 344 return multisc.setProc(c) 345} 346 347// defines from uapi/linux/prctl.h 348const ( 349 prCapBSetRead = 23 350 prCapBSetDrop = 24 351) 352 353// GetBound determines if a specific capability is currently part of 354// the local bounding set. On systems where the bounding set Value is 355// not present, this function returns an error. 356func GetBound(val Value) (bool, error) { 357 v, err := multisc.prctlrcall(prCapBSetRead, uintptr(val), 0) 358 if err != nil { 359 return false, err 360 } 361 return v > 0, nil 362} 363 364//go:uintptrescapes 365func (sc *syscaller) dropBound(val ...Value) error { 366 for _, v := range val { 367 if _, err := sc.prctlwcall(prCapBSetDrop, uintptr(v), 0); err != nil { 368 return err 369 } 370 } 371 return nil 372} 373 374// DropBound attempts to suppress bounding set Values. The kernel will 375// never allow a bounding set Value bit to be raised once successfully 376// dropped. However, dropping requires the current process is 377// sufficiently capable (usually via cap.SETPCAP being raised in the 378// Effective flag of the process' Set). Note, the drops are performed 379// in order and if one bounding value cannot be dropped, the function 380// returns immediately with an error which may leave the system in an 381// ill-defined state. The caller can determine where things went wrong 382// using GetBound(). 383func DropBound(val ...Value) error { 384 scwMu.Lock() 385 defer scwMu.Unlock() 386 return multisc.dropBound(val...) 387} 388 389// defines from uapi/linux/prctl.h 390const ( 391 prCapAmbient = 47 392 393 prCapAmbientIsSet = 1 394 prCapAmbientRaise = 2 395 prCapAmbientLower = 3 396 prCapAmbientClearAll = 4 397) 398 399// GetAmbient determines if a specific capability is currently part of 400// the local ambient set. On systems where the ambient set Value is 401// not present, this function returns an error. 
402func GetAmbient(val Value) (bool, error) { 403 r, err := multisc.prctlrcall6(prCapAmbient, prCapAmbientIsSet, uintptr(val), 0, 0, 0) 404 return r > 0, err 405} 406 407//go:uintptrescapes 408func (sc *syscaller) setAmbient(enable bool, val ...Value) error { 409 dir := uintptr(prCapAmbientLower) 410 if enable { 411 dir = prCapAmbientRaise 412 } 413 for _, v := range val { 414 _, err := sc.prctlwcall6(prCapAmbient, dir, uintptr(v), 0, 0, 0) 415 if err != nil { 416 return err 417 } 418 } 419 return nil 420} 421 422// SetAmbient attempts to set a specific Value bit to the state, 423// enable. This function will return an error if insufficient 424// permission is available to perform this task. The settings are 425// performed in order and the function returns immediately an error is 426// detected. Use GetAmbient() to unravel where things went 427// wrong. Note, the cap package manages an abstraction IAB that 428// captures all three inheritable vectors in a single type. Consider 429// using that. 430func SetAmbient(enable bool, val ...Value) error { 431 scwMu.Lock() 432 defer scwMu.Unlock() 433 return multisc.setAmbient(enable, val...) 434} 435 436func (sc *syscaller) resetAmbient() error { 437 var v bool 438 var err error 439 440 for c := Value(0); !v; c++ { 441 if v, err = GetAmbient(c); err != nil { 442 // no non-zero values found. 443 return nil 444 } 445 } 446 _, err = sc.prctlwcall6(prCapAmbient, prCapAmbientClearAll, 0, 0, 0, 0) 447 return err 448} 449 450// ResetAmbient attempts to ensure the Ambient set is fully 451// cleared. It works by first reading the set and if it finds any bits 452// raised it will attempt a reset. The test before attempting a reset 453// behavior is a workaround for situations where the Ambient API is 454// locked, but a reset is not actually needed. No Ambient bit not 455// already raised in both the Permitted and Inheritable Set is allowed 456// to be raised by the kernel. 
457func ResetAmbient() error { 458 scwMu.Lock() 459 defer scwMu.Unlock() 460 return multisc.resetAmbient() 461} 462