• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Package cap provides all the Linux Capabilities userspace library API
2// bindings in native Go.
3//
4// Capabilities are a feature of the Linux kernel that allow fine
5// grain permissions to perform privileged operations. Privileged
6// operations are required to do irregular system level operations
7// from code. You can read more about how Capabilities are intended to
8// work here:
9//
10//   https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/33528.pdf
11//
12// This package supports native Go bindings for all the features
13// described in that paper as well as supporting subsequent changes to
14// the kernel for other styles of inheritable Capability.
15//
16// Some simple things you can do with this package are:
17//
18//   // Read and display the capabilities of the running process
19//   c := cap.GetProc()
//   log.Printf("this process has these caps: %q", c)
21//
22//   // Drop any privilege a process might have (including for root,
23//   // but note root 'owns' a lot of system files so a cap-limited
24//   // root can still do considerable damage to a running system).
25//   old := cap.GetProc()
26//   empty := cap.NewSet()
27//   if err := empty.SetProc(); err != nil {
28//       log.Fatalf("failed to drop privilege: %q -> %q: %v", old, empty, err)
29//   }
30//   now := cap.GetProc()
31//   if cf, _ := now.Compare(empty); cf != 0 {
32//       log.Fatalf("failed to fully drop privilege: have=%q, wanted=%q", now, empty)
33//   }
34//
35// The "cap" package operates with POSIX semantics for security
// state. That is, all OS threads are kept in sync at all times. The
37// package "kernel.org/pub/linux/libs/security/libcap/psx" is used to
38// implement POSIX semantics system calls that manipulate thread state
39// uniformly over the whole Go (and any CGo linked) process runtime.
40//
41// Note, if the Go runtime syscall interface contains the Linux
42// variant syscall.AllThreadsSyscall() API (it debuted in go1.16 see
43// https://github.com/golang/go/issues/1435 for its history) then the
44// "libcap/psx" package will use that to invoke Capability setting
45// system calls in pure Go binaries. With such an enhanced Go runtime,
46// to force this behavior, use the CGO_ENABLED=0 environment variable.
47//
48// POSIX semantics are more secure than trying to manage privilege at
49// a thread level when those threads share a common memory image as
50// they do under Linux: it is trivial to exploit a vulnerability in
// one thread of a process to cause execution on any other
52// thread. So, any imbalance in security state, in such cases will
53// readily create an opportunity for a privilege escalation
54// vulnerability.
55//
56// POSIX semantics also work well with Go, which deliberately tries to
57// insulate the user from worrying about the number of OS threads that
58// are actually running in their program. Indeed, Go can efficiently
59// launch and manage tens of thousands of concurrent goroutines
60// without bogging the program or wider system down. It does this by
61// aggressively migrating idle threads to make progress on unblocked
62// goroutines. So, inconsistent security state across OS threads can
63// also lead to program misbehavior.
64//
65// The only exception to this process-wide common security state is
66// the cap.Launcher related functionality. This briefly locks an OS
67// thread to a goroutine in order to launch another executable - the
68// robust implementation of this kind of support is quite subtle, so
69// please read its documentation carefully, if you find that you need
70// it.
71//
72// See https://sites.google.com/site/fullycapable/ for recent updates,
73// some more complete walk-through examples of ways of using
74// 'cap.Set's etc and information on how to file bugs.
75//
76// Copyright (c) 2019-21 Andrew G. Morgan <morgan@kernel.org>
77//
78// The cap and psx packages are licensed with a (you choose) BSD
79// 3-clause or GPL2. See LICENSE file for details.
80package cap // import "kernel.org/pub/linux/libs/security/libcap/cap"
81
82import (
83	"errors"
84	"sort"
85	"sync"
86	"syscall"
87	"unsafe"
88)
89
// Value is the type of a single capability (or permission) bit. It
// is the argument type used to name a capability when querying or
// changing the bounding and ambient sets (see GetBound, DropBound,
// GetAmbient and SetAmbient).
type Value uint
92
// Flag is the type of one of the three Value dimensions held in a
// Set: Effective, Permitted or Inheritable. It is also used in the
// (*IAB).Fill() method for changing the Bounding and Ambient Vectors.
type Flag uint
97
// Effective, Permitted, Inheritable are the three Flags of Values
// held in a Set. The data array type is sized as Inheritable + 1, so
// Inheritable must remain the last (largest) Flag declared here.
const (
	Effective Flag = iota
	Permitted
	Inheritable
)
105
106// String identifies a Flag value by its conventional "e", "p" or "i"
107// string abbreviation.
108func (f Flag) String() string {
109	switch f {
110	case Effective:
111		return "e"
112	case Permitted:
113		return "p"
114	case Inheritable:
115		return "i"
116	default:
117		return "<Error>"
118	}
119}
120
// data holds a 32-bit slice of the compressed bitmaps of capability
// sets as understood by the kernel. It contains one uint32 per Flag
// (the array length is Inheritable + 1).
type data [Inheritable + 1]uint32
124
// Set is an opaque capabilities container for a set of system
// capabilities. It holds individually addressable capability Values
// for the three capability Flags. See GetFlag() and SetFlag() for
// how to adjust them individually, and Clear() and ClearFlag() for
// how to do bulk operations.
//
// For admin tasks associated with managing namespace specific file
// capabilities, Set can also support a namespace-root-UID value which
// defaults to zero. See GetNSOwner() and SetNSOwner().
type Set struct {
	// mu protects all other members of a Set.
	mu sync.RWMutex

	// flat holds Flag Value bitmaps for all capabilities
	// associated with this Set. NewSet allocates it with one data
	// element per kernel word (the package-level words value).
	flat []data

	// Linux specific
	nsRoot int
}
145
// Various known kernel magic values. One of these is stored in the
// package-level magic variable by cInit and then placed in the
// header.magic field of each capget/capset call.
const (
	kv1 = 0x19980330 // First iteration of process capabilities (32 bits).
	kv2 = 0x20071026 // First iteration of process and file capabilities (64 bits) - deprecated.
	kv3 = 0x20080522 // Most recently supported process and file capabilities (64 bits).
)
152
var (
	// startUp protects setting of the following values: magic,
	// words, maxValues. MaxBits and NewSet both run cInit through
	// it before reading those values.
	startUp sync.Once

	// magic holds the preferred magic number for the kernel ABI.
	magic uint32

	// words holds the number of uint32's associated with each
	// capability Flag for this session. cInit sets it to 1 for
	// kv1 and 2 otherwise.
	words int

	// maxValues holds the number of bit values that are named by
	// the running kernel. This is generally expected to match
	// ValueCount which is autogenerated at packaging time.
	maxValues uint
)
170
// header is the first argument handed to the capget/capset system
// calls by caprcall and capwcall. Presumably its layout matches the
// kernel's capability header structure - confirm against capget(2).
type header struct {
	// magic is the capability ABI version (one of kv1, kv2, kv3).
	magic uint32
	// pid identifies the target process; GetPID documents 0 as an
	// alias for the current process.
	pid int32
}
175
// syscaller is a type for abstracting syscalls. The r* variants are
// for reading state, and can be parallelized, the w* variants need to
// be serialized so all OS threads can share state.
type syscaller struct {
	// r3 and w3 perform a system call taking three arguments.
	r3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno)
	w3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno)
	// r6 and w6 perform a system call taking six arguments.
	r6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno)
	w6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno)
}
185
186// caprcall provides a pointer etc wrapper for the system calls
187// associated with getcap.
188//go:uintptrescapes
189func (sc *syscaller) caprcall(call uintptr, h *header, d []data) error {
190	x := uintptr(0)
191	if d != nil {
192		x = uintptr(unsafe.Pointer(&d[0]))
193	}
194	_, _, err := sc.r3(call, uintptr(unsafe.Pointer(h)), x, 0)
195	if err != 0 {
196		return err
197	}
198	return nil
199}
200
201// capwcall provides a pointer etc wrapper for the system calls
202// associated with setcap.
203//go:uintptrescapes
204func (sc *syscaller) capwcall(call uintptr, h *header, d []data) error {
205	x := uintptr(0)
206	if d != nil {
207		x = uintptr(unsafe.Pointer(&d[0]))
208	}
209	_, _, err := sc.w3(call, uintptr(unsafe.Pointer(h)), x, 0)
210	if err != 0 {
211		return err
212	}
213	return nil
214}
215
216// prctlrcall provides a wrapper for the prctl systemcalls that only
217// read kernel state. There is a limited number of arguments needed
218// and the caller should use 0 for those not needed.
219func (sc *syscaller) prctlrcall(prVal, v1, v2 uintptr) (int, error) {
220	r, _, err := sc.r3(syscall.SYS_PRCTL, prVal, v1, v2)
221	if err != 0 {
222		return int(r), err
223	}
224	return int(r), nil
225}
226
227// prctlrcall6 provides a wrapper for the prctl systemcalls that only
228// read kernel state and require 6 arguments - ambient cap API, I'm
229// looking at you. There is a limited number of arguments needed and
230// the caller should use 0 for those not needed.
231func (sc *syscaller) prctlrcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) {
232	r, _, err := sc.r6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5)
233	if err != 0 {
234		return int(r), err
235	}
236	return int(r), nil
237}
238
239// prctlwcall provides a wrapper for the prctl systemcalls that
240// write/modify kernel state. Where available, these will use the
241// POSIX semantics fixup system calls. There is a limited number of
242// arguments needed and the caller should use 0 for those not needed.
243func (sc *syscaller) prctlwcall(prVal, v1, v2 uintptr) (int, error) {
244	r, _, err := sc.w3(syscall.SYS_PRCTL, prVal, v1, v2)
245	if err != 0 {
246		return int(r), err
247	}
248	return int(r), nil
249}
250
251// prctlwcall6 provides a wrapper for the prctl systemcalls that
252// write/modify kernel state and require 6 arguments - ambient cap
253// API, I'm looking at you. (Where available, these will use the POSIX
254// semantics fixup system calls). There is a limited number of
255// arguments needed and the caller should use 0 for those not needed.
256func (sc *syscaller) prctlwcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) {
257	r, _, err := sc.w6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5)
258	if err != 0 {
259		return int(r), err
260	}
261	return int(r), nil
262}
263
264// cInit performs the lazy identification of the capability vintage of
265// the running system.
266func (sc *syscaller) cInit() {
267	h := &header{
268		magic: kv3,
269	}
270	sc.caprcall(syscall.SYS_CAPGET, h, nil)
271	magic = h.magic
272	switch magic {
273	case kv1:
274		words = 1
275	case kv2, kv3:
276		words = 2
277	default:
278		// Fall back to a known good version.
279		magic = kv3
280		words = 2
281	}
282	// Use the bounding set to evaluate which capabilities exist.
283	maxValues = uint(sort.Search(32*words, func(n int) bool {
284		_, err := GetBound(Value(n))
285		return err != nil
286	}))
287	if maxValues == 0 {
288		// Fall back to using the largest value defined at build time.
289		maxValues = NamedCount
290	}
291}
292
293// MaxBits returns the number of kernel-named capabilities discovered
294// at runtime in the current system.
295func MaxBits() Value {
296	startUp.Do(multisc.cInit)
297	return Value(maxValues)
298}
299
300// NewSet returns an empty capability set.
301func NewSet() *Set {
302	startUp.Do(multisc.cInit)
303	return &Set{
304		flat: make([]data, words),
305	}
306}
307
// ErrBadSet indicates a nil pointer was used for a *Set, or the
// request of the Set is invalid in some way. In this file it is
// returned by (*Set).Dup and setProc when the Set is nil or has no
// flat storage.
var ErrBadSet = errors.New("bad capability set")
311
312// Dup returns a copy of the specified capability set.
313func (c *Set) Dup() (*Set, error) {
314	if c == nil || len(c.flat) == 0 {
315		return nil, ErrBadSet
316	}
317	n := NewSet()
318	c.mu.RLock()
319	defer c.mu.RUnlock()
320	copy(n.flat, c.flat)
321	n.nsRoot = c.nsRoot
322	return n, nil
323}
324
325// GetPID returns the capability set associated with the target process
326// id; pid=0 is an alias for current.
327func GetPID(pid int) (*Set, error) {
328	v := NewSet()
329	if err := multisc.caprcall(syscall.SYS_CAPGET, &header{magic: magic, pid: int32(pid)}, v.flat); err != nil {
330		return nil, err
331	}
332	return v, nil
333}
334
335// GetProc returns the capability Set of the current process. If the
336// kernel is unable to determine the Set associated with the current
337// process, the function panic()s.
338func GetProc() *Set {
339	c, err := GetPID(0)
340	if err != nil {
341		panic(err)
342	}
343	return c
344}
345
346func (sc *syscaller) setProc(c *Set) error {
347	if c == nil || len(c.flat) == 0 {
348		return ErrBadSet
349	}
350	return sc.capwcall(syscall.SYS_CAPSET, &header{magic: magic}, c.flat)
351}
352
353// SetProc attempts to set the capability Set of the current
354// process. The kernel will perform permission checks and an error
355// will be returned if the attempt fails. Should the attempt fail
356// no process capabilities will have been modified.
357//
358// Note, the general behavior of this call is to set the
359// process-shared capabilities. However, when called from a callback
360// function as part of a (*Launcher).Launch(), the call only sets the
361// capabilities of the thread being used to perform the launch.
362func (c *Set) SetProc() error {
363	state, sc := scwStateSC()
364	defer scwSetState(launchBlocked, state, -1)
365	return sc.setProc(c)
366}
367
// defines from uapi/linux/prctl.h
//
// These are the prctl operations used for the bounding set:
// prCapBSetRead is passed by GetBound to query a Value, and
// prCapBSetDrop is passed by dropBound to drop one.
const (
	prCapBSetRead = 23
	prCapBSetDrop = 24
)
373
374// GetBound determines if a specific capability is currently part of
375// the local bounding set. On systems where the bounding set Value is
376// not present, this function returns an error.
377func GetBound(val Value) (bool, error) {
378	v, err := multisc.prctlrcall(prCapBSetRead, uintptr(val), 0)
379	if err != nil {
380		return false, err
381	}
382	return v > 0, nil
383}
384
385//go:uintptrescapes
386func (sc *syscaller) dropBound(val ...Value) error {
387	for _, v := range val {
388		if _, err := sc.prctlwcall(prCapBSetDrop, uintptr(v), 0); err != nil {
389			return err
390		}
391	}
392	return nil
393}
394
395// DropBound attempts to suppress bounding set Values. The kernel will
396// never allow a bounding set Value bit to be raised once successfully
397// dropped. However, dropping requires the current process is
398// sufficiently capable (usually via cap.SETPCAP being raised in the
399// Effective flag of the process' Set). Note, the drops are performed
400// in order and if one bounding value cannot be dropped, the function
401// returns immediately with an error which may leave the system in an
402// ill-defined state. The caller can determine where things went wrong
403// using GetBound().
404func DropBound(val ...Value) error {
405	state, sc := scwStateSC()
406	defer scwSetState(launchBlocked, state, -1)
407	return sc.dropBound(val...)
408}
409
// defines from uapi/linux/prctl.h
//
// prCapAmbient is the prctl operation for the ambient set; the
// remaining values are the sub-commands passed as its first argument
// (IsSet by GetAmbient, Raise/Lower by setAmbient, ClearAll by
// resetAmbient).
const (
	prCapAmbient = 47

	prCapAmbientIsSet    = 1
	prCapAmbientRaise    = 2
	prCapAmbientLower    = 3
	prCapAmbientClearAll = 4
)
419
420// GetAmbient determines if a specific capability is currently part of
421// the local ambient set. On systems where the ambient set Value is
422// not present, this function returns an error.
423func GetAmbient(val Value) (bool, error) {
424	r, err := multisc.prctlrcall6(prCapAmbient, prCapAmbientIsSet, uintptr(val), 0, 0, 0)
425	return r > 0, err
426}
427
428//go:uintptrescapes
429func (sc *syscaller) setAmbient(enable bool, val ...Value) error {
430	dir := uintptr(prCapAmbientLower)
431	if enable {
432		dir = prCapAmbientRaise
433	}
434	for _, v := range val {
435		_, err := sc.prctlwcall6(prCapAmbient, dir, uintptr(v), 0, 0, 0)
436		if err != nil {
437			return err
438		}
439	}
440	return nil
441}
442
443// SetAmbient attempts to set a specific Value bit to the state,
444// enable. This function will return an error if insufficient
445// permission is available to perform this task. The settings are
446// performed in order and the function returns immediately an error is
447// detected. Use GetAmbient() to unravel where things went
448// wrong. Note, the cap package manages an abstraction IAB that
449// captures all three inheritable vectors in a single type. Consider
450// using that.
451func SetAmbient(enable bool, val ...Value) error {
452	state, sc := scwStateSC()
453	defer scwSetState(launchBlocked, state, -1)
454	return sc.setAmbient(enable, val...)
455}
456
457func (sc *syscaller) resetAmbient() error {
458	var v bool
459	var err error
460
461	for c := Value(0); !v; c++ {
462		if v, err = GetAmbient(c); err != nil {
463			// no non-zero values found.
464			return nil
465		}
466	}
467	_, err = sc.prctlwcall6(prCapAmbient, prCapAmbientClearAll, 0, 0, 0, 0)
468	return err
469}
470
471// ResetAmbient attempts to ensure the Ambient set is fully
472// cleared. It works by first reading the set and if it finds any bits
473// raised it will attempt a reset. The test before attempting a reset
474// behavior is a workaround for situations where the Ambient API is
475// locked, but a reset is not actually needed. No Ambient bit not
476// already raised in both the Permitted and Inheritable Set is allowed
477// to be raised by the kernel.
478func ResetAmbient() error {
479	state, sc := scwStateSC()
480	defer scwSetState(launchBlocked, state, -1)
481	return sc.resetAmbient()
482}
483