Update opencontainers/runc to v1.0.0-rc5

```
$ gvt delete github.com/opencontainers/runc/libcontainer/cgroups
$ gvt delete github.com/opencontainers/runc/libcontainer/configs
$ gvt delete github.com/opencontainers/runc/libcontainer/system
$ gvt delete github.com/opencontainers/runc/libcontainer/user
$ gvt delete github.com/opencontainers/runc/libcontainer/utils
$ gvt fetch --tag v1.0.0-rc5 github.com/opencontainers/runc/libcontainer
2018/07/23 17:08:18 Fetching: github.com/opencontainers/runc/libcontainer
2018/07/23 17:08:24 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/vishvananda/netlink
2018/07/23 17:08:24 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/golang.org/x/sys/unix
2018/07/23 17:08:24 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/cyphar/filepath-securejoin
2018/07/23 17:08:24 ·· Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/pkg/errors
2018/07/23 17:08:24 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/opencontainers/selinux/go-selinux/label
2018/07/23 17:08:25 ·· Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/opencontainers/selinux/go-selinux
2018/07/23 17:08:25 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/containerd/console
2018/07/23 17:08:25 ·· Fetching recursive dependency: github.com/opencontainers/runc/vendor/golang.org/x/sys/windows
2018/07/23 17:08:25 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/sirupsen/logrus
2018/07/23 17:08:25 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/godbus/dbus
2018/07/23 17:08:25 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/mrunalp/fileutils
2018/07/23 17:08:25 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/coreos/go-systemd/util
2018/07/23 17:08:25 ·· Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/coreos/pkg/dlopen
2018/07/23 17:08:25 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/golang/protobuf/proto
2018/07/23 17:08:25 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/syndtr/gocapability/capability
2018/07/23 17:08:25 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/coreos/go-systemd/dbus
2018/07/23 17:08:25 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/opencontainers/runtime-spec/specs-go
2018/07/23 17:08:25 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/seccomp/libseccomp-golang
2018/07/23 17:08:25 · Fetching recursive dependency: github.com/opencontainers/runc/vendor/github.com/docker/go-units
```
This commit is contained in:
Marc Carré
2018-07-23 17:09:16 +02:00
parent a74f203668
commit a82ba60760
477 changed files with 184518 additions and 3724 deletions

View File

@@ -0,0 +1,191 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
Copyright 2014 Docker, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@@ -0,0 +1,54 @@
// +build apparmor,linux
package apparmor
import (
"fmt"
"io/ioutil"
"os"
)
// IsEnabled returns true if apparmor is enabled for the host.
func IsEnabled() bool {
if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil && os.Getenv("container") == "" {
if _, err = os.Stat("/sbin/apparmor_parser"); err == nil {
buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled")
return err == nil && len(buf) > 1 && buf[0] == 'Y'
}
}
return false
}
func setprocattr(attr, value string) error {
// Under AppArmor you can only change your own attr, so use /proc/self/
// instead of /proc/<tid>/ like libapparmor does
path := fmt.Sprintf("/proc/self/attr/%s", attr)
f, err := os.OpenFile(path, os.O_WRONLY, 0)
if err != nil {
return err
}
defer f.Close()
_, err = fmt.Fprintf(f, "%s", value)
return err
}
// changeOnExec reimplements aa_change_onexec from libapparmor in Go
func changeOnExec(name string) error {
value := "exec " + name
if err := setprocattr("exec", value); err != nil {
return fmt.Errorf("apparmor failed to apply profile: %s", err)
}
return nil
}
// ApplyProfile will apply the profile with the specified name to the process after
// the next exec.
func ApplyProfile(name string) error {
if name == "" {
return nil
}
return changeOnExec(name)
}

View File

@@ -0,0 +1,20 @@
// +build !apparmor !linux
package apparmor
import (
"errors"
)
var ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported")
func IsEnabled() bool {
return false
}
func ApplyProfile(name string) error {
if name != "" {
return ErrApparmorNotEnabled
}
return nil
}

View File

@@ -0,0 +1,113 @@
// +build linux
package libcontainer
import (
"fmt"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/syndtr/gocapability/capability"
)
const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS
var capabilityMap map[string]capability.Cap
func init() {
capabilityMap = make(map[string]capability.Cap)
last := capability.CAP_LAST_CAP
// workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap
if last == capability.Cap(63) {
last = capability.CAP_BLOCK_SUSPEND
}
for _, cap := range capability.List() {
if cap > last {
continue
}
capKey := fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String()))
capabilityMap[capKey] = cap
}
}
func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilities, error) {
bounding := []capability.Cap{}
for _, c := range capConfig.Bounding {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
bounding = append(bounding, v)
}
effective := []capability.Cap{}
for _, c := range capConfig.Effective {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
effective = append(effective, v)
}
inheritable := []capability.Cap{}
for _, c := range capConfig.Inheritable {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
inheritable = append(inheritable, v)
}
permitted := []capability.Cap{}
for _, c := range capConfig.Permitted {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
permitted = append(permitted, v)
}
ambient := []capability.Cap{}
for _, c := range capConfig.Ambient {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
ambient = append(ambient, v)
}
pid, err := capability.NewPid(0)
if err != nil {
return nil, err
}
return &containerCapabilities{
bounding: bounding,
effective: effective,
inheritable: inheritable,
permitted: permitted,
ambient: ambient,
pid: pid,
}, nil
}
type containerCapabilities struct {
pid capability.Capabilities
bounding []capability.Cap
effective []capability.Cap
inheritable []capability.Cap
permitted []capability.Cap
ambient []capability.Cap
}
// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
func (c *containerCapabilities) ApplyBoundingSet() error {
c.pid.Clear(capability.BOUNDS)
c.pid.Set(capability.BOUNDS, c.bounding...)
return c.pid.Apply(capability.BOUNDS)
}
// Apply sets all the capabilities for the current process in the config.
func (c *containerCapabilities) ApplyCaps() error {
c.pid.Clear(allCapabilityTypes)
c.pid.Set(capability.BOUNDS, c.bounding...)
c.pid.Set(capability.PERMITTED, c.permitted...)
c.pid.Set(capability.INHERITABLE, c.inheritable...)
c.pid.Set(capability.EFFECTIVE, c.effective...)
c.pid.Set(capability.AMBIENT, c.ambient...)
return c.pid.Apply(allCapabilityTypes)
}

View File

@@ -9,7 +9,7 @@ import (
)
type Manager interface {
// Apply cgroup configuration to the process with the specified pid
// Applies cgroup configuration to the process with the specified pid
Apply(pid int) error
// Returns the PIDs inside the cgroup set
@@ -27,9 +27,9 @@ type Manager interface {
// Destroys the cgroup set
Destroy() error
// NewCgroupManager() and LoadCgroupManager() require following attributes:
// The option func SystemdCgroups() and Cgroupfs() require following attributes:
// Paths map[string]string
// Cgroups *cgroups.Cgroup
// Cgroups *configs.Cgroup
// Paths maps cgroup subsystem to path at which it is mounted.
// Cgroups specifies specific cgroup settings for the various subsystems
@@ -37,7 +37,7 @@ type Manager interface {
// restore the object later.
GetPaths() map[string]string
// Set the cgroup as configured.
// Sets the cgroup as configured.
Set(container *configs.Config) error
}

View File

@@ -1,18 +0,0 @@
// +build linux
package cgroups
import (
"testing"
)
func TestParseCgroups(t *testing.T) {
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
t.Fatal(err)
}
if _, ok := cgroups["cpu"]; !ok {
t.Fail()
}
}

View File

@@ -9,11 +9,11 @@ import (
"io/ioutil"
"os"
"path/filepath"
"strconv"
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
)
var (
@@ -30,8 +30,8 @@ var (
&NetPrioGroup{},
&PerfEventGroup{},
&FreezerGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
}
CgroupProcesses = "cgroup.procs"
HugePageSizes, _ = cgroups.GetHugePageSize()
)
@@ -104,6 +104,8 @@ func (m *Manager) Apply(pid int) (err error) {
if m.Cgroups == nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
var c = m.Cgroups
@@ -112,8 +114,8 @@ func (m *Manager) Apply(pid int) (err error) {
return err
}
m.Paths = make(map[string]string)
if c.Paths != nil {
paths := make(map[string]string)
for name, path := range c.Paths {
_, err := d.path(name)
if err != nil {
@@ -122,35 +124,39 @@ func (m *Manager) Apply(pid int) (err error) {
}
return err
}
paths[name] = path
m.Paths[name] = path
}
m.Paths = paths
return cgroups.EnterPid(m.Paths, pid)
}
paths := make(map[string]string)
defer func() {
if err != nil {
cgroups.RemovePaths(paths)
}
}()
for _, sys := range subsystems {
if err := sys.Apply(d); err != nil {
return err
}
// TODO: Apply should, ideally, be reentrant or be broken up into a separate
// create and join phase so that the cgroup hierarchy for a container can be
// created then join consists of writing the process pids to cgroup.procs
p, err := d.path(sys.Name())
if err != nil {
if cgroups.IsNotFound(err) {
// The non-presence of the devices subsystem is
// considered fatal for security reasons.
if cgroups.IsNotFound(err) && sys.Name() != "devices" {
continue
}
return err
}
paths[sys.Name()] = p
m.Paths[sys.Name()] = p
if err := sys.Apply(d); err != nil {
if os.IsPermission(err) && m.Cgroups.Path == "" {
// If we didn't set a cgroup path, then let's defer the error here
// until we know whether we have set limits or not.
// If we hadn't set limits, then it's ok that we couldn't join this cgroup, because
// it will have the same limits as its parent.
delete(m.Paths, sys.Name())
continue
}
return err
}
}
m.Paths = paths
return nil
}
@@ -191,19 +197,20 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
}
func (m *Manager) Set(container *configs.Config) error {
for _, sys := range subsystems {
// Generate fake cgroup data.
d, err := getCgroupData(container.Cgroups, -1)
if err != nil {
return err
}
// Get the path, but don't error out if the cgroup wasn't found.
path, err := d.path(sys.Name())
if err != nil && !cgroups.IsNotFound(err) {
return err
}
// If Paths are set, then we are just joining cgroups paths
// and there is no need to set any values.
if m.Cgroups.Paths != nil {
return nil
}
paths := m.GetPaths()
for _, sys := range subsystems {
path := paths[sys.Name()]
if err := sys.Set(path, container.Cgroups); err != nil {
if path == "" {
// cgroup never applied
return fmt.Errorf("cannot set limits on the %s cgroup, as the container has not joined it", sys.Name())
}
return err
}
}
@@ -219,14 +226,8 @@ func (m *Manager) Set(container *configs.Config) error {
// Freeze toggles the container's freezer cgroup depending on the state
// provided
func (m *Manager) Freeze(state configs.FreezerState) error {
d, err := getCgroupData(m.Cgroups, 0)
if err != nil {
return err
}
dir, err := d.path("freezer")
if err != nil {
return err
}
paths := m.GetPaths()
dir := paths["freezer"]
prevState := m.Cgroups.Resources.Freezer
m.Cgroups.Resources.Freezer = state
freezer, err := subsystems.Get("freezer")
@@ -242,28 +243,13 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
}
func (m *Manager) GetPids() ([]int, error) {
dir, err := getCgroupPath(m.Cgroups)
if err != nil {
return nil, err
}
return cgroups.GetPids(dir)
paths := m.GetPaths()
return cgroups.GetPids(paths["devices"])
}
func (m *Manager) GetAllPids() ([]int, error) {
dir, err := getCgroupPath(m.Cgroups)
if err != nil {
return nil, err
}
return cgroups.GetAllPids(dir)
}
func getCgroupPath(c *configs.Cgroup) (string, error) {
d, err := getCgroupData(c, 0)
if err != nil {
return "", err
}
return d.path("devices")
paths := m.GetPaths()
return cgroups.GetAllPids(paths["devices"])
}
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
@@ -276,38 +262,26 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
}
innerPath := c.Path
// XXX: Do not remove this code. Path safety is important! -- cyphar
cgPath := libcontainerUtils.CleanPath(c.Path)
cgParent := libcontainerUtils.CleanPath(c.Parent)
cgName := libcontainerUtils.CleanPath(c.Name)
innerPath := cgPath
if innerPath == "" {
innerPath = filepath.Join(c.Parent, c.Name)
innerPath = filepath.Join(cgParent, cgName)
}
return &cgroupData{
root: root,
innerPath: c.Path,
innerPath: innerPath,
config: c,
pid: pid,
}, nil
}
func (raw *cgroupData) parentPath(subsystem, mountpoint, root string) (string, error) {
// Use GetThisCgroupDir instead of GetInitCgroupDir, because the creating
// process could in container and shared pid namespace with host, and
// /proc/1/cgroup could point to whole other world of cgroups.
initPath, err := cgroups.GetThisCgroupDir(subsystem)
if err != nil {
return "", err
}
// This is needed for nested containers, because in /proc/self/cgroup we
// see pathes from host, which don't exist in container.
relDir, err := filepath.Rel(root, initPath)
if err != nil {
return "", err
}
return filepath.Join(mountpoint, relDir), nil
}
func (raw *cgroupData) path(subsystem string) (string, error) {
mnt, root, err := cgroups.FindCgroupMountpointAndRoot(subsystem)
mnt, err := cgroups.FindCgroupMountpoint(subsystem)
// If we didn't mount the subsystem, there is no point we make the path.
if err != nil {
return "", err
@@ -315,11 +289,14 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(raw.innerPath) {
// Sometimes subsystems can be mounted togethger as 'cpu,cpuacct'.
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
}
parentPath, err := raw.parentPath(subsystem, mnt, root)
// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
// process could in container and shared pid namespace with host, and
// /proc/1/cgroup could point to whole other world of cgroups.
parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
if err != nil {
return "", err
}
@@ -335,7 +312,7 @@ func (raw *cgroupData) join(subsystem string) (string, error) {
if err := os.MkdirAll(path, 0755); err != nil {
return "", err
}
if err := writeFile(path, CgroupProcesses, strconv.Itoa(raw.pid)); err != nil {
if err := cgroups.WriteCgroupProc(path, raw.pid); err != nil {
return "", err
}
return path, nil
@@ -345,9 +322,12 @@ func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s.", file)
return fmt.Errorf("no such directory for %s", file)
}
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", data, file, err)
}
return nil
}
func readFile(dir, file string) (string, error) {
@@ -365,8 +345,8 @@ func removePath(p string, err error) error {
return nil
}
func CheckCpushares(path string, c int64) error {
var cpuShares int64
func CheckCpushares(path string, c uint64) error {
var cpuShares uint64
if c == 0 {
return nil

View File

@@ -1,636 +0,0 @@
// +build linux
package fs
import (
"strconv"
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
const (
sectorsRecursiveContents = `8:0 1024`
serviceBytesRecursiveContents = `8:0 Read 100
8:0 Write 200
8:0 Sync 300
8:0 Async 500
8:0 Total 500
Total 500`
servicedRecursiveContents = `8:0 Read 10
8:0 Write 40
8:0 Sync 20
8:0 Async 30
8:0 Total 50
Total 50`
queuedRecursiveContents = `8:0 Read 1
8:0 Write 4
8:0 Sync 2
8:0 Async 3
8:0 Total 5
Total 5`
serviceTimeRecursiveContents = `8:0 Read 173959
8:0 Write 0
8:0 Sync 0
8:0 Async 173959
8:0 Total 17395
Total 17395`
waitTimeRecursiveContents = `8:0 Read 15571
8:0 Write 0
8:0 Sync 0
8:0 Async 15571
8:0 Total 15571`
mergedRecursiveContents = `8:0 Read 5
8:0 Write 10
8:0 Sync 0
8:0 Async 0
8:0 Total 15
Total 15`
timeRecursiveContents = `8:0 8`
throttleServiceBytes = `8:0 Read 11030528
8:0 Write 23
8:0 Sync 42
8:0 Async 11030528
8:0 Total 11030528
252:0 Read 11030528
252:0 Write 23
252:0 Sync 42
252:0 Async 11030528
252:0 Total 11030528
Total 22061056`
throttleServiced = `8:0 Read 164
8:0 Write 23
8:0 Sync 42
8:0 Async 164
8:0 Total 164
252:0 Read 164
252:0 Write 23
252:0 Sync 42
252:0 Async 164
252:0 Total 164
Total 328`
)
func appendBlkioStatEntry(blkioStatEntries *[]cgroups.BlkioStatEntry, major, minor, value uint64, op string) {
*blkioStatEntries = append(*blkioStatEntries, cgroups.BlkioStatEntry{Major: major, Minor: minor, Value: value, Op: op})
}
func TestBlkioSetWeight(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
const (
weightBefore = 100
weightAfter = 200
)
helper.writeFileContents(map[string]string{
"blkio.weight": strconv.Itoa(weightBefore),
})
helper.CgroupData.config.Resources.BlkioWeight = weightAfter
blkio := &BlkioGroup{}
if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamUint(helper.CgroupPath, "blkio.weight")
if err != nil {
t.Fatalf("Failed to parse blkio.weight - %s", err)
}
if value != weightAfter {
t.Fatal("Got the wrong value, set blkio.weight failed.")
}
}
func TestBlkioSetWeightDevice(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
const (
weightDeviceBefore = "8:0 400"
)
wd := configs.NewWeightDevice(8, 0, 500, 0)
weightDeviceAfter := wd.WeightString()
helper.writeFileContents(map[string]string{
"blkio.weight_device": weightDeviceBefore,
})
helper.CgroupData.config.Resources.BlkioWeightDevice = []*configs.WeightDevice{wd}
blkio := &BlkioGroup{}
if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamString(helper.CgroupPath, "blkio.weight_device")
if err != nil {
t.Fatalf("Failed to parse blkio.weight_device - %s", err)
}
if value != weightDeviceAfter {
t.Fatal("Got the wrong value, set blkio.weight_device failed.")
}
}
// regression #274
func TestBlkioSetMultipleWeightDevice(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
const (
weightDeviceBefore = "8:0 400"
)
wd1 := configs.NewWeightDevice(8, 0, 500, 0)
wd2 := configs.NewWeightDevice(8, 16, 500, 0)
// we cannot actually set and check both because normal ioutil.WriteFile
// when writing to cgroup file will overwrite the whole file content instead
// of updating it as the kernel is doing. Just check the second device
// is present will suffice for the test to ensure multiple writes are done.
weightDeviceAfter := wd2.WeightString()
helper.writeFileContents(map[string]string{
"blkio.weight_device": weightDeviceBefore,
})
helper.CgroupData.config.Resources.BlkioWeightDevice = []*configs.WeightDevice{wd1, wd2}
blkio := &BlkioGroup{}
if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamString(helper.CgroupPath, "blkio.weight_device")
if err != nil {
t.Fatalf("Failed to parse blkio.weight_device - %s", err)
}
if value != weightDeviceAfter {
t.Fatal("Got the wrong value, set blkio.weight_device failed.")
}
}
func TestBlkioStats(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
"blkio.io_serviced_recursive": servicedRecursiveContents,
"blkio.io_queued_recursive": queuedRecursiveContents,
"blkio.io_service_time_recursive": serviceTimeRecursiveContents,
"blkio.io_wait_time_recursive": waitTimeRecursiveContents,
"blkio.io_merged_recursive": mergedRecursiveContents,
"blkio.time_recursive": timeRecursiveContents,
"blkio.sectors_recursive": sectorsRecursiveContents,
})
blkio := &BlkioGroup{}
actualStats := *cgroups.NewStats()
err := blkio.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatal(err)
}
// Verify expected stats.
expectedStats := cgroups.BlkioStats{}
appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 1024, "")
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 100, "Read")
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 200, "Write")
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 300, "Sync")
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Async")
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Total")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 10, "Read")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 40, "Write")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 20, "Sync")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 30, "Async")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 50, "Total")
appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 1, "Read")
appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Write")
appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Sync")
appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Async")
appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Total")
appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read")
appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write")
appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync")
appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Async")
appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 17395, "Total")
appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Read")
appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write")
appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync")
appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Async")
appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Total")
appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 5, "Read")
appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 10, "Write")
appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync")
appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async")
appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 15, "Total")
appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 8, "")
expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats)
}
func TestBlkioStatsNoSectorsFile(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
"blkio.io_serviced_recursive": servicedRecursiveContents,
"blkio.io_queued_recursive": queuedRecursiveContents,
"blkio.io_service_time_recursive": serviceTimeRecursiveContents,
"blkio.io_wait_time_recursive": waitTimeRecursiveContents,
"blkio.io_merged_recursive": mergedRecursiveContents,
"blkio.time_recursive": timeRecursiveContents,
})
blkio := &BlkioGroup{}
actualStats := *cgroups.NewStats()
err := blkio.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatalf("Failed unexpectedly: %s", err)
}
}
func TestBlkioStatsNoServiceBytesFile(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"blkio.io_serviced_recursive": servicedRecursiveContents,
"blkio.io_queued_recursive": queuedRecursiveContents,
"blkio.sectors_recursive": sectorsRecursiveContents,
"blkio.io_service_time_recursive": serviceTimeRecursiveContents,
"blkio.io_wait_time_recursive": waitTimeRecursiveContents,
"blkio.io_merged_recursive": mergedRecursiveContents,
"blkio.time_recursive": timeRecursiveContents,
})
blkio := &BlkioGroup{}
actualStats := *cgroups.NewStats()
err := blkio.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatalf("Failed unexpectedly: %s", err)
}
}
func TestBlkioStatsNoServicedFile(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
"blkio.io_queued_recursive": queuedRecursiveContents,
"blkio.sectors_recursive": sectorsRecursiveContents,
"blkio.io_service_time_recursive": serviceTimeRecursiveContents,
"blkio.io_wait_time_recursive": waitTimeRecursiveContents,
"blkio.io_merged_recursive": mergedRecursiveContents,
"blkio.time_recursive": timeRecursiveContents,
})
blkio := &BlkioGroup{}
actualStats := *cgroups.NewStats()
err := blkio.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatalf("Failed unexpectedly: %s", err)
}
}
func TestBlkioStatsNoQueuedFile(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
"blkio.io_serviced_recursive": servicedRecursiveContents,
"blkio.sectors_recursive": sectorsRecursiveContents,
"blkio.io_service_time_recursive": serviceTimeRecursiveContents,
"blkio.io_wait_time_recursive": waitTimeRecursiveContents,
"blkio.io_merged_recursive": mergedRecursiveContents,
"blkio.time_recursive": timeRecursiveContents,
})
blkio := &BlkioGroup{}
actualStats := *cgroups.NewStats()
err := blkio.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatalf("Failed unexpectedly: %s", err)
}
}
func TestBlkioStatsNoServiceTimeFile(t *testing.T) {
if testing.Short() {
t.Skip("skipping test in short mode.")
}
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
"blkio.io_serviced_recursive": servicedRecursiveContents,
"blkio.io_queued_recursive": queuedRecursiveContents,
"blkio.io_wait_time_recursive": waitTimeRecursiveContents,
"blkio.io_merged_recursive": mergedRecursiveContents,
"blkio.time_recursive": timeRecursiveContents,
"blkio.sectors_recursive": sectorsRecursiveContents,
})
blkio := &BlkioGroup{}
actualStats := *cgroups.NewStats()
err := blkio.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatalf("Failed unexpectedly: %s", err)
}
}
func TestBlkioStatsNoWaitTimeFile(t *testing.T) {
if testing.Short() {
t.Skip("skipping test in short mode.")
}
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
"blkio.io_serviced_recursive": servicedRecursiveContents,
"blkio.io_queued_recursive": queuedRecursiveContents,
"blkio.io_service_time_recursive": serviceTimeRecursiveContents,
"blkio.io_merged_recursive": mergedRecursiveContents,
"blkio.time_recursive": timeRecursiveContents,
"blkio.sectors_recursive": sectorsRecursiveContents,
})
blkio := &BlkioGroup{}
actualStats := *cgroups.NewStats()
err := blkio.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatalf("Failed unexpectedly: %s", err)
}
}
func TestBlkioStatsNoMergedFile(t *testing.T) {
if testing.Short() {
t.Skip("skipping test in short mode.")
}
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
"blkio.io_serviced_recursive": servicedRecursiveContents,
"blkio.io_queued_recursive": queuedRecursiveContents,
"blkio.io_service_time_recursive": serviceTimeRecursiveContents,
"blkio.io_wait_time_recursive": waitTimeRecursiveContents,
"blkio.time_recursive": timeRecursiveContents,
"blkio.sectors_recursive": sectorsRecursiveContents,
})
blkio := &BlkioGroup{}
actualStats := *cgroups.NewStats()
err := blkio.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatalf("Failed unexpectedly: %s", err)
}
}
func TestBlkioStatsNoTimeFile(t *testing.T) {
if testing.Short() {
t.Skip("skipping test in short mode.")
}
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"blkio.io_service_bytes_recursive": serviceBytesRecursiveContents,
"blkio.io_serviced_recursive": servicedRecursiveContents,
"blkio.io_queued_recursive": queuedRecursiveContents,
"blkio.io_service_time_recursive": serviceTimeRecursiveContents,
"blkio.io_wait_time_recursive": waitTimeRecursiveContents,
"blkio.io_merged_recursive": mergedRecursiveContents,
"blkio.sectors_recursive": sectorsRecursiveContents,
})
blkio := &BlkioGroup{}
actualStats := *cgroups.NewStats()
err := blkio.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatalf("Failed unexpectedly: %s", err)
}
}
func TestBlkioStatsUnexpectedNumberOfFields(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"blkio.io_service_bytes_recursive": "8:0 Read 100 100",
"blkio.io_serviced_recursive": servicedRecursiveContents,
"blkio.io_queued_recursive": queuedRecursiveContents,
"blkio.sectors_recursive": sectorsRecursiveContents,
"blkio.io_service_time_recursive": serviceTimeRecursiveContents,
"blkio.io_wait_time_recursive": waitTimeRecursiveContents,
"blkio.io_merged_recursive": mergedRecursiveContents,
"blkio.time_recursive": timeRecursiveContents,
})
blkio := &BlkioGroup{}
actualStats := *cgroups.NewStats()
err := blkio.GetStats(helper.CgroupPath, &actualStats)
if err == nil {
t.Fatal("Expected to fail, but did not")
}
}
func TestBlkioStatsUnexpectedFieldType(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"blkio.io_service_bytes_recursive": "8:0 Read Write",
"blkio.io_serviced_recursive": servicedRecursiveContents,
"blkio.io_queued_recursive": queuedRecursiveContents,
"blkio.sectors_recursive": sectorsRecursiveContents,
"blkio.io_service_time_recursive": serviceTimeRecursiveContents,
"blkio.io_wait_time_recursive": waitTimeRecursiveContents,
"blkio.io_merged_recursive": mergedRecursiveContents,
"blkio.time_recursive": timeRecursiveContents,
})
blkio := &BlkioGroup{}
actualStats := *cgroups.NewStats()
err := blkio.GetStats(helper.CgroupPath, &actualStats)
if err == nil {
t.Fatal("Expected to fail, but did not")
}
}
func TestNonCFQBlkioStats(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"blkio.io_service_bytes_recursive": "",
"blkio.io_serviced_recursive": "",
"blkio.io_queued_recursive": "",
"blkio.sectors_recursive": "",
"blkio.io_service_time_recursive": "",
"blkio.io_wait_time_recursive": "",
"blkio.io_merged_recursive": "",
"blkio.time_recursive": "",
"blkio.throttle.io_service_bytes": throttleServiceBytes,
"blkio.throttle.io_serviced": throttleServiced,
})
blkio := &BlkioGroup{}
actualStats := *cgroups.NewStats()
err := blkio.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatal(err)
}
// Verify expected stats.
expectedStats := cgroups.BlkioStats{}
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Read")
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 23, "Write")
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 42, "Sync")
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Async")
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Total")
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Read")
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 23, "Write")
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 42, "Sync")
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Async")
appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Total")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Read")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 23, "Write")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 42, "Sync")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Async")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Total")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Read")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 23, "Write")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 42, "Sync")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Async")
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Total")
expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats)
}
func TestBlkioSetThrottleReadBpsDevice(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
const (
throttleBefore = `8:0 1024`
)
td := configs.NewThrottleDevice(8, 0, 2048)
throttleAfter := td.String()
helper.writeFileContents(map[string]string{
"blkio.throttle.read_bps_device": throttleBefore,
})
helper.CgroupData.config.Resources.BlkioThrottleReadBpsDevice = []*configs.ThrottleDevice{td}
blkio := &BlkioGroup{}
if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamString(helper.CgroupPath, "blkio.throttle.read_bps_device")
if err != nil {
t.Fatalf("Failed to parse blkio.throttle.read_bps_device - %s", err)
}
if value != throttleAfter {
t.Fatal("Got the wrong value, set blkio.throttle.read_bps_device failed.")
}
}
func TestBlkioSetThrottleWriteBpsDevice(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
const (
throttleBefore = `8:0 1024`
)
td := configs.NewThrottleDevice(8, 0, 2048)
throttleAfter := td.String()
helper.writeFileContents(map[string]string{
"blkio.throttle.write_bps_device": throttleBefore,
})
helper.CgroupData.config.Resources.BlkioThrottleWriteBpsDevice = []*configs.ThrottleDevice{td}
blkio := &BlkioGroup{}
if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamString(helper.CgroupPath, "blkio.throttle.write_bps_device")
if err != nil {
t.Fatalf("Failed to parse blkio.throttle.write_bps_device - %s", err)
}
if value != throttleAfter {
t.Fatal("Got the wrong value, set blkio.throttle.write_bps_device failed.")
}
}
func TestBlkioSetThrottleReadIOpsDevice(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
const (
throttleBefore = `8:0 1024`
)
td := configs.NewThrottleDevice(8, 0, 2048)
throttleAfter := td.String()
helper.writeFileContents(map[string]string{
"blkio.throttle.read_iops_device": throttleBefore,
})
helper.CgroupData.config.Resources.BlkioThrottleReadIOPSDevice = []*configs.ThrottleDevice{td}
blkio := &BlkioGroup{}
if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamString(helper.CgroupPath, "blkio.throttle.read_iops_device")
if err != nil {
t.Fatalf("Failed to parse blkio.throttle.read_iops_device - %s", err)
}
if value != throttleAfter {
t.Fatal("Got the wrong value, set blkio.throttle.read_iops_device failed.")
}
}
func TestBlkioSetThrottleWriteIOpsDevice(t *testing.T) {
helper := NewCgroupTestUtil("blkio", t)
defer helper.cleanup()
const (
throttleBefore = `8:0 1024`
)
td := configs.NewThrottleDevice(8, 0, 2048)
throttleAfter := td.String()
helper.writeFileContents(map[string]string{
"blkio.throttle.write_iops_device": throttleBefore,
})
helper.CgroupData.config.Resources.BlkioThrottleWriteIOPSDevice = []*configs.ThrottleDevice{td}
blkio := &BlkioGroup{}
if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamString(helper.CgroupPath, "blkio.throttle.write_iops_device")
if err != nil {
t.Fatalf("Failed to parse blkio.throttle.write_iops_device - %s", err)
}
if value != throttleAfter {
t.Fatal("Got the wrong value, set blkio.throttle.write_iops_device failed.")
}
}

View File

@@ -22,21 +22,59 @@ func (s *CpuGroup) Name() string {
func (s *CpuGroup) Apply(d *cgroupData) error {
// We always want to join the cpu group, to allow fair cpu scheduling
// on a container basis
_, err := d.join("cpu")
path, err := d.path("cpu")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return s.ApplyDir(path, d.config, d.pid)
}
func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error {
// This might happen if we have no cpu cgroup mounted.
// Just do nothing and don't fail.
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
// We should set the real-Time group scheduling settings before moving
// in the process because if the process is already in SCHED_RR mode
// and no RT bandwidth is set, adding it will fail.
if err := s.SetRtSched(path, cgroup); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems
if err := cgroups.WriteCgroupProc(path, pid); err != nil {
return err
}
return nil
}
func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpuRtPeriod != 0 {
if err := writeFile(path, "cpu.rt_period_us", strconv.FormatUint(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
return err
}
}
if cgroup.Resources.CpuRtRuntime != 0 {
if err := writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil {
return err
}
}
return nil
}
func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpuShares != 0 {
if err := writeFile(path, "cpu.shares", strconv.FormatInt(cgroup.Resources.CpuShares, 10)); err != nil {
if err := writeFile(path, "cpu.shares", strconv.FormatUint(cgroup.Resources.CpuShares, 10)); err != nil {
return err
}
}
if cgroup.Resources.CpuPeriod != 0 {
if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(cgroup.Resources.CpuPeriod, 10)); err != nil {
if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatUint(cgroup.Resources.CpuPeriod, 10)); err != nil {
return err
}
}
@@ -45,15 +83,8 @@ func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
return err
}
}
if cgroup.Resources.CpuRtPeriod != 0 {
if err := writeFile(path, "cpu.rt_period_us", strconv.FormatInt(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
return err
}
}
if cgroup.Resources.CpuRtRuntime != 0 {
if err := writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil {
return err
}
if err := s.SetRtSched(path, cgroup); err != nil {
return err
}
return nil

View File

@@ -1,163 +0,0 @@
// +build linux
package fs
import (
"fmt"
"strconv"
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
func TestCpuSetShares(t *testing.T) {
helper := NewCgroupTestUtil("cpu", t)
defer helper.cleanup()
const (
sharesBefore = 1024
sharesAfter = 512
)
helper.writeFileContents(map[string]string{
"cpu.shares": strconv.Itoa(sharesBefore),
})
helper.CgroupData.config.Resources.CpuShares = sharesAfter
cpu := &CpuGroup{}
if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamUint(helper.CgroupPath, "cpu.shares")
if err != nil {
t.Fatalf("Failed to parse cpu.shares - %s", err)
}
if value != sharesAfter {
t.Fatal("Got the wrong value, set cpu.shares failed.")
}
}
func TestCpuSetBandWidth(t *testing.T) {
helper := NewCgroupTestUtil("cpu", t)
defer helper.cleanup()
const (
quotaBefore = 8000
quotaAfter = 5000
periodBefore = 10000
periodAfter = 7000
rtRuntimeBefore = 8000
rtRuntimeAfter = 5000
rtPeriodBefore = 10000
rtPeriodAfter = 7000
)
helper.writeFileContents(map[string]string{
"cpu.cfs_quota_us": strconv.Itoa(quotaBefore),
"cpu.cfs_period_us": strconv.Itoa(periodBefore),
"cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore),
"cpu.rt_period_us": strconv.Itoa(rtPeriodBefore),
})
helper.CgroupData.config.Resources.CpuQuota = quotaAfter
helper.CgroupData.config.Resources.CpuPeriod = periodAfter
helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter
helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter
cpu := &CpuGroup{}
if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
quota, err := getCgroupParamUint(helper.CgroupPath, "cpu.cfs_quota_us")
if err != nil {
t.Fatalf("Failed to parse cpu.cfs_quota_us - %s", err)
}
if quota != quotaAfter {
t.Fatal("Got the wrong value, set cpu.cfs_quota_us failed.")
}
period, err := getCgroupParamUint(helper.CgroupPath, "cpu.cfs_period_us")
if err != nil {
t.Fatalf("Failed to parse cpu.cfs_period_us - %s", err)
}
if period != periodAfter {
t.Fatal("Got the wrong value, set cpu.cfs_period_us failed.")
}
rtRuntime, err := getCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us")
if err != nil {
t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err)
}
if rtRuntime != rtRuntimeAfter {
t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.")
}
rtPeriod, err := getCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us")
if err != nil {
t.Fatalf("Failed to parse cpu.rt_period_us - %s", err)
}
if rtPeriod != rtPeriodAfter {
t.Fatal("Got the wrong value, set cpu.rt_period_us failed.")
}
}
func TestCpuStats(t *testing.T) {
helper := NewCgroupTestUtil("cpu", t)
defer helper.cleanup()
const (
kNrPeriods = 2000
kNrThrottled = 200
kThrottledTime = uint64(18446744073709551615)
)
cpuStatContent := fmt.Sprintf("nr_periods %d\n nr_throttled %d\n throttled_time %d\n",
kNrPeriods, kNrThrottled, kThrottledTime)
helper.writeFileContents(map[string]string{
"cpu.stat": cpuStatContent,
})
cpu := &CpuGroup{}
actualStats := *cgroups.NewStats()
err := cpu.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatal(err)
}
expectedStats := cgroups.ThrottlingData{
Periods: kNrPeriods,
ThrottledPeriods: kNrThrottled,
ThrottledTime: kThrottledTime}
expectThrottlingDataEquals(t, expectedStats, actualStats.CpuStats.ThrottlingData)
}
func TestNoCpuStatFile(t *testing.T) {
helper := NewCgroupTestUtil("cpu", t)
defer helper.cleanup()
cpu := &CpuGroup{}
actualStats := *cgroups.NewStats()
err := cpu.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatal("Expected not to fail, but did")
}
}
func TestInvalidCpuStat(t *testing.T) {
helper := NewCgroupTestUtil("cpu", t)
defer helper.cleanup()
cpuStatContent := `nr_periods 2000
nr_throttled 200
throttled_time fortytwo`
helper.writeFileContents(map[string]string{
"cpu.stat": cpuStatContent,
})
cpu := &CpuGroup{}
actualStats := *cgroups.NewStats()
err := cpu.GetStats(helper.CgroupPath, &actualStats)
if err == nil {
t.Fatal("Expected failed stat parsing.")
}
}

View File

@@ -8,7 +8,6 @@ import (
"io/ioutil"
"os"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
@@ -58,16 +57,34 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
if dir == "" {
return nil
}
root, err := getCgroupRoot()
mountInfo, err := ioutil.ReadFile("/proc/self/mountinfo")
if err != nil {
return err
}
if err := s.ensureParent(dir, root); err != nil {
root := filepath.Dir(cgroups.GetClosestMountpointAncestor(dir, string(mountInfo)))
// 'ensureParent' start with parent because we don't want to
// explicitly inherit from parent, it could conflict with
// 'cpuset.cpu_exclusive'.
if err := s.ensureParent(filepath.Dir(dir), root); err != nil {
return err
}
if err := os.MkdirAll(dir, 0755); err != nil {
return err
}
// We didn't inherit cpuset configs from parent, but we have
// to ensure cpuset configs are set before moving task into the
// cgroup.
// The logic is, if user specified cpuset configs, use these
// specified configs, otherwise, inherit from parent. This makes
// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
// keep backward compatbility.
if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems
if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil {
if err := cgroups.WriteCgroupProc(dir, pid); err != nil {
return err
}
@@ -137,3 +154,10 @@ func (s *CpusetGroup) copyIfNeeded(current, parent string) error {
func (s *CpusetGroup) isEmpty(b []byte) bool {
return len(bytes.Trim(b, "\n")) == 0
}
func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error {
if err := s.Set(path, cgroup); err != nil {
return err
}
return s.copyIfNeeded(path, filepath.Dir(path))
}

View File

@@ -1,65 +0,0 @@
// +build linux
package fs
import (
"testing"
)
func TestCpusetSetCpus(t *testing.T) {
helper := NewCgroupTestUtil("cpuset", t)
defer helper.cleanup()
const (
cpusBefore = "0"
cpusAfter = "1-3"
)
helper.writeFileContents(map[string]string{
"cpuset.cpus": cpusBefore,
})
helper.CgroupData.config.Resources.CpusetCpus = cpusAfter
cpuset := &CpusetGroup{}
if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamString(helper.CgroupPath, "cpuset.cpus")
if err != nil {
t.Fatalf("Failed to parse cpuset.cpus - %s", err)
}
if value != cpusAfter {
t.Fatal("Got the wrong value, set cpuset.cpus failed.")
}
}
func TestCpusetSetMems(t *testing.T) {
helper := NewCgroupTestUtil("cpuset", t)
defer helper.cleanup()
const (
memsBefore = "0"
memsAfter = "1"
)
helper.writeFileContents(map[string]string{
"cpuset.mems": memsBefore,
})
helper.CgroupData.config.Resources.CpusetMems = memsAfter
cpuset := &CpusetGroup{}
if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamString(helper.CgroupPath, "cpuset.mems")
if err != nil {
t.Fatalf("Failed to parse cpuset.mems - %s", err)
}
if value != memsAfter {
t.Fatal("Got the wrong value, set cpuset.mems failed.")
}
}

View File

@@ -5,6 +5,7 @@ package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
)
type DevicesGroup struct {
@@ -25,6 +26,10 @@ func (s *DevicesGroup) Apply(d *cgroupData) error {
}
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
if system.RunningInUserNS() {
return nil
}
devices := cgroup.Resources.Devices
if len(devices) > 0 {
for _, dev := range devices {
@@ -38,21 +43,23 @@ func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
}
return nil
}
if !cgroup.Resources.AllowAllDevices {
if err := writeFile(path, "devices.deny", "a"); err != nil {
return err
}
for _, dev := range cgroup.Resources.AllowedDevices {
if err := writeFile(path, "devices.allow", dev.CgroupString()); err != nil {
if cgroup.Resources.AllowAllDevices != nil {
if *cgroup.Resources.AllowAllDevices == false {
if err := writeFile(path, "devices.deny", "a"); err != nil {
return err
}
}
return nil
}
if err := writeFile(path, "devices.allow", "a"); err != nil {
return err
for _, dev := range cgroup.Resources.AllowedDevices {
if err := writeFile(path, "devices.allow", dev.CgroupString()); err != nil {
return err
}
}
return nil
}
if err := writeFile(path, "devices.allow", "a"); err != nil {
return err
}
}
for _, dev := range cgroup.Resources.DeniedDevices {

View File

@@ -1,84 +0,0 @@
// +build linux
package fs
import (
"testing"
"github.com/opencontainers/runc/libcontainer/configs"
)
var (
allowedDevices = []*configs.Device{
{
Path: "/dev/zero",
Type: 'c',
Major: 1,
Minor: 5,
Permissions: "rwm",
FileMode: 0666,
},
}
allowedList = "c 1:5 rwm"
deniedDevices = []*configs.Device{
{
Path: "/dev/null",
Type: 'c',
Major: 1,
Minor: 3,
Permissions: "rwm",
FileMode: 0666,
},
}
deniedList = "c 1:3 rwm"
)
func TestDevicesSetAllow(t *testing.T) {
helper := NewCgroupTestUtil("devices", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"devices.deny": "a",
})
helper.CgroupData.config.Resources.AllowAllDevices = false
helper.CgroupData.config.Resources.AllowedDevices = allowedDevices
devices := &DevicesGroup{}
if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamString(helper.CgroupPath, "devices.allow")
if err != nil {
t.Fatalf("Failed to parse devices.allow - %s", err)
}
if value != allowedList {
t.Fatal("Got the wrong value, set devices.allow failed.")
}
}
func TestDevicesSetDeny(t *testing.T) {
helper := NewCgroupTestUtil("devices", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"devices.allow": "a",
})
helper.CgroupData.config.Resources.AllowAllDevices = true
helper.CgroupData.config.Resources.DeniedDevices = deniedDevices
devices := &DevicesGroup{}
if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamString(helper.CgroupPath, "devices.deny")
if err != nil {
t.Fatalf("Failed to parse devices.deny - %s", err)
}
if value != deniedList {
t.Fatal("Got the wrong value, set devices.deny failed.")
}
}

View File

@@ -29,11 +29,15 @@ func (s *FreezerGroup) Apply(d *cgroupData) error {
func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
switch cgroup.Resources.Freezer {
case configs.Frozen, configs.Thawed:
if err := writeFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil {
return err
}
for {
// In case this loop does not exit because it doesn't get the expected
// state, let's write again this state, hoping it's going to be properly
// set this time. Otherwise, this loop could run infinitely, waiting for
// a state change that would never happen.
if err := writeFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil {
return err
}
state, err := readFile(path, "freezer.state")
if err != nil {
return err
@@ -41,6 +45,7 @@ func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) {
break
}
time.Sleep(1 * time.Millisecond)
}
case configs.Undefined:

View File

@@ -1,47 +0,0 @@
// +build linux
package fs
import (
"testing"
"github.com/opencontainers/runc/libcontainer/configs"
)
func TestFreezerSetState(t *testing.T) {
helper := NewCgroupTestUtil("freezer", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"freezer.state": string(configs.Frozen),
})
helper.CgroupData.config.Resources.Freezer = configs.Thawed
freezer := &FreezerGroup{}
if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamString(helper.CgroupPath, "freezer.state")
if err != nil {
t.Fatalf("Failed to parse freezer.state - %s", err)
}
if value != string(configs.Thawed) {
t.Fatal("Got the wrong value, set freezer.state failed.")
}
}
func TestFreezerSetInvalidState(t *testing.T) {
helper := NewCgroupTestUtil("freezer", t)
defer helper.cleanup()
const (
invalidArg configs.FreezerState = "Invalid"
)
helper.CgroupData.config.Resources.Freezer = invalidArg
freezer := &FreezerGroup{}
if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err == nil {
t.Fatal("Failed to return invalid argument error")
}
}

View File

@@ -1,154 +0,0 @@
// +build linux
package fs
import (
"fmt"
"strconv"
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
const (
hugetlbUsageContents = "128\n"
hugetlbMaxUsageContents = "256\n"
hugetlbFailcnt = "100\n"
)
var (
usage = "hugetlb.%s.usage_in_bytes"
limit = "hugetlb.%s.limit_in_bytes"
maxUsage = "hugetlb.%s.max_usage_in_bytes"
failcnt = "hugetlb.%s.failcnt"
)
func TestHugetlbSetHugetlb(t *testing.T) {
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
const (
hugetlbBefore = 256
hugetlbAfter = 512
)
for _, pageSize := range HugePageSizes {
helper.writeFileContents(map[string]string{
fmt.Sprintf(limit, pageSize): strconv.Itoa(hugetlbBefore),
})
}
for _, pageSize := range HugePageSizes {
helper.CgroupData.config.Resources.HugetlbLimit = []*configs.HugepageLimit{
{
Pagesize: pageSize,
Limit: hugetlbAfter,
},
}
hugetlb := &HugetlbGroup{}
if err := hugetlb.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
}
for _, pageSize := range HugePageSizes {
limit := fmt.Sprintf(limit, pageSize)
value, err := getCgroupParamUint(helper.CgroupPath, limit)
if err != nil {
t.Fatalf("Failed to parse %s - %s", limit, err)
}
if value != hugetlbAfter {
t.Fatalf("Set hugetlb.limit_in_bytes failed. Expected: %v, Got: %v", hugetlbAfter, value)
}
}
}
func TestHugetlbStats(t *testing.T) {
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
for _, pageSize := range HugePageSizes {
helper.writeFileContents(map[string]string{
fmt.Sprintf(usage, pageSize): hugetlbUsageContents,
fmt.Sprintf(maxUsage, pageSize): hugetlbMaxUsageContents,
fmt.Sprintf(failcnt, pageSize): hugetlbFailcnt,
})
}
hugetlb := &HugetlbGroup{}
actualStats := *cgroups.NewStats()
err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatal(err)
}
expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100}
for _, pageSize := range HugePageSizes {
expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize])
}
}
func TestHugetlbStatsNoUsageFile(t *testing.T) {
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
maxUsage: hugetlbMaxUsageContents,
})
hugetlb := &HugetlbGroup{}
actualStats := *cgroups.NewStats()
err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
for _, pageSize := range HugePageSizes {
helper.writeFileContents(map[string]string{
fmt.Sprintf(usage, pageSize): hugetlbUsageContents,
})
}
hugetlb := &HugetlbGroup{}
actualStats := *cgroups.NewStats()
err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestHugetlbStatsBadUsageFile(t *testing.T) {
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
for _, pageSize := range HugePageSizes {
helper.writeFileContents(map[string]string{
fmt.Sprintf(usage, pageSize): "bad",
maxUsage: hugetlbMaxUsageContents,
})
}
hugetlb := &HugetlbGroup{}
actualStats := *cgroups.NewStats()
err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestHugetlbStatsBadMaxUsageFile(t *testing.T) {
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
usage: hugetlbUsageContents,
maxUsage: "bad",
})
hugetlb := &HugetlbGroup{}
actualStats := *cgroups.NewStats()
err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}

View File

@@ -5,13 +5,23 @@ package fs
import (
"bufio"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"syscall" // only for Errno
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
const (
cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
)
type MemoryGroup struct {
@@ -25,20 +35,23 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
path, err := d.path("memory")
if err != nil && !cgroups.IsNotFound(err) {
return err
} else if path == "" {
return nil
}
if memoryAssigned(d.config) {
if path != "" {
if _, err := os.Stat(path); os.IsNotExist(err) {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
}
// We have to set kernel memory here, as we can't change it once
// processes have been attached.
if err := s.SetKernelMemory(path, d.config); err != nil {
return err
// Only enable kernel memory accouting when this cgroup
// is created by libcontainer, otherwise we might get
// error when people use `cgroupsPath` to join an existed
// cgroup whose kernel memory is not initialized.
if err := EnableKernelMemoryAccounting(path); err != nil {
return err
}
}
}
defer func() {
if err != nil {
os.RemoveAll(path)
@@ -54,30 +67,115 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
return nil
}
func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error {
// This has to be done separately because it has special constraints (it
// can't be done after there are processes attached to the cgroup).
if cgroup.Resources.KernelMemory > 0 {
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
func EnableKernelMemoryAccounting(path string) error {
// Check if kernel memory is enabled
// We have to limit the kernel memory here as it won't be accounted at all
// until a limit is set on the cgroup and limit cannot be set once the
// cgroup has children, or if there are already tasks in the cgroup.
for _, i := range []int64{1, -1} {
if err := setKernelMemory(path, i); err != nil {
return err
}
}
return nil
}
func setKernelMemory(path string, kernelMemoryLimit int64) error {
if path == "" {
return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
}
if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
// kernel memory is not enabled on the system so we should do nothing
return nil
}
if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil {
// Check if the error number returned by the syscall is "EBUSY"
// The EBUSY signal is returned on attempts to write to the
// memory.kmem.limit_in_bytes file if the cgroup has children or
// once tasks have been attached to the cgroup
if pathErr, ok := err.(*os.PathError); ok {
if errNo, ok := pathErr.Err.(syscall.Errno); ok {
if errNo == unix.EBUSY {
return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
}
}
}
return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err)
}
return nil
}
func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
// If the memory update is set to -1 we should also
// set swap to -1, it means unlimited memory.
if cgroup.Resources.Memory == -1 {
// Only set swap if it's enabled in kernel
if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
cgroup.Resources.MemorySwap = -1
}
}
// When memory and swap memory are both set, we need to handle the cases
// for updating container.
if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 {
memoryUsage, err := getMemoryData(path, "")
if err != nil {
return err
}
// When update memory limit, we should adapt the write sequence
// for memory and swap memory, so it won't fail because the new
// value and the old value don't fit kernel's validation.
if cgroup.Resources.MemorySwap == -1 || memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) {
if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
if err := writeFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
} else {
if err := writeFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
}
} else {
if cgroup.Resources.Memory != 0 {
if err := writeFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
}
if cgroup.Resources.MemorySwap != 0 {
if err := writeFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
}
}
return nil
}
func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.Memory != 0 {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
if err := setMemoryAndSwap(path, cgroup); err != nil {
return err
}
if cgroup.Resources.KernelMemory != 0 {
if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil {
return err
}
}
if cgroup.Resources.MemoryReservation != 0 {
if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
return err
}
}
if cgroup.Resources.MemorySwap > 0 {
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
if cgroup.Resources.KernelMemoryTCP != 0 {
if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
return err
}
}
@@ -86,14 +184,14 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
return err
}
}
if cgroup.Resources.MemorySwappiness >= 0 && cgroup.Resources.MemorySwappiness <= 100 {
if err := writeFile(path, "memory.swappiness", strconv.FormatInt(cgroup.Resources.MemorySwappiness, 10)); err != nil {
if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 {
return nil
} else if *cgroup.Resources.MemorySwappiness <= 100 {
if err := writeFile(path, "memory.swappiness", strconv.FormatUint(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
return err
}
} else if cgroup.Resources.MemorySwappiness == -1 {
return nil
} else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", cgroup.Resources.MemorySwappiness)
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *cgroup.Resources.MemorySwappiness)
}
return nil
@@ -139,7 +237,20 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
return err
}
stats.MemoryStats.KernelUsage = kernelUsage
kernelTCPUsage, err := getMemoryData(path, "kmem.tcp")
if err != nil {
return err
}
stats.MemoryStats.KernelTCPUsage = kernelTCPUsage
useHierarchy := strings.Join([]string{"memory", "use_hierarchy"}, ".")
value, err := getCgroupParamUint(path, useHierarchy)
if err != nil {
return err
}
if value == 1 {
stats.MemoryStats.UseHierarchy = true
}
return nil
}
@@ -148,8 +259,9 @@ func memoryAssigned(cgroup *configs.Cgroup) bool {
cgroup.Resources.MemoryReservation != 0 ||
cgroup.Resources.MemorySwap > 0 ||
cgroup.Resources.KernelMemory > 0 ||
cgroup.Resources.KernelMemoryTCP > 0 ||
cgroup.Resources.OomKillDisable ||
cgroup.Resources.MemorySwappiness != -1
(cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1)
}
func getMemoryData(path, name string) (cgroups.MemoryData, error) {

View File

@@ -1,339 +0,0 @@
// +build linux
package fs
import (
"strconv"
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
const (
memoryStatContents = `cache 512
rss 1024`
memoryUsageContents = "2048\n"
memoryMaxUsageContents = "4096\n"
memoryFailcnt = "100\n"
memoryLimitContents = "8192\n"
)
func TestMemorySetMemory(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
const (
memoryBefore = 314572800 // 300M
memoryAfter = 524288000 // 500M
reservationBefore = 209715200 // 200M
reservationAfter = 314572800 // 300M
)
helper.writeFileContents(map[string]string{
"memory.limit_in_bytes": strconv.Itoa(memoryBefore),
"memory.soft_limit_in_bytes": strconv.Itoa(reservationBefore),
})
helper.CgroupData.config.Resources.Memory = memoryAfter
helper.CgroupData.config.Resources.MemoryReservation = reservationAfter
memory := &MemoryGroup{}
if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes")
if err != nil {
t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err)
}
if value != memoryAfter {
t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.")
}
value, err = getCgroupParamUint(helper.CgroupPath, "memory.soft_limit_in_bytes")
if err != nil {
t.Fatalf("Failed to parse memory.soft_limit_in_bytes - %s", err)
}
if value != reservationAfter {
t.Fatal("Got the wrong value, set memory.soft_limit_in_bytes failed.")
}
}
func TestMemorySetMemoryswap(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
const (
memoryswapBefore = 314572800 // 300M
memoryswapAfter = 524288000 // 500M
)
helper.writeFileContents(map[string]string{
"memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
})
helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter
memory := &MemoryGroup{}
if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes")
if err != nil {
t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err)
}
if value != memoryswapAfter {
t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.")
}
}
func TestMemorySetKernelMemory(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
const (
kernelMemoryBefore = 314572800 // 300M
kernelMemoryAfter = 524288000 // 500M
)
helper.writeFileContents(map[string]string{
"memory.kmem.limit_in_bytes": strconv.Itoa(kernelMemoryBefore),
})
helper.CgroupData.config.Resources.KernelMemory = kernelMemoryAfter
memory := &MemoryGroup{}
if err := memory.SetKernelMemory(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamUint(helper.CgroupPath, "memory.kmem.limit_in_bytes")
if err != nil {
t.Fatalf("Failed to parse memory.kmem.limit_in_bytes - %s", err)
}
if value != kernelMemoryAfter {
t.Fatal("Got the wrong value, set memory.kmem.limit_in_bytes failed.")
}
}
func TestMemorySetMemorySwappinessDefault(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
const (
swappinessBefore = 60 //deafult is 60
swappinessAfter = 0
)
helper.writeFileContents(map[string]string{
"memory.swappiness": strconv.Itoa(swappinessBefore),
})
helper.CgroupData.config.Resources.Memory = swappinessAfter
memory := &MemoryGroup{}
if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamUint(helper.CgroupPath, "memory.swappiness")
if err != nil {
t.Fatalf("Failed to parse memory.swappiness - %s", err)
}
if value != swappinessAfter {
t.Fatal("Got the wrong value, set memory.swappiness failed.")
}
}
func TestMemoryStats(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"memory.stat": memoryStatContents,
"memory.usage_in_bytes": memoryUsageContents,
"memory.limit_in_bytes": memoryLimitContents,
"memory.max_usage_in_bytes": memoryMaxUsageContents,
"memory.failcnt": memoryFailcnt,
"memory.memsw.usage_in_bytes": memoryUsageContents,
"memory.memsw.max_usage_in_bytes": memoryMaxUsageContents,
"memory.memsw.failcnt": memoryFailcnt,
"memory.memsw.limit_in_bytes": memoryLimitContents,
"memory.kmem.usage_in_bytes": memoryUsageContents,
"memory.kmem.max_usage_in_bytes": memoryMaxUsageContents,
"memory.kmem.failcnt": memoryFailcnt,
"memory.kmem.limit_in_bytes": memoryLimitContents,
})
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatal(err)
}
expectedStats := cgroups.MemoryStats{Cache: 512, Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, Stats: map[string]uint64{"cache": 512, "rss": 1024}}
expectMemoryStatEquals(t, expectedStats, actualStats.MemoryStats)
}
func TestMemoryStatsNoStatFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"memory.usage_in_bytes": memoryUsageContents,
"memory.max_usage_in_bytes": memoryMaxUsageContents,
"memory.limit_in_bytes": memoryLimitContents,
})
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
if err != nil {
t.Fatal(err)
}
}
func TestMemoryStatsNoUsageFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"memory.stat": memoryStatContents,
"memory.max_usage_in_bytes": memoryMaxUsageContents,
"memory.limit_in_bytes": memoryLimitContents,
})
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestMemoryStatsNoMaxUsageFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"memory.stat": memoryStatContents,
"memory.usage_in_bytes": memoryUsageContents,
"memory.limit_in_bytes": memoryLimitContents,
})
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestMemoryStatsNoLimitInBytesFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"memory.stat": memoryStatContents,
"memory.usage_in_bytes": memoryUsageContents,
"memory.max_usage_in_bytes": memoryMaxUsageContents,
})
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestMemoryStatsBadStatFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"memory.stat": "rss rss",
"memory.usage_in_bytes": memoryUsageContents,
"memory.max_usage_in_bytes": memoryMaxUsageContents,
"memory.limit_in_bytes": memoryLimitContents,
})
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestMemoryStatsBadUsageFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"memory.stat": memoryStatContents,
"memory.usage_in_bytes": "bad",
"memory.max_usage_in_bytes": memoryMaxUsageContents,
"memory.limit_in_bytes": memoryLimitContents,
})
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestMemoryStatsBadMaxUsageFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"memory.stat": memoryStatContents,
"memory.usage_in_bytes": memoryUsageContents,
"memory.max_usage_in_bytes": "bad",
"memory.limit_in_bytes": memoryLimitContents,
})
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestMemoryStatsBadLimitInBytesFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"memory.stat": memoryStatContents,
"memory.usage_in_bytes": memoryUsageContents,
"memory.max_usage_in_bytes": memoryMaxUsageContents,
"memory.limit_in_bytes": "bad",
})
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestMemorySetOomControl(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
const (
oom_kill_disable = 1 // disable oom killer, default is 0
)
helper.writeFileContents(map[string]string{
"memory.oom_control": strconv.Itoa(oom_kill_disable),
})
memory := &MemoryGroup{}
if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamUint(helper.CgroupPath, "memory.oom_control")
if err != nil {
t.Fatalf("Failed to parse memory.oom_control - %s", err)
}
if value != oom_kill_disable {
t.Fatalf("Got the wrong value, set memory.oom_control failed.")
}
}

View File

@@ -9,6 +9,7 @@ import (
type NameGroup struct {
GroupName string
Join bool
}
func (s *NameGroup) Name() string {
@@ -16,6 +17,10 @@ func (s *NameGroup) Name() string {
}
func (s *NameGroup) Apply(d *cgroupData) error {
if s.Join {
// ignore errors if the named cgroup does not exist
d.join(s.GroupName)
}
return nil
}
@@ -24,6 +29,9 @@ func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error {
}
func (s *NameGroup) Remove(d *cgroupData) error {
if s.Join {
removePath(d.path(s.GroupName))
}
return nil
}

View File

@@ -3,6 +3,8 @@
package fs
import (
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
@@ -23,8 +25,8 @@ func (s *NetClsGroup) Apply(d *cgroupData) error {
}
func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.NetClsClassid != "" {
if err := writeFile(path, "net_cls.classid", cgroup.Resources.NetClsClassid); err != nil {
if cgroup.Resources.NetClsClassid != 0 {
if err := writeFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil {
return err
}
}

View File

@@ -1,38 +0,0 @@
// +build linux
package fs
import (
"testing"
)
const (
classidBefore = "0x100002"
classidAfter = "0x100001"
)
func TestNetClsSetClassid(t *testing.T) {
helper := NewCgroupTestUtil("net_cls", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"net_cls.classid": classidBefore,
})
helper.CgroupData.config.Resources.NetClsClassid = classidAfter
netcls := &NetClsGroup{}
if err := netcls.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
// As we are in mock environment, we can't get correct value of classid from
// net_cls.classid.
// So. we just judge if we successfully write classid into file
value, err := getCgroupParamString(helper.CgroupPath, "net_cls.classid")
if err != nil {
t.Fatalf("Failed to parse net_cls.classid - %s", err)
}
if value != classidAfter {
t.Fatal("Got the wrong value, set net_cls.classid failed.")
}
}

View File

@@ -1,38 +0,0 @@
// +build linux
package fs
import (
"strings"
"testing"
"github.com/opencontainers/runc/libcontainer/configs"
)
var (
prioMap = []*configs.IfPrioMap{
{
Interface: "test",
Priority: 5,
},
}
)
func TestNetPrioSetIfPrio(t *testing.T) {
helper := NewCgroupTestUtil("net_prio", t)
defer helper.cleanup()
helper.CgroupData.config.Resources.NetPrioIfpriomap = prioMap
netPrio := &NetPrioGroup{}
if err := netPrio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamString(helper.CgroupPath, "net_prio.ifpriomap")
if err != nil {
t.Fatalf("Failed to parse net_prio.ifpriomap - %s", err)
}
if !strings.Contains(value, "test 5") {
t.Fatal("Got the wrong value, set net_prio.ifpriomap failed.")
}
}

View File

@@ -4,6 +4,7 @@ package fs
import (
"fmt"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -47,11 +48,26 @@ func (s *PidsGroup) Remove(d *cgroupData) error {
}
func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
value, err := getCgroupParamUint(path, "pids.current")
current, err := getCgroupParamUint(path, "pids.current")
if err != nil {
return fmt.Errorf("failed to parse pids.current - %s", err)
}
stats.PidsStats.Current = value
maxString, err := getCgroupParamString(path, "pids.max")
if err != nil {
return fmt.Errorf("failed to parse pids.max - %s", err)
}
// Default if pids.max == "max" is 0 -- which represents "no limit".
var max uint64
if maxString != "max" {
max, err = parseUint(maxString, 10, 64)
if err != nil {
return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max"))
}
}
stats.PidsStats.Current = current
stats.PidsStats.Limit = max
return nil
}

View File

@@ -1,83 +0,0 @@
// +build linux
package fs
import (
"strconv"
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
const (
maxUnlimited = -1
maxLimited = 1024
)
func TestPidsSetMax(t *testing.T) {
helper := NewCgroupTestUtil("pids", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"pids.max": "max",
})
helper.CgroupData.config.Resources.PidsLimit = maxLimited
pids := &PidsGroup{}
if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamUint(helper.CgroupPath, "pids.max")
if err != nil {
t.Fatalf("Failed to parse pids.max - %s", err)
}
if value != maxLimited {
t.Fatalf("Expected %d, got %d for setting pids.max - limited", maxLimited, value)
}
}
func TestPidsSetUnlimited(t *testing.T) {
helper := NewCgroupTestUtil("pids", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"pids.max": strconv.Itoa(maxLimited),
})
helper.CgroupData.config.Resources.PidsLimit = maxUnlimited
pids := &PidsGroup{}
if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := getCgroupParamString(helper.CgroupPath, "pids.max")
if err != nil {
t.Fatalf("Failed to parse pids.max - %s", err)
}
if value != "max" {
t.Fatalf("Expected %s, got %s for setting pids.max - unlimited", "max", value)
}
}
func TestPidsStats(t *testing.T) {
helper := NewCgroupTestUtil("pids", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"pids.current": strconv.Itoa(1337),
"pids.max": strconv.Itoa(maxLimited),
})
pids := &PidsGroup{}
stats := *cgroups.NewStats()
if err := pids.GetStats(helper.CgroupPath, &stats); err != nil {
t.Fatal(err)
}
if stats.PidsStats.Current != 1337 {
t.Fatalf("Expected %d, got %d for pids.current", 1337, stats.PidsStats.Current)
}
}

View File

@@ -1,117 +0,0 @@
// +build linux
package fs
import (
"fmt"
"testing"
"github.com/Sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
func blkioStatEntryEquals(expected, actual []cgroups.BlkioStatEntry) error {
if len(expected) != len(actual) {
return fmt.Errorf("blkioStatEntries length do not match")
}
for i, expValue := range expected {
actValue := actual[i]
if expValue != actValue {
return fmt.Errorf("Expected blkio stat entry %v but found %v", expValue, actValue)
}
}
return nil
}
func expectBlkioStatsEquals(t *testing.T, expected, actual cgroups.BlkioStats) {
if err := blkioStatEntryEquals(expected.IoServiceBytesRecursive, actual.IoServiceBytesRecursive); err != nil {
logrus.Printf("blkio IoServiceBytesRecursive do not match - %s\n", err)
t.Fail()
}
if err := blkioStatEntryEquals(expected.IoServicedRecursive, actual.IoServicedRecursive); err != nil {
logrus.Printf("blkio IoServicedRecursive do not match - %s\n", err)
t.Fail()
}
if err := blkioStatEntryEquals(expected.IoQueuedRecursive, actual.IoQueuedRecursive); err != nil {
logrus.Printf("blkio IoQueuedRecursive do not match - %s\n", err)
t.Fail()
}
if err := blkioStatEntryEquals(expected.SectorsRecursive, actual.SectorsRecursive); err != nil {
logrus.Printf("blkio SectorsRecursive do not match - %s\n", err)
t.Fail()
}
if err := blkioStatEntryEquals(expected.IoServiceTimeRecursive, actual.IoServiceTimeRecursive); err != nil {
logrus.Printf("blkio IoServiceTimeRecursive do not match - %s\n", err)
t.Fail()
}
if err := blkioStatEntryEquals(expected.IoWaitTimeRecursive, actual.IoWaitTimeRecursive); err != nil {
logrus.Printf("blkio IoWaitTimeRecursive do not match - %s\n", err)
t.Fail()
}
if err := blkioStatEntryEquals(expected.IoMergedRecursive, actual.IoMergedRecursive); err != nil {
logrus.Printf("blkio IoMergedRecursive do not match - %v vs %v\n", expected.IoMergedRecursive, actual.IoMergedRecursive)
t.Fail()
}
if err := blkioStatEntryEquals(expected.IoTimeRecursive, actual.IoTimeRecursive); err != nil {
logrus.Printf("blkio IoTimeRecursive do not match - %s\n", err)
t.Fail()
}
}
func expectThrottlingDataEquals(t *testing.T, expected, actual cgroups.ThrottlingData) {
if expected != actual {
logrus.Printf("Expected throttling data %v but found %v\n", expected, actual)
t.Fail()
}
}
func expectHugetlbStatEquals(t *testing.T, expected, actual cgroups.HugetlbStats) {
if expected != actual {
logrus.Printf("Expected hugetlb stats %v but found %v\n", expected, actual)
t.Fail()
}
}
func expectMemoryStatEquals(t *testing.T, expected, actual cgroups.MemoryStats) {
expectMemoryDataEquals(t, expected.Usage, actual.Usage)
expectMemoryDataEquals(t, expected.SwapUsage, actual.SwapUsage)
expectMemoryDataEquals(t, expected.KernelUsage, actual.KernelUsage)
for key, expValue := range expected.Stats {
actValue, ok := actual.Stats[key]
if !ok {
logrus.Printf("Expected memory stat key %s not found\n", key)
t.Fail()
}
if expValue != actValue {
logrus.Printf("Expected memory stat value %d but found %d\n", expValue, actValue)
t.Fail()
}
}
}
func expectMemoryDataEquals(t *testing.T, expected, actual cgroups.MemoryData) {
if expected.Usage != actual.Usage {
logrus.Printf("Expected memory usage %d but found %d\n", expected.Usage, actual.Usage)
t.Fail()
}
if expected.MaxUsage != actual.MaxUsage {
logrus.Printf("Expected memory max usage %d but found %d\n", expected.MaxUsage, actual.MaxUsage)
t.Fail()
}
if expected.Failcnt != actual.Failcnt {
logrus.Printf("Expected memory failcnt %d but found %d\n", expected.Failcnt, actual.Failcnt)
t.Fail()
}
if expected.Limit != actual.Limit {
logrus.Printf("Expected memory limit %d but found %d\n", expected.Limit, actual.Limit)
t.Fail()
}
}

View File

@@ -1,67 +0,0 @@
// +build linux
/*
Utility for testing cgroup operations.
Creates a mock of the cgroup filesystem for the duration of the test.
*/
package fs
import (
"io/ioutil"
"os"
"path/filepath"
"testing"
"github.com/opencontainers/runc/libcontainer/configs"
)
type cgroupTestUtil struct {
// cgroup data to use in tests.
CgroupData *cgroupData
// Path to the mock cgroup directory.
CgroupPath string
// Temporary directory to store mock cgroup filesystem.
tempDir string
t *testing.T
}
// Creates a new test util for the specified subsystem
func NewCgroupTestUtil(subsystem string, t *testing.T) *cgroupTestUtil {
d := &cgroupData{
config: &configs.Cgroup{},
}
d.config.Resources = &configs.Resources{}
tempDir, err := ioutil.TempDir("", "cgroup_test")
if err != nil {
t.Fatal(err)
}
d.root = tempDir
testCgroupPath := filepath.Join(d.root, subsystem)
if err != nil {
t.Fatal(err)
}
// Ensure the full mock cgroup path exists.
err = os.MkdirAll(testCgroupPath, 0755)
if err != nil {
t.Fatal(err)
}
return &cgroupTestUtil{CgroupData: d, CgroupPath: testCgroupPath, tempDir: tempDir, t: t}
}
func (c *cgroupTestUtil) cleanup() {
os.RemoveAll(c.tempDir)
}
// Write the specified contents on the mock of the specified cgroup files.
func (c *cgroupTestUtil) writeFileContents(fileContents map[string]string) {
for file, contents := range fileContents {
err := writeFile(c.CgroupPath, file, contents)
if err != nil {
c.t.Fatal(err)
}
}
}

View File

@@ -12,7 +12,6 @@ import (
)
var (
ErrNotSupportStat = errors.New("stats are not supported for subsystem")
ErrNotValidFormat = errors.New("line is not a valid key value format")
)

View File

@@ -1,97 +0,0 @@
// +build linux
package fs
import (
"io/ioutil"
"math"
"os"
"path/filepath"
"strconv"
"testing"
)
const (
cgroupFile = "cgroup.file"
floatValue = 2048.0
floatString = "2048"
)
func TestGetCgroupParamsInt(t *testing.T) {
// Setup tempdir.
tempDir, err := ioutil.TempDir("", "cgroup_utils_test")
if err != nil {
t.Fatal(err)
}
defer os.RemoveAll(tempDir)
tempFile := filepath.Join(tempDir, cgroupFile)
// Success.
err = ioutil.WriteFile(tempFile, []byte(floatString), 0755)
if err != nil {
t.Fatal(err)
}
value, err := getCgroupParamUint(tempDir, cgroupFile)
if err != nil {
t.Fatal(err)
} else if value != floatValue {
t.Fatalf("Expected %d to equal %f", value, floatValue)
}
// Success with new line.
err = ioutil.WriteFile(tempFile, []byte(floatString+"\n"), 0755)
if err != nil {
t.Fatal(err)
}
value, err = getCgroupParamUint(tempDir, cgroupFile)
if err != nil {
t.Fatal(err)
} else if value != floatValue {
t.Fatalf("Expected %d to equal %f", value, floatValue)
}
// Success with negative values
err = ioutil.WriteFile(tempFile, []byte("-12345"), 0755)
if err != nil {
t.Fatal(err)
}
value, err = getCgroupParamUint(tempDir, cgroupFile)
if err != nil {
t.Fatal(err)
} else if value != 0 {
t.Fatalf("Expected %d to equal %d", value, 0)
}
// Success with negative values lesser than min int64
s := strconv.FormatFloat(math.MinInt64, 'f', -1, 64)
err = ioutil.WriteFile(tempFile, []byte(s), 0755)
if err != nil {
t.Fatal(err)
}
value, err = getCgroupParamUint(tempDir, cgroupFile)
if err != nil {
t.Fatal(err)
} else if value != 0 {
t.Fatalf("Expected %d to equal %d", value, 0)
}
// Not a float.
err = ioutil.WriteFile(tempFile, []byte("not-a-float"), 0755)
if err != nil {
t.Fatal(err)
}
_, err = getCgroupParamUint(tempDir, cgroupFile)
if err == nil {
t.Fatal("Expecting error, got none")
}
// Unknown file.
err = os.Remove(tempFile)
if err != nil {
t.Fatal(err)
}
_, err = getCgroupParamUint(tempDir, cgroupFile)
if err == nil {
t.Fatal("Expecting error, got none")
}
}

View File

@@ -11,6 +11,7 @@ type ThrottlingData struct {
ThrottledTime uint64 `json:"throttled_time,omitempty"`
}
// CpuUsage denotes the usage of a CPU.
// All CPU stats are aggregate since container inception.
type CpuUsage struct {
// Total CPU time consumed.
@@ -46,14 +47,21 @@ type MemoryStats struct {
Usage MemoryData `json:"usage,omitempty"`
// usage of memory + swap
SwapUsage MemoryData `json:"swap_usage,omitempty"`
// usafe of kernel memory
KernelUsage MemoryData `json:"kernel_usage,omitempty"`
Stats map[string]uint64 `json:"stats,omitempty"`
// usage of kernel memory
KernelUsage MemoryData `json:"kernel_usage,omitempty"`
// usage of kernel TCP memory
KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
// if true, memory usage is accounted for throughout a hierarchy of cgroups.
UseHierarchy bool `json:"use_hierarchy"`
Stats map[string]uint64 `json:"stats,omitempty"`
}
type PidsStats struct {
// number of pids in the cgroup
Current uint64 `json:"current,omitempty"`
// active pids hard limit
Limit uint64 `json:"limit,omitempty"`
}
type BlkioStatEntry struct {
@@ -80,7 +88,7 @@ type HugetlbStats struct {
Usage uint64 `json:"usage,omitempty"`
// maximum usage ever recorded.
MaxUsage uint64 `json:"max_usage,omitempty"`
// number of times htgetlb usage allocation failure.
// number of times hugetlb usage allocation failure.
Failcnt uint64 `json:"failcnt"`
}

View File

@@ -1,4 +1,4 @@
// +build !linux
// +build !linux static_build
package systemd
@@ -43,7 +43,7 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
}
func (m *Manager) Set(container *configs.Config) error {
return nil, fmt.Errorf("Systemd not supported")
return fmt.Errorf("Systemd not supported")
}
func (m *Manager) Freeze(state configs.FreezerState) error {

View File

@@ -1,14 +1,12 @@
// +build linux
// +build linux,!static_build
package systemd
import (
"errors"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
@@ -67,13 +65,16 @@ var subsystems = subsystemSet{
const (
testScopeWait = 4
testSliceWait = 4
)
var (
connLock sync.Mutex
theConn *systemdDbus.Conn
hasStartTransientUnit bool
hasStartTransientSliceUnit bool
hasTransientDefaultDependencies bool
hasDelegate bool
)
func newProp(name string, units interface{}) systemdDbus.Property {
@@ -146,20 +147,52 @@ func UseSystemd() bool {
// Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil)
// Assume StartTransientUnit on a scope allows Delegate
hasDelegate = true
dl := newProp("Delegate", true)
if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
hasDelegate = false
}
}
}
// Assume we have the ability to start a transient unit as a slice
// This was broken until systemd v229, but has been back-ported on RHEL environments >= 219
// For details, see: https://bugzilla.redhat.com/show_bug.cgi?id=1370299
hasStartTransientSliceUnit = true
// To ensure simple clean-up, we create a slice off the root with no hierarchy
slice := fmt.Sprintf("libcontainer_%d_systemd_test_default.slice", os.Getpid())
if _, err := theConn.StartTransientUnit(slice, "replace", nil, nil); err != nil {
if _, ok := err.(dbus.Error); ok {
hasStartTransientSliceUnit = false
}
}
for i := 0; i <= testSliceWait; i++ {
if _, err := theConn.StopUnit(slice, "replace", nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") {
hasStartTransientSliceUnit = false
break
}
}
} else {
break
}
time.Sleep(time.Millisecond)
}
// Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil)
theConn.StopUnit(slice, "replace", nil)
}
return hasStartTransientUnit
}
func getIfaceForUnit(unitName string) string {
if strings.HasSuffix(unitName, ".scope") {
return "Scope"
}
if strings.HasSuffix(unitName, ".service") {
return "Service"
}
return "Unit"
}
func (m *Manager) Apply(pid int) error {
var (
c = m.Cgroups
@@ -189,11 +222,29 @@ func (m *Manager) Apply(pid int) error {
slice = c.Parent
}
properties = append(properties,
systemdDbus.PropSlice(slice),
systemdDbus.PropDescription("docker container "+c.Name),
newProp("PIDs", []uint32{uint32(pid)}),
)
properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
// if we create a slice, the parent is defined via a Wants=
if strings.HasSuffix(unitName, ".slice") {
// This was broken until systemd v229, but has been back-ported on RHEL environments >= 219
if !hasStartTransientSliceUnit {
return fmt.Errorf("systemd version does not support ability to start a slice as transient unit")
}
properties = append(properties, systemdDbus.PropWants(slice))
} else {
// otherwise, we use Slice=
properties = append(properties, systemdDbus.PropSlice(slice))
}
// only add pid if its valid, -1 is used w/ general slice creation.
if pid != -1 {
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
}
if hasDelegate {
// This is only supported on systemd versions 218 and above.
properties = append(properties, newProp("Delegate", true))
}
// Always enable accounting, this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time.
@@ -214,7 +265,21 @@ func (m *Manager) Apply(pid int) error {
if c.Resources.CpuShares != 0 {
properties = append(properties,
newProp("CPUShares", uint64(c.Resources.CpuShares)))
newProp("CPUShares", c.Resources.CpuShares))
}
// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
cpuQuotaPerSecUSec := uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
// (integer percentage of CPU) internally. This means that if a fractional percent of
// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
if cpuQuotaPerSecUSec%10000 != 0 {
cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
}
properties = append(properties,
newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
}
if c.Resources.BlkioWeight != 0 {
@@ -222,67 +287,22 @@ func (m *Manager) Apply(pid int) error {
newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
}
// We need to set kernel memory before processes join cgroup because
// kmem.limit_in_bytes can only be set when the cgroup is empty.
// And swap memory limit needs to be set after memory limit, only
// memory limit is handled by systemd, so it's kind of ugly here.
if c.Resources.KernelMemory > 0 {
// We have to set kernel memory here, as we can't change it once
// processes have been attached to the cgroup.
if c.Resources.KernelMemory != 0 {
if err := setKernelMemory(c); err != nil {
return err
}
}
if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil {
statusChan := make(chan string)
if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err != nil && !isUnitExists(err) {
return err
}
if err := joinDevices(c, pid); err != nil {
return err
}
<-statusChan
// TODO: CpuQuota and CpuPeriod not available in systemd
// we need to manually join the cpu.cfs_quota_us and cpu.cfs_period_us
if err := joinCpu(c, pid); err != nil {
return err
}
// TODO: MemoryReservation and MemorySwap not available in systemd
if err := joinMemory(c, pid); err != nil {
return err
}
// we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd
// because it does not currently support it via the dbus api.
if err := joinFreezer(c, pid); err != nil {
return err
}
if err := joinNetPrio(c, pid); err != nil {
return err
}
if err := joinNetCls(c, pid); err != nil {
return err
}
if err := joinPids(c, pid); err != nil {
return err
}
if err := joinCpuset(c, pid); err != nil {
return err
}
if err := joinHugetlb(c, pid); err != nil {
return err
}
if err := joinPerfEvent(c, pid); err != nil {
return err
}
// FIXME: Systemd does have `BlockIODeviceWeight` property, but we got problem
// using that (at least on systemd 208, see https://github.com/opencontainers/runc/libcontainer/pull/354),
// so use fs work around for now.
if err := joinBlkio(c, pid); err != nil {
if err := joinCgroups(c, pid); err != nil {
return err
}
@@ -323,15 +343,6 @@ func (m *Manager) GetPaths() map[string]string {
return paths
}
func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s.", file)
}
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
}
func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
path, err := getSubsystemPath(c, subsystem)
if err != nil {
@@ -340,57 +351,52 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
if err := os.MkdirAll(path, 0755); err != nil {
return "", err
}
if err := writeFile(path, "cgroup.procs", strconv.Itoa(pid)); err != nil {
if err := cgroups.WriteCgroupProc(path, pid); err != nil {
return "", err
}
return path, nil
}
func joinCpu(c *configs.Cgroup, pid int) error {
_, err := join(c, "cpu", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
func joinCgroups(c *configs.Cgroup, pid int) error {
for _, sys := range subsystems {
name := sys.Name()
switch name {
case "name=systemd":
// let systemd handle this
case "cpuset":
path, err := getSubsystemPath(c, name)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
s := &fs.CpusetGroup{}
if err := s.ApplyDir(path, c, pid); err != nil {
return err
}
default:
_, err := join(c, name, pid)
if err != nil {
// Even if it's `not found` error, we'll return err
// because devices cgroup is hard requirement for
// container security.
if name == "devices" {
return err
}
// For other subsystems, omit the `not found` error
// because they are optional.
if !cgroups.IsNotFound(err) {
return err
}
}
}
}
return nil
}
func joinFreezer(c *configs.Cgroup, pid int) error {
_, err := join(c, "freezer", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinNetPrio(c *configs.Cgroup, pid int) error {
_, err := join(c, "net_prio", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinNetCls(c *configs.Cgroup, pid int) error {
_, err := join(c, "net_cls", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinPids(c *configs.Cgroup, pid int) error {
_, err := join(c, "pids", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
// systemd represents slice heirarchy using `-`, so we need to follow suit when
// systemd represents slice hierarchy using `-`, so we need to follow suit when
// generating the path of slice. Essentially, test-a-b.slice becomes
// test.slice/test-a.slice/test-a-b.slice.
func expandSlice(slice string) (string, error) {
// /test.slice/test-a.slice/test-a-b.slice.
func ExpandSlice(slice string) (string, error) {
suffix := ".slice"
// Name has to end with ".slice", but can't be just ".slice".
if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
@@ -404,6 +410,10 @@ func expandSlice(slice string) (string, error) {
var path, prefix string
sliceName := strings.TrimSuffix(slice, suffix)
// if input was -.slice, we should just return root now
if sliceName == "-" {
return "/", nil
}
for _, component := range strings.Split(sliceName, "-") {
// test--a.slice isn't permitted, nor is -test.slice.
if component == "" {
@@ -411,10 +421,9 @@ func expandSlice(slice string) (string, error) {
}
// Append the component to the path and to the prefix.
path += prefix + component + suffix + "/"
path += "/" + prefix + component + suffix
prefix += component + "-"
}
return path, nil
}
@@ -424,17 +433,19 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
return "", err
}
initPath, err := cgroups.GetInitCgroupDir(subsystem)
initPath, err := cgroups.GetInitCgroup(subsystem)
if err != nil {
return "", err
}
// if pid 1 is systemd 226 or later, it will be in init.scope, not the root
initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope")
slice := "system.slice"
if c.Parent != "" {
slice = c.Parent
}
slice, err = expandSlice(slice)
slice, err = ExpandSlice(slice)
if err != nil {
return "", err
}
@@ -495,6 +506,11 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
}
func (m *Manager) Set(container *configs.Config) error {
// If Paths are set, then we are just joining cgroups paths
// and there is no need to set any values.
if m.Cgroups.Paths != nil {
return nil
}
for _, sys := range subsystems {
// Get the subsystem path, but don't error out for not found cgroups.
path, err := getSubsystemPath(container.Cgroups, sys.Name())
@@ -516,28 +532,11 @@ func (m *Manager) Set(container *configs.Config) error {
}
func getUnitName(c *configs.Cgroup) string {
return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
}
// Atm we can't use the systemd device support because of two missing things:
// * Support for wildcards to allow mknod on any device
// * Support for wildcards to allow /dev/pts support
//
// The second is available in more recent systemd as "char-pts", but not in e.g. v208 which is
// in wide use. When both these are available we will be able to switch, but need to keep the old
// implementation for backwards compat.
//
// Note: we can't use systemd to set up the initial limits, and then change the cgroup
// because systemd will re-write the device settings if it needs to re-apply the cgroup context.
// This happens at least for v208 when any sibling unit is started.
func joinDevices(c *configs.Cgroup, pid int) error {
_, err := join(c, "devices", pid)
// Even if it's `not found` error, we'll return err because devices cgroup
// is hard requirement for container security.
if err != nil {
return err
// by default, we create a scope unless the user explicitly asks for a slice.
if !strings.HasSuffix(c.Name, ".slice") {
return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
}
return nil
return c.Name
}
func setKernelMemory(c *configs.Cgroup) error {
@@ -549,57 +548,15 @@ func setKernelMemory(c *configs.Cgroup) error {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
// This doesn't get called by manager.Set, so we need to do it here.
s := &fs.MemoryGroup{}
return s.SetKernelMemory(path, c)
return fs.EnableKernelMemoryAccounting(path)
}
func joinMemory(c *configs.Cgroup, pid int) error {
_, err := join(c, "memory", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
// systemd does not atm set up the cpuset controller, so we must manually
// join it. Additionally that is a very finicky controller where each
// level must have a full setup as the default for a new directory is "no cpus"
func joinCpuset(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "cpuset")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
s := &fs.CpusetGroup{}
return s.ApplyDir(path, c, pid)
}
// `BlockIODeviceWeight` property of systemd does not work properly, and systemd
// expects device path instead of major minor numbers, which is also confusing
// for users. So we use fs work around for now.
func joinBlkio(c *configs.Cgroup, pid int) error {
_, err := join(c, "blkio", pid)
// isUnitExists returns true if the error is that a systemd unit already exists.
func isUnitExists(err error) bool {
if err != nil {
return err
if dbusError, ok := err.(dbus.Error); ok {
return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
}
}
return nil
}
func joinHugetlb(c *configs.Cgroup, pid int) error {
_, err := join(c, "hugetlb", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinPerfEvent(c *configs.Cgroup, pid int) error {
_, err := join(c, "perf_event", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
return false
}

View File

@@ -16,37 +16,24 @@ import (
"github.com/docker/go-units"
)
const cgroupNamePrefix = "name="
const (
cgroupNamePrefix = "name="
CgroupProcesses = "cgroup.procs"
)
// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(subsystem string) (string, error) {
// We are not using mount.GetMounts() because it's super-inefficient,
// parsing it directly sped up x10 times because of not using Sscanf.
// It was one of two major performance drawbacks in container start.
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", err
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
txt := scanner.Text()
fields := strings.Split(txt, " ")
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if opt == subsystem {
return fields[4], nil
}
}
}
if err := scanner.Err(); err != nil {
return "", err
}
return "", NewNotFoundError(subsystem)
mnt, _, err := FindCgroupMountpointAndRoot(subsystem)
return mnt, err
}
func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
// We are not using mount.GetMounts() because it's super-inefficient,
// parsing it directly sped up x10 times because of not using Sscanf.
// It was one of two major performance drawbacks in container start.
if !isSubsystemAvailable(subsystem) {
return "", "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", "", err
@@ -70,6 +57,30 @@ func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
return "", "", NewNotFoundError(subsystem)
}
func isSubsystemAvailable(subsystem string) bool {
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return false
}
_, avail := cgroups[subsystem]
return avail
}
func GetClosestMountpointAncestor(dir, mountinfo string) string {
deepestMountPoint := ""
for _, mountInfoEntry := range strings.Split(mountinfo, "\n") {
mountInfoParts := strings.Fields(mountInfoEntry)
if len(mountInfoParts) < 5 {
continue
}
mountPoint := mountInfoParts[4]
if strings.HasPrefix(mountPoint, deepestMountPoint) && strings.HasPrefix(dir, mountPoint) {
deepestMountPoint = mountPoint
}
}
return deepestMountPoint
}
func FindCgroupMountpointDir() (string, error) {
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
@@ -113,7 +124,7 @@ type Mount struct {
Subsystems []string
}
func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) {
func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
if len(m.Subsystems) == 0 {
return "", fmt.Errorf("no subsystem for mount")
}
@@ -121,16 +132,17 @@ func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) {
return getControllerPath(m.Subsystems[0], cgroups)
}
func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
res := make([]Mount, 0, len(ss))
scanner := bufio.NewScanner(mi)
for scanner.Scan() {
numFound := 0
for scanner.Scan() && numFound < len(ss) {
txt := scanner.Text()
sepIdx := strings.IndexByte(txt, '-')
sepIdx := strings.Index(txt, " - ")
if sepIdx == -1 {
return nil, fmt.Errorf("invalid mountinfo format")
}
if txt[sepIdx+2:sepIdx+8] != "cgroup" {
if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
continue
}
fields := strings.Split(txt, " ")
@@ -139,12 +151,17 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
Root: fields[3],
}
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if !ss[opt] {
continue
}
if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
}
if ss[opt] {
} else {
m.Subsystems = append(m.Subsystems, opt)
}
if !all {
numFound++
}
}
res = append(res, m)
}
@@ -154,26 +171,28 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
return res, nil
}
func GetCgroupMounts() ([]Mount, error) {
// GetCgroupMounts returns the mounts for the cgroup subsystems.
// all indicates whether to return just the first instance or all the mounts.
func GetCgroupMounts(all bool) ([]Mount, error) {
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return nil, err
}
defer f.Close()
all, err := GetAllSubsystems()
allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return nil, err
}
allMap := make(map[string]bool)
for _, s := range all {
for s := range allSubsystems {
allMap[s] = true
}
return getCgroupMountsHelper(allMap, f)
return getCgroupMountsHelper(allMap, f, all)
}
// Returns all the cgroup subsystems supported by the kernel
// GetAllSubsystems returns all the cgroup subsystems supported by the kernel
func GetAllSubsystems() ([]string, error) {
f, err := os.Open("/proc/cgroups")
if err != nil {
@@ -185,9 +204,6 @@ func GetAllSubsystems() ([]string, error) {
s := bufio.NewScanner(f)
for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
text := s.Text()
if text[0] != '#' {
parts := strings.Fields(text)
@@ -196,11 +212,14 @@ func GetAllSubsystems() ([]string, error) {
}
}
}
if err := s.Err(); err != nil {
return nil, err
}
return subsystems, nil
}
// Returns the relative path to the cgroup docker is running in.
func GetThisCgroupDir(subsystem string) (string, error) {
// GetOwnCgroup returns the relative path to the cgroup docker is running in.
func GetOwnCgroup(subsystem string) (string, error) {
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return "", err
@@ -209,8 +228,16 @@ func GetThisCgroupDir(subsystem string) (string, error) {
return getControllerPath(subsystem, cgroups)
}
func GetInitCgroupDir(subsystem string) (string, error) {
func GetOwnCgroupPath(subsystem string) (string, error) {
cgroup, err := GetOwnCgroup(subsystem)
if err != nil {
return "", err
}
return getCgroupPathHelper(subsystem, cgroup)
}
func GetInitCgroup(subsystem string) (string, error) {
cgroups, err := ParseCgroupFile("/proc/1/cgroup")
if err != nil {
return "", err
@@ -219,8 +246,33 @@ func GetInitCgroupDir(subsystem string) (string, error) {
return getControllerPath(subsystem, cgroups)
}
func GetInitCgroupPath(subsystem string) (string, error) {
cgroup, err := GetInitCgroup(subsystem)
if err != nil {
return "", err
}
return getCgroupPathHelper(subsystem, cgroup)
}
func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
mnt, root, err := FindCgroupMountpointAndRoot(subsystem)
if err != nil {
return "", err
}
// This is needed for nested containers, because in /proc/self/cgroup we
// see pathes from host, which don't exist in container.
relCgroup, err := filepath.Rel(root, cgroup)
if err != nil {
return "", err
}
return filepath.Join(mnt, relCgroup), nil
}
func readProcsFile(dir string) ([]int, error) {
f, err := os.Open(filepath.Join(dir, "cgroup.procs"))
f, err := os.Open(filepath.Join(dir, CgroupProcesses))
if err != nil {
return nil, err
}
@@ -243,6 +295,8 @@ func readProcsFile(dir string) ([]int, error) {
return out, nil
}
// ParseCgroupFile parses the given cgroup file, typically from
// /proc/<pid>/cgroup, into a map of subgroups to cgroup names.
func ParseCgroupFile(path string) (map[string]string, error) {
f, err := os.Open(path)
if err != nil {
@@ -250,21 +304,35 @@ func ParseCgroupFile(path string) (map[string]string, error) {
}
defer f.Close()
s := bufio.NewScanner(f)
return parseCgroupFromReader(f)
}
// helper function for ParseCgroupFile to make testing easier
func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
s := bufio.NewScanner(r)
cgroups := make(map[string]string)
for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
text := s.Text()
parts := strings.Split(text, ":")
// from cgroups(7):
// /proc/[pid]/cgroup
// ...
// For each cgroup hierarchy ... there is one entry
// containing three colon-separated fields of the form:
// hierarchy-ID:subsystem-list:cgroup-path
parts := strings.SplitN(text, ":", 3)
if len(parts) < 3 {
return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text)
}
for _, subs := range strings.Split(parts[1], ",") {
cgroups[subs] = parts[2]
}
}
if err := s.Err(); err != nil {
return nil, err
}
return cgroups, nil
}
@@ -291,8 +359,7 @@ func PathExists(path string) bool {
func EnterPid(cgroupPaths map[string]string, pid int) error {
for _, path := range cgroupPaths {
if PathExists(path) {
if err := ioutil.WriteFile(filepath.Join(path, "cgroup.procs"),
[]byte(strconv.Itoa(pid)), 0700); err != nil {
if err := WriteCgroupProc(path, pid); err != nil {
return err
}
}
@@ -326,7 +393,7 @@ func RemovePaths(paths map[string]string) (err error) {
return nil
}
}
return fmt.Errorf("Failed to remove paths: %s", paths)
return fmt.Errorf("Failed to remove paths: %v", paths)
}
func GetHugePageSize() ([]string, error) {
@@ -361,7 +428,7 @@ func GetAllPids(path string) ([]int, error) {
// collect pids from all sub-cgroups
err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {
dir, file := filepath.Split(p)
if file != "cgroup.procs" {
if file != CgroupProcesses {
return nil
}
if iErr != nil {
@@ -376,3 +443,20 @@ func GetAllPids(path string) ([]int, error) {
})
return pids, err
}
// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
func WriteCgroupProc(dir string, pid int) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s", CgroupProcesses)
}
// Dont attach any pid to the cgroup if -1 is specified as a pid
if pid != -1 {
if err := ioutil.WriteFile(filepath.Join(dir, CgroupProcesses), []byte(strconv.Itoa(pid)), 0700); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
}
}
return nil
}

View File

@@ -1,138 +0,0 @@
package cgroups
import (
"bytes"
"strings"
"testing"
)
const fedoraMountinfo = `15 35 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
16 35 0:14 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel
17 35 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8056484k,nr_inodes=2014121,mode=755
18 16 0:15 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
19 16 0:13 / /sys/fs/selinux rw,relatime shared:8 - selinuxfs selinuxfs rw
20 17 0:16 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel
21 17 0:10 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000
22 35 0:17 / /run rw,nosuid,nodev shared:21 - tmpfs tmpfs rw,seclabel,mode=755
23 16 0:18 / /sys/fs/cgroup rw,nosuid,nodev,noexec shared:9 - tmpfs tmpfs rw,seclabel,mode=755
24 23 0:19 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
25 16 0:20 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw
26 23 0:21 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuset,clone_children
27 23 0:22 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,cpuacct,cpu,clone_children
28 23 0:23 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,memory,clone_children
29 23 0:24 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,devices,clone_children
30 23 0:25 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,freezer,clone_children
31 23 0:26 / /sys/fs/cgroup/net_cls rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,net_cls,clone_children
32 23 0:27 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,blkio,clone_children
33 23 0:28 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,perf_event,clone_children
34 23 0:29 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,hugetlb,clone_children
35 1 253:2 / / rw,relatime shared:1 - ext4 /dev/mapper/ssd-root--f20 rw,seclabel,data=ordered
36 15 0:30 / /proc/sys/fs/binfmt_misc rw,relatime shared:22 - autofs systemd-1 rw,fd=38,pgrp=1,timeout=300,minproto=5,maxproto=5,direct
37 17 0:12 / /dev/mqueue rw,relatime shared:23 - mqueue mqueue rw,seclabel
38 35 0:31 / /tmp rw shared:24 - tmpfs tmpfs rw,seclabel
39 17 0:32 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw,seclabel
40 16 0:7 / /sys/kernel/debug rw,relatime shared:26 - debugfs debugfs rw
41 16 0:33 / /sys/kernel/config rw,relatime shared:27 - configfs configfs rw
42 35 0:34 / /var/lib/nfs/rpc_pipefs rw,relatime shared:28 - rpc_pipefs sunrpc rw
43 15 0:35 / /proc/fs/nfsd rw,relatime shared:29 - nfsd sunrpc rw
45 35 8:17 / /boot rw,relatime shared:30 - ext4 /dev/sdb1 rw,seclabel,data=ordered
46 35 253:4 / /home rw,relatime shared:31 - ext4 /dev/mapper/ssd-home rw,seclabel,data=ordered
47 35 253:5 / /var/lib/libvirt/images rw,noatime,nodiratime shared:32 - ext4 /dev/mapper/ssd-virt rw,seclabel,discard,data=ordered
48 35 253:12 / /mnt/old rw,relatime shared:33 - ext4 /dev/mapper/HelpDeskRHEL6-FedoraRoot rw,seclabel,data=ordered
121 22 0:36 / /run/user/1000/gvfs rw,nosuid,nodev,relatime shared:104 - fuse.gvfsd-fuse gvfsd-fuse rw,user_id=1000,group_id=1000
124 16 0:37 / /sys/fs/fuse/connections rw,relatime shared:107 - fusectl fusectl rw
165 38 253:3 / /tmp/mnt rw,relatime shared:147 - ext4 /dev/mapper/ssd-root rw,seclabel,data=ordered
167 35 253:15 / /var/lib/docker/devicemapper/mnt/aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,relatime shared:149 - ext4 /dev/mapper/docker-253:2-425882-aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,seclabel,discard,stripe=16,data=ordered
171 35 253:16 / /var/lib/docker/devicemapper/mnt/c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,relatime shared:153 - ext4 /dev/mapper/docker-253:2-425882-c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,seclabel,discard,stripe=16,data=ordered
175 35 253:17 / /var/lib/docker/devicemapper/mnt/1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,relatime shared:157 - ext4 /dev/mapper/docker-253:2-425882-1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,seclabel,discard,stripe=16,data=ordered
179 35 253:18 / /var/lib/docker/devicemapper/mnt/d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,relatime shared:161 - ext4 /dev/mapper/docker-253:2-425882-d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,seclabel,discard,stripe=16,data=ordered
183 35 253:19 / /var/lib/docker/devicemapper/mnt/6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,relatime shared:165 - ext4 /dev/mapper/docker-253:2-425882-6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,seclabel,discard,stripe=16,data=ordered
187 35 253:20 / /var/lib/docker/devicemapper/mnt/8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,relatime shared:169 - ext4 /dev/mapper/docker-253:2-425882-8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,seclabel,discard,stripe=16,data=ordered
191 35 253:21 / /var/lib/docker/devicemapper/mnt/c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,relatime shared:173 - ext4 /dev/mapper/docker-253:2-425882-c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,seclabel,discard,stripe=16,data=ordered
195 35 253:22 / /var/lib/docker/devicemapper/mnt/2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,relatime shared:177 - ext4 /dev/mapper/docker-253:2-425882-2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,seclabel,discard,stripe=16,data=ordered
199 35 253:23 / /var/lib/docker/devicemapper/mnt/37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,relatime shared:181 - ext4 /dev/mapper/docker-253:2-425882-37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,seclabel,discard,stripe=16,data=ordered
203 35 253:24 / /var/lib/docker/devicemapper/mnt/aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,relatime shared:185 - ext4 /dev/mapper/docker-253:2-425882-aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,seclabel,discard,stripe=16,data=ordered
207 35 253:25 / /var/lib/docker/devicemapper/mnt/928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,relatime shared:189 - ext4 /dev/mapper/docker-253:2-425882-928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,seclabel,discard,stripe=16,data=ordered
211 35 253:26 / /var/lib/docker/devicemapper/mnt/0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,relatime shared:193 - ext4 /dev/mapper/docker-253:2-425882-0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,seclabel,discard,stripe=16,data=ordered
215 35 253:27 / /var/lib/docker/devicemapper/mnt/d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,relatime shared:197 - ext4 /dev/mapper/docker-253:2-425882-d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,seclabel,discard,stripe=16,data=ordered
219 35 253:28 / /var/lib/docker/devicemapper/mnt/bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,relatime shared:201 - ext4 /dev/mapper/docker-253:2-425882-bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,seclabel,discard,stripe=16,data=ordered
223 35 253:29 / /var/lib/docker/devicemapper/mnt/7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,relatime shared:205 - ext4 /dev/mapper/docker-253:2-425882-7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,seclabel,discard,stripe=16,data=ordered
227 35 253:30 / /var/lib/docker/devicemapper/mnt/c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,relatime shared:209 - ext4 /dev/mapper/docker-253:2-425882-c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,seclabel,discard,stripe=16,data=ordered
231 35 253:31 / /var/lib/docker/devicemapper/mnt/8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,relatime shared:213 - ext4 /dev/mapper/docker-253:2-425882-8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,seclabel,discard,stripe=16,data=ordered
235 35 253:32 / /var/lib/docker/devicemapper/mnt/1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,relatime shared:217 - ext4 /dev/mapper/docker-253:2-425882-1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,seclabel,discard,stripe=16,data=ordered
239 35 253:33 / /var/lib/docker/devicemapper/mnt/e9aa60c60128cad1 rw,relatime shared:221 - ext4 /dev/mapper/docker-253:2-425882-e9aa60c60128cad1 rw,seclabel,discard,stripe=16,data=ordered
243 35 253:34 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,relatime shared:225 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,seclabel,discard,stripe=16,data=ordered
247 35 253:35 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,relatime shared:229 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,seclabel,discard,stripe=16,data=ordered
31 21 0:23 / /DATA/foo_bla_bla rw,relatime - cifs //foo/BLA\040BLA\040BLA/ rw,sec=ntlm,cache=loose,unc=\\foo\BLA BLA BLA,username=my_login,domain=mydomain.com,uid=12345678,forceuid,gid=12345678,forcegid,addr=10.1.30.10,file_mode=0755,dir_mode=0755,nounix,rsize=61440,wsize=65536,actimeo=1`
func TestGetCgroupMounts(t *testing.T) {
subsystems := map[string]bool{
"cpuset": true,
"cpu": true,
"cpuacct": true,
"memory": true,
"devices": true,
"freezer": true,
"net_cls": true,
"blkio": true,
"perf_event": true,
"hugetlb": true,
}
mi := bytes.NewBufferString(fedoraMountinfo)
cgMounts, err := getCgroupMountsHelper(subsystems, mi)
if err != nil {
t.Fatal(err)
}
cgMap := make(map[string]Mount)
for _, m := range cgMounts {
for _, ss := range m.Subsystems {
cgMap[ss] = m
}
}
for ss := range subsystems {
m, ok := cgMap[ss]
if !ok {
t.Fatalf("%s not found", ss)
}
if m.Root != "/" {
t.Fatalf("unexpected root for %s: %s", ss, m.Root)
}
if !strings.HasPrefix(m.Mountpoint, "/sys/fs/cgroup/") && !strings.Contains(m.Mountpoint, ss) {
t.Fatalf("unexpected mountpoint for %s: %s", ss, m.Mountpoint)
}
var ssFound bool
for _, mss := range m.Subsystems {
if mss == ss {
ssFound = true
break
}
}
if !ssFound {
t.Fatalf("subsystem %s not found in Subsystems field %v", ss, m.Subsystems)
}
}
}
func BenchmarkGetCgroupMounts(b *testing.B) {
subsystems := map[string]bool{
"cpuset": true,
"cpu": true,
"cpuacct": true,
"memory": true,
"devices": true,
"freezer": true,
"net_cls": true,
"blkio": true,
"perf_event": true,
"hugetlb": true,
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
b.StopTimer()
mi := bytes.NewBufferString(fedoraMountinfo)
b.StartTimer()
if _, err := getCgroupMountsHelper(subsystems, mi); err != nil {
b.Fatal(err)
}
}
}

View File

@@ -1,5 +1,3 @@
// +build linux freebsd
package configs
type FreezerState string
@@ -22,7 +20,7 @@ type Cgroup struct {
// The path is assumed to be relative to the host system cgroup mountpoint.
Path string `json:"path"`
// ScopePrefix decribes prefix for the scope name
// ScopePrefix describes prefix for the scope name
ScopePrefix string `json:"scope_prefix"`
// Paths represent the absolute cgroups paths to join.
@@ -36,7 +34,7 @@ type Cgroup struct {
type Resources struct {
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
// Deprecated
AllowAllDevices bool `json:"allow_all_devices,omitempty"`
AllowAllDevices *bool `json:"allow_all_devices,omitempty"`
// Deprecated
AllowedDevices []*Device `json:"allowed_devices,omitempty"`
// Deprecated
@@ -56,20 +54,23 @@ type Resources struct {
// Kernel memory limit (in bytes)
KernelMemory int64 `json:"kernel_memory"`
// Kernel memory limit for TCP use (in bytes)
KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
// CPU shares (relative weight vs. other containers)
CpuShares int64 `json:"cpu_shares"`
CpuShares uint64 `json:"cpu_shares"`
// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
CpuQuota int64 `json:"cpu_quota"`
// CPU period to be used for hardcapping (in usecs). 0 to use system default.
CpuPeriod int64 `json:"cpu_period"`
CpuPeriod uint64 `json:"cpu_period"`
// How many time CPU will use in realtime scheduling (in usecs).
CpuRtRuntime int64 `json:"cpu_quota"`
CpuRtRuntime int64 `json:"cpu_rt_quota"`
// CPU period to be used for realtime scheduling (in usecs).
CpuRtPeriod int64 `json:"cpu_period"`
CpuRtPeriod uint64 `json:"cpu_rt_period"`
// CPU to use
CpusetCpus string `json:"cpuset_cpus"`
@@ -92,7 +93,7 @@ type Resources struct {
// IO read rate limit per cgroup per device, bytes per second.
BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"`
// IO write rate limit per cgroup per divice, bytes per second.
// IO write rate limit per cgroup per device, bytes per second.
BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"`
// IO read rate limit per cgroup per device, IO per second.
@@ -111,11 +112,11 @@ type Resources struct {
OomKillDisable bool `json:"oom_kill_disable"`
// Tuning swappiness behaviour per cgroup
MemorySwappiness int64 `json:"memory_swappiness"`
MemorySwappiness *uint64 `json:"memory_swappiness"`
// Set priority of network traffic for container
NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`
// Set class identifier for container's network packets
NetClsClassid string `json:"net_cls_classid"`
NetClsClassid uint32 `json:"net_cls_classid_u"`
}

View File

@@ -1,6 +0,0 @@
// +build !windows,!linux,!freebsd
package configs
type Cgroup struct {
}

View File

@@ -3,7 +3,13 @@ package configs
import (
"bytes"
"encoding/json"
"fmt"
"os/exec"
"time"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
)
type Rlimit struct {
@@ -29,7 +35,7 @@ type Seccomp struct {
Syscalls []*Syscall `json:"syscalls"`
}
// An action to be taken upon rule match in Seccomp
// Action is taken upon rule match in Seccomp
type Action int
const (
@@ -40,7 +46,7 @@ const (
Trace
)
// A comparison operator to be used when matching syscall arguments in Seccomp
// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
type Operator int
const (
@@ -53,7 +59,7 @@ const (
MaskEqualTo
)
// A rule to match a specific syscall argument in Seccomp
// Arg is a rule to match a specific syscall argument in Seccomp
type Arg struct {
Index uint `json:"index"`
Value uint64 `json:"value"`
@@ -61,7 +67,7 @@ type Arg struct {
Op Operator `json:"op"`
}
// An rule to match a syscall in Seccomp
// Syscall is a rule to match a syscall in Seccomp
type Syscall struct {
Name string `json:"name"`
Action Action `json:"action"`
@@ -81,11 +87,6 @@ type Config struct {
// that the parent process dies.
ParentDeathSignal int `json:"parent_death_signal"`
// PivotDir allows a custom directory inside the container's root filesystem to be used as pivot, when NoPivotRoot is not set.
// When a custom PivotDir not set, a temporary dir inside the root filesystem will be used. The pivot dir needs to be writeable.
// This is required when using read only root filesystems. In these cases, a read/writeable path can be (bind) mounted somewhere inside the root filesystem to act as pivot.
PivotDir string `json:"pivot_dir"`
// Path to a directory containing the container's root filesystem.
Rootfs string `json:"rootfs"`
@@ -113,8 +114,8 @@ type Config struct {
Namespaces Namespaces `json:"namespaces"`
// Capabilities specify the capabilities to keep when executing the process inside the container
// All capbilities not specified will be dropped from the processes capability mask
Capabilities []string `json:"capabilities"`
// All capabilities not specified will be dropped from the processes capability mask
Capabilities *Capabilities `json:"capabilities"`
// Networks specifies the container's network setup to be created
Networks []*Network `json:"networks"`
@@ -128,15 +129,15 @@ type Config struct {
// AppArmorProfile specifies the profile to apply to the process running in the container and is
// change at the time the process is execed
AppArmorProfile string `json:"apparmor_profile"`
AppArmorProfile string `json:"apparmor_profile,omitempty"`
// ProcessLabel specifies the label to apply to the process running in the container. It is
// commonly used by selinux
ProcessLabel string `json:"process_label"`
ProcessLabel string `json:"process_label,omitempty"`
// Rlimits specifies the resource limits, such as max open files, to set in the container
// If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []Rlimit `json:"rlimits"`
Rlimits []Rlimit `json:"rlimits,omitempty"`
// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
// for a process. Valid values are between the range [-1000, '1000'], where processes with
@@ -144,10 +145,6 @@ type Config struct {
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj int `json:"oom_score_adj"`
// AdditionalGroups specifies the gids that should be added to supplementary groups
// in addition to those that the user belongs to.
AdditionalGroups []string `json:"additional_groups"`
// UidMappings is an array of User ID mappings for User Namespaces
UidMappings []IDMap `json:"uid_mappings"`
@@ -171,12 +168,29 @@ type Config struct {
// A default action to be taken if no rules match is also given.
Seccomp *Seccomp `json:"seccomp"`
// NoNewPrivileges controls whether processes in the container can gain additional privileges.
NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
// Hooks are a collection of actions to perform at various container lifecycle events.
// Hooks are not able to be marshaled to json but they are also not needed to.
Hooks *Hooks `json:"-"`
// CommandHooks are serialized to JSON, but other hooks are not.
Hooks *Hooks
// Version is the version of opencontainer specification that is supported.
Version string `json:"version"`
// Labels are user defined metadata that is stored in the config and populated on the state
Labels []string `json:"labels"`
// NoNewKeyring will not allocated a new session keyring for the container. It will use the
// callers keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"`
// Rootless specifies whether the container is a rootless container.
Rootless bool `json:"rootless"`
// IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into
// to limit the resources (e.g., L3 cache) the container has available
IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
}
type Hooks struct {
@@ -191,20 +205,74 @@ type Hooks struct {
Poststop []Hook
}
// HookState is the payload provided to a hook on execution.
type HookState struct {
Version string `json:"version"`
ID string `json:"id"`
Pid int `json:"pid"`
Root string `json:"root"`
type Capabilities struct {
// Bounding is the set of capabilities checked by the kernel.
Bounding []string
// Effective is the set of capabilities checked by the kernel.
Effective []string
// Inheritable is the capabilities preserved across execve.
Inheritable []string
// Permitted is the limiting superset for effective capabilities.
Permitted []string
// Ambient is the ambient set of capabilities that are kept.
Ambient []string
}
func (hooks *Hooks) UnmarshalJSON(b []byte) error {
var state struct {
Prestart []CommandHook
Poststart []CommandHook
Poststop []CommandHook
}
if err := json.Unmarshal(b, &state); err != nil {
return err
}
deserialize := func(shooks []CommandHook) (hooks []Hook) {
for _, shook := range shooks {
hooks = append(hooks, shook)
}
return hooks
}
hooks.Prestart = deserialize(state.Prestart)
hooks.Poststart = deserialize(state.Poststart)
hooks.Poststop = deserialize(state.Poststop)
return nil
}
func (hooks Hooks) MarshalJSON() ([]byte, error) {
serialize := func(hooks []Hook) (serializableHooks []CommandHook) {
for _, hook := range hooks {
switch chook := hook.(type) {
case CommandHook:
serializableHooks = append(serializableHooks, chook)
default:
logrus.Warnf("cannot serialize hook of type %T, skipping", hook)
}
}
return serializableHooks
}
return json.Marshal(map[string]interface{}{
"prestart": serialize(hooks.Prestart),
"poststart": serialize(hooks.Poststart),
"poststop": serialize(hooks.Poststop),
})
}
// HookState is the payload provided to a hook on execution.
type HookState specs.State
type Hook interface {
// Run executes the hook with the provided state.
Run(HookState) error
}
// NewFunctionHooks will call the provided function when the hook is run.
// NewFunctionHook will call the provided function when the hook is run.
func NewFunctionHook(f func(HookState) error) FuncHook {
return FuncHook{
run: f,
@@ -220,13 +288,14 @@ func (f FuncHook) Run(s HookState) error {
}
type Command struct {
Path string `json:"path"`
Args []string `json:"args"`
Env []string `json:"env"`
Dir string `json:"dir"`
Path string `json:"path"`
Args []string `json:"args"`
Env []string `json:"env"`
Dir string `json:"dir"`
Timeout *time.Duration `json:"timeout"`
}
// NewCommandHooks will execute the provided command when the hook is run.
// NewCommandHook will execute the provided command when the hook is run.
func NewCommandHook(cmd Command) CommandHook {
return CommandHook{
Command: cmd,
@@ -242,11 +311,38 @@ func (c Command) Run(s HookState) error {
if err != nil {
return err
}
var stdout, stderr bytes.Buffer
cmd := exec.Cmd{
Path: c.Path,
Args: c.Args,
Env: c.Env,
Stdin: bytes.NewReader(b),
Path: c.Path,
Args: c.Args,
Env: c.Env,
Stdin: bytes.NewReader(b),
Stdout: &stdout,
Stderr: &stderr,
}
if err := cmd.Start(); err != nil {
return err
}
errC := make(chan error, 1)
go func() {
err := cmd.Wait()
if err != nil {
err = fmt.Errorf("error running hook: %v, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
}
errC <- err
}()
var timerCh <-chan time.Time
if c.Timeout != nil {
timer := time.NewTimer(*c.Timeout)
defer timer.Stop()
timerCh = timer.C
}
select {
case err := <-errC:
return err
case <-timerCh:
cmd.Process.Kill()
cmd.Wait()
return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
}
return cmd.Run()
}

View File

@@ -0,0 +1,61 @@
package configs
import "fmt"
// HostUID gets the translated uid for the process on host which could be
// different when user namespaces are enabled.
func (c Config) HostUID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.UidMappings == nil {
return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.")
}
id, found := c.hostIDFromMapping(containerId, c.UidMappings)
if !found {
return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.")
}
return id, nil
}
// Return unchanged id.
return containerId, nil
}
// HostRootUID gets the root uid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostRootUID() (int, error) {
return c.HostUID(0)
}
// HostGID gets the translated gid for the process on host which could be
// different when user namespaces are enabled.
func (c Config) HostGID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.GidMappings == nil {
return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.")
}
id, found := c.hostIDFromMapping(containerId, c.GidMappings)
if !found {
return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.")
}
return id, nil
}
// Return unchanged id.
return containerId, nil
}
// HostRootGID gets the root gid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostRootGID() (int, error) {
return c.HostGID(0)
}
// Utility function that gets a host ID for a container ID from user namespace map
// if that ID is present in the map.
func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) {
for _, m := range uMap {
if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) {
hostID := m.HostID + (containerID - m.ContainerID)
return hostID, true
}
}
return -1, false
}

View File

@@ -1,51 +0,0 @@
// +build freebsd linux
package configs
import "fmt"
// Gets the root uid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostUID() (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.UidMappings == nil {
return -1, fmt.Errorf("User namespaces enabled, but no user mappings found.")
}
id, found := c.hostIDFromMapping(0, c.UidMappings)
if !found {
return -1, fmt.Errorf("User namespaces enabled, but no root user mapping found.")
}
return id, nil
}
// Return default root uid 0
return 0, nil
}
// Gets the root gid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostGID() (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.GidMappings == nil {
return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.")
}
id, found := c.hostIDFromMapping(0, c.GidMappings)
if !found {
return -1, fmt.Errorf("User namespaces enabled, but no root group mapping found.")
}
return id, nil
}
// Return default root gid 0
return 0, nil
}
// Utility function that gets a host ID for a container ID from user namespace map
// if that ID is present in the map.
func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) {
for _, m := range uMap {
if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) {
hostID := m.HostID + (containerID - m.ContainerID)
return hostID, true
}
}
return -1, false
}

View File

@@ -1,156 +0,0 @@
// +build linux freebsd
package configs
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"testing"
)
// Checks whether the expected capability is specified in the capabilities.
func contains(expected string, values []string) bool {
for _, v := range values {
if v == expected {
return true
}
}
return false
}
func containsDevice(expected *Device, values []*Device) bool {
for _, d := range values {
if d.Path == expected.Path &&
d.Permissions == expected.Permissions &&
d.FileMode == expected.FileMode &&
d.Major == expected.Major &&
d.Minor == expected.Minor &&
d.Type == expected.Type {
return true
}
}
return false
}
func loadConfig(name string) (*Config, error) {
f, err := os.Open(filepath.Join("../sample_configs", name))
if err != nil {
return nil, err
}
defer f.Close()
var container *Config
if err := json.NewDecoder(f).Decode(&container); err != nil {
return nil, err
}
// Check that a config doesn't contain extra fields
var configMap, abstractMap map[string]interface{}
if _, err := f.Seek(0, 0); err != nil {
return nil, err
}
if err := json.NewDecoder(f).Decode(&abstractMap); err != nil {
return nil, err
}
configData, err := json.Marshal(&container)
if err != nil {
return nil, err
}
if err := json.Unmarshal(configData, &configMap); err != nil {
return nil, err
}
for k := range configMap {
delete(abstractMap, k)
}
if len(abstractMap) != 0 {
return nil, fmt.Errorf("unknown fields: %s", abstractMap)
}
return container, nil
}
func TestRemoveNamespace(t *testing.T) {
ns := Namespaces{
{Type: NEWNET},
}
if !ns.Remove(NEWNET) {
t.Fatal("NEWNET was not removed")
}
if len(ns) != 0 {
t.Fatalf("namespaces should have 0 items but reports %d", len(ns))
}
}
func TestHostUIDNoUSERNS(t *testing.T) {
config := &Config{
Namespaces: Namespaces{},
}
uid, err := config.HostUID()
if err != nil {
t.Fatal(err)
}
if uid != 0 {
t.Fatalf("expected uid 0 with no USERNS but received %d", uid)
}
}
func TestHostUIDWithUSERNS(t *testing.T) {
config := &Config{
Namespaces: Namespaces{{Type: NEWUSER}},
UidMappings: []IDMap{
{
ContainerID: 0,
HostID: 1000,
Size: 1,
},
},
}
uid, err := config.HostUID()
if err != nil {
t.Fatal(err)
}
if uid != 1000 {
t.Fatalf("expected uid 1000 with no USERNS but received %d", uid)
}
}
func TestHostGIDNoUSERNS(t *testing.T) {
config := &Config{
Namespaces: Namespaces{},
}
uid, err := config.HostGID()
if err != nil {
t.Fatal(err)
}
if uid != 0 {
t.Fatalf("expected gid 0 with no USERNS but received %d", uid)
}
}
func TestHostGIDWithUSERNS(t *testing.T) {
config := &Config{
Namespaces: Namespaces{{Type: NEWUSER}},
GidMappings: []IDMap{
{
ContainerID: 0,
HostID: 1000,
Size: 1,
},
},
}
uid, err := config.HostGID()
if err != nil {
t.Fatal(err)
}
if uid != 1000 {
t.Fatalf("expected gid 1000 with no USERNS but received %d", uid)
}
}

View File

@@ -1,3 +0,0 @@
package configs
// All current tests are for Unix-specific functionality

View File

@@ -1,9 +1,9 @@
// +build linux freebsd
// +build linux
package configs
var (
// These are devices that are to be both allowed and created.
// DefaultSimpleDevices are devices that are to be both allowed and created.
DefaultSimpleDevices = []*Device{
// /dev/null and zero
{
@@ -107,19 +107,5 @@ var (
Permissions: "rwm",
},
}, DefaultSimpleDevices...)
DefaultAutoCreatedDevices = append([]*Device{
{
// /dev/fuse is created but not allowed.
// This is to allow java to work. Because java
// Insists on there being a /dev/fuse
// https://github.com/docker/docker/issues/514
// https://github.com/docker/docker/issues/2393
//
Path: "/dev/fuse",
Type: 'c',
Major: 10,
Minor: 229,
Permissions: "rwm",
},
}, DefaultSimpleDevices...)
DefaultAutoCreatedDevices = append([]*Device{}, DefaultSimpleDevices...)
)

View File

@@ -0,0 +1,7 @@
package configs
type IntelRdt struct {
// The schema for L3 cache id and capacity bitmask (CBM)
// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
L3CacheSchema string `json:"l3_cache_schema,omitempty"`
}

View File

@@ -1,5 +1,11 @@
package configs
const (
// EXT_COPYUP is a directive to copy up the contents of a directory when
// a tmpfs is mounted over it.
EXT_COPYUP = 1 << iota
)
type Mount struct {
// Source path for the mount.
Source string `json:"source"`
@@ -22,6 +28,9 @@ type Mount struct {
// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
Relabel string `json:"relabel"`
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
// Optional Command to be run before Source is mounted.
PremountCmds []Command `json:"premount_cmds"`

View File

@@ -1,8 +1,10 @@
// +build linux freebsd
package configs
import "fmt"
import (
"fmt"
"os"
"sync"
)
const (
NEWNET NamespaceType = "NEWNET"
@@ -13,14 +15,59 @@ const (
NEWUSER NamespaceType = "NEWUSER"
)
var (
nsLock sync.Mutex
supportedNamespaces = make(map[NamespaceType]bool)
)
// NsName converts the namespace type to its filename
func NsName(ns NamespaceType) string {
switch ns {
case NEWNET:
return "net"
case NEWNS:
return "mnt"
case NEWPID:
return "pid"
case NEWIPC:
return "ipc"
case NEWUSER:
return "user"
case NEWUTS:
return "uts"
}
return ""
}
// IsNamespaceSupported returns whether a namespace is available or
// not
func IsNamespaceSupported(ns NamespaceType) bool {
nsLock.Lock()
defer nsLock.Unlock()
supported, ok := supportedNamespaces[ns]
if ok {
return supported
}
nsFile := NsName(ns)
// if the namespace type is unknown, just return false
if nsFile == "" {
return false
}
_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
// a namespace is supported if it exists and we have permissions to read it
supported = err == nil
supportedNamespaces[ns] = supported
return supported
}
func NamespaceTypes() []NamespaceType {
return []NamespaceType{
NEWUSER, // Keep user NS always first, don't move it.
NEWIPC,
NEWUTS,
NEWNET,
NEWPID,
NEWNS,
NEWUTS,
NEWIPC,
NEWUSER,
}
}
@@ -32,29 +79,7 @@ type Namespace struct {
}
func (n *Namespace) GetPath(pid int) string {
if n.Path != "" {
return n.Path
}
return fmt.Sprintf("/proc/%d/ns/%s", pid, n.file())
}
func (n *Namespace) file() string {
file := ""
switch n.Type {
case NEWNET:
file = "net"
case NEWNS:
file = "mnt"
case NEWPID:
file = "pid"
case NEWIPC:
file = "ipc"
case NEWUSER:
file = "user"
case NEWUTS:
file = "uts"
}
return file
return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
}
func (n *Namespaces) Remove(t NamespaceType) bool {
@@ -87,3 +112,11 @@ func (n *Namespaces) index(t NamespaceType) int {
func (n *Namespaces) Contains(t NamespaceType) bool {
return n.index(t) != -1
}
func (n *Namespaces) PathOf(t NamespaceType) string {
i := n.index(t)
if i == -1 {
return ""
}
return (*n)[i].Path
}

View File

@@ -2,23 +2,23 @@
package configs
import "syscall"
import "golang.org/x/sys/unix"
func (n *Namespace) Syscall() int {
return namespaceInfo[n.Type]
}
var namespaceInfo = map[NamespaceType]int{
NEWNET: syscall.CLONE_NEWNET,
NEWNS: syscall.CLONE_NEWNS,
NEWUSER: syscall.CLONE_NEWUSER,
NEWIPC: syscall.CLONE_NEWIPC,
NEWUTS: syscall.CLONE_NEWUTS,
NEWPID: syscall.CLONE_NEWPID,
NEWNET: unix.CLONE_NEWNET,
NEWNS: unix.CLONE_NEWNS,
NEWUSER: unix.CLONE_NEWUSER,
NEWIPC: unix.CLONE_NEWIPC,
NEWUTS: unix.CLONE_NEWUTS,
NEWPID: unix.CLONE_NEWPID,
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This functions returns flags only for new namespaces.
// flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
var flag int
for _, v := range *n {

View File

@@ -4,12 +4,10 @@ package configs
func (n *Namespace) Syscall() int {
panic("No namespace syscall support")
return 0
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This functions returns flags only for new namespaces.
// flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
panic("No namespace syscall support")
return uintptr(0)
}

View File

@@ -1,4 +1,4 @@
// +build !linux,!freebsd
// +build !linux
package configs

View File

@@ -1,93 +0,0 @@
package validate
import (
"fmt"
"os"
"path/filepath"
"github.com/opencontainers/runc/libcontainer/configs"
)
type Validator interface {
Validate(*configs.Config) error
}
func New() Validator {
return &ConfigValidator{}
}
type ConfigValidator struct {
}
func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.rootfs(config); err != nil {
return err
}
if err := v.network(config); err != nil {
return err
}
if err := v.hostname(config); err != nil {
return err
}
if err := v.security(config); err != nil {
return err
}
if err := v.usernamespace(config); err != nil {
return err
}
return nil
}
// rootfs validates the the rootfs is an absolute path and is not a symlink
// to the container's root filesystem.
func (v *ConfigValidator) rootfs(config *configs.Config) error {
cleaned, err := filepath.Abs(config.Rootfs)
if err != nil {
return err
}
if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil {
return err
}
if config.Rootfs != cleaned {
return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs)
}
return nil
}
func (v *ConfigValidator) network(config *configs.Config) error {
if !config.Namespaces.Contains(configs.NEWNET) {
if len(config.Networks) > 0 || len(config.Routes) > 0 {
return fmt.Errorf("unable to apply network settings without a private NET namespace")
}
}
return nil
}
func (v *ConfigValidator) hostname(config *configs.Config) error {
if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) {
return fmt.Errorf("unable to set hostname without a private UTS namespace")
}
return nil
}
func (v *ConfigValidator) security(config *configs.Config) error {
// restrict sys without mount namespace
if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) &&
!config.Namespaces.Contains(configs.NEWNS) {
return fmt.Errorf("unable to restrict sys entries without a private MNT namespace")
}
return nil
}
func (v *ConfigValidator) usernamespace(config *configs.Config) error {
if config.Namespaces.Contains(configs.NEWUSER) {
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
return fmt.Errorf("USER namespaces aren't enabled in the kernel")
}
} else {
if config.UidMappings != nil || config.GidMappings != nil {
return fmt.Errorf("User namespace mappings specified, but USER namespace isn't enabled in the config")
}
}
return nil
}

View File

@@ -0,0 +1,117 @@
package validate
import (
"fmt"
"os"
"reflect"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
)
var (
geteuid = os.Geteuid
getegid = os.Getegid
)
func (v *ConfigValidator) rootless(config *configs.Config) error {
if err := rootlessMappings(config); err != nil {
return err
}
if err := rootlessMount(config); err != nil {
return err
}
// XXX: We currently can't verify the user config at all, because
// configs.Config doesn't store the user-related configs. So this
// has to be verified by setupUser() in init_linux.go.
return nil
}
func hasIDMapping(id int, mappings []configs.IDMap) bool {
for _, m := range mappings {
if id >= m.ContainerID && id < m.ContainerID+m.Size {
return true
}
}
return false
}
func rootlessMappings(config *configs.Config) error {
if euid := geteuid(); euid != 0 {
if !config.Namespaces.Contains(configs.NEWUSER) {
return fmt.Errorf("rootless containers require user namespaces")
}
}
if len(config.UidMappings) == 0 {
return fmt.Errorf("rootless containers requires at least one UID mapping")
}
if len(config.GidMappings) == 0 {
return fmt.Errorf("rootless containers requires at least one UID mapping")
}
return nil
}
// cgroup verifies that the user isn't trying to set any cgroup limits or paths.
func rootlessCgroup(config *configs.Config) error {
// Nothing set at all.
if config.Cgroups == nil || config.Cgroups.Resources == nil {
return nil
}
// Used for comparing to the zero value.
left := reflect.ValueOf(*config.Cgroups.Resources)
right := reflect.Zero(left.Type())
// This is all we need to do, since specconv won't add cgroup options in
// rootless mode.
if !reflect.DeepEqual(left.Interface(), right.Interface()) {
return fmt.Errorf("cannot specify resource limits in rootless container")
}
return nil
}
// mount verifies that the user isn't trying to set up any mounts they don't have
// the rights to do. In addition, it makes sure that no mount has a `uid=` or
// `gid=` option that doesn't resolve to root.
func rootlessMount(config *configs.Config) error {
// XXX: We could whitelist allowed devices at this point, but I'm not
// convinced that's a good idea. The kernel is the best arbiter of
// access control.
for _, mount := range config.Mounts {
// Check that the options list doesn't contain any uid= or gid= entries
// that don't resolve to root.
for _, opt := range strings.Split(mount.Data, ",") {
if strings.HasPrefix(opt, "uid=") {
var uid int
n, err := fmt.Sscanf(opt, "uid=%d", &uid)
if n != 1 || err != nil {
// Ignore unknown mount options.
continue
}
if !hasIDMapping(uid, config.UidMappings) {
return fmt.Errorf("cannot specify uid= mount options for unmapped uid in rootless containers")
}
}
if strings.HasPrefix(opt, "gid=") {
var gid int
n, err := fmt.Sscanf(opt, "gid=%d", &gid)
if n != 1 || err != nil {
// Ignore unknown mount options.
continue
}
if !hasIDMapping(gid, config.GidMappings) {
return fmt.Errorf("cannot specify gid= mount options for unmapped gid in rootless containers")
}
}
}
}
return nil
}

View File

@@ -0,0 +1,212 @@
package validate
import (
"fmt"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
selinux "github.com/opencontainers/selinux/go-selinux"
)
type Validator interface {
Validate(*configs.Config) error
}
func New() Validator {
return &ConfigValidator{}
}
type ConfigValidator struct {
}
func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.rootfs(config); err != nil {
return err
}
if err := v.network(config); err != nil {
return err
}
if err := v.hostname(config); err != nil {
return err
}
if err := v.security(config); err != nil {
return err
}
if err := v.usernamespace(config); err != nil {
return err
}
if err := v.sysctl(config); err != nil {
return err
}
if err := v.intelrdt(config); err != nil {
return err
}
if config.Rootless {
if err := v.rootless(config); err != nil {
return err
}
}
return nil
}
// rootfs validates if the rootfs is an absolute path and is not a symlink
// to the container's root filesystem.
func (v *ConfigValidator) rootfs(config *configs.Config) error {
if _, err := os.Stat(config.Rootfs); err != nil {
if os.IsNotExist(err) {
return fmt.Errorf("rootfs (%s) does not exist", config.Rootfs)
}
return err
}
cleaned, err := filepath.Abs(config.Rootfs)
if err != nil {
return err
}
if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil {
return err
}
if filepath.Clean(config.Rootfs) != cleaned {
return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs)
}
return nil
}
func (v *ConfigValidator) network(config *configs.Config) error {
if !config.Namespaces.Contains(configs.NEWNET) {
if len(config.Networks) > 0 || len(config.Routes) > 0 {
return fmt.Errorf("unable to apply network settings without a private NET namespace")
}
}
return nil
}
func (v *ConfigValidator) hostname(config *configs.Config) error {
if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) {
return fmt.Errorf("unable to set hostname without a private UTS namespace")
}
return nil
}
func (v *ConfigValidator) security(config *configs.Config) error {
// restrict sys without mount namespace
if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) &&
!config.Namespaces.Contains(configs.NEWNS) {
return fmt.Errorf("unable to restrict sys entries without a private MNT namespace")
}
if config.ProcessLabel != "" && !selinux.GetEnabled() {
return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported")
}
return nil
}
func (v *ConfigValidator) usernamespace(config *configs.Config) error {
if config.Namespaces.Contains(configs.NEWUSER) {
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
return fmt.Errorf("USER namespaces aren't enabled in the kernel")
}
} else {
if config.UidMappings != nil || config.GidMappings != nil {
return fmt.Errorf("User namespace mappings specified, but USER namespace isn't enabled in the config")
}
}
return nil
}
// sysctl validates that the specified sysctl keys are valid or not.
// /proc/sys isn't completely namespaced and depending on which namespaces
// are specified, a subset of sysctls are permitted.
func (v *ConfigValidator) sysctl(config *configs.Config) error {
validSysctlMap := map[string]bool{
"kernel.msgmax": true,
"kernel.msgmnb": true,
"kernel.msgmni": true,
"kernel.sem": true,
"kernel.shmall": true,
"kernel.shmmax": true,
"kernel.shmmni": true,
"kernel.shm_rmid_forced": true,
}
for s := range config.Sysctl {
if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") {
if config.Namespaces.Contains(configs.NEWIPC) {
continue
} else {
return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s)
}
}
if strings.HasPrefix(s, "net.") {
if config.Namespaces.Contains(configs.NEWNET) {
if path := config.Namespaces.PathOf(configs.NEWNET); path != "" {
if err := checkHostNs(s, path); err != nil {
return err
}
}
continue
} else {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s)
}
}
return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s)
}
return nil
}
func (v *ConfigValidator) intelrdt(config *configs.Config) error {
if config.IntelRdt != nil {
if !intelrdt.IsEnabled() {
return fmt.Errorf("intelRdt is specified in config, but Intel RDT feature is not supported or enabled")
}
if config.IntelRdt.L3CacheSchema == "" {
return fmt.Errorf("intelRdt is specified in config, but intelRdt.l3CacheSchema is empty")
}
}
return nil
}
func isSymbolicLink(path string) (bool, error) {
fi, err := os.Lstat(path)
if err != nil {
return false, err
}
return fi.Mode()&os.ModeSymlink == os.ModeSymlink, nil
}
// checkHostNs checks whether network sysctl is used in host namespace.
func checkHostNs(sysctlConfig string, path string) error {
var currentProcessNetns = "/proc/self/ns/net"
// readlink on the current processes network namespace
destOfCurrentProcess, err := os.Readlink(currentProcessNetns)
if err != nil {
return fmt.Errorf("read soft link %q error", currentProcessNetns)
}
// First check if the provided path is a symbolic link
symLink, err := isSymbolicLink(path)
if err != nil {
return fmt.Errorf("could not check that %q is a symlink: %v", path, err)
}
if symLink == false {
// The provided namespace is not a symbolic link,
// it is not the host namespace.
return nil
}
// readlink on the path provided in the struct
destOfContainer, err := os.Readlink(path)
if err != nil {
return fmt.Errorf("read soft link %q error", path)
}
if destOfContainer == destOfCurrentProcess {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", sysctlConfig)
}
return nil
}

View File

@@ -0,0 +1,41 @@
package libcontainer
import (
"os"
"golang.org/x/sys/unix"
)
// mount initializes the console inside the rootfs mounting with the specified mount label
// and applying the correct ownership of the console.
func mountConsole(slavePath string) error {
oldMask := unix.Umask(0000)
defer unix.Umask(oldMask)
f, err := os.Create("/dev/console")
if err != nil && !os.IsExist(err) {
return err
}
if f != nil {
f.Close()
}
return unix.Mount(slavePath, "/dev/console", "bind", unix.MS_BIND, "")
}
// dupStdio opens the slavePath for the console and dups the fds to the current
// processes stdio, fd 0,1,2.
func dupStdio(slavePath string) error {
fd, err := unix.Open(slavePath, unix.O_RDWR, 0)
if err != nil {
return &os.PathError{
Op: "open",
Path: slavePath,
Err: err,
}
}
for _, i := range []int{0, 1, 2} {
if err := unix.Dup3(fd, i, 0); err != nil {
return err
}
}
return nil
}

View File

@@ -0,0 +1,166 @@
// Package libcontainer provides a native Go implementation for creating containers
// with namespaces, cgroups, capabilities, and filesystem access controls.
// It allows you to manage the lifecycle of the container performing additional operations
// after the container is created.
package libcontainer
import (
"os"
"time"
"github.com/opencontainers/runc/libcontainer/configs"
)
// Status is the status of a container.
type Status int
const (
// Created is the status that denotes the container exists but has not been run yet.
Created Status = iota
// Running is the status that denotes the container exists and is running.
Running
// Pausing is the status that denotes the container exists, it is in the process of being paused.
Pausing
// Paused is the status that denotes the container exists, but all its processes are paused.
Paused
// Stopped is the status that denotes the container does not have a created or running process.
Stopped
)
func (s Status) String() string {
switch s {
case Created:
return "created"
case Running:
return "running"
case Pausing:
return "pausing"
case Paused:
return "paused"
case Stopped:
return "stopped"
default:
return "unknown"
}
}
// BaseState represents the platform agnostic pieces relating to a
// running container's state
type BaseState struct {
// ID is the container ID.
ID string `json:"id"`
// InitProcessPid is the init process id in the parent namespace.
InitProcessPid int `json:"init_process_pid"`
// InitProcessStartTime is the init process start time in clock cycles since boot time.
InitProcessStartTime uint64 `json:"init_process_start"`
// Created is the unix timestamp for the creation time of the container in UTC
Created time.Time `json:"created"`
// Config is the container's configuration.
Config configs.Config `json:"config"`
}
// BaseContainer is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found. BaseContainer includes methods that are platform agnostic.
type BaseContainer interface {
// Returns the ID of the container
ID() string
// Returns the current status of the container.
//
// errors:
// ContainerNotExists - Container no longer exists,
// Systemerror - System error.
Status() (Status, error)
// State returns the current container's state information.
//
// errors:
// SystemError - System error.
State() (*State, error)
// Returns the current config of the container.
Config() configs.Config
// Returns the PIDs inside this container. The PIDs are in the namespace of the calling process.
//
// errors:
// ContainerNotExists - Container no longer exists,
// Systemerror - System error.
//
// Some of the returned PIDs may no longer refer to processes in the Container, unless
// the Container state is PAUSED in which case every PID in the slice is valid.
Processes() ([]int, error)
// Returns statistics for the container.
//
// errors:
// ContainerNotExists - Container no longer exists,
// Systemerror - System error.
Stats() (*Stats, error)
// Set resources of container as configured
//
// We can use this to change resources when containers are running.
//
// errors:
// SystemError - System error.
Set(config configs.Config) error
// Start a process inside the container. Returns error if process fails to
// start. You can track process lifecycle with passed Process structure.
//
// errors:
// ContainerNotExists - Container no longer exists,
// ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused,
// SystemError - System error.
Start(process *Process) (err error)
// Run immediately starts the process inside the container. Returns error if process
// fails to start. It does not block waiting for the exec fifo after start returns but
// opens the fifo after start returns.
//
// errors:
// ContainerNotExists - Container no longer exists,
// ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused,
// SystemError - System error.
Run(process *Process) (err error)
// Destroys the container, if its in a valid state, after killing any
// remaining running processes.
//
// Any event registrations are removed before the container is destroyed.
// No error is returned if the container is already destroyed.
//
// Running containers must first be stopped using Signal(..).
// Paused containers must first be resumed using Resume(..).
//
// errors:
// ContainerNotStopped - Container is still running,
// ContainerPaused - Container is paused,
// SystemError - System error.
Destroy() error
// Signal sends the provided signal code to the container's initial process.
//
// If all is specified the signal is sent to all processes in the container
// including the initial process.
//
// errors:
// SystemError - System error.
Signal(s os.Signal, all bool) error
// Exec signals the container to exec the users process at the end of the init.
//
// errors:
// SystemError - System error.
Exec() error
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,40 @@
package libcontainer
// cgroup restoring strategy provided by criu
type cgMode uint32
const (
CRIU_CG_MODE_SOFT cgMode = 3 + iota // restore cgroup properties if only dir created by criu
CRIU_CG_MODE_FULL // always restore all cgroups and their properties
CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system
CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT
)
type CriuPageServerInfo struct {
Address string // IP address of CRIU page server
Port int32 // port number of CRIU page server
}
type VethPairName struct {
ContainerInterfaceName string
HostInterfaceName string
}
type CriuOpts struct {
ImagesDirectory string // directory for storing image files
WorkDirectory string // directory to cd and write logs/pidfiles/stats to
ParentImage string // directory for storing parent image files in pre-dump and dump
LeaveRunning bool // leave container in running state after checkpoint
TcpEstablished bool // checkpoint/restore established TCP connections
ExternalUnixConnections bool // allow external unix connections
ShellJob bool // allow to dump and restore shell jobs
FileLocks bool // handle file locks, for safety
PreDump bool // call criu predump to perform iterative checkpoint
PageServer CriuPageServerInfo // allow to dump to criu page server
VethPairs []VethPairName // pass the veth to criu when restore
ManageCgroupsMode cgMode // dump or restore cgroup mode
EmptyNs uint32 // don't c/r properties for namespace from this mask
AutoDedup bool // auto deduplication for incremental dumps
LazyPages bool // restore memory pages lazily using userfaultfd
StatusFd string // fd for feedback when lazy server is ready
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,104 @@
package devices
import (
"errors"
"io/ioutil"
"os"
"path/filepath"
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
var (
ErrNotADevice = errors.New("not a device node")
)
// Testing dependencies
var (
unixLstat = unix.Lstat
ioutilReadDir = ioutil.ReadDir
)
// Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the information about a linux device and return that information as a Device struct.
func DeviceFromPath(path, permissions string) (*configs.Device, error) {
var stat unix.Stat_t
err := unixLstat(path, &stat)
if err != nil {
return nil, err
}
var (
devNumber = stat.Rdev
major = unix.Major(devNumber)
)
if major == 0 {
return nil, ErrNotADevice
}
var (
devType rune
mode = stat.Mode
)
switch {
case mode&unix.S_IFBLK == unix.S_IFBLK:
devType = 'b'
case mode&unix.S_IFCHR == unix.S_IFCHR:
devType = 'c'
}
return &configs.Device{
Type: devType,
Path: path,
Major: int64(major),
Minor: int64(unix.Minor(devNumber)),
Permissions: permissions,
FileMode: os.FileMode(mode),
Uid: stat.Uid,
Gid: stat.Gid,
}, nil
}
func HostDevices() ([]*configs.Device, error) {
return getDevices("/dev")
}
func getDevices(path string) ([]*configs.Device, error) {
files, err := ioutilReadDir(path)
if err != nil {
return nil, err
}
out := []*configs.Device{}
for _, f := range files {
switch {
case f.IsDir():
switch f.Name() {
// ".lxc" & ".lxd-mounts" added to address https://github.com/lxc/lxd/issues/2825
case "pts", "shm", "fd", "mqueue", ".lxc", ".lxd-mounts":
continue
default:
sub, err := getDevices(filepath.Join(path, f.Name()))
if err != nil {
return nil, err
}
out = append(out, sub...)
continue
}
case f.Name() == "console":
continue
}
device, err := DeviceFromPath(filepath.Join(path, f.Name()), "rwm")
if err != nil {
if err == ErrNotADevice {
continue
}
if os.IsNotExist(err) {
continue
}
return nil, err
}
out = append(out, device)
}
return out, nil
}

View File

@@ -0,0 +1,70 @@
package libcontainer
import "io"
// ErrorCode is the API error code type.
type ErrorCode int
// API error codes.
const (
// Factory errors
IdInUse ErrorCode = iota
InvalidIdFormat
// Container errors
ContainerNotExists
ContainerPaused
ContainerNotStopped
ContainerNotRunning
ContainerNotPaused
// Process errors
NoProcessOps
// Common errors
ConfigInvalid
ConsoleExists
SystemError
)
func (c ErrorCode) String() string {
switch c {
case IdInUse:
return "Id already in use"
case InvalidIdFormat:
return "Invalid format"
case ContainerPaused:
return "Container paused"
case ConfigInvalid:
return "Invalid configuration"
case SystemError:
return "System error"
case ContainerNotExists:
return "Container does not exist"
case ContainerNotStopped:
return "Container is not stopped"
case ContainerNotRunning:
return "Container is not running"
case ConsoleExists:
return "Console exists for process"
case ContainerNotPaused:
return "Container is not paused"
case NoProcessOps:
return "No process operations"
default:
return "Unknown error"
}
}
// Error is the API error type.
type Error interface {
error
// Returns an error if it failed to write the detail of the Error to w.
// The detail of the Error may include the error message and a
// representation of the stack trace.
Detail(w io.Writer) error
// Returns the error code for this error.
Code() ErrorCode
}

View File

@@ -0,0 +1,44 @@
package libcontainer
import (
"github.com/opencontainers/runc/libcontainer/configs"
)
type Factory interface {
// Creates a new container with the given id and starts the initial process inside it.
// id must be a string containing only letters, digits and underscores and must contain
// between 1 and 1024 characters, inclusive.
//
// The id must not already be in use by an existing container. Containers created using
// a factory with the same path (and filesystem) must have distinct ids.
//
// Returns the new container with a running process.
//
// errors:
// IdInUse - id is already in use by a container
// InvalidIdFormat - id has incorrect format
// ConfigInvalid - config is invalid
// Systemerror - System error
//
// On error, any partially created container parts are cleaned up (the operation is atomic).
Create(id string, config *configs.Config) (Container, error)
// Load takes an ID for an existing container and returns the container information
// from the state. This presents a read only view of the container.
//
// errors:
// Path does not exist
// System error
Load(id string) (Container, error)
// StartInitialization is an internal API to libcontainer used during the reexec of the
// container.
//
// Errors:
// Pipe connection error
// System error
StartInitialization() error
// Type returns info string about factory type (e.g. lxc, libcontainer...)
Type() string
}

View File

@@ -0,0 +1,364 @@
// +build linux
package libcontainer
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"regexp"
"runtime/debug"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/configs/validate"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/mount"
"github.com/opencontainers/runc/libcontainer/utils"
"golang.org/x/sys/unix"
)
const (
stateFilename = "state.json"
execFifoFilename = "exec.fifo"
)
var idRegex = regexp.MustCompile(`^[\w+-\.]+$`)
// InitArgs returns an options func to configure a LinuxFactory with the
// provided init binary path and arguments.
func InitArgs(args ...string) func(*LinuxFactory) error {
return func(l *LinuxFactory) (err error) {
if len(args) > 0 {
// Resolve relative paths to ensure that its available
// after directory changes.
if args[0], err = filepath.Abs(args[0]); err != nil {
return newGenericError(err, ConfigInvalid)
}
}
l.InitArgs = args
return nil
}
}
// SystemdCgroups is an options func to configure a LinuxFactory to return
// containers that use systemd to create and manage cgroups.
func SystemdCgroups(l *LinuxFactory) error {
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return &systemd.Manager{
Cgroups: config,
Paths: paths,
}
}
return nil
}
// Cgroupfs is an options func to configure a LinuxFactory to return
// containers that use the native cgroups filesystem implementation to
// create and manage cgroups.
func Cgroupfs(l *LinuxFactory) error {
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return &fs.Manager{
Cgroups: config,
Paths: paths,
}
}
return nil
}
// IntelRdtfs is an options func to configure a LinuxFactory to return
// containers that use the Intel RDT "resource control" filesystem to
// create and manage Intel Xeon platform shared resources (e.g., L3 cache).
func IntelRdtFs(l *LinuxFactory) error {
l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager {
return &intelrdt.IntelRdtManager{
Config: config,
Id: id,
Path: path,
}
}
return nil
}
// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs.
func TmpfsRoot(l *LinuxFactory) error {
mounted, err := mount.Mounted(l.Root)
if err != nil {
return err
}
if !mounted {
if err := unix.Mount("tmpfs", l.Root, "tmpfs", 0, ""); err != nil {
return err
}
}
return nil
}
// CriuPath returns an option func to configure a LinuxFactory with the
// provided criupath
func CriuPath(criupath string) func(*LinuxFactory) error {
return func(l *LinuxFactory) error {
l.CriuPath = criupath
return nil
}
}
// New returns a linux based container factory based in the root directory and
// configures the factory with the provided option funcs.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
if root != "" {
if err := os.MkdirAll(root, 0700); err != nil {
return nil, newGenericError(err, SystemError)
}
}
l := &LinuxFactory{
Root: root,
InitPath: "/proc/self/exe",
InitArgs: []string{os.Args[0], "init"},
Validator: validate.New(),
CriuPath: "criu",
}
Cgroupfs(l)
for _, opt := range options {
if opt == nil {
continue
}
if err := opt(l); err != nil {
return nil, err
}
}
return l, nil
}
// LinuxFactory implements the default factory interface for linux based systems.
type LinuxFactory struct {
// Root directory for the factory to store state.
Root string
// InitPath is the path for calling the init responsibilities for spawning
// a container.
InitPath string
// InitArgs are arguments for calling the init responsibilities for spawning
// a container.
InitArgs []string
// CriuPath is the path to the criu binary used for checkpoint and restore of
// containers.
CriuPath string
// New{u,g}uidmapPath is the path to the binaries used for mapping with
// rootless containers.
NewuidmapPath string
NewgidmapPath string
// Validator provides validation to container configurations.
Validator validate.Validator
// NewCgroupsManager returns an initialized cgroups manager for a single container.
NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager
// NewIntelRdtManager returns an initialized Intel RDT manager for a single container.
NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager
}
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
if l.Root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
}
if err := l.validateID(id); err != nil {
return nil, err
}
if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid)
}
containerRoot := filepath.Join(l.Root, id)
if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
} else if !os.IsNotExist(err) {
return nil, newGenericError(err, SystemError)
}
if err := os.MkdirAll(containerRoot, 0711); err != nil {
return nil, newGenericError(err, SystemError)
}
if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {
return nil, newGenericError(err, SystemError)
}
c := &linuxContainer{
id: id,
root: containerRoot,
config: config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
newuidmapPath: l.NewuidmapPath,
newgidmapPath: l.NewgidmapPath,
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
}
if intelrdt.IsEnabled() {
c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
}
c.state = &stoppedState{c: c}
return c, nil
}
func (l *LinuxFactory) Load(id string) (Container, error) {
if l.Root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
}
containerRoot := filepath.Join(l.Root, id)
state, err := l.loadState(containerRoot, id)
if err != nil {
return nil, err
}
r := &nonChildProcess{
processPid: state.InitProcessPid,
processStartTime: state.InitProcessStartTime,
fds: state.ExternalDescriptors,
}
c := &linuxContainer{
initProcess: r,
initProcessStartTime: state.InitProcessStartTime,
id: id,
config: &state.Config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
newuidmapPath: l.NewuidmapPath,
newgidmapPath: l.NewgidmapPath,
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
root: containerRoot,
created: state.Created,
}
c.state = &loadedState{c: c}
if err := c.refreshState(); err != nil {
return nil, err
}
if intelrdt.IsEnabled() {
c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath)
}
return c, nil
}
func (l *LinuxFactory) Type() string {
return "libcontainer"
}
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) {
var (
pipefd, fifofd int
consoleSocket *os.File
envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE")
envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD")
envConsole = os.Getenv("_LIBCONTAINER_CONSOLE")
)
// Get the INITPIPE.
pipefd, err = strconv.Atoi(envInitPipe)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
}
var (
pipe = os.NewFile(uintptr(pipefd), "pipe")
it = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
)
defer pipe.Close()
// Only init processes have FIFOFD.
fifofd = -1
if it == initStandard {
if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err)
}
}
if envConsole != "" {
console, err := strconv.Atoi(envConsole)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err)
}
consoleSocket = os.NewFile(uintptr(console), "console-socket")
defer consoleSocket.Close()
}
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
defer func() {
// We have an error during the initialization of the container's init,
// send it back to the parent process in the form of an initError.
if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
fmt.Fprintln(os.Stderr, err)
return
}
if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
fmt.Fprintln(os.Stderr, err)
return
}
}()
defer func() {
if e := recover(); e != nil {
err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
}
}()
i, err := newContainerInit(it, pipe, consoleSocket, fifofd)
if err != nil {
return err
}
// If Init succeeds, syscall.Exec will not return, hence none of the defers will be called.
return i.Init()
}
func (l *LinuxFactory) loadState(root, id string) (*State, error) {
f, err := os.Open(filepath.Join(root, stateFilename))
if err != nil {
if os.IsNotExist(err) {
return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists)
}
return nil, newGenericError(err, SystemError)
}
defer f.Close()
var state *State
if err := json.NewDecoder(f).Decode(&state); err != nil {
return nil, newGenericError(err, SystemError)
}
return state, nil
}
func (l *LinuxFactory) validateID(id string) error {
if !idRegex.MatchString(id) {
return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat)
}
return nil
}
// NewuidmapPath returns an option func to configure a LinuxFactory with the
// provided ..
func NewuidmapPath(newuidmapPath string) func(*LinuxFactory) error {
return func(l *LinuxFactory) error {
l.NewuidmapPath = newuidmapPath
return nil
}
}
// NewgidmapPath returns an option func to configure a LinuxFactory with the
// provided ..
func NewgidmapPath(newgidmapPath string) func(*LinuxFactory) error {
return func(l *LinuxFactory) error {
l.NewgidmapPath = newgidmapPath
return nil
}
}

View File

@@ -0,0 +1,92 @@
package libcontainer
import (
"fmt"
"io"
"text/template"
"time"
"github.com/opencontainers/runc/libcontainer/stacktrace"
)
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
Code: {{.ECode}}
{{if .Message }}
Message: {{.Message}}
{{end}}
Frames:{{range $i, $frame := .Stack.Frames}}
---
{{$i}}: {{$frame.Function}}
Package: {{$frame.Package}}
File: {{$frame.File}}@{{$frame.Line}}{{end}}
`))
func newGenericError(err error, c ErrorCode) Error {
if le, ok := err.(Error); ok {
return le
}
gerr := &genericError{
Timestamp: time.Now(),
Err: err,
ECode: c,
Stack: stacktrace.Capture(1),
}
if err != nil {
gerr.Message = err.Error()
}
return gerr
}
func newSystemError(err error) Error {
return createSystemError(err, "")
}
func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error {
return createSystemError(err, fmt.Sprintf(cause, v...))
}
func newSystemErrorWithCause(err error, cause string) Error {
return createSystemError(err, cause)
}
// createSystemError creates the specified error with the correct number of
// stack frames skipped. This is only to be called by the other functions for
// formatting the error.
func createSystemError(err error, cause string) Error {
gerr := &genericError{
Timestamp: time.Now(),
Err: err,
ECode: SystemError,
Cause: cause,
Stack: stacktrace.Capture(2),
}
if err != nil {
gerr.Message = err.Error()
}
return gerr
}
type genericError struct {
Timestamp time.Time
ECode ErrorCode
Err error `json:"-"`
Cause string
Message string
Stack stacktrace.Stacktrace
}
func (e *genericError) Error() string {
if e.Cause == "" {
return e.Message
}
frame := e.Stack.Frames[0]
return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message)
}
func (e *genericError) Code() ErrorCode {
return e.ECode
}
func (e *genericError) Detail(w io.Writer) error {
return errorTemplate.Execute(w, e)
}

View File

@@ -0,0 +1,534 @@
// +build linux
package libcontainer
import (
"encoding/json"
"fmt"
"io"
"net"
"os"
"strings"
"syscall" // only for Errno
"unsafe"
"golang.org/x/sys/unix"
"github.com/containerd/console"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/user"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
)
type initType string
const (
initSetns initType = "setns"
initStandard initType = "standard"
)
type pid struct {
Pid int `json:"pid"`
PidFirstChild int `json:"pid_first"`
}
// network is an internal struct used to setup container networks.
type network struct {
configs.Network
// TempVethPeerName is a unique temporary veth peer name that was placed into
// the container's namespace.
TempVethPeerName string `json:"temp_veth_peer_name"`
}
// initConfig is used for transferring parameters from Exec() to Init()
type initConfig struct {
Args []string `json:"args"`
Env []string `json:"env"`
Cwd string `json:"cwd"`
Capabilities *configs.Capabilities `json:"capabilities"`
ProcessLabel string `json:"process_label"`
AppArmorProfile string `json:"apparmor_profile"`
NoNewPrivileges bool `json:"no_new_privileges"`
User string `json:"user"`
AdditionalGroups []string `json:"additional_groups"`
Config *configs.Config `json:"config"`
Networks []*network `json:"network"`
PassedFilesCount int `json:"passed_files_count"`
ContainerId string `json:"containerid"`
Rlimits []configs.Rlimit `json:"rlimits"`
CreateConsole bool `json:"create_console"`
ConsoleWidth uint16 `json:"console_width"`
ConsoleHeight uint16 `json:"console_height"`
Rootless bool `json:"rootless"`
}
type initer interface {
Init() error
}
func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err
}
if err := populateProcessEnvironment(config.Env); err != nil {
return nil, err
}
switch t {
case initSetns:
return &linuxSetnsInit{
pipe: pipe,
consoleSocket: consoleSocket,
config: config,
}, nil
case initStandard:
return &linuxStandardInit{
pipe: pipe,
consoleSocket: consoleSocket,
parentPid: unix.Getppid(),
config: config,
fifoFd: fifoFd,
}, nil
}
return nil, fmt.Errorf("unknown init type %q", t)
}
// populateProcessEnvironment loads the provided environment variables into the
// current processes's environment.
func populateProcessEnvironment(env []string) error {
for _, pair := range env {
p := strings.SplitN(pair, "=", 2)
if len(p) < 2 {
return fmt.Errorf("invalid environment '%v'", pair)
}
if err := os.Setenv(p[0], p[1]); err != nil {
return err
}
}
return nil
}
// finalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaked file descriptors
// before executing the command inside the namespace
func finalizeNamespace(config *initConfig) error {
// Ensure that all unwanted fds we may have accidentally
// inherited are marked close-on-exec so they stay out of the
// container
if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
return err
}
capabilities := &configs.Capabilities{}
if config.Capabilities != nil {
capabilities = config.Capabilities
} else if config.Config.Capabilities != nil {
capabilities = config.Config.Capabilities
}
w, err := newContainerCapList(capabilities)
if err != nil {
return err
}
// drop capabilities in bounding set before changing user
if err := w.ApplyBoundingSet(); err != nil {
return err
}
// preserve existing capabilities while we change users
if err := system.SetKeepCaps(); err != nil {
return err
}
if err := setupUser(config); err != nil {
return err
}
if err := system.ClearKeepCaps(); err != nil {
return err
}
if err := w.ApplyCaps(); err != nil {
return err
}
if config.Cwd != "" {
if err := unix.Chdir(config.Cwd); err != nil {
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err)
}
}
return nil
}
// setupConsole sets up the console from inside the container, and sends the
// master pty fd to the config.Pipe (using cmsg). This is done to ensure that
// consoles are scoped to a container properly (see runc#814 and the many
// issues related to that). This has to be run *after* we've pivoted to the new
// rootfs (and the users' configuration is entirely set up).
func setupConsole(socket *os.File, config *initConfig, mount bool) error {
defer socket.Close()
// At this point, /dev/ptmx points to something that we would expect. We
// used to change the owner of the slave path, but since the /dev/pts mount
// can have gid=X set (at the users' option). So touching the owner of the
// slave PTY is not necessary, as the kernel will handle that for us. Note
// however, that setupUser (specifically fixStdioPermissions) *will* change
// the UID owner of the console to be the user the process will run as (so
// they can actually control their console).
pty, slavePath, err := console.NewPty()
if err != nil {
return err
}
if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 {
err = pty.Resize(console.WinSize{
Height: config.ConsoleHeight,
Width: config.ConsoleWidth,
})
if err != nil {
return err
}
}
// After we return from here, we don't need the console anymore.
defer pty.Close()
// Mount the console inside our rootfs.
if mount {
if err := mountConsole(slavePath); err != nil {
return err
}
}
// While we can access console.master, using the API is a good idea.
if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil {
return err
}
// Now, dup over all the things.
return dupStdio(slavePath)
}
// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe io.ReadWriter) error {
// Tell parent.
if err := writeSync(pipe, procReady); err != nil {
return err
}
// Wait for parent to give the all-clear.
if err := readSync(pipe, procRun); err != nil {
return err
}
return nil
}
// syncParentHooks sends to the given pipe a JSON payload which indicates that
// the parent should execute pre-start hooks. It then waits for the parent to
// indicate that it is cleared to resume.
func syncParentHooks(pipe io.ReadWriter) error {
// Tell parent.
if err := writeSync(pipe, procHooks); err != nil {
return err
}
// Wait for parent to give the all-clear.
if err := readSync(pipe, procResume); err != nil {
return err
}
return nil
}
// setupUser changes the groups, gid, and uid for the user inside the container
func setupUser(config *initConfig) error {
// Set up defaults.
defaultExecUser := user.ExecUser{
Uid: 0,
Gid: 0,
Home: "/",
}
passwdPath, err := user.GetPasswdPath()
if err != nil {
return err
}
groupPath, err := user.GetGroupPath()
if err != nil {
return err
}
execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
if err != nil {
return err
}
var addGroups []int
if len(config.AdditionalGroups) > 0 {
addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
if err != nil {
return err
}
}
// Rather than just erroring out later in setuid(2) and setgid(2), check
// that the user is mapped here.
if _, err := config.Config.HostUID(execUser.Uid); err != nil {
return fmt.Errorf("cannot set uid to unmapped user in user namespace")
}
if _, err := config.Config.HostGID(execUser.Gid); err != nil {
return fmt.Errorf("cannot set gid to unmapped user in user namespace")
}
if config.Rootless {
// We cannot set any additional groups in a rootless container and thus
// we bail if the user asked us to do so. TODO: We currently can't do
// this check earlier, but if libcontainer.Process.User was typesafe
// this might work.
if len(addGroups) > 0 {
return fmt.Errorf("cannot set any additional groups in a rootless container")
}
}
// Before we change to the container's user make sure that the processes
// STDIO is correctly owned by the user that we are switching to.
if err := fixStdioPermissions(config, execUser); err != nil {
return err
}
// This isn't allowed in an unprivileged user namespace since Linux 3.19.
// There's nothing we can do about /etc/group entries, so we silently
// ignore setting groups here (since the user didn't explicitly ask us to
// set the group).
if !config.Rootless {
suppGroups := append(execUser.Sgids, addGroups...)
if err := unix.Setgroups(suppGroups); err != nil {
return err
}
}
if err := system.Setgid(execUser.Gid); err != nil {
return err
}
if err := system.Setuid(execUser.Uid); err != nil {
return err
}
// if we didn't get HOME already, set it based on the user's HOME
if envHome := os.Getenv("HOME"); envHome == "" {
if err := os.Setenv("HOME", execUser.Home); err != nil {
return err
}
}
return nil
}
// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
// The ownership needs to match because it is created outside of the container and needs to be
// localized.
func fixStdioPermissions(config *initConfig, u *user.ExecUser) error {
var null unix.Stat_t
if err := unix.Stat("/dev/null", &null); err != nil {
return err
}
for _, fd := range []uintptr{
os.Stdin.Fd(),
os.Stderr.Fd(),
os.Stdout.Fd(),
} {
var s unix.Stat_t
if err := unix.Fstat(int(fd), &s); err != nil {
return err
}
// Skip chown of /dev/null if it was used as one of the STDIO fds.
if s.Rdev == null.Rdev {
continue
}
// We only change the uid owner (as it is possible for the mount to
// prefer a different gid, and there's no reason for us to change it).
// The reason why we don't just leave the default uid=X mount setup is
// that users expect to be able to actually use their console. Without
// this code, you couldn't effectively run as a non-root user inside a
// container and also have a console set up.
if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil {
// If we've hit an EINVAL then s.Gid isn't mapped in the user
// namespace. If we've hit an EPERM then the inode's current owner
// is not mapped in our user namespace (in particular,
// privileged_wrt_inode_uidgid() has failed). In either case, we
// are in a configuration where it's better for us to just not
// touch the stdio rather than bail at this point.
if err == unix.EINVAL || err == unix.EPERM {
continue
}
return err
}
}
return nil
}
// setupNetwork sets up and initializes any network interface inside the container.
func setupNetwork(config *initConfig) error {
for _, config := range config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
if err := strategy.initialize(config); err != nil {
return err
}
}
return nil
}
func setupRoute(config *configs.Config) error {
for _, config := range config.Routes {
_, dst, err := net.ParseCIDR(config.Destination)
if err != nil {
return err
}
src := net.ParseIP(config.Source)
if src == nil {
return fmt.Errorf("Invalid source for route: %s", config.Source)
}
gw := net.ParseIP(config.Gateway)
if gw == nil {
return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
}
l, err := netlink.LinkByName(config.InterfaceName)
if err != nil {
return err
}
route := &netlink.Route{
Scope: netlink.SCOPE_UNIVERSE,
Dst: dst,
Src: src,
Gw: gw,
LinkIndex: l.Attrs().Index,
}
if err := netlink.RouteAdd(route); err != nil {
return err
}
}
return nil
}
func setupRlimits(limits []configs.Rlimit, pid int) error {
for _, rlimit := range limits {
if err := system.Prlimit(pid, rlimit.Type, unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil {
return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
}
}
return nil
}
const _P_PID = 1
type siginfo struct {
si_signo int32
si_errno int32
si_code int32
// below here is a union; si_pid is the only field we use
si_pid int32
// Pad to 128 bytes as detailed in blockUntilWaitable
pad [96]byte
}
// isWaitable returns true if the process has exited false otherwise.
// Its based off blockUntilWaitable in src/os/wait_waitid.go
func isWaitable(pid int) (bool, error) {
si := &siginfo{}
_, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0)
if e != 0 {
return false, os.NewSyscallError("waitid", e)
}
return si.si_pid != 0, nil
}
// isNoChildren returns true if err represents a unix.ECHILD (formerly syscall.ECHILD) false otherwise
func isNoChildren(err error) bool {
switch err := err.(type) {
case syscall.Errno:
if err == unix.ECHILD {
return true
}
case *os.SyscallError:
if err.Err == unix.ECHILD {
return true
}
}
return false
}
// signalAllProcesses freezes then iterates over all the processes inside the
// manager's cgroups sending the signal s to them.
// If s is SIGKILL then it will wait for each process to exit.
// For all other signals it will check if the process is ready to report its
// exit status and only if it is will a wait be performed.
func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
var procs []*os.Process
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Warn(err)
}
pids, err := m.GetAllPids()
if err != nil {
m.Freeze(configs.Thawed)
return err
}
for _, pid := range pids {
p, err := os.FindProcess(pid)
if err != nil {
logrus.Warn(err)
continue
}
procs = append(procs, p)
if err := p.Signal(s); err != nil {
logrus.Warn(err)
}
}
if err := m.Freeze(configs.Thawed); err != nil {
logrus.Warn(err)
}
subreaper, err := system.GetSubreaper()
if err != nil {
// The error here means that PR_GET_CHILD_SUBREAPER is not
// supported because this code might run on a kernel older
// than 3.4. We don't want to throw an error in that case,
// and we simplify things, considering there is no subreaper
// set.
subreaper = 0
}
for _, p := range procs {
if s != unix.SIGKILL {
if ok, err := isWaitable(p.Pid); err != nil {
if !isNoChildren(err) {
logrus.Warn("signalAllProcesses: ", p.Pid, err)
}
continue
} else if !ok {
// Not ready to report so don't wait
continue
}
}
// In case a subreaper has been setup, this code must not
// wait for the process. Otherwise, we cannot be sure the
// current process will be reaped by the subreaper, while
// the subreaper might be waiting for this process in order
// to retrieve its exit code.
if subreaper == 0 {
if _, err := p.Wait(); err != nil {
if !isNoChildren(err) {
logrus.Warn("wait: ", err)
}
}
}
}
return nil
}

View File

@@ -0,0 +1,2 @@
// integration is used for integration testing of libcontainer
package integration

View File

@@ -0,0 +1,553 @@
// +build linux
package intelrdt
import (
"bufio"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"github.com/opencontainers/runc/libcontainer/configs"
)
/*
* About Intel RDT/CAT feature:
* Intel platforms with new Xeon CPU support Resource Director Technology (RDT).
* Intel Cache Allocation Technology (CAT) is a sub-feature of RDT. Currently L3
* Cache is the only resource that is supported in RDT.
*
* This feature provides a way for the software to restrict cache allocation to a
* defined 'subset' of L3 cache which may be overlapping with other 'subsets'.
* The different subsets are identified by class of service (CLOS) and each CLOS
* has a capacity bitmask (CBM).
*
* For more information about Intel RDT/CAT can be found in the section 17.17
* of Intel Software Developer Manual.
*
* About Intel RDT/CAT kernel interface:
* In Linux 4.10 kernel or newer, the interface is defined and exposed via
* "resource control" filesystem, which is a "cgroup-like" interface.
*
* Comparing with cgroups, it has similar process management lifecycle and
* interfaces in a container. But unlike cgroups' hierarchy, it has single level
* filesystem layout.
*
* Intel RDT "resource control" filesystem hierarchy:
* mount -t resctrl resctrl /sys/fs/resctrl
* tree /sys/fs/resctrl
* /sys/fs/resctrl/
* |-- info
* | |-- L3
* | |-- cbm_mask
* | |-- min_cbm_bits
* | |-- num_closids
* |-- cpus
* |-- schemata
* |-- tasks
* |-- <container_id>
* |-- cpus
* |-- schemata
* |-- tasks
*
* For runc, we can make use of `tasks` and `schemata` configuration for L3 cache
* resource constraints.
*
* The file `tasks` has a list of tasks that belongs to this group (e.g.,
* <container_id>" group). Tasks can be added to a group by writing the task ID
* to the "tasks" file (which will automatically remove them from the previous
* group to which they belonged). New tasks created by fork(2) and clone(2) are
* added to the same group as their parent. If a pid is not in any sub group, it is
* in root group.
*
* The file `schemata` has allocation bitmasks/values for L3 cache on each socket,
* which contains L3 cache id and capacity bitmask (CBM).
* Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
* For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`
* which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
*
* The valid L3 cache CBM is a *contiguous bits set* and number of bits that can
* be set is less than the max bit. The max bits in the CBM is varied among
* supported Intel Xeon platforms. In Intel RDT "resource control" filesystem
* layout, the CBM in a group should be a subset of the CBM in root. Kernel will
* check if it is valid when writing. e.g., 0xfffff in root indicates the max bits
* of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM
* values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
*
* For more information about Intel RDT/CAT kernel interface:
* https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt
*
* An example for runc:
* Consider a two-socket machine with two L3 caches where the default CBM is
* 0xfffff and the max CBM length is 20 bits. With this configuration, tasks
* inside the container only have access to the "upper" 80% of L3 cache id 0 and
* the "lower" 50% L3 cache id 1:
*
* "linux": {
* "intelRdt": {
* "l3CacheSchema": "L3:0=ffff0;1=3ff"
* }
* }
*/
type Manager interface {
// Applies Intel RDT configuration to the process with the specified pid
Apply(pid int) error
// Returns statistics for Intel RDT
GetStats() (*Stats, error)
// Destroys the Intel RDT 'container_id' group
Destroy() error
// Returns Intel RDT path to save in a state file and to be able to
// restore the object later
GetPath() string
// Set Intel RDT "resource control" filesystem as configured.
Set(container *configs.Config) error
}
// This implements interface Manager
type IntelRdtManager struct {
mu sync.Mutex
Config *configs.Config
Id string
Path string
}
const (
IntelRdtTasks = "tasks"
)
var (
// The absolute root path of the Intel RDT "resource control" filesystem
intelRdtRoot string
intelRdtRootLock sync.Mutex
// The flag to indicate if Intel RDT is supported
isEnabled bool
)
type intelRdtData struct {
root string
config *configs.Config
pid int
}
// Check if Intel RDT is enabled in init()
func init() {
// 1. Check if hardware and kernel support Intel RDT/CAT feature
// "cat_l3" flag is set if supported
isFlagSet, err := parseCpuInfoFile("/proc/cpuinfo")
if !isFlagSet || err != nil {
isEnabled = false
return
}
// 2. Check if Intel RDT "resource control" filesystem is mounted
// The user guarantees to mount the filesystem
isEnabled = isIntelRdtMounted()
}
// Return the mount point path of Intel RDT "resource control" filesysem
func findIntelRdtMountpointDir() (string, error) {
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", err
}
defer f.Close()
s := bufio.NewScanner(f)
for s.Scan() {
text := s.Text()
fields := strings.Split(text, " ")
// Safe as mountinfo encodes mountpoints with spaces as \040.
index := strings.Index(text, " - ")
postSeparatorFields := strings.Fields(text[index+3:])
numPostFields := len(postSeparatorFields)
// This is an error as we can't detect if the mount is for "Intel RDT"
if numPostFields == 0 {
return "", fmt.Errorf("Found no fields post '-' in %q", text)
}
if postSeparatorFields[0] == "resctrl" {
// Check that the mount is properly formated.
if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
}
return fields[4], nil
}
}
if err := s.Err(); err != nil {
return "", err
}
return "", NewNotFoundError("Intel RDT")
}
// Gets the root path of Intel RDT "resource control" filesystem
func getIntelRdtRoot() (string, error) {
intelRdtRootLock.Lock()
defer intelRdtRootLock.Unlock()
if intelRdtRoot != "" {
return intelRdtRoot, nil
}
root, err := findIntelRdtMountpointDir()
if err != nil {
return "", err
}
if _, err := os.Stat(root); err != nil {
return "", err
}
intelRdtRoot = root
return intelRdtRoot, nil
}
func isIntelRdtMounted() bool {
_, err := getIntelRdtRoot()
if err != nil {
return false
}
return true
}
func parseCpuInfoFile(path string) (bool, error) {
f, err := os.Open(path)
if err != nil {
return false, err
}
defer f.Close()
s := bufio.NewScanner(f)
for s.Scan() {
if err := s.Err(); err != nil {
return false, err
}
text := s.Text()
flags := strings.Split(text, " ")
// "cat_l3" flag is set if Intel RDT/CAT is supported
for _, flag := range flags {
if flag == "cat_l3" {
return true, nil
}
}
}
return false, nil
}
func parseUint(s string, base, bitSize int) (uint64, error) {
value, err := strconv.ParseUint(s, base, bitSize)
if err != nil {
intValue, intErr := strconv.ParseInt(s, base, bitSize)
// 1. Handle negative values greater than MinInt64 (and)
// 2. Handle negative values lesser than MinInt64
if intErr == nil && intValue < 0 {
return 0, nil
} else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 {
return 0, nil
}
return value, err
}
return value, nil
}
// Gets a single uint64 value from the specified file.
func getIntelRdtParamUint(path, file string) (uint64, error) {
fileName := filepath.Join(path, file)
contents, err := ioutil.ReadFile(fileName)
if err != nil {
return 0, err
}
res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64)
if err != nil {
return res, fmt.Errorf("unable to parse %q as a uint from file %q", string(contents), fileName)
}
return res, nil
}
// Gets a string value from the specified file
func getIntelRdtParamString(path, file string) (string, error) {
contents, err := ioutil.ReadFile(filepath.Join(path, file))
if err != nil {
return "", err
}
return strings.TrimSpace(string(contents)), nil
}
func readTasksFile(dir string) ([]int, error) {
f, err := os.Open(filepath.Join(dir, IntelRdtTasks))
if err != nil {
return nil, err
}
defer f.Close()
var (
s = bufio.NewScanner(f)
out = []int{}
)
for s.Scan() {
if t := s.Text(); t != "" {
pid, err := strconv.Atoi(t)
if err != nil {
return nil, err
}
out = append(out, pid)
}
}
return out, nil
}
func writeFile(dir, file, data string) error {
if dir == "" {
return fmt.Errorf("no such directory for %s", file)
}
if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data+"\n"), 0700); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", data, file, err)
}
return nil
}
func getIntelRdtData(c *configs.Config, pid int) (*intelRdtData, error) {
rootPath, err := getIntelRdtRoot()
if err != nil {
return nil, err
}
return &intelRdtData{
root: rootPath,
config: c,
pid: pid,
}, nil
}
// Get the read-only L3 cache information
func getL3CacheInfo() (*L3CacheInfo, error) {
l3CacheInfo := &L3CacheInfo{}
rootPath, err := getIntelRdtRoot()
if err != nil {
return l3CacheInfo, err
}
path := filepath.Join(rootPath, "info", "L3")
cbmMask, err := getIntelRdtParamString(path, "cbm_mask")
if err != nil {
return l3CacheInfo, err
}
minCbmBits, err := getIntelRdtParamUint(path, "min_cbm_bits")
if err != nil {
return l3CacheInfo, err
}
numClosids, err := getIntelRdtParamUint(path, "num_closids")
if err != nil {
return l3CacheInfo, err
}
l3CacheInfo.CbmMask = cbmMask
l3CacheInfo.MinCbmBits = minCbmBits
l3CacheInfo.NumClosids = numClosids
return l3CacheInfo, nil
}
// WriteIntelRdtTasks writes the specified pid into the "tasks" file
func WriteIntelRdtTasks(dir string, pid int) error {
if dir == "" {
return fmt.Errorf("no such directory for %s", IntelRdtTasks)
}
// Dont attach any pid if -1 is specified as a pid
if pid != -1 {
if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0700); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err)
}
}
return nil
}
// Check if Intel RDT is enabled
func IsEnabled() bool {
return isEnabled
}
// Get the 'container_id' path in Intel RDT "resource control" filesystem
func GetIntelRdtPath(id string) (string, error) {
rootPath, err := getIntelRdtRoot()
if err != nil {
return "", err
}
path := filepath.Join(rootPath, id)
return path, nil
}
// Applies Intel RDT configuration to the process with the specified pid
func (m *IntelRdtManager) Apply(pid int) (err error) {
// If intelRdt is not specified in config, we do nothing
if m.Config.IntelRdt == nil {
return nil
}
d, err := getIntelRdtData(m.Config, pid)
if err != nil && !IsNotFound(err) {
return err
}
m.mu.Lock()
defer m.mu.Unlock()
path, err := d.join(m.Id)
if err != nil {
return err
}
m.Path = path
return nil
}
// Destroys the Intel RDT 'container_id' group
func (m *IntelRdtManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
if err := os.RemoveAll(m.Path); err != nil {
return err
}
m.Path = ""
return nil
}
// Returns Intel RDT path to save in a state file and to be able to
// restore the object later
func (m *IntelRdtManager) GetPath() string {
if m.Path == "" {
m.Path, _ = GetIntelRdtPath(m.Id)
}
return m.Path
}
// Returns statistics for Intel RDT
func (m *IntelRdtManager) GetStats() (*Stats, error) {
// If intelRdt is not specified in config
if m.Config.IntelRdt == nil {
return nil, nil
}
m.mu.Lock()
defer m.mu.Unlock()
stats := NewStats()
// The read-only L3 cache information
l3CacheInfo, err := getL3CacheInfo()
if err != nil {
return nil, err
}
stats.L3CacheInfo = l3CacheInfo
// The read-only L3 cache schema in root
rootPath, err := getIntelRdtRoot()
if err != nil {
return nil, err
}
tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata")
if err != nil {
return nil, err
}
// L3 cache schema is in the first line
schemaRootStrings := strings.Split(tmpRootStrings, "\n")
stats.L3CacheSchemaRoot = schemaRootStrings[0]
// The L3 cache schema in 'container_id' group
tmpStrings, err := getIntelRdtParamString(m.GetPath(), "schemata")
if err != nil {
return nil, err
}
// L3 cache schema is in the first line
schemaStrings := strings.Split(tmpStrings, "\n")
stats.L3CacheSchema = schemaStrings[0]
return stats, nil
}
// Set Intel RDT "resource control" filesystem as configured.
func (m *IntelRdtManager) Set(container *configs.Config) error {
path := m.GetPath()
// About L3 cache schema file:
// The schema has allocation masks/values for L3 cache on each socket,
// which contains L3 cache id and capacity bitmask (CBM).
// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
// For example, on a two-socket machine, L3's schema line could be:
// L3:0=ff;1=c0
// Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0.
//
// About L3 cache CBM validity:
// The valid L3 cache CBM is a *contiguous bits set* and number of
// bits that can be set is less than the max bit. The max bits in the
// CBM is varied among supported Intel Xeon platforms. In Intel RDT
// "resource control" filesystem layout, the CBM in a group should
// be a subset of the CBM in root. Kernel will check if it is valid
// when writing.
// e.g., 0xfffff in root indicates the max bits of CBM is 20 bits,
// which mapping to entire L3 cache capacity. Some valid CBM values
// to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc.
if container.IntelRdt != nil {
l3CacheSchema := container.IntelRdt.L3CacheSchema
if l3CacheSchema != "" {
if err := writeFile(path, "schemata", l3CacheSchema); err != nil {
return err
}
}
}
return nil
}
func (raw *intelRdtData) join(id string) (string, error) {
path := filepath.Join(raw.root, id)
if err := os.MkdirAll(path, 0755); err != nil {
return "", err
}
if err := WriteIntelRdtTasks(path, raw.pid); err != nil {
return "", err
}
return path, nil
}
type NotFoundError struct {
ResourceControl string
}
func (e *NotFoundError) Error() string {
return fmt.Sprintf("mountpoint for %s not found", e.ResourceControl)
}
func NewNotFoundError(res string) error {
return &NotFoundError{
ResourceControl: res,
}
}
func IsNotFound(err error) bool {
if err == nil {
return false
}
_, ok := err.(*NotFoundError)
return ok
}

View File

@@ -0,0 +1,24 @@
// +build linux
package intelrdt
type L3CacheInfo struct {
CbmMask string `json:"cbm_mask,omitempty"`
MinCbmBits uint64 `json:"min_cbm_bits,omitempty"`
NumClosids uint64 `json:"num_closids,omitempty"`
}
type Stats struct {
// The read-only L3 cache information
L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"`
// The read-only L3 cache schema in root
L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"`
// The L3 cache schema in 'container_id' group
L3CacheSchema string `json:"l3_cache_schema,omitempty"`
}
func NewStats() *Stats {
return &Stats{}
}

View File

@@ -0,0 +1,50 @@
// +build linux
package keys
import (
"fmt"
"strconv"
"strings"
"golang.org/x/sys/unix"
)
type KeySerial uint32
func JoinSessionKeyring(name string) (KeySerial, error) {
sessKeyId, err := unix.KeyctlJoinSessionKeyring(name)
if err != nil {
return 0, fmt.Errorf("could not create session key: %v", err)
}
return KeySerial(sessKeyId), nil
}
// ModKeyringPerm modifies permissions on a keyring by reading the current permissions,
// anding the bits with the given mask (clearing permissions) and setting
// additional permission bits
func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
dest, err := unix.KeyctlString(unix.KEYCTL_DESCRIBE, int(ringId))
if err != nil {
return err
}
res := strings.Split(dest, ";")
if len(res) < 5 {
return fmt.Errorf("Destination buffer for key description is too small")
}
// parse permissions
perm64, err := strconv.ParseUint(res[3], 16, 32)
if err != nil {
return err
}
perm := (uint32(perm64) & mask) | setbits
if err := unix.KeyctlSetperm(int(ringId), perm); err != nil {
return err
}
return nil
}

View File

@@ -0,0 +1,89 @@
// +build linux
package libcontainer
import (
"github.com/vishvananda/netlink/nl"
"golang.org/x/sys/unix"
)
// list of known message types we want to send to bootstrap program
// The number is randomly chosen to not conflict with known netlink types
const (
InitMsg uint16 = 62000
CloneFlagsAttr uint16 = 27281
NsPathsAttr uint16 = 27282
UidmapAttr uint16 = 27283
GidmapAttr uint16 = 27284
SetgroupAttr uint16 = 27285
OomScoreAdjAttr uint16 = 27286
RootlessAttr uint16 = 27287
UidmapPathAttr uint16 = 27288
GidmapPathAttr uint16 = 27289
)
type Int32msg struct {
Type uint16
Value uint32
}
// Serialize serializes the message.
// Int32msg has the following representation
// | nlattr len | nlattr type |
// | uint32 value |
func (msg *Int32msg) Serialize() []byte {
buf := make([]byte, msg.Len())
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(msg.Len()))
native.PutUint16(buf[2:4], msg.Type)
native.PutUint32(buf[4:8], msg.Value)
return buf
}
func (msg *Int32msg) Len() int {
return unix.NLA_HDRLEN + 4
}
// Bytemsg has the following representation
// | nlattr len | nlattr type |
// | value | pad |
type Bytemsg struct {
Type uint16
Value []byte
}
func (msg *Bytemsg) Serialize() []byte {
l := msg.Len()
buf := make([]byte, (l+unix.NLA_ALIGNTO-1) & ^(unix.NLA_ALIGNTO-1))
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(l))
native.PutUint16(buf[2:4], msg.Type)
copy(buf[4:], msg.Value)
return buf
}
func (msg *Bytemsg) Len() int {
return unix.NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
}
type Boolmsg struct {
Type uint16
Value bool
}
func (msg *Boolmsg) Serialize() []byte {
buf := make([]byte, msg.Len())
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(msg.Len()))
native.PutUint16(buf[2:4], msg.Type)
if msg.Value {
buf[4] = 1
} else {
buf[4] = 0
}
return buf
}
func (msg *Boolmsg) Len() int {
return unix.NLA_HDRLEN + 1
}

View File

@@ -0,0 +1,23 @@
package mount
// GetMounts retrieves a list of mounts for the current running process.
func GetMounts() ([]*Info, error) {
return parseMountTable()
}
// Mounted looks at /proc/self/mountinfo to determine of the specified
// mountpoint has been mounted
func Mounted(mountpoint string) (bool, error) {
entries, err := parseMountTable()
if err != nil {
return false, err
}
// Search the table for the mountpoint
for _, e := range entries {
if e.Mountpoint == mountpoint {
return true, nil
}
}
return false, nil
}

View File

@@ -0,0 +1,82 @@
// +build linux
package mount
import (
"bufio"
"fmt"
"io"
"os"
"strings"
)
const (
/* 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
(1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11)
(1) mount ID: unique identifier of the mount (may be reused after umount)
(2) parent ID: ID of parent (or of self for the top of the mount tree)
(3) major:minor: value of st_dev for files on filesystem
(4) root: root of the mount within the filesystem
(5) mount point: mount point relative to the process's root
(6) mount options: per mount options
(7) optional fields: zero or more fields of the form "tag[:value]"
(8) separator: marks the end of the optional fields
(9) filesystem type: name of filesystem of the form "type[.subtype]"
(10) mount source: filesystem specific information or "none"
(11) super options: per super block options*/
mountinfoFormat = "%d %d %d:%d %s %s %s %s"
)
// Parse /proc/self/mountinfo because comparing Dev and ino does not work from
// bind mounts
func parseMountTable() ([]*Info, error) {
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return nil, err
}
defer f.Close()
return parseInfoFile(f)
}
func parseInfoFile(r io.Reader) ([]*Info, error) {
var (
s = bufio.NewScanner(r)
out = []*Info{}
)
for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
var (
p = &Info{}
text = s.Text()
optionalFields string
)
if _, err := fmt.Sscanf(text, mountinfoFormat,
&p.ID, &p.Parent, &p.Major, &p.Minor,
&p.Root, &p.Mountpoint, &p.Opts, &optionalFields); err != nil {
return nil, fmt.Errorf("Scanning '%s' failed: %s", text, err)
}
// Safe as mountinfo encodes mountpoints with spaces as \040.
index := strings.Index(text, " - ")
postSeparatorFields := strings.Fields(text[index+3:])
if len(postSeparatorFields) < 3 {
return nil, fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
}
if optionalFields != "-" {
p.Optional = optionalFields
}
p.Fstype = postSeparatorFields[0]
p.Source = postSeparatorFields[1]
p.VfsOpts = strings.Join(postSeparatorFields[2:], " ")
out = append(out, p)
}
return out, nil
}

View File

@@ -0,0 +1,40 @@
package mount
// Info reveals information about a particular mounted filesystem. This
// struct is populated from the content in the /proc/<pid>/mountinfo file.
type Info struct {
// ID is a unique identifier of the mount (may be reused after umount).
ID int
// Parent indicates the ID of the mount parent (or of self for the top of the
// mount tree).
Parent int
// Major indicates one half of the device ID which identifies the device class.
Major int
// Minor indicates one half of the device ID which identifies a specific
// instance of device.
Minor int
// Root of the mount within the filesystem.
Root string
// Mountpoint indicates the mount point relative to the process's root.
Mountpoint string
// Opts represents mount-specific options.
Opts string
// Optional represents optional fields.
Optional string
// Fstype indicates the type of filesystem, such as EXT3.
Fstype string
// Source indicates filesystem specific information or "none".
Source string
// VfsOpts represents per super block options.
VfsOpts string
}

View File

@@ -0,0 +1,259 @@
// +build linux
package libcontainer
import (
"fmt"
"io/ioutil"
"net"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/vishvananda/netlink"
)
var strategies = map[string]networkStrategy{
"veth": &veth{},
"loopback": &loopback{},
}
// networkStrategy represents a specific network configuration for
// a container's networking stack
type networkStrategy interface {
create(*network, int) error
initialize(*network) error
detach(*configs.Network) error
attach(*configs.Network) error
}
// getStrategy returns the specific network strategy for the
// provided type.
func getStrategy(tpe string) (networkStrategy, error) {
s, exists := strategies[tpe]
if !exists {
return nil, fmt.Errorf("unknown strategy type %q", tpe)
}
return s, nil
}
// Returns the network statistics for the network interfaces represented by the NetworkRuntimeInfo.
func getNetworkInterfaceStats(interfaceName string) (*NetworkInterface, error) {
out := &NetworkInterface{Name: interfaceName}
// This can happen if the network runtime information is missing - possible if the
// container was created by an old version of libcontainer.
if interfaceName == "" {
return out, nil
}
type netStatsPair struct {
// Where to write the output.
Out *uint64
// The network stats file to read.
File string
}
// Ingress for host veth is from the container. Hence tx_bytes stat on the host veth is actually number of bytes received by the container.
netStats := []netStatsPair{
{Out: &out.RxBytes, File: "tx_bytes"},
{Out: &out.RxPackets, File: "tx_packets"},
{Out: &out.RxErrors, File: "tx_errors"},
{Out: &out.RxDropped, File: "tx_dropped"},
{Out: &out.TxBytes, File: "rx_bytes"},
{Out: &out.TxPackets, File: "rx_packets"},
{Out: &out.TxErrors, File: "rx_errors"},
{Out: &out.TxDropped, File: "rx_dropped"},
}
for _, netStat := range netStats {
data, err := readSysfsNetworkStats(interfaceName, netStat.File)
if err != nil {
return nil, err
}
*(netStat.Out) = data
}
return out, nil
}
// Reads the specified statistics available under /sys/class/net/<EthInterface>/statistics
func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) {
data, err := ioutil.ReadFile(filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile))
if err != nil {
return 0, err
}
return strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
}
// loopback is a network strategy that provides a basic loopback device
type loopback struct {
}
func (l *loopback) create(n *network, nspid int) error {
return nil
}
func (l *loopback) initialize(config *network) error {
return netlink.LinkSetUp(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "lo"}})
}
func (l *loopback) attach(n *configs.Network) (err error) {
return nil
}
func (l *loopback) detach(n *configs.Network) (err error) {
return nil
}
// veth is a network strategy that uses a bridge and creates
// a veth pair, one that is attached to the bridge on the host and the other
// is placed inside the container's namespace
type veth struct {
}
func (v *veth) detach(n *configs.Network) (err error) {
return netlink.LinkSetMaster(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil)
}
// attach a container network interface to an external network
func (v *veth) attach(n *configs.Network) (err error) {
brl, err := netlink.LinkByName(n.Bridge)
if err != nil {
return err
}
br, ok := brl.(*netlink.Bridge)
if !ok {
return fmt.Errorf("Wrong device type %T", brl)
}
host, err := netlink.LinkByName(n.HostInterfaceName)
if err != nil {
return err
}
if err := netlink.LinkSetMaster(host, br); err != nil {
return err
}
if err := netlink.LinkSetMTU(host, n.Mtu); err != nil {
return err
}
if n.HairpinMode {
if err := netlink.LinkSetHairpin(host, true); err != nil {
return err
}
}
if err := netlink.LinkSetUp(host); err != nil {
return err
}
return nil
}
func (v *veth) create(n *network, nspid int) (err error) {
tmpName, err := v.generateTempPeerName()
if err != nil {
return err
}
n.TempVethPeerName = tmpName
if n.Bridge == "" {
return fmt.Errorf("bridge is not specified")
}
veth := &netlink.Veth{
LinkAttrs: netlink.LinkAttrs{
Name: n.HostInterfaceName,
TxQLen: n.TxQueueLen,
},
PeerName: n.TempVethPeerName,
}
if err := netlink.LinkAdd(veth); err != nil {
return err
}
defer func() {
if err != nil {
netlink.LinkDel(veth)
}
}()
if err := v.attach(&n.Network); err != nil {
return err
}
child, err := netlink.LinkByName(n.TempVethPeerName)
if err != nil {
return err
}
return netlink.LinkSetNsPid(child, nspid)
}
func (v *veth) generateTempPeerName() (string, error) {
return utils.GenerateRandomName("veth", 7)
}
func (v *veth) initialize(config *network) error {
peer := config.TempVethPeerName
if peer == "" {
return fmt.Errorf("peer is not specified")
}
child, err := netlink.LinkByName(peer)
if err != nil {
return err
}
if err := netlink.LinkSetDown(child); err != nil {
return err
}
if err := netlink.LinkSetName(child, config.Name); err != nil {
return err
}
// get the interface again after we changed the name as the index also changes.
if child, err = netlink.LinkByName(config.Name); err != nil {
return err
}
if config.MacAddress != "" {
mac, err := net.ParseMAC(config.MacAddress)
if err != nil {
return err
}
if err := netlink.LinkSetHardwareAddr(child, mac); err != nil {
return err
}
}
ip, err := netlink.ParseAddr(config.Address)
if err != nil {
return err
}
if err := netlink.AddrAdd(child, ip); err != nil {
return err
}
if config.IPv6Address != "" {
ip6, err := netlink.ParseAddr(config.IPv6Address)
if err != nil {
return err
}
if err := netlink.AddrAdd(child, ip6); err != nil {
return err
}
}
if err := netlink.LinkSetMTU(child, config.Mtu); err != nil {
return err
}
if err := netlink.LinkSetUp(child); err != nil {
return err
}
if config.Gateway != "" {
gw := net.ParseIP(config.Gateway)
if err := netlink.RouteAdd(&netlink.Route{
Scope: netlink.SCOPE_UNIVERSE,
LinkIndex: child.Attrs().Index,
Gw: gw,
}); err != nil {
return err
}
}
if config.IPv6Gateway != "" {
gw := net.ParseIP(config.IPv6Gateway)
if err := netlink.RouteAdd(&netlink.Route{
Scope: netlink.SCOPE_UNIVERSE,
LinkIndex: child.Attrs().Index,
Gw: gw,
}); err != nil {
return err
}
}
return nil
}

View File

@@ -0,0 +1,90 @@
// +build linux
package libcontainer
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"golang.org/x/sys/unix"
)
const oomCgroupName = "memory"
type PressureLevel uint
const (
LowPressure PressureLevel = iota
MediumPressure
CriticalPressure
)
func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) {
evFile, err := os.Open(filepath.Join(cgDir, evName))
if err != nil {
return nil, err
}
fd, err := unix.Eventfd(0, unix.EFD_CLOEXEC)
if err != nil {
evFile.Close()
return nil, err
}
eventfd := os.NewFile(uintptr(fd), "eventfd")
eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil {
eventfd.Close()
evFile.Close()
return nil, err
}
ch := make(chan struct{})
go func() {
defer func() {
eventfd.Close()
evFile.Close()
close(ch)
}()
buf := make([]byte, 8)
for {
if _, err := eventfd.Read(buf); err != nil {
return
}
// When a cgroup is destroyed, an event is sent to eventfd.
// So if the control path is gone, return instead of notifying.
if _, err := os.Lstat(eventControlPath); os.IsNotExist(err) {
return
}
ch <- struct{}{}
}
}()
return ch, nil
}
// notifyOnOOM returns channel on which you can expect event about OOM,
// if process died without OOM this channel will be closed.
func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
dir := paths[oomCgroupName]
if dir == "" {
return nil, fmt.Errorf("path %q missing", oomCgroupName)
}
return registerMemoryEvent(dir, "memory.oom_control", "")
}
func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) {
dir := paths[oomCgroupName]
if dir == "" {
return nil, fmt.Errorf("path %q missing", oomCgroupName)
}
if level > CriticalPressure {
return nil, fmt.Errorf("invalid pressure level %d", level)
}
levelStr := []string{"low", "medium", "critical"}[level]
return registerMemoryEvent(dir, "memory.pressure_level", levelStr)
}

View File

@@ -0,0 +1,32 @@
#ifndef NSENTER_NAMESPACE_H
#define NSENTER_NAMESPACE_H
#ifndef _GNU_SOURCE
# define _GNU_SOURCE
#endif
#include <sched.h>
/* All of these are taken from include/uapi/linux/sched.h */
#ifndef CLONE_NEWNS
# define CLONE_NEWNS 0x00020000 /* New mount namespace group */
#endif
#ifndef CLONE_NEWCGROUP
# define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
#endif
#ifndef CLONE_NEWUTS
# define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
#endif
#ifndef CLONE_NEWIPC
# define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
#endif
#ifndef CLONE_NEWUSER
# define CLONE_NEWUSER 0x10000000 /* New user namespace */
#endif
#ifndef CLONE_NEWPID
# define CLONE_NEWPID 0x20000000 /* New pid namespace */
#endif
#ifndef CLONE_NEWNET
# define CLONE_NEWNET 0x40000000 /* New network namespace */
#endif
#endif /* NSENTER_NAMESPACE_H */

View File

@@ -0,0 +1,12 @@
// +build linux,!gccgo
package nsenter
/*
#cgo CFLAGS: -Wall
extern void nsexec();
void __attribute__((constructor)) init(void) {
nsexec();
}
*/
import "C"

View File

@@ -0,0 +1,25 @@
// +build linux,gccgo
package nsenter
/*
#cgo CFLAGS: -Wall
extern void nsexec();
void __attribute__((constructor)) init(void) {
nsexec();
}
*/
import "C"
// AlwaysFalse is here to stay false
// (and be exported so the compiler doesn't optimize out its reference)
var AlwaysFalse bool
func init() {
if AlwaysFalse {
// by referencing this C init() in a noop test, it will ensure the compiler
// links in the C function.
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134
C.init()
}
}

View File

@@ -0,0 +1,5 @@
// +build !linux !cgo
package nsenter
import "C"

View File

@@ -0,0 +1,963 @@
#define _GNU_SOURCE
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <sched.h>
#include <setjmp.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/prctl.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <linux/limits.h>
#include <linux/netlink.h>
#include <linux/types.h>
/* Get all of the CLONE_NEW* flags. */
#include "namespace.h"
/* Synchronisation values. */
enum sync_t {
SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */
/* XXX: This doesn't help with segfaults and other such issues. */
SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
};
/* longjmp() arguments. */
#define JUMP_PARENT 0x00
#define JUMP_CHILD 0xA0
#define JUMP_INIT 0xA1
/* JSON buffer. */
#define JSON_MAX 4096
/* Assume the stack grows down, so arguments should be above it. */
struct clone_t {
/*
* Reserve some space for clone() to locate arguments
* and retcode in this place
*/
char stack[4096] __attribute__ ((aligned(16)));
char stack_ptr[0];
/* There's two children. This is used to execute the different code. */
jmp_buf *env;
int jmpval;
};
struct nlconfig_t {
char *data;
/* Process settings. */
uint32_t cloneflags;
char *oom_score_adj;
size_t oom_score_adj_len;
/* User namespace settings. */
char *uidmap;
size_t uidmap_len;
char *gidmap;
size_t gidmap_len;
char *namespaces;
size_t namespaces_len;
uint8_t is_setgroup;
/* Rootless container settings. */
uint8_t is_rootless;
char *uidmappath;
size_t uidmappath_len;
char *gidmappath;
size_t gidmappath_len;
};
/*
* List of netlink message types sent to us as part of bootstrapping the init.
* These constants are defined in libcontainer/message_linux.go.
*/
#define INIT_MSG 62000
#define CLONE_FLAGS_ATTR 27281
#define NS_PATHS_ATTR 27282
#define UIDMAP_ATTR 27283
#define GIDMAP_ATTR 27284
#define SETGROUP_ATTR 27285
#define OOM_SCORE_ADJ_ATTR 27286
#define ROOTLESS_ATTR 27287
#define UIDMAPPATH_ATTR 27288
#define GIDMAPPATH_ATTR 27289
/*
* Use the raw syscall for versions of glibc which don't include a function for
* it, namely (glibc 2.12).
*/
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
# define _GNU_SOURCE
# include "syscall.h"
# if !defined(SYS_setns) && defined(__NR_setns)
# define SYS_setns __NR_setns
# endif
#ifndef SYS_setns
# error "setns(2) syscall not supported by glibc version"
#endif
int setns(int fd, int nstype)
{
return syscall(SYS_setns, fd, nstype);
}
#endif
/* XXX: This is ugly. */
static int syncfd = -1;
/* TODO(cyphar): Fix this so it correctly deals with syncT. */
#define bail(fmt, ...) \
do { \
int ret = __COUNTER__ + 1; \
fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
if (syncfd >= 0) { \
enum sync_t s = SYNC_ERR; \
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) \
fprintf(stderr, "nsenter: failed: write(s)"); \
if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret)) \
fprintf(stderr, "nsenter: failed: write(ret)"); \
} \
exit(ret); \
} while(0)
static int write_file(char *data, size_t data_len, char *pathfmt, ...)
{
int fd, len, ret = 0;
char path[PATH_MAX];
va_list ap;
va_start(ap, pathfmt);
len = vsnprintf(path, PATH_MAX, pathfmt, ap);
va_end(ap);
if (len < 0)
return -1;
fd = open(path, O_RDWR);
if (fd < 0) {
return -1;
}
len = write(fd, data, data_len);
if (len != data_len) {
ret = -1;
goto out;
}
out:
close(fd);
return ret;
}
enum policy_t {
SETGROUPS_DEFAULT = 0,
SETGROUPS_ALLOW,
SETGROUPS_DENY,
};
/* This *must* be called before we touch gid_map. */
static void update_setgroups(int pid, enum policy_t setgroup)
{
char *policy;
switch (setgroup) {
case SETGROUPS_ALLOW:
policy = "allow";
break;
case SETGROUPS_DENY:
policy = "deny";
break;
case SETGROUPS_DEFAULT:
default:
/* Nothing to do. */
return;
}
if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
/*
* If the kernel is too old to support /proc/pid/setgroups,
* open(2) or write(2) will return ENOENT. This is fine.
*/
if (errno != ENOENT)
bail("failed to write '%s' to /proc/%d/setgroups", policy, pid);
}
}
static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len)
{
int child;
/*
* If @app is NULL, execve will segfault. Just check it here and bail (if
* we're in this path, the caller is already getting desparate and there
* isn't a backup to this failing). This usually would be a configuration
* or programming issue.
*/
if (!app)
bail("mapping tool not present");
child = fork();
if (child < 0)
bail("failed to fork");
if (!child) {
#define MAX_ARGV 20
char *argv[MAX_ARGV];
char *envp[] = { NULL };
char pid_fmt[16];
int argc = 0;
char *next;
snprintf(pid_fmt, 16, "%d", pid);
argv[argc++] = (char *)app;
argv[argc++] = pid_fmt;
/*
* Convert the map string into a list of argument that
* newuidmap/newgidmap can understand.
*/
while (argc < MAX_ARGV) {
if (*map == '\0') {
argv[argc++] = NULL;
break;
}
argv[argc++] = map;
next = strpbrk(map, "\n ");
if (next == NULL)
break;
*next++ = '\0';
map = next + strspn(next, "\n ");
}
execve(app, argv, envp);
bail("failed to execv");
} else {
int status;
while (true) {
if (waitpid(child, &status, 0) < 0) {
if (errno == EINTR)
continue;
bail("failed to waitpid");
}
if (WIFEXITED(status) || WIFSIGNALED(status))
return WEXITSTATUS(status);
}
}
return -1;
}
static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
{
if (map == NULL || map_len <= 0)
return;
if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
if (errno != EPERM)
bail("failed to update /proc/%d/uid_map", pid);
if (try_mapping_tool(path, pid, map, map_len))
bail("failed to use newuid map on %d", pid);
}
}
static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
{
if (map == NULL || map_len <= 0)
return;
if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
if (errno != EPERM)
bail("failed to update /proc/%d/gid_map", pid);
if (try_mapping_tool(path, pid, map, map_len))
bail("failed to use newgid map on %d", pid);
}
}
static void update_oom_score_adj(char *data, size_t len)
{
if (data == NULL || len <= 0)
return;
if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
bail("failed to update /proc/self/oom_score_adj");
}
/* A dummy function that just jumps to the given jumpval. */
static int child_func(void *arg) __attribute__ ((noinline));
static int child_func(void *arg)
{
struct clone_t *ca = (struct clone_t *)arg;
longjmp(*ca->env, ca->jmpval);
}
static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
static int clone_parent(jmp_buf *env, int jmpval)
{
struct clone_t ca = {
.env = env,
.jmpval = jmpval,
};
return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
}
/*
* Gets the init pipe fd from the environment, which is used to read the
* bootstrap data and tell the parent what the new pid is after we finish
* setting up the environment.
*/
static int initpipe(void)
{
int pipenum;
char *initpipe, *endptr;
initpipe = getenv("_LIBCONTAINER_INITPIPE");
if (initpipe == NULL || *initpipe == '\0')
return -1;
pipenum = strtol(initpipe, &endptr, 10);
if (*endptr != '\0')
bail("unable to parse _LIBCONTAINER_INITPIPE");
return pipenum;
}
/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
static int nsflag(char *name)
{
if (!strcmp(name, "cgroup"))
return CLONE_NEWCGROUP;
else if (!strcmp(name, "ipc"))
return CLONE_NEWIPC;
else if (!strcmp(name, "mnt"))
return CLONE_NEWNS;
else if (!strcmp(name, "net"))
return CLONE_NEWNET;
else if (!strcmp(name, "pid"))
return CLONE_NEWPID;
else if (!strcmp(name, "user"))
return CLONE_NEWUSER;
else if (!strcmp(name, "uts"))
return CLONE_NEWUTS;
/* If we don't recognise a name, fallback to 0. */
return 0;
}
static uint32_t readint32(char *buf)
{
return *(uint32_t *) buf;
}
static uint8_t readint8(char *buf)
{
return *(uint8_t *) buf;
}
static void nl_parse(int fd, struct nlconfig_t *config)
{
size_t len, size;
struct nlmsghdr hdr;
char *data, *current;
/* Retrieve the netlink header. */
len = read(fd, &hdr, NLMSG_HDRLEN);
if (len != NLMSG_HDRLEN)
bail("invalid netlink header length %zu", len);
if (hdr.nlmsg_type == NLMSG_ERROR)
bail("failed to read netlink message");
if (hdr.nlmsg_type != INIT_MSG)
bail("unexpected msg type %d", hdr.nlmsg_type);
/* Retrieve data. */
size = NLMSG_PAYLOAD(&hdr, 0);
current = data = malloc(size);
if (!data)
bail("failed to allocate %zu bytes of memory for nl_payload", size);
len = read(fd, data, size);
if (len != size)
bail("failed to read netlink payload, %zu != %zu", len, size);
/* Parse the netlink payload. */
config->data = data;
while (current < data + size) {
struct nlattr *nlattr = (struct nlattr *)current;
size_t payload_len = nlattr->nla_len - NLA_HDRLEN;
/* Advance to payload. */
current += NLA_HDRLEN;
/* Handle payload. */
switch (nlattr->nla_type) {
case CLONE_FLAGS_ATTR:
config->cloneflags = readint32(current);
break;
case ROOTLESS_ATTR:
config->is_rootless = readint8(current);
break;
case OOM_SCORE_ADJ_ATTR:
config->oom_score_adj = current;
config->oom_score_adj_len = payload_len;
break;
case NS_PATHS_ATTR:
config->namespaces = current;
config->namespaces_len = payload_len;
break;
case UIDMAP_ATTR:
config->uidmap = current;
config->uidmap_len = payload_len;
break;
case GIDMAP_ATTR:
config->gidmap = current;
config->gidmap_len = payload_len;
break;
case UIDMAPPATH_ATTR:
config->uidmappath = current;
config->uidmappath_len = payload_len;
break;
case GIDMAPPATH_ATTR:
config->gidmappath = current;
config->gidmappath_len = payload_len;
break;
case SETGROUP_ATTR:
config->is_setgroup = readint8(current);
break;
default:
bail("unknown netlink message type %d", nlattr->nla_type);
}
current += NLA_ALIGN(payload_len);
}
}
void nl_free(struct nlconfig_t *config)
{
free(config->data);
}
void join_namespaces(char *nslist)
{
int num = 0, i;
char *saveptr = NULL;
char *namespace = strtok_r(nslist, ",", &saveptr);
struct namespace_t {
int fd;
int ns;
char type[PATH_MAX];
char path[PATH_MAX];
} *namespaces = NULL;
if (!namespace || !strlen(namespace) || !strlen(nslist))
bail("ns paths are empty");
/*
* We have to open the file descriptors first, since after
* we join the mnt namespace we might no longer be able to
* access the paths.
*/
do {
int fd;
char *path;
struct namespace_t *ns;
/* Resize the namespace array. */
namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
if (!namespaces)
bail("failed to reallocate namespace array");
ns = &namespaces[num - 1];
/* Split 'ns:path'. */
path = strstr(namespace, ":");
if (!path)
bail("failed to parse %s", namespace);
*path++ = '\0';
fd = open(path, O_RDONLY);
if (fd < 0)
bail("failed to open %s", path);
ns->fd = fd;
ns->ns = nsflag(namespace);
strncpy(ns->path, path, PATH_MAX);
} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
/*
* The ordering in which we join namespaces is important. We should
* always join the user namespace *first*. This is all guaranteed
* from the container_linux.go side of this, so we're just going to
* follow the order given to us.
*/
for (i = 0; i < num; i++) {
struct namespace_t ns = namespaces[i];
if (setns(ns.fd, ns.ns) < 0)
bail("failed to setns to %s", ns.path);
close(ns.fd);
}
free(namespaces);
}
void nsexec(void)
{
int pipenum;
jmp_buf env;
int sync_child_pipe[2], sync_grandchild_pipe[2];
struct nlconfig_t config = { 0 };
/*
* If we don't have an init pipe, just return to the go routine.
* We'll only get an init pipe for start or exec.
*/
pipenum = initpipe();
if (pipenum == -1)
return;
/* Parse all of the netlink configuration. */
nl_parse(pipenum, &config);
/* Set oom_score_adj. This has to be done before !dumpable because
* /proc/self/oom_score_adj is not writeable unless you're an privileged
* user (if !dumpable is set). All children inherit their parent's
* oom_score_adj value on fork(2) so this will always be propagated
* properly.
*/
update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len);
/*
* Make the process non-dumpable, to avoid various race conditions that
* could cause processes in namespaces we're joining to access host
* resources (or potentially execute code).
*
* However, if the number of namespaces we are joining is 0, we are not
* going to be switching to a different security context. Thus setting
* ourselves to be non-dumpable only breaks things (like rootless
* containers), which is the recommendation from the kernel folks.
*/
if (config.namespaces) {
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
bail("failed to set process as non-dumpable");
}
/* Pipe so we can tell the child when we've finished setting up. */
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0)
bail("failed to setup sync pipe between parent and child");
/*
* We need a new socketpair to sync with grandchild so we don't have
* race condition with child.
*/
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0)
bail("failed to setup sync pipe between parent and grandchild");
/* TODO: Currently we aren't dealing with child deaths properly. */
/*
* Okay, so this is quite annoying.
*
* In order for this unsharing code to be more extensible we need to split
* up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case
* would be if we did clone(CLONE_NEWUSER) and the other namespaces
* separately, but because of SELinux issues we cannot really do that. But
* we cannot just dump the namespace flags into clone(...) because several
* usecases (such as rootless containers) require more granularity around
* the namespace setup. In addition, some older kernels had issues where
* CLONE_NEWUSER wasn't handled before other namespaces (but we cannot
* handle this while also dealing with SELinux so we choose SELinux support
* over broken kernel support).
*
* However, if we unshare(2) the user namespace *before* we clone(2), then
* all hell breaks loose.
*
* The parent no longer has permissions to do many things (unshare(2) drops
* all capabilities in your old namespace), and the container cannot be set
* up to have more than one {uid,gid} mapping. This is obviously less than
* ideal. In order to fix this, we have to first clone(2) and then unshare.
*
* Unfortunately, it's not as simple as that. We have to fork to enter the
* PID namespace (the PID namespace only applies to children). Since we'll
* have to double-fork, this clone_parent() call won't be able to get the
* PID of the _actual_ init process (without doing more synchronisation than
* I can deal with at the moment). So we'll just get the parent to send it
* for us, the only job of this process is to update
* /proc/pid/{setgroups,uid_map,gid_map}.
*
* And as a result of the above, we also need to setns(2) in the first child
* because if we join a PID namespace in the topmost parent then our child
* will be in that namespace (and it will not be able to give us a PID value
* that makes sense without resorting to sending things with cmsg).
*
* This also deals with an older issue caused by dumping cloneflags into
* clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
* we have to unshare(2) before clone(2) in order to do this. This was fixed
* in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
* introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
* aware, the last mainline kernel which had this bug was Linux 3.12.
* However, we cannot comment on which kernels the broken patch was
* backported to.
*
* -- Aleksa "what has my life come to?" Sarai
*/
switch (setjmp(env)) {
/*
* Stage 0: We're in the parent. Our job is just to create a new child
* (stage 1: JUMP_CHILD) process and write its uid_map and
* gid_map. That process will go on to create a new process, then
* it will send us its PID which we will send to the bootstrap
* process.
*/
case JUMP_PARENT:{
int len;
pid_t child, first_child = -1;
char buf[JSON_MAX];
bool ready = false;
/* For debugging. */
prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
/* Start the process of getting a container. */
child = clone_parent(&env, JUMP_CHILD);
if (child < 0)
bail("unable to fork: child_func");
/*
* State machine for synchronisation with the children.
*
* Father only return when both child and grandchild are
* ready, so we can receive all possible error codes
* generated by children.
*/
while (!ready) {
enum sync_t s;
int ret;
syncfd = sync_child_pipe[1];
close(sync_child_pipe[0]);
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with child: next state");
switch (s) {
case SYNC_ERR:
/* We have to mirror the error code of the child. */
if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
bail("failed to sync with child: read(error code)");
exit(ret);
case SYNC_USERMAP_PLS:
/*
* Enable setgroups(2) if we've been asked to. But we also
* have to explicitly disable setgroups(2) if we're
* creating a rootless container (this is required since
* Linux 3.19).
*/
if (config.is_rootless && config.is_setgroup) {
kill(child, SIGKILL);
bail("cannot allow setgroup in an unprivileged user namespace setup");
}
if (config.is_setgroup)
update_setgroups(child, SETGROUPS_ALLOW);
if (config.is_rootless)
update_setgroups(child, SETGROUPS_DENY);
/* Set up mappings. */
update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
s = SYNC_USERMAP_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
}
break;
case SYNC_RECVPID_PLS:{
first_child = child;
/* Get the init_func pid. */
if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
kill(first_child, SIGKILL);
bail("failed to sync with child: read(childpid)");
}
/* Send ACK. */
s = SYNC_RECVPID_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(first_child, SIGKILL);
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
}
}
break;
case SYNC_CHILD_READY:
ready = true;
break;
default:
bail("unexpected sync value: %u", s);
}
}
/* Now sync with grandchild. */
ready = false;
while (!ready) {
enum sync_t s;
int ret;
syncfd = sync_grandchild_pipe[1];
close(sync_grandchild_pipe[0]);
s = SYNC_GRANDCHILD;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_GRANDCHILD)");
}
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with child: next state");
switch (s) {
case SYNC_ERR:
/* We have to mirror the error code of the child. */
if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
bail("failed to sync with child: read(error code)");
exit(ret);
case SYNC_CHILD_READY:
ready = true;
break;
default:
bail("unexpected sync value: %u", s);
}
}
/*
* Send the init_func pid and the pid of the first child back to our parent.
*
* We need to send both back because we can't reap the first child we created (CLONE_PARENT).
* It becomes the responsibility of our parent to reap the first child.
*/
len = snprintf(buf, JSON_MAX, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
if (len < 0) {
kill(child, SIGKILL);
bail("unable to generate JSON for child pid");
}
if (write(pipenum, buf, len) != len) {
kill(child, SIGKILL);
bail("unable to send child pid to bootstrapper");
}
exit(0);
}
/*
* Stage 1: We're in the first child process. Our job is to join any
* provided namespaces in the netlink payload and unshare all
* of the requested namespaces. If we've been asked to
* CLONE_NEWUSER, we will ask our parent (stage 0) to set up
* our user mappings for us. Then, we create a new child
* (stage 2: JUMP_INIT) for PID namespace. We then send the
* child's PID to our parent (stage 0).
*/
case JUMP_CHILD:{
pid_t child;
enum sync_t s;
/* We're in a child and thus need to tell the parent if we die. */
syncfd = sync_child_pipe[0];
close(sync_child_pipe[1]);
/* For debugging. */
prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
/*
* We need to setns first. We cannot do this earlier (in stage 0)
* because of the fact that we forked to get here (the PID of
* [stage 2: JUMP_INIT]) would be meaningless). We could send it
* using cmsg(3) but that's just annoying.
*/
if (config.namespaces)
join_namespaces(config.namespaces);
/*
* Unshare all of the namespaces. Now, it should be noted that this
* ordering might break in the future (especially with rootless
* containers). But for now, it's not possible to split this into
* CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
*
* Note that we don't merge this with clone() because there were
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway.
*/
if (unshare(config.cloneflags) < 0)
bail("failed to unshare namespaces");
/*
* Deal with user namespaces first. They are quite special, as they
* affect our ability to unshare other namespaces and are used as
* context for privilege checks.
*/
if (config.cloneflags & CLONE_NEWUSER) {
/*
* We don't have the privileges to do any mapping here (see the
* clone_parent rant). So signal our parent to hook us up.
*/
/* Switching is only necessary if we joined namespaces. */
if (config.namespaces) {
if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
bail("failed to set process as dumpable");
}
s = SYNC_USERMAP_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
/* ... wait for mapping ... */
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
if (s != SYNC_USERMAP_ACK)
bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
/* Switching is only necessary if we joined namespaces. */
if (config.namespaces) {
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
bail("failed to set process as dumpable");
}
}
/*
* TODO: What about non-namespace clone flags that we're dropping here?
*
* We fork again because of PID namespace, setns(2) or unshare(2) don't
* change the PID namespace of the calling process, because doing so
* would change the caller's idea of its own PID (as reported by getpid()),
* which would break many applications and libraries, so we must fork
* to actually enter the new PID namespace.
*/
child = clone_parent(&env, JUMP_INIT);
if (child < 0)
bail("unable to fork: init_func");
/* Send the child to our parent, which knows what it's doing. */
s = SYNC_RECVPID_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
}
if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
kill(child, SIGKILL);
bail("failed to sync with parent: write(childpid)");
}
/* ... wait for parent to get the pid ... */
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
}
if (s != SYNC_RECVPID_ACK) {
kill(child, SIGKILL);
bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
}
s = SYNC_CHILD_READY;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with parent: write(SYNC_CHILD_READY)");
}
/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
exit(0);
}
/*
* Stage 2: We're the final child process, and the only process that will
* actually return to the Go runtime. Our job is to just do the
* final cleanup steps and then return to the Go runtime to allow
* init_linux.go to run.
*/
case JUMP_INIT:{
/*
* We're inside the child now, having jumped from the
* start_child() code after forking in the parent.
*/
enum sync_t s;
/* We're in a child and thus need to tell the parent if we die. */
syncfd = sync_grandchild_pipe[0];
close(sync_grandchild_pipe[1]);
close(sync_child_pipe[0]);
close(sync_child_pipe[1]);
/* For debugging. */
prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
if (s != SYNC_GRANDCHILD)
bail("failed to sync with parent: SYNC_GRANDCHILD: got %u", s);
if (setsid() < 0)
bail("setsid failed");
if (setuid(0) < 0)
bail("setuid failed");
if (setgid(0) < 0)
bail("setgid failed");
if (!config.is_rootless && config.is_setgroup) {
if (setgroups(0, NULL) < 0)
bail("setgroups failed");
}
s = SYNC_CHILD_READY;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with patent: write(SYNC_CHILD_READY)");
/* Close sync pipes. */
close(sync_grandchild_pipe[0]);
/* Free netlink data. */
nl_free(&config);
/* Finish executing, let the Go runtime take over. */
return;
}
default:
bail("unexpected jump value");
}
/* Should never be reached. */
bail("should never be reached");
}

View File

@@ -0,0 +1,110 @@
package libcontainer
import (
"fmt"
"io"
"math"
"os"
"github.com/opencontainers/runc/libcontainer/configs"
)
type processOperations interface {
wait() (*os.ProcessState, error)
signal(sig os.Signal) error
pid() int
}
// Process specifies the configuration and IO for a process inside
// a container.
type Process struct {
// The command to be run followed by any arguments.
Args []string
// Env specifies the environment variables for the process.
Env []string
// User will set the uid and gid of the executing process running inside the container
// local to the container's user and group configuration.
User string
// AdditionalGroups specifies the gids that should be added to supplementary groups
// in addition to those that the user belongs to.
AdditionalGroups []string
// Cwd will change the processes current working directory inside the container's rootfs.
Cwd string
// Stdin is a pointer to a reader which provides the standard input stream.
Stdin io.Reader
// Stdout is a pointer to a writer which receives the standard output stream.
Stdout io.Writer
// Stderr is a pointer to a writer which receives the standard error stream.
Stderr io.Writer
// ExtraFiles specifies additional open files to be inherited by the container
ExtraFiles []*os.File
// Initial sizings for the console
ConsoleWidth uint16
ConsoleHeight uint16
// Capabilities specify the capabilities to keep when executing the process inside the container
// All capabilities not specified will be dropped from the processes capability mask
Capabilities *configs.Capabilities
// AppArmorProfile specifies the profile to apply to the process and is
// changed at the time the process is execed
AppArmorProfile string
// Label specifies the label to apply to the process. It is commonly used by selinux
Label string
// NoNewPrivileges controls whether processes can gain additional privileges.
NoNewPrivileges *bool
// Rlimits specifies the resource limits, such as max open files, to set in the container
// If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []configs.Rlimit
// ConsoleSocket provides the masterfd console.
ConsoleSocket *os.File
ops processOperations
}
// Wait waits for the process to exit.
// Wait releases any resources associated with the Process
func (p Process) Wait() (*os.ProcessState, error) {
if p.ops == nil {
return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
}
return p.ops.wait()
}
// Pid returns the process ID
func (p Process) Pid() (int, error) {
// math.MinInt32 is returned here, because it's invalid value
// for the kill() system call.
if p.ops == nil {
return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
}
return p.ops.pid(), nil
}
// Signal sends a signal to the Process.
func (p Process) Signal(sig os.Signal) error {
if p.ops == nil {
return newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
}
return p.ops.signal(sig)
}
// IO holds the process's STDIO
type IO struct {
Stdin io.WriteCloser
Stdout io.ReadCloser
Stderr io.ReadCloser
}

View File

@@ -0,0 +1,547 @@
// +build linux
package libcontainer
import (
"encoding/json"
"errors"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"strconv"
"syscall" // only for Signal
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
"golang.org/x/sys/unix"
)
type parentProcess interface {
// pid returns the pid for the running process.
pid() int
// start starts the process execution.
start() error
// send a SIGKILL to the process and wait for the exit.
terminate() error
// wait waits on the process returning the process state.
wait() (*os.ProcessState, error)
// startTime returns the process start time.
startTime() (uint64, error)
signal(os.Signal) error
externalDescriptors() []string
setExternalDescriptors(fds []string)
}
type setnsProcess struct {
cmd *exec.Cmd
parentPipe *os.File
childPipe *os.File
cgroupPaths map[string]string
intelRdtPath string
config *initConfig
fds []string
process *Process
bootstrapData io.Reader
}
func (p *setnsProcess) startTime() (uint64, error) {
stat, err := system.Stat(p.pid())
return stat.StartTime, err
}
func (p *setnsProcess) signal(sig os.Signal) error {
s, ok := sig.(syscall.Signal)
if !ok {
return errors.New("os: unsupported signal type")
}
return unix.Kill(p.pid(), s)
}
func (p *setnsProcess) start() (err error) {
defer p.parentPipe.Close()
err = p.cmd.Start()
p.childPipe.Close()
if err != nil {
return newSystemErrorWithCause(err, "starting setns process")
}
if p.bootstrapData != nil {
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
}
if err = p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "executing setns process")
}
if len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
}
}
if p.intelRdtPath != "" {
// if Intel RDT "resource control" filesystem path exists
_, err := os.Stat(p.intelRdtPath)
if err == nil {
if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid())
}
}
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for process")
}
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
return newSystemErrorWithCause(err, "writing config to pipe")
}
ierr := parseSync(p.parentPipe, func(sync *syncT) error {
switch sync.Type {
case procReady:
// This shouldn't happen.
panic("unexpected procReady in setns")
case procHooks:
// This shouldn't happen.
panic("unexpected procHooks in setns")
default:
return newSystemError(fmt.Errorf("invalid JSON payload from child"))
}
})
if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil {
return newSystemErrorWithCause(err, "calling shutdown on init pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return ierr
}
return nil
}
// execSetns runs the process that executes C code to perform the setns calls
// because setns support requires the C process to fork off a child and perform the setns
// before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe.
func (p *setnsProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
return newSystemErrorWithCause(err, "waiting on setns process to finish")
}
if !status.Success() {
p.cmd.Wait()
return newSystemError(&exec.ExitError{ProcessState: status})
}
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return newSystemErrorWithCause(err, "reading pid from init pipe")
}
// Clean up the zombie parent process
firstChildProcess, err := os.FindProcess(pid.PidFirstChild)
if err != nil {
return err
}
// Ignore the error in case the child has already been reaped for any reason
_, _ = firstChildProcess.Wait()
process, err := os.FindProcess(pid.Pid)
if err != nil {
return err
}
p.cmd.Process = process
p.process.ops = p
return nil
}
// terminate sends a SIGKILL to the forked process for the setns routine then waits to
// avoid the process becoming a zombie.
func (p *setnsProcess) terminate() error {
if p.cmd.Process == nil {
return nil
}
err := p.cmd.Process.Kill()
if _, werr := p.wait(); err == nil {
err = werr
}
return err
}
func (p *setnsProcess) wait() (*os.ProcessState, error) {
err := p.cmd.Wait()
// Return actual ProcessState even on Wait error
return p.cmd.ProcessState, err
}
func (p *setnsProcess) pid() int {
return p.cmd.Process.Pid
}
func (p *setnsProcess) externalDescriptors() []string {
return p.fds
}
func (p *setnsProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
type initProcess struct {
cmd *exec.Cmd
parentPipe *os.File
childPipe *os.File
config *initConfig
manager cgroups.Manager
intelRdtManager intelrdt.Manager
container *linuxContainer
fds []string
process *Process
bootstrapData io.Reader
sharePidns bool
}
func (p *initProcess) pid() int {
return p.cmd.Process.Pid
}
func (p *initProcess) externalDescriptors() []string {
return p.fds
}
// execSetns runs the process that executes C code to perform the setns calls
// because setns support requires the C process to fork off a child and perform the setns
// before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe.
// This is called by initProcess.start function
func (p *initProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
return err
}
if !status.Success() {
p.cmd.Wait()
return &exec.ExitError{ProcessState: status}
}
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return err
}
// Clean up the zombie parent process
firstChildProcess, err := os.FindProcess(pid.PidFirstChild)
if err != nil {
return err
}
// Ignore the error in case the child has already been reaped for any reason
_, _ = firstChildProcess.Wait()
process, err := os.FindProcess(pid.Pid)
if err != nil {
return err
}
p.cmd.Process = process
p.process.ops = p
return nil
}
func (p *initProcess) start() error {
defer p.parentPipe.Close()
err := p.cmd.Start()
p.process.ops = p
p.childPipe.Close()
if err != nil {
p.process.ops = nil
return newSystemErrorWithCause(err, "starting init process command")
}
// Do this before syncing with child so that no children can escape the
// cgroup. We don't need to worry about not doing this and not being root
// because we'd be using the rootless cgroup manager in that case.
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
}
}
defer func() {
if err != nil {
// TODO: should not be the responsibility to call here
p.manager.Destroy()
if p.intelRdtManager != nil {
p.intelRdtManager.Destroy()
}
}
}()
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "running exec setns process for init")
}
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(p.pid())
if err != nil {
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
}
p.setExternalDescriptors(fds)
if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating network interfaces")
}
if err := p.sendConfig(); err != nil {
return newSystemErrorWithCause(err, "sending config to init process")
}
var (
sentRun bool
sentResume bool
)
ierr := parseSync(p.parentPipe, func(sync *syncT) error {
switch sync.Type {
case procReady:
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for ready process")
}
// call prestart hooks
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting Intel RDT config for ready process")
}
}
if p.config.Config.Hooks != nil {
bundle, annotations := utils.Annotations(p.container.config.Labels)
s := configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Bundle: bundle,
Annotations: annotations,
}
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
}
// Sync with child.
if err := writeSync(p.parentPipe, procRun); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'run'")
}
sentRun = true
case procHooks:
// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")
}
}
if p.config.Config.Hooks != nil {
bundle, annotations := utils.Annotations(p.container.config.Labels)
s := configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Bundle: bundle,
Annotations: annotations,
}
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
// Sync with child.
if err := writeSync(p.parentPipe, procResume); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'resume'")
}
sentResume = true
default:
return newSystemError(fmt.Errorf("invalid JSON payload from child"))
}
return nil
})
if !sentRun {
return newSystemErrorWithCause(ierr, "container init")
}
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
}
if err := unix.Shutdown(int(p.parentPipe.Fd()), unix.SHUT_WR); err != nil {
return newSystemErrorWithCause(err, "shutting down init pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return ierr
}
return nil
}
func (p *initProcess) wait() (*os.ProcessState, error) {
err := p.cmd.Wait()
if err != nil {
return p.cmd.ProcessState, err
}
// we should kill all processes in cgroup when init is died if we use host PID namespace
if p.sharePidns {
signalAllProcesses(p.manager, unix.SIGKILL)
}
return p.cmd.ProcessState, nil
}
func (p *initProcess) terminate() error {
if p.cmd.Process == nil {
return nil
}
err := p.cmd.Process.Kill()
if _, werr := p.wait(); err == nil {
err = werr
}
return err
}
func (p *initProcess) startTime() (uint64, error) {
stat, err := system.Stat(p.pid())
return stat.StartTime, err
}
func (p *initProcess) sendConfig() error {
// send the config to the container's init process, we don't use JSON Encode
// here because there might be a problem in JSON decoder in some cases, see:
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
return utils.WriteJSON(p.parentPipe, p.config)
}
func (p *initProcess) createNetworkInterfaces() error {
for _, config := range p.config.Config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
n := &network{
Network: *config,
}
if err := strategy.create(n, p.pid()); err != nil {
return err
}
p.config.Networks = append(p.config.Networks, n)
}
return nil
}
func (p *initProcess) signal(sig os.Signal) error {
s, ok := sig.(syscall.Signal)
if !ok {
return errors.New("os: unsupported signal type")
}
return unix.Kill(p.pid(), s)
}
func (p *initProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
func getPipeFds(pid int) ([]string, error) {
fds := make([]string, 3)
dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
for i := 0; i < 3; i++ {
// XXX: This breaks if the path is not a valid symlink (which can
// happen in certain particularly unlucky mount namespace setups).
f := filepath.Join(dirPath, strconv.Itoa(i))
target, err := os.Readlink(f)
if err != nil {
// Ignore permission errors, for rootless containers and other
// non-dumpable processes. if we can't get the fd for a particular
// file, there's not much we can do.
if os.IsPermission(err) {
continue
}
return fds, err
}
fds[i] = target
}
return fds, nil
}
// InitializeIO creates pipes for use with the process's stdio and returns the
// opposite side for each. Do not use this if you want to have a pseudoterminal
// set up for you by libcontainer (TODO: fix that too).
// TODO: This is mostly unnecessary, and should be handled by clients.
func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
var fds []uintptr
i = &IO{}
// cleanup in case of an error
defer func() {
if err != nil {
for _, fd := range fds {
unix.Close(int(fd))
}
}
}()
// STDIN
r, w, err := os.Pipe()
if err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stdin, i.Stdin = r, w
// STDOUT
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stdout, i.Stdout = w, r
// STDERR
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stderr, i.Stderr = w, r
// change ownership of the pipes incase we are in a user namespace
for _, fd := range fds {
if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
return nil, err
}
}
return i, nil
}

View File

@@ -0,0 +1,122 @@
// +build linux
package libcontainer
import (
"fmt"
"os"
"github.com/opencontainers/runc/libcontainer/system"
)
func newRestoredProcess(pid int, fds []string) (*restoredProcess, error) {
var (
err error
)
proc, err := os.FindProcess(pid)
if err != nil {
return nil, err
}
stat, err := system.Stat(pid)
if err != nil {
return nil, err
}
return &restoredProcess{
proc: proc,
processStartTime: stat.StartTime,
fds: fds,
}, nil
}
type restoredProcess struct {
proc *os.Process
processStartTime uint64
fds []string
}
func (p *restoredProcess) start() error {
return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError)
}
func (p *restoredProcess) pid() int {
return p.proc.Pid
}
func (p *restoredProcess) terminate() error {
err := p.proc.Kill()
if _, werr := p.wait(); err == nil {
err = werr
}
return err
}
func (p *restoredProcess) wait() (*os.ProcessState, error) {
// TODO: how do we wait on the actual process?
// maybe use --exec-cmd in criu
st, err := p.proc.Wait()
if err != nil {
return nil, err
}
return st, nil
}
func (p *restoredProcess) startTime() (uint64, error) {
return p.processStartTime, nil
}
func (p *restoredProcess) signal(s os.Signal) error {
return p.proc.Signal(s)
}
func (p *restoredProcess) externalDescriptors() []string {
return p.fds
}
func (p *restoredProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
// nonChildProcess represents a process where the calling process is not
// the parent process. This process is created when a factory loads a container from
// a persisted state.
type nonChildProcess struct {
processPid int
processStartTime uint64
fds []string
}
func (p *nonChildProcess) start() error {
return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError)
}
func (p *nonChildProcess) pid() int {
return p.processPid
}
func (p *nonChildProcess) terminate() error {
return newGenericError(fmt.Errorf("restored process cannot be terminated"), SystemError)
}
func (p *nonChildProcess) wait() (*os.ProcessState, error) {
return nil, newGenericError(fmt.Errorf("restored process cannot be waited on"), SystemError)
}
func (p *nonChildProcess) startTime() (uint64, error) {
return p.processStartTime, nil
}
func (p *nonChildProcess) signal(s os.Signal) error {
proc, err := os.FindProcess(p.processPid)
if err != nil {
return err
}
return proc.Signal(s)
}
func (p *nonChildProcess) externalDescriptors() []string {
return p.fds
}
func (p *nonChildProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}

View File

@@ -0,0 +1,838 @@
// +build linux
package libcontainer
import (
"fmt"
"io"
"io/ioutil"
"os"
"os/exec"
"path"
"path/filepath"
"strings"
"time"
"github.com/cyphar/filepath-securejoin"
"github.com/mrunalp/fileutils"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/mount"
"github.com/opencontainers/runc/libcontainer/system"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/selinux/go-selinux/label"
"golang.org/x/sys/unix"
)
const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
// needsSetupDev returns true if /dev needs to be set up.
func needsSetupDev(config *configs.Config) bool {
for _, m := range config.Mounts {
if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" {
return false
}
}
return true
}
// prepareRootfs sets up the devices, mount points, and filesystems for use
// inside a new mount namespace. It doesn't set anything as ro. You must call
// finalizeRootfs after this function to finish setting up the rootfs.
func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
config := iConfig.Config
if err := prepareRoot(config); err != nil {
return newSystemErrorWithCause(err, "preparing rootfs")
}
setupDev := needsSetupDev(config)
for _, m := range config.Mounts {
for _, precmd := range m.PremountCmds {
if err := mountCmd(precmd); err != nil {
return newSystemErrorWithCause(err, "running premount command")
}
}
if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil {
return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination)
}
for _, postcmd := range m.PostmountCmds {
if err := mountCmd(postcmd); err != nil {
return newSystemErrorWithCause(err, "running postmount command")
}
}
}
if setupDev {
if err := createDevices(config); err != nil {
return newSystemErrorWithCause(err, "creating device nodes")
}
if err := setupPtmx(config); err != nil {
return newSystemErrorWithCause(err, "setting up ptmx")
}
if err := setupDevSymlinks(config.Rootfs); err != nil {
return newSystemErrorWithCause(err, "setting up /dev symlinks")
}
}
// Signal the parent to run the pre-start hooks.
// The hooks are run after the mounts are setup, but before we switch to the new
// root, so that the old root is still available in the hooks for any mount
// manipulations.
// Note that iConfig.Cwd is not guaranteed to exist here.
if err := syncParentHooks(pipe); err != nil {
return err
}
// The reason these operations are done here rather than in finalizeRootfs
// is because the console-handling code gets quite sticky if we have to set
// up the console before doing the pivot_root(2). This is because the
// Console API has to also work with the ExecIn case, which means that the
// API must be able to deal with being inside as well as outside the
// container. It's just cleaner to do this here (at the expense of the
// operation not being perfectly split).
if err := unix.Chdir(config.Rootfs); err != nil {
return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs)
}
if config.NoPivotRoot {
err = msMoveRoot(config.Rootfs)
} else if config.Namespaces.Contains(configs.NEWNS) {
err = pivotRoot(config.Rootfs)
} else {
err = chroot(config.Rootfs)
}
if err != nil {
return newSystemErrorWithCause(err, "jailing process inside rootfs")
}
if setupDev {
if err := reOpenDevNull(); err != nil {
return newSystemErrorWithCause(err, "reopening /dev/null inside container")
}
}
if cwd := iConfig.Cwd; cwd != "" {
// Note that spec.Process.Cwd can contain unclean value like "../../../../foo/bar...".
// However, we are safe to call MkDirAll directly because we are in the jail here.
if err := os.MkdirAll(cwd, 0755); err != nil {
return err
}
}
return nil
}
// finalizeRootfs sets anything to ro if necessary. You must call
// prepareRootfs first.
func finalizeRootfs(config *configs.Config) (err error) {
// remount dev as ro if specified
for _, m := range config.Mounts {
if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY {
if err := remountReadonly(m); err != nil {
return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
}
}
break
}
}
// set rootfs ( / ) as readonly
if config.Readonlyfs {
if err := setReadonly(); err != nil {
return newSystemErrorWithCause(err, "setting rootfs as readonly")
}
}
unix.Umask(0022)
return nil
}
func mountCmd(cmd configs.Command) error {
command := exec.Command(cmd.Path, cmd.Args[:]...)
command.Env = cmd.Env
command.Dir = cmd.Dir
if out, err := command.CombinedOutput(); err != nil {
return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err)
}
return nil
}
func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
var (
dest = m.Destination
)
if !strings.HasPrefix(dest, rootfs) {
dest = filepath.Join(rootfs, dest)
}
switch m.Device {
case "proc", "sysfs":
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
// Selinux kernels do not support labeling of /proc or /sys
return mountPropagate(m, rootfs, "")
case "mqueue":
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
if err := mountPropagate(m, rootfs, mountLabel); err != nil {
// older kernels do not support labeling of /dev/mqueue
if err := mountPropagate(m, rootfs, ""); err != nil {
return err
}
return label.SetFileLabel(dest, mountLabel)
}
return nil
case "tmpfs":
copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
tmpDir := ""
stat, err := os.Stat(dest)
if err != nil {
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
}
if copyUp {
tmpDir, err = ioutil.TempDir("/tmp", "runctmpdir")
if err != nil {
return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
}
defer os.RemoveAll(tmpDir)
m.Destination = tmpDir
}
if err := mountPropagate(m, rootfs, mountLabel); err != nil {
return err
}
if copyUp {
if err := fileutils.CopyDirectory(dest, tmpDir); err != nil {
errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err)
if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
}
return errMsg
}
if err := unix.Mount(tmpDir, dest, "", unix.MS_MOVE, ""); err != nil {
errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err)
if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil {
return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg)
}
return errMsg
}
}
if stat != nil {
if err = os.Chmod(dest, stat.Mode()); err != nil {
return err
}
}
return nil
case "bind":
stat, err := os.Stat(m.Source)
if err != nil {
// error out if the source of a bind mount does not exist as we will be
// unable to bind anything to it.
return err
}
// ensure that the destination of the bind mount is resolved of symlinks at mount time because
// any previous mounts can invalidate the next mount's destination.
// this can happen when a user specifies mounts within other mounts to cause breakouts or other
// evil stuff to try to escape the container's rootfs.
if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
return err
}
if err := checkMountDestination(rootfs, dest); err != nil {
return err
}
// update the mount with the correct dest after symlinks are resolved.
m.Destination = dest
if err := createIfNotExists(dest, stat.IsDir()); err != nil {
return err
}
if err := mountPropagate(m, rootfs, mountLabel); err != nil {
return err
}
// bind mount won't change mount options, we need remount to make mount options effective.
// first check that we have non-default options required before attempting a remount
if m.Flags&^(unix.MS_REC|unix.MS_REMOUNT|unix.MS_BIND) != 0 {
// only remount if unique mount options are set
if err := remount(m, rootfs); err != nil {
return err
}
}
if m.Relabel != "" {
if err := label.Validate(m.Relabel); err != nil {
return err
}
shared := label.IsShared(m.Relabel)
if err := label.Relabel(m.Source, mountLabel, shared); err != nil {
return err
}
}
case "cgroup":
binds, err := getCgroupMounts(m)
if err != nil {
return err
}
var merged []string
for _, b := range binds {
ss := filepath.Base(b.Destination)
if strings.Contains(ss, ",") {
merged = append(merged, ss)
}
}
tmpfs := &configs.Mount{
Source: "tmpfs",
Device: "tmpfs",
Destination: m.Destination,
Flags: defaultMountFlags,
Data: "mode=755",
PropagationFlags: m.PropagationFlags,
}
if err := mountToRootfs(tmpfs, rootfs, mountLabel); err != nil {
return err
}
for _, b := range binds {
if err := mountToRootfs(b, rootfs, mountLabel); err != nil {
return err
}
}
for _, mc := range merged {
for _, ss := range strings.Split(mc, ",") {
// symlink(2) is very dumb, it will just shove the path into
// the link and doesn't do any checks or relative path
// conversion. Also, don't error out if the cgroup already exists.
if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) {
return err
}
}
}
if m.Flags&unix.MS_RDONLY != 0 {
// remount cgroup root as readonly
mcgrouproot := &configs.Mount{
Source: m.Destination,
Device: "bind",
Destination: m.Destination,
Flags: defaultMountFlags | unix.MS_RDONLY | unix.MS_BIND,
}
if err := remount(mcgrouproot, rootfs); err != nil {
return err
}
}
default:
// ensure that the destination of the mount is resolved of symlinks at mount time because
// any previous mounts can invalidate the next mount's destination.
// this can happen when a user specifies mounts within other mounts to cause breakouts or other
// evil stuff to try to escape the container's rootfs.
var err error
if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
return err
}
if err := checkMountDestination(rootfs, dest); err != nil {
return err
}
// update the mount with the correct dest after symlinks are resolved.
m.Destination = dest
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
return mountPropagate(m, rootfs, mountLabel)
}
return nil
}
func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
mounts, err := cgroups.GetCgroupMounts(false)
if err != nil {
return nil, err
}
cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return nil, err
}
var binds []*configs.Mount
for _, mm := range mounts {
dir, err := mm.GetOwnCgroup(cgroupPaths)
if err != nil {
return nil, err
}
relDir, err := filepath.Rel(mm.Root, dir)
if err != nil {
return nil, err
}
binds = append(binds, &configs.Mount{
Device: "bind",
Source: filepath.Join(mm.Mountpoint, relDir),
Destination: filepath.Join(m.Destination, filepath.Base(mm.Mountpoint)),
Flags: unix.MS_BIND | unix.MS_REC | m.Flags,
PropagationFlags: m.PropagationFlags,
})
}
return binds, nil
}
// checkMountDestination checks to ensure that the mount destination is not over the top of /proc.
// dest is required to be an abs path and have any symlinks resolved before calling this function.
func checkMountDestination(rootfs, dest string) error {
invalidDestinations := []string{
"/proc",
}
// White list, it should be sub directories of invalid destinations
validDestinations := []string{
// These entries can be bind mounted by files emulated by fuse,
// so commands like top, free displays stats in container.
"/proc/cpuinfo",
"/proc/diskstats",
"/proc/meminfo",
"/proc/stat",
"/proc/swaps",
"/proc/uptime",
"/proc/net/dev",
}
for _, valid := range validDestinations {
path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
if err != nil {
return err
}
if path == "." {
return nil
}
}
for _, invalid := range invalidDestinations {
path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest)
if err != nil {
return err
}
if path == "." || !strings.HasPrefix(path, "..") {
return fmt.Errorf("%q cannot be mounted because it is located inside %q", dest, invalid)
}
}
return nil
}
func setupDevSymlinks(rootfs string) error {
var links = [][2]string{
{"/proc/self/fd", "/dev/fd"},
{"/proc/self/fd/0", "/dev/stdin"},
{"/proc/self/fd/1", "/dev/stdout"},
{"/proc/self/fd/2", "/dev/stderr"},
}
// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
// in /dev if it exists in /proc.
if _, err := os.Stat("/proc/kcore"); err == nil {
links = append(links, [2]string{"/proc/kcore", "/dev/core"})
}
for _, link := range links {
var (
src = link[0]
dst = filepath.Join(rootfs, link[1])
)
if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) {
return fmt.Errorf("symlink %s %s %s", src, dst, err)
}
}
return nil
}
// If stdin, stdout, and/or stderr are pointing to `/dev/null` in the parent's rootfs
// this method will make them point to `/dev/null` in this container's rootfs. This
// needs to be called after we chroot/pivot into the container's rootfs so that any
// symlinks are resolved locally.
func reOpenDevNull() error {
var stat, devNullStat unix.Stat_t
file, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
if err != nil {
return fmt.Errorf("Failed to open /dev/null - %s", err)
}
defer file.Close()
if err := unix.Fstat(int(file.Fd()), &devNullStat); err != nil {
return err
}
for fd := 0; fd < 3; fd++ {
if err := unix.Fstat(fd, &stat); err != nil {
return err
}
if stat.Rdev == devNullStat.Rdev {
// Close and re-open the fd.
if err := unix.Dup3(int(file.Fd()), fd, 0); err != nil {
return err
}
}
}
return nil
}
// Create the device nodes in the container.
func createDevices(config *configs.Config) error {
useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
oldMask := unix.Umask(0000)
for _, node := range config.Devices {
// containers running in a user namespace are not allowed to mknod
// devices so we can just bind mount it from the host.
if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
unix.Umask(oldMask)
return err
}
}
unix.Umask(oldMask)
return nil
}
func bindMountDeviceNode(dest string, node *configs.Device) error {
f, err := os.Create(dest)
if err != nil && !os.IsExist(err) {
return err
}
if f != nil {
f.Close()
}
return unix.Mount(node.Path, dest, "bind", unix.MS_BIND, "")
}
// Creates the device node in the rootfs of the container.
func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
dest := filepath.Join(rootfs, node.Path)
if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
return err
}
if bind {
return bindMountDeviceNode(dest, node)
}
if err := mknodDevice(dest, node); err != nil {
if os.IsExist(err) {
return nil
} else if os.IsPermission(err) {
return bindMountDeviceNode(dest, node)
}
return err
}
return nil
}
func mknodDevice(dest string, node *configs.Device) error {
fileMode := node.FileMode
switch node.Type {
case 'c', 'u':
fileMode |= unix.S_IFCHR
case 'b':
fileMode |= unix.S_IFBLK
case 'p':
fileMode |= unix.S_IFIFO
default:
return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path)
}
if err := unix.Mknod(dest, uint32(fileMode), node.Mkdev()); err != nil {
return err
}
return unix.Chown(dest, int(node.Uid), int(node.Gid))
}
func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info {
for _, m := range mountinfo {
if m.Mountpoint == dir {
return m
}
}
return nil
}
// Get the parent mount point of directory passed in as argument. Also return
// optional fields.
func getParentMount(rootfs string) (string, string, error) {
var path string
mountinfos, err := mount.GetMounts()
if err != nil {
return "", "", err
}
mountinfo := getMountInfo(mountinfos, rootfs)
if mountinfo != nil {
return rootfs, mountinfo.Optional, nil
}
path = rootfs
for {
path = filepath.Dir(path)
mountinfo = getMountInfo(mountinfos, path)
if mountinfo != nil {
return path, mountinfo.Optional, nil
}
if path == "/" {
break
}
}
// If we are here, we did not find parent mount. Something is wrong.
return "", "", fmt.Errorf("Could not find parent mount of %s", rootfs)
}
// Make parent mount private if it was shared
func rootfsParentMountPrivate(rootfs string) error {
sharedMount := false
parentMount, optionalOpts, err := getParentMount(rootfs)
if err != nil {
return err
}
optsSplit := strings.Split(optionalOpts, " ")
for _, opt := range optsSplit {
if strings.HasPrefix(opt, "shared:") {
sharedMount = true
break
}
}
// Make parent mount PRIVATE if it was shared. It is needed for two
// reasons. First of all pivot_root() will fail if parent mount is
// shared. Secondly when we bind mount rootfs it will propagate to
// parent namespace and we don't want that to happen.
if sharedMount {
return unix.Mount("", parentMount, "", unix.MS_PRIVATE, "")
}
return nil
}
func prepareRoot(config *configs.Config) error {
flag := unix.MS_SLAVE | unix.MS_REC
if config.RootPropagation != 0 {
flag = config.RootPropagation
}
if err := unix.Mount("", "/", "", uintptr(flag), ""); err != nil {
return err
}
// Make parent mount private to make sure following bind mount does
// not propagate in other namespaces. Also it will help with kernel
// check pass in pivot_root. (IS_SHARED(new_mnt->mnt_parent))
if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
return err
}
return unix.Mount(config.Rootfs, config.Rootfs, "bind", unix.MS_BIND|unix.MS_REC, "")
}
func setReadonly() error {
return unix.Mount("/", "/", "bind", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "")
}
func setupPtmx(config *configs.Config) error {
ptmx := filepath.Join(config.Rootfs, "dev/ptmx")
if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
return err
}
if err := os.Symlink("pts/ptmx", ptmx); err != nil {
return fmt.Errorf("symlink dev ptmx %s", err)
}
return nil
}
// pivotRoot will call pivot_root such that rootfs becomes the new root
// filesystem, and everything else is cleaned up.
func pivotRoot(rootfs string) error {
// While the documentation may claim otherwise, pivot_root(".", ".") is
// actually valid. What this results in is / being the new root but
// /proc/self/cwd being the old root. Since we can play around with the cwd
// with pivot_root this allows us to pivot without creating directories in
// the rootfs. Shout-outs to the LXC developers for giving us this idea.
oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return err
}
defer unix.Close(oldroot)
newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return err
}
defer unix.Close(newroot)
// Change to the new root so that the pivot_root actually acts on it.
if err := unix.Fchdir(newroot); err != nil {
return err
}
if err := unix.PivotRoot(".", "."); err != nil {
return fmt.Errorf("pivot_root %s", err)
}
// Currently our "." is oldroot (according to the current kernel code).
// However, purely for safety, we will fchdir(oldroot) since there isn't
// really any guarantee from the kernel what /proc/self/cwd will be after a
// pivot_root(2).
if err := unix.Fchdir(oldroot); err != nil {
return err
}
// Make oldroot rslave to make sure our unmounts don't propagate to the
// host (and thus bork the machine). We don't use rprivate because this is
// known to cause issues due to races where we still have a reference to a
// mount while a process in the host namespace are trying to operate on
// something they think has no mounts (devicemapper in particular).
if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
return err
}
// Preform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
if err := unix.Unmount(".", unix.MNT_DETACH); err != nil {
return err
}
// Switch back to our shiny new root.
if err := unix.Chdir("/"); err != nil {
return fmt.Errorf("chdir / %s", err)
}
return nil
}
func msMoveRoot(rootfs string) error {
if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil {
return err
}
return chroot(rootfs)
}
func chroot(rootfs string) error {
if err := unix.Chroot("."); err != nil {
return err
}
return unix.Chdir("/")
}
// createIfNotExists creates a file or a directory only if it does not already exist.
func createIfNotExists(path string, isDir bool) error {
if _, err := os.Stat(path); err != nil {
if os.IsNotExist(err) {
if isDir {
return os.MkdirAll(path, 0755)
}
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return err
}
f, err := os.OpenFile(path, os.O_CREATE, 0755)
if err != nil {
return err
}
f.Close()
}
}
return nil
}
// readonlyPath will make a path read only.
func readonlyPath(path string) error {
if err := unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
return unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "")
}
// remountReadonly will remount an existing mount point and ensure that it is read-only.
func remountReadonly(m *configs.Mount) error {
var (
dest = m.Destination
flags = m.Flags
)
for i := 0; i < 5; i++ {
// There is a special case in the kernel for
// MS_REMOUNT | MS_BIND, which allows us to change only the
// flags even as an unprivileged user (i.e. user namespace)
// assuming we don't drop any security related flags (nodev,
// nosuid, etc.). So, let's use that case so that we can do
// this re-mount without failing in a userns.
flags |= unix.MS_REMOUNT | unix.MS_BIND | unix.MS_RDONLY
if err := unix.Mount("", dest, "", uintptr(flags), ""); err != nil {
switch err {
case unix.EBUSY:
time.Sleep(100 * time.Millisecond)
continue
default:
return err
}
}
return nil
}
return fmt.Errorf("unable to mount %s as readonly max retries reached", dest)
}
// maskPath masks the top of the specified path inside a container to avoid
// security issues from processes reading information from non-namespace aware
// mounts ( proc/kcore ).
// For files, maskPath bind mounts /dev/null over the top of the specified path.
// For directories, maskPath mounts read-only tmpfs over the top of the specified path.
func maskPath(path string) error {
if err := unix.Mount("/dev/null", path, "", unix.MS_BIND, ""); err != nil && !os.IsNotExist(err) {
if err == unix.ENOTDIR {
return unix.Mount("tmpfs", path, "tmpfs", unix.MS_RDONLY, "")
}
return err
}
return nil
}
// writeSystemProperty writes the value to a path under /proc/sys as determined from the key.
// For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward.
func writeSystemProperty(key, value string) error {
keyPath := strings.Replace(key, ".", "/", -1)
return ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0644)
}
func remount(m *configs.Mount, rootfs string) error {
var (
dest = m.Destination
)
if !strings.HasPrefix(dest, rootfs) {
dest = filepath.Join(rootfs, dest)
}
if err := unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), ""); err != nil {
return err
}
return nil
}
// Do the mount operation followed by additional mounts required to take care
// of propagation flags.
func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
var (
dest = m.Destination
data = label.FormatMountLabel(m.Data, mountLabel)
flags = m.Flags
)
if libcontainerUtils.CleanPath(dest) == "/dev" {
flags &= ^unix.MS_RDONLY
}
copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
if !(copyUp || strings.HasPrefix(dest, rootfs)) {
dest = filepath.Join(rootfs, dest)
}
if err := unix.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
return err
}
for _, pflag := range m.PropagationFlags {
if err := unix.Mount("", dest, "", uintptr(pflag), ""); err != nil {
return err
}
}
return nil
}

View File

@@ -0,0 +1,76 @@
package seccomp
import (
"fmt"
"github.com/opencontainers/runc/libcontainer/configs"
)
var operators = map[string]configs.Operator{
"SCMP_CMP_NE": configs.NotEqualTo,
"SCMP_CMP_LT": configs.LessThan,
"SCMP_CMP_LE": configs.LessThanOrEqualTo,
"SCMP_CMP_EQ": configs.EqualTo,
"SCMP_CMP_GE": configs.GreaterThanOrEqualTo,
"SCMP_CMP_GT": configs.GreaterThan,
"SCMP_CMP_MASKED_EQ": configs.MaskEqualTo,
}
var actions = map[string]configs.Action{
"SCMP_ACT_KILL": configs.Kill,
"SCMP_ACT_ERRNO": configs.Errno,
"SCMP_ACT_TRAP": configs.Trap,
"SCMP_ACT_ALLOW": configs.Allow,
"SCMP_ACT_TRACE": configs.Trace,
}
var archs = map[string]string{
"SCMP_ARCH_X86": "x86",
"SCMP_ARCH_X86_64": "amd64",
"SCMP_ARCH_X32": "x32",
"SCMP_ARCH_ARM": "arm",
"SCMP_ARCH_AARCH64": "arm64",
"SCMP_ARCH_MIPS": "mips",
"SCMP_ARCH_MIPS64": "mips64",
"SCMP_ARCH_MIPS64N32": "mips64n32",
"SCMP_ARCH_MIPSEL": "mipsel",
"SCMP_ARCH_MIPSEL64": "mipsel64",
"SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
"SCMP_ARCH_PPC": "ppc",
"SCMP_ARCH_PPC64": "ppc64",
"SCMP_ARCH_PPC64LE": "ppc64le",
"SCMP_ARCH_S390": "s390",
"SCMP_ARCH_S390X": "s390x",
}
// ConvertStringToOperator converts a string into a Seccomp comparison operator.
// Comparison operators use the names they are assigned by Libseccomp's header.
// Attempting to convert a string that is not a valid operator results in an
// error.
func ConvertStringToOperator(in string) (configs.Operator, error) {
if op, ok := operators[in]; ok == true {
return op, nil
}
return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in)
}
// ConvertStringToAction converts a string into a Seccomp rule match action.
// Actions use the names they are assigned in Libseccomp's header, though some
// (notable, SCMP_ACT_TRACE) are not available in this implementation and will
// return errors.
// Attempting to convert a string that is not a valid action results in an
// error.
func ConvertStringToAction(in string) (configs.Action, error) {
if act, ok := actions[in]; ok == true {
return act, nil
}
return 0, fmt.Errorf("string %s is not a valid action for seccomp", in)
}
// ConvertStringToArch converts a string into a Seccomp comparison arch.
func ConvertStringToArch(in string) (string, error) {
if arch, ok := archs[in]; ok == true {
return arch, nil
}
return "", fmt.Errorf("string %s is not a valid arch for seccomp", in)
}

View File

@@ -0,0 +1,258 @@
// +build linux,cgo,seccomp
package seccomp
import (
"bufio"
"fmt"
"os"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
libseccomp "github.com/seccomp/libseccomp-golang"
"golang.org/x/sys/unix"
)
var (
actAllow = libseccomp.ActAllow
actTrap = libseccomp.ActTrap
actKill = libseccomp.ActKill
actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM))
actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM))
)
const (
// Linux system calls can have at most 6 arguments
syscallMaxArguments int = 6
)
// Filters given syscalls in a container, preventing them from being used
// Started in the container init process, and carried over to all child processes
// Setns calls, however, require a separate invocation, as they are not children
// of the init until they join the namespace
func InitSeccomp(config *configs.Seccomp) error {
if config == nil {
return fmt.Errorf("cannot initialize Seccomp - nil config passed")
}
defaultAction, err := getAction(config.DefaultAction)
if err != nil {
return fmt.Errorf("error initializing seccomp - invalid default action")
}
filter, err := libseccomp.NewFilter(defaultAction)
if err != nil {
return fmt.Errorf("error creating filter: %s", err)
}
// Add extra architectures
for _, arch := range config.Architectures {
scmpArch, err := libseccomp.GetArchFromString(arch)
if err != nil {
return fmt.Errorf("error validating Seccomp architecture: %s", err)
}
if err := filter.AddArch(scmpArch); err != nil {
return fmt.Errorf("error adding architecture to seccomp filter: %s", err)
}
}
// Unset no new privs bit
if err := filter.SetNoNewPrivsBit(false); err != nil {
return fmt.Errorf("error setting no new privileges: %s", err)
}
// Add a rule for each syscall
for _, call := range config.Syscalls {
if call == nil {
return fmt.Errorf("encountered nil syscall while initializing Seccomp")
}
if err = matchCall(filter, call); err != nil {
return err
}
}
if err = filter.Load(); err != nil {
return fmt.Errorf("error loading seccomp filter into kernel: %s", err)
}
return nil
}
// IsEnabled returns if the kernel has been configured to support seccomp.
func IsEnabled() bool {
// Try to read from /proc/self/status for kernels > 3.8
s, err := parseStatusFile("/proc/self/status")
if err != nil {
// Check if Seccomp is supported, via CONFIG_SECCOMP.
if err := unix.Prctl(unix.PR_GET_SECCOMP, 0, 0, 0, 0); err != unix.EINVAL {
// Make sure the kernel has CONFIG_SECCOMP_FILTER.
if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, 0, 0, 0); err != unix.EINVAL {
return true
}
}
return false
}
_, ok := s["Seccomp"]
return ok
}
// Convert Libcontainer Action to Libseccomp ScmpAction
func getAction(act configs.Action) (libseccomp.ScmpAction, error) {
switch act {
case configs.Kill:
return actKill, nil
case configs.Errno:
return actErrno, nil
case configs.Trap:
return actTrap, nil
case configs.Allow:
return actAllow, nil
case configs.Trace:
return actTrace, nil
default:
return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule")
}
}
// Convert Libcontainer Operator to Libseccomp ScmpCompareOp
func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) {
switch op {
case configs.EqualTo:
return libseccomp.CompareEqual, nil
case configs.NotEqualTo:
return libseccomp.CompareNotEqual, nil
case configs.GreaterThan:
return libseccomp.CompareGreater, nil
case configs.GreaterThanOrEqualTo:
return libseccomp.CompareGreaterEqual, nil
case configs.LessThan:
return libseccomp.CompareLess, nil
case configs.LessThanOrEqualTo:
return libseccomp.CompareLessOrEqual, nil
case configs.MaskEqualTo:
return libseccomp.CompareMaskedEqual, nil
default:
return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule")
}
}
// Convert Libcontainer Arg to Libseccomp ScmpCondition
func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) {
cond := libseccomp.ScmpCondition{}
if arg == nil {
return cond, fmt.Errorf("cannot convert nil to syscall condition")
}
op, err := getOperator(arg.Op)
if err != nil {
return cond, err
}
return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo)
}
// Add a rule to match a single syscall
func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
if call == nil || filter == nil {
return fmt.Errorf("cannot use nil as syscall to block")
}
if len(call.Name) == 0 {
return fmt.Errorf("empty string is not a valid syscall")
}
// If we can't resolve the syscall, assume it's not supported on this kernel
// Ignore it, don't error out
callNum, err := libseccomp.GetSyscallFromName(call.Name)
if err != nil {
return nil
}
// Convert the call's action to the libseccomp equivalent
callAct, err := getAction(call.Action)
if err != nil {
return fmt.Errorf("action in seccomp profile is invalid: %s", err)
}
// Unconditional match - just add the rule
if len(call.Args) == 0 {
if err = filter.AddRule(callNum, callAct); err != nil {
return fmt.Errorf("error adding seccomp filter rule for syscall %s: %s", call.Name, err)
}
} else {
// If two or more arguments have the same condition,
// Revert to old behavior, adding each condition as a separate rule
argCounts := make([]uint, syscallMaxArguments)
conditions := []libseccomp.ScmpCondition{}
for _, cond := range call.Args {
newCond, err := getCondition(cond)
if err != nil {
return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %s", call.Name, err)
}
argCounts[cond.Index] += 1
conditions = append(conditions, newCond)
}
hasMultipleArgs := false
for _, count := range argCounts {
if count > 1 {
hasMultipleArgs = true
break
}
}
if hasMultipleArgs {
// Revert to old behavior
// Add each condition attached to a separate rule
for _, cond := range conditions {
condArr := []libseccomp.ScmpCondition{cond}
if err = filter.AddRuleConditional(callNum, callAct, condArr); err != nil {
return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
}
}
} else {
// No conditions share same argument
// Use new, proper behavior
if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
}
}
}
return nil
}
func parseStatusFile(path string) (map[string]string, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
s := bufio.NewScanner(f)
status := make(map[string]string)
for s.Scan() {
text := s.Text()
parts := strings.Split(text, ":")
if len(parts) <= 1 {
continue
}
status[parts[0]] = parts[1]
}
if err := s.Err(); err != nil {
return nil, err
}
return status, nil
}

View File

@@ -0,0 +1,24 @@
// +build !linux !cgo !seccomp
package seccomp
import (
"errors"
"github.com/opencontainers/runc/libcontainer/configs"
)
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
// InitSeccomp does nothing because seccomp is not supported.
func InitSeccomp(config *configs.Seccomp) error {
if config != nil {
return ErrSeccompNotEnabled
}
return nil
}
// IsEnabled returns false, because it is not supported.
func IsEnabled() bool {
return false
}

View File

@@ -0,0 +1,76 @@
// +build linux
package libcontainer
import (
"fmt"
"os"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/selinux/go-selinux/label"
"golang.org/x/sys/unix"
)
// linuxSetnsInit performs the container's initialization for running a new process
// inside an existing container.
type linuxSetnsInit struct {
pipe *os.File
consoleSocket *os.File
config *initConfig
}
func (l *linuxSetnsInit) getSessionRingName() string {
return fmt.Sprintf("_ses.%s", l.config.ContainerId)
}
func (l *linuxSetnsInit) Init() error {
if !l.config.Config.NoNewKeyring {
// do not inherit the parent's session keyring
if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil {
return err
}
}
if l.config.CreateConsole {
if err := setupConsole(l.consoleSocket, l.config, false); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return err
}
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err
}
}
if err := finalizeNamespace(l.config); err != nil {
return err
}
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
}
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}

View File

@@ -0,0 +1,221 @@
package specconv
import (
"os"
"strings"
"github.com/opencontainers/runtime-spec/specs-go"
)
// Example returns an example spec file, with many options set so a user can
// see what a standard spec file looks like.
func Example() *specs.Spec {
return &specs.Spec{
Version: specs.Version,
Root: &specs.Root{
Path: "rootfs",
Readonly: true,
},
Process: &specs.Process{
Terminal: true,
User: specs.User{},
Args: []string{
"sh",
},
Env: []string{
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"TERM=xterm",
},
Cwd: "/",
NoNewPrivileges: true,
Capabilities: &specs.LinuxCapabilities{
Bounding: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Permitted: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Inheritable: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Ambient: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
Effective: []string{
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE",
},
},
Rlimits: []specs.POSIXRlimit{
{
Type: "RLIMIT_NOFILE",
Hard: uint64(1024),
Soft: uint64(1024),
},
},
},
Hostname: "runc",
Mounts: []specs.Mount{
{
Destination: "/proc",
Type: "proc",
Source: "proc",
Options: nil,
},
{
Destination: "/dev",
Type: "tmpfs",
Source: "tmpfs",
Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
},
{
Destination: "/dev/pts",
Type: "devpts",
Source: "devpts",
Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
},
{
Destination: "/dev/shm",
Type: "tmpfs",
Source: "shm",
Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"},
},
{
Destination: "/dev/mqueue",
Type: "mqueue",
Source: "mqueue",
Options: []string{"nosuid", "noexec", "nodev"},
},
{
Destination: "/sys",
Type: "sysfs",
Source: "sysfs",
Options: []string{"nosuid", "noexec", "nodev", "ro"},
},
{
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Source: "cgroup",
Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
},
},
Linux: &specs.Linux{
MaskedPaths: []string{
"/proc/kcore",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/sys/firmware",
"/proc/scsi",
},
ReadonlyPaths: []string{
"/proc/asound",
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger",
},
Resources: &specs.LinuxResources{
Devices: []specs.LinuxDeviceCgroup{
{
Allow: false,
Access: "rwm",
},
},
},
Namespaces: []specs.LinuxNamespace{
{
Type: "pid",
},
{
Type: "network",
},
{
Type: "ipc",
},
{
Type: "uts",
},
{
Type: "mount",
},
},
},
}
}
// ToRootless converts the given spec file into one that should work with
// rootless containers, by removing incompatible options and adding others that
// are needed.
func ToRootless(spec *specs.Spec) {
var namespaces []specs.LinuxNamespace
// Remove networkns from the spec.
for _, ns := range spec.Linux.Namespaces {
switch ns.Type {
case specs.NetworkNamespace, specs.UserNamespace:
// Do nothing.
default:
namespaces = append(namespaces, ns)
}
}
// Add userns to the spec.
namespaces = append(namespaces, specs.LinuxNamespace{
Type: specs.UserNamespace,
})
spec.Linux.Namespaces = namespaces
// Add mappings for the current user.
spec.Linux.UIDMappings = []specs.LinuxIDMapping{{
HostID: uint32(os.Geteuid()),
ContainerID: 0,
Size: 1,
}}
spec.Linux.GIDMappings = []specs.LinuxIDMapping{{
HostID: uint32(os.Getegid()),
ContainerID: 0,
Size: 1,
}}
// Fix up mounts.
var mounts []specs.Mount
for _, mount := range spec.Mounts {
// Ignore all mounts that are under /sys.
if strings.HasPrefix(mount.Destination, "/sys") {
continue
}
// Remove all gid= and uid= mappings.
var options []string
for _, option := range mount.Options {
if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") {
options = append(options, option)
}
}
mount.Options = options
mounts = append(mounts, mount)
}
// Add the sysfs mount as an rbind.
mounts = append(mounts, specs.Mount{
Source: "/sys",
Destination: "/sys",
Type: "none",
Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"},
})
spec.Mounts = mounts
// Remove cgroup settings.
spec.Linux.Resources = nil
}

View File

@@ -0,0 +1,832 @@
// +build linux
// Package specconv implements conversion of specifications to libcontainer
// configurations
package specconv
import (
"fmt"
"os"
"path/filepath"
"strings"
"time"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/seccomp"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
)
const wildcard = -1
var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{
specs.PIDNamespace: configs.NEWPID,
specs.NetworkNamespace: configs.NEWNET,
specs.MountNamespace: configs.NEWNS,
specs.UserNamespace: configs.NEWUSER,
specs.IPCNamespace: configs.NEWIPC,
specs.UTSNamespace: configs.NEWUTS,
}
var mountPropagationMapping = map[string]int{
"rprivate": unix.MS_PRIVATE | unix.MS_REC,
"private": unix.MS_PRIVATE,
"rslave": unix.MS_SLAVE | unix.MS_REC,
"slave": unix.MS_SLAVE,
"rshared": unix.MS_SHARED | unix.MS_REC,
"shared": unix.MS_SHARED,
"runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
"unbindable": unix.MS_UNBINDABLE,
"": 0,
}
var allowedDevices = []*configs.Device{
// allow mknod for any device
{
Type: 'c',
Major: wildcard,
Minor: wildcard,
Permissions: "m",
Allow: true,
},
{
Type: 'b',
Major: wildcard,
Minor: wildcard,
Permissions: "m",
Allow: true,
},
{
Type: 'c',
Path: "/dev/null",
Major: 1,
Minor: 3,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/random",
Major: 1,
Minor: 8,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/full",
Major: 1,
Minor: 7,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/tty",
Major: 5,
Minor: 0,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/zero",
Major: 1,
Minor: 5,
Permissions: "rwm",
Allow: true,
},
{
Type: 'c',
Path: "/dev/urandom",
Major: 1,
Minor: 9,
Permissions: "rwm",
Allow: true,
},
{
Path: "/dev/console",
Type: 'c',
Major: 5,
Minor: 1,
Permissions: "rwm",
Allow: true,
},
// /dev/pts/ - pts namespaces are "coming soon"
{
Path: "",
Type: 'c',
Major: 136,
Minor: wildcard,
Permissions: "rwm",
Allow: true,
},
{
Path: "",
Type: 'c',
Major: 5,
Minor: 2,
Permissions: "rwm",
Allow: true,
},
// tuntap
{
Path: "",
Type: 'c',
Major: 10,
Minor: 200,
Permissions: "rwm",
Allow: true,
},
}
type CreateOpts struct {
CgroupName string
UseSystemdCgroup bool
NoPivotRoot bool
NoNewKeyring bool
Spec *specs.Spec
Rootless bool
}
// CreateLibcontainerConfig creates a new libcontainer configuration from a
// given specification and a cgroup name
func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
// runc's cwd will always be the bundle path
rcwd, err := os.Getwd()
if err != nil {
return nil, err
}
cwd, err := filepath.Abs(rcwd)
if err != nil {
return nil, err
}
spec := opts.Spec
if spec.Root == nil {
return nil, fmt.Errorf("Root must be specified")
}
rootfsPath := spec.Root.Path
if !filepath.IsAbs(rootfsPath) {
rootfsPath = filepath.Join(cwd, rootfsPath)
}
labels := []string{}
for k, v := range spec.Annotations {
labels = append(labels, fmt.Sprintf("%s=%s", k, v))
}
config := &configs.Config{
Rootfs: rootfsPath,
NoPivotRoot: opts.NoPivotRoot,
Readonlyfs: spec.Root.Readonly,
Hostname: spec.Hostname,
Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)),
NoNewKeyring: opts.NoNewKeyring,
Rootless: opts.Rootless,
}
exists := false
for _, m := range spec.Mounts {
config.Mounts = append(config.Mounts, createLibcontainerMount(cwd, m))
}
if err := createDevices(spec, config); err != nil {
return nil, err
}
c, err := createCgroupConfig(opts)
if err != nil {
return nil, err
}
config.Cgroups = c
// set linux-specific config
if spec.Linux != nil {
if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists {
return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation)
}
if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) {
return nil, fmt.Errorf("rootfsPropagation of [r]private is not safe without pivot_root")
}
for _, ns := range spec.Linux.Namespaces {
t, exists := namespaceMapping[ns.Type]
if !exists {
return nil, fmt.Errorf("namespace %q does not exist", ns)
}
if config.Namespaces.Contains(t) {
return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns)
}
config.Namespaces.Add(t, ns.Path)
}
if config.Namespaces.Contains(configs.NEWNET) {
config.Networks = []*configs.Network{
{
Type: "loopback",
},
}
}
if config.Namespaces.Contains(configs.NEWUSER) {
if err := setupUserNamespace(spec, config); err != nil {
return nil, err
}
}
config.MaskPaths = spec.Linux.MaskedPaths
config.ReadonlyPaths = spec.Linux.ReadonlyPaths
config.MountLabel = spec.Linux.MountLabel
config.Sysctl = spec.Linux.Sysctl
if spec.Linux.Seccomp != nil {
seccomp, err := setupSeccomp(spec.Linux.Seccomp)
if err != nil {
return nil, err
}
config.Seccomp = seccomp
}
}
if spec.Process.SelinuxLabel != "" {
config.ProcessLabel = spec.Process.SelinuxLabel
}
if spec.Process != nil && spec.Process.OOMScoreAdj != nil {
config.OomScoreAdj = *spec.Process.OOMScoreAdj
}
if spec.Process.Capabilities != nil {
config.Capabilities = &configs.Capabilities{
Bounding: spec.Process.Capabilities.Bounding,
Effective: spec.Process.Capabilities.Effective,
Permitted: spec.Process.Capabilities.Permitted,
Inheritable: spec.Process.Capabilities.Inheritable,
Ambient: spec.Process.Capabilities.Ambient,
}
}
createHooks(spec, config)
config.Version = specs.Version
if spec.Linux.IntelRdt != nil {
config.IntelRdt = &configs.IntelRdt{}
if spec.Linux.IntelRdt.L3CacheSchema != "" {
config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema
}
}
return config, nil
}
func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount {
flags, pgflags, data, ext := parseMountOptions(m.Options)
source := m.Source
if m.Type == "bind" {
if !filepath.IsAbs(source) {
source = filepath.Join(cwd, m.Source)
}
}
return &configs.Mount{
Device: m.Type,
Source: source,
Destination: m.Destination,
Data: data,
Flags: flags,
PropagationFlags: pgflags,
Extensions: ext,
}
}
func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) {
var (
myCgroupPath string
spec = opts.Spec
useSystemdCgroup = opts.UseSystemdCgroup
name = opts.CgroupName
)
c := &configs.Cgroup{
Resources: &configs.Resources{},
}
if spec.Linux != nil && spec.Linux.CgroupsPath != "" {
myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath)
if useSystemdCgroup {
myCgroupPath = spec.Linux.CgroupsPath
}
}
if useSystemdCgroup {
if myCgroupPath == "" {
c.Parent = "system.slice"
c.ScopePrefix = "runc"
c.Name = name
} else {
// Parse the path from expected "slice:prefix:name"
// for e.g. "system.slice:docker:1234"
parts := strings.Split(myCgroupPath, ":")
if len(parts) != 3 {
return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups")
}
c.Parent = parts[0]
c.ScopePrefix = parts[1]
c.Name = parts[2]
}
} else {
if myCgroupPath == "" {
c.Name = name
}
c.Path = myCgroupPath
}
// In rootless containers, any attempt to make cgroup changes will fail.
// libcontainer will validate this and we shouldn't add any cgroup options
// the user didn't specify.
if !opts.Rootless {
c.Resources.AllowedDevices = allowedDevices
}
if spec.Linux != nil {
r := spec.Linux.Resources
if r == nil {
return c, nil
}
for i, d := range spec.Linux.Resources.Devices {
var (
t = "a"
major = int64(-1)
minor = int64(-1)
)
if d.Type != "" {
t = d.Type
}
if d.Major != nil {
major = *d.Major
}
if d.Minor != nil {
minor = *d.Minor
}
if d.Access == "" {
return nil, fmt.Errorf("device access at %d field cannot be empty", i)
}
dt, err := stringToCgroupDeviceRune(t)
if err != nil {
return nil, err
}
dd := &configs.Device{
Type: dt,
Major: major,
Minor: minor,
Permissions: d.Access,
Allow: d.Allow,
}
c.Resources.Devices = append(c.Resources.Devices, dd)
}
if r.Memory != nil {
if r.Memory.Limit != nil {
c.Resources.Memory = *r.Memory.Limit
}
if r.Memory.Reservation != nil {
c.Resources.MemoryReservation = *r.Memory.Reservation
}
if r.Memory.Swap != nil {
c.Resources.MemorySwap = *r.Memory.Swap
}
if r.Memory.Kernel != nil {
c.Resources.KernelMemory = *r.Memory.Kernel
}
if r.Memory.KernelTCP != nil {
c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP
}
if r.Memory.Swappiness != nil {
c.Resources.MemorySwappiness = r.Memory.Swappiness
}
if r.Memory.DisableOOMKiller != nil {
c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller
}
}
if r.CPU != nil {
if r.CPU.Shares != nil {
c.Resources.CpuShares = *r.CPU.Shares
}
if r.CPU.Quota != nil {
c.Resources.CpuQuota = *r.CPU.Quota
}
if r.CPU.Period != nil {
c.Resources.CpuPeriod = *r.CPU.Period
}
if r.CPU.RealtimeRuntime != nil {
c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime
}
if r.CPU.RealtimePeriod != nil {
c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod
}
if r.CPU.Cpus != "" {
c.Resources.CpusetCpus = r.CPU.Cpus
}
if r.CPU.Mems != "" {
c.Resources.CpusetMems = r.CPU.Mems
}
}
if r.Pids != nil {
c.Resources.PidsLimit = r.Pids.Limit
}
if r.BlockIO != nil {
if r.BlockIO.Weight != nil {
c.Resources.BlkioWeight = *r.BlockIO.Weight
}
if r.BlockIO.LeafWeight != nil {
c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight
}
if r.BlockIO.WeightDevice != nil {
for _, wd := range r.BlockIO.WeightDevice {
var weight, leafWeight uint16
if wd.Weight != nil {
weight = *wd.Weight
}
if wd.LeafWeight != nil {
leafWeight = *wd.LeafWeight
}
weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight)
c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice)
}
}
if r.BlockIO.ThrottleReadBpsDevice != nil {
for _, td := range r.BlockIO.ThrottleReadBpsDevice {
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleWriteBpsDevice != nil {
for _, td := range r.BlockIO.ThrottleWriteBpsDevice {
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleReadIOPSDevice != nil {
for _, td := range r.BlockIO.ThrottleReadIOPSDevice {
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice)
}
}
if r.BlockIO.ThrottleWriteIOPSDevice != nil {
for _, td := range r.BlockIO.ThrottleWriteIOPSDevice {
rate := td.Rate
throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate)
c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice)
}
}
}
for _, l := range r.HugepageLimits {
c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{
Pagesize: l.Pagesize,
Limit: l.Limit,
})
}
if r.Network != nil {
if r.Network.ClassID != nil {
c.Resources.NetClsClassid = *r.Network.ClassID
}
for _, m := range r.Network.Priorities {
c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{
Interface: m.Name,
Priority: int64(m.Priority),
})
}
}
}
if !opts.Rootless {
// append the default allowed devices to the end of the list
c.Resources.Devices = append(c.Resources.Devices, allowedDevices...)
}
return c, nil
}
func stringToCgroupDeviceRune(s string) (rune, error) {
switch s {
case "a":
return 'a', nil
case "b":
return 'b', nil
case "c":
return 'c', nil
default:
return 0, fmt.Errorf("invalid cgroup device type %q", s)
}
}
func stringToDeviceRune(s string) (rune, error) {
switch s {
case "p":
return 'p', nil
case "u":
return 'u', nil
case "b":
return 'b', nil
case "c":
return 'c', nil
default:
return 0, fmt.Errorf("invalid device type %q", s)
}
}
func createDevices(spec *specs.Spec, config *configs.Config) error {
// add whitelisted devices
config.Devices = []*configs.Device{
{
Type: 'c',
Path: "/dev/null",
Major: 1,
Minor: 3,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/random",
Major: 1,
Minor: 8,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/full",
Major: 1,
Minor: 7,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/tty",
Major: 5,
Minor: 0,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/zero",
Major: 1,
Minor: 5,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
{
Type: 'c',
Path: "/dev/urandom",
Major: 1,
Minor: 9,
FileMode: 0666,
Uid: 0,
Gid: 0,
},
}
// merge in additional devices from the spec
if spec.Linux != nil {
for _, d := range spec.Linux.Devices {
var uid, gid uint32
var filemode os.FileMode = 0666
if d.UID != nil {
uid = *d.UID
}
if d.GID != nil {
gid = *d.GID
}
dt, err := stringToDeviceRune(d.Type)
if err != nil {
return err
}
if d.FileMode != nil {
filemode = *d.FileMode
}
device := &configs.Device{
Type: dt,
Path: d.Path,
Major: d.Major,
Minor: d.Minor,
FileMode: filemode,
Uid: uid,
Gid: gid,
}
config.Devices = append(config.Devices, device)
}
}
return nil
}
func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
create := func(m specs.LinuxIDMapping) configs.IDMap {
return configs.IDMap{
HostID: int(m.HostID),
ContainerID: int(m.ContainerID),
Size: int(m.Size),
}
}
if spec.Linux != nil {
for _, m := range spec.Linux.UIDMappings {
config.UidMappings = append(config.UidMappings, create(m))
}
for _, m := range spec.Linux.GIDMappings {
config.GidMappings = append(config.GidMappings, create(m))
}
}
rootUID, err := config.HostRootUID()
if err != nil {
return err
}
rootGID, err := config.HostRootGID()
if err != nil {
return err
}
for _, node := range config.Devices {
node.Uid = uint32(rootUID)
node.Gid = uint32(rootGID)
}
return nil
}
// parseMountOptions parses the string and returns the flags, propagation
// flags and any mount data that it contains.
func parseMountOptions(options []string) (int, []int, string, int) {
var (
flag int
pgflag []int
data []string
extFlags int
)
flags := map[string]struct {
clear bool
flag int
}{
"acl": {false, unix.MS_POSIXACL},
"async": {true, unix.MS_SYNCHRONOUS},
"atime": {true, unix.MS_NOATIME},
"bind": {false, unix.MS_BIND},
"defaults": {false, 0},
"dev": {true, unix.MS_NODEV},
"diratime": {true, unix.MS_NODIRATIME},
"dirsync": {false, unix.MS_DIRSYNC},
"exec": {true, unix.MS_NOEXEC},
"iversion": {false, unix.MS_I_VERSION},
"lazytime": {false, unix.MS_LAZYTIME},
"loud": {true, unix.MS_SILENT},
"mand": {false, unix.MS_MANDLOCK},
"noacl": {true, unix.MS_POSIXACL},
"noatime": {false, unix.MS_NOATIME},
"nodev": {false, unix.MS_NODEV},
"nodiratime": {false, unix.MS_NODIRATIME},
"noexec": {false, unix.MS_NOEXEC},
"noiversion": {true, unix.MS_I_VERSION},
"nolazytime": {true, unix.MS_LAZYTIME},
"nomand": {true, unix.MS_MANDLOCK},
"norelatime": {true, unix.MS_RELATIME},
"nostrictatime": {true, unix.MS_STRICTATIME},
"nosuid": {false, unix.MS_NOSUID},
"rbind": {false, unix.MS_BIND | unix.MS_REC},
"relatime": {false, unix.MS_RELATIME},
"remount": {false, unix.MS_REMOUNT},
"ro": {false, unix.MS_RDONLY},
"rw": {true, unix.MS_RDONLY},
"silent": {false, unix.MS_SILENT},
"strictatime": {false, unix.MS_STRICTATIME},
"suid": {true, unix.MS_NOSUID},
"sync": {false, unix.MS_SYNCHRONOUS},
}
propagationFlags := map[string]int{
"private": unix.MS_PRIVATE,
"shared": unix.MS_SHARED,
"slave": unix.MS_SLAVE,
"unbindable": unix.MS_UNBINDABLE,
"rprivate": unix.MS_PRIVATE | unix.MS_REC,
"rshared": unix.MS_SHARED | unix.MS_REC,
"rslave": unix.MS_SLAVE | unix.MS_REC,
"runbindable": unix.MS_UNBINDABLE | unix.MS_REC,
}
extensionFlags := map[string]struct {
clear bool
flag int
}{
"tmpcopyup": {false, configs.EXT_COPYUP},
}
for _, o := range options {
// If the option does not exist in the flags table or the flag
// is not supported on the platform,
// then it is a data value for a specific fs type
if f, exists := flags[o]; exists && f.flag != 0 {
if f.clear {
flag &= ^f.flag
} else {
flag |= f.flag
}
} else if f, exists := propagationFlags[o]; exists && f != 0 {
pgflag = append(pgflag, f)
} else if f, exists := extensionFlags[o]; exists && f.flag != 0 {
if f.clear {
extFlags &= ^f.flag
} else {
extFlags |= f.flag
}
} else {
data = append(data, o)
}
}
return flag, pgflag, strings.Join(data, ","), extFlags
}
func setupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) {
if config == nil {
return nil, nil
}
// No default action specified, no syscalls listed, assume seccomp disabled
if config.DefaultAction == "" && len(config.Syscalls) == 0 {
return nil, nil
}
newConfig := new(configs.Seccomp)
newConfig.Syscalls = []*configs.Syscall{}
if len(config.Architectures) > 0 {
newConfig.Architectures = []string{}
for _, arch := range config.Architectures {
newArch, err := seccomp.ConvertStringToArch(string(arch))
if err != nil {
return nil, err
}
newConfig.Architectures = append(newConfig.Architectures, newArch)
}
}
// Convert default action from string representation
newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction))
if err != nil {
return nil, err
}
newConfig.DefaultAction = newDefaultAction
// Loop through all syscall blocks and convert them to libcontainer format
for _, call := range config.Syscalls {
newAction, err := seccomp.ConvertStringToAction(string(call.Action))
if err != nil {
return nil, err
}
for _, name := range call.Names {
newCall := configs.Syscall{
Name: name,
Action: newAction,
Args: []*configs.Arg{},
}
// Loop through all the arguments of the syscall and convert them
for _, arg := range call.Args {
newOp, err := seccomp.ConvertStringToOperator(string(arg.Op))
if err != nil {
return nil, err
}
newArg := configs.Arg{
Index: arg.Index,
Value: arg.Value,
ValueTwo: arg.ValueTwo,
Op: newOp,
}
newCall.Args = append(newCall.Args, &newArg)
}
newConfig.Syscalls = append(newConfig.Syscalls, &newCall)
}
}
return newConfig, nil
}
func createHooks(rspec *specs.Spec, config *configs.Config) {
config.Hooks = &configs.Hooks{}
if rspec.Hooks != nil {
for _, h := range rspec.Hooks.Prestart {
cmd := createCommandHook(h)
config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd))
}
for _, h := range rspec.Hooks.Poststart {
cmd := createCommandHook(h)
config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd))
}
for _, h := range rspec.Hooks.Poststop {
cmd := createCommandHook(h)
config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd))
}
}
}
func createCommandHook(h specs.Hook) configs.Command {
cmd := configs.Command{
Path: h.Path,
Args: h.Args,
Env: h.Env,
}
if h.Timeout != nil {
d := time.Duration(*h.Timeout) * time.Second
cmd.Timeout = &d
}
return cmd
}

View File

@@ -0,0 +1,27 @@
package stacktrace
import "runtime"
// Capture captures a stacktrace for the current calling go program
//
// skip is the number of frames to skip
func Capture(userSkip int) Stacktrace {
var (
skip = userSkip + 1 // add one for our own function
frames []Frame
prevPc uintptr
)
for i := skip; ; i++ {
pc, file, line, ok := runtime.Caller(i)
//detect if caller is repeated to avoid loop, gccgo
//currently runs into a loop without this check
if !ok || pc == prevPc {
break
}
frames = append(frames, NewFrame(pc, file, line))
prevPc = pc
}
return Stacktrace{
Frames: frames,
}
}

View File

@@ -0,0 +1,38 @@
package stacktrace
import (
"path/filepath"
"runtime"
"strings"
)
// NewFrame returns a new stack frame for the provided information
func NewFrame(pc uintptr, file string, line int) Frame {
fn := runtime.FuncForPC(pc)
if fn == nil {
return Frame{}
}
pack, name := parseFunctionName(fn.Name())
return Frame{
Line: line,
File: filepath.Base(file),
Package: pack,
Function: name,
}
}
func parseFunctionName(name string) (string, string) {
i := strings.LastIndex(name, ".")
if i == -1 {
return "", name
}
return name[:i], name[i+1:]
}
// Frame contains all the information for a stack frame within a go program
type Frame struct {
File string
Function string
Package string
Line int
}

View File

@@ -0,0 +1,5 @@
package stacktrace
type Stacktrace struct {
Frames []Frame
}

View File

@@ -0,0 +1,193 @@
// +build linux
package libcontainer
import (
"fmt"
"os"
"os/exec"
"syscall" //only for Exec
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/selinux/go-selinux/label"
"golang.org/x/sys/unix"
)
type linuxStandardInit struct {
pipe *os.File
consoleSocket *os.File
parentPid int
fifoFd int
config *initConfig
}
func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
var newperms uint32
if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
// With user ns we need 'other' search permissions.
newperms = 0x8
} else {
// Without user ns we need 'UID' search permissions.
newperms = 0x80000
}
// Create a unique per session container name that we can join in setns;
// However, other containers can also join it.
return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
}
func (l *linuxStandardInit) Init() error {
if !l.config.Config.NoNewKeyring {
ringname, keepperms, newperms := l.getSessionRingParams()
// Do not inherit the parent's session keyring.
sessKeyId, err := keys.JoinSessionKeyring(ringname)
if err != nil {
return err
}
// Make session keyring searcheable.
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return err
}
}
if err := setupNetwork(l.config); err != nil {
return err
}
if err := setupRoute(l.config.Config); err != nil {
return err
}
label.Init()
if err := prepareRootfs(l.pipe, l.config); err != nil {
return err
}
// Set up the console. This has to be done *before* we finalize the rootfs,
// but *after* we've given the user the chance to set up all of the mounts
// they wanted.
if l.config.CreateConsole {
if err := setupConsole(l.consoleSocket, l.config, true); err != nil {
return err
}
if err := system.Setctty(); err != nil {
return err
}
}
// Finish the rootfs setup.
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := finalizeRootfs(l.config.Config); err != nil {
return err
}
}
if hostname := l.config.Config.Hostname; hostname != "" {
if err := unix.Sethostname([]byte(hostname)); err != nil {
return err
}
}
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
for key, value := range l.config.Config.Sysctl {
if err := writeSystemProperty(key, value); err != nil {
return err
}
}
for _, path := range l.config.Config.ReadonlyPaths {
if err := readonlyPath(path); err != nil {
return err
}
}
for _, path := range l.config.Config.MaskPaths {
if err := maskPath(path); err != nil {
return err
}
}
pdeath, err := system.GetParentDeathSignal()
if err != nil {
return err
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return err
}
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err
}
}
if err := finalizeNamespace(l.config); err != nil {
return err
}
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := pdeath.Restore(); err != nil {
return err
}
// Compare the parent from the initial start of the init process and make
// sure that it did not change. if the parent changes that means it died
// and we were reparented to something else so we should just kill ourself
// and not cause problems for someone else.
if unix.Getppid() != l.parentPid {
return unix.Kill(unix.Getpid(), unix.SIGKILL)
}
// Check for the arg before waiting to make sure it exists and it is
// returned as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// Close the pipe to signal that we have completed our init.
l.pipe.Close()
// Wait for the FIFO to be opened on the other side before exec-ing the
// user process. We open it through /proc/self/fd/$fd, because the fd that
// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
// re-open an O_PATH fd through /proc.
fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
if err != nil {
return newSystemErrorWithCause(err, "open exec fifo")
}
if _, err := unix.Write(fd, []byte("0")); err != nil {
return newSystemErrorWithCause(err, "write 0 exec fifo")
}
// Close the O_PATH fifofd fd before exec because the kernel resets
// dumpable in the wrong order. This has been fixed in newer kernels, but
// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
// N.B. the core issue itself (passing dirfds to the host filesystem) has
// since been resolved.
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
unix.Close(l.fifoFd)
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
}
}
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
}
return nil
}

View File

@@ -0,0 +1,255 @@
// +build linux
package libcontainer
import (
"fmt"
"os"
"path/filepath"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func newStateTransitionError(from, to containerState) error {
return &stateTransitionError{
From: from.status().String(),
To: to.status().String(),
}
}
// stateTransitionError is returned when an invalid state transition happens from one
// state to another.
type stateTransitionError struct {
From string
To string
}
func (s *stateTransitionError) Error() string {
return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To)
}
type containerState interface {
transition(containerState) error
destroy() error
status() Status
}
func destroy(c *linuxContainer) error {
if !c.config.Namespaces.Contains(configs.NEWPID) {
if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
logrus.Warn(err)
}
}
err := c.cgroupManager.Destroy()
if c.intelRdtManager != nil {
if ierr := c.intelRdtManager.Destroy(); err == nil {
err = ierr
}
}
if rerr := os.RemoveAll(c.root); err == nil {
err = rerr
}
c.initProcess = nil
if herr := runPoststopHooks(c); err == nil {
err = herr
}
c.state = &stoppedState{c: c}
return err
}
func runPoststopHooks(c *linuxContainer) error {
if c.config.Hooks != nil {
bundle, annotations := utils.Annotations(c.config.Labels)
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Bundle: bundle,
Annotations: annotations,
}
for _, hook := range c.config.Hooks.Poststop {
if err := hook.Run(s); err != nil {
return err
}
}
}
return nil
}
// stoppedState represents a container is a stopped/destroyed state.
type stoppedState struct {
c *linuxContainer
}
func (b *stoppedState) status() Status {
return Stopped
}
func (b *stoppedState) transition(s containerState) error {
switch s.(type) {
case *runningState, *restoredState:
b.c.state = s
return nil
case *stoppedState:
return nil
}
return newStateTransitionError(b, s)
}
func (b *stoppedState) destroy() error {
return destroy(b.c)
}
// runningState represents a container that is currently running.
type runningState struct {
c *linuxContainer
}
func (r *runningState) status() Status {
return Running
}
func (r *runningState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
t, err := r.c.runType()
if err != nil {
return err
}
if t == Running {
return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
}
r.c.state = s
return nil
case *pausedState:
r.c.state = s
return nil
case *runningState:
return nil
}
return newStateTransitionError(r, s)
}
func (r *runningState) destroy() error {
t, err := r.c.runType()
if err != nil {
return err
}
if t == Running {
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
}
return destroy(r.c)
}
type createdState struct {
c *linuxContainer
}
func (i *createdState) status() Status {
return Created
}
func (i *createdState) transition(s containerState) error {
switch s.(type) {
case *runningState, *pausedState, *stoppedState:
i.c.state = s
return nil
case *createdState:
return nil
}
return newStateTransitionError(i, s)
}
func (i *createdState) destroy() error {
i.c.initProcess.signal(unix.SIGKILL)
return destroy(i.c)
}
// pausedState represents a container that is currently pause. It cannot be destroyed in a
// paused state and must transition back to running first.
type pausedState struct {
c *linuxContainer
}
func (p *pausedState) status() Status {
return Paused
}
func (p *pausedState) transition(s containerState) error {
switch s.(type) {
case *runningState, *stoppedState:
p.c.state = s
return nil
case *pausedState:
return nil
}
return newStateTransitionError(p, s)
}
func (p *pausedState) destroy() error {
t, err := p.c.runType()
if err != nil {
return err
}
if t != Running && t != Created {
if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
return destroy(p.c)
}
return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
}
// restoredState is the same as the running state but also has associated checkpoint
// information that maybe need destroyed when the container is stopped and destroy is called.
type restoredState struct {
imageDir string
c *linuxContainer
}
func (r *restoredState) status() Status {
return Running
}
func (r *restoredState) transition(s containerState) error {
switch s.(type) {
case *stoppedState, *runningState:
return nil
}
return newStateTransitionError(r, s)
}
func (r *restoredState) destroy() error {
if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil {
if !os.IsNotExist(err) {
return err
}
}
return destroy(r.c)
}
// loadedState is used whenever a container is restored, loaded, or setting additional
// processes inside and it should not be destroyed when it is exiting.
type loadedState struct {
c *linuxContainer
s Status
}
func (n *loadedState) status() Status {
return n.s
}
func (n *loadedState) transition(s containerState) error {
n.c.state = s
return nil
}
func (n *loadedState) destroy() error {
if err := n.c.refreshState(); err != nil {
return err
}
return n.c.state.destroy()
}

View File

@@ -0,0 +1,15 @@
package libcontainer
type NetworkInterface struct {
// Name is the name of the network interface.
Name string
RxBytes uint64
RxPackets uint64
RxErrors uint64
RxDropped uint64
TxBytes uint64
TxPackets uint64
TxErrors uint64
TxDropped uint64
}

View File

@@ -0,0 +1,10 @@
package libcontainer
import "github.com/opencontainers/runc/libcontainer/cgroups"
import "github.com/opencontainers/runc/libcontainer/intelrdt"
type Stats struct {
Interfaces []*NetworkInterface
CgroupStats *cgroups.Stats
IntelRdtStats *intelrdt.Stats
}

View File

@@ -0,0 +1,107 @@
package libcontainer
import (
"encoding/json"
"fmt"
"io"
"github.com/opencontainers/runc/libcontainer/utils"
)
type syncType string
// Constants that are used for synchronisation between the parent and child
// during container setup. They come in pairs (with procError being a generic
// response which is followed by a &genericError).
//
// [ child ] <-> [ parent ]
//
// procHooks --> [run hooks]
// <-- procResume
//
// procConsole -->
// <-- procConsoleReq
// [send(fd)] --> [recv(fd)]
// <-- procConsoleAck
//
// procReady --> [final setup]
// <-- procRun
const (
procError syncType = "procError"
procReady syncType = "procReady"
procRun syncType = "procRun"
procHooks syncType = "procHooks"
procResume syncType = "procResume"
)
type syncT struct {
Type syncType `json:"type"`
}
// writeSync is used to write to a synchronisation pipe. An error is returned
// if there was a problem writing the payload.
func writeSync(pipe io.Writer, sync syncType) error {
if err := utils.WriteJSON(pipe, syncT{sync}); err != nil {
return err
}
return nil
}
// readSync is used to read from a synchronisation pipe. An error is returned
// if we got a genericError, the pipe was closed, or we got an unexpected flag.
func readSync(pipe io.Reader, expected syncType) error {
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type == procError {
var ierr genericError
if err := json.NewDecoder(pipe).Decode(&ierr); err != nil {
return fmt.Errorf("failed reading error from parent: %v", err)
}
return &ierr
}
if procSync.Type != expected {
return fmt.Errorf("invalid synchronisation flag from parent")
}
}
return nil
}
// parseSync runs the given callback function on each syncT received from the
// child. It will return once io.EOF is returned from the given pipe.
func parseSync(pipe io.Reader, fn func(*syncT) error) error {
dec := json.NewDecoder(pipe)
for {
var sync syncT
if err := dec.Decode(&sync); err != nil {
if err == io.EOF {
break
}
return err
}
// We handle this case outside fn for cleanliness reasons.
var ierr *genericError
if sync.Type == procError {
if err := dec.Decode(&ierr); err != nil && err != io.EOF {
return newSystemErrorWithCause(err, "decoding proc error from init")
}
if ierr != nil {
return ierr
}
// Programmer error.
panic("No error following JSON procError payload.")
}
if err := fn(&sync); err != nil {
return err
}
}
return nil
}

View File

@@ -7,10 +7,25 @@ import (
"fmt"
"os"
"os/exec"
"syscall"
"syscall" // only for exec
"unsafe"
"golang.org/x/sys/unix"
)
// If arg2 is nonzero, set the "child subreaper" attribute of the
// calling process; if arg2 is zero, unset the attribute. When a
// process is marked as a child subreaper, all of the children
// that it creates, and their descendants, will be marked as
// having a subreaper. In effect, a subreaper fulfills the role
// of init(1) for its descendant processes. Upon termination of
// a process that is orphaned (i.e., its immediate parent has
// already terminated) and marked as having a subreaper, the
// nearest still living ancestor subreaper will receive a SIGCHLD
// signal and be able to wait(2) on the process to discover its
// termination status.
const PR_SET_CHILD_SUBREAPER = 36
type ParentDeathSignal int
func (p ParentDeathSignal) Restore() error {
@@ -40,8 +55,16 @@ func Execv(cmd string, args []string, env []string) error {
return syscall.Exec(name, args, env)
}
func Prlimit(pid, resource int, limit unix.Rlimit) error {
_, _, err := unix.RawSyscall6(unix.SYS_PRLIMIT64, uintptr(pid), uintptr(resource), uintptr(unsafe.Pointer(&limit)), uintptr(unsafe.Pointer(&limit)), 0, 0)
if err != 0 {
return err
}
return nil
}
func SetParentDeathSignal(sig uintptr) error {
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, sig, 0); err != 0 {
if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
return err
}
return nil
@@ -49,15 +72,14 @@ func SetParentDeathSignal(sig uintptr) error {
func GetParentDeathSignal() (ParentDeathSignal, error) {
var sig int
_, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0)
if err != 0 {
if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil {
return -1, err
}
return ParentDeathSignal(sig), nil
}
func SetKeepCaps() error {
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_KEEPCAPS, 1, 0); err != 0 {
if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0); err != nil {
return err
}
@@ -65,7 +87,7 @@ func SetKeepCaps() error {
}
func ClearKeepCaps() error {
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_KEEPCAPS, 0, 0); err != 0 {
if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0); err != nil {
return err
}
@@ -73,23 +95,18 @@ func ClearKeepCaps() error {
}
func Setctty() error {
if _, _, err := syscall.RawSyscall(syscall.SYS_IOCTL, 0, uintptr(syscall.TIOCSCTTY), 0); err != 0 {
if err := unix.IoctlSetInt(0, unix.TIOCSCTTY, 0); err != nil {
return err
}
return nil
}
/*
* Detect whether we are currently running in a user namespace.
* Copied from github.com/lxc/lxd/shared/util.go
*/
// RunningInUserNS detects whether we are currently running in a user namespace.
// Copied from github.com/lxc/lxd/shared/util.go
func RunningInUserNS() bool {
file, err := os.Open("/proc/self/uid_map")
if err != nil {
/*
* This kernel-provided file only exists if user namespaces are
* supported
*/
// This kernel-provided file only exists if user namespaces are supported
return false
}
defer file.Close()
@@ -112,3 +129,19 @@ func RunningInUserNS() bool {
}
return true
}
// SetSubreaper sets the value i as the subreaper setting for the calling process
func SetSubreaper(i int) error {
return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
}
// GetSubreaper returns the subreaper setting for the calling process
func GetSubreaper() (int, error) {
var i uintptr
if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
return -1, err
}
return int(i), nil
}

View File

@@ -1,27 +1,113 @@
package system
import (
"fmt"
"io/ioutil"
"path/filepath"
"strconv"
"strings"
)
// look in /proc to find the process start time so that we can verify
// that this pid has started after ourself
// State is the status of a process.
type State rune
const ( // Only values for Linux 3.14 and later are listed here
Dead State = 'X'
DiskSleep State = 'D'
Running State = 'R'
Sleeping State = 'S'
Stopped State = 'T'
TracingStop State = 't'
Zombie State = 'Z'
)
// String forms of the state from proc(5)'s documentation for
// /proc/[pid]/status' "State" field.
func (s State) String() string {
switch s {
case Dead:
return "dead"
case DiskSleep:
return "disk sleep"
case Running:
return "running"
case Sleeping:
return "sleeping"
case Stopped:
return "stopped"
case TracingStop:
return "tracing stop"
case Zombie:
return "zombie"
default:
return fmt.Sprintf("unknown (%c)", s)
}
}
// Stat_t represents the information from /proc/[pid]/stat, as
// described in proc(5) with names based on the /proc/[pid]/status
// fields.
type Stat_t struct {
// PID is the process ID.
PID uint
// Name is the command run by the process.
Name string
// State is the state of the process.
State State
// StartTime is the number of clock ticks after system boot (since
// Linux 2.6).
StartTime uint64
}
// Stat returns a Stat_t instance for the specified process.
func Stat(pid int) (stat Stat_t, err error) {
bytes, err := ioutil.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat"))
if err != nil {
return stat, err
}
return parseStat(string(bytes))
}
// GetProcessStartTime is deprecated. Use Stat(pid) and
// Stat_t.StartTime instead.
func GetProcessStartTime(pid int) (string, error) {
data, err := ioutil.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat"))
stat, err := Stat(pid)
if err != nil {
return "", err
}
parts := strings.Split(string(data), " ")
// the starttime is located at pos 22
// from the man page
//
// starttime %llu (was %lu before Linux 2.6)
// (22) The time the process started after system boot. In kernels before Linux 2.6, this
// value was expressed in jiffies. Since Linux 2.6, the value is expressed in clock ticks
// (divide by sysconf(_SC_CLK_TCK)).
return parts[22-1], nil // starts at 1
return fmt.Sprintf("%d", stat.StartTime), nil
}
func parseStat(data string) (stat Stat_t, err error) {
// From proc(5), field 2 could contain space and is inside `(` and `)`.
// The following is an example:
// 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
i := strings.LastIndex(data, ")")
if i <= 2 || i >= len(data)-1 {
return stat, fmt.Errorf("invalid stat data: %q", data)
}
parts := strings.SplitN(data[:i], "(", 2)
if len(parts) != 2 {
return stat, fmt.Errorf("invalid stat data: %q", data)
}
stat.Name = parts[1]
_, err = fmt.Sscanf(parts[0], "%d", &stat.PID)
if err != nil {
return stat, err
}
// parts indexes should be offset by 3 from the field number given
// proc(5), because parts is zero-indexed and we've removed fields
// one (PID) and two (Name) in the paren-split.
parts = strings.Split(data[i+2:], " ")
var state int
fmt.Sscanf(parts[3-3], "%c", &state)
stat.State = State(state)
fmt.Sscanf(parts[22-3], "%d", &stat.StartTime)
return stat, nil
}

View File

@@ -1,40 +0,0 @@
package system
import (
"fmt"
"runtime"
"syscall"
)
// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092
//
// We need different setns values for the different platforms and arch
// We are declaring the macro here because the SETNS syscall does not exist in th stdlib
var setNsMap = map[string]uintptr{
"linux/386": 346,
"linux/arm64": 268,
"linux/amd64": 308,
"linux/arm": 375,
"linux/ppc": 350,
"linux/ppc64": 350,
"linux/ppc64le": 350,
"linux/s390x": 339,
}
var sysSetns = setNsMap[fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH)]
func SysSetns() uint32 {
return uint32(sysSetns)
}
func Setns(fd uintptr, flags uintptr) error {
ns, exists := setNsMap[fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH)]
if !exists {
return fmt.Errorf("unsupported platform %s/%s", runtime.GOOS, runtime.GOARCH)
}
_, _, err := syscall.RawSyscall(ns, fd, flags, 0)
if err != 0 {
return err
}
return nil
}

View File

@@ -1,14 +1,15 @@
// +build linux,arm
// +build linux
// +build 386 arm
package system
import (
"syscall"
"golang.org/x/sys/unix"
)
// Setuid sets the uid of the calling thread to the specified uid.
func Setuid(uid int) (err error) {
_, _, e1 := syscall.RawSyscall(syscall.SYS_SETUID32, uintptr(uid), 0, 0)
_, _, e1 := unix.RawSyscall(unix.SYS_SETUID32, uintptr(uid), 0, 0)
if e1 != 0 {
err = e1
}
@@ -17,7 +18,7 @@ func Setuid(uid int) (err error) {
// Setgid sets the gid of the calling thread to the specified gid.
func Setgid(gid int) (err error) {
_, _, e1 := syscall.RawSyscall(syscall.SYS_SETGID32, uintptr(gid), 0, 0)
_, _, e1 := unix.RawSyscall(unix.SYS_SETGID32, uintptr(gid), 0, 0)
if e1 != 0 {
err = e1
}

View File

@@ -1,25 +0,0 @@
// +build linux,386
package system
import (
"syscall"
)
// Setuid sets the uid of the calling thread to the specified uid.
func Setuid(uid int) (err error) {
_, _, e1 := syscall.RawSyscall(syscall.SYS_SETUID, uintptr(uid), 0, 0)
if e1 != 0 {
err = e1
}
return
}
// Setgid sets the gid of the calling thread to the specified gid.
func Setgid(gid int) (err error) {
_, _, e1 := syscall.RawSyscall(syscall.SYS_SETGID32, uintptr(gid), 0, 0)
if e1 != 0 {
err = e1
}
return
}

Some files were not shown because too many files have changed in this diff Show More