container_linux.go 65 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271
  1. package libcontainer
  2. import (
  3. "bytes"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "net"
  9. "os"
  10. "os/exec"
  11. "path"
  12. "path/filepath"
  13. "reflect"
  14. "strconv"
  15. "strings"
  16. "sync"
  17. "time"
  18. "github.com/checkpoint-restore/go-criu/v5"
  19. criurpc "github.com/checkpoint-restore/go-criu/v5/rpc"
  20. securejoin "github.com/cyphar/filepath-securejoin"
  21. "github.com/opencontainers/runtime-spec/specs-go"
  22. "github.com/sirupsen/logrus"
  23. "github.com/vishvananda/netlink/nl"
  24. "golang.org/x/sys/unix"
  25. "google.golang.org/protobuf/proto"
  26. "github.com/opencontainers/runc/libcontainer/cgroups"
  27. "github.com/opencontainers/runc/libcontainer/configs"
  28. "github.com/opencontainers/runc/libcontainer/intelrdt"
  29. "github.com/opencontainers/runc/libcontainer/system"
  30. "github.com/opencontainers/runc/libcontainer/utils"
  31. )
// stdioFdCount is the number of standard I/O descriptors (stdin, stdout and
// stderr). It is the base offset added to an index in exec.Cmd.ExtraFiles to
// obtain the descriptor number that file is advertised under in the child's
// environment.
const stdioFdCount = 3
// linuxContainer is the receiver type for the container methods defined in
// this file.
//
// The mutex m is taken by methods such as Status, State, OCIState, Set, Start,
// Exec, Signal, Destroy, Pause and Resume before they touch the container.
type linuxContainer struct {
	// id is the container's unique ID, as returned by ID.
	id string
	// root is the directory that holds the exec fifo; it is also exported to
	// child processes as _LIBCONTAINER_STATEDIR.
	root string
	// config is the current configuration; Set replaces it on success.
	config *configs.Config
	cgroupManager cgroups.Manager
	// intelRdtManager may be nil; users check for nil before calling it.
	intelRdtManager intelrdt.Manager
	// initPath is the executable run for every container process and initArgs
	// its argument vector (initArgs[0] becomes the child's argv[0]); see
	// commandTemplate.
	initPath string
	initArgs []string
	// initProcess is assigned by newInitProcess.
	initProcess parentProcess
	initProcessStartTime uint64
	criuPath string
	newuidmapPath string
	newgidmapPath string
	m sync.Mutex
	criuVersion int
	// state is the object Destroy, Pause and Resume delegate to.
	state containerState
	created time.Time
	// fifo is the O_PATH handle on the exec fifo opened by includeExecFifo and
	// closed again by start.
	fifo *os.File
}
// State represents a running container's state.
//
// It embeds the platform-independent BaseState and adds the Linux-specific
// fields below. The struct tags define the JSON field names.
type State struct {
	BaseState

	// Platform specific fields below here

	// Rootless is specified if the container was started under the rootless mode.
	// Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups
	Rootless bool `json:"rootless"`

	// CgroupPaths holds the paths to all the container's cgroups, as returned
	// by (*cgroups.Manager).GetPaths.
	//
	// For cgroup v1, a key is the cgroup subsystem name, and the value is the
	// path to the cgroup for this subsystem.
	//
	// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
	CgroupPaths map[string]string `json:"cgroup_paths"`

	// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
	// with the value as the path.
	NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`

	// ExternalDescriptors are the container's standard descriptors
	// (std{in,out,err}), needed for checkpoint and restore.
	ExternalDescriptors []string `json:"external_descriptors,omitempty"`

	// IntelRdtPath is the Intel RDT "resource control" filesystem path.
	IntelRdtPath string `json:"intel_rdt_path"`
}
// Container is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found.
type Container interface {
	BaseContainer

	// Methods below here are platform specific

	// Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
	Checkpoint(criuOpts *CriuOpts) error

	// Restore restores the checkpointed container to a running state using the criu(8) utility.
	Restore(process *Process, criuOpts *CriuOpts) error

	// Pause: if the Container state is RUNNING or CREATED, sets the Container
	// state to PAUSING and pauses the execution of any user processes.
	// Asynchronously, when the container has finished being paused, the state
	// is changed to PAUSED.
	// If the Container state is PAUSED, do nothing.
	//
	// NOTE(review): the linuxContainer implementation in this file returns
	// ErrNotRunning for every status other than Running or Created, including
	// Paused; confirm which behavior is intended.
	Pause() error

	// Resume: if the Container state is PAUSED, resumes the execution of any
	// user processes in the Container before setting the Container state to
	// RUNNING.
	// If the Container state is RUNNING, do nothing.
	//
	// NOTE(review): the linuxContainer implementation in this file returns
	// ErrNotPaused whenever the status is not Paused, including Running;
	// confirm which behavior is intended.
	Resume() error

	// NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
	NotifyOOM() (<-chan struct{}, error)

	// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
	NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
}
// ID returns the container's unique ID.
func (c *linuxContainer) ID() string {
	return c.id
}
// Config returns the container's configuration.
//
// The configuration struct is returned by value, so the caller receives its
// own copy of the top-level struct; anything the struct refers to through
// pointers, slices or maps is still shared with the container.
func (c *linuxContainer) Config() configs.Config {
	return *c.config
}
  108. func (c *linuxContainer) Status() (Status, error) {
  109. c.m.Lock()
  110. defer c.m.Unlock()
  111. return c.currentStatus()
  112. }
  113. func (c *linuxContainer) State() (*State, error) {
  114. c.m.Lock()
  115. defer c.m.Unlock()
  116. return c.currentState()
  117. }
  118. func (c *linuxContainer) OCIState() (*specs.State, error) {
  119. c.m.Lock()
  120. defer c.m.Unlock()
  121. return c.currentOCIState()
  122. }
  123. func (c *linuxContainer) Processes() ([]int, error) {
  124. var pids []int
  125. status, err := c.currentStatus()
  126. if err != nil {
  127. return pids, err
  128. }
  129. // for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited
  130. if status == Stopped && !c.cgroupManager.Exists() {
  131. return pids, nil
  132. }
  133. pids, err = c.cgroupManager.GetAllPids()
  134. if err != nil {
  135. return nil, fmt.Errorf("unable to get all container pids: %w", err)
  136. }
  137. return pids, nil
  138. }
  139. func (c *linuxContainer) Stats() (*Stats, error) {
  140. var (
  141. err error
  142. stats = &Stats{}
  143. )
  144. if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
  145. return stats, fmt.Errorf("unable to get container cgroup stats: %w", err)
  146. }
  147. if c.intelRdtManager != nil {
  148. if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil {
  149. return stats, fmt.Errorf("unable to get container Intel RDT stats: %w", err)
  150. }
  151. }
  152. for _, iface := range c.config.Networks {
  153. switch iface.Type {
  154. case "veth":
  155. istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
  156. if err != nil {
  157. return stats, fmt.Errorf("unable to get network stats for interface %q: %w", iface.HostInterfaceName, err)
  158. }
  159. stats.Interfaces = append(stats.Interfaces, istats)
  160. }
  161. }
  162. return stats, nil
  163. }
  164. func (c *linuxContainer) Set(config configs.Config) error {
  165. c.m.Lock()
  166. defer c.m.Unlock()
  167. status, err := c.currentStatus()
  168. if err != nil {
  169. return err
  170. }
  171. if status == Stopped {
  172. return ErrNotRunning
  173. }
  174. if err := c.cgroupManager.Set(config.Cgroups.Resources); err != nil {
  175. // Set configs back
  176. if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
  177. logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
  178. }
  179. return err
  180. }
  181. if c.intelRdtManager != nil {
  182. if err := c.intelRdtManager.Set(&config); err != nil {
  183. // Set configs back
  184. if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
  185. logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
  186. }
  187. if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
  188. logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
  189. }
  190. return err
  191. }
  192. }
  193. // After config setting succeed, update config and states
  194. c.config = &config
  195. _, err = c.updateState(nil)
  196. return err
  197. }
  198. func (c *linuxContainer) Start(process *Process) error {
  199. c.m.Lock()
  200. defer c.m.Unlock()
  201. if c.config.Cgroups.Resources.SkipDevices {
  202. return errors.New("can't start container with SkipDevices set")
  203. }
  204. if process.Init {
  205. if err := c.createExecFifo(); err != nil {
  206. return err
  207. }
  208. }
  209. if err := c.start(process); err != nil {
  210. if process.Init {
  211. c.deleteExecFifo()
  212. }
  213. return err
  214. }
  215. return nil
  216. }
  217. func (c *linuxContainer) Run(process *Process) error {
  218. if err := c.Start(process); err != nil {
  219. return err
  220. }
  221. if process.Init {
  222. return c.exec()
  223. }
  224. return nil
  225. }
  226. func (c *linuxContainer) Exec() error {
  227. c.m.Lock()
  228. defer c.m.Unlock()
  229. return c.exec()
  230. }
  231. func (c *linuxContainer) exec() error {
  232. path := filepath.Join(c.root, execFifoFilename)
  233. pid := c.initProcess.pid()
  234. blockingFifoOpenCh := awaitFifoOpen(path)
  235. for {
  236. select {
  237. case result := <-blockingFifoOpenCh:
  238. return handleFifoResult(result)
  239. case <-time.After(time.Millisecond * 100):
  240. stat, err := system.Stat(pid)
  241. if err != nil || stat.State == system.Zombie {
  242. // could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check.
  243. // see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete).
  244. if err := handleFifoResult(fifoOpen(path, false)); err != nil {
  245. return errors.New("container process is already dead")
  246. }
  247. return nil
  248. }
  249. }
  250. }
  251. }
  252. func readFromExecFifo(execFifo io.Reader) error {
  253. data, err := io.ReadAll(execFifo)
  254. if err != nil {
  255. return err
  256. }
  257. if len(data) <= 0 {
  258. return errors.New("cannot start an already running container")
  259. }
  260. return nil
  261. }
  262. func awaitFifoOpen(path string) <-chan openResult {
  263. fifoOpened := make(chan openResult)
  264. go func() {
  265. result := fifoOpen(path, true)
  266. fifoOpened <- result
  267. }()
  268. return fifoOpened
  269. }
  270. func fifoOpen(path string, block bool) openResult {
  271. flags := os.O_RDONLY
  272. if !block {
  273. flags |= unix.O_NONBLOCK
  274. }
  275. f, err := os.OpenFile(path, flags, 0)
  276. if err != nil {
  277. return openResult{err: fmt.Errorf("exec fifo: %w", err)}
  278. }
  279. return openResult{file: f}
  280. }
  281. func handleFifoResult(result openResult) error {
  282. if result.err != nil {
  283. return result.err
  284. }
  285. f := result.file
  286. defer f.Close()
  287. if err := readFromExecFifo(f); err != nil {
  288. return err
  289. }
  290. return os.Remove(f.Name())
  291. }
// openResult is the outcome of an attempt to open the exec fifo, as produced
// by fifoOpen: file is set on success, err on failure.
type openResult struct {
	file *os.File
	err error
}
  296. func (c *linuxContainer) start(process *Process) (retErr error) {
  297. parent, err := c.newParentProcess(process)
  298. if err != nil {
  299. return fmt.Errorf("unable to create new parent process: %w", err)
  300. }
  301. logsDone := parent.forwardChildLogs()
  302. if logsDone != nil {
  303. defer func() {
  304. // Wait for log forwarder to finish. This depends on
  305. // runc init closing the _LIBCONTAINER_LOGPIPE log fd.
  306. err := <-logsDone
  307. if err != nil && retErr == nil {
  308. retErr = fmt.Errorf("unable to forward init logs: %w", err)
  309. }
  310. }()
  311. }
  312. if err := parent.start(); err != nil {
  313. return fmt.Errorf("unable to start container process: %w", err)
  314. }
  315. if process.Init {
  316. c.fifo.Close()
  317. if c.config.Hooks != nil {
  318. s, err := c.currentOCIState()
  319. if err != nil {
  320. return err
  321. }
  322. if err := c.config.Hooks[configs.Poststart].RunHooks(s); err != nil {
  323. if err := ignoreTerminateErrors(parent.terminate()); err != nil {
  324. logrus.Warn(fmt.Errorf("error running poststart hook: %w", err))
  325. }
  326. return err
  327. }
  328. }
  329. }
  330. return nil
  331. }
  332. func (c *linuxContainer) Signal(s os.Signal, all bool) error {
  333. c.m.Lock()
  334. defer c.m.Unlock()
  335. status, err := c.currentStatus()
  336. if err != nil {
  337. return err
  338. }
  339. if all {
  340. // for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited
  341. if status == Stopped && !c.cgroupManager.Exists() {
  342. return nil
  343. }
  344. return signalAllProcesses(c.cgroupManager, s)
  345. }
  346. // to avoid a PID reuse attack
  347. if status == Running || status == Created || status == Paused {
  348. if err := c.initProcess.signal(s); err != nil {
  349. return fmt.Errorf("unable to signal init: %w", err)
  350. }
  351. if status == Paused {
  352. // For cgroup v1, killing a process in a frozen cgroup
  353. // does nothing until it's thawed. Only thaw the cgroup
  354. // for SIGKILL.
  355. if s, ok := s.(unix.Signal); ok && s == unix.SIGKILL {
  356. _ = c.cgroupManager.Freeze(configs.Thawed)
  357. }
  358. }
  359. return nil
  360. }
  361. return ErrNotRunning
  362. }
  363. func (c *linuxContainer) createExecFifo() error {
  364. rootuid, err := c.Config().HostRootUID()
  365. if err != nil {
  366. return err
  367. }
  368. rootgid, err := c.Config().HostRootGID()
  369. if err != nil {
  370. return err
  371. }
  372. fifoName := filepath.Join(c.root, execFifoFilename)
  373. if _, err := os.Stat(fifoName); err == nil {
  374. return fmt.Errorf("exec fifo %s already exists", fifoName)
  375. }
  376. oldMask := unix.Umask(0o000)
  377. if err := unix.Mkfifo(fifoName, 0o622); err != nil {
  378. unix.Umask(oldMask)
  379. return err
  380. }
  381. unix.Umask(oldMask)
  382. return os.Chown(fifoName, rootuid, rootgid)
  383. }
  384. func (c *linuxContainer) deleteExecFifo() {
  385. fifoName := filepath.Join(c.root, execFifoFilename)
  386. os.Remove(fifoName)
  387. }
  388. // includeExecFifo opens the container's execfifo as a pathfd, so that the
  389. // container cannot access the statedir (and the FIFO itself remains
  390. // un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited
  391. // fd, with _LIBCONTAINER_FIFOFD set to its fd number.
  392. func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
  393. fifoName := filepath.Join(c.root, execFifoFilename)
  394. fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
  395. if err != nil {
  396. return err
  397. }
  398. c.fifo = fifo
  399. cmd.ExtraFiles = append(cmd.ExtraFiles, fifo)
  400. cmd.Env = append(cmd.Env,
  401. "_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
  402. return nil
  403. }
  404. func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
  405. parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
  406. if err != nil {
  407. return nil, fmt.Errorf("unable to create init pipe: %w", err)
  408. }
  409. messageSockPair := filePair{parentInitPipe, childInitPipe}
  410. parentLogPipe, childLogPipe, err := os.Pipe()
  411. if err != nil {
  412. return nil, fmt.Errorf("unable to create log pipe: %w", err)
  413. }
  414. logFilePair := filePair{parentLogPipe, childLogPipe}
  415. cmd := c.commandTemplate(p, childInitPipe, childLogPipe)
  416. if !p.Init {
  417. return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair)
  418. }
  419. // We only set up fifoFd if we're not doing a `runc exec`. The historic
  420. // reason for this is that previously we would pass a dirfd that allowed
  421. // for container rootfs escape (and not doing it in `runc exec` avoided
  422. // that problem), but we no longer do that. However, there's no need to do
  423. // this for `runc exec` so we just keep it this way to be safe.
  424. if err := c.includeExecFifo(cmd); err != nil {
  425. return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
  426. }
  427. return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
  428. }
  429. func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd {
  430. cmd := exec.Command(c.initPath, c.initArgs[1:]...)
  431. cmd.Args[0] = c.initArgs[0]
  432. cmd.Stdin = p.Stdin
  433. cmd.Stdout = p.Stdout
  434. cmd.Stderr = p.Stderr
  435. cmd.Dir = c.config.Rootfs
  436. if cmd.SysProcAttr == nil {
  437. cmd.SysProcAttr = &unix.SysProcAttr{}
  438. }
  439. cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS"))
  440. cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
  441. if p.ConsoleSocket != nil {
  442. cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
  443. cmd.Env = append(cmd.Env,
  444. "_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
  445. )
  446. }
  447. cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe)
  448. cmd.Env = append(cmd.Env,
  449. "_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
  450. "_LIBCONTAINER_STATEDIR="+c.root,
  451. )
  452. cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
  453. cmd.Env = append(cmd.Env,
  454. "_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
  455. "_LIBCONTAINER_LOGLEVEL="+p.LogLevel,
  456. )
  457. // NOTE: when running a container with no PID namespace and the parent process spawning the container is
  458. // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
  459. // even with the parent still running.
  460. if c.config.ParentDeathSignal > 0 {
  461. cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal)
  462. }
  463. return cmd
  464. }
  465. // shouldSendMountSources says whether the child process must setup bind mounts with
  466. // the source pre-opened (O_PATH) in the host user namespace.
  467. // See https://github.com/opencontainers/runc/issues/2484
  468. func (c *linuxContainer) shouldSendMountSources() bool {
  469. // Passing the mount sources via SCM_RIGHTS is only necessary when
  470. // both userns and mntns are active.
  471. if !c.config.Namespaces.Contains(configs.NEWUSER) ||
  472. !c.config.Namespaces.Contains(configs.NEWNS) {
  473. return false
  474. }
  475. // nsexec.c send_mountsources() requires setns(mntns) capabilities
  476. // CAP_SYS_CHROOT and CAP_SYS_ADMIN.
  477. if c.config.RootlessEUID {
  478. return false
  479. }
  480. // We need to send sources if there are bind-mounts.
  481. for _, m := range c.config.Mounts {
  482. if m.IsBind() {
  483. return true
  484. }
  485. }
  486. return false
  487. }
  488. func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
  489. cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
  490. nsMaps := make(map[configs.NamespaceType]string)
  491. for _, ns := range c.config.Namespaces {
  492. if ns.Path != "" {
  493. nsMaps[ns.Type] = ns.Path
  494. }
  495. }
  496. _, sharePidns := nsMaps[configs.NEWPID]
  497. data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
  498. if err != nil {
  499. return nil, err
  500. }
  501. if c.shouldSendMountSources() {
  502. // Elements on this slice will be paired with mounts (see StartInitialization() and
  503. // prepareRootfs()). This slice MUST have the same size as c.config.Mounts.
  504. mountFds := make([]int, len(c.config.Mounts))
  505. for i, m := range c.config.Mounts {
  506. if !m.IsBind() {
  507. // Non bind-mounts do not use an fd.
  508. mountFds[i] = -1
  509. continue
  510. }
  511. // The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need
  512. // to allocate a fd so that we know the number to pass in the environment variable. The fd
  513. // must not be closed before cmd.Start(), so we reuse messageSockPair.child because the
  514. // lifecycle of that fd is already taken care of.
  515. cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child)
  516. mountFds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1
  517. }
  518. mountFdsJson, err := json.Marshal(mountFds)
  519. if err != nil {
  520. return nil, fmt.Errorf("Error creating _LIBCONTAINER_MOUNT_FDS: %w", err)
  521. }
  522. cmd.Env = append(cmd.Env,
  523. "_LIBCONTAINER_MOUNT_FDS="+string(mountFdsJson),
  524. )
  525. }
  526. init := &initProcess{
  527. cmd: cmd,
  528. messageSockPair: messageSockPair,
  529. logFilePair: logFilePair,
  530. manager: c.cgroupManager,
  531. intelRdtManager: c.intelRdtManager,
  532. config: c.newInitConfig(p),
  533. container: c,
  534. process: p,
  535. bootstrapData: data,
  536. sharePidns: sharePidns,
  537. }
  538. c.initProcess = init
  539. return init, nil
  540. }
  541. func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) {
  542. cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
  543. state, err := c.currentState()
  544. if err != nil {
  545. return nil, fmt.Errorf("unable to get container state: %w", err)
  546. }
  547. // for setns process, we don't have to set cloneflags as the process namespaces
  548. // will only be set via setns syscall
  549. data, err := c.bootstrapData(0, state.NamespacePaths, initSetns)
  550. if err != nil {
  551. return nil, err
  552. }
  553. proc := &setnsProcess{
  554. cmd: cmd,
  555. cgroupPaths: state.CgroupPaths,
  556. rootlessCgroups: c.config.RootlessCgroups,
  557. intelRdtPath: state.IntelRdtPath,
  558. messageSockPair: messageSockPair,
  559. logFilePair: logFilePair,
  560. manager: c.cgroupManager,
  561. config: c.newInitConfig(p),
  562. process: p,
  563. bootstrapData: data,
  564. initProcessPid: state.InitProcessPid,
  565. }
  566. if len(p.SubCgroupPaths) > 0 {
  567. if add, ok := p.SubCgroupPaths[""]; ok {
  568. // cgroup v1: using the same path for all controllers.
  569. // cgroup v2: the only possible way.
  570. for k := range proc.cgroupPaths {
  571. subPath := path.Join(proc.cgroupPaths[k], add)
  572. if !strings.HasPrefix(subPath, proc.cgroupPaths[k]) {
  573. return nil, fmt.Errorf("%s is not a sub cgroup path", add)
  574. }
  575. proc.cgroupPaths[k] = subPath
  576. }
  577. // cgroup v2: do not try to join init process's cgroup
  578. // as a fallback (see (*setnsProcess).start).
  579. proc.initProcessPid = 0
  580. } else {
  581. // Per-controller paths.
  582. for ctrl, add := range p.SubCgroupPaths {
  583. if val, ok := proc.cgroupPaths[ctrl]; ok {
  584. subPath := path.Join(val, add)
  585. if !strings.HasPrefix(subPath, val) {
  586. return nil, fmt.Errorf("%s is not a sub cgroup path", add)
  587. }
  588. proc.cgroupPaths[ctrl] = subPath
  589. } else {
  590. return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl)
  591. }
  592. }
  593. }
  594. }
  595. return proc, nil
  596. }
  597. func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
  598. cfg := &initConfig{
  599. Config: c.config,
  600. Args: process.Args,
  601. Env: process.Env,
  602. User: process.User,
  603. AdditionalGroups: process.AdditionalGroups,
  604. Cwd: process.Cwd,
  605. Capabilities: process.Capabilities,
  606. PassedFilesCount: len(process.ExtraFiles),
  607. ContainerId: c.ID(),
  608. NoNewPrivileges: c.config.NoNewPrivileges,
  609. RootlessEUID: c.config.RootlessEUID,
  610. RootlessCgroups: c.config.RootlessCgroups,
  611. AppArmorProfile: c.config.AppArmorProfile,
  612. ProcessLabel: c.config.ProcessLabel,
  613. Rlimits: c.config.Rlimits,
  614. CreateConsole: process.ConsoleSocket != nil,
  615. ConsoleWidth: process.ConsoleWidth,
  616. ConsoleHeight: process.ConsoleHeight,
  617. }
  618. if process.NoNewPrivileges != nil {
  619. cfg.NoNewPrivileges = *process.NoNewPrivileges
  620. }
  621. if process.AppArmorProfile != "" {
  622. cfg.AppArmorProfile = process.AppArmorProfile
  623. }
  624. if process.Label != "" {
  625. cfg.ProcessLabel = process.Label
  626. }
  627. if len(process.Rlimits) > 0 {
  628. cfg.Rlimits = process.Rlimits
  629. }
  630. if cgroups.IsCgroup2UnifiedMode() {
  631. cfg.Cgroup2Path = c.cgroupManager.Path("")
  632. }
  633. return cfg
  634. }
  635. func (c *linuxContainer) Destroy() error {
  636. c.m.Lock()
  637. defer c.m.Unlock()
  638. return c.state.destroy()
  639. }
  640. func (c *linuxContainer) Pause() error {
  641. c.m.Lock()
  642. defer c.m.Unlock()
  643. status, err := c.currentStatus()
  644. if err != nil {
  645. return err
  646. }
  647. switch status {
  648. case Running, Created:
  649. if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
  650. return err
  651. }
  652. return c.state.transition(&pausedState{
  653. c: c,
  654. })
  655. }
  656. return ErrNotRunning
  657. }
  658. func (c *linuxContainer) Resume() error {
  659. c.m.Lock()
  660. defer c.m.Unlock()
  661. status, err := c.currentStatus()
  662. if err != nil {
  663. return err
  664. }
  665. if status != Paused {
  666. return ErrNotPaused
  667. }
  668. if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
  669. return err
  670. }
  671. return c.state.transition(&runningState{
  672. c: c,
  673. })
  674. }
  675. func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
  676. // XXX(cyphar): This requires cgroups.
  677. if c.config.RootlessCgroups {
  678. logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups")
  679. }
  680. path := c.cgroupManager.Path("memory")
  681. if cgroups.IsCgroup2UnifiedMode() {
  682. return notifyOnOOMV2(path)
  683. }
  684. return notifyOnOOM(path)
  685. }
  686. func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
  687. // XXX(cyphar): This requires cgroups.
  688. if c.config.RootlessCgroups {
  689. logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups")
  690. }
  691. return notifyMemoryPressure(c.cgroupManager.Path("memory"), level)
  692. }
  693. var criuFeatures *criurpc.CriuFeatures
  694. func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
  695. t := criurpc.CriuReqType_FEATURE_CHECK
  696. // make sure the features we are looking for are really not from
  697. // some previous check
  698. criuFeatures = nil
  699. req := &criurpc.CriuReq{
  700. Type: &t,
  701. // Theoretically this should not be necessary but CRIU
  702. // segfaults if Opts is empty.
  703. // Fixed in CRIU 2.12
  704. Opts: rpcOpts,
  705. Features: criuFeat,
  706. }
  707. err := c.criuSwrk(nil, req, criuOpts, nil)
  708. if err != nil {
  709. logrus.Debugf("%s", err)
  710. return errors.New("CRIU feature check failed")
  711. }
  712. missingFeatures := false
  713. // The outer if checks if the fields actually exist
  714. if (criuFeat.MemTrack != nil) &&
  715. (criuFeatures.MemTrack != nil) {
  716. // The inner if checks if they are set to true
  717. if *criuFeat.MemTrack && !*criuFeatures.MemTrack {
  718. missingFeatures = true
  719. logrus.Debugf("CRIU does not support MemTrack")
  720. }
  721. }
  722. // This needs to be repeated for every new feature check.
  723. // Is there a way to put this in a function. Reflection?
  724. if (criuFeat.LazyPages != nil) &&
  725. (criuFeatures.LazyPages != nil) {
  726. if *criuFeat.LazyPages && !*criuFeatures.LazyPages {
  727. missingFeatures = true
  728. logrus.Debugf("CRIU does not support LazyPages")
  729. }
  730. }
  731. if missingFeatures {
  732. return errors.New("CRIU is missing features")
  733. }
  734. return nil
  735. }
  736. func compareCriuVersion(criuVersion int, minVersion int) error {
  737. // simple function to perform the actual version compare
  738. if criuVersion < minVersion {
  739. return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion)
  740. }
  741. return nil
  742. }
  743. // checkCriuVersion checks Criu version greater than or equal to minVersion
  744. func (c *linuxContainer) checkCriuVersion(minVersion int) error {
  745. // If the version of criu has already been determined there is no need
  746. // to ask criu for the version again. Use the value from c.criuVersion.
  747. if c.criuVersion != 0 {
  748. return compareCriuVersion(c.criuVersion, minVersion)
  749. }
  750. criu := criu.MakeCriu()
  751. criu.SetCriuPath(c.criuPath)
  752. var err error
  753. c.criuVersion, err = criu.GetCriuVersion()
  754. if err != nil {
  755. return fmt.Errorf("CRIU version check failed: %w", err)
  756. }
  757. return compareCriuVersion(c.criuVersion, minVersion)
  758. }
// descriptorsFilename is the name of the file inside the checkpoint images
// directory to which Checkpoint writes the JSON-encoded external descriptors
// of the init process and from which Restore reads them back.
const descriptorsFilename = "descriptors.json"
  760. func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
  761. mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
  762. if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
  763. mountDest = dest[len(c.config.Rootfs):]
  764. }
  765. extMnt := &criurpc.ExtMountMap{
  766. Key: proto.String(mountDest),
  767. Val: proto.String(mountDest),
  768. }
  769. req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
  770. }
  771. func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
  772. for _, path := range c.config.MaskPaths {
  773. fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
  774. if err != nil {
  775. if os.IsNotExist(err) {
  776. continue
  777. }
  778. return err
  779. }
  780. if fi.IsDir() {
  781. continue
  782. }
  783. extMnt := &criurpc.ExtMountMap{
  784. Key: proto.String(path),
  785. Val: proto.String("/dev/null"),
  786. }
  787. req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
  788. }
  789. return nil
  790. }
  791. func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) {
  792. // CRIU will evaluate a configuration starting with release 3.11.
  793. // Settings in the configuration file will overwrite RPC settings.
  794. // Look for annotations. The annotation 'org.criu.config'
  795. // specifies if CRIU should use a different, container specific
  796. // configuration file.
  797. _, annotations := utils.Annotations(c.config.Labels)
  798. configFile, exists := annotations["org.criu.config"]
  799. if exists {
  800. // If the annotation 'org.criu.config' exists and is set
  801. // to a non-empty string, tell CRIU to use that as a
  802. // configuration file. If the file does not exist, CRIU
  803. // will just ignore it.
  804. if configFile != "" {
  805. rpcOpts.ConfigFile = proto.String(configFile)
  806. }
  807. // If 'org.criu.config' exists and is set to an empty
  808. // string, a runc specific CRIU configuration file will
  809. // be not set at all.
  810. } else {
  811. // If the mentioned annotation has not been found, specify
  812. // a default CRIU configuration file.
  813. rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf")
  814. }
  815. }
  816. func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool {
  817. var minVersion int
  818. switch t {
  819. case configs.NEWNET:
  820. // CRIU supports different external namespace with different released CRIU versions.
  821. // For network namespaces to work we need at least criu 3.11.0 => 31100.
  822. minVersion = 31100
  823. case configs.NEWPID:
  824. // For PID namespaces criu 31500 is needed.
  825. minVersion = 31500
  826. default:
  827. return false
  828. }
  829. return c.checkCriuVersion(minVersion) == nil
  830. }
  831. func criuNsToKey(t configs.NamespaceType) string {
  832. return "extRoot" + strings.Title(configs.NsName(t)) + "NS"
  833. }
  834. func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error {
  835. if !c.criuSupportsExtNS(t) {
  836. return nil
  837. }
  838. nsPath := c.config.Namespaces.PathOf(t)
  839. if nsPath == "" {
  840. return nil
  841. }
  842. // CRIU expects the information about an external namespace
  843. // like this: --external <TYPE>[<inode>]:<key>
  844. // This <key> is always 'extRoot<TYPE>NS'.
  845. var ns unix.Stat_t
  846. if err := unix.Stat(nsPath, &ns); err != nil {
  847. return err
  848. }
  849. criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t))
  850. rpcOpts.External = append(rpcOpts.External, criuExternal)
  851. return nil
  852. }
  853. func (c *linuxContainer) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error {
  854. for _, ns := range c.config.Namespaces {
  855. switch ns.Type {
  856. case configs.NEWNET, configs.NEWPID:
  857. // If the container is running in a network or PID namespace and has
  858. // a path to the network or PID namespace configured, we will dump
  859. // that network or PID namespace as an external namespace and we
  860. // will expect that the namespace exists during restore.
  861. // This basically means that CRIU will ignore the namespace
  862. // and expect it to be setup correctly.
  863. if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil {
  864. return err
  865. }
  866. default:
  867. // For all other namespaces except NET and PID CRIU has
  868. // a simpler way of joining the existing namespace if set
  869. nsPath := c.config.Namespaces.PathOf(ns.Type)
  870. if nsPath == "" {
  871. continue
  872. }
  873. if ns.Type == configs.NEWCGROUP {
  874. // CRIU has no code to handle NEWCGROUP
  875. return fmt.Errorf("Do not know how to handle namespace %v", ns.Type)
  876. }
  877. // CRIU has code to handle NEWTIME, but it does not seem to be defined in runc
  878. // CRIU will issue a warning for NEWUSER:
  879. // criu/namespaces.c: 'join-ns with user-namespace is not fully tested and dangerous'
  880. rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{
  881. Ns: proto.String(configs.NsName(ns.Type)),
  882. NsFile: proto.String(nsPath),
  883. })
  884. }
  885. }
  886. return nil
  887. }
  888. func (c *linuxContainer) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error {
  889. if !c.criuSupportsExtNS(t) {
  890. return nil
  891. }
  892. nsPath := c.config.Namespaces.PathOf(t)
  893. if nsPath == "" {
  894. return nil
  895. }
  896. // CRIU wants the information about an existing namespace
  897. // like this: --inherit-fd fd[<fd>]:<key>
  898. // The <key> needs to be the same as during checkpointing.
  899. // We are always using 'extRoot<TYPE>NS' as the key in this.
  900. nsFd, err := os.Open(nsPath)
  901. if err != nil {
  902. logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
  903. return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
  904. }
  905. inheritFd := &criurpc.InheritFd{
  906. Key: proto.String(criuNsToKey(t)),
  907. // The offset of four is necessary because 0, 1, 2 and 3 are
  908. // already used by stdin, stdout, stderr, 'criu swrk' socket.
  909. Fd: proto.Int32(int32(4 + len(*extraFiles))),
  910. }
  911. rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd)
  912. // All open FDs need to be transferred to CRIU via extraFiles
  913. *extraFiles = append(*extraFiles, nsFd)
  914. return nil
  915. }
  916. func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
  917. c.m.Lock()
  918. defer c.m.Unlock()
  919. // Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
  920. // (CLI prints a warning)
  921. // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
  922. // support for doing unprivileged dumps, but the setup of
  923. // rootless containers might make this complicated.
  924. // We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
  925. if err := c.checkCriuVersion(30000); err != nil {
  926. return err
  927. }
  928. if criuOpts.ImagesDirectory == "" {
  929. return errors.New("invalid directory to save checkpoint")
  930. }
  931. // Since a container can be C/R'ed multiple times,
  932. // the checkpoint directory may already exist.
  933. if err := os.Mkdir(criuOpts.ImagesDirectory, 0o700); err != nil && !os.IsExist(err) {
  934. return err
  935. }
  936. imageDir, err := os.Open(criuOpts.ImagesDirectory)
  937. if err != nil {
  938. return err
  939. }
  940. defer imageDir.Close()
  941. rpcOpts := criurpc.CriuOpts{
  942. ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
  943. LogLevel: proto.Int32(4),
  944. LogFile: proto.String("dump.log"),
  945. Root: proto.String(c.config.Rootfs),
  946. ManageCgroups: proto.Bool(true),
  947. NotifyScripts: proto.Bool(true),
  948. Pid: proto.Int32(int32(c.initProcess.pid())),
  949. ShellJob: proto.Bool(criuOpts.ShellJob),
  950. LeaveRunning: proto.Bool(criuOpts.LeaveRunning),
  951. TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
  952. ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
  953. FileLocks: proto.Bool(criuOpts.FileLocks),
  954. EmptyNs: proto.Uint32(criuOpts.EmptyNs),
  955. OrphanPtsMaster: proto.Bool(true),
  956. AutoDedup: proto.Bool(criuOpts.AutoDedup),
  957. LazyPages: proto.Bool(criuOpts.LazyPages),
  958. }
  959. // if criuOpts.WorkDirectory is not set, criu default is used.
  960. if criuOpts.WorkDirectory != "" {
  961. if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
  962. return err
  963. }
  964. workDir, err := os.Open(criuOpts.WorkDirectory)
  965. if err != nil {
  966. return err
  967. }
  968. defer workDir.Close()
  969. rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
  970. }
  971. c.handleCriuConfigurationFile(&rpcOpts)
  972. // If the container is running in a network namespace and has
  973. // a path to the network namespace configured, we will dump
  974. // that network namespace as an external namespace and we
  975. // will expect that the namespace exists during restore.
  976. // This basically means that CRIU will ignore the namespace
  977. // and expect to be setup correctly.
  978. if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil {
  979. return err
  980. }
  981. // Same for possible external PID namespaces
  982. if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil {
  983. return err
  984. }
  985. // CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup
  986. // is not set, CRIU uses ptrace() to pause the processes.
  987. // Note cgroup v2 freezer is only supported since CRIU release 3.14.
  988. if !cgroups.IsCgroup2UnifiedMode() || c.checkCriuVersion(31400) == nil {
  989. if fcg := c.cgroupManager.Path("freezer"); fcg != "" {
  990. rpcOpts.FreezeCgroup = proto.String(fcg)
  991. }
  992. }
  993. // append optional criu opts, e.g., page-server and port
  994. if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
  995. rpcOpts.Ps = &criurpc.CriuPageServerInfo{
  996. Address: proto.String(criuOpts.PageServer.Address),
  997. Port: proto.Int32(criuOpts.PageServer.Port),
  998. }
  999. }
  1000. // pre-dump may need parentImage param to complete iterative migration
  1001. if criuOpts.ParentImage != "" {
  1002. rpcOpts.ParentImg = proto.String(criuOpts.ParentImage)
  1003. rpcOpts.TrackMem = proto.Bool(true)
  1004. }
  1005. // append optional manage cgroups mode
  1006. if criuOpts.ManageCgroupsMode != 0 {
  1007. mode := criuOpts.ManageCgroupsMode
  1008. rpcOpts.ManageCgroupsMode = &mode
  1009. }
  1010. var t criurpc.CriuReqType
  1011. if criuOpts.PreDump {
  1012. feat := criurpc.CriuFeatures{
  1013. MemTrack: proto.Bool(true),
  1014. }
  1015. if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
  1016. return err
  1017. }
  1018. t = criurpc.CriuReqType_PRE_DUMP
  1019. } else {
  1020. t = criurpc.CriuReqType_DUMP
  1021. }
  1022. if criuOpts.LazyPages {
  1023. // lazy migration requested; check if criu supports it
  1024. feat := criurpc.CriuFeatures{
  1025. LazyPages: proto.Bool(true),
  1026. }
  1027. if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil {
  1028. return err
  1029. }
  1030. if fd := criuOpts.StatusFd; fd != -1 {
  1031. // check that the FD is valid
  1032. flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0)
  1033. if err != nil {
  1034. return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err)
  1035. }
  1036. // and writable
  1037. if flags&unix.O_WRONLY == 0 {
  1038. return fmt.Errorf("invalid --status-fd argument %d: not writable", fd)
  1039. }
  1040. if c.checkCriuVersion(31500) != nil {
  1041. // For criu 3.15+, use notifications (see case "status-ready"
  1042. // in criuNotifications). Otherwise, rely on criu status fd.
  1043. rpcOpts.StatusFd = proto.Int32(int32(fd))
  1044. }
  1045. }
  1046. }
  1047. req := &criurpc.CriuReq{
  1048. Type: &t,
  1049. Opts: &rpcOpts,
  1050. }
  1051. // no need to dump all this in pre-dump
  1052. if !criuOpts.PreDump {
  1053. hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
  1054. for _, m := range c.config.Mounts {
  1055. switch m.Device {
  1056. case "bind":
  1057. c.addCriuDumpMount(req, m)
  1058. case "cgroup":
  1059. if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
  1060. // real mount(s)
  1061. continue
  1062. }
  1063. // a set of "external" bind mounts
  1064. binds, err := getCgroupMounts(m)
  1065. if err != nil {
  1066. return err
  1067. }
  1068. for _, b := range binds {
  1069. c.addCriuDumpMount(req, b)
  1070. }
  1071. }
  1072. }
  1073. if err := c.addMaskPaths(req); err != nil {
  1074. return err
  1075. }
  1076. for _, node := range c.config.Devices {
  1077. m := &configs.Mount{Destination: node.Path, Source: node.Path}
  1078. c.addCriuDumpMount(req, m)
  1079. }
  1080. // Write the FD info to a file in the image directory
  1081. fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
  1082. if err != nil {
  1083. return err
  1084. }
  1085. err = os.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600)
  1086. if err != nil {
  1087. return err
  1088. }
  1089. }
  1090. err = c.criuSwrk(nil, req, criuOpts, nil)
  1091. if err != nil {
  1092. return err
  1093. }
  1094. return nil
  1095. }
  1096. func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
  1097. mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
  1098. if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
  1099. mountDest = dest[len(c.config.Rootfs):]
  1100. }
  1101. extMnt := &criurpc.ExtMountMap{
  1102. Key: proto.String(mountDest),
  1103. Val: proto.String(m.Source),
  1104. }
  1105. req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
  1106. }
  1107. func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
  1108. for _, iface := range c.config.Networks {
  1109. switch iface.Type {
  1110. case "veth":
  1111. veth := new(criurpc.CriuVethPair)
  1112. veth.IfOut = proto.String(iface.HostInterfaceName)
  1113. veth.IfIn = proto.String(iface.Name)
  1114. req.Opts.Veths = append(req.Opts.Veths, veth)
  1115. case "loopback":
  1116. // Do nothing
  1117. }
  1118. }
  1119. for _, i := range criuOpts.VethPairs {
  1120. veth := new(criurpc.CriuVethPair)
  1121. veth.IfOut = proto.String(i.HostInterfaceName)
  1122. veth.IfIn = proto.String(i.ContainerInterfaceName)
  1123. req.Opts.Veths = append(req.Opts.Veths, veth)
  1124. }
  1125. }
  1126. // makeCriuRestoreMountpoints makes the actual mountpoints for the
  1127. // restore using CRIU. This function is inspired from the code in
  1128. // rootfs_linux.go
  1129. func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error {
  1130. switch m.Device {
  1131. case "cgroup":
  1132. // No mount point(s) need to be created:
  1133. //
  1134. // * for v1, mount points are saved by CRIU because
  1135. // /sys/fs/cgroup is a tmpfs mount
  1136. //
  1137. // * for v2, /sys/fs/cgroup is a real mount, but
  1138. // the mountpoint appears as soon as /sys is mounted
  1139. return nil
  1140. case "bind":
  1141. // The prepareBindMount() function checks if source
  1142. // exists. So it cannot be used for other filesystem types.
  1143. // TODO: pass something else than nil? Not sure if criu is
  1144. // impacted by issue #2484
  1145. if err := prepareBindMount(m, c.config.Rootfs, nil); err != nil {
  1146. return err
  1147. }
  1148. default:
  1149. // for all other filesystems just create the mountpoints
  1150. dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination)
  1151. if err != nil {
  1152. return err
  1153. }
  1154. if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil {
  1155. return err
  1156. }
  1157. if err := os.MkdirAll(dest, 0o755); err != nil {
  1158. return err
  1159. }
  1160. }
  1161. return nil
  1162. }
  1163. // isPathInPrefixList is a small function for CRIU restore to make sure
  1164. // mountpoints, which are on a tmpfs, are not created in the roofs
  1165. func isPathInPrefixList(path string, prefix []string) bool {
  1166. for _, p := range prefix {
  1167. if strings.HasPrefix(path, p+"/") {
  1168. return true
  1169. }
  1170. }
  1171. return false
  1172. }
  1173. // prepareCriuRestoreMounts tries to set up the rootfs of the
  1174. // container to be restored in the same way runc does it for
  1175. // initial container creation. Even for a read-only rootfs container
  1176. // runc modifies the rootfs to add mountpoints which do not exist.
  1177. // This function also creates missing mountpoints as long as they
  1178. // are not on top of a tmpfs, as CRIU will restore tmpfs content anyway.
  1179. func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
  1180. // First get a list of a all tmpfs mounts
  1181. tmpfs := []string{}
  1182. for _, m := range mounts {
  1183. switch m.Device {
  1184. case "tmpfs":
  1185. tmpfs = append(tmpfs, m.Destination)
  1186. }
  1187. }
  1188. // Now go through all mounts and create the mountpoints
  1189. // if the mountpoints are not on a tmpfs, as CRIU will
  1190. // restore the complete tmpfs content from its checkpoint.
  1191. umounts := []string{}
  1192. defer func() {
  1193. for _, u := range umounts {
  1194. _ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error {
  1195. if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil {
  1196. if e != unix.EINVAL { //nolint:errorlint // unix errors are bare
  1197. // Ignore EINVAL as it means 'target is not a mount point.'
  1198. // It probably has already been unmounted.
  1199. logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e)
  1200. }
  1201. }
  1202. return nil
  1203. })
  1204. }
  1205. }()
  1206. for _, m := range mounts {
  1207. if !isPathInPrefixList(m.Destination, tmpfs) {
  1208. if err := c.makeCriuRestoreMountpoints(m); err != nil {
  1209. return err
  1210. }
  1211. // If the mount point is a bind mount, we need to mount
  1212. // it now so that runc can create the necessary mount
  1213. // points for mounts in bind mounts.
  1214. // This also happens during initial container creation.
  1215. // Without this CRIU restore will fail
  1216. // See: https://github.com/opencontainers/runc/issues/2748
  1217. // It is also not necessary to order the mount points
  1218. // because during initial container creation mounts are
  1219. // set up in the order they are configured.
  1220. if m.Device == "bind" {
  1221. if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(procfd string) error {
  1222. if err := mount(m.Source, m.Destination, procfd, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
  1223. return err
  1224. }
  1225. return nil
  1226. }); err != nil {
  1227. return err
  1228. }
  1229. umounts = append(umounts, m.Destination)
  1230. }
  1231. }
  1232. }
  1233. return nil
  1234. }
  1235. func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
  1236. c.m.Lock()
  1237. defer c.m.Unlock()
  1238. var extraFiles []*os.File
  1239. // Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
  1240. // (CLI prints a warning)
  1241. // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have
  1242. // support for unprivileged restore at the moment.
  1243. // We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
  1244. if err := c.checkCriuVersion(30000); err != nil {
  1245. return err
  1246. }
  1247. if criuOpts.ImagesDirectory == "" {
  1248. return errors.New("invalid directory to restore checkpoint")
  1249. }
  1250. imageDir, err := os.Open(criuOpts.ImagesDirectory)
  1251. if err != nil {
  1252. return err
  1253. }
  1254. defer imageDir.Close()
  1255. // CRIU has a few requirements for a root directory:
  1256. // * it must be a mount point
  1257. // * its parent must not be overmounted
  1258. // c.config.Rootfs is bind-mounted to a temporary directory
  1259. // to satisfy these requirements.
  1260. root := filepath.Join(c.root, "criu-root")
  1261. if err := os.Mkdir(root, 0o755); err != nil {
  1262. return err
  1263. }
  1264. defer os.Remove(root)
  1265. root, err = filepath.EvalSymlinks(root)
  1266. if err != nil {
  1267. return err
  1268. }
  1269. err = mount(c.config.Rootfs, root, "", "", unix.MS_BIND|unix.MS_REC, "")
  1270. if err != nil {
  1271. return err
  1272. }
  1273. defer unix.Unmount(root, unix.MNT_DETACH) //nolint: errcheck
  1274. t := criurpc.CriuReqType_RESTORE
  1275. req := &criurpc.CriuReq{
  1276. Type: &t,
  1277. Opts: &criurpc.CriuOpts{
  1278. ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
  1279. EvasiveDevices: proto.Bool(true),
  1280. LogLevel: proto.Int32(4),
  1281. LogFile: proto.String("restore.log"),
  1282. RstSibling: proto.Bool(true),
  1283. Root: proto.String(root),
  1284. ManageCgroups: proto.Bool(true),
  1285. NotifyScripts: proto.Bool(true),
  1286. ShellJob: proto.Bool(criuOpts.ShellJob),
  1287. ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
  1288. TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
  1289. FileLocks: proto.Bool(criuOpts.FileLocks),
  1290. EmptyNs: proto.Uint32(criuOpts.EmptyNs),
  1291. OrphanPtsMaster: proto.Bool(true),
  1292. AutoDedup: proto.Bool(criuOpts.AutoDedup),
  1293. LazyPages: proto.Bool(criuOpts.LazyPages),
  1294. },
  1295. }
  1296. if criuOpts.LsmProfile != "" {
  1297. // CRIU older than 3.16 has a bug which breaks the possibility
  1298. // to set a different LSM profile.
  1299. if err := c.checkCriuVersion(31600); err != nil {
  1300. return errors.New("--lsm-profile requires at least CRIU 3.16")
  1301. }
  1302. req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile)
  1303. }
  1304. if criuOpts.LsmMountContext != "" {
  1305. if err := c.checkCriuVersion(31600); err != nil {
  1306. return errors.New("--lsm-mount-context requires at least CRIU 3.16")
  1307. }
  1308. req.Opts.LsmMountContext = proto.String(criuOpts.LsmMountContext)
  1309. }
  1310. if criuOpts.WorkDirectory != "" {
  1311. // Since a container can be C/R'ed multiple times,
  1312. // the work directory may already exist.
  1313. if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
  1314. return err
  1315. }
  1316. workDir, err := os.Open(criuOpts.WorkDirectory)
  1317. if err != nil {
  1318. return err
  1319. }
  1320. defer workDir.Close()
  1321. req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
  1322. }
  1323. c.handleCriuConfigurationFile(req.Opts)
  1324. if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil {
  1325. return err
  1326. }
  1327. // This will modify the rootfs of the container in the same way runc
  1328. // modifies the container during initial creation.
  1329. if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil {
  1330. return err
  1331. }
  1332. hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
  1333. for _, m := range c.config.Mounts {
  1334. switch m.Device {
  1335. case "bind":
  1336. c.addCriuRestoreMount(req, m)
  1337. case "cgroup":
  1338. if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
  1339. continue
  1340. }
  1341. // cgroup v1 is a set of bind mounts, unless cgroupns is used
  1342. binds, err := getCgroupMounts(m)
  1343. if err != nil {
  1344. return err
  1345. }
  1346. for _, b := range binds {
  1347. c.addCriuRestoreMount(req, b)
  1348. }
  1349. }
  1350. }
  1351. if len(c.config.MaskPaths) > 0 {
  1352. m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"}
  1353. c.addCriuRestoreMount(req, m)
  1354. }
  1355. for _, node := range c.config.Devices {
  1356. m := &configs.Mount{Destination: node.Path, Source: node.Path}
  1357. c.addCriuRestoreMount(req, m)
  1358. }
  1359. if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 {
  1360. c.restoreNetwork(req, criuOpts)
  1361. }
  1362. // append optional manage cgroups mode
  1363. if criuOpts.ManageCgroupsMode != 0 {
  1364. mode := criuOpts.ManageCgroupsMode
  1365. req.Opts.ManageCgroupsMode = &mode
  1366. }
  1367. var (
  1368. fds []string
  1369. fdJSON []byte
  1370. )
  1371. if fdJSON, err = os.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
  1372. return err
  1373. }
  1374. if err := json.Unmarshal(fdJSON, &fds); err != nil {
  1375. return err
  1376. }
  1377. for i := range fds {
  1378. if s := fds[i]; strings.Contains(s, "pipe:") {
  1379. inheritFd := new(criurpc.InheritFd)
  1380. inheritFd.Key = proto.String(s)
  1381. inheritFd.Fd = proto.Int32(int32(i))
  1382. req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
  1383. }
  1384. }
  1385. err = c.criuSwrk(process, req, criuOpts, extraFiles)
  1386. // Now that CRIU is done let's close all opened FDs CRIU needed.
  1387. for _, fd := range extraFiles {
  1388. fd.Close()
  1389. }
  1390. return err
  1391. }
  1392. func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
  1393. // need to apply cgroups only on restore
  1394. if req.GetType() != criurpc.CriuReqType_RESTORE {
  1395. return nil
  1396. }
  1397. // XXX: Do we need to deal with this case? AFAIK criu still requires root.
  1398. if err := c.cgroupManager.Apply(pid); err != nil {
  1399. return err
  1400. }
  1401. if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil {
  1402. return err
  1403. }
  1404. if cgroups.IsCgroup2UnifiedMode() {
  1405. return nil
  1406. }
  1407. // the stuff below is cgroupv1-specific
  1408. path := fmt.Sprintf("/proc/%d/cgroup", pid)
  1409. cgroupsPaths, err := cgroups.ParseCgroupFile(path)
  1410. if err != nil {
  1411. return err
  1412. }
  1413. for c, p := range cgroupsPaths {
  1414. cgroupRoot := &criurpc.CgroupRoot{
  1415. Ctrl: proto.String(c),
  1416. Path: proto.String(p),
  1417. }
  1418. req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
  1419. }
  1420. return nil
  1421. }
  1422. func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, extraFiles []*os.File) error {
  1423. fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0)
  1424. if err != nil {
  1425. return err
  1426. }
  1427. var logPath string
  1428. if opts != nil {
  1429. logPath = filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
  1430. } else {
  1431. // For the VERSION RPC 'opts' is set to 'nil' and therefore
  1432. // opts.WorkDirectory does not exist. Set logPath to "".
  1433. logPath = ""
  1434. }
  1435. criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
  1436. criuClientFileCon, err := net.FileConn(criuClient)
  1437. criuClient.Close()
  1438. if err != nil {
  1439. return err
  1440. }
  1441. criuClientCon := criuClientFileCon.(*net.UnixConn)
  1442. defer criuClientCon.Close()
  1443. criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
  1444. defer criuServer.Close()
  1445. args := []string{"swrk", "3"}
  1446. if c.criuVersion != 0 {
  1447. // If the CRIU Version is still '0' then this is probably
  1448. // the initial CRIU run to detect the version. Skip it.
  1449. logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
  1450. }
  1451. cmd := exec.Command(c.criuPath, args...)
  1452. if process != nil {
  1453. cmd.Stdin = process.Stdin
  1454. cmd.Stdout = process.Stdout
  1455. cmd.Stderr = process.Stderr
  1456. }
  1457. cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)
  1458. if extraFiles != nil {
  1459. cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...)
  1460. }
  1461. if err := cmd.Start(); err != nil {
  1462. return err
  1463. }
  1464. // we close criuServer so that even if CRIU crashes or unexpectedly exits, runc will not hang.
  1465. criuServer.Close()
  1466. // cmd.Process will be replaced by a restored init.
  1467. criuProcess := cmd.Process
  1468. var criuProcessState *os.ProcessState
  1469. defer func() {
  1470. if criuProcessState == nil {
  1471. criuClientCon.Close()
  1472. _, err := criuProcess.Wait()
  1473. if err != nil {
  1474. logrus.Warnf("wait on criuProcess returned %v", err)
  1475. }
  1476. }
  1477. }()
  1478. if err := c.criuApplyCgroups(criuProcess.Pid, req); err != nil {
  1479. return err
  1480. }
  1481. var extFds []string
  1482. if process != nil {
  1483. extFds, err = getPipeFds(criuProcess.Pid)
  1484. if err != nil {
  1485. return err
  1486. }
  1487. }
  1488. logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
  1489. // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts()
  1490. // should be empty. For older CRIU versions it still will be
  1491. // available but empty. criurpc.CriuReqType_VERSION actually
  1492. // has no req.GetOpts().
  1493. if logrus.GetLevel() >= logrus.DebugLevel &&
  1494. !(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK ||
  1495. req.GetType() == criurpc.CriuReqType_VERSION) {
  1496. val := reflect.ValueOf(req.GetOpts())
  1497. v := reflect.Indirect(val)
  1498. for i := 0; i < v.NumField(); i++ {
  1499. st := v.Type()
  1500. name := st.Field(i).Name
  1501. if 'A' <= name[0] && name[0] <= 'Z' {
  1502. value := val.MethodByName("Get" + name).Call([]reflect.Value{})
  1503. logrus.Debugf("CRIU option %s with value %v", name, value[0])
  1504. }
  1505. }
  1506. }
  1507. data, err := proto.Marshal(req)
  1508. if err != nil {
  1509. return err
  1510. }
  1511. _, err = criuClientCon.Write(data)
  1512. if err != nil {
  1513. return err
  1514. }
  1515. buf := make([]byte, 10*4096)
  1516. oob := make([]byte, 4096)
  1517. for {
  1518. n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
  1519. if req.Opts != nil && req.Opts.StatusFd != nil {
  1520. // Close status_fd as soon as we got something back from criu,
  1521. // assuming it has consumed (reopened) it by this time.
  1522. // Otherwise it will might be left open forever and whoever
  1523. // is waiting on it will wait forever.
  1524. fd := int(*req.Opts.StatusFd)
  1525. _ = unix.Close(fd)
  1526. req.Opts.StatusFd = nil
  1527. }
  1528. if err != nil {
  1529. return err
  1530. }
  1531. if n == 0 {
  1532. return errors.New("unexpected EOF")
  1533. }
  1534. if n == len(buf) {
  1535. return errors.New("buffer is too small")
  1536. }
  1537. resp := new(criurpc.CriuResp)
  1538. err = proto.Unmarshal(buf[:n], resp)
  1539. if err != nil {
  1540. return err
  1541. }
  1542. if !resp.GetSuccess() {
  1543. typeString := req.GetType().String()
  1544. return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
  1545. }
  1546. t := resp.GetType()
  1547. switch {
  1548. case t == criurpc.CriuReqType_FEATURE_CHECK:
  1549. logrus.Debugf("Feature check says: %s", resp)
  1550. criuFeatures = resp.GetFeatures()
  1551. case t == criurpc.CriuReqType_NOTIFY:
  1552. if err := c.criuNotifications(resp, process, cmd, opts, extFds, oob[:oobn]); err != nil {
  1553. return err
  1554. }
  1555. t = criurpc.CriuReqType_NOTIFY
  1556. req = &criurpc.CriuReq{
  1557. Type: &t,
  1558. NotifySuccess: proto.Bool(true),
  1559. }
  1560. data, err = proto.Marshal(req)
  1561. if err != nil {
  1562. return err
  1563. }
  1564. _, err = criuClientCon.Write(data)
  1565. if err != nil {
  1566. return err
  1567. }
  1568. continue
  1569. case t == criurpc.CriuReqType_RESTORE:
  1570. case t == criurpc.CriuReqType_DUMP:
  1571. case t == criurpc.CriuReqType_PRE_DUMP:
  1572. default:
  1573. return fmt.Errorf("unable to parse the response %s", resp.String())
  1574. }
  1575. break
  1576. }
  1577. _ = criuClientCon.CloseWrite()
  1578. // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
  1579. // Here we want to wait only the CRIU process.
  1580. criuProcessState, err = criuProcess.Wait()
  1581. if err != nil {
  1582. return err
  1583. }
  1584. // In pre-dump mode CRIU is in a loop and waits for
  1585. // the final DUMP command.
  1586. // The current runc pre-dump approach, however, is
  1587. // start criu in PRE_DUMP once for a single pre-dump
  1588. // and not the whole series of pre-dump, pre-dump, ...m, dump
  1589. // If we got the message CriuReqType_PRE_DUMP it means
  1590. // CRIU was successful and we need to forcefully stop CRIU
  1591. if !criuProcessState.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP {
  1592. return fmt.Errorf("criu failed: %s\nlog file: %s", criuProcessState.String(), logPath)
  1593. }
  1594. return nil
  1595. }
  1596. // block any external network activity
  1597. func lockNetwork(config *configs.Config) error {
  1598. for _, config := range config.Networks {
  1599. strategy, err := getStrategy(config.Type)
  1600. if err != nil {
  1601. return err
  1602. }
  1603. if err := strategy.detach(config); err != nil {
  1604. return err
  1605. }
  1606. }
  1607. return nil
  1608. }
  1609. func unlockNetwork(config *configs.Config) error {
  1610. for _, config := range config.Networks {
  1611. strategy, err := getStrategy(config.Type)
  1612. if err != nil {
  1613. return err
  1614. }
  1615. if err = strategy.attach(config); err != nil {
  1616. return err
  1617. }
  1618. }
  1619. return nil
  1620. }
  1621. func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, cmd *exec.Cmd, opts *CriuOpts, fds []string, oob []byte) error {
  1622. notify := resp.GetNotify()
  1623. if notify == nil {
  1624. return fmt.Errorf("invalid response: %s", resp.String())
  1625. }
  1626. script := notify.GetScript()
  1627. logrus.Debugf("notify: %s\n", script)
  1628. switch script {
  1629. case "post-dump":
  1630. f, err := os.Create(filepath.Join(c.root, "checkpoint"))
  1631. if err != nil {
  1632. return err
  1633. }
  1634. f.Close()
  1635. case "network-unlock":
  1636. if err := unlockNetwork(c.config); err != nil {
  1637. return err
  1638. }
  1639. case "network-lock":
  1640. if err := lockNetwork(c.config); err != nil {
  1641. return err
  1642. }
  1643. case "setup-namespaces":
  1644. if c.config.Hooks != nil {
  1645. s, err := c.currentOCIState()
  1646. if err != nil {
  1647. return nil
  1648. }
  1649. s.Pid = int(notify.GetPid())
  1650. if err := c.config.Hooks[configs.Prestart].RunHooks(s); err != nil {
  1651. return err
  1652. }
  1653. if err := c.config.Hooks[configs.CreateRuntime].RunHooks(s); err != nil {
  1654. return err
  1655. }
  1656. }
  1657. case "post-restore":
  1658. pid := notify.GetPid()
  1659. p, err := os.FindProcess(int(pid))
  1660. if err != nil {
  1661. return err
  1662. }
  1663. cmd.Process = p
  1664. r, err := newRestoredProcess(cmd, fds)
  1665. if err != nil {
  1666. return err
  1667. }
  1668. process.ops = r
  1669. if err := c.state.transition(&restoredState{
  1670. imageDir: opts.ImagesDirectory,
  1671. c: c,
  1672. }); err != nil {
  1673. return err
  1674. }
  1675. // create a timestamp indicating when the restored checkpoint was started
  1676. c.created = time.Now().UTC()
  1677. if _, err := c.updateState(r); err != nil {
  1678. return err
  1679. }
  1680. if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
  1681. if !os.IsNotExist(err) {
  1682. logrus.Error(err)
  1683. }
  1684. }
  1685. case "orphan-pts-master":
  1686. scm, err := unix.ParseSocketControlMessage(oob)
  1687. if err != nil {
  1688. return err
  1689. }
  1690. fds, err := unix.ParseUnixRights(&scm[0])
  1691. if err != nil {
  1692. return err
  1693. }
  1694. master := os.NewFile(uintptr(fds[0]), "orphan-pts-master")
  1695. defer master.Close()
  1696. // While we can access console.master, using the API is a good idea.
  1697. if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil {
  1698. return err
  1699. }
  1700. case "status-ready":
  1701. if opts.StatusFd != -1 {
  1702. // write \0 to status fd to notify that lazy page server is ready
  1703. _, err := unix.Write(opts.StatusFd, []byte{0})
  1704. if err != nil {
  1705. logrus.Warnf("can't write \\0 to status fd: %v", err)
  1706. }
  1707. _ = unix.Close(opts.StatusFd)
  1708. opts.StatusFd = -1
  1709. }
  1710. }
  1711. return nil
  1712. }
  1713. func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
  1714. if process != nil {
  1715. c.initProcess = process
  1716. }
  1717. state, err := c.currentState()
  1718. if err != nil {
  1719. return nil, err
  1720. }
  1721. err = c.saveState(state)
  1722. if err != nil {
  1723. return nil, err
  1724. }
  1725. return state, nil
  1726. }
  1727. func (c *linuxContainer) saveState(s *State) (retErr error) {
  1728. tmpFile, err := os.CreateTemp(c.root, "state-")
  1729. if err != nil {
  1730. return err
  1731. }
  1732. defer func() {
  1733. if retErr != nil {
  1734. tmpFile.Close()
  1735. os.Remove(tmpFile.Name())
  1736. }
  1737. }()
  1738. err = utils.WriteJSON(tmpFile, s)
  1739. if err != nil {
  1740. return err
  1741. }
  1742. err = tmpFile.Close()
  1743. if err != nil {
  1744. return err
  1745. }
  1746. stateFilePath := filepath.Join(c.root, stateFilename)
  1747. return os.Rename(tmpFile.Name(), stateFilePath)
  1748. }
  1749. func (c *linuxContainer) currentStatus() (Status, error) {
  1750. if err := c.refreshState(); err != nil {
  1751. return -1, err
  1752. }
  1753. return c.state.status(), nil
  1754. }
  1755. // refreshState needs to be called to verify that the current state on the
  1756. // container is what is true. Because consumers of libcontainer can use it
  1757. // out of process we need to verify the container's status based on runtime
  1758. // information and not rely on our in process info.
  1759. func (c *linuxContainer) refreshState() error {
  1760. paused, err := c.isPaused()
  1761. if err != nil {
  1762. return err
  1763. }
  1764. if paused {
  1765. return c.state.transition(&pausedState{c: c})
  1766. }
  1767. t := c.runType()
  1768. switch t {
  1769. case Created:
  1770. return c.state.transition(&createdState{c: c})
  1771. case Running:
  1772. return c.state.transition(&runningState{c: c})
  1773. }
  1774. return c.state.transition(&stoppedState{c: c})
  1775. }
  1776. func (c *linuxContainer) runType() Status {
  1777. if c.initProcess == nil {
  1778. return Stopped
  1779. }
  1780. pid := c.initProcess.pid()
  1781. stat, err := system.Stat(pid)
  1782. if err != nil {
  1783. return Stopped
  1784. }
  1785. if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead {
  1786. return Stopped
  1787. }
  1788. // We'll create exec fifo and blocking on it after container is created,
  1789. // and delete it after start container.
  1790. if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil {
  1791. return Created
  1792. }
  1793. return Running
  1794. }
  1795. func (c *linuxContainer) isPaused() (bool, error) {
  1796. state, err := c.cgroupManager.GetFreezerState()
  1797. if err != nil {
  1798. return false, err
  1799. }
  1800. return state == configs.Frozen, nil
  1801. }
  1802. func (c *linuxContainer) currentState() (*State, error) {
  1803. var (
  1804. startTime uint64
  1805. externalDescriptors []string
  1806. pid = -1
  1807. )
  1808. if c.initProcess != nil {
  1809. pid = c.initProcess.pid()
  1810. startTime, _ = c.initProcess.startTime()
  1811. externalDescriptors = c.initProcess.externalDescriptors()
  1812. }
  1813. intelRdtPath := ""
  1814. if c.intelRdtManager != nil {
  1815. intelRdtPath = c.intelRdtManager.GetPath()
  1816. }
  1817. state := &State{
  1818. BaseState: BaseState{
  1819. ID: c.ID(),
  1820. Config: *c.config,
  1821. InitProcessPid: pid,
  1822. InitProcessStartTime: startTime,
  1823. Created: c.created,
  1824. },
  1825. Rootless: c.config.RootlessEUID && c.config.RootlessCgroups,
  1826. CgroupPaths: c.cgroupManager.GetPaths(),
  1827. IntelRdtPath: intelRdtPath,
  1828. NamespacePaths: make(map[configs.NamespaceType]string),
  1829. ExternalDescriptors: externalDescriptors,
  1830. }
  1831. if pid > 0 {
  1832. for _, ns := range c.config.Namespaces {
  1833. state.NamespacePaths[ns.Type] = ns.GetPath(pid)
  1834. }
  1835. for _, nsType := range configs.NamespaceTypes() {
  1836. if !configs.IsNamespaceSupported(nsType) {
  1837. continue
  1838. }
  1839. if _, ok := state.NamespacePaths[nsType]; !ok {
  1840. ns := configs.Namespace{Type: nsType}
  1841. state.NamespacePaths[ns.Type] = ns.GetPath(pid)
  1842. }
  1843. }
  1844. }
  1845. return state, nil
  1846. }
  1847. func (c *linuxContainer) currentOCIState() (*specs.State, error) {
  1848. bundle, annotations := utils.Annotations(c.config.Labels)
  1849. state := &specs.State{
  1850. Version: specs.Version,
  1851. ID: c.ID(),
  1852. Bundle: bundle,
  1853. Annotations: annotations,
  1854. }
  1855. status, err := c.currentStatus()
  1856. if err != nil {
  1857. return nil, err
  1858. }
  1859. state.Status = specs.ContainerState(status.String())
  1860. if status != Stopped {
  1861. if c.initProcess != nil {
  1862. state.Pid = c.initProcess.pid()
  1863. }
  1864. }
  1865. return state, nil
  1866. }
  1867. // orderNamespacePaths sorts namespace paths into a list of paths that we
  1868. // can setns in order.
  1869. func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
  1870. paths := []string{}
  1871. for _, ns := range configs.NamespaceTypes() {
  1872. // Remove namespaces that we don't need to join.
  1873. if !c.config.Namespaces.Contains(ns) {
  1874. continue
  1875. }
  1876. if p, ok := namespaces[ns]; ok && p != "" {
  1877. // check if the requested namespace is supported
  1878. if !configs.IsNamespaceSupported(ns) {
  1879. return nil, fmt.Errorf("namespace %s is not supported", ns)
  1880. }
  1881. // only set to join this namespace if it exists
  1882. if _, err := os.Lstat(p); err != nil {
  1883. return nil, fmt.Errorf("namespace path: %w", err)
  1884. }
  1885. // do not allow namespace path with comma as we use it to separate
  1886. // the namespace paths
  1887. if strings.ContainsRune(p, ',') {
  1888. return nil, fmt.Errorf("invalid namespace path %s", p)
  1889. }
  1890. paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
  1891. }
  1892. }
  1893. return paths, nil
  1894. }
  1895. func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
  1896. data := bytes.NewBuffer(nil)
  1897. for _, im := range idMap {
  1898. line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
  1899. if _, err := data.WriteString(line); err != nil {
  1900. return nil, err
  1901. }
  1902. }
  1903. return data.Bytes(), nil
  1904. }
  1905. // netlinkError is an error wrapper type for use by custom netlink message
  1906. // types. Panics with errors are wrapped in netlinkError so that the recover
  1907. // in bootstrapData can distinguish intentional panics.
  1908. type netlinkError struct{ error }
  1909. // bootstrapData encodes the necessary data in netlink binary format
  1910. // as a io.Reader.
  1911. // Consumer can write the data to a bootstrap program
  1912. // such as one that uses nsenter package to bootstrap the container's
  1913. // init process correctly, i.e. with correct namespaces, uid/gid
  1914. // mapping etc.
  1915. func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) {
  1916. // create the netlink message
  1917. r := nl.NewNetlinkRequest(int(InitMsg), 0)
  1918. // Our custom messages cannot bubble up an error using returns, instead
  1919. // they will panic with the specific error type, netlinkError. In that
  1920. // case, recover from the panic and return that as an error.
  1921. defer func() {
  1922. if r := recover(); r != nil {
  1923. if e, ok := r.(netlinkError); ok {
  1924. Err = e.error
  1925. } else {
  1926. panic(r)
  1927. }
  1928. }
  1929. }()
  1930. // write cloneFlags
  1931. r.AddData(&Int32msg{
  1932. Type: CloneFlagsAttr,
  1933. Value: uint32(cloneFlags),
  1934. })
  1935. // write custom namespace paths
  1936. if len(nsMaps) > 0 {
  1937. nsPaths, err := c.orderNamespacePaths(nsMaps)
  1938. if err != nil {
  1939. return nil, err
  1940. }
  1941. r.AddData(&Bytemsg{
  1942. Type: NsPathsAttr,
  1943. Value: []byte(strings.Join(nsPaths, ",")),
  1944. })
  1945. }
  1946. // write namespace paths only when we are not joining an existing user ns
  1947. _, joinExistingUser := nsMaps[configs.NEWUSER]
  1948. if !joinExistingUser {
  1949. // write uid mappings
  1950. if len(c.config.UidMappings) > 0 {
  1951. if c.config.RootlessEUID && c.newuidmapPath != "" {
  1952. r.AddData(&Bytemsg{
  1953. Type: UidmapPathAttr,
  1954. Value: []byte(c.newuidmapPath),
  1955. })
  1956. }
  1957. b, err := encodeIDMapping(c.config.UidMappings)
  1958. if err != nil {
  1959. return nil, err
  1960. }
  1961. r.AddData(&Bytemsg{
  1962. Type: UidmapAttr,
  1963. Value: b,
  1964. })
  1965. }
  1966. // write gid mappings
  1967. if len(c.config.GidMappings) > 0 {
  1968. b, err := encodeIDMapping(c.config.GidMappings)
  1969. if err != nil {
  1970. return nil, err
  1971. }
  1972. r.AddData(&Bytemsg{
  1973. Type: GidmapAttr,
  1974. Value: b,
  1975. })
  1976. if c.config.RootlessEUID && c.newgidmapPath != "" {
  1977. r.AddData(&Bytemsg{
  1978. Type: GidmapPathAttr,
  1979. Value: []byte(c.newgidmapPath),
  1980. })
  1981. }
  1982. if requiresRootOrMappingTool(c.config) {
  1983. r.AddData(&Boolmsg{
  1984. Type: SetgroupAttr,
  1985. Value: true,
  1986. })
  1987. }
  1988. }
  1989. }
  1990. if c.config.OomScoreAdj != nil {
  1991. // write oom_score_adj
  1992. r.AddData(&Bytemsg{
  1993. Type: OomScoreAdjAttr,
  1994. Value: []byte(strconv.Itoa(*c.config.OomScoreAdj)),
  1995. })
  1996. }
  1997. // write rootless
  1998. r.AddData(&Boolmsg{
  1999. Type: RootlessEUIDAttr,
  2000. Value: c.config.RootlessEUID,
  2001. })
  2002. // Bind mount source to open.
  2003. if it == initStandard && c.shouldSendMountSources() {
  2004. var mounts []byte
  2005. for _, m := range c.config.Mounts {
  2006. if m.IsBind() {
  2007. if strings.IndexByte(m.Source, 0) >= 0 {
  2008. return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
  2009. }
  2010. mounts = append(mounts, []byte(m.Source)...)
  2011. }
  2012. mounts = append(mounts, byte(0))
  2013. }
  2014. r.AddData(&Bytemsg{
  2015. Type: MountSourcesAttr,
  2016. Value: mounts,
  2017. })
  2018. }
  2019. return bytes.NewReader(r.Serialize()), nil
  2020. }
  2021. // ignoreTerminateErrors returns nil if the given err matches an error known
  2022. // to indicate that the terminate occurred successfully or err was nil, otherwise
  2023. // err is returned unaltered.
  2024. func ignoreTerminateErrors(err error) error {
  2025. if err == nil {
  2026. return nil
  2027. }
  2028. // terminate() might return an error from either Kill or Wait.
  2029. // The (*Cmd).Wait documentation says: "If the command fails to run
  2030. // or doesn't complete successfully, the error is of type *ExitError".
  2031. // Filter out such errors (like "exit status 1" or "signal: killed").
  2032. var exitErr *exec.ExitError
  2033. if errors.As(err, &exitErr) {
  2034. return nil
  2035. }
  2036. if errors.Is(err, os.ErrProcessDone) {
  2037. return nil
  2038. }
  2039. s := err.Error()
  2040. if strings.Contains(s, "Wait was already called") {
  2041. return nil
  2042. }
  2043. return err
  2044. }
  2045. func requiresRootOrMappingTool(c *configs.Config) bool {
  2046. gidMap := []configs.IDMap{
  2047. {ContainerID: 0, HostID: os.Getegid(), Size: 1},
  2048. }
  2049. return !reflect.DeepEqual(c.GidMappings, gidMap)
  2050. }