LiveRestore 特性从 Docker Engine 1.12 版本开始支持（不支持 Windows），在 dockerd 停止时保证已启动的 Running 容器持续运行，并在 daemon 进程启动后重新接管。

Docker LiveRestore 特性分析

本文基于 Docker Version 1.13.1 进行代码分析

Live Restore 是什么？

该特性从 Docker Engine 1.12 版本开始支持（不支持 Windows），主要且唯一的作用就是：在 dockerd 停止时保证已启动的 Running 容器持续运行，并在 daemon 进程启动后重新接管。

几个应用场景：

dockerd 异常退出时保证业务容器持续运行
升级 Docker Engine 时（不跨 Release 版本）保证业务容器持续运行

注意事项（不建议使用的场景）

跨版本升级时，不建议使用：在 Docker Engine 跨版本升级时启用可能因为代码变动而导致新版本 Engine 启动后不接管先前的容器
调整 daemon 启动参数时，不建议使用：例如调整 Graph 等将导致 DockerRepository/containers 目录变更从而丢失原始容器信息进而无法接管先前的容器

Docker daemon 实现

在 docker/daemon/config_unix.go 中提供了 --live-restore 参数并针对于 Swarm 模式的兼容性进行判断（注：LiveRestore 不能在 Swarm 模式下工作）。

// InstallFlags registers daemon command-line flags on the given flag set.
// Only the --live-restore flag is shown in this excerpt: it is a boolean bound
// to config.LiveRestoreEnabled and defaults to false.
func (config *Config) InstallFlags(flags *pflag.FlagSet) {
    // Other flags are omitted from this excerpt.
    flags.BoolVar(&config.LiveRestoreEnabled, "live-restore", false, "Enable live restore of docker when containers are still running")
}

// isSwarmCompatible reports whether the daemon configuration can be used
// together with swarm mode. It returns nil when live restore is disabled and
// an error when it is enabled.
func (config *Config) isSwarmCompatible() error {
	if !config.LiveRestoreEnabled {
		return nil
	}
	return fmt.Errorf("--live-restore daemon configuration is incompatible with swarm mode")
}

随后在 docker/cmd/dockerd/daemon.go 中将该参数通过 containerd.RemoteOption 作为 containerdRemote 初始化的参数

// start is shown here only in part: this excerpt covers creating the
// libcontainerd remote and passing it to daemon.NewDaemon. Everything else in
// the function, including its final return, is omitted.
func (cli *DaemonCli) start(opts daemonOptions) (err error) {
    // A large amount of setup code is omitted here.
    // The options returned by cli.getPlatformRemoteOptions() are expected to
    // carry the live-restore setting (their construction is not shown here).
    containerdRemote, err := libcontainerd.New(cli.getLibcontainerdRoot(), cli.getPlatformRemoteOptions()...)
	if err != nil {
		return err
	}

    // ...
    // The remote created above is now put to use: it is handed to
    // daemon.NewDaemon, which builds the actual Docker daemon.
    d, err := daemon.NewDaemon(cli.Config, registryService, containerdRemote)
	if err != nil {
		return fmt.Errorf("Error starting daemon: %v", err)
	}
}

containerd.Remote 初始化细节

初始化 containerdRemote 的时候会对所有的 Option 进行 Apply，其中 LiveRestore 的特性会加载到当前 Remote 的所有 Client 中

docker/libcontainerd/remote_unix.go

// Client creates a new client for backend b, registers it with the remote and
// returns it. The client inherits the remote's current liveRestore setting.
// The returned error is always nil.
func (r *remote) Client(b Backend) (Client, error) {
	common := clientCommon{
		backend:    b,
		containers: make(map[string]*container),
		locker:     locker.New(),
	}
	created := &client{
		clientCommon:  common,
		remote:        r,
		exitNotifiers: make(map[string]*exitNotifier),
		liveRestore:   r.liveRestore,
	}

	// Register the new client under the remote's lock.
	r.Lock()
	defer r.Unlock()
	r.clients = append(r.clients, created)
	return created, nil
}

// WithLiveRestore returns a RemoteOption that sets whether containers are
// restored (true) or stopped (false) by the remote's clients.
func WithLiveRestore(v bool) RemoteOption {
	var opt liveRestore = liveRestore(v)
	return opt
}

// liveRestore is the RemoteOption implementation behind WithLiveRestore.
type liveRestore bool

// Apply stores the option on r and on every client already registered with it.
// Keeping the value on the remote lets clients created later pick it up as
// well. It returns an error when r is not a *remote.
func (l liveRestore) Apply(r Remote) error {
	rem, ok := r.(*remote)
	if !ok {
		return fmt.Errorf("WithLiveRestore option not supported for this remote")
	}

	enabled := bool(l)
	rem.liveRestore = enabled
	for i := range rem.clients {
		rem.clients[i].liveRestore = enabled
	}
	return nil
}

daemon.NewDaemon 做了什么

// NewDaemon sets up everything for the daemon to be able to service
// requests from the webserver.
func NewDaemon(config *Config, registryService registry.Service, containerdRemote libcontainerd.Remote) (daemon *Daemon, err error) {

从这段注释来看，启动了各种各样所需要的东西让 daemon 可以开始工作，继续看这段函数

	d.nameIndex = registrar.NewRegistrar()
	d.linkIndex = newLinkIndex()
	// This is the containerdRemote that was passed in as a parameter.
	d.containerdRemote = containerdRemote

	...

	// Register the daemon with the remote as a new client and keep the
	// returned client handle on the daemon.
	d.containerd, err = containerdRemote.Client(d)
	if err != nil {
		return nil, err
	}

	// Restore starts here: per the article, it recovers or cleans up whatever
	// state was left behind by the previous daemon.
	if err := d.restore(); err != nil {
		return nil, err
	}

Restore 具体实现

daemon.restore

docker/daemon/daemon.go

// restore loads containers from the daemon's repository directory. In this
// excerpt it reads every directory entry, loads it as a container, and keeps
// only the containers whose graph driver matches the current one. The rest of
// the function is omitted.
func (daemon *Daemon) restore() error {
	var (
		currentDriver = daemon.GraphDriverName()
		containers    = make(map[string]*container.Container)
	)

	logrus.Info("Loading containers: start.")

	dir, err := ioutil.ReadDir(daemon.repository)
	if err != nil {
		return err
	}

	for _, v := range dir {
		// Each directory entry name is treated as a container ID.
		id := v.Name()
		container, err := daemon.load(id)
		if err != nil {
			// A container that fails to load is logged and skipped; it does
			// not abort the restore.
			logrus.Errorf("Failed to load container %v: %v", id, err)
			continue
		}
		// Ignore the container if it does not support the current driver being used by the graph.
		// A container with no recorded driver is accepted only when the current driver is "aufs".
		if (container.Driver == "" && currentDriver == "aufs") || container.Driver == currentDriver {
			// Set up the RWLayer (code omitted in this excerpt).
			logrus.Debugf("Loaded container %v", container.ID)

			containers[container.ID] = container
		} else {
			logrus.Debugf("Cannot load container %s because it was created with another graph driver.", container.ID)
		}
    }
    ...

}

首先从 dockerd 启动的参数 --graph（默认是 /var/lib/docker）所对应的目录下的 containers 目录读取当前所有的容器 ID，根据存储类型选择，忽略掉当前 daemon 设置的存储驱动不支持的容器。然后将上面筛选出的容器注册到 daemon。

接下来是重头戏，开始对容器进行恢复操作

// Restore every loaded container, one goroutine per container. wg tracks the
// goroutines; waiting on it happens outside this excerpt.
for _, c := range containers {
    wg.Add(1)
    go func(c *container.Container) {
        defer wg.Done()
        // Running or paused containers are handed back to containerd.
        if c.IsRunning() || c.IsPaused() {
            // Delegates to the libcontainerd client's Restore method.
            if err := daemon.containerd.Restore(c.ID, c.InitializeStdio); err != nil {
                logrus.Errorf("Failed to restore %s with containerd: %s", c.ID, err)
                return
            }
        }
        if !c.IsRunning() && !c.IsPaused() {
            // Handling of non-running containers is omitted in this excerpt.
        }
        if c.RemovalInProgress {
            // Handling of containers being removed is omitted in this excerpt.
        }
    }(c) // The function literal must be called; the container is passed explicitly so each goroutine works on its own value.
}

docker/libcontainerd/client_linux.go
// Restore is called for a container that the daemon believes is running or
// paused. In this excerpt it looks up the container and its last event in
// containerd and, when live restore is enabled on the client, re-attaches to
// the container via clnt.restore. The stop/exit paths are omitted.
func (clnt *client) Restore(containerID string, attachStdio StdioCallback, options ...CreateOption) error {
	// Synchronize with live events
	clnt.remote.Lock()
	defer clnt.remote.Unlock()
	// Check that containerd still knows this container.
	//
	// In the unlikely event that Restore for this container processes
	// its past event before the main loop, the event will be
	// processed twice. However, this is not an issue as all those
	// events will do is change the state of the container to be
	// exactly the same.
	cont, err := clnt.getContainerdContainer(containerID)
	// Get its last event
	ev, eerr := clnt.getContainerLastEvent(containerID)
	if err != nil || cont.Status == "Stopped" {
        // The lookup failed or containerd reports the container as Stopped:
        // the exit (stop) handling goes here (omitted in this excerpt).

    }

    // Per the article, containers that reach this point are Running or Paused.

    // With live restore enabled, re-attach to the container. A failure is
    // only logged; Restore still returns nil.
    if clnt.liveRestore {
        if err := clnt.restore(cont, ev, attachStdio, options...); err != nil {
			logrus.Errorf("libcontainerd: error restoring %s: %v", containerID, err)
		}
		return nil
    }

    // Otherwise the container is killed:
    // the exit (stop) handling goes here (omitted in this excerpt).
}

具体到某个容器的 restore 由 containerd.Client 执行

// restore re-attaches the client to a container that containerd already
// knows about. It creates the client-side container record, opens the
// container's FIFOs, hands the stdio pipes to attachStdio, registers the
// container with the client, and reports a StateRestore change to the
// backend. If lastEvent is a pause or resume event, that state is reported as
// well. On any error the container is removed from the client's list and the
// FIFO context is cancelled by the deferred functions.
func (clnt *client) restore(cont *containerd.Container, lastEvent *containerd.Event, attachStdio StdioCallback, options ...CreateOption) (err error) {
	clnt.lock(cont.Id)
	defer clnt.unlock(cont.Id)

	logrus.Debugf("libcontainerd: restore container %s state %s", cont.Id, cont.Status)

	containerID := cont.Id
	// A successful lookup means the client already tracks this container.
	if _, err := clnt.getContainer(containerID); err == nil {
		return fmt.Errorf("container %s is already active", containerID)
	}

	defer func() {
		// If restore ends with an error, stop tracking the container: drop it
		// from this client's container list (per the article, this does not
		// delete the container itself).
		if err != nil {
			clnt.deleteContainer(cont.Id)
		}
	}()

	container := clnt.newContainer(cont.BundlePath, options...)
	container.systemPid = systemPid(cont)

	// Find the process entry whose Pid equals InitFriendlyName and record
	// whether it uses a terminal.
	var terminal bool
	for _, p := range cont.Processes {
		if p.Pid == InitFriendlyName {
			terminal = p.Terminal
		}
	}

	// Open the container's FIFOs (per the article, these pipes are used for
	// log collection). The context is cancelled only if restore fails.
	fifoCtx, cancel := context.WithCancel(context.Background())
	defer func() {
		if err != nil {
			cancel()
		}
	}()

	iopipe, err := container.openFifos(fifoCtx, terminal)
	if err != nil {
		return err
	}
	// Wrap stdin so that the underlying Close is invoked at most once.
	var stdinOnce sync.Once
	stdin := iopipe.Stdin
	iopipe.Stdin = ioutils.NewWriteCloserWrapper(stdin, func() error {
		var err error
		stdinOnce.Do(func() { // on error from attach we don't know if stdin was already closed
			err = stdin.Close()
		})
		return err
	})

	// Hand the stdio pipes to the caller-supplied attach callback.
	if err := attachStdio(*iopipe); err != nil {
		container.closeFifos(iopipe)
		return err
	}

	// Everything succeeded so far: the client now tracks this container.
	clnt.appendContainer(container)

	// Notify the backend that the container has been restored.
	err = clnt.backend.StateChanged(containerID, StateInfo{
		CommonStateInfo: CommonStateInfo{
			State: StateRestore,
			Pid:   container.systemPid,
		}})

	if err != nil {
		container.closeFifos(iopipe)
		return err
	}

	// A pending event exists: report its state so the backend stays consistent.
	if lastEvent != nil {
		// This should only be a pause or resume event
		if lastEvent.Type == StatePause || lastEvent.Type == StateResume {
			return clnt.backend.StateChanged(containerID, StateInfo{
				CommonStateInfo: CommonStateInfo{
					State: lastEvent.Type,
					Pid:   container.systemPid,
				}})
		}

		logrus.Warnf("libcontainerd: unexpected backlog event: %#v", lastEvent)
	}

	return nil
}

Daemon 退出时的特殊处理

在 dockerd 启动方法 docker/cmd/dockerd/daemon.go -> DaemonCli.start 最后处理了退出的场景

// start is shown here only in part: this excerpt covers the shutdown sequence
// at the end of the function. Once the API server result arrives, the daemon
// is shut down and the containerd remote is cleaned up.
func (cli *DaemonCli) start(opts daemonOptions) (err error) {
    // ...
	// Wait for serve API to complete
	errAPI := <-serveAPIWait
	// c is defined in the omitted part of the function.
	c.Cleanup()
    // Shut down the daemon.
	shutdownDaemon(d)
    // Clean up the containerd remote (per the article: disconnect from containerd).
	containerdRemote.Cleanup()
	if errAPI != nil {
		return fmt.Errorf("Shutting down due to ServeAPI error: %v", errAPI)
	}
    return nil
}


// shutdownDaemon runs d.Shutdown() in a separate goroutine and waits for it
// to finish. With a negative shutdown timeout it waits indefinitely;
// otherwise it gives up after that many seconds and logs a forced shutdown.
func shutdownDaemon(d *daemon.Daemon) {
	timeout := d.ShutdownTimeout()

	done := make(chan struct{})
	go func() {
		d.Shutdown()
		close(done)
	}()

	// A nil channel never becomes ready, so with a negative timeout the
	// select below can only complete through the done case.
	var expired <-chan time.Time
	if timeout >= 0 {
		expired = time.After(time.Duration(timeout) * time.Second)
	}

	select {
	case <-done:
		logrus.Debug("Clean shutdown succeeded")
	case <-expired:
		logrus.Error("Force shutdown daemon")
	}
}



// Shutdown stops the daemon. In this excerpt it marks the daemon as shutting
// down and, when live restore is enabled, returns early without cleanup if
// the container list is non-empty or cannot be obtained. The cleanup code
// itself is omitted.
func (daemon *Daemon) Shutdown() error {
	daemon.shutdown = true
	// Keep mounts and networking running on daemon shutdown if
	// we are to keep containers running and restore them.

	// With live restore enabled, the cleanup below is skipped.
	if daemon.configStore.LiveRestoreEnabled && daemon.containers != nil {
		// check if there are any running containers, if none we should do some cleanup
		// (a listing error is treated like a non-empty list: return without cleanup)
		if ls, err := daemon.Containers(&types.ContainerListOptions{}); len(ls) != 0 || err != nil {
			return nil
		}
	}
    // .....
    // Clean up mounts (omitted in this excerpt).
    // Clean up networking (omitted in this excerpt).
}

至此，dockerd 的 LiveRestore 特性相关代码已经分析完成。

隐藏点

std 日志 buffer 问题

containerd-shim 中的应用服务向 FIFO Pipe 输出日志（如果 dockerd 活着就会从这里面取出日志），buffer 的大小受限于 Pipe Size。在日志 buffer 满了之后将阻塞容器内日志输出

systemd

在设置 KillMode 为 control-group 或 mixed 时，containerd-shim 会被 kill 掉但容器内的进程还在，这些旧进程要等到 dockerd 恢复启动时才会被回收。因此要确保 KillMode 设置为 process（只 kill 主进程）。

升级的时候应该怎么做？

假定在 Kubernetes 场景下

跨 Release 版本

例如从 17.03 -> 17.06，这种属于跨 Release 版本升级的操作，建议操作前先将节点置为不可调度，让上面的业务容器漂移到别的节点上。然后手动将该节点上的其他容器关闭，接着进行升级操作

小版本升级

例如从 17.03.0 -> 17.03.2，保险起见，先将节点置为不可调度，而后直接升级

参考

Configure the delivery mode of log messages from container to log driver