- The lvmlockd daemon uses LVM's internal libdaemon library to communicate with the command-line tools, providing distributed locking for LVM commands.
- The concrete lock implementations behind lvmlockd are sanlock (which depends on the wdmd service) and dlm (which depends on Corosync, Pacemaker, and fencing devices).
- lvmlockd must be used together with LVM's local locking (flock); it is an extension of the local locks to a cluster environment.
- The author of lvmlockd is also the original author of sanlock; he started the project to replace clvmd and give users a choice beyond dlm (judging from the source structure, however, there is no clean abstraction interface, so adding support for a third lock manager would be difficult; in my view this is not a good design).
- lvmlockd can coexist with lvmetad, while clvmd cannot (the reason is unknown; I have not been able to work out from the code why they cannot coexist).
- lvmlockd's functionality is not implemented through LVM's lock abstraction interface; the calls are inserted directly into the LVM source (again, not a good design in my view).
Upper-layer interface functions
The upper-layer interface functions let an LVM command establish a connection to the lvmlockd daemon and use the services it provides. Calls to these functions are scattered across every stage of an LVM command that needs a lock.
Connecting to and disconnecting from the service
$ cat daemons/lvmlockd/lvmlockd-client.h
...
#define LVMLOCKD_SOCKET DEFAULT_RUN_DIR "/lvmlockd.socket"

/* Wrappers to open/close connection */

static inline daemon_handle lvmlockd_open(const char *sock)
{
	daemon_info lvmlockd_info = {
		.path = "lvmlockd",
		.socket = sock ?: LVMLOCKD_SOCKET,
		.protocol = "lvmlockd",
		.protocol_version = 1,
		.autostart = 0
	};

	return daemon_open(lvmlockd_info);
}

static inline void lvmlockd_close(daemon_handle h)
{
	return daemon_close(h);
}
...
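A minimal sketch of a client using these wrappers (this program is not in the LVM tree; it assumes libdaemon's daemon_send_simple/daemon_reply client API and the "hello" request name that maps to LD_OP_HELLO; reply field names vary per request, so only the transport-level error is checked):

/* Hedged sketch: connect to lvmlockd, send a "hello" request,
 * and disconnect. Adjust the include path to your source tree. */
#include <stdio.h>
#include "daemons/lvmlockd/lvmlockd-client.h"

int main(void)
{
	daemon_handle h = lvmlockd_open(NULL);	/* NULL => LVMLOCKD_SOCKET */
	daemon_reply rep;

	/* daemon_handle fields per libdaemon's daemon-client.h */
	if (h.socket_fd < 0 || h.error) {
		fprintf(stderr, "cannot connect to lvmlockd\n");
		return 1;
	}

	/* key = value pairs would follow the request name, NULL-terminated */
	rep = daemon_send_simple(h, "hello", NULL);
	printf("hello sent, transport error = %d\n", rep.error);

	daemon_reply_destroy(rep);
	lvmlockd_close(h);
	return 0;
}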
Lock operations
$ cat lib/locking/lvmlockd.h
...
/* vgcreate/vgremove use init/free */
int lockd_init_vg(struct cmd_context *cmd, struct volume_group *vg, const char *lock_type, int lv_lock_count);
int lockd_free_vg_before(struct cmd_context *cmd, struct volume_group *vg, int changing);
void lockd_free_vg_final(struct cmd_context *cmd, struct volume_group *vg);

/* vgrename */
int lockd_rename_vg_before(struct cmd_context *cmd, struct volume_group *vg);
int lockd_rename_vg_final(struct cmd_context *cmd, struct volume_group *vg, int success);

/* start and stop the lockspace for a vg */
int lockd_start_vg(struct cmd_context *cmd, struct volume_group *vg, int start_init);
int lockd_stop_vg(struct cmd_context *cmd, struct volume_group *vg);
int lockd_start_wait(struct cmd_context *cmd);

/* locking */
int lockd_gl_create(struct cmd_context *cmd, const char *def_mode, const char *vg_lock_type);
int lockd_gl(struct cmd_context *cmd, const char *def_mode, uint32_t flags);
int lockd_vg(struct cmd_context *cmd, const char *vg_name, const char *def_mode, uint32_t flags, uint32_t *lockd_state);
int lockd_vg_update(struct volume_group *vg);
int lockd_lv_name(struct cmd_context *cmd, struct volume_group *vg, const char *lv_name, struct id *lv_id, const char *lock_args, const char *def_mode, uint32_t flags);
int lockd_lv(struct cmd_context *cmd, struct logical_volume *lv, const char *def_mode, uint32_t flags);

/* lvcreate/lvremove use init/free */
int lockd_init_lv(struct cmd_context *cmd, struct volume_group *vg, struct logical_volume *lv, struct lvcreate_params *lp);
int lockd_init_lv_args(struct cmd_context *cmd, struct volume_group *vg, struct logical_volume *lv, const char *lock_type, const char **lock_args);
int lockd_free_lv(struct cmd_context *cmd, struct volume_group *vg, const char *lv_name, struct id *lv_id, const char *lock_args);

const char *lockd_running_lock_type(struct cmd_context *cmd, int *found_multiple);

int handle_sanlock_lv(struct cmd_context *cmd, struct volume_group *vg);

int lockd_lv_uses_lock(struct logical_volume *lv);
...
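As a hedged sketch of how these entry points are sequenced, here is roughly what an lvcreate-like flow looks like. This is not the actual tools/ code; the real call sites are spread through tools/ and lib/, error paths are omitted, and 0 is passed where the real callers pass LD_* flags:

/* Hedged sketch of the call order for an lvcreate-like command. */
static int create_lv_sketch(struct cmd_context *cmd, struct volume_group *vg,
                            struct logical_volume *lv, struct lvcreate_params *lp)
{
	uint32_t lockd_state = 0;

	/* global lock: serializes changes to the set of VGs */
	if (!lockd_gl(cmd, "ex", 0))
		return 0;

	/* vg lock: serializes changes to this VG's metadata */
	if (!lockd_vg(cmd, vg->name, "ex", 0, &lockd_state))
		return 0;

	/* allocate lock state for the new LV in the VG's lockspace */
	if (!lockd_init_lv(cmd, vg, lv, lp))
		return 0;

	/* ... create the LV and commit the VG metadata here ... */

	/* lv lock: taken when the LV is activated or modified */
	if (!lockd_lv(cmd, lv, "ex", 0))
		return 0;

	return 1;
}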
Internal data structures
Lock scopes (objects)
- Global (GL)
- Volume group (VG)
- Logical volume (LV)
$ cat daemons/lvmlockd/lvmlockd-internal.h
...
/* resource types */
enum {
	LD_RT_GL = 1,
	LD_RT_VG,
	LD_RT_LV,
};
...
Lock modes (states)
- Invalid (IV)
- Unlocked (UN)
- Null (NL)
- Shared (SH)
- Exclusive (EX)
...
/* lock modes, more restrictive must be larger value */
enum {
	LD_LK_IV = -1,
	LD_LK_UN = 0,
	LD_LK_NL = 1,
	LD_LK_SH = 2,
	LD_LK_EX = 3,
};
...
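The ordering constraint in the comment is what makes mode checks cheap: because the modes are ranked by restrictiveness, a numeric comparison tells whether a held lock already satisfies a request. A minimal illustrative helper (mine, not in the source):

/* Illustrative only: LD_LK_UN < LD_LK_NL < LD_LK_SH < LD_LK_EX */
static int mode_covers(int8_t held, int8_t requested)
{
	return held >= requested;	/* e.g. LD_LK_EX covers LD_LK_SH */
}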
Lock manager types
Currently only dlm and sanlock are supported.
...
/* lock manager types */
enum {
	LD_LM_NONE = 0,
	LD_LM_UNUSED = 1, /* place holder so values match lib/locking/lvmlockd.h */
	LD_LM_DLM = 2,
	LD_LM_SANLOCK = 3,
};
...
Client management
There is one client struct per client connection.
...
struct list_head {
	struct list_head *next, *prev;
};

struct client {
	struct list_head list;
	pthread_mutex_t mutex;
	int pid;
	int fd;
	int pi;
	uint32_t id;
	unsigned int recv : 1;
	unsigned int dead : 1;
	unsigned int poll_ignore : 1;
	unsigned int lock_ops : 1;
	char name[MAX_NAME+1];
};
...
Actions (messages)
Each request from a client is unpacked into an action structure.
Types
...
/* operation types */
enum {
	LD_OP_HELLO = 1,
	LD_OP_QUIT,
	LD_OP_INIT,
	LD_OP_FREE,
	LD_OP_START,
	LD_OP_STOP,
	LD_OP_LOCK,
	LD_OP_UPDATE,
	LD_OP_CLOSE,
	LD_OP_ENABLE,
	LD_OP_DISABLE,
	LD_OP_START_WAIT,
	LD_OP_STOP_ALL,
	LD_OP_DUMP_INFO,
	LD_OP_DUMP_LOG,
	LD_OP_RENAME_BEFORE,
	LD_OP_RENAME_FINAL,
	LD_OP_RUNNING_LM,
	LD_OP_FIND_FREE_LOCK,
	LD_OP_KILL_VG,
	LD_OP_DROP_VG,
	LD_OP_BUSY,
};
...
Many of these operations are for internal use only; the action types produced by client requests are:
$ cat daemons/lvmlockd/lvmlockd-core.c
...
	case LD_OP_LOCK:
	case LD_OP_UPDATE:
	case LD_OP_ENABLE:
	case LD_OP_DISABLE:
	case LD_OP_FREE:
	case LD_OP_RENAME_BEFORE:
	case LD_OP_FIND_FREE_LOCK:
	case LD_OP_KILL_VG:
	case LD_OP_DROP_VG:
	case LD_OP_BUSY:
		rv = add_lock_action(act);
		break;
...
- What we care about most are the LD_OP_LOCK and LD_OP_UPDATE actions; these two lock operations change a resource's version number.
- The resource version is changed each time an exclusive (write) lock is released or an update action is performed; client commands use it to decide whether the LVM metadata has been modified.
- If the client is using the lvmetad metadata service, a version change causes the metadata to be rescanned from the local disks.
The resource version number:
$ cat daemons/lvmlockd/lvmlockd-internal.h
...
/* val_blk version */
#define VAL_BLK_VERSION 0x0101

/* val_blk flags */
#define VBF_REMOVED 0x0001

struct val_blk {
	uint16_t version;
	uint16_t flags;
	uint32_t r_version;
};
...
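The r_version field is what the LD_OP_LOCK/LD_OP_UPDATE actions manipulate. As a hedged illustration of the consumer side (the names here are mine; the real invalidation path goes through lockd_vg() and lvmetad), a cached copy of VG metadata can be invalidated by comparing version numbers:

/* Illustrative only: compare the r_version obtained with a vg lock
 * against a cached value to decide whether cached metadata is stale. */
struct cached_vg {
	uint32_t seen_r_version;
	int cache_valid;
};

static void check_vg_version(struct cached_vg *cvg, uint32_t r_version)
{
	if (cvg->seen_r_version != r_version) {
		/* another host released an ex lock after changing the
		 * metadata: rescan disks and refresh the cache */
		cvg->cache_valid = 0;
		cvg->seen_r_version = r_version;
	}
}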
Flags
...
#define LD_AF_PERSISTENT	0x00000001
#define LD_AF_NO_CLIENT		0x00000002
#define LD_AF_UNLOCK_CANCEL	0x00000004
#define LD_AF_NEXT_VERSION	0x00000008
#define LD_AF_WAIT		0x00000010
#define LD_AF_FORCE		0x00000020
#define LD_AF_EX_DISABLE	0x00000040
#define LD_AF_ENABLE		0x00000080
#define LD_AF_DISABLE		0x00000100
#define LD_AF_SEARCH_LS		0x00000200
#define LD_AF_WAIT_STARTING	0x00001000
#define LD_AF_DUP_GL_LS		0x00002000
#define LD_AF_ADOPT		0x00010000
#define LD_AF_WARN_GL_REMOVED	0x00020000
#define LD_AF_LV_LOCK		0x00040000
#define LD_AF_LV_UNLOCK		0x00080000
...
Structure
The client thread constructs an action by converting the ASCII-format libdaemon protocol message.
...
/*
 * Number of times to repeat a lock request after
 * a lock conflict (-EAGAIN) if unspecified in the
 * request.
 */
#define DEFAULT_MAX_RETRIES 4

struct action {
	struct list_head list;
	uint32_t client_id;
	uint32_t flags;			/* LD_AF_ */
	uint32_t version;
	uint64_t host_id;
	int8_t op;			/* operation type LD_OP_ */
	int8_t rt;			/* resource type LD_RT_ */
	int8_t mode;			/* lock mode LD_LK_ */
	int8_t lm_type;			/* lock manager: LM_DLM, LM_SANLOCK */
	int retries;
	int max_retries;
	int result;
	int lm_rv;			/* return value from lm_ function */
	char vg_uuid[64];
	char vg_name[MAX_NAME+1];
	char lv_name[MAX_NAME+1];
	char lv_uuid[MAX_NAME+1];
	char vg_args[MAX_ARGS+1];
	char lv_args[MAX_ARGS+1];
	char vg_sysid[MAX_NAME+1];
};
...
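Since the request arrives as libdaemon ASCII text, it helps to see what one looks like on the wire. The following is an illustrative reconstruction of a vg lock request: the request name comes from the daemon's op table, but the exact field set is inferred from the extract calls in lvmlockd-core.c and should be treated as approximate:

request = "lock_vg"
cmd = "lvcreate"
pid = 12345
mode = "ex"
vg_name = "vg0"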
Object management
A locked object is called a resource, and each resource can hold multiple locks. Resources are grouped into a lockspace, and the locks in different lockspaces are isolated from one another.
...
struct resource {
	struct list_head list;		/* lockspace.resources */
	char name[MAX_NAME+1];		/* vg name or lv name */
	int8_t type;			/* resource type LD_RT_ */
	int8_t mode;
	unsigned int sh_count;		/* number of sh locks on locks list */
	uint32_t version;
	uint32_t last_client_id;	/* last client_id to lock or unlock resource */
	unsigned int lm_init : 1;	/* lm_data is initialized */
	unsigned int adopt : 1;		/* temp flag in remove_inactive_lvs */
	unsigned int version_zero_valid : 1;
	unsigned int use_vb : 1;
	struct list_head locks;
	struct list_head actions;
	char lv_args[MAX_ARGS+1];
	char lm_data[0];		/* lock manager specific data */
};

#define LD_LF_PERSISTENT 0x00000001

struct lock {
	struct list_head list;		/* resource.locks */
	int8_t mode;			/* lock mode LD_LK_ */
	uint32_t version;
	uint32_t flags;			/* LD_LF_ */
	uint32_t client_id;		/* may be 0 for persistent or internal locks */
};

struct lockspace {
	struct list_head list;		/* lockspaces */
	char name[MAX_NAME+1];
	char vg_name[MAX_NAME+1];
	char vg_uuid[64];
	char vg_args[MAX_ARGS+1];	/* lock manager specific args */
	char vg_sysid[MAX_NAME+1];
	int8_t lm_type;			/* lock manager: LM_DLM, LM_SANLOCK */
	void *lm_data;
	uint64_t host_id;
	uint64_t free_lock_offset;	/* start search for free lock here */
	uint32_t start_client_id;	/* client_id that started the lockspace */

	pthread_t thread;		/* makes synchronous lock requests */
	pthread_cond_t cond;
	pthread_mutex_t mutex;
	unsigned int create_fail : 1;
	unsigned int create_done : 1;
	unsigned int thread_work : 1;
	unsigned int thread_stop : 1;
	unsigned int thread_done : 1;
	unsigned int sanlock_gl_enabled : 1;
	unsigned int sanlock_gl_dup : 1;
	unsigned int free_vg : 1;
	unsigned int kill_vg : 1;
	unsigned int drop_vg : 1;

	struct list_head actions;	/* new client actions */
	struct list_head resources;	/* resource/lock state for gl/vg/lv */
};
...
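To restate the ownership chain (a lockspace owns resources, a resource owns the locks granted on it), here is a small illustrative traversal. It is not from the source; it assumes the list_for_each_entry macro the daemon already uses, and in the real daemon these lists are only touched from the lockspace thread (a sketch like this would need ls->mutex held):

/* Illustrative dump of the in-memory hierarchy. */
static void dump_lockspace(struct lockspace *ls)
{
	struct resource *r;
	struct lock *lk;

	list_for_each_entry(r, &ls->resources, list) {
		printf("S %s R %s mode %d version %u\n",
		       ls->name, r->name, r->mode, r->version);

		list_for_each_entry(lk, &r->locks, list)
			printf("  LK client %u mode %d flags 0x%x\n",
			       lk->client_id, lk->mode, lk->flags);
	}
}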
Service model
- The main process polls the local socket; when a new client connects, it constructs a client struct and adds the client's fd to the poll set (requests from all clients are serviced by a single client thread).
- When the client thread receives a request, it converts it into an action; if the lockspace the action targets has no thread yet, a lockspace thread is created, and the action is forwarded to that lockspace thread.
- When the lockspace thread has finished the work, it hands the result back to the client thread, which returns it to the client.
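The bullets above compress a longer flow. A heavily condensed sketch of the lockspace thread's wait-and-process loop is given here; it is my own simplification built on the struct lockspace fields shown earlier (the real lockspace_thread_main() in lvmlockd-core.c also joins the lockspace in the lock manager first and handles many more cases):

/* Condensed sketch: wait for work, then let res_process() resolve
 * the queued actions against existing locks. */
static void *lockspace_thread_sketch(void *arg)
{
	struct lockspace *ls = arg;
	struct resource *r;
	int retry = 0;

	/* empty close list: no client-exit cleanups in this sketch */
	struct list_head act_close_list = { &act_close_list, &act_close_list };

	for (;;) {
		pthread_mutex_lock(&ls->mutex);
		while (!ls->thread_work && !ls->thread_stop)
			pthread_cond_wait(&ls->cond, &ls->mutex);
		if (ls->thread_stop) {
			pthread_mutex_unlock(&ls->mutex);
			break;	/* real code leaves the lockspace here */
		}
		ls->thread_work = 0;
		/* real code: move each queued act from ls->actions onto
		 * r->actions of the resource the act names */
		pthread_mutex_unlock(&ls->mutex);

		/* resolve outstanding actions against existing locks */
		list_for_each_entry(r, &ls->resources, list)
			res_process(ls, r, &act_close_list, &retry);
	}
	return NULL;
}

The daemon's own summary of this request flow follows: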
$ cat daemons/lvmlockd/lvmlockd-core.c
...
/*
 * Basic operation of lvmlockd
 *
 * lvmlockd main process runs main_loop() which uses poll().
 * poll listens for new connections from lvm commands and for
 * messages from existing connected lvm commands.
 *
 * lvm command starts and connects to lvmlockd.
 *
 * lvmlockd receives a connection request from command and adds a
 * 'struct client' to keep track of the connection to the command.
 * The client's fd is added to the set of fd's in poll().
 *
 * lvm command sends a lock request to lvmlockd.  The lock request
 * can be for the global lock, a vg lock, or an lv lock.
 *
 * lvmlockd main_loop/poll sees a message from an existing client.
 * It sets client.recv = 1, then wakes up client_thread_main.
 *
 * client_thread_main iterates through client structs (cl), looking
 * for any that need processing, finds the one with cl->recv set,
 * and calls client_recv_action(cl).
 *
 * client_recv_action(cl) reads the message/request from the client,
 * allocates a new 'struct action' (act) to represent the request,
 * sets the act with what is found in the request, then looks at
 * the specific operation in act->op (LD_OP_FOO) to decide what to
 * do with the action:
 *
 * . If the action is to start a lockspace, create a new thread
 *   to manage that lockspace: add_lockspace(act).
 *
 * . If the action is a lock request, pass the act to the thread
 *   that is managing that lockspace: add_lock_action(act).
 *
 * . Other misc actions are are passed to the worker_thread:
 *   add_work_action(act).
 *
 * Onec the client_thread has passed the action off to another
 * thread to process, it goes back to waiting for more client
 * handling work to do.
 *
 * The thread that was given the action by the client_thread
 * now processes that action according to the operation, act->op.
 * This is either a lockspace_thread (for lock ops or ops that
 * add/rem a lockspace), or the worker_thread.  See below for
 * how these ops are processed by these threads.  When the
 * given thread is done processing the action, the result is
 * set in act->result, and the act struct for the completed action
 * is passed back to the client_thread (client_results list).
 *
 * The client_thread takes completed actions (from client_results
 * list), and sends the result back to the client that sent the
 * request represented by the action.  The act struct is then freed.
 *
 * This completes the cycle of work between lvm commands (clients)
 * and lvmlockd.  In summary:
 *
 * - main process polls for new client connections and new requests
 *   from lvm commands
 * - client_thread reads requests from clients
 * - client_thread creates an action struct for each request
 * - client_thread passes the act to another thread for processing
 * - other threads pass completed act structs back to client_thread
 * - client_thread sends the act result back to the client and frees the act
 *
 *
 * Lockspace threads:
 * Each lockd VG has its own lockspace that contains locks for that VG.
 * Each 'struct lockspace' is managed by a separate lockspace_thread.
 * When the lockspace_thread is first created, the first thing it does
 * is join the lockspace in the lock manager.  This can take a long time.
 * If the join fails, the thread exits.  After the join, the thread
 * enters a loop waiting for lock actions to perform in the lockspace.
 *
 * The request to remove/leave a lockspace causes a flag to be set in
 * the lockspace struct.  When the lockspace_thread sees this flag
 * set, it leaves the lockspace, and exits.
 *
 * When the client_thread passes a new action to a lockspace_thread,
 * i.e. a new lock request, the lockspace_thread identifies which resource
 * is being locked (GL, VG, LV), and gets the 'struct resource' (r) for it.
 * r->type will be LD_RT_GL, LD_RT_VG, or LD_RT_LV.  r->name is the
 * resource name, and is fixed for GL and VG resources, but is based on
 * the LV name for LV resources.  The act is added to the resource's
 * list of actions: r->actions, i.e. outstanding lock requests on the
 * resource.
 *
 * The lockspace thread then iterates through each resource in the
 * lockspace, processing any outstanding actions on each: res_process(ls, r).
 *
 * res_process() compares the outstanding actions/requests in r->actions
 * against any existing locks on the resource in r->locks.  If the
 * action is blocked by existing locks, it's left on r->actions.  If not,
 * the action/request is passed to the lock manager.  If the result from
 * the lock manager is success, a new 'struct lock' is created for the
 * action and saved on r->locks.  The result is set in act->result and
 * the act is passed back to the client_thread to be returned to the client.
 */
...
The function with which the lockspace thread ultimately processes the actions queued on a lockspace is shown below:
...
/*
 * Go through queued actions, and make lock/unlock calls on the resource
 * based on the actions and the existing lock state.
 *
 * All lock operations sent to the lock manager are non-blocking.
 * This is because sanlock does not support lock queueing.
 * Eventually we could enhance this to take advantage of lock
 * queueing when available (i.e. for the dlm).
 *
 * act_close_list: list of CLOSE actions, identifying clients that have
 * closed/terminated their lvmlockd connection, and whose locks should
 * be released.  Do not remove these actions from act_close_list.
 *
 * retry_out: set to 1 if the lock manager said we should retry,
 * meaning we should call res_process() again in a short while to retry.
 */

static void res_process(struct lockspace *ls, struct resource *r,
			struct list_head *act_close_list, int *retry_out)
{
	struct action *act, *safe, *act_close;
	struct lock *lk;
	int lm_retry;
	int rv;

	/*
	 * handle version updates for ex locks
	 * (new version will be written by unlock)
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (act->op == LD_OP_UPDATE) {
			rv = res_update(ls, r, act);
			act->result = rv;
			list_del(&act->list);
			add_client_result(act);
		}
	}

	/*
	 * handle explicit unlock actions
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if ((act->op == LD_OP_LOCK) &&
		    (act->mode == LD_LK_IV || act->mode == LD_LK_NL)) {
			act->result = -EINVAL;
			list_del(&act->list);
			add_client_result(act);
		}

		if (act->op == LD_OP_LOCK && act->mode == LD_LK_UN) {
			rv = res_unlock(ls, r, act);

			if (rv == -ENOENT && (act->flags & LD_AF_UNLOCK_CANCEL))
				rv = res_cancel(ls, r, act);

			/*
			 * possible unlock results:
			 * 0: unlock succeeded
			 * -ECANCELED: cancel succeeded
			 * -ENOENT: nothing to unlock or cancel
			 */

			act->result = rv;
			list_del(&act->list);
			add_client_result(act);
		}
	}

	/*
	 * handle implicit unlocks due to client exit,
	 * also clear any outstanding actions for the client
	 */

	list_for_each_entry(act_close, act_close_list, list) {
		res_unlock(ls, r, act_close);
		res_cancel(ls, r, act_close);
	}

	/*
	 * handle freeing a lock for an lv that has been removed
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (act->op == LD_OP_FREE && act->rt == LD_RT_LV) {
			log_debug("S %s R %s free_lv", ls->name, r->name);
			rv = free_lv(ls, r);
			act->result = rv;
			list_del(&act->list);
			add_client_result(act);
			goto r_free;
		}
	}

	/*
	 * handle enable/disable
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (act->op == LD_OP_ENABLE || act->op == LD_OP_DISABLE) {
			rv = res_able(ls, r, act);
			act->result = rv;
			list_del(&act->list);
			add_client_result(act);

			if (!rv && act->op == LD_OP_DISABLE) {
				log_debug("S %s R %s free disabled", ls->name, r->name);
				goto r_free;
			}
		}
	}

	/*
	 * transient requests on existing transient locks
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (act->flags & LD_AF_PERSISTENT)
			continue;

		lk = find_lock_client(r, act->client_id);
		if (!lk)
			continue;

		if (lk->mode != act->mode) {
			/* convert below */
			/*
			act->result = -EEXIST;
			list_del(&act->list);
			add_client_result(act);
			*/
			continue;
		} else {
			/* success */
			r->last_client_id = act->client_id;
			act->result = -EALREADY;
			list_del(&act->list);
			add_client_result(act);
		}
	}

	/*
	 * persistent requests on existing persistent locks
	 *
	 * persistent locks are not owned by a client, so any
	 * existing with matching mode satisfies a request.
	 * only one persistent lock is kept on a resource.
	 * a single "unowned" persistent lock satisfies
	 * any/multiple client requests for a persistent lock.
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (!(act->flags & LD_AF_PERSISTENT))
			continue;

		lk = find_lock_persistent(r);
		if (!lk)
			continue;

		if (lk->mode != act->mode) {
			/* convert below */
			/*
			act->result = -EEXIST;
			list_del(&act->list);
			add_client_result(act);
			*/
			continue;
		} else {
			/* success */
			r->last_client_id = act->client_id;
			act->result = -EALREADY;
			list_del(&act->list);
			add_client_result(act);
		}
	}

	/*
	 * transient requests with existing persistent locks
	 *
	 * Just grant the transient request and do not
	 * keep a record of it.  Assume that the persistent
	 * lock will not go away while the transient lock
	 * is needed.
	 *
	 * This would be used when an ex, persistent lv lock
	 * exists from activation, and then something like
	 * lvextend asks for a transient ex lock to change
	 * the lv.  The lv could not be unlocked by deactivation
	 * while the lvextend was running.
	 *
	 * The logic here for mixing T/P locks is not general
	 * support; there are a number of cases where it will
	 * not work: updating version number (lv locks have
	 * none), ex locks from multiple clients will not
	 * conflict, explicit un of the transient lock will fail.
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (act->flags & LD_AF_PERSISTENT)
			continue;

		lk = find_lock_persistent(r);
		if (!lk)
			continue;

		if ((lk->mode == LD_LK_EX) ||
		    (lk->mode == LD_LK_SH && act->mode == LD_LK_SH)) {
			r->last_client_id = act->client_id;
			act->result = 0;
			list_del(&act->list);
			add_client_result(act);
		} else {
			/* persistent lock is sh, transient request is ex */
			/* FIXME: can we remove this case?  do a convert here? */
			log_debug("res_process %s existing persistent lock new transient",
				  r->name);
			r->last_client_id = act->client_id;
			act->result = -EEXIST;
			list_del(&act->list);
			add_client_result(act);
		}
	}

	/*
	 * persistent requests with existing transient locks
	 *
	 * If a client requests a P (persistent) lock for a T (transient)
	 * lock it already holds, we can just change T to P.  Fail if the
	 * same happens for locks from different clients.  Changing
	 * another client's lock from T to P may cause problems
	 * if that client tries to unlock or update version.
	 *
	 * I don't think this P/T combination will be used.
	 * It might be used if a command was able to take a P
	 * vg lock, in which case the T vg lock would already
	 * be held for reading.  If the T lock was sh, it would
	 * be converted to P ex.  If the T/P modes matched, the
	 * lock could just be changed from T to P.
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (!(act->flags & LD_AF_PERSISTENT))
			continue;

		lk = find_lock_client(r, act->client_id);
		if (!lk)
			continue;

		if (lk->mode != act->mode) {
			/* FIXME: convert and change to persistent? */
			log_debug("res_process %s existing transient lock new persistent",
				  r->name);
			r->last_client_id = act->client_id;
			act->result = -EEXIST;
			list_del(&act->list);
			add_client_result(act);
		} else {
			r->last_client_id = act->client_id;
			lk->flags |= LD_LF_PERSISTENT;
			lk->client_id = 0;
			act->result = 0;
			list_del(&act->list);
			add_client_result(act);
		}
	}

	/*
	 * convert mode of existing locks
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (act->flags & LD_AF_PERSISTENT)
			lk = find_lock_persistent(r);
		else
			lk = find_lock_client(r, act->client_id);

		if (!lk)
			continue;

		if (lk->mode == act->mode) {
			/* should never happen, should be found above */
			log_error("convert same mode");
			continue;
		}

		/* convert fails immediately, no EAGAIN retry */
		rv = res_convert(ls, r, lk, act);
		act->result = rv;
		list_del(&act->list);
		add_client_result(act);
	}

	/*
	 * Cases above are all requests addressed by existing locks.
	 * Below handles the rest.  Transient and persistent are
	 * handled the same, except
	 * - if mode of existing lock is incompat with requested,
	 *   leave the act on r->actions
	 * - if r mode is EX, any lock action is blocked, just quit
	 *
	 * Retry a lock request that fails due to a lock conflict (-EAGAIN):
	 * if we have not exceeded max retries and lm sets lm_retry (sanlock
	 * transient conflicts from shared lock implementation), or r type
	 * is gl or vg (transient real conflicts we want to hide from command).
	 * lv lock conflicts won't be transient so don't retry them.
	 */

	if (r->mode == LD_LK_EX)
		return;

	/*
	 * r mode is SH or UN, pass lock-sh actions to lm
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		/* grant in order, so break here */
		if (act->op == LD_OP_LOCK && act->mode == LD_LK_EX)
			break;

		if (act->op == LD_OP_LOCK && act->mode == LD_LK_SH) {
			lm_retry = 0;

			rv = res_lock(ls, r, act, &lm_retry);
			if ((rv == -EAGAIN) &&
			    (act->retries <= act->max_retries) &&
			    (lm_retry || (r->type != LD_RT_LV))) {
				/* leave act on list */
				log_debug("S %s R %s res_lock EAGAIN retry", ls->name, r->name);
				act->retries++;
				*retry_out = 1;
			} else {
				act->result = rv;
				list_del(&act->list);
				add_client_result(act);
			}
			if (rv == -EUNATCH)
				goto r_free;
		}
	}

	/*
	 * r mode is SH, any ex lock action is blocked, just quit
	 */

	if (r->mode == LD_LK_SH)
		return;

	/*
	 * r mode is UN, pass lock-ex action to lm
	 */

	list_for_each_entry_safe(act, safe, &r->actions, list) {
		if (act->op == LD_OP_LOCK && act->mode == LD_LK_EX) {
			lm_retry = 0;

			rv = res_lock(ls, r, act, &lm_retry);
			if ((rv == -EAGAIN) &&
			    (act->retries <= act->max_retries) &&
			    (lm_retry || (r->type != LD_RT_LV))) {
				/* leave act on list */
				log_debug("S %s R %s res_lock EAGAIN retry", ls->name, r->name);
				act->retries++;
				*retry_out = 1;
			} else {
				act->result = rv;
				list_del(&act->list);
				add_client_result(act);
			}
			if (rv == -EUNATCH)
				goto r_free;
			break;
		}
	}

	return;

r_free:
	/* For the EUNATCH case it may be possible there are queued actions? */
	list_for_each_entry_safe(act, safe, &r->actions, list) {
		log_error("S %s R %s res_process r_free cancel %s client %d",
			  ls->name, r->name, op_str(act->op), act->client_id);
		act->result = -ECANCELED;
		list_del(&act->list);
		add_client_result(act);
	}
	log_debug("S %s R %s res_process free", ls->name, r->name);
	lm_rem_resource(ls, r);
	list_del(&r->list);
	free_resource(r);
}
...