You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
[root@CentOS1 ~]$ gluster volume set help |grep cluster.read-hash-mode -A7
Option: cluster.read-hash-mode
Default Value: 1
Description: inode-read fops happen only on one of the bricks in replicate. AFR will prefer the one computed using the method specified using this option.
0 = first readable child of AFR, starting from 1st child.
1 = hash by GFID of file (all clients use same subvolume).
2 = hash by GFID of file and client PID.
3 = brick having the least outstanding read requests.
4 = brick having the least network ping latency.
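// When priv->hash_mode is 1 or 2, the child is chosen by running SuperFastHash over the gfid or the pid and taking the result modulo priv->child_count. Because the choice depends only on the gfid or the pid, read/write I/O may end up unevenly distributed across the bricks.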
/*
 * Pick the preferred read child of a replicate (AFR) volume according to
 * priv->hash_mode, i.e. the cluster.read-hash-mode option.
 *
 * args     - read-subvolume arguments; args->gfid and args->ia_type feed the
 *            hash-based policies.
 * priv     - AFR private state; supplies hash_mode and child_count.
 * readable - per-child readability flags; only forwarded to the load- and
 *            latency-based helpers, the hash-based policies do not look at it.
 *
 * Returns the selected child index, or -1 when the policy makes no choice
 * (AFR_READ_POLICY_FIRST_UP, an unrecognised mode, or a helper that finds no
 * candidate).
 *
 * NOTE(review): the modulo assumes priv->child_count > 0 - confirm that the
 * translator cannot be initialised with zero children.
 */
int afr_hash_child(afr_read_subvol_args_t *args, afr_private_t *priv,
                   unsigned char *readable)
{
    uuid_t gfid_copy = {
        0,
    };
    pid_t pid;
    int child = -1;

    switch (priv->hash_mode) {
        case AFR_READ_POLICY_FIRST_UP:
            /* No preference here; -1 is returned. */
            break;

        /*
         * Hash the GFID and take it modulo the child count to pick one
         * rep_vol-client-XXX under this rep_vol-replicate-XXX.
         *
         * Author's note: the test setup is a 3x3 replicated volume with
         * rep_vol-replicate-0..2, each holding three rep_vol-client-XXX
         * children. This function only decides which client inside a single
         * replicate set is used.
         */
        case AFR_READ_POLICY_GFID_HASH:
            gf_uuid_copy(gfid_copy, args->gfid);
            child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
                    priv->child_count;
            break;

        /* Hash the GFID mixed with the PID of the calling (client) process. */
        case AFR_READ_POLICY_GFID_PID_HASH:
            /*
             * Seed the buffer with the GFID so that it really takes part in
             * the hash. Without this the buffer holds only zeros (plus the
             * PID), so every file from one client - and every directory from
             * any client - would land on the same child.
             */
            gf_uuid_copy(gfid_copy, args->gfid);
            if (args->ia_type != IA_IFDIR) {
                /*
                 * Why getpid?  Because it's one of the cheapest calls
                 * available - faster than gethostname etc. - and
                 * returns a constant-length value that's sure to be
                 * shorter than a UUID. It's still very unlikely to be
                 * the same across clients, so it still provides good
                 * mixing.  We're not trying for perfection here.  All we
                 * need is a low probability that multiple clients
                 * will converge on the same subvolume.
                 */
                pid = getpid();
                /*
                 * Only the leading sizeof(pid) bytes are overwritten; the
                 * remaining GFID bytes still contribute to the hash.
                 */
                memcpy(gfid_copy, &pid, sizeof(pid));
            }
            child = SuperFastHash((char *)gfid_copy, sizeof(gfid_copy)) %
                    priv->child_count;
            break;

        /* Child with the fewest outstanding read requests. */
        case AFR_READ_POLICY_LESS_LOAD:
            child = afr_least_pending_reads_child(priv, readable);
            break;

        /* Child with the lowest network latency. */
        case AFR_READ_POLICY_LEAST_LATENCY:
            child = afr_least_latency_child(priv, readable);
            break;

        /*
         * Child with the smallest (pending reads x latency) product.
         * NOTE(review): the rationale for this particular weighting is not
         * documented here.
         */
        case AFR_READ_POLICY_LOAD_LATENCY_HYBRID:
            child = afr_least_latency_times_pending_reads_child(priv, readable);
            break;

        default:
            /* Unknown policy: report "no preference". */
            break;
    }

    return child;
}
/*
 * Select the readable child whose latency, weighted by its current read
 * load, is smallest.
 *
 * priv     - AFR private state; child_count, pending_reads[] and
 *            child_latency[] are read.
 * readable - per-child flags; a child whose flag is zero is never selected.
 *
 * Returns the index of the chosen child, or -1 if no child qualifies.
 */
static int32_t afr_least_latency_times_pending_reads_child(afr_private_t *priv, unsigned char *readable)
{
    int32_t i = 0;
    int32_t child = -1;
    int64_t pending_read = 0;
    int64_t latency = -1;
    int64_t least_latency = -1;

    for (i = 0; i < priv->child_count; i++) {
        /*
         * Skip children that cannot serve reads.
         * NOTE(review): a negative child_latency is assumed to mean "not
         * measured"; left in, it would yield a negative product and win the
         * comparison below - confirm against where child_latency is set
         * (the original note says afr_notify).
         */
        if (!readable[i] || priv->child_latency[i] < 0) {
            continue;
        }

        /* Counter of reads currently pending on child i (per the original note). */
        pending_read = GF_ATOMIC_GET(priv->pending_reads[i]);

        /* +1 so that an idle child is still ranked by its latency, not by 0. */
        latency = (pending_read + 1) * priv->child_latency[i];

        if (child == -1 || latency < least_latency) {
            least_latency = latency;
            child = i;
        }
    }

    return child;
}