Merge pull request #1039 from d2l-ai/master
Release v2.0.0-beta0
astonzhang committed Dec 9, 2021
2 parents 2d9071c + 2464143 commit e51c411
Showing 105 changed files with 2,859 additions and 1,668 deletions.
87 changes: 0 additions & 87 deletions Jenkinsfile_origin

This file was deleted.

2 changes: 1 addition & 1 deletion README.md
@@ -2,7 +2,7 @@

[![Build Status](http://ci.d2l.ai/job/d2l-zh/job/master/badge/icon)](http://ci.d2l.ai/job/d2l-zh/job/master/)

-[第一版:zh-v1.D2L.ai](https://zh-v1.d2l.ai/) | [第二版预览版:zh-v2.D2L.ai](https://zh-v2.d2l.ai) | 安装和使用书中源代码:[第一版](https://zh-v1.d2l.ai/chapter_prerequisite/install.html) [第二版](https://zh-v2.d2l.ai/chapter_installation/index.html) | 当前版本: v2.0.0-alpha2
+[第一版:zh-v1.D2L.ai](https://zh-v1.d2l.ai/) | [第二版预览版:zh.D2L.ai](https://zh.d2l.ai) | 安装和使用书中源代码:[第一版](https://zh-v1.d2l.ai/chapter_prerequisite/install.html) [第二版](https://zh.d2l.ai/chapter_installation/index.html) | 当前版本: v2.0.0-alpha2

<h5 align="center"><i>理解深度学习的最佳方法是学以致用。</i></h5>

2 changes: 1 addition & 1 deletion chapter_appendix-tools-for-deep-learning/aws.md
@@ -202,4 +202,4 @@ jupyter notebook
1. 尝试使用不同的GPU服务器。它们有多快?
1. 尝试使用多GPU服务器。你能把事情扩大到什么程度?

-[Discussions](https://discuss.d2l.ai/t/423)
+[Discussions](https://discuss.d2l.ai/t/5733)
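
Aside (not from the commit): for the first exercise in the hunk above — comparing how fast different GPU servers are — a minimal timing sketch is given below. The choice of PyTorch, the matrix size, and the iteration count are illustrative assumptions, not part of the book's code.

```python
import time
import torch

# Use a GPU if one is present; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# A large matrix multiplication serves as a crude throughput probe.
A = torch.randn(4096, 4096, device=device)
B = torch.randn(4096, 4096, device=device)

_ = A @ B  # warm-up, so one-time initialization does not skew the timing
if device.type == 'cuda':
    torch.cuda.synchronize()

start = time.time()
for _ in range(10):
    _ = A @ B
if device.type == 'cuda':
    torch.cuda.synchronize()  # wait for queued GPU kernels to finish
print(f'{device}: {(time.time() - start) / 10:.4f} s per 4096x4096 matmul')
```

Running the same script on different instance types gives a rough single-kernel comparison; real workloads should still be profiled end to end.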
2 changes: 1 addition & 1 deletion chapter_appendix-tools-for-deep-learning/contributing.md
@@ -157,4 +157,4 @@ git push
1. 如果发现任何需要改进的地方(例如,缺少引用),请提交Pull请求。
1. 通常更好的做法是使用新分支创建Pull请求。学习如何用[Git分支](https://git-scm.com/book/en/v2/Git-Branching-Branches-in-a-Nutshell)来做这件事。

-[Discussions](https://discuss.d2l.ai/t/426)
+[Discussions](https://discuss.d2l.ai/t/5730)
2 changes: 1 addition & 1 deletion chapter_appendix-tools-for-deep-learning/jupyter.md
@@ -109,4 +109,4 @@ jupyter nbextension enable execute_time/ExecuteTime
1. 使用Jupyter Notebook通过端口转发来远程编辑和运行本书中的代码。
1. 对于两个方矩阵,测量$\mathbf{A}^\top \mathbf{B}$与$\mathbf{A} \mathbf{B}$在$\mathbb{R}^{1024 \times 1024}$中的运行时间。哪一个更快?

-[Discussions](https://discuss.d2l.ai/t/421)
+[Discussions](https://discuss.d2l.ai/t/5731)
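
Aside (not from the commit): for the timing exercise in the hunk above ($\mathbf{A}^\top \mathbf{B}$ versus $\mathbf{A} \mathbf{B}$ in $\mathbb{R}^{1024 \times 1024}$), a minimal sketch, assuming PyTorch as the framework:

```python
import timeit
import torch

A = torch.randn(1024, 1024)
B = torch.randn(1024, 1024)

# Average each product over many repetitions; on most BLAS backends the
# transposed variant costs about the same, since A^T is handled inside the kernel.
t_transposed = timeit.timeit(lambda: A.T @ B, number=100) / 100
t_plain = timeit.timeit(lambda: A @ B, number=100) / 100
print(f'A^T B: {t_transposed:.6f} s per call, A B: {t_plain:.6f} s per call')
```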
2 changes: 1 addition & 1 deletion chapter_appendix-tools-for-deep-learning/sagemaker.md
@@ -112,4 +112,4 @@ git pull
1. 使用Amazon SageMaker编辑并运行任何需要GPU的部分。
1. 打开终端以访问保存本书所有notebooks的本地目录。

-[Discussions](https://discuss.d2l.ai/t/422)
+[Discussions](https://discuss.d2l.ai/t/5732)
6 changes: 3 additions & 3 deletions chapter_attention-mechanisms/attention-cues.md
@@ -169,15 +169,15 @@ show_heatmaps(attention_weights, xlabel='Keys', ylabel='Queries')
1. 随机生成一个$10 \times 10$矩阵并使用`softmax`运算来确保每行都是有效的概率分布,然后可视化输出注意力权重。

:begin_tab:`mxnet`
-[Discussions](https://discuss.d2l.ai/t/1596)
+[Discussions](https://discuss.d2l.ai/t/5763)
:end_tab:

:begin_tab:`pytorch`
-[Discussions](https://discuss.d2l.ai/t/1592)
+[Discussions](https://discuss.d2l.ai/t/5764)
:end_tab:

:begin_tab:`tensorflow`
-[Discussions](https://discuss.d2l.ai/t/1710)
+[Discussions](https://discuss.d2l.ai/t/5765)
:end_tab:
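
Aside (not from the commit): a minimal PyTorch sketch of the exercise in the hunk above — a random $10 \times 10$ matrix turned into row-wise attention weights with `softmax` and then visualized. Using `d2l.show_heatmaps` and its `(batch, heads, queries, keys)` input shape follows the surrounding chapter code and is an assumption here.

```python
import torch
from d2l import torch as d2l

# softmax over the last axis makes every row a valid probability
# distribution, i.e. a set of attention weights over ten keys.
attention_weights = torch.softmax(torch.rand(10, 10), dim=-1)

# show_heatmaps expects a (batch, heads, no. of queries, no. of keys) tensor.
d2l.show_heatmaps(attention_weights.reshape(1, 1, 10, 10),
                  xlabel='Keys', ylabel='Queries')
```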


86 changes: 43 additions & 43 deletions chapter_attention-mechanisms/attention-scoring-functions.md
@@ -80,8 +80,8 @@ import tensorflow as tf
```{.python .input}
#@save
def masked_softmax(X, valid_lens):
-    """通过在最后一个轴上掩蔽元素来执行 softmax 操作"""
-    # `X`: 3D张量,`valid_lens`: 1D或2D 张量
+    """通过在最后一个轴上掩蔽元素来执行softmax操作"""
+    # X:3D张量,valid_lens:1D或2D张量
if valid_lens is None:
return npx.softmax(X)
else:
@@ -100,8 +100,8 @@ def masked_softmax(X, valid_lens):
#@tab pytorch
#@save
def masked_softmax(X, valid_lens):
-    """通过在最后一个轴上掩蔽元素来执行 softmax 操作"""
-    # `X`: 3D张量,`valid_lens`: 1D或2D 张量
+    """通过在最后一个轴上掩蔽元素来执行softmax操作"""
+    # X:3D张量,valid_lens:1D或2D张量
if valid_lens is None:
return nn.functional.softmax(X, dim=-1)
else:
@@ -120,8 +120,8 @@ def masked_softmax(X, valid_lens):
#@tab tensorflow
#@save
def masked_softmax(X, valid_lens):
-    """通过在最后一个轴上掩蔽元素来执行 softmax 操作"""
-    # `X`: 3D张量,`valid_lens`: 1D或2D 张量
+    """通过在最后一个轴上掩蔽元素来执行softmax操作"""
+    # X:3D张量,valid_lens:1D或2D张量
if valid_lens is None:
return tf.nn.softmax(X, axis=-1)
else:
@@ -201,7 +201,7 @@ class AdditiveAttention(nn.Block):
"""加性注意力"""
def __init__(self, num_hiddens, dropout, **kwargs):
super(AdditiveAttention, self).__init__(**kwargs)
-        # 使用 'flatten=False' 只转换最后一个轴,以便其他轴的形状保持不变
+        # 使用'flatten=False'只转换最后一个轴,以便其他轴的形状保持不变
self.W_k = nn.Dense(num_hiddens, use_bias=False, flatten=False)
self.W_q = nn.Dense(num_hiddens, use_bias=False, flatten=False)
self.w_v = nn.Dense(1, use_bias=False, flatten=False)
@@ -210,17 +210,17 @@ class AdditiveAttention(nn.Block):
def forward(self, queries, keys, values, valid_lens):
queries, keys = self.W_q(queries), self.W_k(keys)
# 在维度扩展后,
-        # `queries` 的形状:(`batch_size`,查询的个数,1,`num_hidden`)
-        # `key` 的形状:(`batch_size`,1,“键-值”对的个数,`num_hiddens`)
+        # queries的形状:(batch_size,查询的个数,1,num_hidden)
+        # key的形状:(batch_size,1,“键-值”对的个数,num_hiddens)
# 使用广播的方式进行求和
features = np.expand_dims(queries, axis=2) + np.expand_dims(
keys, axis=1)
features = np.tanh(features)
-        # `self.w_v` 仅有一个输出,因此从形状中移除最后那个维度。
-        # `scores` 的形状:(`batch_size`,查询的个数,“键-值”对的个数)
+        # self.w_v仅有一个输出,因此从形状中移除最后那个维度。
+        # scores的形状:(batch_size,查询的个数,“键-值”对的个数)
scores = np.squeeze(self.w_v(features), axis=-1)
self.attention_weights = masked_softmax(scores, valid_lens)
-        # `values` 的形状:(`batch_size`,“键-值”对的个数,值的维度)
+        # values的形状:(batch_size,“键-值”对的个数,值的维度)
return npx.batch_dot(self.dropout(self.attention_weights), values)
```

@@ -239,24 +239,24 @@ class AdditiveAttention(nn.Module):
def forward(self, queries, keys, values, valid_lens):
queries, keys = self.W_q(queries), self.W_k(keys)
# 在维度扩展后,
-        # `queries` 的形状:(`batch_size`,查询的个数,1,`num_hidden`)
-        # `key` 的形状:(`batch_size`,1,“键-值”对的个数,`num_hiddens`)
+        # queries的形状:(batch_size,查询的个数,1,num_hidden)
+        # key的形状:(batch_size,1,“键-值”对的个数,num_hiddens)
# 使用广播方式进行求和
features = queries.unsqueeze(2) + keys.unsqueeze(1)
features = torch.tanh(features)
-        # `self.w_v` 仅有一个输出,因此从形状中移除最后那个维度。
-        # `scores` 的形状:(`batch_size`,查询的个数,“键-值”对的个数)
+        # self.w_v仅有一个输出,因此从形状中移除最后那个维度。
+        # scores的形状:(batch_size,查询的个数,“键-值”对的个数)
scores = self.w_v(features).squeeze(-1)
self.attention_weights = masked_softmax(scores, valid_lens)
-        # `values` 的形状:(`batch_size`,“键-值”对的个数,值的维度)
+        # values的形状:(batch_size,“键-值”对的个数,值的维度)
return torch.bmm(self.dropout(self.attention_weights), values)
```

```{.python .input}
#@tab tensorflow
#@save
class AdditiveAttention(tf.keras.layers.Layer):
-    """Additive attention."""
+    """Additiveattention."""
def __init__(self, key_size, query_size, num_hiddens, dropout, **kwargs):
super().__init__(**kwargs)
self.W_k = tf.keras.layers.Dense(num_hiddens, use_bias=False)
@@ -267,17 +267,17 @@ class AdditiveAttention(tf.keras.layers.Layer):
def call(self, queries, keys, values, valid_lens, **kwargs):
queries, keys = self.W_q(queries), self.W_k(keys)
# 在维度扩展后,
-        # `queries` 的形状:(`batch_size`,查询的个数,1,`num_hidden`)
-        # `key` 的形状:(`batch_size`,1,“键-值”对的个数,`num_hiddens`)
+        # queries的形状:(batch_size,查询的个数,1,num_hidden)
+        # key的形状:(batch_size,1,“键-值”对的个数,num_hiddens)
# 使用广播方式进行求和
features = tf.expand_dims(queries, axis=2) + tf.expand_dims(
keys, axis=1)
features = tf.nn.tanh(features)
-        # `self.w_v` 仅有一个输出,因此从形状中移除最后那个维度。
-        # `scores` 的形状:(`batch_size`,查询的个数,“键-值”对的个数)
+        # self.w_v仅有一个输出,因此从形状中移除最后那个维度。
+        # scores的形状:(batch_size,查询的个数,“键-值”对的个数)
scores = tf.squeeze(self.w_v(features), axis=-1)
self.attention_weights = masked_softmax(scores, valid_lens)
-        # `values` 的形状:(`batch_size`,“键-值”对的个数,值的维度)
+        # values的形状:(batch_size,“键-值”对的个数,值的维度)
return tf.matmul(self.dropout(
self.attention_weights, **kwargs), values)
```
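
Aside (not from the commit): the `masked_softmax` hunks earlier in this file are truncated in this view, so for reference, a minimal sketch of what the masking does — assuming the `#@save`d PyTorch version above is available from the `d2l` package:

```python
import torch
from d2l import torch as d2l  # assumes masked_softmax above is #@save'd into d2l

# Two minibatches of 2x4 scores; keep the first 2 and first 3 positions.
X = torch.rand(2, 2, 4)
print(d2l.masked_softmax(X, torch.tensor([2, 3])))
# Entries beyond each valid length receive zero attention weight,
# while every row still sums to one.
```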
@@ -289,7 +289,7 @@ class AdditiveAttention(tf.keras.layers.Layer):

```{.python .input}
queries, keys = d2l.normal(0, 1, (2, 1, 20)), d2l.ones((2, 10, 2))
-# `values` 的小批量数据集中,两个值矩阵是相同的
+# values的小批量数据集中,两个值矩阵是相同的
values = np.arange(40).reshape(1, 10, 4).repeat(2, axis=0)
valid_lens = d2l.tensor([2, 6])
@@ -301,7 +301,7 @@ attention(queries, keys, values, valid_lens)
```{.python .input}
#@tab pytorch
queries, keys = d2l.normal(0, 1, (2, 1, 20)), d2l.ones((2, 10, 2))
-# `values` 的小批量,两个值矩阵是相同的
+# values的小批量,两个值矩阵是相同的
values = torch.arange(40, dtype=torch.float32).reshape(1, 10, 4).repeat(
2, 1, 1)
valid_lens = d2l.tensor([2, 6])
@@ -315,7 +315,7 @@ attention(queries, keys, values, valid_lens)
```{.python .input}
#@tab tensorflow
queries, keys = tf.random.normal(shape=(2, 1, 20)), tf.ones((2, 10, 2))
-# `values` 的小批量,两个值矩阵是相同的
+# values的小批量,两个值矩阵是相同的
values = tf.repeat(tf.reshape(
tf.range(40, dtype=tf.float32), shape=(1, 10, 4)), repeats=2, axis=0)
valid_lens = tf.constant([2, 6])
@@ -369,13 +369,13 @@ class DotProductAttention(nn.Block):
super(DotProductAttention, self).__init__(**kwargs)
self.dropout = nn.Dropout(dropout)
-    # `queries` 的形状:(`batch_size`,查询的个数,`d`)
-    # `keys` 的形状:(`batch_size`,“键-值”对的个数,`d`)
-    # `values` 的形状:(`batch_size`,“键-值”对的个数,值的维度)
-    # `valid_lens` 的形状: (`batch_size`,) 或者 (`batch_size`,查询的个数)
+    # queries的形状:(batch_size,查询的个数,d)
+    # keys的形状:(batch_size,“键-值”对的个数,d)
+    # values的形状:(batch_size,“键-值”对的个数,值的维度)
+    # valid_lens的形状:(batch_size,)或者(batch_size,查询的个数)
def forward(self, queries, keys, values, valid_lens=None):
d = queries.shape[-1]
-        # 设置 `transpose_b=True` 为了交换 `keys` 的最后两个维度
+        # 设置transpose_b=True为了交换keys的最后两个维度
scores = npx.batch_dot(queries, keys, transpose_b=True) / math.sqrt(d)
self.attention_weights = masked_softmax(scores, valid_lens)
return npx.batch_dot(self.dropout(self.attention_weights), values)
@@ -390,13 +390,13 @@ class DotProductAttention(nn.Module):
super(DotProductAttention, self).__init__(**kwargs)
self.dropout = nn.Dropout(dropout)
-    # `queries` 的形状:(`batch_size`,查询的个数,`d`)
-    # `keys` 的形状:(`batch_size`,“键-值”对的个数,`d`)
-    # `values` 的形状:(`batch_size`,“键-值”对的个数,值的维度)
-    # `valid_lens` 的形状: (`batch_size`,) 或者 (`batch_size`,查询的个数)
+    # queries的形状:(batch_size,查询的个数,d)
+    # keys的形状:(batch_size,“键-值”对的个数,d)
+    # values的形状:(batch_size,“键-值”对的个数,值的维度)
+    # valid_lens的形状:(batch_size,)或者(batch_size,查询的个数)
def forward(self, queries, keys, values, valid_lens=None):
d = queries.shape[-1]
-        # 设置 `transpose_b=True` 为了交换 `keys` 的最后两个维度
+        # 设置transpose_b=True为了交换keys的最后两个维度
scores = torch.bmm(queries, keys.transpose(1,2)) / math.sqrt(d)
self.attention_weights = masked_softmax(scores, valid_lens)
return torch.bmm(self.dropout(self.attention_weights), values)
@@ -406,15 +406,15 @@ class DotProductAttention(tf.keras.layers.Layer):
#@tab tensorflow
#@save
class DotProductAttention(tf.keras.layers.Layer):
-    """Scaled dot product attention."""
+    """Scaleddotproductattention."""
def __init__(self, dropout, **kwargs):
super().__init__(**kwargs)
self.dropout = tf.keras.layers.Dropout(dropout)
-    # `queries` 的形状:(`batch_size`,查询的个数,`d`)
-    # `keys` 的形状:(`batch_size`,“键-值”对的个数,`d`)
-    # `values` 的形状:(`batch_size`,“键-值”对的个数,值的维度)
-    # `valid_lens` 的形状: (`batch_size`,) 或者 (`batch_size`,查询的个数)
+    # queries的形状:(batch_size,查询的个数,d)
+    # keys的形状:(batch_size,“键-值”对的个数,d)
+    # values的形状:(batch_size,“键-值”对的个数,值的维度)
+    # valid_lens的形状:(batch_size,)或者(batch_size,查询的个数)
def call(self, queries, keys, values, valid_lens, **kwargs):
d = queries.shape[-1]
scores = tf.matmul(queries, keys, transpose_b=True)/tf.math.sqrt(
@@ -470,9 +470,9 @@ d2l.show_heatmaps(d2l.reshape(attention.attention_weights, (1, 1, 2, 10)),
1. 当查询和键具有相同的矢量长度时,矢量求和作为评分函数是否比“点-积”更好?为什么?

:begin_tab:`mxnet`
-[Discussions](https://discuss.d2l.ai/t/346)
+[Discussions](https://discuss.d2l.ai/t/5751)
:end_tab:

:begin_tab:`pytorch`
-[Discussions](https://discuss.d2l.ai/t/1064)
+[Discussions](https://discuss.d2l.ai/t/5752)
:end_tab:
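
Aside (not from the commit): for reference, a minimal sketch that spells out the scaled dot-product computation $\mathrm{softmax}(\mathbf{Q}\mathbf{K}^\top/\sqrt{d})\,\mathbf{V}$ implemented by the `DotProductAttention` classes in the hunks above, without the masking and dropout; the shapes follow the comments in those hunks, and plain PyTorch is an assumption.

```python
import math
import torch

# queries: (batch_size, no. of queries, d); keys and values share the
# key-value pair axis, as in the shape comments above.
queries = torch.normal(0, 1, (2, 1, 2))
keys = torch.ones((2, 10, 2))
values = torch.arange(40, dtype=torch.float32).reshape(1, 10, 4).repeat(2, 1, 1)

d = queries.shape[-1]
scores = torch.bmm(queries, keys.transpose(1, 2)) / math.sqrt(d)  # (2, 1, 10)
attention_weights = torch.softmax(scores, dim=-1)  # no valid_lens masking here
output = torch.bmm(attention_weights, values)      # (2, 1, 4)
print(output.shape)
```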
