ithaka · czcindy426 · Dec 16, 2024 · Dec 16, 2024 · Jan 10, 2025 · Jan 13, 2025
diff --git a/Exploring-metadata/exploring-metadata.ipynb b/Exploring-metadata/exploring-metadata.ipynb
@@ -101,30 +101,28 @@
     "* Sampled dataset metadata files (1500 items) are downloaded using the `.get_metadata()` method. They are built automatically when a dataset is created."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2024-12-16T15:03:13.423058Z",
+     "iopub.status.busy": "2024-12-16T15:03:13.422773Z",
+     "iopub.status.idle": "2024-12-16T15:03:13.428440Z",
+     "shell.execute_reply": "2024-12-16T15:03:13.427831Z",
+     "shell.execute_reply.started": "2024-12-16T15:03:13.423039Z"
+    }
+   },
+   "source": [
+    "<h3 style=\"color:red; display:inline\">Note! The following code cell assumes that you have downloaded the metadata CSV file to the current working directory.</h3>"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataset_id = \"7e41317e-740f-e86a-4729-20dab492e925\"\n",
-    "\n",
-    "# Check if a data folder exists. If not, create it.\n",
-    "data_folder = Path('./data/')\n",
-    "data_folder.mkdir(exist_ok=True)\n",
-    "\n",
-    "# Check to see if a dataset file exists\n",
-    "# If not, download a dataset using the Constellate Client\n",
-    "# The default dataset is Shakespeare Quarterly, 1950-present\n",
-    "dataset_metadata = Path.cwd() / 'data' / 'my_metadata.csv' # Make sure this filepath matches your dataset metadata filename\n",
-    "\n",
-    "if dataset_metadata.exists() == False:\n",
-    "    try: \n",
-    "        dataset_metadata = constellate.download(dataset_id, 'metadata')\n",
-    "        print(f'Full dataset metadata ready')\n",
-    "    except:\n",
-    "        dataset_metadata = constellate.get_metadata(dataset_id)\n",
-    "        print(f'Sampled dataset metadata ready')"
+    "dataset_metadata = '' # copy and paste the path to your metadata csv file here"
    ]
   },
   {
@@ -489,23 +487,31 @@
     "* [Constellate client](https://constellate.org/docs/constellate-client)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h3 style=\"color:red; display:inline\">Note! The following code cell assumes that you have downloaded the gzip-compressed JSONL file (.jsonl.gz) containing metadata, ngrams and full texts to the current working directory.</h3>"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Check if a data folder exists. If not, create it.\n",
-    "data_folder = Path('./data/')\n",
-    "data_folder.mkdir(exist_ok=True)\n",
-    "\n",
-    "# Try to download the full dataset (.jsonl)\n",
-    "# Otherwise, download the sampled dataset (.jsonl)\n",
+    "# Path to the gzip-compressed JSONL file in the current directory\n",
+    "file_path = '' # copy and paste the path to the gzip-compressed JSONL file (e.g. a .jsonl.gz file) \n",
     "\n",
-    "try: \n",
-    "    dataset_file = constellate.download(dataset_id, 'jsonl')\n",
-    "except: \n",
-    "    dataset_file = constellate.get_dataset(dataset_id)"
+    "# Generator function that reads a gzip-compressed JSONL file one line at a time\n",
+    "def dataset_reader(file_path):\n",
+    "    \"\"\"\n",
+    "    Open `file_path` with gzip and lazily yield each line parsed by\n",
+    "    `json.loads` (presumably one dictionary per document).\n",
+    "    \"\"\"\n",
+    "    with gzip.open(file_path, \"rb\") as input_file:\n",
+    "        for row in input_file:\n",
+    "            yield json.loads(row)"
    ]
   },
   {
@@ -515,32 +521,6 @@
     "Next, we will create an empty JSON-L file to filter our results into. We will also check if we want to overwrite the file, if it already exists."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "### Create an empty JSON-L file path ###\n",
-    "\n",
-    "# Check if a data folder exists. If not, create it.\n",
-    "data_folder = Path('./data/')\n",
-    "data_folder.mkdir(exist_ok=True)\n",
-    "\n",
-    "# Define the file output name\n",
-    "file_path = Path.cwd() / 'data' / 'my_data.jsonl' # You may change the name of the file here\n",
-    "\n",
-    "# Delete output files if they already exist\n",
-    "if file_path.exists():\n",
-    "    overwrite = input(f'Overwrite {file_path}? (yes/no)')\n",
-    "    if overwrite.lower() == 'yes':\n",
-    "        print(f'Overwriting older version of {file_path}')\n",
-    "        file_path.unlink()\n",
-    "        file_path.touch()"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -556,12 +536,16 @@
    },
    "outputs": [],
    "source": [
-    "# Append all documents with ids in `filtered_id_list`\n",
-    "for document in constellate.dataset_reader(dataset_file):\n",
-    "    document_id = document['id']\n",
-    "    # Append any documents in the filtered list\n",
-    "    if document_id in filtered_id_list:\n",
-    "        with file_path.open('a') as outfile:\n",
-    "            json.dump(document, outfile)\n",
-    "            outfile.write('\\n')\n",
-    "print(f'{file_path} created.')"
+    "# Write all documents with ids in `filtered_id_list` to a new JSONL file\n",
+    "\n",
+    "# Define the filtered file output name\n",
+    "new_file_path = Path.cwd() / 'my_filtered_data.jsonl' # You may change the name of the file here\n",
+    "\n",
+    "# Open the output once in write mode so re-running this cell replaces the file instead of appending duplicates\n",
+    "with new_file_path.open('w') as outfile:\n",
+    "    for document in dataset_reader(file_path):\n",
+    "        # Keep only the documents in the filtered list\n",
+    "        if document['id'] in filtered_id_list:\n",
+    "            json.dump(document, outfile)\n",
+    "            outfile.write('\\n')\n",
+    "print(f'{new_file_path} created.')"
@@ -584,12 +568,12 @@
    "source": [
     "# Compress the file using gzip\n",
     "# This may take several minutes to complete\n",
-    "f_in = file_path.open('rb')\n",
-    "f_out = gzip.open(f'{file_path}.gz', 'wb')\n",
+    "f_in = new_file_path.open('rb')\n",
+    "f_out = gzip.open(f'{new_file_path}.gz', 'wb')\n",
     "f_out.writelines(f_in)\n",
     "f_out.close()\n",
     "f_in.close()\n",
-    "print(f'Compression complete. \\n{file_path}.gz has been created.')"
+    "print(f'Compression complete. \\n{new_file_path}.gz has been created.')"
    ]
   },
   {
@@ -652,13 +636,6 @@
     "\n",
     "df.groupby(['publicationYear'])['pageCount'].agg('sum').plot.bar(title='Pages by decade', figsize=(20, 5), fontsize=12);"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {