Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test constellate client removal on one notebook #31

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 44 additions & 67 deletions Exploring-metadata/exploring-metadata.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -101,30 +101,28 @@
"* Sampled dataset metadata files (1500 items) are downloaded using the `.get_metadata()` method. They are built automatically when a dataset is created."
]
},
{
"cell_type": "markdown",
"metadata": {
"execution": {
"iopub.execute_input": "2024-12-16T15:03:13.423058Z",
"iopub.status.busy": "2024-12-16T15:03:13.422773Z",
"iopub.status.idle": "2024-12-16T15:03:13.428440Z",
"shell.execute_reply": "2024-12-16T15:03:13.427831Z",
"shell.execute_reply.started": "2024-12-16T15:03:13.423039Z"
}
},
"source": [
"<h3 style=\"color:red; display:inline\">Note! The following code cell assumes that you have downloaded the metadata CSV file to the current working directory.</h3>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_id = \"7e41317e-740f-e86a-4729-20dab492e925\"\n",
"\n",
"# Check if a data folder exists. If not, create it.\n",
"data_folder = Path('./data/')\n",
"data_folder.mkdir(exist_ok=True)\n",
"\n",
"# Check to see if a dataset file exists\n",
"# If not, download a dataset using the Constellate Client\n",
"# The default dataset is Shakespeare Quarterly, 1950-present\n",
"dataset_metadata = Path.cwd() / 'data' / 'my_metadata.csv' # Make sure this filepath matches your dataset metadata filename\n",
"\n",
"if dataset_metadata.exists() == False:\n",
" try: \n",
" dataset_metadata = constellate.download(dataset_id, 'metadata')\n",
" print(f'Full dataset metadata ready')\n",
" except:\n",
" dataset_metadata = constellate.get_metadata(dataset_id)\n",
" print(f'Sampled dataset metadata ready')"
"dataset_metadata = '' # copy and paste the path to your metadata csv file here"
]
},
{
Expand Down Expand Up @@ -489,23 +487,31 @@
"* [Constellate client](https://constellate.org/docs/constellate-client)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3 style=\"color:red; display:inline\">Note! The following code cell assumes that you have downloaded the JSONL file containing metadata, ngrams and full texts to the current working directory.</h3>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check if a data folder exists. If not, create it.\n",
"data_folder = Path('./data/')\n",
"data_folder.mkdir(exist_ok=True)\n",
"\n",
"# Try to download the full dataset (.jsonl)\n",
"# Otherwise, download the sampled dataset (.jsonl)\n",
"# path to the jsonl file in the current directory\n",
"file_path = '' # copy and paste the path to the JSONL file \n",
"\n",
"try: \n",
" dataset_file = constellate.download(dataset_id, 'jsonl')\n",
"except: \n",
" dataset_file = constellate.get_dataset(dataset_id)"
"# function that reads a jsonl file into a generator\n",
"def dataset_reader(file_path):\n",
" \"\"\"\n",
" Helper to read in gzip files and yield Python dictionary\n",
" documents.\n",
" \"\"\"\n",
" with gzip.open(file_path, \"rb\") as input_file:\n",
" for row in input_file:\n",
" yield json.loads(row)"
]
},
{
Expand All @@ -515,32 +521,6 @@
"Next, we will create an empty JSON-L file to filter our results into. We will also check if we want to overwrite the file, if it already exists."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"### Create an empty JSON-L file path ###\n",
"\n",
"# Check if a data folder exists. If not, create it.\n",
"data_folder = Path('./data/')\n",
"data_folder.mkdir(exist_ok=True)\n",
"\n",
"# Define the file output name\n",
"file_path = Path.cwd() / 'data' / 'my_data.jsonl' # You may change the name of the file here\n",
"\n",
"# Delete output files if they already exist\n",
"if file_path.exists():\n",
" overwrite = input(f'Overwrite {file_path}? (yes/no)')\n",
" if overwrite.lower() == 'yes':\n",
" print(f'Overwriting older version of {file_path}')\n",
" file_path.unlink()\n",
" file_path.touch()"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -556,12 +536,16 @@
},
"outputs": [],
"source": [
"# Append all documents with ids in `filtered_id_list`\n",
"for document in constellate.dataset_reader(dataset_file):\n",
"# Append all documents with ids in `filtered_id_list` to a new JSONL file\n",
"\n",
"# Define the filtered file output name\n",
"new_file_path = Path.cwd() / 'my_filtered_data.jsonl' # You may change the name of the file here\n",
"\n",
"for document in dataset_reader(file_path): \n",
" document_id = document['id']\n",
" # Append any documents in the filtered list\n",
" if document_id in filtered_id_list:\n",
" with file_path.open('a') as outfile:\n",
" with new_file_path.open('a') as outfile:\n",
" json.dump(document, outfile)\n",
" outfile.write('\\n')\n",
"print(f'{new_file_path} created.')"
Expand All @@ -584,12 +568,12 @@
"source": [
"# Compress the file using gzip\n",
"# This may take several minutes to complete\n",
"f_in = file_path.open('rb')\n",
"f_out = gzip.open(f'{file_path}.gz', 'wb')\n",
"f_in = new_file_path.open('rb')\n",
"f_out = gzip.open(f'{new_file_path}.gz', 'wb')\n",
"f_out.writelines(f_in)\n",
"f_out.close()\n",
"f_in.close()\n",
"print(f'Compression complete. \\n{file_path}.gz has been created.')"
"print(f'Compression complete. \\n{new_file_path}.gz has been created.')"
]
},
{
Expand Down Expand Up @@ -652,13 +636,6 @@
"\n",
"df.groupby(['publicationYear'])['pageCount'].agg('sum').plot.bar(title='Pages by decade', figsize=(20, 5), fontsize=12);"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down