Merge pull request #79 from AnFreTh/main
Release v0.1.4
AnFreTh authored Aug 9, 2024
2 parents 3e94e5d + a04931d commit 39fb5d7
Showing 36 changed files with 859 additions and 766 deletions.
9 changes: 8 additions & 1 deletion .gitignore
@@ -183,4 +183,11 @@ post-checkout
 post-commit
 post-merge
 pre-push
-docs/notebooks/lightning_logs/*
+docs/notebooks/lightning_logs/*
+docs/notebooks/lightning_logs
+docs/notebooks/data
+docs/notebooks/data/*
+docs/notebooks/embeddings
+docs/notebooks/embeddings/*
+docs/notebooks/checkpoints
+docs/notebooks/checkpoints/*
118 changes: 49 additions & 69 deletions docs/notebooks/datasets.ipynb
@@ -4,8 +4,8 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/develop/docs/notebooks/datasets.ipynb)\n",
-"[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/develop/docs/notebooks/datasets.ipynb)\n",
+"[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AnFreTh/STREAM/blob/main/docs/notebooks/datasets.ipynb)\n",
+"[![Open On GitHub](https://img.shields.io/badge/Open-on%20GitHub-blue?logo=GitHub)](https://github.com/AnFreTh/STREAM/blob/main/docs/notebooks/datasets.ipynb)\n",
 "\n",
 "# Datasets"
 ]
@@ -33,13 +33,25 @@
 "cell_type": "code",
 "execution_count": 1,
 "metadata": {},
+"outputs": [],
+"source": [
+"# uncomment the below line if running in Colab\n",
+"# package needs to be installed for the notebook to run\n",
+"\n",
+"# ! pip install -U stream_topic"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 2,
+"metadata": {},
 "outputs": [
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"/opt/homebrew/Caskroom/miniforge/base/envs/db/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
-" warnings.warn(\n"
+"/opt/homebrew/Caskroom/miniforge/base/envs/topicm/lib/python3.10/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
+" from tqdm.autonotebook import tqdm, trange\n"
 ]
 }
 ],
@@ -60,37 +72,6 @@
 "- these datasets are included in the package and can be loaded using the `TMDataset` module"
 ]
 },
-{
-"cell_type": "code",
-"execution_count": 2,
-"metadata": {},
-"outputs": [
-{
-"data": {
-"text/plain": [
-"['Stocktwits_GME_large',\n",
-" 'BBC_News',\n",
-" 'Stocktwits_GME',\n",
-" 'Reddit_GME',\n",
-" 'Reuters',\n",
-" 'Spotify',\n",
-" '20NewsGroups',\n",
-" 'DummyDataset',\n",
-" 'Spotify_most_popular',\n",
-" 'Poliblogs',\n",
-" 'Spotify_least_popular']"
-]
-},
-"execution_count": 2,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"dataset = TMDataset()\n",
-"dataset.get_dataset_list()"
-]
-},
 {
 "cell_type": "code",
 "execution_count": 3,
@@ -100,12 +81,16 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"\u001b[32m2024-08-07 10:31:30.489\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m159\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n",
-"\u001b[32m2024-08-07 10:31:31.978\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mDataset loaded successfully from /opt/homebrew/Caskroom/miniforge/base/envs/db/lib/python3.10/site-packages/stream_topic/preprocessed_datasets/Reuters\u001b[0m\n"
+"\u001b[32m2024-08-09 12:13:26.847\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: Reuters\u001b[0m\n",
+"\u001b[32m2024-08-09 12:13:26.914\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
+"\u001b[32m2024-08-09 12:13:27.147\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+"\u001b[32m2024-08-09 12:13:27.313\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
+"\u001b[32m2024-08-09 12:13:27.456\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n"
 ]
 }
 ],
 "source": [
+"dataset = TMDataset()\n",
 "dataset.fetch_dataset(name=\"Reuters\")"
 ]
 },
@@ -181,10 +166,13 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"\u001b[32m2024-08-07 10:31:33.085\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m151\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n",
-"\u001b[32m2024-08-07 10:31:33.086\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m153\u001b[0m - \u001b[1mOverwriting the dataset name with the provided name in fetch_dataset: Spotify\u001b[0m\n",
-"\u001b[32m2024-08-07 10:31:33.086\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m156\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n",
-"\u001b[32m2024-08-07 10:31:33.190\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mDataset loaded successfully from /opt/homebrew/Caskroom/miniforge/base/envs/db/lib/python3.10/site-packages/stream_topic/preprocessed_datasets/Spotify\u001b[0m\n"
+"\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m108\u001b[0m - \u001b[1mDataset name already provided while instantiating the class: Reuters\u001b[0m\n",
+"\u001b[32m2024-08-09 12:13:28.464\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m111\u001b[0m - \u001b[1mOverwriting the dataset name with the name provided in fetch_dataset: Spotify\u001b[0m\n",
+"\u001b[32m2024-08-09 12:13:28.465\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m115\u001b[0m - \u001b[1mFetching dataset: Spotify\u001b[0m\n",
+"\u001b[32m2024-08-09 12:13:28.539\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m331\u001b[0m - \u001b[1mDownloading dataset from github\u001b[0m\n",
+"\u001b[32m2024-08-09 12:13:28.749\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m333\u001b[0m - \u001b[1mDataset downloaded successfully at ~/stream_topic_data/\u001b[0m\n",
+"\u001b[32m2024-08-09 12:13:28.923\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m361\u001b[0m - \u001b[1mDownloading dataset info from github\u001b[0m\n",
+"\u001b[32m2024-08-09 12:13:29.058\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.data_downloader\u001b[0m:\u001b[36mload_custom_dataset_from_url\u001b[0m:\u001b[36m363\u001b[0m - \u001b[1mDataset info downloaded successfully at ~/stream_topic_data/\u001b[0m\n"
 ]
 }
 ],
@@ -451,7 +439,7 @@
 {
 "data": {
 "text/plain": [
-"[75, 58, 37, 45, 41]"
+"[75, 58]"
 ]
 },
 "execution_count": 11,
@@ -460,7 +448,7 @@
 }
 ],
 "source": [
-"dataset.labels[:5]"
+"dataset.labels[:2]"
 ]
 },
 {
@@ -475,18 +463,6 @@
 "execution_count": 12,
 "metadata": {},
 "outputs": [],
-"source": [
-"from stream_topic.utils import TMDataset\n",
-"\n",
-"import warnings\n",
-"warnings.filterwarnings(\"ignore\")"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 13,
-"metadata": {},
-"outputs": [],
 "source": [
 "import pandas as pd\n",
 "import numpy as np\n",
@@ -508,19 +484,16 @@
 },
 {
 "cell_type": "code",
-"execution_count": 14,
+"execution_count": 13,
 "metadata": {},
 "outputs": [
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 267.71it/s]\n",
-"\u001b[32m2024-08-07 10:31:37.027\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m409\u001b[0m - \u001b[1mDataset save directory does not exist: data/\u001b[0m\n",
-"\u001b[32m2024-08-07 10:31:37.027\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m410\u001b[0m - \u001b[1mCreating directory: data/\u001b[0m\n",
-"\u001b[32m2024-08-07 10:31:37.031\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m415\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n",
-"\u001b[32m2024-08-07 10:31:37.032\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m430\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n",
-"\u001b[32m2024-08-07 10:31:37.032\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m433\u001b[0m - \u001b[1mDataset name appended to avaliable datasets list: ['Stocktwits_GME_large', 'BBC_News', 'Stocktwits_GME', 'Reddit_GME', 'Reuters', 'Spotify', '20NewsGroups', 'DummyDataset', 'Spotify_most_popular', 'Poliblogs', 'Spotify_least_popular', 'sample_data']\u001b[0m\n"
+"Preprocessing documents: 100%|██████████| 1000/1000 [00:03<00:00, 263.32it/s]\n",
+"\u001b[32m2024-08-09 12:13:32.967\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m237\u001b[0m - \u001b[1mDataset saved to data/sample_data.parquet\u001b[0m\n",
+"\u001b[32m2024-08-09 12:13:32.968\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mcreate_load_save_dataset\u001b[0m:\u001b[36m252\u001b[0m - \u001b[1mDataset info saved to data/sample_data_info.pkl\u001b[0m\n"
 ]
 }
 ],
@@ -537,27 +510,27 @@
 },
 {
 "cell_type": "code",
-"execution_count": 15,
+"execution_count": 14,
 "metadata": {},
 "outputs": [
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"\u001b[32m2024-08-07 10:31:37.036\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m159\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n",
-"\u001b[32m2024-08-07 10:31:37.045\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m165\u001b[0m - \u001b[1mDataset loaded successfully from data/\u001b[0m\n"
+"\u001b[32m2024-08-09 12:13:32.972\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m118\u001b[0m - \u001b[1mFetching dataset: sample_data\u001b[0m\n",
+"\u001b[32m2024-08-09 12:13:32.973\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mstream_topic.utils.dataset\u001b[0m:\u001b[36mfetch_dataset\u001b[0m:\u001b[36m128\u001b[0m - \u001b[1mFetching dataset from local path\u001b[0m\n"
 ]
 }
 ],
 "source": [
 "# the new data is saved in the data folder unlike the default datasets which are saved in package directory under preprocessed_data folder.\n",
 "# therefore, you need to provide the path to the data folder to fetch the dataset\n",
-"dataset.fetch_dataset(name=\"sample_data\", dataset_path=\"data/\")"
+"dataset.fetch_dataset(name=\"sample_data\", dataset_path=\"data/\", source=\"local\")"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 16,
+"execution_count": 15,
 "metadata": {},
 "outputs": [
 {
@@ -630,7 +603,7 @@
 "4 BGHXO 3 [BGHXO]"
 ]
 },
-"execution_count": 16,
+"execution_count": 15,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -639,6 +612,13 @@
 "dataset.dataframe.head()"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": []
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -663,7 +643,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.0"
+"version": "3.10.14"
 }
 },
 "nbformat": 4,
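
For readers who want the takeaway without wading through the notebook JSON, the updated datasets.ipynb boils down to the flow sketched below. This is a non-authoritative reconstruction assembled only from the cells visible in this diff; the comments on caching behaviour paraphrase the new data_downloader log lines, and nothing beyond the calls shown above is assumed about the API.

# Sketch of the dataset workflow demonstrated by the updated notebook
# (reconstructed from the diff above, not from library documentation).

# uncomment if running in Colab; the package needs to be installed first
# ! pip install -U stream_topic

from stream_topic.utils import TMDataset

dataset = TMDataset()

# As of this release, bundled datasets are downloaded from GitHub and
# cached under ~/stream_topic_data/ rather than read from the package's
# preprocessed_datasets directory.
dataset.fetch_dataset(name="Reuters")

# Fetching again with another name overwrites the dataset on the instance.
dataset.fetch_dataset(name="Spotify")

print(dataset.labels[:2])        # e.g. [75, 58] in the notebook run above
print(dataset.dataframe.head())  # first rows of the loaded dataframe

# A dataset saved locally via create_load_save_dataset (which writes
# data/sample_data.parquet and data/sample_data_info.pkl) is loaded with
# the new source="local" argument introduced in this release:
dataset.fetch_dataset(name="sample_data", dataset_path="data/", source="local")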
139 changes: 0 additions & 139 deletions docs/notebooks/datasets.md

This file was deleted.
