diff --git a/.eslintrc.json b/.eslintrc.json
index 85107470a..4ca16399d 100644
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -22,6 +22,7 @@
"requestAnimationFrame": "readonly",
"React": "readonly",
"Block": "readonly",
+ "classifai_term_cleanup_params": "readonly",
"classifAISettings": "readonly"
},
"rules": {
diff --git a/README.md b/README.md
index 882281d54..acf3e66e7 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,7 @@ Tap into leading cloud-based services like [OpenAI](https://openai.com/), [Micro
* Convert text content into audio and output a "read-to-me" feature on the front-end to play this audio using [Microsoft Azure's Text to Speech API](https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/text-to-speech), [Amazon Polly](https://aws.amazon.com/polly/) or [OpenAI's Text to Speech API](https://platform.openai.com/docs/guides/text-to-speech)
* Classify post content using [IBM Watson's Natural Language Understanding API](https://www.ibm.com/watson/services/natural-language-understanding/), [OpenAI's Embedding API](https://platform.openai.com/docs/guides/embeddings) or [Microsoft Azure's OpenAI service](https://azure.microsoft.com/en-us/products/ai-services/openai-service)
* Create a smart 404 page that has a recommended results section that suggests relevant content to the user based on the page URL they were trying to access using either [OpenAI's Embedding API](https://platform.openai.com/docs/guides/embeddings) or [Microsoft Azure's OpenAI service](https://azure.microsoft.com/en-us/products/ai-services/openai-service) in combination with [ElasticPress](https://github.com/10up/ElasticPress)
+* Find similar terms to merge together using either [OpenAI's Embedding API](https://platform.openai.com/docs/guides/embeddings) or [Microsoft Azure's OpenAI service](https://azure.microsoft.com/en-us/products/ai-services/openai-service) in combination with [ElasticPress](https://github.com/10up/ElasticPress). Note that this only compares top-level terms; if you merge a term that has children, those children become top-level terms, as per default WordPress behavior.
* BETA: Recommend content based on overall site traffic via [Microsoft Azure's AI Personalizer API](https://azure.microsoft.com/en-us/services/cognitive-services/personalizer/) *(note that this service has been [deprecated by Microsoft](https://learn.microsoft.com/en-us/azure/ai-services/personalizer/) and as such, will no longer work. We are looking to replace this with a new provider to maintain the same functionality (see [issue#392](https://github.com/10up/classifai/issues/392))*
* Generate image alt text, image tags, and smartly crop images using [Microsoft Azure's AI Vision API](https://azure.microsoft.com/en-us/services/cognitive-services/computer-vision/)
* Scan images and PDF files for embedded text and save for use in post meta using [Microsoft Azure's AI Vision API](https://azure.microsoft.com/en-us/services/cognitive-services/computer-vision/)
@@ -56,7 +57,8 @@ Tap into leading cloud-based services like [OpenAI](https://openai.com/), [Micro
* To utilize the Azure OpenAI Language Processing functionality, you will need an active [Microsoft Azure](https://signup.azure.com/signup) account and you will need to [apply](https://aka.ms/oai/access) for OpenAI access.
* To utilize the Google Gemini Language Processing functionality, you will need an active [Google Gemini](https://ai.google.dev/tutorials/setup) account.
* To utilize the AWS Language Processing functionality, you will need an active [AWS](https://console.aws.amazon.com/) account.
-* To utilize the Smart 404 feature, you will need to use [ElasticPress](https://github.com/10up/ElasticPress) 5.0.0+ and [Elasticsearch](https://www.elastic.co/elasticsearch) 7.0+.
+* To utilize the Smart 404 feature, you will need an active [OpenAI](https://platform.openai.com/signup) account or [Microsoft Azure](https://signup.azure.com/signup) account with OpenAI access and you will need to use [ElasticPress](https://github.com/10up/ElasticPress) 5.0.0+ and [Elasticsearch](https://www.elastic.co/elasticsearch) 7.0+.
+* To utilize the Term Cleanup feature, you will need an active [OpenAI](https://platform.openai.com/signup) account or [Microsoft Azure](https://signup.azure.com/signup) account with OpenAI access. For better performance, it is also recommended to use [ElasticPress](https://github.com/10up/ElasticPress) 5.0.0+ and [Elasticsearch](https://www.elastic.co/elasticsearch) 7.0+.
## Pricing
@@ -561,6 +563,46 @@ docker run -p 9200:9200 -d --name elasticsearch \
This will download, install and start Elasticsearch v7.9.0 to your local machine. You can then access Elasticsearch at `http://localhost:9200`, which is the same URL you can use to configure ElasticPress with. It is recommended that you change the `Content Items per Index Cycle` setting in ElasticPress to `20` to ensure indexing doesn't timeout. Also be aware of API rate limits on the OpenAI Embeddings API.
+## Set Up the Term Cleanup Feature
+
+### 1. Decide on Provider
+
+* This Feature is powered by either OpenAI or Azure OpenAI.
+* Once you've chosen a Provider, you'll need to create an account and get authentication details.
+ * When setting things up on the Azure side, ensure you choose either the `text-embedding-3-small` or `text-embedding-3-large` model. The Feature will not work with other models.
+
+### 2. Configure Settings under Tools > ClassifAI > Language Processing > Term Cleanup
+
+* Select the proper Provider in the provider dropdown.
+* Enter your authentication details.
+* Configure any other settings as desired.
+
+### 3. ElasticPress configuration
+
+It is recommended to use ElasticPress with this Feature, especially if processing more than 500 terms, as performance will be significantly better. Once the Term Cleanup Feature is configured, you can then proceed to get ElasticPress set up to index the data.
+
+If on a standard WordPress installation:
+
+* Install and activate the [ElasticPress](https://github.com/10up/elasticpress) plugin.
+* Set your Elasticsearch URL in the ElasticPress settings (`ElasticPress > Settings`).
+* Enable the [term index](https://www.elasticpress.io/blog/2023/03/enabling-comments-and-terms-in-elasticpress-5-0/) feature.
+* Go to the `ElasticPress > Sync` settings page and trigger a sync, ensuring this is set to run a sync from scratch. This will send over the new schema to Elasticsearch and index all content, including creating vector embeddings for each term.
+
+If on a WordPress VIP hosted environment:
+
+* [Enable Enterprise Search](https://docs.wpvip.com/enterprise-search/enable/).
+* [Enable the term index](https://docs.wpvip.com/enterprise-search/enable-features/#h-terms). Example command: `vip @example-app.develop -- wp vip-search activate-feature terms`.
+* [Run the VIP-CLI `index` command](https://docs.wpvip.com/enterprise-search/index/). This sends the new schema to Elasticsearch and indexes all content, including creating vector embeddings for each term. Note you may need to use the `--setup` flag to ensure the schema is created correctly.
+
+### 4. Start the Term Cleanup Process
+
+Once configured, the plugin will add a new submenu under the Tools menu called Term Cleanup.
+
+* Go to the Term Cleanup page, click on your desired taxonomy, then click on the "Find similar" button.
+* This initializes a background process that will compare each term to find ones that are similar.
+* Once done, all the results will be displayed.
+* You can then skip or merge the potential duplicate terms from the settings page.
+
## Set Up Image Processing features (via Microsoft Azure)
Note that [Azure AI Vision](https://docs.microsoft.com/en-us/azure/cognitive-services/computer-vision/home#image-requirements) can analyze and crop images that meet the following requirements:
diff --git a/includes/Classifai/Admin/SimilarTermsListTable.php b/includes/Classifai/Admin/SimilarTermsListTable.php
new file mode 100644
index 000000000..57c74eccb
--- /dev/null
+++ b/includes/Classifai/Admin/SimilarTermsListTable.php
@@ -0,0 +1,294 @@
+taxonomy = $taxonomy;
+
+ // Set parent defaults.
+ parent::__construct(
+ array(
+ 'singular' => 'similar_term',
+ 'plural' => 'similar_terms',
+ 'ajax' => false,
+ )
+ );
+ }
+
+ /**
+  * Gets the list of columns.
+  *
+  * The first two column titles are derived from the singular label of the
+  * taxonomy being cleaned up. When that taxonomy is not registered, or has
+  * no singular label, a generic "Term" label is used instead.
+  *
+  * @return string[] Array of column titles keyed by their column name.
+  */
+ public function get_columns() {
+     $label = __( 'Term', 'classifai' );
+     $tax   = get_taxonomy( $this->taxonomy );
+
+     // get_taxonomy() returns false for an unregistered taxonomy, and
+     // get_taxonomy_labels() writes to its argument, so it must not be
+     // called with anything but a taxonomy object.
+     if ( $tax ) {
+         $labels = get_taxonomy_labels( $tax );
+         $label  = $labels->singular_name ?? $label;
+     }
+
+     return array(
+         'term'         => $label,
+         // translators: %s: Singular label of the taxonomy.
+         'similar_term' => sprintf( __( 'Similar %s', 'classifai' ), $label ),
+         'actions'      => __( 'Action', 'classifai' ),
+     );
+ }
+
+ /**
+  * Prepares the list of items for displaying.
+  *
+  * Counts and fetches the terms of the current taxonomy that carry the
+  * `classifai_similar_terms` meta (optionally filtered by the `s` request
+  * parameter), sets up pagination, and expands every fetched term into one
+  * row per similar term. Entries that point at a term which can no longer be
+  * loaded are removed from the stored meta; the meta is deleted entirely once
+  * it has no entries left.
+  */
+ public function prepare_items() {
+     $per_page = $this->get_items_per_page( 'edit_post_per_page' );
+     $columns  = $this->get_columns();
+     $hidden   = array();
+     $sortable = $this->get_sortable_columns();
+     $search   = isset( $_REQUEST['s'] ) ? sanitize_text_field( wp_unslash( $_REQUEST['s'] ) ) : ''; // phpcs:ignore WordPress.Security.NonceVerification.Recommended
+
+     $this->_column_headers = array( $columns, $hidden, $sortable );
+
+     $total = wp_count_terms(
+         [
+             'taxonomy'     => $this->taxonomy,
+             'hide_empty'   => false,
+             'meta_key'     => 'classifai_similar_terms', // phpcs:ignore WordPress.DB.SlowDBQuery.slow_db_query_meta_key
+             'meta_compare' => 'EXISTS',
+             'search'       => $search,
+         ]
+     );
+
+     // wp_count_terms() yields a numeric string on success and a WP_Error on
+     // failure; normalise to an integer so the page maths below cannot fail.
+     $total = is_wp_error( $total ) ? 0 : (int) $total;
+
+     $this->set_pagination_args(
+         array(
+             'total_items' => $total,
+             'per_page'    => $per_page,
+             'total_pages' => ceil( $total / $per_page ),
+         )
+     );
+
+     // Must run after set_pagination_args() so the page number is resolved
+     // against the pagination data set above.
+     $current = $this->get_pagenum();
+     $offset  = ( $current - 1 ) * $per_page;
+
+     $terms = get_terms(
+         [
+             'taxonomy'     => $this->taxonomy,
+             'orderby'      => 'count',
+             'order'        => 'DESC',
+             'hide_empty'   => false,
+             'fields'       => 'ids',
+             'meta_key'     => 'classifai_similar_terms', // phpcs:ignore WordPress.DB.SlowDBQuery.slow_db_query_meta_key
+             'meta_compare' => 'EXISTS',
+             'number'       => $per_page,
+             'offset'       => $offset,
+             'search'       => $search,
+         ]
+     );
+
+     // A failed query is treated as an empty result set rather than iterated.
+     if ( is_wp_error( $terms ) ) {
+         $terms = [];
+     }
+
+     $items = [];
+
+     foreach ( $terms as $term_id ) {
+         $similar_terms = get_term_meta( $term_id, 'classifai_similar_terms', true );
+
+         // Skip empty or malformed meta; only an array of ID => score pairs is usable.
+         if ( empty( $similar_terms ) || ! is_array( $similar_terms ) ) {
+             continue;
+         }
+
+         // Loaded once per term instead of once per similar-term pair.
+         $term    = get_term( $term_id );
+         $changed = false;
+
+         foreach ( $similar_terms as $similar_id => $score ) {
+             $similar_term = get_term( $similar_id );
+
+             if ( $similar_term && ! is_wp_error( $similar_term ) ) {
+                 $items[] = [
+                     'term'         => $term,
+                     'similar_term' => $similar_term,
+                     'score'        => $score,
+                 ];
+                 continue;
+             }
+
+             // The similar term could not be loaded; drop the stale entry.
+             unset( $similar_terms[ $similar_id ] );
+             $changed = true;
+         }
+
+         // Persist the pruned list with a single write per term.
+         if ( empty( $similar_terms ) ) {
+             delete_term_meta( $term_id, 'classifai_similar_terms' );
+         } elseif ( $changed ) {
+             update_term_meta( $term_id, 'classifai_similar_terms', $similar_terms );
+         }
+     }
+
+     $this->items = $items;
+ }
+
+ /**
+ * Builds the text shown for one term of a similar-term pair in the list table.
+ *
+ * The output lists the term's name, ID, slug, usage count and parent name,
+ * plus the similarity as a percentage when a score is supplied. A
+ * nonce-protected `classifai_merge_term` URL is also built, with
+ * `$similar_term` as `from` and `$term` as `to`.
+ *
+ * @param WP_Term $term Term Object. Sent as the `to` argument of the merge URL.
+ * @param WP_Term $similar_term Similar Term Object. Sent as the `from` argument of the merge URL.
+ * @param float $score Similarity score. Optional; a value above 1 is reduced by 1 before display.
+ * @return string
+ */
+ public function generate_term_html( $term, $similar_term, $score = null ) {
+ // Query arguments for the admin-post.php merge request. The current page
+ // number and search string are carried along, presumably so the handler
+ // can return to the same view - TODO confirm against the handler.
+ $args = array(
+ 'action' => 'classifai_merge_term',
+ 'taxonomy' => $this->taxonomy,
+ 'from' => $similar_term->term_id,
+ 'to' => $term->term_id,
+ 'paged' => $this->get_pagenum(),
+ 's' => isset( $_REQUEST['s'] ) ? sanitize_text_field( wp_unslash( $_REQUEST['s'] ) ) : false, // phpcs:ignore WordPress.Security.NonceVerification.Recommended
+ );
+ $merge_url = add_query_arg( $args, wp_nonce_url( admin_url( 'admin-post.php' ), 'classifai_merge_term' ) );
+ // A falsy score (null, 0) becomes an empty string; a score above 1 is shifted down by 1.
+ // NOTE(review): presumably some providers report similarity offset by 1 - confirm.
+ $score = $score ? ( $score > 1 ? $score - 1 : $score ) : '';
+
+ // NOTE(review): the format below never references %7$s, so the merge URL
+ // passed as the seventh argument is not part of the output, and several
+ // literals span two lines. This looks like markup lost from the string
+ // literals - confirm against the upstream file before changing anything.
+ // NOTE(review): get_term( $term->parent ) is dereferenced without a check,
+ // and the 'None' fallback is not translatable - confirm whether intended.
+ return sprintf(
+ // translators: %s: Term name, %d: Term ID.
+ __( '%1$s (ID: %2$s)
', 'classifai' ) .
+ // translators: %s: Term slug.
+ __( 'Slug: %3$s
', 'classifai' ) .
+ // translators: %s: Term count.
+ __( 'Used: %4$s
', 'classifai' ) .
+ // translators: %s: Term parent name.
+ __( 'Parent: %5$s
', 'classifai' ) .
+ // translators: %s: Similarity score.
+ ( $score ? __( 'Similarity: %6$s
', 'classifai' ) : '%6$s' ) .
+ '%8$s',
+ esc_html( $term->name ),
+ '' . esc_html( $term->term_id ) . '',
+ esc_html( $term->slug ),
+ // translators: %d: Term count.
+ '' . esc_html( sprintf( _n( '%d time', '%d times', $term->count, 'classifai' ), $term->count ) ) . '',
+ esc_html( $term->parent > 0 ? get_term( $term->parent )->name : 'None' ),
+ $score ? esc_html( round( $score * 100, 2 ) . '%' ) : '',
+ esc_url( $merge_url ),
+ esc_html__( 'Merge and keep this', 'classifai' )
+ );
+ }
+
+ /**
+  * Renders the "term" column of a row.
+  *
+  * Also stores the ID of the row's term in `$this->last_item_id`.
+  *
+  * @param array $item The current term item.
+  * @return string
+  */
+ public function column_term( $item ) {
+     $this->last_item_id = $item['term']->term_id;
+
+     return $this->generate_term_html( $item['term'], $item['similar_term'] );
+ }
+
+ /**
+  * Renders the "similar term" column of a row.
+  *
+  * The pair is passed in the opposite order to the "term" column, together
+  * with the row's similarity score.
+  *
+  * @param array $item The current term item.
+  * @return string
+  */
+ public function column_similar_term( $item ) {
+     return $this->generate_term_html( $item['similar_term'], $item['term'], $item['score'] );
+ }
+
+ /**
+  * Handles the term actions output.
+  *
+  * Builds a nonce-protected `classifai_skip_similar_term` URL for the row's
+  * term pair and returns it as a "Skip" link.
+  *
+  * @param array $item The current term item.
+  * @return string
+  */
+ public function column_actions( $item ) {
+     $term         = $item['term'];
+     $similar_term = $item['similar_term'];
+
+     $args     = array(
+         'action'       => 'classifai_skip_similar_term',
+         'taxonomy'     => $this->taxonomy,
+         'term'         => $term->term_id,
+         'similar_term' => $similar_term->term_id,
+         'paged'        => $this->get_pagenum(),
+         's'            => isset( $_REQUEST['s'] ) ? sanitize_text_field( wp_unslash( $_REQUEST['s'] ) ) : false, // phpcs:ignore WordPress.Security.NonceVerification.Recommended
+     );
+     $skip_url = add_query_arg( $args, wp_nonce_url( admin_url( 'admin-post.php' ), 'classifai_skip_similar_term' ) );
+
+     // Both arguments have to appear in the format: with a bare "%s" only
+     // the URL was printed and the translated label was silently dropped.
+     return sprintf(
+         '<a href="%1$s" class="button button-secondary">%2$s</a>',
+         esc_url( $skip_url ),
+         esc_html__( 'Skip', 'classifai' )
+     );
+ }
+
+ /**
+  * Generates content for a single row of the table
+  *
+  * Fallback renderer for columns without a dedicated `column_*` method.
+  * Returns the escaped value stored under the column name, or an empty
+  * string when the row has no such key or the value is not a scalar.
+  *
+  * @param array  $item        The current item.
+  * @param string $column_name The current column name.
+  * @return string
+  */
+ protected function column_default( $item, $column_name ) {
+     $value = $item[ $column_name ] ?? '';
+
+     // Rows built by prepare_items() hold term objects under `term` and
+     // `similar_term`; only scalar values can be escaped as text.
+     return is_scalar( $value ) ? esc_html( $value ) : '';
+ }
+
+ /**
+ * Generates custom table navigation to prevent conflicting nonces.
+ *
+ * @param string $which The location of the bulk actions: Either 'top' or 'bottom'.
+ */
+ protected function display_tablenav( $which ) {
+ ?>
+
+ +
++ +
++ + 'classifai_cancel_term_cleanup', + 'taxonomy' => $taxonomy, + ); + $cancel_url = add_query_arg( $args, wp_nonce_url( admin_url( 'admin-post.php' ), 'classifai_cancel_term_cleanup' ) ); + $label = strtolower( $this->get_taxonomy_label( $taxonomy, true ) ); + ?> + +
+ + +
++ + setting_page_url ); + $refresh = sprintf( + // translators: %s: Refresh the page link. + esc_html__( '%s to see these results.', 'classifai' ), + '' . esc_html__( 'Refresh the page', 'classifai' ) . '' + ); + echo wp_kses_post( + sprintf( + /* translators: %1$s: Taxonomy name, %d: Number of terms processed */ + __( 'Finding similar %1$s, %2$d %1$s processed. %3$s', 'classifai' ), + esc_html( $label ), + absint( $processed ), + ( absint( $processed ) > 0 ) ? $refresh : '' + ) + ); + ?> +
+ get_embeddings_meta_key() ); + $generated = wp_count_terms( + [ + 'taxonomy' => $taxonomy, + 'hide_empty' => false, + 'meta_key' => $meta_key, // phpcs:ignore WordPress.DB.SlowDBQuery.slow_db_query_meta_key + 'meta_compare' => 'EXISTS', + ] + ); + ?> ++ + %2$d %1$s processed.', 'classifai' ), + esc_html( $label ), + absint( $generated ) + ) + ); + ?> +
+ + + ++ +
+