From 57abf96e7bbed64a92a7cff351acbd3e1a8ded88 Mon Sep 17 00:00:00 2001 From: Bas van Dinther Date: Thu, 17 Aug 2023 13:53:43 +0200 Subject: [PATCH] Transition words in content (#37) * Transitionword check * Fix styling * Transition word check including Dutch translations * Fix styling * Update README * Use filterXpath so filtering with CSS select is not needed * wip * Fix styling * Remove unnecessary url param * Use readability to get the amount of phrases * Update README * wip * Fix styling * Improving splitting text in phrases * Fix styling * wip --------- Co-authored-by: Baspa --- README.md | 5 +- config/seo.php | 12 ++ resources/lang/en.json | 87 ++++++++++- resources/lang/nl.json | 141 ++++++++++++++---- src/Checks/Content/AltTagCheck.php | 40 ++--- src/Checks/Content/ContentLengthCheck.php | 6 +- .../Content/TransitionWordRatioCheck.php | 116 ++++++++++++++ src/Helpers/TransitionWords.php | 76 ++++++++++ .../Content/TransitionWordRatioCheckTest.php | 74 +++++++++ 9 files changed, 501 insertions(+), 56 deletions(-) create mode 100644 src/Checks/Content/TransitionWordRatioCheck.php create mode 100644 src/Helpers/TransitionWords.php create mode 100644 tests/Checks/Content/TransitionWordRatioCheckTest.php diff --git a/README.md b/README.md index 29bcc7dd..8aef06c5 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ ## Introduction -This package is your guidance to get a better SEO score on search engines. Laravel SEO Scanner scans your code and crawls the routes from your app. The package has 22 checks that will check on performance, configurations, use of meta tags and content quality. +This package is your guidance to get a better SEO score on search engines. Laravel SEO Scanner scans your code and crawls the routes from your app. The package has 23 checks that will check on performance, configurations, use of meta tags and content quality. Easily configure which routes to scan, exclude or include specific checks or even add your own checks! 
Completing checks will further improve the SEO score and thus increase the chance of ranking higher at the search engines. @@ -255,6 +255,9 @@ These checks are available in the package. You can add or remove checks in the c ✅ The page contains no broken links.
✅ The page contains no broken images.
✅ Length of the content is at least 2100 characters.
+✅ A minimum of 30% of the sentences contain a transition word or phrase.
+ +> Note: To change the locale of the transition words, you can publish the config file and change the locale in the config file. The default locale is `null` which uses the language of your `app` config. If set to `nl` or `en`, the transition words will be in Dutch or English. If you want to add more locales, you can create a pull request. ### Meta diff --git a/config/seo.php b/config/seo.php index 713f187e..682c6a64 100644 --- a/config/seo.php +++ b/config/seo.php @@ -1,6 +1,18 @@ null, + /* |-------------------------------------------------------------------------- | Cache diff --git a/resources/lang/en.json b/resources/lang/en.json index 4e6cc695..b27fb050 100644 --- a/resources/lang/en.json +++ b/resources/lang/en.json @@ -30,5 +30,88 @@ "failed.performance.javascript_size": "The page contains Javascript files that are too large (max :expectedValue). These files were found: :actualValue.", "failed.performance.response": "The page returned a response code other than :expectedValue. The actual response code was :actualValue.", "failed.performance.ttfb": "The page took too long to load (max :expectedValuems). The actual time was :actualValuems.", - "failed.performance.ttfb.missing_url": "We could not get the TTFB for this page." -} \ No newline at end of file + "failed.performance.ttfb.missing_url": "We could not get the TTFB for this page.", + "failed.content.transition_words_ratio_check.too_few_transition_words": "The page contains too few transition words. 
The recommended minimum is 30%, while the actual number is :actualValue%.", + "failed.content.transition_words_ratio_check.no_phrases_found": "The page does not contain any sentences.", + "additionally": "additionally", + "moreover": "moreover", + "furthermore": "furthermore", + "in addition": "in addition", + "not only": "not only", + "but also": "but also", + "as well as": "as well as", + "besides": "besides", + "what's more": "what's more", + "however": "however", + "nevertheless": "nevertheless", + "on the other hand": "on the other hand", + "in contrast": "in contrast", + "conversely": "conversely", + "although": "although", + "while": "while", + "yet": "yet", + "even though": "even though", + "nonetheless": "nonetheless", + "similarly": "similarly", + "likewise": "likewise", + "in comparison": "in comparison", + "just as": "just as", + "compared to": "compared to", + "similarly to": "similarly to", + "therefore": "therefore", + "thus": "thus", + "consequently": "consequently", + "as a result": "as a result", + "because": "because", + "since": "since", + "so": "so", + "due to": "due to", + "owing to": "owing to", + "accordingly": "accordingly", + "indeed": "indeed", + "certainly": "certainly", + "of course": "of course", + "undoubtedly": "undoubtedly", + "without a doubt": "without a doubt", + "naturally": "naturally", + "for example": "for example", + "for instance": "for instance", + "such as": "such as", + "to illustrate": "to illustrate", + "in particular": "in particular", + "first": "first", + "second": "second", + "third": "third", + "next": "next", + "then": "then", + "afterward": "afterward", + "finally": "finally", + "in the meantime": "in the meantime", + "subsequently": "subsequently", + "in conclusion": "in conclusion", + "to sum up": "to sum up", + "ultimately": "ultimately", + "in summary": "in summary", + "all in all": "all in all", + "overall": "overall", + "meanwhile": "meanwhile", + "before": "before", + "after": "after", + "during": 
"during", + "until": "until", + "eventually": "eventually", + "soon": "soon", + "in the past": "in the past", + "in the future": "in the future", + "in other words": "in other words", + "that is to say": "that is to say", + "specifically": "specifically", + "to clarify": "to clarify", + "in this case": "in this case", + "an example of this is": "an example of this is", + "to demonstrate": "to demonstrate", + "admittedly": "admittedly", + "granted": "granted", + "while it is true": "while it is true" +} + diff --git a/resources/lang/nl.json b/resources/lang/nl.json index 8cc4583c..e2ece158 100644 --- a/resources/lang/nl.json +++ b/resources/lang/nl.json @@ -1,31 +1,114 @@ { - "failed.configuration.nofollow.meta": "The page contains a nofollow meta tag, while it should not.", - "failed.configuration.nofollow.tag": "The page contains a nofollow tag, while it should not.", - "failed.configuration.noindex.meta": "The page contains a noindex meta tag, while it should not.", - "failed.configuration.noindex.tag": "The page contains a noindex tag, while it should not.", - "failed.configuration.robots.disallowed": "The robots.txt file for this page contains a disallow rule for this page.", - "failed.configuration.robots.missing_url": "We could not get the robots.txt file for this page.", - "failed.content.alt_tag": "The page contains images without or empty alt tags. These images were found: :actualValue.", - "failed.content.broken_images": "The page contains broken images. These images were found: :actualValue.", - "failed.content.broken_links": "The page contains broken links. These links were found: :actualValue.", - "failed.content.length": "The content is :actualValue characters long. It should be at least :expectedValue characters long.", - "failed.content.mixed_content": "The page contains links to insecure addresses, while it should not. These links were found :actualValue.", - "failed.content.multiple_h1": "The page contains multiple h1 tags, while it should not. 
These tags were found :actualValue.", - "failed.content.no_heading": "The page does not contain any h1 tag, while it should.", - "failed.content.no_title": "The page does not contain a title tag, while it should.", - "failed.content.title_length": "The page title is :actualValue characters long. It should be max :expectedValue characters long.", - "failed.meta.description": "The page does not contain a description meta tag, while it should.", - "failed.meta.no_lang": "The page does not contain a lang attribute, while it should.", - "failed.meta.open_graph_image": "The page does not contain an open graph image, while it should.", - "failed.meta.open_graph_image.broken": "The page contains a broken open graph image. This image was found: :actualValue.", - "failed.meta.title": "The page title contains :actualValue in the title, while it should not.", - "failed.meta.title.no_content": "The page title is empty, while it should not be.", - "failed.performance.compression": "The page is not compressed (using either gzip or deflate), while it should be.", - "failed.performance.css_size": "The page contains CSS files that are too large (max :expectedValue). These files were found: :actualValue.", - "failed.performance.html_size": "The page contains HTML that is too large (max :expectedValue), the actual size is :actualValue.", - "failed.performance.image_size": "The page contains images that are too large (max :expectedValue). These images were found: :actualValue.", - "failed.performance.javascript_size": "The page contains Javascript files that are too large (max :expectedValue). These files were found: :actualValue.", - "failed.performance.response": "The page returned a response code other than :expectedValue. The actual response code was :actualValue.", - "failed.performance.ttfb": "The page took too long to load (max :expectedValuems). The actual time was :actualValuems.", - "failed.performance.ttfb.missing_url": "We could not get the TTFB for this page." 
+ "failed.configuration.nofollow.meta": "De pagina bevat een nofollow meta-tag, terwijl dat niet zou moeten.", + "failed.configuration.nofollow.tag": "De pagina bevat een nofollow-tag, terwijl dat niet zou moeten.", + "failed.configuration.noindex.meta": "De pagina bevat een noindex meta-tag, terwijl dat niet zou moeten.", + "failed.configuration.noindex.tag": "De pagina bevat een noindex-tag, terwijl dat niet zou moeten.", + "failed.configuration.robots.disallowed": "Het robots.txt-bestand voor deze pagina bevat een regel die deze pagina uitsluit.", + "failed.configuration.robots.missing_url": "We konden het robots.txt-bestand voor deze pagina niet ophalen.", + "failed.content.alt_tag": "De pagina bevat afbeeldingen zonder alt-tags of met lege alt-tags. Deze afbeeldingen zijn gevonden: :actualValue.", + "failed.content.broken_images": "De pagina bevat kapotte afbeeldingen. Deze afbeeldingen zijn gevonden: :actualValue.", + "failed.content.broken_links": "De pagina bevat kapotte links. Deze links zijn gevonden: :actualValue.", + "failed.content.length": "De inhoud is :actualValue tekens lang. Het zou minstens :expectedValue tekens lang moeten zijn.", + "failed.content.length.parse": "We konden de inhoud van deze pagina niet analyseren, probeer het opnieuw.", + "failed.content.mixed_content": "De pagina bevat links naar onveilige adressen, terwijl dat niet zou moeten. Deze links zijn gevonden: :actualValue.", + "failed.content.multiple_h1": "De pagina bevat meerdere h1-tags, terwijl dat niet zou moeten. Deze tags zijn gevonden: :actualValue.", + "failed.content.no_heading": "De pagina bevat geen enkele h1-tag, terwijl dat zou moeten.", + "failed.content.no_title": "De pagina bevat geen titel-tag, terwijl dat zou moeten.", + "failed.content.title_length": "De paginatitel is :actualValue tekens lang. 
Het zou maximaal :expectedValue tekens lang moeten zijn.", + "failed.meta.description": "De pagina bevat geen beschrijvende meta-tag, terwijl dat zou moeten.", + "failed.meta.no_lang": "De pagina bevat geen taalattribuut, terwijl dat zou moeten.", + "failed.meta.open_graph_image": "De pagina bevat geen Open Graph-afbeelding, terwijl dat zou moeten.", + "failed.meta.open_graph_image.broken": "De pagina bevat een kapotte Open Graph-afbeelding. Deze afbeelding is gevonden: :actualValue.", + "failed.meta.title": "De paginatitel bevat :actualValue in de titel, terwijl dat niet zou moeten.", + "failed.meta.title.no_content": "De paginatitel is leeg, terwijl dat niet zou moeten.", + "failed.performance.compression": "De pagina is niet gecomprimeerd (met gzip of deflate), terwijl dat zou moeten.", + "failed.performance.css_size": "De pagina bevat CSS-bestanden die te groot zijn (maximaal :expectedValue). Deze bestanden zijn gevonden: :actualValue.", + "failed.performance.html_size": "De pagina bevat HTML die te groot is (maximaal :expectedValue), de werkelijke grootte is :actualValue.", + "failed.performance.image_size": "De pagina bevat afbeeldingen die te groot zijn (maximaal :expectedValue). Deze afbeeldingen zijn gevonden: :actualValue.", + "failed.performance.javascript_size": "De pagina bevat Javascript-bestanden die te groot zijn (maximaal :expectedValue). Deze bestanden zijn gevonden: :actualValue.", + "failed.performance.response": "De pagina heeft een andere responscode dan :expectedValue geretourneerd. De werkelijke responscode was :actualValue.", + "failed.performance.ttfb": "De pagina heeft te lang geladen (maximaal :expectedValuems). De werkelijke tijd was :actualValuems.", + "failed.performance.ttfb.missing_url": "We konden de TTFB voor deze pagina niet ophalen.", + "failed.content.transition_words_ratio_check.too_few_transition_words": "De pagina bevat te weinig transitiewoorden. 
Het zou minstens 30% moeten zijn, maar het is :actualValue%.", + "failed.content.transition_words_ratio_check.no_phrases_found": "De pagina bevat geen zinnen.", + "additionally": "daarnaast", + "moreover": "bovendien", + "furthermore": "verder", + "in addition": "daarnaast", + "not only": "niet alleen", + "but also": "maar ook", + "as well as": "evenals", + "besides": "bovendien", + "what's more": "wat meer is", + "however": "echter", + "nevertheless": "desalniettemin", + "on the other hand": "aan de andere kant", + "in contrast": "in tegenstelling", + "conversely": "omgekeerd", + "although": "hoewel", + "while": "terwijl", + "yet": "toch", + "even though": "zelfs als", + "nonetheless": "desondanks", + "similarly": "op dezelfde wijze", + "likewise": "evenzo", + "in comparison": "in vergelijking", + "just as": "net zoals", + "compared to": "vergeleken met", + "similarly to": "net als", + "therefore": "daarom", + "thus": "zo", + "consequently": "als gevolg", + "as a result": "als resultaat", + "because": "omdat", + "since": "aangezien", + "so": "dus", + "due to": "vanwege", + "owing to": "te danken aan", + "accordingly": "overeenkomstig", + "indeed": "inderdaad", + "certainly": "zeker", + "of course": "natuurlijk", + "undoubtedly": "ongetwijfeld", + "without a doubt": "zonder twijfel", + "naturally": "natuurlijkerwijs", + "for example": "bijvoorbeeld", + "for instance": "bijvoorbeeld", + "such as": "zoals", + "to illustrate": "ter illustratie", + "in particular": "in het bijzonder", + "first": "eerst", + "second": "tweede", + "third": "derde", + "next": "volgende", + "then": "dan", + "afterward": "daarna", + "finally": "uiteindelijk", + "in the meantime": "ondertussen", + "subsequently": "vervolgens", + "in conclusion": "ter conclusie", + "to sum up": "om samen te vatten", + "ultimately": "uiteindelijk", + "in summary": "samengevat", + "all in all": "alles bij elkaar genomen", + "overall": "over het geheel genomen", + "meanwhile": "ondertussen", + "before": 
"voordat", + "after": "na", + "during": "tijdens", + "until": "totdat", + "eventually": "uiteindelijk", + "soon": "binnenkort", + "in the past": "in het verleden", + "in the future": "in de toekomst", + "in other words": "met andere woorden", + "that is to say": "dat wil zeggen", + "specifically": "specifiek", + "to clarify": "om te verduidelijken", + "in this case": "in dit geval", + "an example of this is": "een voorbeeld hiervan is", + "to demonstrate": "om te demonstreren", + "admittedly": "toegegeven", + "granted": "toegegeven", + "while it is true": "terwijl het waar is" } \ No newline at end of file diff --git a/src/Checks/Content/AltTagCheck.php b/src/Checks/Content/AltTagCheck.php index ba5681dc..868cf151 100644 --- a/src/Checks/Content/AltTagCheck.php +++ b/src/Checks/Content/AltTagCheck.php @@ -39,27 +39,10 @@ public function check(Response $response, Crawler $crawler): bool public function validateContent(Crawler $crawler): bool { $imagesWithoutAlt = $crawler->filterXPath('//img[not(@alt)]')->each(function (Crawler $node, $i) { - $src = $node->attr('src'); - - $dimensions = $this->getImageDimensions($src, $node); - - if ($dimensions['width'] < 5 || $dimensions['height'] < 5) { - return null; - } - - return $src; + return $this->filterImage($node); }); - $imagesWithEmptyAlt = $crawler->filterXPath('//img[@alt=""]')->each(function (Crawler $node, $i) { - $src = $node->attr('src'); - - $dimensions = $this->getImageDimensions($src, $node); - - if ($dimensions['width'] < 5 || $dimensions['height'] < 5) { - return null; - } - - return $src; + return $this->filterImage($node); }); // Remove null values from the arrays @@ -81,7 +64,24 @@ public function validateContent(Crawler $crawler): bool return true; } - public function getImageDimensions(string $src, Crawler $node): array + private function filterImage($node): ?string + { + $src = $node->attr('src'); + + if (str_contains($src, '.svg')) { + return $src; + } + + $dimensions = $this->getImageDimensions($src, 
$node); + + if ($dimensions['width'] < 5 || $dimensions['height'] < 5) { + return null; + } + + return $src; + } + + private function getImageDimensions(string $src, Crawler $node): array { if (app()->runningUnitTests()) { return [ diff --git a/src/Checks/Content/ContentLengthCheck.php b/src/Checks/Content/ContentLengthCheck.php index 47210e07..f8ddfc72 100644 --- a/src/Checks/Content/ContentLengthCheck.php +++ b/src/Checks/Content/ContentLengthCheck.php @@ -41,7 +41,7 @@ public function check(Response $response, Crawler $crawler): bool $content = $this->getContentToValidate($response, $crawler); if (! $content) { - return true; + return false; } return $this->validateContent($content); @@ -49,15 +49,13 @@ public function check(Response $response, Crawler $crawler): bool public function getContentToValidate(Response $response, Crawler $crawler): ?string { - $url = $response->transferStats->getHandlerStats()['url']; - $body = $response->body(); if ($this->useJavascript) { $body = $crawler->filter('body')->html(); } - $readability = new Readability($body, $url); + $readability = new Readability($body); $readability->init(); diff --git a/src/Checks/Content/TransitionWordRatioCheck.php b/src/Checks/Content/TransitionWordRatioCheck.php new file mode 100644 index 00000000..a6d17989 --- /dev/null +++ b/src/Checks/Content/TransitionWordRatioCheck.php @@ -0,0 +1,116 @@ +validateContent($response, $crawler)) { + return false; + } + + return true; + } + + public function validateContent(Response $response, Crawler $crawler): bool + { + $body = $response->body(); + + if ($this->useJavascript) { + $body = $crawler->filter('body')->html(); + } + + $readability = new Readability($body); + + $readability->init(); + + $content = $readability->getContent()->textContent; + + if ($content == 'Sorry, Readability was unable to parse this page for content.') { + $this->failureReason = __('failed.content.length.parse'); + + return false; + } + + $transitionWords = 
TransitionWords::getTransitionWordsOnly(config('seo.language')); + + $this->actualValue = $this->calculatePercentageOfTransitionWordsInContent($content, $transitionWords); + + if ($this->actualValue < 30) { + $this->failureReason = __('failed.content.transition_words_ratio_check.too_few_transition_words', [ + 'actualValue' => $this->actualValue, + ]); + + return false; + } + + return true; + } + + public function calculatePercentageOfTransitionWordsInContent($content, $transitionWords) + { + // Get phrases seperate by new line, dot, exclamation mark or question mark + $phrases = preg_split('/\n|\.|\!|\?/', $content); + + // Count all phrases where it has more than 5 words + $totalPhrases = array_filter($phrases, function ($phrase) { + return str_word_count($phrase) > 5; + }); + + if (count($totalPhrases) === 0) { + $this->actualValue = 0; + $this->failureReason = __('failed.content.transition_words_ratio_check.no_phrases_found'); + + return 0; + } + + $phrasesWithTransitionWord = 0; + + foreach ($transitionWords as $transitionWord) { + $phrasesWithTransitionWord += $this->calculateNumberOfPhrasesWithTransitionWord($content, $transitionWord); + } + + return round($phrasesWithTransitionWord / count($totalPhrases) * 100, 0, PHP_ROUND_HALF_UP); + } + + public function calculateNumberOfPhrasesWithTransitionWord(string $content, string $transitionWord): int + { + preg_match_all('/\b[\w\s]+\b/', $content, $matches); + + $phrasesWithTransitionWord = 0; + + foreach ($matches[0] as $phrase) { + if (stripos($phrase, $transitionWord) !== false) { + $phrasesWithTransitionWord++; + } + } + + return $phrasesWithTransitionWord; + } +} diff --git a/src/Helpers/TransitionWords.php b/src/Helpers/TransitionWords.php new file mode 100644 index 00000000..6c0a5838 --- /dev/null +++ b/src/Helpers/TransitionWords.php @@ -0,0 +1,76 @@ + [ + 'additionally', 'moreover', 'furthermore', 'in addition', + 'not only', 'but also', 'as well as', 'besides', "what's more", + ], + 'contrast' => [ + 
'however', 'nevertheless', 'on the other hand', 'in contrast', + 'conversely', 'although', 'while', 'yet', 'even though', 'nonetheless', + ], + 'comparison' => [ + 'similarly', 'likewise', 'in comparison', 'just as', + 'compared to', 'similarly to', + ], + 'cause_and_effect' => [ + 'therefore', 'thus', 'consequently', 'as a result', 'because', + 'since', 'so', 'due to', 'owing to', 'accordingly', + ], + 'emphasis' => [ + 'indeed', 'certainly', 'of course', 'undoubtedly', + 'without a doubt', 'naturally', + ], + 'example' => [ + 'for example', 'for instance', 'such as', 'to illustrate', 'in particular', + ], + 'sequence_order' => [ + 'first', 'second', 'third', 'next', 'then', 'afterward', 'meanwhile', + 'finally', 'in the meantime', 'subsequently', + ], + 'conclusion_summary' => [ + 'in conclusion', 'to sum up', 'ultimately', 'in summary', 'all in all', 'overall', + ], + 'time' => [ + 'meanwhile', 'before', 'after', 'during', 'while', 'since', 'until', 'eventually', 'soon', 'in the past', 'in the future', + ], + 'clarification' => [ + 'in other words', 'that is to say', 'specifically', 'to clarify', + ], + 'illustration' => [ + 'specifically', 'in this case', 'an example of this is', 'to demonstrate', + ], + 'concession' => [ + 'admittedly', 'granted', 'even though', 'while it is true', + ], + ]; + + public static function getTransitionWords(): array + { + return self::$transitionWords; + } + + public static function getTransitionWordsOnly(string $locale = null): array + { + $transitionWords = self::$transitionWords; + + $words = []; + + foreach ($transitionWords as $transitionWord) { + foreach ($transitionWord as $word) { + $words[] = __($word, [], $locale); + } + } + + return $words; + } + + public static function getTransitionWordsByType(string $type): array + { + return self::$transitionWords[$type]; + } +} diff --git a/tests/Checks/Content/TransitionWordRatioCheckTest.php b/tests/Checks/Content/TransitionWordRatioCheckTest.php new file mode 100644 index 
00000000..f77ff821 --- /dev/null +++ b/tests/Checks/Content/TransitionWordRatioCheckTest.php @@ -0,0 +1,74 @@ + Http::response( + ' + + Test + + +

'.$body.'

+ ', + 200), + ]); + + $crawler->addHtmlContent(Http::get('vormkracht10.nl')->body()); + + $this->assertTrue($check->check(Http::get('vormkracht10.nl'), $crawler)); +}); + +it('can perform the transition word ratio check where sentence does not match criteria', function () { + $check = new TransitionWordRatioCheck(); + $crawler = new Crawler(); + + $body = 'Lorem ipsum. Dolor sit amet. This is the next sentence. Fourth sentence. Fifth sentence.'; + + Http::fake([ + 'vormkracht10.nl' => Http::response( + ' + + Test + + +

'.$body.'

+ ', + 200), + ]); + + $crawler->addHtmlContent(Http::get('vormkracht10.nl')->body()); + + $this->assertFalse($check->check(Http::get('vormkracht10.nl'), $crawler)); +}); + +it('can perform the transition word ratio check on page without content', function () { + $check = new TransitionWordRatioCheck(); + $crawler = new Crawler(); + + $body = ''; + + Http::fake([ + 'vormkracht10.nl' => Http::response( + ' + + Test + + +

'.$body.'

+ ', + 200), + ]); + + $crawler->addHtmlContent(Http::get('vormkracht10.nl')->body()); + + $this->assertFalse($check->check(Http::get('vormkracht10.nl'), $crawler)); +});