From f0363e07934c8a1dc24b7f5bebcda8eba911b2f4 Mon Sep 17 00:00:00 2001 From: Rashad Gover Date: Thu, 15 Aug 2019 04:13:47 -0700 Subject: [PATCH] [#126] Implement function for fetching repos with pagination (#127) * This is a combination of 2 commits. [#105] Implement `MonadDownload` effect (#109) [#43] Add Cabal.hs module Resolves #43 [#43] Add getCabalFile function Resolves #43 Fix after review [#43] Replace Issue with Repo in functions that fetch cabal file Resolves #43 Fix after review [#43] Fix fetchRepoCategories function Resolves #43 Fix after review Fix after review Fix after review [#43] Use new Download module in Cabal module Resolves #43 [#43] Add testing for Cabal module functions Resolves #43 Fix after review [#43] Fix rebase Resolves #43 [#106] Fix test for MonadCabal implementation Resolves #106 [#106] Add test for repoCabalUrl function Resolves #106 [#106] Add documentation Resolves #106 [#106] Add logging to MonadDownload and MonadCabal Resolves #106 [#104] Change parseIssueUserData to parseUserData because it's more accurate Resolves #104 [#104] Change constraints from WithDb to MonadIO Resolves #104 [#104] Add fetchAndUpsertRepos function Resolves #104 [#104] Introduce MonadUnliftIO Resolves #104 Fix after review Fix after review [#104] Rebase Resolves #104 [#104] Change fatal error logs to warnings Resolves #104 [#126] Modify searchHaskellRepos to take arguements for pagination of results Resolves #126 [#126] Refactor searchHaskellRepos for more flexibility I've also added a function for returning the current search rate limits. It's useful for logging purposes. Resolves #126 * [#126] Fix rebase Resolves #126 * [#126] Add function for searching for Haskell repositories within a date range Resolves #126 * [#126] Implement algorithm for syncing repos by date interval Resolves #126 * Fix after review * [#126] Replace fetchAndUpdate with sync Resolves #126 * [#126] Fix multiway-if Resolves #126 * [#126] Refactor syncReposByDate function Resolves #126 * Fix after review * Fix after review * [#126] Add wait functionality for githubSearch function Resolves #126 * [#126] Add julianDayToIso function to Prelude Resolves #126 * [#126] Make GitHub search functions more readable Resolves #126 * Fix after review * [#126] Fix formatting Resolves #126 * Fix after review * [#126] Move julianDayToIso function to IW.Time module Resolves #126 * Fix after review * Fix after review * [#126] Refactor sync functions Resolves #126 * Fix after review * Fix after review * [#126] Create better search function Resolves #126 * [#126] Refactor Update and Search modules Resolves #126 * Fix after review * Fix after review * Fix after review * [#126] Add more logging Resolves #126 * [#126] Clean up Update and Search modules Resolves #126 * [#126] Fix comments Resolves #126 * Fix after review --- issue-wanted.cabal | 2 + src/IW/App/Error.hs | 8 +- src/IW/Effects/Cabal.hs | 16 +-- src/IW/Sync/Search.hs | 221 ++++++++++++++++++++++++++++++---------- src/IW/Sync/Update.hs | 40 +++++--- src/IW/Time.hs | 16 +++ 6 files changed, 227 insertions(+), 76 deletions(-) create mode 100644 src/IW/Time.hs diff --git a/issue-wanted.cabal b/issue-wanted.cabal index c525e8d..aab237b 100644 --- a/issue-wanted.cabal +++ b/issue-wanted.cabal @@ -98,6 +98,8 @@ library IW.Sync.Search IW.Sync.Update + IW.Time + build-depends: aeson >= 1.4 , async ^>= 2.2.2 , bytestring ^>= 0.10 diff --git a/src/IW/App/Error.hs b/src/IW/App/Error.hs index 8c8ac84..474238c 100644 --- a/src/IW/App/Error.hs +++ b/src/IW/App/Error.hs @@ -24,6 +24,7 @@ module IW.App.Error , headerDecodeError , dbError , dbNamedError + , githubHttpError , urlDownloadFailedError -- * Error throwing helpers @@ -131,7 +132,7 @@ data IError -} data GError {- | A HTTP error occurred. The actual caught error is included. -} - = HTTPError Text + = HttpError Text {- | An error in the parser itself. -} | ParseError Text {- | The JSON is malformed or unexpected. -} @@ -143,7 +144,7 @@ data GError -- | Map the @github@ library's @Error@ type into AppErrorType. githubErrToAppErr :: GitHub.Error -> AppErrorType githubErrToAppErr = \case - GitHub.HTTPError httpException -> GitHubError $ HTTPError $ show httpException + GitHub.HTTPError httpException -> GitHubError $ HttpError $ show httpException GitHub.ParseError text -> GitHubError $ ParseError text GitHub.JsonError text -> GitHubError $ JsonError text GitHub.UserError text -> GitHubError $ UserError text @@ -207,6 +208,9 @@ dbError = InternalError . DbError dbNamedError :: PgNamedError -> AppErrorType dbNamedError = InternalError . DbNamedError +githubHttpError :: Text -> AppErrorType +githubHttpError = GitHubError . HttpError + urlDownloadFailedError :: Url -> AppErrorType urlDownloadFailedError = UrlDownloadFailed diff --git a/src/IW/Effects/Cabal.hs b/src/IW/Effects/Cabal.hs index 9587737..a16be4e 100644 --- a/src/IW/Effects/Cabal.hs +++ b/src/IW/Effects/Cabal.hs @@ -73,11 +73,11 @@ categoryNames genPkgDescr = Category <$> splitCategories genPkgDescr splitCategories :: GenericPackageDescription -> [Text] splitCategories = splitAndStrip "," . toText . category . packageDescription -{- | This function takes a delimeter and a delimeter seperated value, -and returns a list of @Text@ values stripped of excess whitespace. -Note that it returns an empty list when an empty delimeter seperated value is -passed in. This prevents the value @[""]@ from being returned. --} -splitAndStrip :: Text -> Text -> [Text] -splitAndStrip _ "" = [] -splitAndStrip delim text = strip <$> splitOn delim text + {- | This function takes a delimeter and a delimeter seperated value, + and returns a list of @Text@ values stripped of excess whitespace. + Note that it returns an empty list when an empty delimeter seperated value is + passed in. This prevents the value @[""]@ from being returned. + -} + splitAndStrip :: Text -> Text -> [Text] + splitAndStrip _ "" = [] + splitAndStrip delim text = strip <$> splitOn delim text diff --git a/src/IW/Sync/Search.hs b/src/IW/Sync/Search.hs index ac0afc3..4db6491 100644 --- a/src/IW/Sync/Search.hs +++ b/src/IW/Sync/Search.hs @@ -1,97 +1,214 @@ +{-# LANGUAGE MultiWayIf #-} + {- | This module provides functions used in fetching Haskell repos and -issues from the GitHub API. Functions with the fetch- prefix such as +issues from the GitHub API. Functions with the Search- prefix such as @fetchAllHaskellRepos@ can be used to make request to the GitHubAPI. This module also exposes functions that map @github@ library types to our own, and a parser for extracting the @RepoOwner@ and @RepoName@ from a URL. -} module IW.Sync.Search - ( fetchAllHaskellRepos - , fetchAllHaskellIssues - , fetchHaskellIssuesByLabels - , fromGitHubIssue - , fromGitHubRepo + ( searchAllHaskellRepos + , searchAllHaskellIssues + , searchAllHaskellIssuesByLabels , parseUserData ) where -import GitHub (SearchResult (..), URL (..)) -import GitHub.Endpoints.Search (searchRepos, searchIssues) +import Data.Time (Day (..), addDays) +import GitHub (SearchResult (..), URL (..), RateLimit (..), Limits, Paths, QueryString, executeRequest', limitsRemaining) +import GitHub.Endpoints.RateLimit (rateLimit) +import UnliftIO (MonadUnliftIO) +import UnliftIO.Concurrent (threadDelay) -import IW.App (WithError) +import IW.App (WithError, githubErrToAppErr, throwError) import IW.Core.Issue (Issue (..), Label (..)) import IW.Core.Repo (Repo (..), RepoOwner (..), RepoName (..)) import IW.Core.SqlArray (SqlArray (..)) -import IW.App.Error (githubErrToAppErr, throwError) +import IW.Time (julianDayToIso) import qualified GitHub import qualified Data.Text as T import qualified Data.Vector as V +{- HLINT ignore "Use prec" -} + --- | Fetch all repositories built with the Haskell language. -fetchAllHaskellRepos - :: ( MonadIO m +-- | Search all Haskell repositories starting from the most recent day. +searchAllHaskellRepos + :: forall env m. + ( MonadIO m + , MonadUnliftIO m , WithError m , WithLog env m ) - => m [GitHub.Repo] -fetchAllHaskellRepos = liftGitHubSearchToApp searchHaskellRepos - --- | Convert a value of the @GitHub.Repo@ type to a value of our own @Repo@ type. -fromGitHubRepo :: GitHub.Repo -> Repo -fromGitHubRepo githubRepo = Repo - { repoOwner = RepoOwner $ GitHub.untagName $ GitHub.simpleOwnerLogin $ GitHub.repoOwner githubRepo - , repoName = RepoName $ GitHub.untagName $ GitHub.repoName githubRepo - , repoDescr = fromMaybe "" $ GitHub.repoDescription githubRepo - , repoCategories = SqlArray [] - } + => Day -- ^ The function starts from this day and goes back in time + -> Integer -- ^ The size of the date interval that is used in the search + -> m [Repo] +searchAllHaskellRepos recent interval = do + githubRepos <- githubSearchAll ["search", "repositories"] "language:haskell" recent interval [] + pure $ fromGitHubRepo <$> githubRepos --- | Fetch all open issues with Haskell language and the labels passed in to the function. -fetchHaskellIssuesByLabels - :: ( MonadIO m +-- | Search all Haskell open issues starting from the most recent day. +searchAllHaskellIssues + :: forall env m. + ( MonadUnliftIO m , WithError m , WithLog env m ) - => [Label] - -> m [GitHub.Issue] -fetchHaskellIssuesByLabels = liftGitHubSearchToApp . searchHaskellIssuesByLabels + => Day + -> Integer + -> m [Issue] +searchAllHaskellIssues = searchAllHaskellIssuesByLabels [] --- | Fetch all open issues with Haskell language. -fetchAllHaskellIssues - :: ( MonadIO m +-- | Search all open Haskell issues with the corresponding labels. +searchAllHaskellIssuesByLabels + :: forall env m. + ( MonadIO m + , MonadUnliftIO m , WithError m , WithLog env m ) - => m [GitHub.Issue] -fetchAllHaskellIssues = fetchHaskellIssuesByLabels [] + => [Label] -- ^ The function will search for issues with these labels + -> Day + -> Integer + -> m [Issue] +searchAllHaskellIssuesByLabels labels recent interval = do + githubIssues <- githubSearchAll ["search", "issues"] queryString recent interval [] + pure $ catMaybes $ fromGitHubIssue <$> githubIssues + where + queryString :: Text + queryString = "language:haskell state:open" <> labelsToSearchQuery labels + + -- | Construct a github search query from a list of labels. + labelsToSearchQuery :: [Label] -> Text + labelsToSearchQuery = foldMap (\Label{..} -> "label:\"" <> unLabel <> "\" ") --- | Lift a github search action to work within our monad stack. -liftGitHubSearchToApp +githubSearchAll :: forall a env m. ( MonadIO m + , MonadUnliftIO m , WithError m , WithLog env m + , FromJSON a , Typeable a ) - => IO (Either GitHub.Error (SearchResult a)) + => Paths -- ^ Query paths + -> Text -- ^ Query properties + -> Day -- ^ The function starts at this day and goes back in time by the size of the interval + -> Integer -- ^ The date interval + -> [a] -- ^ A list of accumulated results used in recursive calls to this function -> m [a] -liftGitHubSearchToApp githubSearch = liftIO githubSearch >>= \case - Left err -> throwError $ githubErrToAppErr err - Right (SearchResult count vec) -> do - log Info $ "Fetching total of " <> show count <> " " <> typeName @a <> "s..." - pure $ V.toList vec +githubSearchAll paths properties recent interval acc = do + -- | Search for first page of the query and check the result count to see what to do next. + SearchResult count vec <- executeGithubSearch paths properties firstDay recent 1 + let firstPage = V.toList vec + -- | If count is 0, then all results for the given properties have been obtained. + if | count == 0 -> do + log I "All results obtained" + pure acc + -- | If count is less than or equal to 1000, then the interval is good and we can get + -- the rest of the results on pages 2 to 10. + | count <= 1000 -> do + listOfSearchResult <- mapM (executeGithubSearch paths properties firstDay recent) [2..10] + let remainingPages = concatMap searchResultToList listOfSearchResult + -- | Recursive call with a new @recent@ arguement and a new @acc@ arguement + -- representing all pages accumulated up to this point. + githubSearchAll paths properties (pred firstDay) interval (firstPage <> remainingPages <> acc) + -- | Otherwise, call the function with the same arguments but a smaller interval. + | otherwise -> githubSearchAll paths properties recent (pred interval) acc + where + -- | The first day of the search interval. It's calculated by subtracting the size of the + -- interval from the most recent day of the search interval. + firstDay :: Day + firstDay = negate interval `addDays` recent --- | Search all repositories built with the Haskell language. -searchHaskellRepos :: IO (Either GitHub.Error (SearchResult GitHub.Repo)) -searchHaskellRepos = searchRepos "language:haskell" + searchResultToList :: SearchResult a -> [a] + searchResultToList (SearchResult _ vec) = V.toList vec --- | Construct a github search query from a list of labels. -labelsToSearchQuery :: [Label] -> Text -labelsToSearchQuery = foldMap (\Label{..} -> "label:\"" <> unLabel <> "\" ") +-- | Executes a GitHub search action within the context of the @App@ monad. +executeGithubSearch + :: forall a env m. + ( MonadIO m + , MonadUnliftIO m + , WithError m + , WithLog env m + , FromJSON a + ) + => Paths -- ^ Query paths + -> Text -- ^ Query properties + -> Day -- ^ The first day of the date interval + -> Day -- ^ The last day of the date interval + -> Int -- ^ The number of the page to be searched + -> m (SearchResult a) +executeGithubSearch paths properties from to page = do + searchLimit <- getSearchRateLimit + log I $ "Current search limit information: " <> show searchLimit + if limitsRemaining searchLimit > 0 + then do + log D $ "Searching GitHub API with following request: " <> showGithubQuery paths properties from to page + liftIO (githubSearch paths properties from to page) >>= \case + Left err -> throwError $ githubErrToAppErr err + Right searchRes -> pure searchRes + else do + log I "API limit reached. Delaying search..." + threadDelay 60000000 + executeGithubSearch paths properties from to page + +-- | Performs a query against the GitHub Search API. +githubSearch + :: FromJSON a + => Paths + -> Text + -> Day + -> Day + -> Int + -> IO (Either GitHub.Error (SearchResult a)) +githubSearch paths properties from to page = executeRequest' $ buildGithubQuery paths properties from to page + +-- | Useful function for constructing a GitHub query and showing it as text. +showGithubQuery + :: Paths + -> Text + -> Day + -> Day + -> Int + -> Text +showGithubQuery paths properties from to page = show $ buildGithubQuery paths properties from to page --- | Search for all open Haskell issues with the corresponding labels. -searchHaskellIssuesByLabels :: [Label] -> IO (Either GitHub.Error (SearchResult GitHub.Issue)) -searchHaskellIssuesByLabels labels = searchIssues $ "language:haskell is:open " <> labelsToSearchQuery labels +-- | Function for building a GitHub search query. +buildGithubQuery + :: Paths -- ^ Query paths + -> Text -- ^ Query properties + -> Day -- ^ The first day of the date interval + -> Day -- ^ The last day of the date interval + -> Int -- ^ The number of the page to be searched + -> GitHub.GenRequest 'GitHub.MtJSON 'GitHub.RO a +buildGithubQuery paths properties from to page = GitHub.query paths queryString + where + queryString :: QueryString + queryString = + [ ("per_page", Just "100") + , ("page", Just $ show page) + , ("q", Just $ encodeUtf8 $ properties <> " " <> dateRange from to) + ] + + dateRange :: Day -> Day -> Text + dateRange from' to' = "created:" <> julianDayToIso from' <> ".." <> julianDayToIso to' + +-- | Function for fetching the current rate limit information for the GitHub Search API. +getSearchRateLimit :: forall m. (MonadIO m, MonadUnliftIO m, WithError m) => m Limits +getSearchRateLimit = liftIO rateLimit >>= \case + Left err -> throwError $ githubErrToAppErr err + Right RateLimit{..} -> pure rateLimitSearch + +-- | Convert a value of the @GitHub.Repo@ type to a value of our own @Repo@ type. +fromGitHubRepo :: GitHub.Repo -> Repo +fromGitHubRepo githubRepo = Repo + { repoOwner = RepoOwner $ GitHub.untagName $ GitHub.simpleOwnerLogin $ GitHub.repoOwner githubRepo + , repoName = RepoName $ GitHub.untagName $ GitHub.repoName githubRepo + , repoDescr = fromMaybe "" $ GitHub.repoDescription githubRepo + , repoCategories = SqlArray [] + } -- | Convert a value of the @GitHub.Issue@ type to a value of our own @Issue@ type. fromGitHubIssue :: GitHub.Issue -> Maybe Issue diff --git a/src/IW/Sync/Update.hs b/src/IW/Sync/Update.hs index 840bd3a..7f39304 100644 --- a/src/IW/Sync/Update.hs +++ b/src/IW/Sync/Update.hs @@ -4,7 +4,7 @@ data and insert it into the database. -} module IW.Sync.Update - ( fetchAndUpsertRepos + ( syncRepos ) where import Control.Monad.IO.Unlift (MonadUnliftIO) @@ -14,12 +14,13 @@ import IW.App (WithError) import IW.Core.Repo (Repo (..)) import IW.Db (WithDb, upsertRepos, updateRepoCategories) import IW.Effects.Cabal (MonadCabal (..), getCabalCategories) -import IW.Sync.Search (fetchAllHaskellRepos, fromGitHubRepo) +import IW.Sync.Search (searchAllHaskellRepos) +import IW.Time (getToday) --- | This function fetches the latest repos from the GitHub API, parses their @.cabal@ files, +-- | This function fetches all repos from the GitHub API, downloads their @.cabal@ files, -- and upserts them into the database. -fetchAndUpsertRepos +syncRepos :: forall env m. ( MonadCabal m , MonadUnliftIO m @@ -27,14 +28,25 @@ fetchAndUpsertRepos , WithLog env m , WithError m ) - => m () -fetchAndUpsertRepos = do - gitHubRepos <- fetchAllHaskellRepos - let repos = map fromGitHubRepo gitHubRepos + => Integer -- ^ The starting date interval used in the search function + -> m () +syncRepos interval = do + today <- liftIO getToday + repos <- searchAllHaskellRepos today interval upsertRepos repos - mapConcurrently_ fetchAndUpdateCategories repos - where - fetchAndUpdateCategories :: Repo -> m () - fetchAndUpdateCategories Repo{..} = do - categories <- getCabalCategories repoOwner repoName - updateRepoCategories repoOwner repoName categories + mapConcurrently_ syncCategories repos + +-- | This function takes a @Repo@ and attempts to download its @.cabal@ file. +syncCategories + :: forall env m. + ( MonadCabal m + , MonadUnliftIO m + , WithDb env m + , WithLog env m + , WithError m + ) + => Repo + -> m () +syncCategories Repo{..} = do + categories <- getCabalCategories repoOwner repoName + updateRepoCategories repoOwner repoName categories diff --git a/src/IW/Time.hs b/src/IW/Time.hs new file mode 100644 index 0000000..773f32f --- /dev/null +++ b/src/IW/Time.hs @@ -0,0 +1,16 @@ +module IW.Time + ( getToday + , julianDayToIso + ) where + +import Data.Time (Day (..), getCurrentTime, utctDay) +import Data.Time.Format (formatTime, defaultTimeLocale, iso8601DateFormat) + + +-- | Returns today's date as a Julian day. +getToday :: IO Day +getToday = utctDay <$> getCurrentTime + +-- | Converts a Julian day to a date in ISO 8601 (yyyy-mm-dd) format. +julianDayToIso :: Day -> Text +julianDayToIso = fromString . formatTime defaultTimeLocale (iso8601DateFormat Nothing)