Skip to content

Commit

Permalink
[#126] Implement function for fetching repos with pagination (#127)
Browse files Browse the repository at this point in the history
* This is a combination of 2 commits.

[#105] Implement `MonadDownload` effect (#109)

[#43] Add Cabal.hs module

Resolves #43

[#43] Add getCabalFile function

Resolves #43

Fix after review

[#43] Replace Issue with Repo in functions that fetch cabal file

Resolves #43

Fix after review

[#43] Fix fetchRepoCategories function

Resolves #43

Fix after review

Fix after review

Fix after review

[#43] Use new Download module in Cabal module

Resolves #43

[#43] Add testing for Cabal module functions

Resolves #43

Fix after review

[#43] Fix rebase

Resolves #43

[#106] Fix test for MonadCabal implementation

Resolves #106

[#106] Add test for repoCabalUrl function

Resolves #106

[#106] Add documentation

Resolves #106

[#106] Add logging to MonadDownload and MonadCabal

Resolves #106

[#104] Change parseIssueUserData to parseUserData because it's more accurate

Resolves #104

[#104] Change constraints from WithDb to MonadIO

Resolves #104

[#104] Add fetchAndUpsertRepos function

Resolves #104

[#104] Introduce MonadUnliftIO

Resolves #104

Fix after review

Fix after review

[#104] Rebase

Resolves #104

[#104] Change fatal error logs to warnings

Resolves #104

[#126] Modify searchHaskellRepos to take arguements for pagination of results

Resolves #126

[#126] Refactor searchHaskellRepos for more flexibility

I've also added a function for returning the current search rate limits.
It's useful for logging purposes.

Resolves #126

* [#126] Fix rebase

Resolves #126

* [#126] Add function for searching for Haskell repositories within a date range

Resolves #126

* [#126] Implement algorithm for syncing repos by date interval

Resolves #126

* Fix after review

* [#126] Replace fetchAndUpdate with sync

Resolves #126

* [#126] Fix multiway-if

Resolves #126

* [#126] Refactor syncReposByDate function

Resolves #126

* Fix after review

* Fix after review

* [#126] Add wait functionality for githubSearch function

Resolves #126

* [#126] Add julianDayToIso function to Prelude

Resolves #126

* [#126] Make GitHub search functions more readable

Resolves #126

* Fix after review

* [#126] Fix formatting

Resolves #126

* Fix after review

* [#126] Move julianDayToIso function to IW.Time module

Resolves #126

* Fix after review

* Fix after review

* [#126] Refactor sync functions

Resolves #126

* Fix after review

* Fix after review

* [#126] Create better search function

Resolves #126

* [#126] Refactor Update and Search modules

Resolves #126

* Fix after review

* Fix after review

* Fix after review

* [#126] Add more logging

Resolves #126

* [#126] Clean up Update and Search modules

Resolves #126

* [#126] Fix comments

Resolves #126

* Fix after review
  • Loading branch information
rashadg1030 authored and chshersh committed Aug 15, 2019
1 parent 2eae75b commit f0363e0
Show file tree
Hide file tree
Showing 6 changed files with 227 additions and 76 deletions.
2 changes: 2 additions & 0 deletions issue-wanted.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ library
IW.Sync.Search
IW.Sync.Update

IW.Time

build-depends: aeson >= 1.4
, async ^>= 2.2.2
, bytestring ^>= 0.10
Expand Down
8 changes: 6 additions & 2 deletions src/IW/App/Error.hs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ module IW.App.Error
, headerDecodeError
, dbError
, dbNamedError
, githubHttpError
, urlDownloadFailedError

-- * Error throwing helpers
Expand Down Expand Up @@ -131,7 +132,7 @@ data IError
-}
data GError
{- | A HTTP error occurred. The actual caught error is included. -}
= HTTPError Text
= HttpError Text
{- | An error in the parser itself. -}
| ParseError Text
{- | The JSON is malformed or unexpected. -}
Expand All @@ -143,7 +144,7 @@ data GError
-- | Map the @github@ library's @Error@ type into AppErrorType.
githubErrToAppErr :: GitHub.Error -> AppErrorType
githubErrToAppErr = \case
GitHub.HTTPError httpException -> GitHubError $ HTTPError $ show httpException
GitHub.HTTPError httpException -> GitHubError $ HttpError $ show httpException
GitHub.ParseError text -> GitHubError $ ParseError text
GitHub.JsonError text -> GitHubError $ JsonError text
GitHub.UserError text -> GitHubError $ UserError text
Expand Down Expand Up @@ -207,6 +208,9 @@ dbError = InternalError . DbError
dbNamedError :: PgNamedError -> AppErrorType
dbNamedError = InternalError . DbNamedError

githubHttpError :: Text -> AppErrorType
githubHttpError = GitHubError . HttpError

urlDownloadFailedError :: Url -> AppErrorType
urlDownloadFailedError = UrlDownloadFailed

Expand Down
16 changes: 8 additions & 8 deletions src/IW/Effects/Cabal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,11 @@ categoryNames genPkgDescr = Category <$> splitCategories genPkgDescr
splitCategories :: GenericPackageDescription -> [Text]
splitCategories = splitAndStrip "," . toText . category . packageDescription

{- | This function takes a delimeter and a delimeter seperated value,
and returns a list of @Text@ values stripped of excess whitespace.
Note that it returns an empty list when an empty delimeter seperated value is
passed in. This prevents the value @[""]@ from being returned.
-}
splitAndStrip :: Text -> Text -> [Text]
splitAndStrip _ "" = []
splitAndStrip delim text = strip <$> splitOn delim text
{- | This function takes a delimeter and a delimeter seperated value,
and returns a list of @Text@ values stripped of excess whitespace.
Note that it returns an empty list when an empty delimeter seperated value is
passed in. This prevents the value @[""]@ from being returned.
-}
splitAndStrip :: Text -> Text -> [Text]
splitAndStrip _ "" = []
splitAndStrip delim text = strip <$> splitOn delim text
221 changes: 169 additions & 52 deletions src/IW/Sync/Search.hs
Original file line number Diff line number Diff line change
@@ -1,97 +1,214 @@
{-# LANGUAGE MultiWayIf #-}

{- | This module provides functions used in fetching Haskell repos and
issues from the GitHub API. Functions with the fetch- prefix such as
issues from the GitHub API. Functions with the Search- prefix such as
@fetchAllHaskellRepos@ can be used to make request to the GitHubAPI.
This module also exposes functions that map @github@ library types to our own,
and a parser for extracting the @RepoOwner@ and @RepoName@ from a URL.
-}

module IW.Sync.Search
( fetchAllHaskellRepos
, fetchAllHaskellIssues
, fetchHaskellIssuesByLabels
, fromGitHubIssue
, fromGitHubRepo
( searchAllHaskellRepos
, searchAllHaskellIssues
, searchAllHaskellIssuesByLabels
, parseUserData
) where

import GitHub (SearchResult (..), URL (..))
import GitHub.Endpoints.Search (searchRepos, searchIssues)
import Data.Time (Day (..), addDays)
import GitHub (SearchResult (..), URL (..), RateLimit (..), Limits, Paths, QueryString, executeRequest', limitsRemaining)
import GitHub.Endpoints.RateLimit (rateLimit)
import UnliftIO (MonadUnliftIO)
import UnliftIO.Concurrent (threadDelay)

import IW.App (WithError)
import IW.App (WithError, githubErrToAppErr, throwError)
import IW.Core.Issue (Issue (..), Label (..))
import IW.Core.Repo (Repo (..), RepoOwner (..), RepoName (..))
import IW.Core.SqlArray (SqlArray (..))
import IW.App.Error (githubErrToAppErr, throwError)
import IW.Time (julianDayToIso)

import qualified GitHub
import qualified Data.Text as T
import qualified Data.Vector as V

{- HLINT ignore "Use prec" -}


-- | Fetch all repositories built with the Haskell language.
fetchAllHaskellRepos
:: ( MonadIO m
-- | Search all Haskell repositories starting from the most recent day.
searchAllHaskellRepos
:: forall env m.
( MonadIO m
, MonadUnliftIO m
, WithError m
, WithLog env m
)
=> m [GitHub.Repo]
fetchAllHaskellRepos = liftGitHubSearchToApp searchHaskellRepos

-- | Convert a value of the @GitHub.Repo@ type to a value of our own @Repo@ type.
fromGitHubRepo :: GitHub.Repo -> Repo
fromGitHubRepo githubRepo = Repo
{ repoOwner = RepoOwner $ GitHub.untagName $ GitHub.simpleOwnerLogin $ GitHub.repoOwner githubRepo
, repoName = RepoName $ GitHub.untagName $ GitHub.repoName githubRepo
, repoDescr = fromMaybe "" $ GitHub.repoDescription githubRepo
, repoCategories = SqlArray []
}
=> Day -- ^ The function starts from this day and goes back in time
-> Integer -- ^ The size of the date interval that is used in the search
-> m [Repo]
searchAllHaskellRepos recent interval = do
githubRepos <- githubSearchAll ["search", "repositories"] "language:haskell" recent interval []
pure $ fromGitHubRepo <$> githubRepos

-- | Fetch all open issues with Haskell language and the labels passed in to the function.
fetchHaskellIssuesByLabels
:: ( MonadIO m
-- | Search all Haskell open issues starting from the most recent day.
searchAllHaskellIssues
:: forall env m.
( MonadUnliftIO m
, WithError m
, WithLog env m
)
=> [Label]
-> m [GitHub.Issue]
fetchHaskellIssuesByLabels = liftGitHubSearchToApp . searchHaskellIssuesByLabels
=> Day
-> Integer
-> m [Issue]
searchAllHaskellIssues = searchAllHaskellIssuesByLabels []

-- | Fetch all open issues with Haskell language.
fetchAllHaskellIssues
:: ( MonadIO m
-- | Search all open Haskell issues with the corresponding labels.
searchAllHaskellIssuesByLabels
:: forall env m.
( MonadIO m
, MonadUnliftIO m
, WithError m
, WithLog env m
)
=> m [GitHub.Issue]
fetchAllHaskellIssues = fetchHaskellIssuesByLabels []
=> [Label] -- ^ The function will search for issues with these labels
-> Day
-> Integer
-> m [Issue]
searchAllHaskellIssuesByLabels labels recent interval = do
githubIssues <- githubSearchAll ["search", "issues"] queryString recent interval []
pure $ catMaybes $ fromGitHubIssue <$> githubIssues
where
queryString :: Text
queryString = "language:haskell state:open" <> labelsToSearchQuery labels

-- | Construct a github search query from a list of labels.
labelsToSearchQuery :: [Label] -> Text
labelsToSearchQuery = foldMap (\Label{..} -> "label:\"" <> unLabel <> "\" ")

-- | Lift a github search action to work within our monad stack.
liftGitHubSearchToApp
githubSearchAll
:: forall a env m.
( MonadIO m
, MonadUnliftIO m
, WithError m
, WithLog env m
, FromJSON a
, Typeable a
)
=> IO (Either GitHub.Error (SearchResult a))
=> Paths -- ^ Query paths
-> Text -- ^ Query properties
-> Day -- ^ The function starts at this day and goes back in time by the size of the interval
-> Integer -- ^ The date interval
-> [a] -- ^ A list of accumulated results used in recursive calls to this function
-> m [a]
liftGitHubSearchToApp githubSearch = liftIO githubSearch >>= \case
Left err -> throwError $ githubErrToAppErr err
Right (SearchResult count vec) -> do
log Info $ "Fetching total of " <> show count <> " " <> typeName @a <> "s..."
pure $ V.toList vec
githubSearchAll paths properties recent interval acc = do
-- | Search for first page of the query and check the result count to see what to do next.
SearchResult count vec <- executeGithubSearch paths properties firstDay recent 1
let firstPage = V.toList vec
-- | If count is 0, then all results for the given properties have been obtained.
if | count == 0 -> do
log I "All results obtained"
pure acc
-- | If count is less than or equal to 1000, then the interval is good and we can get
-- the rest of the results on pages 2 to 10.
| count <= 1000 -> do
listOfSearchResult <- mapM (executeGithubSearch paths properties firstDay recent) [2..10]
let remainingPages = concatMap searchResultToList listOfSearchResult
-- | Recursive call with a new @recent@ arguement and a new @acc@ arguement
-- representing all pages accumulated up to this point.
githubSearchAll paths properties (pred firstDay) interval (firstPage <> remainingPages <> acc)
-- | Otherwise, call the function with the same arguments but a smaller interval.
| otherwise -> githubSearchAll paths properties recent (pred interval) acc
where
-- | The first day of the search interval. It's calculated by subtracting the size of the
-- interval from the most recent day of the search interval.
firstDay :: Day
firstDay = negate interval `addDays` recent

-- | Search all repositories built with the Haskell language.
searchHaskellRepos :: IO (Either GitHub.Error (SearchResult GitHub.Repo))
searchHaskellRepos = searchRepos "language:haskell"
searchResultToList :: SearchResult a -> [a]
searchResultToList (SearchResult _ vec) = V.toList vec

-- | Construct a github search query from a list of labels.
labelsToSearchQuery :: [Label] -> Text
labelsToSearchQuery = foldMap (\Label{..} -> "label:\"" <> unLabel <> "\" ")
-- | Executes a GitHub search action within the context of the @App@ monad.
executeGithubSearch
:: forall a env m.
( MonadIO m
, MonadUnliftIO m
, WithError m
, WithLog env m
, FromJSON a
)
=> Paths -- ^ Query paths
-> Text -- ^ Query properties
-> Day -- ^ The first day of the date interval
-> Day -- ^ The last day of the date interval
-> Int -- ^ The number of the page to be searched
-> m (SearchResult a)
executeGithubSearch paths properties from to page = do
searchLimit <- getSearchRateLimit
log I $ "Current search limit information: " <> show searchLimit
if limitsRemaining searchLimit > 0
then do
log D $ "Searching GitHub API with following request: " <> showGithubQuery paths properties from to page
liftIO (githubSearch paths properties from to page) >>= \case
Left err -> throwError $ githubErrToAppErr err
Right searchRes -> pure searchRes
else do
log I "API limit reached. Delaying search..."
threadDelay 60000000
executeGithubSearch paths properties from to page

-- | Performs a query against the GitHub Search API.
githubSearch
:: FromJSON a
=> Paths
-> Text
-> Day
-> Day
-> Int
-> IO (Either GitHub.Error (SearchResult a))
githubSearch paths properties from to page = executeRequest' $ buildGithubQuery paths properties from to page

-- | Useful function for constructing a GitHub query and showing it as text.
showGithubQuery
:: Paths
-> Text
-> Day
-> Day
-> Int
-> Text
showGithubQuery paths properties from to page = show $ buildGithubQuery paths properties from to page

-- | Search for all open Haskell issues with the corresponding labels.
searchHaskellIssuesByLabels :: [Label] -> IO (Either GitHub.Error (SearchResult GitHub.Issue))
searchHaskellIssuesByLabels labels = searchIssues $ "language:haskell is:open " <> labelsToSearchQuery labels
-- | Function for building a GitHub search query.
buildGithubQuery
:: Paths -- ^ Query paths
-> Text -- ^ Query properties
-> Day -- ^ The first day of the date interval
-> Day -- ^ The last day of the date interval
-> Int -- ^ The number of the page to be searched
-> GitHub.GenRequest 'GitHub.MtJSON 'GitHub.RO a
buildGithubQuery paths properties from to page = GitHub.query paths queryString
where
queryString :: QueryString
queryString =
[ ("per_page", Just "100")
, ("page", Just $ show page)
, ("q", Just $ encodeUtf8 $ properties <> " " <> dateRange from to)
]

dateRange :: Day -> Day -> Text
dateRange from' to' = "created:" <> julianDayToIso from' <> ".." <> julianDayToIso to'

-- | Function for fetching the current rate limit information for the GitHub Search API.
getSearchRateLimit :: forall m. (MonadIO m, MonadUnliftIO m, WithError m) => m Limits
getSearchRateLimit = liftIO rateLimit >>= \case
Left err -> throwError $ githubErrToAppErr err
Right RateLimit{..} -> pure rateLimitSearch

-- | Convert a value of the @GitHub.Repo@ type to a value of our own @Repo@ type.
fromGitHubRepo :: GitHub.Repo -> Repo
fromGitHubRepo githubRepo = Repo
{ repoOwner = RepoOwner $ GitHub.untagName $ GitHub.simpleOwnerLogin $ GitHub.repoOwner githubRepo
, repoName = RepoName $ GitHub.untagName $ GitHub.repoName githubRepo
, repoDescr = fromMaybe "" $ GitHub.repoDescription githubRepo
, repoCategories = SqlArray []
}

-- | Convert a value of the @GitHub.Issue@ type to a value of our own @Issue@ type.
fromGitHubIssue :: GitHub.Issue -> Maybe Issue
Expand Down
40 changes: 26 additions & 14 deletions src/IW/Sync/Update.hs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ data and insert it into the database.
-}

module IW.Sync.Update
( fetchAndUpsertRepos
( syncRepos
) where

import Control.Monad.IO.Unlift (MonadUnliftIO)
Expand All @@ -14,27 +14,39 @@ import IW.App (WithError)
import IW.Core.Repo (Repo (..))
import IW.Db (WithDb, upsertRepos, updateRepoCategories)
import IW.Effects.Cabal (MonadCabal (..), getCabalCategories)
import IW.Sync.Search (fetchAllHaskellRepos, fromGitHubRepo)
import IW.Sync.Search (searchAllHaskellRepos)
import IW.Time (getToday)


-- | This function fetches the latest repos from the GitHub API, parses their @.cabal@ files,
-- | This function fetches all repos from the GitHub API, downloads their @.cabal@ files,
-- and upserts them into the database.
fetchAndUpsertRepos
syncRepos
:: forall env m.
( MonadCabal m
, MonadUnliftIO m
, WithDb env m
, WithLog env m
, WithError m
)
=> m ()
fetchAndUpsertRepos = do
gitHubRepos <- fetchAllHaskellRepos
let repos = map fromGitHubRepo gitHubRepos
=> Integer -- ^ The starting date interval used in the search function
-> m ()
syncRepos interval = do
today <- liftIO getToday
repos <- searchAllHaskellRepos today interval
upsertRepos repos
mapConcurrently_ fetchAndUpdateCategories repos
where
fetchAndUpdateCategories :: Repo -> m ()
fetchAndUpdateCategories Repo{..} = do
categories <- getCabalCategories repoOwner repoName
updateRepoCategories repoOwner repoName categories
mapConcurrently_ syncCategories repos

-- | This function takes a @Repo@ and attempts to download its @.cabal@ file.
syncCategories
:: forall env m.
( MonadCabal m
, MonadUnliftIO m
, WithDb env m
, WithLog env m
, WithError m
)
=> Repo
-> m ()
syncCategories Repo{..} = do
categories <- getCabalCategories repoOwner repoName
updateRepoCategories repoOwner repoName categories
Loading

0 comments on commit f0363e0

Please sign in to comment.