Skip to content

Commit

Permalink
* add common param `_client_type=2` in request according to lumina…
Browse files Browse the repository at this point in the history
…37/aiotieba#67 (comment), this allows us to revert two previous commits 32168f6 and 31cd3ad @ `ClientRequester.PostProtoBuf()`

+ move updating the parent thread of a reply with the new title extracted from the first-floor reply on the first page out of `PostParseHook()` into a new method `SaveParentThreadTitle()`
- parent virtual method `BaseCrawlFacade.ParsePostsEmbeddedUsers()` and move its override to `FillAuthorInfoBackToReply()`
- overridden parent virtual method `ThrowIfEmptyUsersEmbedInPosts()`
@ ReplyCrawlFacade.cs

* parse users stored in `response.Data.UserList` @ `PostParseHook()`
- overridden parent virtual method `ThrowIfEmptyUsersEmbedInPosts()` @ (Thread|Reply)CrawlFacade.cs

* no longer add embedded users to param `outUsers` @ `ParsePostsInternal()`
* assign `outPost.AuthorUid` with `inPost.AuthorId` instead of `.Author.Uid`
@ (Thread|Reply)Parser.cs

* change required param `ThreadResponse.Types.Data data` to `IEnumerable<Thread> threads` @ `ThreadCrawlFacade.ParseLatestRepliers()`, also affects `ThreadArchiveCrawlFacade.PostParseHook()`

$ `git submodule update --remote`
@ crawler
  • Loading branch information
n0099 committed Jan 10, 2023
1 parent f4c285e commit 624c6a9
Show file tree
Hide file tree
Showing 9 changed files with 34 additions and 51 deletions.
4 changes: 3 additions & 1 deletion crawler/src/Tieba/ClientRequester.cs
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,9 @@ private Task<HttpResponseMessage> PostProtoBuf<TRequest>
(string url, string clientVersion, TRequest requestParam, Action<TRequest, Common> setCommonParamOnRequest)
where TRequest : IMessage<TRequest>
{
setCommonParamOnRequest(requestParam, new() {ClientVersion = clientVersion});
// https://github.com/Starry-OvO/aiotieba/issues/67#issuecomment-1376006123
// https://github.com/MoeNetwork/wmzz_post/blob/80aba25de46f5b2cb1a15aa2a69b527a7374ffa9/wmzz_post_setting.php#L64
setCommonParamOnRequest(requestParam, new() {ClientVersion = clientVersion, ClientType = 2});

// https://github.com/dotnet/runtime/issues/22996, http://test.greenbytes.de/tech/tc2231
var protoBufFile = new ByteArrayContent(requestParam.ToByteArray());
Expand Down
7 changes: 1 addition & 6 deletions crawler/src/Tieba/Crawl/Facade/BaseCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -200,9 +200,7 @@ private void ValidateThenParse(BaseCrawler<TResponse, TPostProtoBuf>.Response re
_parser.ParsePosts(flag, posts, ParsedPosts, out var postsEmbeddedUsers);
if (flag != CrawlRequestFlag.None) return;
if (!postsEmbeddedUsers.Any() && posts.Any()) ThrowIfEmptyUsersEmbedInPosts();
if (!postsEmbeddedUsers.Any()) return;
Users.ParseUsers(postsEmbeddedUsers);
ParsePostsEmbeddedUsers(postsEmbeddedUsers, posts);
if (postsEmbeddedUsers.Any()) Users.ParseUsers(postsEmbeddedUsers);
}
finally
{
Expand All @@ -212,9 +210,6 @@ private void ValidateThenParse(BaseCrawler<TResponse, TPostProtoBuf>.Response re

/// <summary>
/// Overridable hook that is invoked while parsing a response when the response
/// contains posts but no embedded users were collected from them.
/// The base implementation does nothing.
/// </summary>
protected virtual void ThrowIfEmptyUsersEmbedInPosts() { }

protected virtual void ParsePostsEmbeddedUsers
(IEnumerable<User> usersEmbedInPosts, IEnumerable<TPostProtoBuf> postsInCurrentResponse) { }

protected virtual void PostParseHook(TResponse response, CrawlRequestFlag flag) { }
}
}
30 changes: 15 additions & 15 deletions crawler/src/Tieba/Crawl/Facade/ReplyCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,37 +19,37 @@ public ReplyCrawlFacade(ILogger<ReplyCrawlFacade> logger,
_tid = tid;
}

protected override void ThrowIfEmptyUsersEmbedInPosts() =>
throw new TiebaException(
$"User list in the response of reply request for fid {Fid}, tid {_tid} is empty.");
/// <summary>
/// Post-processes a reply response: assigns the tid of this facade to every parsed reply,
/// parses the users listed in <c>response.Data.UserList</c>, copies author info from that
/// user list back onto the replies and, for the first page only, saves the parent thread title.
/// </summary>
/// <param name="response">The reply response whose <c>Data</c> is processed.</param>
/// <param name="flag">The request flag of the response; not read by this override.</param>
protected override void PostParseHook(ReplyResponse response, CrawlRequestFlag flag)
{
ParsedPosts.Values.ForEach(r => r.Tid = _tid); // every parsed reply gets the tid this facade was created with
var data = response.Data;
Users.ParseUsers(data.UserList); // users are taken from the response-level user list
FillAuthorInfoBackToReply(data.UserList, data.PostList);
if (data.Page.CurrentPage == 1) SaveParentThreadTitle(data.PostList); // the title is only extracted from the first page
}

protected override void ParsePostsEmbeddedUsers(IEnumerable<User> usersEmbedInPosts, IEnumerable<Reply> postsInCurrentResponse) =>
private void FillAuthorInfoBackToReply(IEnumerable<User> users, IEnumerable<Reply> replies) =>
ParsedPosts.Values // only mutate posts which occurs in current response
.IntersectBy(postsInCurrentResponse.Select(r => r.Pid), r => r.Pid)
.Join(usersEmbedInPosts, r => r.AuthorUid, u => u.Uid, (r, a) => (r, a))
.IntersectBy(replies.Select(r => r.Pid), r => r.Pid)
.Join(users, r => r.AuthorUid, u => u.Uid, (r, a) => (r, a))
.ForEach(tuple =>
{ // fill the values for some field of reply from user list which is out of post list
var (r, author) = tuple;
r.AuthorManagerType = author.BawuType.NullIfWhiteSpace(); // will be null if he's not a moderator
r.AuthorExpGrade = (ushort)author.LevelId; // will be null when author is a historical anonymous user
r.AuthorExpGrade = (ushort)author.LevelId;
});

protected override void PostParseHook(ReplyResponse response, CrawlRequestFlag flag)
private void SaveParentThreadTitle(IEnumerable<Reply> replies)
{
ParsedPosts.Values.ForEach(r => r.Tid = _tid);

var data = response.Data;
// update parent thread of reply with new title that extracted from the first floor reply in first page
if (data.Page.CurrentPage != 1) return;

// update the parent thread of reply with the new title extracted from the first-floor reply in the first page
var db = _dbContextFactory(Fid);
using var transaction = db.Database.BeginTransaction(IsolationLevel.ReadCommitted);

var parentThreadTitle = (from t in db.Threads.AsNoTracking()
where t.Tid == _tid select t.Title).SingleOrDefault();
// thread title will be empty string as a fallback when the thread author haven't write title for this thread
if (parentThreadTitle != "") return;
var newTitle = data.PostList.FirstOrDefault(r => r.Floor == 1)?.Title;
var newTitle = replies.FirstOrDefault(r => r.Floor == 1)?.Title;
if (newTitle == null) return;

db.Attach(new ThreadPost {Tid = _tid, Title = newTitle})
Expand Down
5 changes: 2 additions & 3 deletions crawler/src/Tieba/Crawl/Facade/ThreadArchiveCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,13 @@ protected override void PostParseHook(ThreadResponse response, CrawlRequestFlag
{ // the second response with flag is as same as the first one so just skip it
if (flag == CrawlRequestFlag.ThreadClientVersion602) return;
var data = response.Data;
Users.ParseUsers(data.ThreadList.Select(t => t.Author));
ParseLatestRepliers(data.ThreadList);

ParsedPosts.Values // parsed author uid will be 0 when request with client version 6.0.2
.Join(data.ThreadList, t => t.Tid, t => (Tid)t.Tid,
(parsed, newInResponse) => (parsed, newInResponse))
.ForEach(tuple => tuple.parsed.AuthorUid = tuple.newInResponse.Author.Uid);

Users.ParseUsers(data.ThreadList.Select(t => t.Author));
ParseLatestRepliers(data);
}
}
}
12 changes: 4 additions & 8 deletions crawler/src/Tieba/Crawl/Facade/ThreadCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -37,24 +37,20 @@ where _latestRepliers.Keys.Any(uid => uid == u.Uid)
/// <summary>
/// Creates a <see cref="TiebaUser"/> with only <c>Uid</c>, <c>Name</c> and <c>DisplayName</c> set.
/// </summary>
/// <param name="uid">Value assigned to <c>Uid</c>.</param>
/// <param name="name">Value assigned to <c>Name</c>; may be null.</param>
/// <param name="displayName">Value assigned to <c>DisplayName</c>; may be null.</param>
/// <returns>The newly created user.</returns>
public static TiebaUser LatestReplierFactory(long uid, string? name, string? displayName) =>
new() {Uid = uid, Name = name, DisplayName = displayName};

protected void ParseLatestRepliers(ThreadResponse.Types.Data data) =>
data.ThreadList
.Select(t => t.LastReplyer ?? null) // LastReplyer will be null when LivePostType != ""
protected void ParseLatestRepliers(IEnumerable<Thread> threads) =>
threads.Select(t => t.LastReplyer ?? null) // LastReplyer will be null when LivePostType != ""
.OfType<User>() // filter out nulls
.Where(u => u.Uid != 0) // some rare deleted thread but still visible in 6.0.2 response will have a latest replier uid=0 name="" nameShow=".*"
.Select(u =>
LatestReplierFactory(u.Uid, u.Name.NullIfWhiteSpace(), u.Name == u.NameShow ? null : u.NameShow))
.ForEach(u => _latestRepliers[u.Uid] = u);

protected override void ThrowIfEmptyUsersEmbedInPosts() =>
throw new TiebaException(
$"User list in the response of thread request for fid {Fid} is empty.");

protected override void PostParseHook(ThreadResponse response, CrawlRequestFlag flag)
{
if (flag != CrawlRequestFlag.None) return;
var data = response.Data;
ParseLatestRepliers(data);
Users.ParseUsers(data.UserList);
ParseLatestRepliers(data.ThreadList);
// remove livepost threads since their real parent forum may not match with current crawling fid
data.ThreadList.Where(t => t.LivePostType != "")
.ForEach(t => ParsedPosts.TryRemove((Tid)t.Tid, out _));
Expand Down
3 changes: 2 additions & 1 deletion crawler/src/Tieba/Crawl/Parser/BaseParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ public void ParsePosts(CrawlRequestFlag requestFlag, IList<TPostProtoBuf> inPost
if (ShouldSkipParse(requestFlag, inPosts, outPosts)) return;
foreach (var p in ParsePostsInternal(inPosts, outNullableUsers))
outPosts[PostIdSelector(p)] = p;
outUsers.AddRange(outNullableUsers.OfType<User>());
outUsers.AddRange(outNullableUsers.OfType<User>()
.Where(u => u.CalculateSize() != 0)); // remove empty users
}
}
}
12 changes: 3 additions & 9 deletions crawler/src/Tieba/Crawl/Parser/ReplyParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,7 @@ public class ReplyParser : BaseParser<ReplyPost, Reply>

protected override PostId PostIdSelector(ReplyPost post) => post.Pid;

protected override IEnumerable<ReplyPost> ParsePostsInternal(IList<Reply> inPosts, List<User?> outUsers)
{
outUsers.AddRange(inPosts.Select(r => r.Author));
return inPosts.Select(Convert);
}
/// <summary>
/// Converts each incoming reply with <c>Convert</c>.
/// <paramref name="outUsers"/> is not populated by this override.
/// </summary>
protected override IEnumerable<ReplyPost> ParsePostsInternal(IList<Reply> inPosts, List<User?> outUsers) => inPosts.Select(Convert);

protected override ReplyPost Convert(Reply inPost)
{
Expand All @@ -36,10 +32,8 @@ protected override ReplyPost Convert(Reply inPost)
});
o.Content = Helper.SerializedProtoBufWrapperOrNullIfEmpty(inPost.Content,
() => new PostContentWrapper {Value = {inPost.Content}});
// AuthorId will be protoBuf default value 0 when the response doesn't embed the author user in replies
// see ReplyCrawlFacade.ThrowIfEmptyUsersEmbedInPosts()
o.AuthorUid = inPost.Author?.Uid ?? inPost.AuthorId;
// values of AuthorManagerType and AuthorExpGrade will be write back in ReplyCrawlFacade.PostParseHook()
o.AuthorUid = inPost.AuthorId;
// values of AuthorManagerType and AuthorExpGrade will be write back in ReplyCrawlFacade.FillAuthorInfoBackToReply()
o.SubReplyCount = inPost.SubPostNumber.NullIfZero();
o.PostTime = inPost.Time;
o.IsFold = (ushort?)inPost.IsFold.NullIfZero();
Expand Down
10 changes: 3 additions & 7 deletions crawler/src/Tieba/Crawl/Parser/ThreadParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,7 @@ protected override bool ShouldSkipParse(CrawlRequestFlag requestFlag, IEnumerabl
return testRequestFlag();
}

protected override IEnumerable<ThreadPost> ParsePostsInternal(IList<Thread> inPosts, List<User?> outUsers)
{
outUsers.AddRange(inPosts.Select(t => t.Author));
return inPosts.Select(Convert);
}
/// <summary>
/// Converts each incoming thread with <c>Convert</c>.
/// <paramref name="outUsers"/> is not populated by this override.
/// </summary>
protected override IEnumerable<ThreadPost> ParsePostsInternal(IList<Thread> inPosts, List<User?> outUsers) => inPosts.Select(Convert);

protected override ThreadPost Convert(Thread inPost)
{
Expand All @@ -48,8 +44,8 @@ protected override ThreadPost Convert(Thread inPost)
o.StickyType = inPost.IsMembertop == 1 ? "membertop" : inPost.IsTop == 0 ? null : "top";
o.IsGood = (ushort?)inPost.IsGood.NullIfZero();
o.TopicType = inPost.LivePostType.NullIfWhiteSpace();
o.Title = inPost.Title; // might be write back by ReplyCrawlFacade.PostParseHook()
o.AuthorUid = inPost.Author.Uid;
o.Title = inPost.Title; // might be write back by ReplyCrawlFacade.SaveParentThreadTitle()
o.AuthorUid = inPost.AuthorId;
o.AuthorManagerType = inPost.Author.BawuType.NullIfWhiteSpace();
o.PostTime = (uint)inPost.CreateTime;
o.LatestReplyTime = (uint)inPost.LastTimeInt;
Expand Down

0 comments on commit 624c6a9

Please sign in to comment.