Skip to content

Commit

Permalink
* fix the value of thread_info.reply_num might be -1: lumina37/ai…
Browse files Browse the repository at this point in the history
…otieba#64 (comment) @ `ThreadParser.Convert()`

* replace all `response(s) with` to `respond with`
@ crawler
  • Loading branch information
n0099 committed Feb 13, 2023
1 parent dfaee58 commit a08b016
Show file tree
Hide file tree
Showing 6 changed files with 11 additions and 10 deletions.
2 changes: 1 addition & 1 deletion crawler/src/Tieba/Crawl/Facade/ThreadArchiveCrawlFacade.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ public ThreadArchiveCrawlFacade(ILogger<ThreadArchiveCrawlFacade> logger, TbmDbC
) : base(logger, dbContextFactory, crawler.Invoke, parser, saver, users, requesterTcs, locks, fid, forumName) { }

protected override void PostParseHook(ThreadResponse response, CrawlRequestFlag flag, Dictionary<PostId, ThreadPost> parsedPostsInResponse)
{ // the second response with flag is as same as the first one so just skip it
{ // the second respond with flag is as same as the first one so just skip it
if (flag == CrawlRequestFlag.ThreadClientVersion602) return;
var data = response.Data;
Users.ParseUsers(data.ThreadList.Select(t => t.Author));
Expand Down
2 changes: 1 addition & 1 deletion crawler/src/Tieba/Crawl/Parser/ReplyParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ protected override ReplyPost Convert(Reply inPost)
o.Content = Helper.SerializedProtoBufWrapperOrNullIfEmpty(inPost.Content,
() => Helper.WrapPostContent(inPost.Content));
o.OriginalContents = inPost.Content;
// AuthorId rarely response with 0, Author should always be null but we can guarantee
// AuthorId rarely respond with 0, Author should always be null but we can guarantee
o.AuthorUid = inPost.AuthorId.NullIfZero() ?? inPost.Author?.Uid ?? 0;
// value of AuthorExpGrade will be written back in ReplyCrawlFacade.FillAuthorInfoBackToReply()
o.SubReplyCount = inPost.SubPostNumber.NullIfZero();
Expand Down
2 changes: 1 addition & 1 deletion crawler/src/Tieba/Crawl/Parser/ThreadParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ protected override ThreadPost Convert(Thread inPost)
o.LatestReplyPostedAt = (uint)inPost.LastTimeInt;
// LastReplyer will be null when LivePostType != "", but LastTimeInt will have expected timestamp value
o.LatestReplierUid = inPost.LastReplyer?.Uid;
o.ReplyCount = (uint?)inPost.ReplyNum.NullIfZero();
o.ReplyCount = inPost.ReplyNum < 0 ? 0 : (uint?)inPost.ReplyNum.NullIfZero(); // rarely respond with -1
o.ViewCount = (uint?)inPost.ViewNum.NullIfZero();
o.ShareCount = (uint?)inPost.ShareNum.NullIfZero();
// when the thread is a livepost or Thread.AgreeNum == 0, the agree field will not exist
Expand Down
3 changes: 2 additions & 1 deletion crawler/src/Tieba/Crawl/Saver/ReplySaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -147,14 +147,15 @@ where imagesKeyByUrlFilename.Keys.Contains(e.UrlFilename)
existingImages.Values.Where(e => e.ByteSize == 0)
.Join(imagesKeyByUrlFilename.Values, e => e.UrlFilename, i => i.UrlFilename,
(existing, newInContent) => (existing, newInContent))
.ForEach(t => t.existing.ByteSize = t.newInContent.ByteSize); // randomly response with 0
.ForEach(t => t.existing.ByteSize = t.newInContent.ByteSize); // randomly respond with 0
db.ReplyContentImages.AddRange(pidAndImageList.Select(t => new ReplyContentImage
{
Pid = t.Pid,
// no need to manually invoke DbContent.AddRange(images) since EF Core will do these chore
// https://stackoverflow.com/questions/5212751/how-can-i-retrieve-id-of-inserted-entity-using-entity-framework/41146434#41146434
// reuse the same instance from imagesKeyByUrlFilename will prevent assigning multiple different instances with the same key
// which will cause EF Core to insert identify entry more than one time leading to duplicated entry error
// https://github.com/dotnet/efcore/issues/30236
Image = existingImages.TryGetValue(t.Image.UrlFilename, out var e) ? e : imagesKeyByUrlFilename[t.Image.UrlFilename]
}));
}
Expand Down
10 changes: 5 additions & 5 deletions crawler/src/Tieba/Crawl/Saver/StaticCommonInSavers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ public record FieldChangeIgnoranceCallbackRecord(FieldChangeIgnoranceCallback Up
if (whichPostType == typeof(TiebaUser))
{
switch (propName)
{ // possible randomly response with null
{ // possible randomly respond with null
case nameof(TiebaUser.IpGeolocation) when newValue is null:
// possible clock drift across multiple responses from tieba api, they should sync their servers with NTP
/* following sql can track these drift
Expand All @@ -41,7 +41,7 @@ when Math.Abs((newValue as int? ?? 0) - (oldValue as int? ?? 0)) <= 10:
// prevent overwrite existing value of field liker_id which is saved by legacy crawler
// and Zan itself is deprecated by tieba so it shouldn't get updated
case nameof(ThreadPost.Zan):
// possible randomly response with null
// possible randomly respond with null
case nameof(ThreadPost.Geolocation) when newValue is null:
// empty string means the author has not written a title
// its value generated from the first reply within response of reply crawler will be later set by ReplyCrawlFacade.SaveParentThreadTitle()
Expand All @@ -50,19 +50,19 @@ when newValue is ""
// prevent repeatedly update with different title due to the thread is a multi forum topic thread
// thus its title can vary within the forum and within the thread
|| (newValue is not "" && oldValue is not ""):
// possible randomly response with 0.NullIfZero()
// possible randomly respond with 0.NullIfZero()
case nameof(ThreadPost.DisagreeCount) when newValue is null && oldValue is not null:
// when the latest reply post is deleted and there's no new reply after delete
// this field but not LatestReplyPostedAt will be null
case nameof(ThreadPost.LatestReplierUid) when newValue is null:
return true;
}
}
// possible randomly response with null
// possible randomly respond with null
if (whichPostType == typeof(ReplyPost)
&& propName == nameof(ReplyPost.SignatureId)
&& newValue is null && oldValue is not null) return true;
// possible rarely response with the protoBuf default value 0
// possible rarely respond with the protoBuf default value 0
if (propName == nameof(IPost.AuthorUid)
&& newValue is (long)0 && oldValue is not null) return true;
return false;
Expand Down
2 changes: 1 addition & 1 deletion crawler/src/Worker/ArchiveCrawlWorker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ namespace tbm.Crawler.Worker;
public class ArchiveCrawlWorker : BackgroundService
{
// as of March 2019, tieba has restricted the max accepted value for the page param of the forum's threads api
// any request with page offset that larger than 10k threads will be response with results from the first page
// any request with page offset that larger than 10k threads will be respond with results from the first page
private const int MaxCrawlablePage = 334; // 10k threads / 30 per request (from Rn param) = 333.3...
private readonly ILogger<ArchiveCrawlWorker> _logger;
private readonly ILifetimeScope _scope0;
Expand Down

0 comments on commit a08b016

Please sign in to comment.