-
Notifications
You must be signed in to change notification settings - Fork 36
/
Copy patharticle.go
283 lines (245 loc) · 6.45 KB
/
article.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
package zim
import (
"bytes"
"errors"
"fmt"
"io"
"io/ioutil"
"strings"
"sync"
lru "github.com/hashicorp/golang-lru"
)
// Special values of Article.EntryType marking directory entries that are not
// regular articles. Any other EntryType value is an index into
// ZimReader.mimeTypeList.
//
// Only RedirectEntry carries an explicit uint16 type; LinkTargetEntry and
// DeletedEntry are untyped constants.
const (
// RedirectEntry marks an entry that redirects to another entry.
RedirectEntry uint16 = 0xffff
// LinkTargetEntry marks a link target entry.
LinkTargetEntry = 0xfffe
// DeletedEntry marks a deleted entry.
DeletedEntry = 0xfffd
)
// articlePool is the pool ArticleAt takes its *Article values from.
// NOTE(review): neither a New function nor a Put call is visible in this
// file — presumably the pool is configured elsewhere in the package; confirm.
var articlePool sync.Pool
// bcache holds the most recently uncompressed cluster blobs, keyed by cluster
// number. It is mainly useful while indexing, when the same blob is asked for
// again and again.
var bcache *lru.ARCCache
// Article is one directory entry of a ZIM file, as decoded by
// ZimReader.FillArticleAt.
type Article struct {
// EntryType is a RedirectEntry/LinkTargetEntry/DeletedEntry or an idx
// pointing to ZimReader.mimeTypeList
EntryType uint16
// Title is the title string stored in the entry.
Title string
// URLPtr is the file offset the entry was read from.
URLPtr uint64
// Namespace is the one-byte namespace of the entry; FullURL uses it as
// URL prefix.
Namespace byte
// url is the entry URL without the namespace prefix.
url string
// blob is the index of the article's blob inside its cluster.
blob uint32
// cluster is the index of the cluster holding the article data; for a
// RedirectEntry it stores the redirect index instead (see RedirectIndex).
cluster uint32
// z is the reader the entry was decoded from; Data and MimeType read
// through it.
z *ZimReader
}
// ArticleAtURLIdx is a convenience wrapper returning the Article at URL index
// idx: the index is resolved to a file offset with OffsetAtURLIdx and the
// entry found there is decoded with ArticleAt.
func (z *ZimReader) ArticleAtURLIdx(idx uint32) (*Article, error) {
	offset, err := z.OffsetAtURLIdx(idx)
	if err == nil {
		return z.ArticleAt(offset)
	}
	return nil, err
}
// MainPage returns the main page article if the archive has one.
// A mainPage value of 0xffffffff means there is none, in which case both
// results are nil.
func (z *ZimReader) MainPage() (*Article, error) {
	if z.mainPage != 0xffffffff {
		return z.ArticleAtURLIdx(z.mainPage)
	}
	return nil, nil
}
// ArticleAt returns the Article (directory entry) stored at offset, an offset
// as found in the URL or title pointer lists.
//
// The Article is taken from articlePool when the pool can provide one and is
// freshly allocated otherwise. On failure no Article is returned: the result
// is nil together with the error reported by FillArticleAt.
func (z *ZimReader) ArticleAt(offset uint64) (*Article, error) {
	// An empty pool without a New function hands out nil; fall back to a
	// plain allocation instead of panicking on the type assertion.
	a, ok := articlePool.Get().(*Article)
	if !ok || a == nil {
		a = new(Article)
	}

	if err := z.FillArticleAt(a, offset); err != nil {
		// Never hand a half-decoded entry to the caller: wipe it and give it
		// back to the pool.
		*a = Article{}
		articlePool.Put(a)
		return nil, err
	}
	return a, nil
}
// FillArticleAt decodes the directory entry stored at offset into a.
//
// Fields read, relative to offset: the entry type at bytes 0-1, the namespace
// at byte 3 and the cluster number (the redirect index for a RedirectEntry)
// at bytes 8-11. Regular entries then carry the blob number at bytes 12-15
// followed by the NUL-terminated URL and title; a RedirectEntry has no blob
// number, its URL starts at byte 12.
//
// a is reset before decoding, so an Article that is being reused never keeps
// values from its previous content. For LinkTargetEntry and DeletedEntry only
// z, URLPtr and EntryType are set. Every read failure is returned to the
// caller; when an error is returned a may be partially filled and must not be
// used.
func (z *ZimReader) FillArticleAt(a *Article, offset uint64) error {
	// Start from a clean slate: a may be a recycled Article.
	*a = Article{z: z, URLPtr: offset}

	mimeIdx, err := readInt16(z.bytesRangeAt(offset, offset+2))
	if err != nil {
		return fmt.Errorf("can't read article %w", err)
	}
	a.EntryType = mimeIdx

	// Linktarget or Deleted Entry
	if mimeIdx == LinkTargetEntry || mimeIdx == DeletedEntry {
		// TODO
		return nil
	}

	s, err := z.bytesRangeAt(offset+3, offset+4)
	if err != nil {
		return err
	}
	a.Namespace = s[0]

	// For a RedirectEntry this field is the redirect index.
	a.cluster, err = readInt32(z.bytesRangeAt(offset+8, offset+8+4))
	if err != nil {
		return err
	}

	// Redirects have no blob number: their URL directly follows the
	// redirect index.
	stringsAt := offset + 12
	if mimeIdx != RedirectEntry {
		a.blob, err = readInt32(z.bytesRangeAt(offset+12, offset+12+4))
		if err != nil {
			return err
		}
		stringsAt = offset + 16
	}

	// assume the url + title won't be longer than 2k
	b, err := z.bytesRangeAt(stringsAt, stringsAt+2048)
	if err != nil {
		return fmt.Errorf("can't read article url and title %w", err)
	}

	// ReadString returns freshly allocated strings, so neither the URL nor
	// the title keeps the 2k read buffer alive.
	bbuf := bytes.NewBuffer(b)
	a.url, err = bbuf.ReadString('\x00')
	if err != nil {
		return err
	}
	a.url = strings.TrimRight(a.url, "\x00")

	a.Title, err = bbuf.ReadString('\x00')
	if err != nil {
		return err
	}
	a.Title = strings.TrimRight(a.Title, "\x00")

	return nil
}
// Data returns the uncompressed data associated with this article.
//
// Redirect, link target and deleted entries carry no data: (nil, nil) is
// returned for them. Otherwise the first byte of the article's cluster
// selects the storage mode:
//
//   - 4 (LZMA) or 5 (Zstandard): the whole cluster is decompressed, stored in
//     bcache under the cluster number, and the article's blob is copied out
//     of it;
//   - 0 or 1: the cluster is not compressed and the blob is read in place.
//
// Any other value yields an error. Blob offsets that do not fit the cluster
// are reported as errors rather than causing a panic.
//
// NOTE(review): bcache is keyed by the cluster number alone; presumably two
// ZimReaders used in the same process could hit each other's entries —
// confirm against how readers are used.
func (a *Article) Data() ([]byte, error) {
	// ensure we have data to read
	switch a.EntryType {
	case RedirectEntry, LinkTargetEntry, DeletedEntry:
		return nil, nil
	}

	start, end, err := a.z.clusterOffsetsAtIdx(a.cluster)
	if err != nil {
		return nil, err
	}

	s, err := a.z.bytesRangeAt(start, start+1)
	if err != nil {
		return nil, err
	}
	compression := uint8(s[0])

	// Position of this blob's entry in the cluster's offset table (4 bytes
	// per entry). Computed in 64 bits so a large blob index cannot wrap.
	tableOff := uint64(a.blob) * 4

	switch compression {
	case 4, 5: // LZMA: 4, Zstandard: 5
		var blob []byte
		if v, hit := bcache.Get(a.cluster); hit {
			// Use the value of this single lookup: asking the cache a
			// second time could miss if the entry got evicted meanwhile.
			blob = v.([]byte)
		} else {
			raw, err := a.z.bytesRangeAt(start+1, end+1)
			if err != nil {
				return nil, err
			}
			bbuf := bytes.NewBuffer(raw)

			var dec io.ReadCloser
			switch compression {
			case 5:
				dec, err = NewZstdReader(bbuf)
			case 4:
				dec, err = NewXZReader(bbuf)
			}
			if err != nil {
				return nil, err
			}
			defer dec.Close()

			// the decoded chunk are around 1MB
			decoded, err := ioutil.ReadAll(dec)
			if err != nil {
				return nil, err
			}
			// Cache an exactly sized copy so no spare capacity of the
			// read buffer is kept alive by the cache.
			blob = make([]byte, len(decoded))
			copy(blob, decoded)
			// TODO: 2 requests for the same blob could occur at the same time
			bcache.Add(a.cluster, blob)
		}

		// Both the start offset of this blob and the one of the next blob
		// must be present in the offset table.
		if tableOff+8 > uint64(len(blob)) {
			return nil, errors.New("blob index out of cluster bounds")
		}
		bs, err := readInt32(blob[tableOff:tableOff+4], nil)
		if err != nil {
			return nil, err
		}
		be, err := readInt32(blob[tableOff+4:tableOff+8], nil)
		if err != nil {
			return nil, err
		}
		if be < bs || uint64(be) > uint64(len(blob)) {
			return nil, errors.New("invalid blob offsets")
		}

		// avoid retaining all the chunk
		c := make([]byte, be-bs)
		copy(c, blob[bs:be])
		return c, nil

	case 0, 1:
		// uncompressed
		startPos := start + 1
		bs, err := readInt32(a.z.bytesRangeAt(startPos+tableOff, startPos+tableOff+4))
		if err != nil {
			return nil, err
		}
		be, err := readInt32(a.z.bytesRangeAt(startPos+tableOff+4, startPos+tableOff+4+4))
		if err != nil {
			return nil, err
		}
		if be < bs {
			return nil, errors.New("invalid blob offsets")
		}
		return a.z.bytesRangeAt(startPos+uint64(bs), startPos+uint64(be))
	}

	return nil, errors.New("Unhandled compression")
}
// MimeType returns the MIME type of the article, looked up in the reader's
// mimeTypeList at index EntryType.
//
// The empty string is returned for redirect, link target and deleted
// entries, and also when the article has no reader attached or EntryType
// does not index an existing mimeTypeList element (e.g. a corrupt entry),
// instead of panicking.
func (a *Article) MimeType() string {
	switch a.EntryType {
	case RedirectEntry, LinkTargetEntry, DeletedEntry:
		return ""
	}
	if a.z == nil || int(a.EntryType) >= len(a.z.mimeTypeList) {
		return ""
	}
	return a.z.mimeTypeList[a.EntryType]
}
// FullURL returns the article URL prefixed by its namespace, in the form
// "<namespace>/<url>".
func (a *Article) FullURL() string {
	namespace := string(rune(a.Namespace))
	return namespace + "/" + a.url
}
// String returns a one-line description of the article: entry type, full
// URL, title, cluster and blob numbers.
func (a *Article) String() string {
	const layout = "Mime: 0x%x URL: [%s], Title: [%s], Cluster: 0x%x Blob: 0x%x"
	fullURL := a.FullURL()
	return fmt.Sprintf(layout, a.EntryType, fullURL, a.Title, a.cluster, a.blob)
}
// RedirectIndex returns the redirect index of a RedirectEntry article.
// An error is returned when the article is not a RedirectEntry.
func (a *Article) RedirectIndex() (uint32, error) {
	// For a RedirectEntry the cluster field is where the redirect index
	// position is saved.
	if a.EntryType == RedirectEntry {
		return a.cluster, nil
	}
	return 0, errors.New("Not a RedirectEntry")
}
// blobOffsetsAtIdx reads two consecutive 64-bit values from z, located at
// z.clusterPtrPos + 8*a.blob and z.clusterPtrPos + 8*(a.blob+1), and returns
// them as start and end.
//
// Read errors are not reported: if the first read fails the function returns
// early with end left at zero, and an error of the second read is discarded.
//
// NOTE(review): the blob index is applied to the cluster pointer position —
// confirm this is the intended table.
func (a *Article) blobOffsetsAtIdx(z *ZimReader) (start, end uint64) {
	var err error

	first := z.clusterPtrPos + uint64(a.blob)*8
	if start, err = readInt64(z.bytesRangeAt(first, first+8)); err != nil {
		return start, end
	}

	next := z.clusterPtrPos + uint64(a.blob+1)*8
	// The signature has no error result; a failed read is deliberately
	// ignored here, exactly as a failed first read only shortcuts the call.
	end, _ = readInt64(z.bytesRangeAt(next, next+8))
	return start, end
}