Skip to content

Commit

Permalink
Create non-wiki-namespace articles
Browse files Browse the repository at this point in the history
  • Loading branch information
tecoholic committed Jul 21, 2015
1 parent 86d3843 commit 3f71239
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 11 deletions.
10 changes: 5 additions & 5 deletions zimbalaka/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@
</div>
<div class="form-group col-sm-6" id="urldiv">
<label class="control-label col-sm-4" for="url">URL</label>
<div class="col-sm-8">
<input id="url" class="form-control" type="text" placeholder="http://some.wikisite.org">
<span class="input-group-addon">/wiki/Article</span>
<div class="col-sm-8 input-group">
<input id="url" class="form-control" type="text" placeholder="http://some.wikisite.org/wiki/" aria-describedby="add">
<span class="input-group-addon" id="add">article_name</span>
</div>
</div>
<div class="form-group col-xs-12 col-sm-6">
Expand Down Expand Up @@ -176,10 +176,10 @@
data.title= $('#title').val();
data.list= $('#list').val();
data.cats = $('#cats').val();
if( $('#site') === 'custom'){
if( $('#site').val() === 'custom'){
data.url = $('#url').val();
}else{
data.url = 'http://'+$('#lang').val()+'.'+$('#site').val()+'.org';
data.url = 'http://'+$('#lang').val()+'.'+$('#site').val()+'.org/wiki/';
}
$.post(
"./", data,
Expand Down
22 changes: 16 additions & 6 deletions zimbalaka/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,21 +57,25 @@ def clean_page(dloc, html, baseurl):
pq(image).attr('src', localfile)
# fix the links
for link in doc('a'):
absolute = baseurl + pq(link).attr('href')
# replace wiki from the url as the links start with /wiki/
wiki = re.compile('.*/wiki/$')
if wiki.match(baseurl):
baseurl = baseurl.replace('/wiki/', '')
absolute = baseurl+ pq(link).attr('href')
pq(link).attr('href', absolute)
return doc.html().encode("utf-8")

def download_file(dloc, title, baseurl):
    """Download the wiki article *title* from *baseurl* and save it locally.

    Args:
        dloc: destination directory for the saved HTML file.
        title: article title (unicode); may contain '/'-separated
            namespace parts for non-wiki-namespace pages.
        baseurl: base URL that already includes the article path
            (e.g. 'http://en.wikipedia.org/wiki/').

    Returns:
        Path of the HTML file written under *dloc*.
    """
    # baseurl already ends with the article path ('.../wiki/'), so only
    # the percent-quoted title is appended.
    url = baseurl + urllib.quote(title.encode('utf-8'))
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Zimbalaka/1.0 based on OpenZim')]
    infile = opener.open(url)
    page = infile.read()
    # Rewrite image/link references so the page works offline.
    page = clean_page(dloc, page, baseurl)
    # Titles outside the main namespace may contain '/'; keep only the
    # last segment so the result is a valid flat filename.
    htmlname = os.path.join(dloc, title.split('/')[-1] + ".html")
    with open(htmlname, 'w') as f:
        f.write(page)
    return htmlname
Expand All @@ -87,7 +91,13 @@ def get_cmcontinue(xml):
def articles_of_cat(url, cat):
"""Fectches the articles in the given category"""
query = "/w/api.php?action=query&list=categorymembers&format=xml&cmprop=title&cmnamespace=0&cmtype=page&cmlimit=10&cmtitle="
query = query + urllib.quote(cat.encode('utf-8'))
query = query+urllib.quote(cat.encode('utf-8'))

# remove wiki from the url and api base path is different
wiki = re.compile('.*\/wiki\/$')
if wiki.match(url):
return []
url = url.replace('/wiki/', '')

fullurl = url+query.encode('utf-8')
opener = urllib2.build_opener()
Expand All @@ -105,7 +115,7 @@ def articles_of_cat(url, cat):

def guess_language(url):
    """Return the language subdomain of a wikipedia-style article URL.

    E.g. 'http://ta.wikipedia.org/wiki/' -> 'ta'. Expects URLs of the
    form <scheme>://<lang>.<project>.<tld>/wiki/ ; returns None-like
    fallthrough when the pattern does not match.
    """
    # for wikipedia like urls
    rex = re.compile(r"(http[s]{0,1}:\/\/)(?P<lang>[\w\d]*)\.([\w\d]*)\.([\w\d]*)\/wiki\/")
    groups = rex.search(url)
    if groups:
        return groups.group('lang')
Expand Down Expand Up @@ -212,7 +222,7 @@ def zimit(title, articles, cats, url, logger):
zimwriterfs, w, f, l, t, d, c, p, directory, zimfile )
call(command, shell=True)
print 'Removing tmp dir '+dloc
shutil.rmtree(dloc)
#shutil.rmtree(dloc)
return zimfile


Expand Down

0 comments on commit 3f71239

Please sign in to comment.