Skip to content

Commit

Permalink
#1 Release 1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
HugoPoi committed Nov 25, 2016
1 parent 85c59a4 commit 124fed5
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 32 deletions.
19 changes: 12 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
# Google scraper

Google scraper is a simple tool to scrape the Google search engine, written for Node.js.
Based on SpookyJS / CasperJS / PhantomJS. This work has been inspired by `node-google-search-scraper` package, and the usage is similar.
Based on phantom and Casper. This work has been inspired by `node-google-search-scraper` package, and the usage is similar.

## Features

* Scrapes Google results and can return the rendered pages for other use cases.
* Handle captchas
* Custom user agent and custom headers
* Use JS for rendering Google

## Usage

Expand All @@ -11,16 +17,15 @@ var GoogleScraper = require('google-scraper');
GoogleSearchScraper.search({
query : 'site:nodejs.org', // Query for google engine
limit: 10, // Limit number of results
keepPages: true, // Populate results.pages with rendered HTML content.
keepPages: false, // Populate results.pages with rendered HTML content.
solver: GoogleScraper.commandLineSolver, // Optional solver for resolving captcha (see commandLineSolver.js)
userAgent: 'GoogleSearchScraper1.0',
headers: { // Default http headers for PhantomJS
'Accept-Language': 'ru-RU,en,*'
},
spooky: { // Custom config for SpookyJS
child: {
'ignore-ssl-errors': 'yes' // This will be passed to the PhantomJS command line.
}
}
phantomOptions: [ // Command-line options used for PhantomJS
'--ignore-ssl-errors=yes'
]
}, function(err, results){
console.log(results);
});
Expand Down
4 changes: 2 additions & 2 deletions commandLineSolver.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ module.exports.solve = function(image, callback){
output: process.stdout
});

rl.question('Captcha please ?', function(captcha) {
callback(null, captcha);
rl.question('Captcha please ?', function(solution) {
callback(null, { solution: solution });
rl.close();
});
};
42 changes: 20 additions & 22 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,8 @@ function search(options, callback){
.then(function(casperReturns){
return options.solver.solveAsync(new Buffer(casperReturns.captcha, 'base64'));
})
.then(function(solution){
// TODO solver return other info than solution
sharedContext.captcha = solution;
return page.invokeAsyncMethod('fillCaptchaSolution', solution).then(handleErrorFromCasper);
.then(function(captcha){
return page.invokeAsyncMethod('fillCaptchaSolution', captcha.solution).then(handleErrorFromCasper);
})
.then(retryCall);
}else{
Expand All @@ -51,12 +49,11 @@ function search(options, callback){
};
}

// TODO refactor captcha (maybe remove from sharedContext)
var sharedContext = { resultsCount: 0, endOfResults: false, captcha: null };
var sharedContext = { resultsCount: 0, endOfResults: false };

var phInstance, page;
phantom.create(options.phantomOptions, {
logLevel: 'info'
logLevel: 'error'
})
.then(function(instance){
phInstance = instance;
Expand Down Expand Up @@ -88,7 +85,7 @@ function search(options, callback){
});
});

page.defineMethod('searchGoogle', function(options, sharedContext, callback){
page.defineMethod('searchGoogle', function(options, callback){
var casper = objectSpace.casper, lastError;
console.log('Start wait Google form to be ready');
casper.waitForSelector('form[action="/search"] input[name="q"]', function(){
Expand All @@ -98,13 +95,17 @@ function search(options, callback){
}, function(){
if(!/\/sorry/.test(this.getCurrentUrl())){
lastError = { message: 'form_not_found', details: { url: this.getCurrentUrl(), html: this.getHTML() } };
}else if(!!sharedContext.captcha){
// TODO add captcha to sharedContext
lastError = { message: 'invalid_captcha', details: { captcha: sharedContext.captcha }};
}else{
lastError = { message: 'captcha_detected' };
}
}, options.timeout.waitSearchForm);
casper.waitForUrl(/#q=|\/sorry/, function(){
if(/\/sorry/.test(this.getCurrentUrl())){
lastError = { message: 'captcha_detected' };
}
}, function(){
console.log('Timeout on ' + this.getCurrentUrl());
}, 10000);
casper.run(function(){
console.log('Query sended get ' + this.getCurrentUrl());
callback({ err: lastError });
Expand All @@ -113,7 +114,6 @@ function search(options, callback){

page.defineMethod('scrapeResults', function(options, sharedContext, callback){
var casper = objectSpace.casper, lastError, output;
// TODO : change this waitFor selector or redirect
casper.waitForSelector('#res #ires h3', function(){
console.log('Parsing results.');
var links = this.evaluate(function getLinks() {
Expand All @@ -133,9 +133,6 @@ function search(options, callback){
}, function(){
if(!/\/sorry/.test(this.getCurrentUrl())){
lastError = { message: 'results_not_found', details: { url: this.getCurrentUrl(), html: this.getHTML() } };
}else if(!!sharedContext.captcha){
// TODO add captcha to sharedContext
lastError = { message: 'invalid_captcha', details: { captcha: sharedContext.captcha }};
}else{
lastError = { message: 'captcha_detected' };
}
Expand All @@ -153,11 +150,10 @@ function search(options, callback){
try{
captcha = this.captureBase64('jpg', 'img');
}catch(err){
// TODO send err ?
lastError = { message: 'captcha_timeout', details: 'End on url : ' + this.getCurrentUrl() };
lastError = { message: 'captcha_timeout', details: { url: this.getCurrentUrl() } };
}
}else{
lastError = { message: 'captcha_not_needed', details: 'End on url : ' + this.getCurrentUrl() };
lastError = { message: 'captcha_not_needed', details: { url: this.getCurrentUrl() } };
}
});
casper.run(function(){
Expand All @@ -175,18 +171,18 @@ function search(options, callback){
}, true);
});
casper.run(function(){
console.log('Captcha filled get ' + this.getCurrentUrl());
console.log('Captcha filled now GET ' + this.getCurrentUrl());
if(/\/sorry/.test(this.getCurrentUrl())){
lastError = { message: 'invalid_captcha', details: { captcha: solution }}; // TODO why return solution again ?
lastError = { message: 'invalid_captcha', details: { url: this.getCurrentUrl() }};
}
callback(lastError);
callback({ err: lastError });
});
});

return page.invokeAsyncMethod('setupCasper', options).then(handleErrorFromCasper);
})
.then(function(){
return page.invokeAsyncMethod('searchGoogle', options, {}).then(handleErrorFromCasper);
return page.invokeAsyncMethod('searchGoogle', options).then(handleErrorFromCasper).catch(catchCaptcha());
})
.then(function(){
function scrapeResults(){
Expand All @@ -208,9 +204,11 @@ function search(options, callback){
return scrapeResults().catch(catchCaptcha(scrapeResults));
})
.then(function(output){
phInstance.exit();
callback(null, output);
})
.catch(function(err){
phInstance.exit();
callback(err);
});
}
Expand Down
2 changes: 1 addition & 1 deletion test/index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ describe('GoogleSearchScraper', function() {
});

it('With 20 limit results', function(done){
this.timeout(10000);
this.timeout(30000);
GoogleSearchScraper.search({ query : 'site:wikipedia.fr', limit: 20 }, function(err, result){
if(err){
return done(err);
Expand Down

0 comments on commit 124fed5

Please sign in to comment.