-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathmain.js
117 lines (110 loc) · 3.18 KB
/
main.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
'use strict';
var jobqueue = require("./lib/jobqueue");
var logger = require("log4js").getLogger("app.js");
var Promise = require("bluebird");
var config = require("./config");
var casper = require("./lib/server");
var db = require('mongo-bluebird').create(config.scrapecache.db);
/**
 * Scraper front-end.
 *
 * Constructing an instance immediately calls casper.setUp() and keeps the
 * value it returns on `this.promise`; scrape() chains on that value before
 * doing any work.
 *
 * @constructor
 */
var Server = function() {
  var ready = casper.setUp();
  this.promise = ready;
};
/**
 * Scrapes the given pages, answering from the MongoDB cache where a fresh
 * entry exists and scraping (then caching) only the remaining URLs.
 *
 * Cache entries are kept in the collection named after `type`. Entries for
 * the requested URLs that are older than the cutoff are deleted first; the
 * entries that remain are returned with `cached: true`.
 *
 * @param {Object|Object[]} pages one page or an array of pages, each shaped
 *   like { "url": "", "script": "script file, absolute path" }
 * @param {string} type the function name that will be called from the script
 *   file; also used as the cache collection name
 * @param {number} [expiration] cache validity in hours; when it is not a
 *   number, config.scrapecache.validity is used instead
 * @returns {Promise<Object>} resolves with the result object
 *   ({status, message, results}); `results` holds the cached entries followed
 *   by the freshly scraped ones. Rejects when a cache operation or the scrape
 *   itself fails.
 */
Server.prototype.scrape = function(pages, type, expiration) {
  return this.promise.then(function() {
    var pageList = Array.isArray(pages) ? pages : [pages];
    var urls = pageList.map(function(page) {
      return page.url;
    });
    var validityHours = (typeof expiration !== 'number') ? config.scrapecache.validity : expiration;
    // Millisecond arithmetic keeps fractional hours and is not skewed by DST
    // changes, unlike Date#setHours on local time.
    var cutoff = new Date(Date.now() - validityHours * 60 * 60 * 1000);
    var cacheCollection = db.collection(type);

    // The chain is returned as-is (no `new Promise` wrapper) so that a failure
    // in any step rejects the caller's promise instead of leaving it pending.
    return cacheCollection.remove({url: {$in: urls}, time: {$lt: cutoff}})
      .then(function() {
        // $gte complements the $lt used above: an entry stamped exactly at the
        // cutoff was not removed, so it must count as valid.
        return cacheCollection.find({url: {$in: urls}, time: {$gte: cutoff}});
      })
      .then(function(items) {
        var cachedResults = [];
        var cachedUrls = Object.create(null);
        (items || []).forEach(function(item) {
          item.cached = true;
          cachedResults.push(item);
          cachedUrls[item.url] = true;
        });

        // Drop every occurrence of a cached URL, so a URL that was requested
        // twice is not scraped again when it is already cached.
        var pendingUrls = urls.filter(function(url) {
          return !cachedUrls[url];
        });

        // No URLs to process. All results are cached and up-to-date.
        if (!pendingUrls.length) {
          return {status: true, message: '', results: cachedResults};
        }

        return _scrape(_filterPages(pageList, pendingUrls), type).then(function(result) {
          if (!result || !result.results) {
            return result;
          }
          var fresh = Array.isArray(result.results) ? result.results : [result.results];
          fresh.forEach(function(entry) {
            entry.time = new Date();
          });
          var mergeWithCache = function() {
            result.results = cachedResults.concat(fresh);
            return result;
          };
          // Nothing new to store: skip the write rather than insert an empty batch.
          if (!fresh.length) {
            return mergeWithCache();
          }
          return cacheCollection.insert(fresh).then(mergeWithCache);
        });
      });
  });
};
/**
 * Deletes the cached entries for the given URLs from the collection named
 * `type`, regardless of their age.
 *
 * @param {string} type cache collection name
 * @param {string|string[]} urls a URL or a list of URLs whose entries are removed
 * @returns {*} whatever the collection's remove() returns
 */
Server.prototype.clearCache = function(type, urls) {
  // $in needs an array; wrap a bare URL so a single-URL call works too.
  var urlList = (typeof urls === 'string') ? [urls] : urls;
  return db.collection(type).remove({url: {$in: urlList}});
};
/**
 * Delegates to casper.releaseAll(). Nothing is returned to the caller.
 */
Server.prototype.release = function() {
  // The return value of releaseAll() is intentionally discarded.
  void casper.releaseAll();
};
// Export one shared Server instance. Constructing it here runs the Server
// constructor, and therefore casper.setUp(), as soon as this module is loaded.
module.exports = new Server();
/**
 * Tags every page with the method to run and submits the batch to the job queue.
 *
 * Each element of `pages` is mutated: its `method` property is set to `type`.
 * Synchronous errors are logged and converted into a resolved failure result,
 * so this function does not throw.
 *
 * @param {Object[]} pages pages to process
 * @param {string} type method name assigned to every page
 * @returns {Promise<Object>} the promise returned by jobqueue.post(pages), or a
 *   promise resolved with {status: false, message} when there is nothing to do
 *   or a synchronous error occurred
 */
function _scrape(pages, type) {
  var assignMethod = function(page) {
    page.method = type;
  };

  try {
    pages.forEach(assignMethod);

    logger.info("");
    logger.info("processing " + pages.length + " URLs with method " + type);

    // Guard clause: an empty batch never reaches the queue.
    if (!(pages.length > 0)) {
      return Promise.resolve({
        status: false,
        "message": "Nothing to do"
      });
    }
    return jobqueue.post(pages);
  } catch (err) {
    logger.error(err);
    return Promise.resolve({
      "status": false,
      "message": err.message
    });
  }
}
/**
 * Selects the pages whose `url` is listed in `urls`.
 *
 * URLs are compared with strict equality. The input array is not modified;
 * matching pages are returned in their original order in a new array.
 *
 * @param {Object[]} pages candidate pages, each with a `url` property
 * @param {Array} urls URLs to keep
 * @returns {Object[]} the pages whose `url` occurs in `urls`
 */
function _filterPages(pages, urls) {
  var isRequested = function(page) {
    return urls.indexOf(page.url) !== -1;
  };
  return pages.filter(isRequested);
}