-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDownloader.php
executable file
·131 lines (106 loc) · 3.23 KB
/
Downloader.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
<?php
/**
 * Batch HTTP downloader built on cURL's multi interface.
 *
 * Given a list of URLs, fetchData() filters out URLs whose top-level domain
 * is not on an allow-list or whose path extension is on a deny-list, then
 * downloads the remaining URLs in parallel while sending browser-like
 * request headers.
 */
class Downloader
{
    /** Top-level domains fetchData() is willing to download from (lowercase). */
    private static $allowedTopLevels = array(
        "com", "org", "edu", "net", "uk",
        "us", "tv", "gov", "info",
    );

    /** Path extensions fetchData() refuses to download (lowercase). */
    private static $blockedExtensions = array(
        "jpg", "png", "gif", "jpeg", "bmp",
        "swf", "js", "css", "pdf", "wmv", "ppt",
    );

    public function __construct()
    {
    }

    /**
     * Extract the authority part (host, plus port if present) of an
     * http:// or https:// URL.
     *
     * @param string $url Absolute URL.
     * @return string|false The authority, or false when $url is not a
     *                      string starting with an http(s) scheme.
     */
    public function baseURL($url)
    {
        if (!is_string($url)) {
            return false;
        }
        // The scheme is matched case-insensitively; the authority ends at the
        // first "/", "?" or "#" so a query string on a path-less URL is not
        // mistaken for part of the host.
        if (!preg_match('/^https?:\/\/([^\/?#]*)/i', $url, $matches)) {
            return false;
        }
        return $matches[1];
    }

    /**
     * Return the file extension of the path component of a URL.
     *
     * Only the path is inspected, so dots in the host name, query string or
     * fragment never leak into the result.
     *
     * @param string $url URL or bare file name.
     * @return string Extension without the leading dot (original case), or
     *                '' when the path has no extension or cannot be parsed.
     */
    public function FileExtension($url)
    {
        if (!is_string($url)) {
            return '';
        }
        $path = parse_url($url, PHP_URL_PATH);
        if (!is_string($path) || $path === '') {
            return '';
        }
        return pathinfo($path, PATHINFO_EXTENSION);
    }

    /**
     * Return the last dot-separated label of a URL's host name.
     *
     * @param string $url Absolute http(s) URL.
     * @return string The last host label (e.g. "com"), or '' when the URL
     *                has no http(s) authority.
     */
    public function topLevelDomain($url)
    {
        $host = $this->baseURL($url);
        if ($host === false) {
            return '';
        }
        // baseURL() keeps an explicit port ("example.com:8080"); drop it so
        // it does not end up glued to the last label.
        $host = preg_replace('/:\d*$/', '', $host);
        $dot = strrpos($host, '.');
        return $dot === false ? $host : substr($host, $dot + 1);
    }

    /**
     * Download a batch of URLs in parallel.
     *
     * URLs are skipped when they are not strings, are duplicates of an
     * earlier entry, have a top-level domain outside the allow-list, or
     * have a path extension on the deny-list.
     *
     * @param array $urls List of absolute URLs.
     * @return array Map of URL => response body as returned by
     *               curl_multi_getcontent() for each URL that was attempted.
     */
    public function fetchData($urls)
    {
        $datas = array();
        $chs = array();

        foreach ($urls as $url) {
            // Skipping duplicates prevents an earlier handle for the same URL
            // from being overwritten in $chs and leaked.
            if (!is_string($url) || isset($chs[$url]) || !$this->isDownloadable($url)) {
                continue;
            }
            $chs[$url] = $this->createHandle($url);
        }

        if (count($chs) === 0) {
            return $datas;
        }

        $cmh = curl_multi_init();
        foreach ($chs as $ch) {
            curl_multi_add_handle($cmh, $ch);
        }

        /* Drive all transfers to completion. */
        $threads = 0;
        do {
            do {
                $rc = curl_multi_exec($cmh, $threads);
            } while ($rc === CURLM_CALL_MULTI_PERFORM);

            if ($rc !== CURLM_OK) {
                // The multi stack reported an error; stop driving it and
                // return whatever has been received so far.
                break;
            }

            // Wait for socket activity instead of spinning on
            // curl_multi_exec(). A return of -1 means select() failed, so
            // sleep briefly to avoid a tight loop.
            if ($threads > 0 && curl_multi_select($cmh, 1.0) === -1) {
                usleep(100000);
            }
        } while ($threads > 0);

        /* Collect the bodies and release every handle. */
        foreach ($chs as $key => $ch) {
            $datas[$key] = curl_multi_getcontent($ch);
            curl_multi_remove_handle($cmh, $ch);
            curl_close($ch);
        }
        curl_multi_close($cmh);

        return $datas;
    } // END fetchData

    /**
     * Decide whether a URL passes the top-level-domain and extension filters.
     *
     * Both comparisons are exact and case-insensitive.
     *
     * @param string $url Absolute URL.
     * @return bool True when the URL should be downloaded.
     */
    private function isDownloadable($url)
    {
        $toplevel = strtolower($this->topLevelDomain($url));
        if (!in_array($toplevel, self::$allowedTopLevels, true)) {
            return false;
        }

        $extension = strtolower(trim($this->FileExtension($url)));
        return !in_array($extension, self::$blockedExtensions, true);
    }

    /**
     * Build a configured cURL easy handle for one URL.
     *
     * @param string $url Absolute URL to request.
     * @return resource|object cURL easy handle (not yet executed).
     */
    private function createHandle($url)
    {
        /* MAKE IT LOOK LIKE A BROWSER (Firefox on a Mac) */
        $header = array();
        $header[0] = "Accept: text/xml,application/xml,application/xhtml+xml,";
        $header[0] .= "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
        $header[] = "Cache-Control: max-age=0";
        $header[] = "Connection: keep-alive";
        $header[] = "Keep-Alive: 300";
        $header[] = "Accept-Charset: utf-8;q=0.7,*;q=0.7";
        $header[] = "Accept-Language: en-us,en;q=0.5";
        $header[] = "Pragma: "; // browsers keep this blank.

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
        curl_setopt($ch, CURLOPT_REFERER, 'http://www.facebook.com/');
        curl_setopt($ch, CURLOPT_USERAGENT,
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS ' .
            'X 10.5; en-US; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7'
        );
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_setopt($ch, CURLOPT_AUTOREFERER, TRUE);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

        // NOTE(review): peer certificate verification is disabled, as in the
        // original code. HTTPS responses are therefore not authenticated;
        // enable CURLOPT_SSL_VERIFYPEER (optionally with CURLOPT_CAINFO) if
        // the downloaded content is trusted for anything.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);

        /* MUST set CURLOPT_COOKIEJAR to a file for CURL to use cookies */
        curl_setopt($ch, CURLOPT_COOKIESESSION, 1); // start a new cookie session
        curl_setopt($ch, CURLOPT_COOKIEJAR, "./cookies.txt");
        curl_setopt($ch, CURLOPT_COOKIEFILE, "./cookies.txt");

        return $ch;
    }
} // END CLASS
?>