Skip to content
This repository was archived by the owner on Jun 2, 2023. It is now read-only.

Commit b60d75f

Browse files
author
Stefan
committed
refactored classes
PHP 7 only supported from 0.9.0 onwards; added namespaces; fixed bugs; prep for Windows multi-process support
1 parent 013013a commit b60d75f

37 files changed

+6783
-6738
lines changed

README.md

+8-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,13 @@
33

44
Initially just a copy of http://phpcrawl.cuab.de/ forked from [mmerian](https://github.com/mmerian/phpcrawl) for using with composer.
55

6-
### Due to the [main project](https://sourceforge.net/projects/phpcrawl/files/PHPCrawl/) now seemingly being abandoned (having no updates for 4 years) I am going to proceed to make any changes/fixes in this repository.
6+
*Due to the [main project](https://sourceforge.net/projects/phpcrawl/files/PHPCrawl/) now seemingly being abandoned (having no updates for 4 years) I am going to proceed to make any changes/fixes in this repository.*
7+
8+
### Latest updates
9+
- PHP 7 Only - Not backwards compatible with 0.8 versions.
10+
- Introduced namespaces
11+
- Lots of bug fixes
12+
- Refactored various class sections
13+
- Preparation for Windows OS multiprocess mode (pthreads or parallel extension)
714

815
[Pull requests](https://github.com/crispy-computing-machine/phpcrawl/pulls) are welcome

composer.json

+7-7
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
2-
"name": "brittainmedia/phpcrawl",
3-
"description": "PHPCrawl is a webcrawler/webspider-library written in PHP. It supports filters, limiters, cookie-handling, robots.txt-handling, multiprocessing and much more.",
4-
"license": "GPL-2.0",
5-
"autoload": {
6-
"psr-4": {
7-
"PHPCrawl\\": "libs/"
8-
}
2+
"name": "brittainmedia/phpcrawl",
3+
"description": "PHPCrawl is a webcrawler/webspider-library written in PHP. It supports filters, limiters, cookie-handling, robots.txt-handling, multiprocessing and much more.",
4+
"license": "GPL-2.0",
5+
"autoload": {
6+
"psr-4": {
7+
"PHPCrawl\\": "libs/"
98
}
9+
}
1010
}

example.php

+46-38
Original file line numberDiff line numberDiff line change
@@ -2,45 +2,54 @@
22

33

44
// It may take a while to crawl a site ...
5+
use PHPCrawl\PHPCrawler;
6+
use PHPCrawl\PHPCrawlerDocumentInfo;
7+
58
set_time_limit(10000);
69

710
// Include the phpcrawl-mainclass
811
include('libs/PHPCrawler.php');
912

1013
// Extend the class and override the handleDocumentInfo()-method
11-
class MyCrawler extends PHPCrawler
14+
15+
/**
16+
* Class MyCrawler
17+
*/
18+
class MyCrawler extends PHPCrawler
1219
{
13-
public function handleDocumentInfo($DocInfo)
14-
{
15-
// Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br>")..
16-
if (PHP_SAPI === 'cli') {
17-
$lb = "\n";
18-
}
19-
else {
20-
$lb = "<br />";
21-
}
20+
/**
21+
* @param PHPCrawlerDocumentInfo $DocInfo
22+
* @return int|void
23+
*/
24+
public function handleDocumentInfo($DocInfo)
25+
{
26+
// Just detect linebreak for output ("\n" in CLI-mode, otherwise "<br>")..
27+
if (PHP_SAPI === 'cli') {
28+
$lb = "\n";
29+
} else {
30+
$lb = "<br />";
31+
}
2232

23-
// Print the URL and the HTTP-status-Code
24-
echo 'Page requested: ' .$DocInfo->url. ' (' .$DocInfo->http_status_code. ')' .$lb;
25-
26-
// Print the referring URL
27-
echo 'Referer-page: ' .$DocInfo->referer_url.$lb;
28-
29-
// Print whether the content of the document was received or not
30-
if ($DocInfo->received == true) {
31-
echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
32-
}
33-
else {
34-
echo "Content not received" . $lb;
33+
// Print the URL and the HTTP-status-Code
34+
echo 'Page requested: ' . $DocInfo->url . ' (' . $DocInfo->http_status_code . ')' . $lb;
35+
36+
// Print the referring URL
37+
echo 'Referer-page: ' . $DocInfo->referer_url . $lb;
38+
39+
// Print whether the content of the document was received or not
40+
if ($DocInfo->received == true) {
41+
echo "Content received: " . $DocInfo->bytes_received . " bytes" . $lb;
42+
} else {
43+
echo "Content not received" . $lb;
44+
}
45+
46+
// Now you should do something with the content of the actual
47+
// received page or file ($DocInfo->source), we skip it in this example
48+
49+
echo $lb;
50+
51+
flush();
3552
}
36-
37-
// Now you should do something with the content of the actual
38-
// received page or file ($DocInfo->source), we skip it in this example
39-
40-
echo $lb;
41-
42-
flush();
43-
}
4453
}
4554

4655
// Now, create a instance of your class, define the behaviour
@@ -74,13 +83,12 @@ public function handleDocumentInfo($DocInfo)
7483

7584
if (PHP_SAPI === 'cli') {
7685
$lb = "\n";
77-
}
78-
else {
86+
} else {
7987
$lb = "<br />";
8088
}
81-
82-
echo 'Summary:' .$lb;
83-
echo 'Links followed: ' .$report->links_followed.$lb;
84-
echo 'Documents received: ' .$report->files_received.$lb;
85-
echo 'Bytes received: ' .$report->bytes_received. ' bytes' .$lb;
86-
echo 'Process runtime: ' .$report->process_runtime. ' sec' .$lb;
89+
90+
echo 'Summary:' . $lb;
91+
echo 'Links followed: ' . $report->links_followed . $lb;
92+
echo 'Documents received: ' . $report->files_received . $lb;
93+
echo 'Bytes received: ' . $report->bytes_received . ' bytes' . $lb;
94+
echo 'Process runtime: ' . $report->process_runtime . ' sec' . $lb;
+29-26
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
<?php
2+
23
namespace PHPCrawl\CookieCache;
34

5+
use PHPCrawl\PHPCrawlerCookieDescriptor;
6+
47
/**
58
* Abstract baseclass for storing cookies.
69
*
@@ -9,30 +12,30 @@
912
*/
1013
abstract class PHPCrawlerCookieCacheBase
1114
{
12-
/**
13-
* Adds a cookie to the cookie-cache.
14-
*
15-
* @param PHPCrawlerCookieDescriptor $Cookie The cookie to add.
16-
*/
17-
abstract public function addCookie(PHPCrawlerCookieDescriptor $Cookie);
18-
19-
/**
20-
* Adds a bunch of cookies to the cookie-cache.
21-
*
22-
* @param array $cookies Numeric array containing the cookies to add as PHPCrawlerCookieDescriptor-objects
23-
*/
24-
abstract public function addCookies($cookies);
25-
26-
/**
27-
* Returns all cookies from the cache that are addressed to the given URL
28-
*
29-
* @param string $target_url The target-URL
30-
* @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
31-
*/
32-
abstract public function getCookiesForUrl($target_url);
33-
34-
/**
35-
* Do cleanups after the cache is not needed anymore
36-
*/
37-
abstract public function cleanup();
15+
/**
16+
* Adds a cookie to the cookie-cache.
17+
*
18+
* @param PHPCrawlerCookieDescriptor $Cookie The cookie to add.
19+
*/
20+
abstract public function addCookie(PHPCrawlerCookieDescriptor $Cookie);
21+
22+
/**
23+
* Adds a bunch of cookies to the cookie-cache.
24+
*
25+
* @param array $cookies Numeric array containing the cookies to add as PHPCrawlerCookieDescriptor-objects
26+
*/
27+
abstract public function addCookies($cookies);
28+
29+
/**
30+
* Returns all cookies from the cache that are addressed to the given URL
31+
*
32+
* @param string $target_url The target-URL
33+
* @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
34+
*/
35+
abstract public function getCookiesForUrl($target_url);
36+
37+
/**
38+
* Do cleanups after the cache is not needed anymore
39+
*/
40+
abstract public function cleanup();
3841
}
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
<?php
2+
23
namespace PHPCrawl\CookieCache;
34

5+
use PHPCrawl\PHPCrawlerCookieDescriptor;
6+
use PHPCrawl\Utils\PHPCrawlerUtils;
7+
48
/**
59
* Class for storing/caching cookies in memory.
610
*
@@ -9,86 +13,84 @@
913
*/
1014
class PHPCrawlerMemoryCookieCache extends PHPCrawlerCookieCacheBase
1115
{
12-
protected $cookies = [];
13-
14-
/**
15-
* Adds a cookie to the cookie-cache.
16-
*
17-
* @param PHPCrawlerCookieDescriptor $Cookie The cookie to add.
18-
*/
19-
public function addCookie(PHPCrawlerCookieDescriptor $Cookie)
20-
{
21-
$source_domain = $Cookie->source_domain;
22-
$cookie_domain = $Cookie->domain;
23-
$cookie_path = $Cookie->path;
24-
$cookie_name = $Cookie->name;
25-
26-
$cookie_hash = md5($cookie_domain. '_' .$cookie_path. '_' .$cookie_name);
27-
28-
$this->cookies[$source_domain][$cookie_hash] = $Cookie;
29-
}
30-
31-
/**
32-
* Adds a bunch of cookies to the cookie-cache.
33-
*
34-
* @param array $cookies Numeric array containing the cookies to add as PHPCrawlerCookieDescriptor-objects
35-
*/
36-
public function addCookies($cookies)
37-
{
38-
for ($x=0, $xMax = count($cookies); $x< $xMax; $x++)
16+
protected $cookies = [];
17+
18+
/**
19+
* Adds a cookie to the cookie-cache.
20+
*
21+
* @param PHPCrawlerCookieDescriptor $Cookie The cookie to add.
22+
*/
23+
public function addCookie(PHPCrawlerCookieDescriptor $Cookie)
3924
{
40-
$this->addCookie($cookies[$x]);
25+
$source_domain = $Cookie->source_domain;
26+
$cookie_domain = $Cookie->domain;
27+
$cookie_path = $Cookie->path;
28+
$cookie_name = $Cookie->name;
29+
30+
$cookie_hash = md5($cookie_domain . '_' . $cookie_path . '_' . $cookie_name);
31+
32+
$this->cookies[$source_domain][$cookie_hash] = $Cookie;
4133
}
42-
}
43-
44-
/**
45-
* Returns all cookies from the cache that are addressed to the given URL
46-
*
47-
* @param string $target_url The target-URL
48-
* @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
49-
*/
50-
public function getCookiesForUrl($target_url)
51-
{
52-
$url_parts = PHPCrawlerUtils::splitURL($target_url);
53-
54-
$target_domain = $url_parts['domain']; // e.g. acme.com
55-
56-
$return_cookies = [];
57-
58-
// Iterate over all cookies of this domain
59-
@reset($this->cookies[$target_domain]);
60-
while (list($hash) = @each($this->cookies[$target_domain]))
34+
35+
/**
36+
* Adds a bunch of cookies to the cookie-cache.
37+
*
38+
* @param array $cookies Numeric array containing the cookies to add as PHPCrawlerCookieDescriptor-objects
39+
*/
40+
public function addCookies($cookies)
6141
{
62-
$Cookie = $this->cookies[$target_domain][$hash];
63-
64-
// Does the cookie-domain match?
65-
// Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
66-
// A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
67-
// Seems like ".acme.com" should also match "anvil.acme.com", so just remove the dot
68-
69-
$Cookie->domain = preg_replace('#^.#', '', $Cookie->domain);
70-
71-
if ($Cookie->domain == $url_parts['host'] || preg_match('#' .preg_quote($Cookie->domain). '$#', $url_parts['host']))
72-
{
73-
// Does the path match?
74-
if (preg_match('#^' .preg_quote($Cookie->path). '#', $url_parts['path']))
75-
{
76-
$return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies
42+
for ($x = 0, $xMax = count($cookies); $x < $xMax; $x++) {
43+
$this->addCookie($cookies[$x]);
44+
}
45+
}
46+
47+
/**
48+
* Returns all cookies from the cache that are addressed to the given URL
49+
*
50+
* @param string $target_url The target-URL
51+
* @return array Numeric array containing all matching cookies as PHPCrawlerCookieDescriptor-objects
52+
*/
53+
public function getCookiesForUrl($target_url)
54+
{
55+
$url_parts = PHPCrawlerUtils::splitURL($target_url);
56+
57+
$target_domain = $url_parts['domain']; // e.g. acme.com
58+
59+
$return_cookies = [];
60+
61+
// Iterate over all cookies of this domain
62+
if (isset($this->cookies[$target_domain])) {
63+
foreach ($this->cookies[$target_domain] as $hash => $hash_value) {
64+
$Cookie = $this->cookies[$target_domain][$hash];
65+
66+
// Does the cookie-domain match?
67+
// Tail-matching, see http://curl.haxx.se/rfc/cookie_spec.html:
68+
// A domain attribute of "acme.com" would match host names "anvil.acme.com" as well as "shipping.crate.acme.com"
69+
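// Seems like ".acme.com" should also match "anvil.acme.com", so just remove the dot (NOTE(review): the pattern '#^.#' below strips ANY first character, not only a leading dot, and the result is written back to the cached cookie - '#^\.#' looks intended; confirm)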
70+
71+
$Cookie->domain = preg_replace('#^.#', '', $Cookie->domain);
72+
73+
if ($Cookie->domain == $url_parts['host'] || preg_match('#' . preg_quote($Cookie->domain) . '$#', $url_parts['host'])) {
74+
// Does the path match?
75+
if (preg_match('#^' . preg_quote($Cookie->path) . '#', $url_parts['path'])) {
76+
$return_cookies[$Cookie->name] = $Cookie; // Use cookie-name as index to avoid double-cookies
77+
}
78+
}
79+
}
80+
7781
}
78-
}
82+
83+
// Convert to numeric array
84+
$return_cookies = array_values($return_cookies);
85+
86+
return $return_cookies;
87+
}
88+
89+
/**
90+
* Cleans up the cache after it is no longer needed.
91+
*/
92+
public function cleanup()
93+
{
94+
$this->cookies = [];
7995
}
80-
81-
// Convert to numeric array
82-
$return_cookies = array_values($return_cookies);
83-
84-
return $return_cookies;
85-
}
86-
87-
/**
88-
* Cleans up the cache after it is no longer needed.
89-
*/
90-
public function cleanup()
91-
{
92-
$this->cookies = [];
93-
}
9496
}

0 commit comments

Comments
 (0)