forked from vezaynk/Sitemap-Generator-Crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sitemap.php
executable file
·127 lines (104 loc) · 3.51 KB
/
sitemap.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
<?php
/***************************\
|***DO NOT EDIT THIS FILE***|
|**EDIT sitemap.config.php**|
\***************************/
// Report every class of PHP error while the script runs
error_reporting(E_ALL);

// Load the global settings from the config file first ...
require_once('sitemap.config.php');
// ... then the function library
require_once('sitemap.functions.php');

// Send plain text: with the default html header browsers ignore "\n"
header("Content-Type: text/plain");

// Off by default; switched on further down for CLI runs
$color = false;

// Version of this file. $version_functions and $version_config are not set here
// (presumably by the two files required above); all three have to agree.
$version_script = 2;

$versions_match = ($version_script == $version_functions) && ($version_functions == $version_config);
if (!$versions_match) {
    // Tell the user which file is out of date, then refuse to run
    logger("Script versions mismatch!", 3);
    logger("Update necessary", 3);
    logger("Version of sitemap.functions.php " .$version_functions, 3);
    logger("Version of sitemap.config.php " .$version_config, 3);
    logger("Version of sitemap.php " .$version_script, 3);
    logger("Download new files here: https://www.github.com/knyzorg/sitemap-generator-crawler", 3);
    die("Stopped.");
}
// Add PHP CLI support
// Options are given as key=value words (e.g. `php sitemap.php site=https://example.com file=sitemap.xml`).
// Joining them with "&" lets parse_str() read them like a query string into $args.
if (php_sapi_name() === 'cli') {
    parse_str(implode('&', array_slice($argv, 1)), $args);
    // Only $color depends on the platform; the arguments themselves are honored on Windows too
    if (PHP_OS != 'WINNT') {
        $color = true;
    }
}

//Allow variable overloading with CLI
// Whitelist of settings that a command-line argument of the same name replaces verbatim.
// Values arrive as strings (or arrays for key[]=value arguments), exactly as parse_str() produced them.
$cli_overridable = array(
    'file',
    'site',
    'max_depth',
    'enable_frequency',
    'enable_priority',
    'enable_modified',
    'freq',
    'priority',
    'blacklist',
    'debug',
    'pdf_index',
);
foreach ($cli_overridable as $cli_option) {
    // isset() is also false when $args was never created (non-CLI request)
    if (isset($args[$cli_option])) {
        $$cli_option = $args[$cli_option];
    }
}
// Do not leave the loop helpers behind in the global scope
unset($cli_overridable, $cli_option);

// ignore_arguments is the one option that is converted to a real boolean
if (isset($args['ignore_arguments'])) {
    // Plain truthiness would turn the strings "false", "no" and "off" into true, so
    // recognized boolean words are parsed first; anything else keeps the old truthiness rule.
    $cli_flag = filter_var($args['ignore_arguments'], FILTER_VALIDATE_BOOLEAN, FILTER_NULL_ON_FAILURE);
    $ignore_arguments = ($cli_flag === null) ? !!$args['ignore_arguments'] : $cli_flag;
    unset($cli_flag);
}
// Start the clock; the elapsed time is reported after the crawl
$start = microtime(true);

// The sitemap is written to a scratch file in the system temp directory first
$tempfile = tempnam(sys_get_temp_dir(), 'sitemap.xml.');
$file_stream = fopen($tempfile, "w");
if (!$file_stream) {
    die("Error: Could not create temporary file $tempfile" . "\n");
}
fwrite($file_stream, $xmlheader);

// Global crawler state, not user defined
$scanned = array();
$deferredLinks = array();
$indexed = 0;
$depth = 0;

// Let domain_root() reduce the configured site to its root and report it when that changed the value
$real_site = domain_root($site);
if ($site != $real_site) {
    logger("Reformatted site from $site to $real_site", 2);
}

// Start crawling at the (possibly reformatted) site url
scan_url($real_site);
// Finalize sitemap
fclose($file_stream);

// Pretty-print sitemap
// Ask the shell where xmllint lives; any non-empty answer counts as "installed"
$xmllint_lookup = (PHP_OS == 'WINNT') ? `where xmllint` : `which xmllint`;
if ($xmllint_lookup) {
    logger("Found xmllint, pretty-printing sitemap", 0);
    // Quote the path so that spaces or shell metacharacters in the temp directory
    // cannot break (or inject into) the command line
    $quoted_tempfile = escapeshellarg($tempfile);
    // The file is reformatted in place; stderr is merged into stdout so that the
    // last output line returned by exec() can serve as the error message
    $responsevalue = exec('xmllint --format ' . $quoted_tempfile . ' -o ' . $quoted_tempfile . ' 2>&1', $discardedoutputvalue, $returnvalue);
    if ($returnvalue) {
        // Non-zero exit status: stop here, before the sitemap file is replaced
        die("Error: " . $responsevalue . "\n");
    }
}
// Generate statistics
$time_elapsed_secs = round(microtime(true) - $start, 2);
$size = count($scanned);

// Rename partial file to the real file name. `rename()` overwrites any existing files.
// This happens before the summary is printed so that "saved to" is only reported once it is true.
if (!rename($tempfile, $file)) {
    die("Error: Could not move temporary file $tempfile to $file" . "\n");
}
// Apply permissions
chmod($file, $permissions);

// Print out statistics ("1 second", but "0.5 seconds" / "2 seconds")
$plural_suffix = ($time_elapsed_secs == 1) ? '' : 's';
logger("Sitemap has been generated in " . $time_elapsed_secs . " second" . $plural_suffix . " and saved to $file", 0);
logger("Scanned a total of $size pages and indexed $indexed pages.", 0);

// Declare that the script has finished executing and exit
logger("Operation Completed", 0);