Page Menu
Home
Phabricator
Search
Configure Global Search
Log In
Files
F3778
generateSitemap.php
Public
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Authored By
•
bzimport
Nov 21 2014, 9:38 PM
2014-11-21 21:38:02 (UTC+0)
Size
11 KB
Referenced Files
None
Subscribers
None
generateSitemap.php
View Options
<?php
// Pseudo-namespace keys used in GenerateSitemap::$priorities to hold the
// fallback priorities for namespaces that have no explicit entry there.
// GS_MAIN is the fallback for subject ("main"-like) namespaces.
define( 'GS_MAIN', -2 );
// GS_TALK is the fallback for every other (talk) namespace.
define( 'GS_TALK', -1 );
/**
* Creates a Google sitemap for the site
*
* @addtogroup Maintenance
*
* @copyright Copyright © 2005, Ævar Arnfjörð Bjarmason
* @copyright Copyright © 2005, Jens Frank <jeluf@gmx.de>
* @copyright Copyright © 2005, Brion Vibber <brion@pobox.com>
*
* @see http://www.google.com/webmasters/sitemaps/docs/en/about.html
* @see http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
*
* @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
*
* Modified by gadgetdoctor to correct local paths issue in the index
* Bugzilla: Bug 9675 https://bugzilla.wikimedia.org/show_bug.cgi?id=9675
*
*/
class GenerateSitemap {
	/**
	 * The maximum amount of urls in a sitemap file
	 *
	 * @link http://www.google.com/schemas/sitemap/0.84/sitemap.xsd
	 *
	 * @var int
	 */
	var $url_limit;

	/**
	 * The maximum size of a sitemap file, in bytes
	 *
	 * @link http://www.google.com/webmasters/sitemaps/docs/en/protocol.html#faq_sitemap_size
	 *
	 * @var int
	 */
	var $size_limit;

	/**
	 * The path to prepend to the filename
	 *
	 * @var string
	 */
	var $fspath;

	/**
	 * The path to append to the domain name
	 *
	 * NOTE(review): nothing in this class assigns or reads this property;
	 * the web path used in the index comes from the global $wgWebpath.
	 *
	 * @var string
	 */
	var $path;

	/**
	 * Whether or not to use compression
	 *
	 * @var bool
	 */
	var $compress;

	/**
	 * Byte lengths used for the size check, filled in by generateLimit():
	 * [0] length of the file header, [1] length of one sample url entry,
	 * [2] length of the file footer.
	 *
	 * @var array
	 */
	var $limit = array();

	/**
	 * Key => value entries of namespaces and their priorities
	 *
	 * @var array
	 */
	var $priorities = array(
		// Custom main namespaces
		GS_MAIN => '0.5',
		// Custom talk namespaces
		GS_TALK => '0.1',
		// MediaWiki standard namespaces
		NS_MAIN => '1.0',
		NS_TALK => '0.1',
		NS_USER => '0.5',
		NS_USER_TALK => '0.1',
		NS_PROJECT => '0.5',
		NS_PROJECT_TALK => '0.1',
		NS_IMAGE => '0.5',
		NS_IMAGE_TALK => '0.1',
		NS_MEDIAWIKI => '0.0',
		NS_MEDIAWIKI_TALK => '0.1',
		NS_TEMPLATE => '0.0',
		NS_TEMPLATE_TALK => '0.1',
		NS_HELP => '0.5',
		NS_HELP_TALK => '0.1',
		NS_CATEGORY => '0.5',
		NS_CATEGORY_TALK => '0.1',
	);

	/**
	 * A one-dimensional array of namespaces in the wiki
	 *
	 * @var array
	 */
	var $namespaces = array();

	/**
	 * When this sitemap batch was generated
	 *
	 * @var string
	 */
	var $timestamp;

	/**
	 * A database slave object
	 *
	 * @var object
	 */
	var $dbr;

	/**
	 * A resource pointing to the sitemap index file
	 *
	 * @var resource
	 */
	var $findex;

	/**
	 * A resource pointing to a sitemap file
	 *
	 * @var resource
	 */
	var $file;

	/**
	 * A resource pointing to php://stderr
	 *
	 * @var resource
	 */
	var $stderr;

	/**
	 * Constructor
	 *
	 * Sets the limits, opens stderr and the sitemap index file, and loads
	 * the list of namespaces that contain pages.
	 *
	 * @param string $fspath The path to prepend to the filenames, used to
	 *     save them somewhere else than in the root directory
	 * @param bool $compress Whether to compress the sitemap files
	 */
	function __construct( $fspath, $compress ) {
		$this->url_limit = 50000;
		$this->size_limit = pow( 2, 20 ) * 10;
		$this->fspath = isset( $fspath ) ? $fspath : '';
		$this->compress = $compress;

		$this->stderr = fopen( 'php://stderr', 'wt' );
		$this->dbr = wfGetDB( DB_SLAVE );
		$this->generateNamespaces();
		$this->timestamp = wfTimestamp( TS_ISO_8601, wfTimestampNow() );
		$this->findex = fopen( "{$this->fspath}sitemap-index-" . wfWikiID() . ".xml", 'wb' );
	}

	/**
	 * Old-style constructor name, kept so existing callers keep working.
	 * Declared after __construct() so that __construct() is the
	 * constructor wherever both forms are recognised.
	 *
	 * @param string $fspath See __construct()
	 * @param bool $compress See __construct()
	 */
	function GenerateSitemap( $fspath, $compress ) {
		$this->__construct( $fspath, $compress );
	}

	/**
	 * Generate a one-dimensional array of existing namespaces
	 *
	 * Selects the distinct page_namespace values from the page table, in
	 * ascending order, and appends them to $this->namespaces.
	 */
	function generateNamespaces() {
		$fname = 'GenerateSitemap::generateNamespaces';

		$res = $this->dbr->select( 'page',
			array( 'page_namespace' ),
			array(),
			$fname,
			array(
				'GROUP BY' => 'page_namespace',
				'ORDER BY' => 'page_namespace',
			)
		);

		while ( $row = $this->dbr->fetchObject( $res ) ) {
			$this->namespaces[] = $row->page_namespace;
		}
	}

	/**
	 * Get the priority of a given namespace
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	function priority( $namespace ) {
		return isset( $this->priorities[$namespace] )
			? $this->priorities[$namespace]
			: $this->guessPriority( $namespace );
	}

	/**
	 * If the namespace isn't listed on the priority list return the
	 * default priority for the namespace, varies depending on whether it's
	 * a talkpage or not.
	 *
	 * @param int $namespace The namespace to get the priority for
	 *
	 * @return string
	 */
	function guessPriority( $namespace ) {
		// "namespace" is a reserved word from PHP 5.3 on, so a literal
		// Namespace::isMain() call no longer parses. Resolve the helper
		// class at runtime instead: prefer MWNamespace (assumed to be the
		// renamed class in newer MediaWiki - TODO confirm for the target
		// install) and fall back to the old name.
		$nsClass = class_exists( 'MWNamespace' ) ? 'MWNamespace' : 'Namespace';
		$isMain = call_user_func( array( $nsClass, 'isMain' ), $namespace );

		return $isMain ? $this->priorities[GS_MAIN] : $this->priorities[GS_TALK];
	}

	/**
	 * Return a database resolution of all the pages in a given namespace
	 *
	 * @param int $namespace Limit the query to this namespace
	 *
	 * @return resource
	 */
	function getPageRes( $namespace ) {
		$fname = 'GenerateSitemap::getPageRes';

		return $this->dbr->select( 'page',
			array(
				'page_namespace',
				'page_title',
				'page_touched',
			),
			array( 'page_namespace' => $namespace ),
			$fname
		);
	}

	/**
	 * Main loop
	 *
	 * Writes the sitemap index and, per namespace, one or more sitemap
	 * files. A new sitemap file is started for the first row of a
	 * namespace, after url_limit rows, or when the running byte count plus
	 * one sample entry plus the footer would exceed size_limit.
	 *
	 * NOTE(review): the row counter is only advanced once per page, so the
	 * extra entries written for language variants are not counted against
	 * url_limit.
	 *
	 * @access public
	 */
	function main() {
		global $wgContLang;

		fwrite( $this->findex, $this->openIndex() );

		foreach ( $this->namespaces as $namespace ) {
			$res = $this->getPageRes( $namespace );
			$this->file = false;
			$this->generateLimit( $namespace );
			$length = $this->limit[0];
			$i = $smcount = 0;

			$fns = $wgContLang->getFormattedNsText( $namespace );
			$this->debug( "$namespace ($fns)" );
			while ( $row = $this->dbr->fetchObject( $res ) ) {
				if ( $i++ === 0
					|| $i === $this->url_limit + 1
					|| $length + $this->limit[1] + $this->limit[2] > $this->size_limit
				) {
					// Finish the previous sitemap file of this namespace, if any
					if ( $this->file !== false ) {
						$this->write( $this->file, $this->closeFile() );
						$this->close( $this->file );
					}
					// Start the next file and register it in the index
					$filename = $this->sitemapFilename( $namespace, $smcount++ );
					$this->file = $this->open( $this->fspath . $filename, 'wb' );
					$this->write( $this->file, $this->openFile() );
					fwrite( $this->findex, $this->indexEntry( $filename ) );
					$this->debug( "\t$filename" );
					$length = $this->limit[0];
					$i = 1;
				}
				$title = Title::makeTitle( $row->page_namespace, $row->page_title );
				$date = wfTimestamp( TS_ISO_8601, $row->page_touched );
				$entry = $this->fileEntry( $title->getFullURL(), $date, $this->priority( $namespace ) );
				$length += strlen( $entry );
				$this->write( $this->file, $entry );
				// generate pages for language variants
				if ( $wgContLang->hasVariants() ) {
					$variants = $wgContLang->getVariants();
					foreach ( $variants as $vCode ) {
						if ( $vCode == $wgContLang->getCode() ) {
							continue; // we don't want default variant
						}
						$entry = $this->fileEntry( $title->getFullURL( '', $vCode ), $date, $this->priority( $namespace ) );
						$length += strlen( $entry );
						$this->write( $this->file, $entry );
					}
				}
			}
			if ( $this->file ) {
				$this->write( $this->file, $this->closeFile() );
				$this->close( $this->file );
			}
		}
		fwrite( $this->findex, $this->closeIndex() );
		fclose( $this->findex );
	}

	/**
	 * gzopen() / fopen() wrapper
	 *
	 * @param string $file Path of the file to open
	 * @param string $flags Mode string passed through to gzopen() / fopen()
	 *
	 * @return resource
	 */
	function open( $file, $flags ) {
		return $this->compress ? gzopen( $file, $flags ) : fopen( $file, $flags );
	}

	/**
	 * gzwrite() / fwrite() wrapper
	 *
	 * @param resource $handle Handle returned by open()
	 * @param string $str Data to write
	 */
	function write( &$handle, $str ) {
		if ( $this->compress ) {
			gzwrite( $handle, $str );
		} else {
			fwrite( $handle, $str );
		}
	}

	/**
	 * gzclose() / fclose() wrapper
	 *
	 * @param resource $handle Handle returned by open()
	 */
	function close( &$handle ) {
		if ( $this->compress ) {
			gzclose( $handle );
		} else {
			fclose( $handle );
		}
	}

	/**
	 * Get a sitemap filename
	 *
	 * The name gets a ".gz" suffix when compression is enabled.
	 *
	 * @param int $namespace The namespace
	 * @param int $count The count
	 *
	 * @return string
	 */
	function sitemapFilename( $namespace, $count ) {
		$ext = $this->compress ? '.gz' : '';
		return "sitemap-" . wfWikiID() . "-NS_$namespace-$count.xml$ext";
	}

	/**
	 * Return the XML required to open an XML file
	 *
	 * @return string
	 */
	function xmlHead() {
		return '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
	}

	/**
	 * Return the XML schema being used
	 *
	 * @return string
	 */
	function xmlSchema() {
		return 'http://www.google.com/schemas/sitemap/0.84';
	}

	/**
	 * Return the XML required to open a sitemap index file
	 *
	 * @return string
	 */
	function openIndex() {
		return $this->xmlHead() . '<sitemapindex xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap indexfile entry
	 *
	 * The location is built from the globals $wgServer and $wgWebpath plus
	 * the filename, and is XML-escaped.
	 *
	 * @param string $filename The filename of the sitemap file
	 *
	 * @return string
	 */
	function indexEntry( $filename ) {
		global $wgServer, $wgWebpath;

		// $wgWebpath is only set by the caller when --webpath is given;
		// treat it as empty otherwise instead of reading an unset variable.
		$webpath = isset( $wgWebpath ) ? $wgWebpath : '';
		$loc = htmlspecialchars( "$wgServer$webpath/$filename", ENT_QUOTES );

		return
			"\t<sitemap>\n" .
			"\t\t<loc>$loc</loc>\n" .
			"\t\t<lastmod>{$this->timestamp}</lastmod>\n" .
			"\t</sitemap>\n";
	}

	/**
	 * Return the XML required to close a sitemap index file
	 *
	 * @return string
	 */
	function closeIndex() {
		return "</sitemapindex>\n";
	}

	/**
	 * Return the XML required to open a sitemap file
	 *
	 * @return string
	 */
	function openFile() {
		return $this->xmlHead() . '<urlset xmlns="' . $this->xmlSchema() . '">' . "\n";
	}

	/**
	 * Return the XML for a single sitemap entry
	 *
	 * @param string $url An RFC 2396 compliant URL; it is XML-escaped here,
	 *     so pass it unescaped
	 * @param string $date A ISO 8601 date
	 * @param string $priority A priority indicator, 0.0 - 1.0 inclusive with a 0.1 stepsize
	 *
	 * @return string
	 */
	function fileEntry( $url, $date, $priority ) {
		// URLs with a query string (e.g. language variant links) contain
		// "&", which must be entity-escaped to keep the XML well-formed.
		$loc = htmlspecialchars( $url, ENT_QUOTES );

		return
			"\t<url>\n" .
			"\t\t<loc>$loc</loc>\n" .
			"\t\t<lastmod>$date</lastmod>\n" .
			"\t\t<priority>$priority</priority>\n" .
			"\t</url>\n";
	}

	/**
	 * Return the XML required to close sitemap file
	 *
	 * @return string
	 */
	function closeFile() {
		return "</urlset>\n";
	}

	/**
	 * Write a string to stderr followed by a UNIX newline
	 *
	 * @param string $str The message to write
	 */
	function debug( $str ) {
		fwrite( $this->stderr, "$str\n" );
	}

	/**
	 * Populate $this->limit
	 *
	 * Stores the byte length of the file header, of one entry built from a
	 * long sample title (63 four-byte characters plus one three-byte
	 * character) in the given namespace, and of the file footer.
	 *
	 * @param int $namespace The namespace the sample title is built in
	 */
	function generateLimit( $namespace ) {
		$title = Title::makeTitle( $namespace, str_repeat( "\xf0\xa8\xae\x81", 63 ) . "\xe5\x96\x83" );

		$this->limit = array(
			strlen( $this->openFile() ),
			strlen( $this->fileEntry( $title->getFullUrl(), wfTimestamp( TS_ISO_8601, wfTimestamp() ), $this->priority( $namespace ) ) ),
			strlen( $this->closeFile() )
		);
	}
}
// Command line entry point: print usage for --help, otherwise read the
// options via commandLine.inc and run the generator.
if ( in_array( '--help', $argv ) ) {
	echo <<<EOT
Usage: php generateSitemap.php [options]
--help show this message
--fspath=<path> The file system path to save to, e.g /tmp/sitemap/
--server=<server> The protocol and server name to use in URLs, e.g.
http://en.wikipedia.org. This is sometimes necessary because
server name detection may fail in command line scripts.
--compress=[yes|no] compress the sitemap files, default yes
--webpath=<dir> If you are placing the sitemap files in a sub folder
i.e. using the --fspath option and specify somewhere other than root
you need to place here the directory name e.g:
if -fspath = /var/www/httpdocs/mediawiki_sitemaps/ for example
then --webpath = /mediawiki_sitemaps *Note, no trailing / needed
EOT;
	die( -1 );
}

// Must be assigned before commandLine.inc is included, as in the original
// statement order.
$optionsWithArgs = array( 'fspath', 'server', 'compress', 'webpath' );
require_once( dirname( __FILE__ ) . '/commandLine.inc' );

// Optional overrides for the globals used when building URLs
if ( isset( $options['server'] ) ) {
	$wgServer = $options['server'];
}
if ( isset( $options['webpath'] ) ) {
	$wgWebpath = $options['webpath'];
}

// A missing --fspath is passed on as null; compression stays on unless
// --compress is exactly 'no'.
$gs = new GenerateSitemap(
	isset( $options['fspath'] ) ? $options['fspath'] : null,
	!isset( $options['compress'] ) || $options['compress'] !== 'no'
);
$gs->main();
File Metadata
Details
Attached
Mime Type
text/x-php
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3345
Default Alt Text
generateSitemap.php (11 KB)
Attached To
Mode
T11675: sitemap-index doesn't include full location path
Attached
Detach File
Event Timeline
Log In to Comment