Skip to content
Open

Ak #7

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
gen.php
sitemap.xml
sitemap-index.xml
8 changes: 5 additions & 3 deletions sitemap-config.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
return array(
// Site to crawl and create a sitemap for.
// <Syntax> https://www.your-domain-name.com/ or http://www.your-domain-name.com/
"SITE_URL" => "https://student-laptop.nl/",
"SITE_URL" => "https://www.example.com/",

// Boolean for crawling external links.
// <Example> *Domain = https://www.student-laptop.nl* , *Link = https://www.google.com* <When false, google.com will not be crawled>
Expand All @@ -22,7 +22,7 @@
// Array with absolute links or keywords for the pages to skip when crawling the given SITE_URL.
// <Example> https://student-laptop.nl/info/laptops or you can just input student-laptop.nl/info/ and it will not crawl anything in that directory
// Try to be as specific as you can so you don't skip 300 pages
"KEYWORDS_TO_SKIP" => array(),
"KEYWORDS_TO_SKIP" => ["/privacy"],

// Location + filename where the sitemap will be saved.
"SAVE_LOC" => "sitemap.xml",
Expand All @@ -31,8 +31,10 @@
"PRIORITY" => 1,

// Static update frequency
"CHANGE_FREQUENCY" => "daily",
"CHANGE_FREQUENCY" => "monthly",

// Date changed (today's date)
"LAST_UPDATED" => date('Y-m-d'),

"VERBOSE" => true,
);
55 changes: 38 additions & 17 deletions sitemap-generator.php
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
<?php

class SitemapGenerator
{
ob_implicit_flush(true);
class SitemapGenerator {
// Config file with crawler/sitemap options
private $config;

Expand All @@ -15,18 +14,26 @@ class SitemapGenerator
// File where sitemap is written to.
private $sitemap_file;

private $_isCLI;
private $_verbose = false;

// Constructor sets the given file for internal use.
// NOTE(review): this span comes from a PR diff view — the two signatures below
// are the removed (brace-on-next-line) and added (brace-on-same-line) variants
// of the same constructor, not two real declarations.
//
// $conf: associative config array; the keys read here are SITE_URL,
// SAVE_LOC, and (optionally) VERBOSE.
public function __construct($conf)
{
public function __construct($conf) {
// Setup class variables using the config
$this->config = $conf;
$this->scanned = [];
// Rebuild "scheme://host" from SITE_URL; parse_url() is called twice on the
// same value here (once for 'scheme', once for 'host').
$this->site_url_base = parse_url($this->config['SITE_URL'])['scheme'] . "://" . parse_url($this->config['SITE_URL'])['host'];
// Opened in write mode immediately — the sitemap file is truncated as a side
// effect of construction, before any crawling happens.
$this->sitemap_file = fopen($this->config['SAVE_LOC'], "w");


// Verbose flag: only set when VERBOSE exists and filter_var() validates it
// as truthy. NOTE(review): the subsequent (bool) cast re-casts the raw config
// value, not the filter_var() result — e.g. VERBOSE => "no" stays false only
// because the filter_var() guard already rejected it.
if (isset($this->config["VERBOSE"]) && filter_var($this->config["VERBOSE"], FILTER_VALIDATE_BOOL)) {
$this->_verbose = (bool)$this->config["VERBOSE"];
}

// Detect CLI execution so progress echoes can be limited to terminal runs.
// NOTE(review): the "? true : false" ternary is redundant — the comparison is
// already boolean.
$this->_isCLI = (function_exists("php_sapi_name") && php_sapi_name() === "cli") ? true : false;
}

public function GenerateSitemap()
{
public function GenerateSitemap() {
// Call the recursive crawl function with the start url.
$this->crawlPage($this->config['SITE_URL']);

Expand All @@ -35,8 +42,7 @@ public function GenerateSitemap()
}

// Get the html content of a page and return it as a dom object
private function getHtml($url)
{
private function getHtml($url) {
// Get html from the given page
$curl = curl_init();
curl_setopt($curl, CURLOPT_URL, $url);
Expand All @@ -52,8 +58,7 @@ private function getHtml($url)
}

// Recursive function that crawls a page's anchor tags and store them in the scanned array.
private function crawlPage($page_url)
{
private function crawlPage($page_url) {
$url = filter_var($page_url, FILTER_SANITIZE_URL);

// Check if the url is invalid or if the page is already scanned;
Expand All @@ -64,7 +69,7 @@ private function crawlPage($page_url)
// Add the page url to the scanned array
array_push($this->scanned, $page_url);

// Get the html content from the
// Get the html content from the page URL
$html = $this->getHtml($url);
$anchors = $html->getElementsByTagName('a');

Expand Down Expand Up @@ -116,6 +121,9 @@ private function crawlPage($page_url)

// Call the function again with the new URL
if (!$found) {
if ($this->_isCLI && $this->_verbose) {
echo "\nCrawling next URL: {$next_url}";
}
$this->crawlPage($next_url);
}
}
Expand All @@ -124,35 +132,48 @@ private function crawlPage($page_url)
// Convert a relative link to an absolute link
// Example: Relative /articles
// Absolute https://student-laptop.nl/articles
// NOTE(review): this span comes from a PR diff view — the two signatures below
// are the removed and added (brace placement only) variants of one method.
//
// Resolve $link against $page_base_url / the crawl's site base.
//   "?q=1" or "#frag"  -> appended to the current page URL.
//   "articles"          -> resolved against the SITE ROOT, not the current
//                          page's directory. NOTE(review): per RFC 3986 a
//                          bare relative path should resolve against the
//                          page's path — confirm whether root-relative is
//                          intended here.
//   "/path", "//host/p", or "scheme://..." -> handled in the else branch.
private function convertRelativeToAbsolute($page_base_url, $link)
{
private function convertRelativeToAbsolute($page_base_url, $link) {
$first_character = substr($link, 0, 1);
if ($first_character == "?" || $first_character == "#") {
return $page_base_url . $link;
} else if ($first_character != "/") {
return $this->site_url_base . "/" . $link;
} else {
// Only links starting with "/" reach here; a host is present only for
// protocol-relative links ("//host/path").
$baseUrlParts = parse_url($page_base_url);
$linkParts = parse_url($link);

// Same-host protocol-relative link: rebuilt from path only.
// NOTE(review): any "query" or "fragment" component of $link is dropped
// here — only $linkParts["path"] is kept.
if (isset($linkParts["host"]) && $linkParts["host"] == $baseUrlParts["host"]) {
return $this->site_url_base . $linkParts["path"];
}
// NOTE(review): a link that starts with "/" can never carry a scheme, so
// parse_url() will not set "scheme" for any input reaching this branch —
// this check looks unreachable (dead code); verify.
if (isset($linkParts["scheme"]))
return $link;
// NOTE(review): for a FOREIGN-host protocol-relative link ("//other.com/x")
// this concatenates to "https://site//other.com/x" — a malformed URL.
// The intended case is a root-relative path ("/path").
return $this->site_url_base . $link;
}
}

// Function to generate a Sitemap with the given pages array where the script has run through
private function generateFile($pages)
{
private function generateFile($pages) {
$xml = '<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<!-- ' . count($pages) . ' total pages-->
<!-- PHP-sitemap-generator by https://github.com/tristangoossens -->';


// Print the amount of pages
echo count($pages);
// if ($this->_isCLI && $this->_verbose) {
echo "\n" . count($pages) . " Pages included in the sitemap\n";
// }

foreach ($pages as $page) {
$xml .= "<url><loc>" . $page . "</loc>
<lastmod>" . $this->config['LAST_UPDATED'] . "</lastmod>
<changefreq>" . $this->config['CHANGE_FREQUENCY'] . "</changefreq>
<priority>" . $this->config['PRIORITY'] . "</priority></url>";

if ($this->_isCLI && $this->_verbose) {
echo "\nAdding {$page} to sitemap";
sleep(1);
}
}

$xml .= "</urlset>";
Expand Down
59 changes: 0 additions & 59 deletions sitemap.xml

This file was deleted.