diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..97b8f45 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +gen.php +sitemap.xml +sitemap-index.xml \ No newline at end of file diff --git a/sitemap-config.php b/sitemap-config.php index 1c863e8..9106729 100644 --- a/sitemap-config.php +++ b/sitemap-config.php @@ -3,7 +3,7 @@ return array( // Site to crawl and create a sitemap for. // https://www.your-domain-name.com/ or http://www.your-domain-name.com/ - "SITE_URL" => "https://student-laptop.nl/", + "SITE_URL" => "https://www.example.com/", // Boolean for crawling external links. // *Domain = https://www.student-laptop.nl* , *Link = https://www.google.com* @@ -22,7 +22,7 @@ // Array with absolute links or keywords for the pages to skip when crawling the given SITE_URL. // https://student-laptop.nl/info/laptops or you can just input student-laptop.nl/info/ and it will not crawl anything in that directory // Try to be as specific as you can so you dont skip 300 pages - "KEYWORDS_TO_SKIP" => array(), + "KEYWORDS_TO_SKIP" => ["/privacy"], // Location + filename where the sitemap will be saved. "SAVE_LOC" => "sitemap.xml", @@ -31,8 +31,10 @@ "PRIORITY" => 1, // Static update frequency - "CHANGE_FREQUENCY" => "daily", + "CHANGE_FREQUENCY" => "monthly", // Date changed (today's date) "LAST_UPDATED" => date('Y-m-d'), + + "VERBOSE" => true, ); diff --git a/sitemap-generator.php b/sitemap-generator.php index 0727ffd..c5bb484 100644 --- a/sitemap-generator.php +++ b/sitemap-generator.php @@ -1,7 +1,6 @@ config = $conf; $this->scanned = []; $this->site_url_base = parse_url($this->config['SITE_URL'])['scheme'] . "://" . parse_url($this->config['SITE_URL'])['host']; $this->sitemap_file = fopen($this->config['SAVE_LOC'], "w"); + + + if (isset($this->config["VERBOSE"]) && filter_var($this->config["VERBOSE"], FILTER_VALIDATE_BOOL)) { + $this->_verbose = (bool)$this->config["VERBOSE"]; + } + + $this->_isCLI = (function_exists("php_sapi_name") && php_sapi_name() === "cli") ? true : false; } - public function GenerateSitemap() - { + public function GenerateSitemap() { // Call the recursive crawl function with the start url. $this->crawlPage($this->config['SITE_URL']); @@ -35,8 +42,7 @@ public function GenerateSitemap() } // Get the html content of a page and return it as a dom object - private function getHtml($url) - { + private function getHtml($url) { // Get html from the given page $curl = curl_init(); curl_setopt($curl, CURLOPT_URL, $url); @@ -52,8 +58,7 @@ private function getHtml($url) } // Recursive function that crawls a page's anchor tags and store them in the scanned array. - private function crawlPage($page_url) - { + private function crawlPage($page_url) { $url = filter_var($page_url, FILTER_SANITIZE_URL); // Check if the url is invalid or if the page is already scanned; @@ -64,7 +69,7 @@ private function crawlPage($page_url) // Add the page url to the scanned array array_push($this->scanned, $page_url); - // Get the html content from the + // Get the html content from the $html = $this->getHtml($url); $anchors = $html->getElementsByTagName('a'); @@ -116,6 +121,9 @@ private function crawlPage($page_url) // Call the function again with the new URL if (!$found) { + if ($this->_isCLI && $this->_verbose) { + echo "\nCrawling next URL: {$next_url}"; + } $this->crawlPage($next_url); } } @@ -124,21 +132,27 @@ private function crawlPage($page_url) // Convert a relative link to a absolute link // Example: Relative /articles // Absolute https://student-laptop.nl/articles - private function convertRelativeToAbsolute($page_base_url, $link) - { + private function convertRelativeToAbsolute($page_base_url, $link) { $first_character = substr($link, 0, 1); if ($first_character == "?" || $first_character == "#") { return $page_base_url . $link; } else if ($first_character != "/") { return $this->site_url_base . "/" . $link; } else { + $baseUrlParts = parse_url($page_base_url); + $linkParts = parse_url($link); + + if (isset($linkParts["host"]) && $linkParts["host"] == $baseUrlParts["host"]) { + return $this->site_url_base . $linkParts["path"]; + } + if (isset($linkParts["scheme"])) + return $link; return $this->site_url_base . $link; } } // Function to generate a Sitemap with the given pages array where the script has run through - private function generateFile($pages) - { + private function generateFile($pages) { $xml = ' @@ -146,13 +160,20 @@ private function generateFile($pages) // Print the amount of pages - echo count($pages); + // if ($this->_isCLI && $this->_verbose) { + echo "\n" . count($pages) . " Pages included in the sitemap\n"; + // } foreach ($pages as $page) { $xml .= "" . $page . " " . $this->config['LAST_UPDATED'] . " " . $this->config['CHANGE_FREQUENCY'] . " " . $this->config['PRIORITY'] . ""; + + if ($this->_isCLI && $this->_verbose) { + echo "\nAdding {$page} to sitemap"; + sleep(1); + } } $xml .= ""; diff --git a/sitemap.xml b/sitemap.xml deleted file mode 100644 index b8fac62..0000000 --- a/sitemap.xml +++ /dev/null @@ -1,59 +0,0 @@ - - - - - - https://student-laptop.nl/ - 2021-03-10 - daily - 1 - - - https://student-laptop.nl/underConstruction - 2021-03-10 - daily - 1 - - - https://student-laptop.nl/article?article_id=1 - 2021-03-10 - daily - 1 - - - https://student-laptop.nl/laptopInfo?laptop_id=7 - 2021-03-10 - daily - 1 - - - https://student-laptop.nl/laptopInfo?laptop_id=6 - 2021-03-10 - daily - 1 - - - https://student-laptop.nl/laptopInfo?laptop_id=5 - 2021-03-10 - daily - 1 - - - https://student-laptop.nl/laptopInfo?laptop_id=4 - 2021-03-10 - daily - 1 - - - https://student-laptop.nl/laptopInfo?laptop_id=3 - 2021-03-10 - daily - 1 - - - https://student-laptop.nl/laptopInfo?laptop_id=2 - 2021-03-10 - daily - 1 - -