diff --git a/generate_sitemap.py b/generate_sitemap.py index 6bec16f..8d02aa5 100644 --- a/generate_sitemap.py +++ b/generate_sitemap.py @@ -24,6 +24,17 @@ "uk": "uk", # Ukrainian } +# User agent for Googlebot +HEADERS = { + #"User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" # gitbook returns 403 +} + +def fetch_sitemap(url): + """Fetch and return the contents of a sitemap.""" + response = requests.get(url, headers=HEADERS, timeout=30) + response.raise_for_status() + return response.text + def prettify_xml(element): """Prettify and return a string representation of the XML.""" rough_string = ET.tostring(element, encoding='utf-8') @@ -32,27 +43,44 @@ def prettify_xml(element): def encode_url(url): """Encode the URL to make it XML-safe and RFC-compliant.""" - return quote(url, safe=":/?&=") + return quote(url, safe=":/?&=") # Leave common URL-safe characters untouched def add_static_urls_without_translations(root, urls): """Add static URLs without translations to the sitemap.""" for url in urls: url_element = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url') - loc = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc') loc.text = encode_url(url) - root.append(url_element) def main(): + # URLs of the sitemaps + book_sitemap_url = "https://book.hacktricks.xyz/sitemap.xml" + cloud_sitemap_url = "https://cloud.hacktricks.xyz/sitemap.xml" + + # Fetch both sitemaps + book_sitemap_data = fetch_sitemap(book_sitemap_url) + cloud_sitemap_data = fetch_sitemap(cloud_sitemap_url) + + # Parse XML + ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'} + book_root = ET.fromstring(book_sitemap_data) + cloud_root = ET.fromstring(cloud_sitemap_data) + + all_urls = book_root.findall('ns:url', ns) + cloud_root.findall('ns:url', ns) + # Prepare the output sitemap ET.register_namespace('', "http://www.sitemaps.org/schemas/sitemap/0.9") ET.register_namespace('xhtml', "http://www.w3.org/1999/xhtml") new_root = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}urlset') - # Add static URLs without translations - static_urls = [ - "https://www.hacktricks.xyz/", + # Add static entry for https://www.hacktricks.xyz/ + add_static_urls_without_translations(new_root, [ + "https://www.hacktricks.xyz/" + ]) + + # Add static URLs for training.hacktricks.xyz without translations + static_training_urls = [ "https://training.hacktricks.xyz/", "https://training.hacktricks.xyz/courses/arte", "https://training.hacktricks.xyz/courses/arta", @@ -66,25 +94,45 @@ def main(): "https://training.hacktricks.xyz/terms", "https://training.hacktricks.xyz/privacy", ] - add_static_urls_without_translations(new_root, static_urls) - - # Add URLs with translations (Example: book.hacktricks.xyz) - url_element = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url') - loc = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc') - loc.text = encode_url("https://book.hacktricks.xyz/") - priority = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}priority') - priority.text = "0.84" - lastmod = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod') - lastmod.text = "2024-12-14" - - # Add translations - for hreflang, lang_path in languages.items(): - alt_link = ET.SubElement(url_element, '{http://www.w3.org/1999/xhtml}link') - alt_link.set('rel', 'alternate') - alt_link.set('hreflang', hreflang) - alt_link.set('href', encode_url(f"https://book.hacktricks.xyz/{lang_path}")) - - new_root.append(url_element) + add_static_urls_without_translations(new_root, static_training_urls) + + # Process main URLs from book and cloud hacktricks sitemaps + for url_element in tqdm(all_urls, desc="Processing URLs"): + loc = url_element.find('ns:loc', ns) + if loc is None: + continue + + loc_text = loc.text.strip() + priority = url_element.find('ns:priority', ns) + lastmod = url_element.find('ns:lastmod', ns) + + # Create a new element + url_entry = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url') + + # Add + loc_el = ET.SubElement(url_entry, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc') + loc_el.text = encode_url(loc_text) + + # Add if available + if priority is not None: + priority_el = ET.SubElement(url_entry, '{http://www.sitemaps.org/schemas/sitemap/0.9}priority') + priority_el.text = priority.text + + # Add if available + if lastmod is not None: + lastmod_el = ET.SubElement(url_entry, '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod') + lastmod_el.text = lastmod.text + + # Add alternate links for translations + base_domain = loc_text.split('/')[0:3] + base_domain = '/'.join(base_domain) + for hreflang, lang_path in languages.items(): + alt_link = ET.SubElement(url_entry, '{http://www.w3.org/1999/xhtml}link') + alt_link.set('rel', 'alternate') + alt_link.set('hreflang', hreflang) + alt_link.set('href', encode_url(f"{base_domain}/{lang_path}")) + + new_root.append(url_entry) # Save prettified XML to file beautified_xml = prettify_xml(new_root)