Skip to content

Commit

Permalink
Update generate_sitemap.py
Browse files Browse the repository at this point in the history
  • Loading branch information
carlospolop authored Dec 16, 2024
1 parent 936e7b7 commit 7a9a3de
Showing 1 changed file with 73 additions and 25 deletions.
98 changes: 73 additions & 25 deletions generate_sitemap.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,17 @@
"uk": "uk", # Ukrainian
}

# User agent for Googlebot
HEADERS = {
#"User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" # gitbook returns 403
}

def fetch_sitemap(url):
"""Fetch and return the contents of a sitemap."""
response = requests.get(url, headers=HEADERS, timeout=30)
response.raise_for_status()
return response.text

def prettify_xml(element):
"""Prettify and return a string representation of the XML."""
rough_string = ET.tostring(element, encoding='utf-8')
Expand All @@ -32,27 +43,44 @@ def prettify_xml(element):

def encode_url(url):
"""Encode the URL to make it XML-safe and RFC-compliant."""
return quote(url, safe=":/?&=")
return quote(url, safe=":/?&=") # Leave common URL-safe characters untouched

def add_static_urls_without_translations(root, urls):
"""Add static URLs without translations to the sitemap."""
for url in urls:
url_element = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url')

loc = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
loc.text = encode_url(url)

root.append(url_element)

def main():
# URLs of the sitemaps
book_sitemap_url = "https://book.hacktricks.xyz/sitemap.xml"
cloud_sitemap_url = "https://cloud.hacktricks.xyz/sitemap.xml"

# Fetch both sitemaps
book_sitemap_data = fetch_sitemap(book_sitemap_url)
cloud_sitemap_data = fetch_sitemap(cloud_sitemap_url)

# Parse XML
ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
book_root = ET.fromstring(book_sitemap_data)
cloud_root = ET.fromstring(cloud_sitemap_data)

all_urls = book_root.findall('ns:url', ns) + cloud_root.findall('ns:url', ns)

# Prepare the output sitemap
ET.register_namespace('', "http://www.sitemaps.org/schemas/sitemap/0.9")
ET.register_namespace('xhtml', "http://www.w3.org/1999/xhtml")
new_root = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}urlset')

# Add static URLs without translations
static_urls = [
"https://www.hacktricks.xyz/",
# Add static entry for https://www.hacktricks.xyz/
add_static_urls_without_translations(new_root, [
"https://www.hacktricks.xyz/"
])

# Add static URLs for training.hacktricks.xyz without translations
static_training_urls = [
"https://training.hacktricks.xyz/",
"https://training.hacktricks.xyz/courses/arte",
"https://training.hacktricks.xyz/courses/arta",
Expand All @@ -66,25 +94,45 @@ def main():
"https://training.hacktricks.xyz/terms",
"https://training.hacktricks.xyz/privacy",
]
add_static_urls_without_translations(new_root, static_urls)

# Add URLs with translations (Example: book.hacktricks.xyz)
url_element = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url')
loc = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
loc.text = encode_url("https://book.hacktricks.xyz/")
priority = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}priority')
priority.text = "0.84"
lastmod = ET.SubElement(url_element, '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod')
lastmod.text = "2024-12-14"

# Add translations
for hreflang, lang_path in languages.items():
alt_link = ET.SubElement(url_element, '{http://www.w3.org/1999/xhtml}link')
alt_link.set('rel', 'alternate')
alt_link.set('hreflang', hreflang)
alt_link.set('href', encode_url(f"https://book.hacktricks.xyz/{lang_path}"))

new_root.append(url_element)
add_static_urls_without_translations(new_root, static_training_urls)

# Process main URLs from book and cloud hacktricks sitemaps
for url_element in tqdm(all_urls, desc="Processing URLs"):
loc = url_element.find('ns:loc', ns)
if loc is None:
continue

loc_text = loc.text.strip()
priority = url_element.find('ns:priority', ns)
lastmod = url_element.find('ns:lastmod', ns)

# Create a new <url> element
url_entry = ET.Element('{http://www.sitemaps.org/schemas/sitemap/0.9}url')

# Add <loc>
loc_el = ET.SubElement(url_entry, '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
loc_el.text = encode_url(loc_text)

# Add <priority> if available
if priority is not None:
priority_el = ET.SubElement(url_entry, '{http://www.sitemaps.org/schemas/sitemap/0.9}priority')
priority_el.text = priority.text

# Add <lastmod> if available
if lastmod is not None:
lastmod_el = ET.SubElement(url_entry, '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod')
lastmod_el.text = lastmod.text

# Add alternate links for translations
base_domain = loc_text.split('/')[0:3]
base_domain = '/'.join(base_domain)
for hreflang, lang_path in languages.items():
alt_link = ET.SubElement(url_entry, '{http://www.w3.org/1999/xhtml}link')
alt_link.set('rel', 'alternate')
alt_link.set('hreflang', hreflang)
alt_link.set('href', encode_url(f"{base_domain}/{lang_path}"))

new_root.append(url_entry)

# Save prettified XML to file
beautified_xml = prettify_xml(new_root)
Expand Down

0 comments on commit 7a9a3de

Please sign in to comment.