#!/usr/bin/env python3

import re
import os
import sys
import shutil
import requests
from bs4 import BeautifulSoup, Comment
from urllib.parse import urljoin, urlparse, parse_qs
import internetarchive as ia
from datetime import datetime
from packaging.version import Version

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:128.0) Gecko/20100101 Firefox/128.0',
}

ignore_urls = re.compile(r'\.(ogv|oga|ogg|webm|mp4|zim|mp3|wav|asf|ai|sig|txt|ppt|docx?|xls|csv|png|gif|jpe?g|zip|ps|pdf|exe|deb|dsc|gz|bz2?|xz|zstd|lz|tar(\.(gz|bz2?|xz|zstd|lz))?|t[bgx]z)(\?|$)', re.IGNORECASE)

wiki_string_detector = re.compile(r'wiki|docs|documentation', re.IGNORECASE)

wiki_url_detector = re.compile(
    r'''
    wiki|
    doku|
    docs|
    /lib/images/interwiki/|
    /start$|
    Special:|
    User:|
    Hauptseite|
    /images/[0-9a-z]/[0-9a-z][0-9a-z]/[^/]+$|
    /images/thumb/[0-9a-z]/[0-9a-z][0-9a-z]/[^/]+/[^/]+|
    \.fandom\.com|
    Main_Page
    ''',
    re.VERBOSE | re.IGNORECASE
)

mediawiki_href = {'href': re.compile(r'^(?:https?:)?//www\.mediawiki\.org/$')}

dead_hostnames = '''
localhost
alioth.debian.org
anonscm.debian.org
wikiwand.com
www.wikiwand.com
wiki.archiveteam.org
mastodon.social
'''.split()
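# dead_hostnames are rejected outright as no longer online; ignored_hostnames
# (defined next, and including dead_hostnames) are hosts that are still up but
# are never treated as wiki candidates, e.g. code forges, social media and
# large well-known wikis.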
ignored_hostnames = dead_hostnames + '''
forge-allura.apache.org wiki.lineageos.org wiki.archiveteam.org gitlab.com gitlab.gnome.org gitlab.winehq.org
gitlab.torproject.org gitlab.haskell.org airvpn.org www.mediawiki.org botwiki.org www.patreon.com
dev.e-taxonomy.eu www.redmine.org redmine.org alioth-lists-archive.debian.net glyphwiki.org en.glyphwiki.org
lists.alioth.debian.org phabricator.wikimedia.org www.facebook.com www.linkedin.com www.instagram.com wiki.natenom.de
diff.wikimedia.org techblog.wikimedia.org debathena.mit.edu www.twitter.com search.twitter.com mobile.twitter.com
medium.com blog.wikimedia.org wordpress.com josm.openstreetmap.de wordpress.org cas.fsf.org
www.x.com facebook.com linkedin.com instagram.com twitter.com x.com
gitlab.freedesktop.org secure.freedesktop.org salsa.debian.org packages.debian.org bugs.debian.org deb.debian.org
ftp.debian.org packages.qa.debian.org tracker.debian.org redmine.replicant.us clang.debian.net metadata.ftp-master.debian.org
snapshot.debian.org security.debian.org piuparts.debian.org release.debian.org patches.ubuntu.com people.debian.org
qa.debian.org screenshots.debian.net sources.debian.org udd.debian.org fuse.wikichip.org gcc.gnu.org
fosstodon.org xwiki.org opencollective.com www.xwiki.org extensions.xwiki.org jira.xwiki.org
forum.xwiki.org commons.xwiki.org l10n.xwiki.org platform.xwiki.org rendering.xwiki.org twitter.com
commons.xwiki.org design.xwiki.org dev.xwiki.org slashdot.org rss.slashdot.org news.slashdot.org
download.kiwix.org www.debian.org wikipedia.fivefilters.org bugs.launchpad.net launchpad.net code.launchpad.net
buildd.debian.org ci.debian.net debtags.debian.org i18n.debian.org codeberg.org osmocom.org
projects.osmocom.org www.osmocom.org docs.github.com github.com www.bcn.cl archive.org
www.archive.org emacswiki.org www.emacswiki.org netsplit.de freedesktop.org www.freedesktop.org
wiki.freedesktop.org cgit.freedesktop.org en.wikipedia.org es.wikipedia.org de.wikipedia.org fr.wikipedia.org
lists.wikimedia.org de.wikipedia.org wikipedia.org x.org www.x.org ikiwiki.info
www.ikiwiki.info wikibot.digitaldragon.dev cdn.digitaldragon.dev tracker.debian.org dev.haiku-os.org old.reddit.com
www.reddit.com wiki.ircforever.org wiki.ircnow.org wiki.debian.org wiki.gnome.org wiki.freebsd.org
accounts.google.com wiki.netbsd.org wiki.ubuntu.com wiki.ubuntu-it.org wiki.mercurial-scm.org wiki.ros.org
wiki.list.org wiki.python.org wm-bot.wmcloud.org sourceforge.net sf.net en.sourceforge.jp
osdn.net en.osdn.net ja.osdn.net code.google.com www.google.com etherpad.wikimedia.org
readthedocs.org breezewiki.com wikibruce.com www.flickr.com flickr.com www.youtube.com
youtube.com youtu.be wikitrans.co fb.me
'''.split()

requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

# Treat a wiki as "archived recently" if IA has a dump from this year or last year
max_year = datetime.now().year - 1

cmd_fn = os.path.expanduser('~/wikibot-commands.txt')


class requests_ContentLengthTooLargeException(requests.RequestException):
    'Content-Length header indicates a size that is too large'


class requests_ContentTooLargeException(requests.RequestException):
    'Content size that is too large'


# FIXME: replace this when a better option becomes available
# https://github.com/psf/requests/issues/1751
def requests_get_limit(url, *args, **kwargs):
    max_size = 1024*1024*10  # 10MiB
    max_block_size = 1024*1024*1  # 1MiB
    r = requests.get(url, stream=True, *args, **kwargs)
    content_size = r.headers.get('Content-Length', 0)
    content_size = int(content_size)
    if content_size >= max_size:
        r.close()
        raise requests_ContentLengthTooLargeException()
    total_size = 0
    r._content = b''
    for block in r.iter_content(max_block_size):
        block_size = len(block)
        total_size += block_size
        if total_size > max_size:
            r.close()
            raise requests_ContentTooLargeException()
        r._content += block
    r.close()
    return r


def mw_new(url):
    return f'!mw --url {url} --xml --xmlrevisions --images --queue bulk'


def mw_old(url):
    return f'!mw --url {url} --xml --xmlapiexport --images --queue bulk'


def mw_ancient(url):
    return f'!mw --url {url} --xml --images --queue bulk'


def dw(url):
    return f'!dw --url {url} --auto --queue bulk'


def pw(url):
    return f'!pw --url {url} --auto --queue bulk'


indent = ''


def err(msg):
    print(indent+msg, file=sys.stderr)
    print(indent)
    return False


def warn(msg):
    print(indent+msg, file=sys.stderr)


def head(msg):
    length = shutil.get_terminal_size()[0] - len(indent+msg) - 1
    print(indent+msg, length * '-', file=sys.stderr)


mw_minerva_re = re.compile(r'"wgMinervaMenuData"\s*:\s*{\s*"(?:groups|discovery)"\s*:\s*(?:\[\s*)*{\s*"name"\s*:\s*"home"\s*,\s*"components"\s*:\s*\[\s*{\s*(?:"text"\s*:\s*"[^"]*"\s*,\s*)?"href"\s*:\s*"([^"]+)"')


def check_mw_minerva(tag):
    if tag.name == 'script':
        if tag.string and 'wgMinervaMenuData' in tag.string:
            match = mw_minerva_re.search(tag.string)
            if match:
                tag['href'] = match.group(1)
                return True


def check_mw_p_logo_commented(tag):
    if tag.name == 'div' and \
       tag.get('id') == 'p-logo' and \
       tag.get('role') == 'banner' and \
       isinstance(tag.string, Comment):
        doc = BeautifulSoup(tag.string, 'html.parser')
        link = doc.find('a')
        tag['href'] = link['href']
        return True


def check_pw_header_logo(tag):
    if tag.name == 'div' and \
       tag.get('id') == 'header' and \
       tag.a.img.get('id') == 'logo':
        tag['href'] = tag.a['href']
        return True


def check_wiki_urls_on_page(url, doc, wiki_links, error):
    if not wiki_links:
        return err(error)
    links = doc.find_all('a', {'href': True}, string=wiki_string_detector)
    links += doc.find_all('a', {'href': wiki_url_detector})
    if links:
        warn(error)
        err(f'Found {len(links)} link(s) to wikis in {url}')
        global indent
        indent = ' '
        links = sorted(set([urljoin(url, a['href']) for a in links]))
        for link in links:
            check_url(link, False)
        indent = ''
        return err(f'Done checking link(s) to wikis in {url}')
    else:
        return err(error)


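# check_url() does all of the work for a single candidate URL: it normalises
# the URL, downloads the page, identifies the wiki engine from the generator
# metadata (with engine-specific fallbacks), locates the main page, skips
# wikis that the Internet Archive already has a recent copy of, and finally
# appends a #wikibot command to ~/wikibot-commands.txt.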
def check_url(url, wiki_links=True):
    # Whitespace usually isn't part of URLs so drop it
    url = url.strip()
    # Blank URLs cannot be fetched
    if not url:
        return err(f'URL blank: {url}')
    # Heading for the URL
    head(url)
    # Strip off stuff that masks the main page
    url = re.sub(r'/lib/exe/taskrunner\.php\?.*|/File:.*|/api\.php$', r'/', url)
    if ignore_urls.search(url):
        return err(f'URL ignored {url}')
    parsed_url = urlparse(url)
    hostname = parsed_url.hostname
    path = parsed_url.path
    if not hostname:
        return err(f'Could not find hostname from {url}')
    # Unwrap Google redirector URLs
    if hostname in ('google.com', 'www.google.com', 'encrypted.google.com') and path == '/url':
        query = parse_qs(parsed_url.query)
        if 'url' in query:
            url = urljoin(url, query['url'][0])
            parsed_url = urlparse(url)
            hostname = parsed_url.hostname
            path = parsed_url.path
        elif 'q' in query:
            url = urljoin(url, query['q'][0])
            parsed_url = urlparse(url)
            hostname = parsed_url.hostname
            path = parsed_url.path
    # Redirect Google Translate to the original URL
    if re.search(r'\.translate\.goog$', hostname):
        response = requests.get(url, headers=headers, timeout=60)
        doc = BeautifulSoup(response.text, 'html.parser')
        link = doc.find('base') or doc.find('a', string='Go to original page')
        if link:
            url = urljoin(url, link['href'])
            parsed_url = urlparse(url)
            hostname = parsed_url.hostname
        else:
            return err('Could not find original URL from Google Translate')
    if hostname in dead_hostnames:
        return err(f'This site is no longer online {hostname}')
    if not wiki_links and (hostname in ignored_hostnames or
                           re.search(r'^[-a-z0-9]+\.((github|readthedocs)\.io|atlassian\.net)$', hostname) or
                           re.search(r'^[-a-z0-9]+\.readthedocs\.org$', hostname) or
                           re.search(r'^[-a-z0-9]+\.blogspot\.', hostname) or
                           re.search(r'^[-a-z0-9]+\.(wordpress|tumblr|wikidot|medium|substack)\.com$', hostname) or
                           re.search(r'^(source\.)?[-a-z0-9]+\.branchable\.com$', hostname)):
        return err(f'No supported wikis here {hostname}')

    # Download the URL for subsequent detection of TLS validity, wiki type, main page
    # FIXME: handle meta refresh
    try:
        try:
            response = requests_get_limit(url, headers=headers, timeout=60)
            secure = True
        except requests.exceptions.SSLError:
            response = requests_get_limit(url, headers=headers, timeout=60, verify=False)
            secure = False
    except requests.packages.urllib3.exceptions.LocationParseError as e:
        return err(f'URL parse error for {url} is {e}')
    except requests.Timeout as e:
        return err(f'Timed out on {url}')
    except requests_ContentLengthTooLargeException as e:
        return err(f'Data to be downloaded for {url} is too large {e}')
    except requests_ContentTooLargeException as e:
        return err(f'Data that was downloaded for {url} is too large {e}')
    except requests.RequestException as e:
        return err(f'Download failure for {url} is {e}')

    # Prefer the redirect target to the original URL
    url = response.url
    if ignore_urls.search(url):
        return err(f'URL ignored {url}')
    hostname = urlparse(url).hostname
    if not hostname:
        return err(f'Could not find hostname from {url}')

    # Parse the HTML response
    doc = BeautifulSoup(response.text, 'html5lib')

    if hostname in ignored_hostnames or \
       re.search(r'^[-a-z0-9]+\.((github|readthedocs)\.io|atlassian\.net)$', hostname) or \
       re.search(r'^[-a-z0-9]+\.readthedocs\.org$', hostname) or \
       re.search(r'^[-a-z0-9]+\.blogspot\.', hostname) or \
       re.search(r'^[-a-z0-9]+\.(wordpress|tumblr|wikidot|medium|substack)\.com$', hostname) or \
       re.search(r'^(source\.)?[-a-z0-9]+\.branchable\.com$', hostname):
        return check_wiki_urls_on_page(url, doc, wiki_links, f'No supported wikis here {hostname}')

    # Find the wiki type
    generator = doc.find('meta', {'name': 'generator'})
    if generator:
        generator = generator['content']
    else:
        generator = ''
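    # When the generator <meta> tag is missing or disabled, fall back to
    # engine-specific markup below: the PukiWiki footer, the DokuWiki footer
    # badge, MediaWiki's Special:Version page, its powered-by icon, and
    # finally the "mediawiki" body class.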
    # Detect PukiWiki, which does not use the meta generator
    if not generator:
        footer = doc.find('div', {'id': 'footer'})
        if not footer:
            footer = doc.find('div', {'id': 'footer2'})
        if footer:
            pukiwiki_version = footer.find_next('strong')
            if pukiwiki_version and \
               pukiwiki_version.string and \
               pukiwiki_version.string.startswith('PukiWiki '):
                generator = pukiwiki_version.string
            else:
                pukiwiki_version = re.search(r'(PukiWiki)"? *(?:Plus!?)? *(\d\.\d[^ ]*)', footer.text)
                if pukiwiki_version:
                    generator = ' '.join(pukiwiki_version.groups())

    # Detect DokuWikis where the generator metadata is disabled
    if not generator:
        footer = doc.find('div', {'id': 'dokuwiki__footer'})
        if footer:
            if footer.find('a', {'title': 'Driven by DokuWiki'}) or \
               footer.find('img', {'alt': 'Driven by DokuWiki'}):
                generator = 'DokuWiki'

    # Detect MediaWikis where the generator metadata is disabled but Special:Version is enabled
    if not generator:
        version_page = None
        if not version_page:
            if doc.find('h1', {'class': 'firstHeading'}, string='Version'):
                version_page = doc
        if not version_page:
            feed = doc.find('link', {'rel': 'alternate', 'type': 'application/atom+xml'})
            if feed:
                feed_page = urljoin(url, feed['href'])
                version_page = re.sub(r'(\?title=).*', r'\1Special:Version', feed_page)
        if not version_page:
            special_re = re.compile(r'(/|\?title=)Special:([^?]+)')
            special = doc.find('a', {'href': special_re})
            if special:
                special_page = urljoin(url, special['href'])
                version_page = special_re.sub(r'\1Special:Version', special_page)
        if isinstance(version_page, str):
            try:
                version_page_hostname = urlparse(version_page).hostname
                if version_page_hostname != hostname:
                    raise ValueError(f'mismatched hostnames {version_page_hostname} != {hostname}')
                version_page = requests.get(version_page, headers=headers, timeout=60, verify=secure)
                version_page = BeautifulSoup(version_page.text, 'html.parser')
            except Exception:
                version_page = None
        if version_page:
            mediawiki_links = version_page.find_all('a', mediawiki_href, string='MediaWiki', limit=2)
            if mediawiki_links:
                mediawiki_link = mediawiki_links[-1]
                if mediawiki_link.parent.name == 'td':
                    generator = mediawiki_link.parent.parent.text.strip().replace('\n', ' ')
                elif mediawiki_link.parent.name == 'li':
                    generator = mediawiki_link.parent.text.strip().replace(': ', ' ')
                generator = ' '.join(generator.split())
                generator = re.sub(r'MediaWiki(\d)', r'MediaWiki \1', generator)

    # Detect MediaWiki where the generator metadata and Special:Version are both disabled
    if not generator:
        footer = doc.find('div', {'id': 'footer'})
        if footer:
            poweredby = footer.find('div', {'id': 'f-poweredbyico'})
            if poweredby:
                mediawiki = poweredby.find('a', mediawiki_href)
                if mediawiki:
                    poweredbyimg = mediawiki.find('img', {'alt': 'MediaWiki'})
                    if poweredbyimg:
                        generator = 'MediaWiki'

    # Detect MediaWiki via the body class when all of the above markers are disabled
    if not generator:
        if doc.find('body', {'class': 'mediawiki'}):
            generator = 'MediaWiki'

    # Extract the software from the generator
    if generator:
        software = generator.split()[0]
    else:
        software = ''

    # Select the #wikibot command generator and main page detector list
    cmd = None
    if software == 'MediaWiki':
        mediawiki = generator.split()
        mediawiki = mediawiki[1] if len(mediawiki) > 1 else '1.27'
        mediawiki = mediawiki.split('-')[0]
        mediawiki = re.sub(r'wmf\d+$', r'', mediawiki)
        if mediawiki == 'MediaWiki':
            cmd = mw_new
        elif Version(mediawiki) >= Version('1.27'):
            cmd = mw_new
        elif Version(mediawiki) >= Version('1.16'):
            cmd = mw_old
        else:
            cmd = mw_ancient
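        # The version thresholds above pick the dump flags: 1.27 and newer get
        # --xmlrevisions, 1.16 and newer get --xmlapiexport, anything older
        # gets a plain --xml dump; a missing or unparsed version defaults to
        # the modern path.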
        main_page_locations = [
            ('li', {'id': 'n-mainpage-description'}, 'a', 'href'),
            ('a', {'id': 'n-mainpage-description'}, None, 'href'),
            ('li', {'id': 'n-mainpage'}, 'a', 'href'),
            ('a', {'data-tracking': 'explore-main-page'}, None, 'href'),
            (check_mw_minerva, None, None, 'href'),
            ('li', {'id': 'n-Home'}, 'a', 'href'),
            ('li', {'id': 'n-Main_page'}, 'a', 'href'),
            ('a', {'class': 'artbase-home'}, None, 'href'),
            ('a', {'class': 'mw-logo'}, None, 'href'),
            ('a', {'class': 'mw-wiki-logo'}, None, 'href'),
            ('a', {'id': 'p-banner'}, None, 'href'),
            ('div', {'id': 'p-logo'}, 'a', 'href'),
            ('div', {'id': 'p-logo-container'}, 'a', 'href'),
            ('li', {'id': 'p-logo'}, 'a', 'href'),
            ('div', {'id': 'wiki_logo'}, 'a', 'href'),
            ('div', {'id': 'site-logo'}, 'a', 'href'),
            ('div', {'class': 'sitename-logo'}, 'a', 'href'),
            ('div', {'class': 'toplogo'}, 'a', 'href'),
            ('em', {'class': 'wiki_icon'}, None, 'href', True),
            ('a', {'id': 'header_logo'}, None, 'href', True),
            ('a', {'id': 'footer_logo'}, None, 'href', True),
            ('li', {'class': 'icon-home'}, 'a', 'href', True),
            (check_mw_p_logo_commented, None, None, 'href'),
            ('a', {'title': 'Visit the main page'}, None, 'href'),
            ('div', {'class': 'rpw-site-name'}, 'a', 'href'),
            ('div', {'class': 'cosmos-header__sitename'}, 'a', 'href'),
            ('div', {'id': 'p-navigation'}, 'a', 'href'),
            ('div', {'id': 'bs-navigation'}, 'a', 'href'),
            ('div', {'class': 'branding-box'}, 'a', 'href'),
            ('p', {'id': 'sitetitle'}, 'a', 'href'),
            ('h2', {'id': 'section_title'}, 'a', 'href'),
            ('div', {'class': 'title-name'}, 'a', 'href'),
            ('h1', {'class': 'title-name'}, 'a', 'href'),
            ('h2', {'class': 'title-name'}, 'a', 'href'),
            ('h3', {'class': 'title-name'}, 'a', 'href'),
            ('h4', {'class': 'title-name'}, 'a', 'href'),
            ('div', {'class': re.compile('[a-z]+-head__logo')}, 'a', 'href'),
            ('div', {'class': re.compile('[a-z]+-header-logo')}, 'a', 'href'),
            ('div', {'id': re.compile('[a-z]+-head__logo')}, 'a', 'href'),
            ('div', {'id': re.compile('[a-z]+-header-logo')}, 'a', 'href'),
            ('meta', {'name': 'twitter:url'}, None, 'content'),
            ('a', {'id': 'logo'}, None, 'href'),
            ('a', {'class': re.compile('[a-z]+-logo')}, None, 'href'),
            ('div', {'id': 'logo'}, 'a', 'href'),
            ('span', {'id': 'logo'}, 'a', 'href'),
            ('div', {'class': 'logo'}, 'a', 'href'),
            ('li', {'class': 'logo'}, 'a', 'href'),
            ('a', {'accesskey': 'z'}, None, 'href'),
            ('div', {'id': 'p-nav'}, 'a', 'href'),
            ('div', {'id': 'topbar'}, 'a', 'href'),
            ('a', {'class': 'navbar-brand'}, None, 'href'),
            ('a', {'class': 'brand'}, None, 'href'),
            ('div', {'class': 'navbar'}, 'a', 'href'),
            ('nav', {'class': 'navbar'}, 'a', 'href'),
            ('a', {'class': 'wiki-link'}, None, 'href'),
            ('form', {'id': 'searchform'}, None, 'action'),
            (None, None, None, None),
        ]
    elif software == 'DokuWiki':
        cmd = dw
        main_page_locations = [
            ('span', {'class': 'home'}, 'a', 'href'),
            ('li', {'class': 'home'}, 'a', 'href'),
            ('form', {'id': 'dw__search'}, None, 'action'),
            ('link', {'rel': 'start'}, None, 'href'),
            ('a', {'accesskey': 'h'}, None, 'href'),
            ('a', {'class': 'navbar-brand'}, None, 'href'),
            (None, None, None, None),
        ]
    elif software == 'PukiWiki':
        cmd = pw
        main_page_locations = [
            (check_pw_header_logo, None, None, 'href'),
            ('img', {'src': 'image/top.png'}, None, 'href', True),
        ]
    elif generator.strip():
        return check_wiki_urls_on_page(url, doc, wiki_links, f'Generator {software} "{generator}" for {url} has no extractor {response.status_code} {response.reason} {len(response.text)}b')
    else:
        return check_wiki_urls_on_page(url, doc, wiki_links, f'Could not find a generator for {url} {response.status_code} {response.reason} {len(response.text)}b')

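    # Each main_page_locations entry is (tag, attrs, subtag, attribute) with an
    # optional trailing flag that makes the lookup go through the matched tag's
    # parent; entries whose first element is a callable (check_mw_minerva and
    # friends) act as BeautifulSoup function filters that stash the discovered
    # URL in the tag's href attribute. A final (None, None, None, None) entry
    # falls back to the fetched URL itself.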
    # Detect the main page link
    for (tag, search, subtag, attr, *parent) in main_page_locations:
        try:
            if tag and search and parent and subtag and attr:
                main = doc.find(tag, search).parent.find(subtag).get(attr)
            elif tag and search and parent and attr:
                main = doc.find(tag, search).parent.get(attr)
            elif tag and search and subtag and attr:
                main = doc.find(tag, search).find(subtag).get(attr)
            elif tag and search and attr:
                main = doc.find(tag, search).get(attr)
            elif tag and attr:
                main = doc.find(tag).get(attr)
            else:
                err(f'No main page found for {url}')
                main = url
            if not main:
                continue
            url = urljoin(url, main)
            break
        except (AttributeError, TypeError):
            continue

    if url == 'https://community.fandom.com/wiki/Special:NotAValidWiki':
        return err('Not a valid Fandom wiki')

    # Check if there are recent saves on IA
    url = urlparse(url)
    if not url.hostname:
        return err(f'Parse failure: {url}')
    # FIXME: check the size of the images/etc
    domain = url.hostname.removeprefix('wiki.')
    query = f'originalurl:*{domain}* OR originalurl:{domain} OR originalurl:wiki.{domain}'
    fields = 'addeddate,originalurl'.split(',')
    params = {'rows': 1}
    sorts = ['addeddate desc']
    search = ia.search_items(query, fields=fields, sorts=sorts, params=params)
    for result in search:
        originalurl = result['originalurl']
        year = datetime.fromisoformat(result['addeddate']).year
        if year >= max_year:
            return err(f'Archived recently {year} {originalurl}')

    # Save the generated wiki command
    # FIXME: divert wikis mentioned on IRC to a maybe-failed file
    url = url.geturl()
    cmd = cmd(url)
    if not secure:
        cmd += ' --insecure'
    print(indent+cmd)
    with open(cmd_fn, 'a') as cmd_f:
        print(cmd, file=cmd_f)
    print()


for url in sys.stdin:
    check_url(url)
for url in sys.argv[1:]:
    check_url(url)
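
# Usage sketch (URL and filename are placeholders): pipe candidate URLs on
# stdin or pass them as arguments, e.g.
#   echo 'https://wiki.example.org/' | ./find-wikis.py
#   ./find-wikis.py 'https://wiki.example.org/'
# Detected wikis are printed and appended to ~/wikibot-commands.txt as
# #wikibot commands such as:
#   !mw --url https://wiki.example.org/ --xml --xmlrevisions --images --queue bulk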