esp-idf/tools/ci/check_readme_links.py

#!/usr/bin/env python
#
# Checks that all links in the readme markdown files are valid
#
# SPDX-FileCopyrightText: 2020-2022 Espressif Systems (Shanghai) CO LTD
# SPDX-License-Identifier: Apache-2.0
#

import argparse
import concurrent.futures
import os
import os.path
import re
import sys
import urllib.error
import urllib.request
from collections import defaultdict, namedtuple
from pathlib import Path
from typing import List

# Web links that are removed before the web check: the Apple App Store pages are,
# for some reason, not accessible from the company network.
EXCLUDE_URL_LIST = [
    'https://apps.apple.com/in/app/esp-ble-provisioning/id1473590141',
    'https://apps.apple.com/in/app/esp-softap-provisioning/id1474040630',
]

# One markdown link: `file` is the markdown file it was found in, `url` is its target.
Link = namedtuple('Link', ('file', 'url'))


class ReadmeLinkError(Exception):
    """Base class for the broken-link errors reported by this script.

    Attributes:
        file: where the link was found.
        url: the link target.
    """

    def __init__(self, file: str, url: str) -> None:
        # NOTE: despite the `str` annotation, the callers in this file pass a pathlib.Path
        # (relative links) or a list of paths (web links) as `file`.
        self.file, self.url = file, url


class RelativeLinkError(ReadmeLinkError):
    """A relative link whose target was not found on disk."""

    def __str__(self) -> str:
        # `url` is the missing target, `file` is the markdown file that links to it.
        return f'Relative link error, file - {self.url} not found, linked from {self.file}'


class UrlLinkError(ReadmeLinkError):
    """A web link whose request failed.

    Attributes:
        file: the markdown file(s) containing the link. The caller in this file passes a
            list of paths; a single string or path is accepted as well.
        url: the web address that could not be accessed.
        error_code: textual description of the failed request.
    """

    def __init__(self, file: str, url: str, error_code: str) -> None:
        self.error_code = error_code
        super().__init__(file, url)

    def __str__(self) -> str:
        # A lone str must not be iterated character by character, and a lone path is not
        # iterable at all, so wrap either of them before building the file list.
        if isinstance(self.file, (str, os.PathLike)):
            files = [str(self.file)]
        else:
            files = [str(f) for f in self.file]
        return 'URL error, url - {} in files - {} is not accessible, request returned {}'.format(self.url, ', '.join(files), self.error_code)


def check_url(url: str, files: str, timeout: float) -> None:
    """Try to open `url` and raise only when the server answers with HTTP 404.

    We do not want a failed test just due to bad network conditions, so every other HTTP
    error and every other exception raised while opening the URL is only printed as a warning.

    Args:
        url: web address to open.
        files: passed on unchanged to UrlLinkError (the caller in this file passes the list
            of markdown files that contain `url`).
        timeout: timeout handed to urllib.request.urlopen.

    Raises:
        UrlLinkError: if the request fails with HTTP status 404.
    """
    try:
        with urllib.request.urlopen(url, timeout=timeout):
            return
    except urllib.error.HTTPError as http_err:
        if http_err.code != 404:
            # Not a definite "page does not exist": warn and carry on
            print(f'Unable to access {url}, err = {http_err!s}')
            return
        raise UrlLinkError(files, url, str(http_err))
    except Exception as err:
        print(f'Unable to access {url}, err = {err!s}')


def check_web_links(web_links: defaultdict) -> List:
    """Run check_url() on every web link using a pool of 10 worker threads.

    Args:
        web_links: mapping of URL -> files that contain this URL.

    Returns:
        The UrlLinkError raised for each URL that failed the check, in completion order.
    """
    failures = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        pending = [pool.submit(check_url, url, files, timeout=30) for url, files in web_links.items()]

        for finished in concurrent.futures.as_completed(pending):
            try:
                finished.result()
            except UrlLinkError as link_error:
                failures.append(link_error)

    return failures


def check_file_links(file_links: List) -> List:
    """Verify that the target of every relative link exists on disk.

    Args:
        file_links: Link tuples; `url` is resolved relative to the directory of `file`.

    Returns:
        One RelativeLinkError for each link whose target does not exist.
    """
    broken = [
        RelativeLinkError(entry.file, entry.url)
        for entry in file_links
        if not (entry.file.parent / entry.url).exists()
    ]

    print(f'Found {len(broken)} errors with relative links')
    return broken


def get_md_links(folder: str) -> List:
    """Collect the links found in every `*.md` file below `$IDF_PATH/<folder>`.

    Args:
        folder: directory, relative to the IDF_PATH environment variable, that is searched
            recursively for markdown files.

    Returns:
        A list of Link(file, url) tuples, where `url` is the link target with leading
        whitespace and a trailing `#fragment` removed. Links that start with `#`
        ("local" links) are left out.

    Raises:
        RuntimeError: if the IDF_PATH environment variable is not set.
    """
    # Group 1 is the link target, group 2 an optional "#fragment". The fragment has to be
    # matched non-greedily: a greedy `.+` runs up to the last ")" of the line and swallows
    # every further link that follows on the same line.
    MD_LINK_RE = re.compile(r'\[.+?\]\((.+?)(#.+?)?\)')

    idf_path_str = os.getenv('IDF_PATH')
    if idf_path_str is None:
        raise RuntimeError("Environment variable 'IDF_PATH' wasn't set.")
    idf_path = Path(idf_path_str)
    links = []

    for path in (idf_path / folder).rglob('*.md'):
        with path.open(encoding='utf8') as f:
            content = f.read()

        for match in MD_LINK_RE.finditer(content):
            url = match.group(1).lstrip()
            # Ignore "local" links
            if not url.startswith('#'):
                links.append(Link(path, url))

    return links


def check_readme_links(args: argparse.Namespace) -> int:
    """Check all links in the markdown files of the `examples` folder and print the errors.

    Args:
        args: parsed command line arguments; web links are not checked when
            `args.skip_weburl` is set.

    Returns:
        1 if at least one broken link was found, 0 otherwise.
    """
    links = get_md_links('examples')
    print('Found {} links'.format(len(links)))

    errors = []

    web_links = defaultdict(list)
    file_links = []

    # Sort links into file and web links
    for link in links:
        if link.url.startswith('http'):
            web_links[link.url].append(link.file)
        else:
            file_links.append(link)

    # An excluded URL is not necessarily referenced by any markdown file. `del` would raise
    # KeyError in that case (it does not trigger the defaultdict factory), so use pop().
    for url in EXCLUDE_URL_LIST:
        web_links.pop(url, None)

    errors.extend(check_file_links(file_links))

    if not args.skip_weburl:
        errors.extend(check_web_links(web_links))

    print('Found {} errors:'.format(len(errors)))
    for e in errors:
        print(e)

    return 1 if errors else 0


if __name__ == '__main__':
    # Command line entry point: the return value of check_readme_links() becomes the exit status.
    cli = argparse.ArgumentParser(
        prog='check_readme_links.py',
        description='check_readme_links.py: Checks for dead links in example READMEs',
    )
    cli.add_argument(
        '--skip-weburl', '-w',
        action='store_true',
        help='Skip checking of web URLs, only check links to local files',
    )

    sys.exit(check_readme_links(cli.parse_args()))
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`#!/usr/bin/env python`
			`#`
			`# Checks that all links in the readme markdown files are valid`
			`#`
tools: Change copyright in ci dir 2022-06-15 14:46:55 +00:00			`# SPDX-FileCopyrightText: 2020-2022 Espressif Systems (Shanghai) CO LTD`
			`# SPDX-License-Identifier: Apache-2.0`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`#`

style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`import argparse`
			`import concurrent.futures`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`import os`
			`import os.path`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`import re`
check_readme_links: remove throwing of exception before exit Reraising the exception before exiting was intended to help troubleshoot, but turned out to be more confusing than helpful as it might look like the script was failing 2021-04-26 07:36:30 +00:00			`import sys`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`import urllib.error`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`import urllib.request`
			`from collections import defaultdict, namedtuple`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`from pathlib import Path`
ci: Add python types hints 2022-06-28 17:00:12 +00:00			`from typing import List`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00
			`# The apple apps links are not accessible from the company network for some reason`
			`EXCLUDE_URL_LIST = ['https://apps.apple.com/in/app/esp-ble-provisioning/id1473590141', 'https://apps.apple.com/in/app/esp-softap-provisioning/id1474040630']`

			`Link = namedtuple('Link', ['file', 'url'])`


			`class ReadmeLinkError(Exception):`
ci: Add python types hints 2022-06-28 17:00:12 +00:00			`def __init__(self, file: str, url: str) -> None:`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`self.file = file`
			`self.url = url`


			`class RelativeLinkError(ReadmeLinkError):`
ci: Add python types hints 2022-06-28 17:00:12 +00:00			`def __str__(self) -> str:`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`return 'Relative link error, file - {} not found, linked from {}'.format(self.url, self.file)`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00

			`class UrlLinkError(ReadmeLinkError):`
ci: Add python types hints 2022-06-28 17:00:12 +00:00			`def __init__(self, file: str, url: str, error_code: str):`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`self.error_code = error_code`
			`super().__init__(file, url)`

ci: Add python types hints 2022-06-28 17:00:12 +00:00			`def __str__(self) -> str:`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`files = [str(f) for f in self.file]`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`return 'URL error, url - {} in files - {} is not accessible, request returned {}'.format(self.url, ', '.join(files), self.error_code)`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00

			`# we do not want a failed test just due to bad network conditions, for non 404 errors we simply print a warning`
ci: Add python types hints 2022-06-28 17:00:12 +00:00			`def check_url(url: str, files: str, timeout: float) -> None:`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`try:`
			`with urllib.request.urlopen(url, timeout=timeout):`
			`return`
			`except urllib.error.HTTPError as e:`
			`if e.code == 404:`
			`raise UrlLinkError(files, url, str(e))`
			`else:`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`print('Unable to access {}, err = {}'.format(url, str(e)))`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`except Exception as e:`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`print('Unable to access {}, err = {}'.format(url, str(e)))`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00

ci: Add python types hints 2022-06-28 17:00:12 +00:00			`def check_web_links(web_links: defaultdict) -> List:`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00
			`with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:`
			`errors = []`
			`future_to_url = {executor.submit(check_url, url, files, timeout=30): (url, files) for url, files in web_links.items()}`
			`for future in concurrent.futures.as_completed(future_to_url):`
			`try:`
			`future.result()`
			`except UrlLinkError as e:`
			`errors.append(e)`

			`return errors`


ci: Add python types hints 2022-06-28 17:00:12 +00:00			`def check_file_links(file_links: List) -> List:`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`errors = []`

			`for link in file_links:`
			`link_path = link.file.parent / link.url`

			`if not Path.exists(link_path):`
			`errors.append(RelativeLinkError(link.file, link.url))`

style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`print('Found {} errors with relative links'.format(len(errors)))`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`return errors`


ci: Add python types hints 2022-06-28 17:00:12 +00:00			`def get_md_links(folder: str) -> List:`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`MD_LINK_RE = r'\[.+?\]\((.+?)(#.+)?\)'`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00
ci: Add python types hints 2022-06-28 17:00:12 +00:00			`idf_path_str = os.getenv('IDF_PATH')`
			`if idf_path_str is None:`
			`raise RuntimeError("Environment variable 'IDF_PATH' wasn't set.")`
			`idf_path = Path(idf_path_str)`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`links = []`

			`for path in (idf_path / folder).rglob('*.md'):`
			`with path.open(encoding='utf8') as f:`
			`content = f.read()`

			`for url in re.findall(MD_LINK_RE, content):`
			`link = Link(path, url[0].lstrip())`
			`# Ignore "local" links`
			`if not link.url.startswith('#'):`
			`links.append(link)`

			`return links`


ci: Add python types hints 2022-06-28 17:00:12 +00:00			`def check_readme_links(args: argparse.Namespace) -> int:`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00
			`links = get_md_links('examples')`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`print('Found {} links'.format(len(links)))`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00
			`errors = []`

			`web_links = defaultdict(list)`
			`file_links = []`

			`# Sort links into file and web links`
			`for link in links:`
			`if link.url.startswith('http'):`
			`web_links[link.url].append(link.file)`
			`else:`
			`file_links.append(link)`

			`for url in EXCLUDE_URL_LIST:`
			`del web_links[url]`

			`errors.extend(check_file_links(file_links))`

			`if not args.skip_weburl:`
			`errors.extend(check_web_links(web_links))`

style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`print('Found {} errors:'.format(len(errors)))`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`for e in errors:`
			`print(e)`
check_readme_links: remove throwing of exception before exit Reraising the exception before exiting was intended to help troubleshoot, but turned out to be more confusing than helpful as it might look like the script was failing 2021-04-26 07:36:30 +00:00
			`return 1 if len(errors) > 0 else 0`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00

			`if __name__ == '__main__':`

			`parser = argparse.ArgumentParser(description='check_readme_links.py: Checks for dead links in example READMEs', prog='check_readme_links.py')`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`parser.add_argument('--skip-weburl', '-w', action='store_true', help='Skip checking of web URLs, only check links to local files')`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`args = parser.parse_args()`

check_readme_links: remove throwing of exception before exit Reraising the exception before exiting was intended to help troubleshoot, but turned out to be more confusing than helpful as it might look like the script was failing 2021-04-26 07:36:30 +00:00			`sys.exit(check_readme_links(args))`