esp-idf/tools/ci/check_readme_links.py

#!/usr/bin/env python
#
# Checks that all links in the readme markdown files are valid
#
# Copyright 2020 Espressif Systems (Shanghai) PTE LTD
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import concurrent.futures
import os
import os.path
import re
import urllib.error
import urllib.request
from collections import defaultdict, namedtuple
from pathlib import Path

EXCLUDE_DOCS_LIST = ['examples/peripherals/secure_element/atecc608_ecdsa/components/esp-cryptoauthlib/cryptoauthlib/**']

# The apple apps links are not accessible from the company network for some reason
EXCLUDE_URL_LIST = ['https://apps.apple.com/in/app/esp-ble-provisioning/id1473590141', 'https://apps.apple.com/in/app/esp-softap-provisioning/id1474040630']

Link = namedtuple('Link', ['file', 'url'])


class ReadmeLinkError(Exception):
    def __init__(self, file, url):
        self.file = file
        self.url = url


class RelativeLinkError(ReadmeLinkError):
    def __str__(self):
        return 'Relative link error, file - {} not found, linked from {}'.format(self.url, self.file)


class UrlLinkError(ReadmeLinkError):
    def __init__(self, file, url, error_code):
        self.error_code = error_code
        super().__init__(file, url)

    def __str__(self):
        files = [str(f) for f in self.file]
        return 'URL error, url - {} in files - {} is not accessible, request returned {}'.format(self.url, ', '.join(files), self.error_code)


# we do not want a failed test just due to bad network conditions, for non 404 errors we simply print a warning
def check_url(url, files, timeout):
    try:
        with urllib.request.urlopen(url, timeout=timeout):
            return
    except urllib.error.HTTPError as e:
        if e.code == 404:
            raise UrlLinkError(files, url, str(e))
        else:
            print('Unable to access {}, err = {}'.format(url, str(e)))
    except Exception as e:
        print('Unable to access {}, err = {}'.format(url, str(e)))


def check_web_links(web_links):

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        errors = []
        future_to_url = {executor.submit(check_url, url, files, timeout=30): (url, files) for url, files in web_links.items()}
        for future in concurrent.futures.as_completed(future_to_url):
            try:
                future.result()
            except UrlLinkError as e:
                errors.append(e)

        return errors


def check_file_links(file_links):
    errors = []

    for link in file_links:
        link_path = link.file.parent / link.url

        if not Path.exists(link_path):
            errors.append(RelativeLinkError(link.file, link.url))

    print('Found {} errors with relative links'.format(len(errors)))
    return errors


def get_md_links(folder):
    MD_LINK_RE = r'\[.+?\]\((.+?)(#.+)?\)'

    idf_path = Path(os.getenv('IDF_PATH'))
    links = []

    for path in (idf_path / folder).rglob('*.md'):
        if any([path.relative_to(idf_path).match(exclude_doc) for exclude_doc in EXCLUDE_DOCS_LIST]):
            print('{} - excluded'.format(path))
            continue

        with path.open(encoding='utf8') as f:
            content = f.read()

        for url in re.findall(MD_LINK_RE, content):
            link = Link(path, url[0].lstrip())
            # Ignore "local" links
            if not link.url.startswith('#'):
                links.append(link)

    return links


def check_readme_links(args):

    links = get_md_links('examples')
    print('Found {} links'.format(len(links)))

    errors = []

    web_links = defaultdict(list)
    file_links = []

    # Sort links into file and web links
    for link in links:
        if link.url.startswith('http'):
                web_links[link.url].append(link.file)
        else:
            file_links.append(link)

    for url in EXCLUDE_URL_LIST:
        del web_links[url]

    errors.extend(check_file_links(file_links))

    if not args.skip_weburl:
        errors.extend(check_web_links(web_links))

    print('Found {} errors:'.format(len(errors)))
    for e in errors:
        print(e)
    if errors:
        raise e


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='check_readme_links.py: Checks for dead links in example READMEs', prog='check_readme_links.py')
    parser.add_argument('--skip-weburl', '-w', action='store_true', help='Skip checking of web URLs, only check links to local files')
    args = parser.parse_args()

    check_readme_links(args)
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`#!/usr/bin/env python`
			`#`
			`# Checks that all links in the readme markdown files are valid`
			`#`
			`# Copyright 2020 Espressif Systems (Shanghai) PTE LTD`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`

style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`import argparse`
			`import concurrent.futures`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`import os`
			`import os.path`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`import re`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`import urllib.error`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`import urllib.request`
			`from collections import defaultdict, namedtuple`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`from pathlib import Path`

			`EXCLUDE_DOCS_LIST = ['examples/peripherals/secure_element/atecc608_ecdsa/components/esp-cryptoauthlib/cryptoauthlib/**']`

			`# The apple apps links are not accessible from the company network for some reason`
			`EXCLUDE_URL_LIST = ['https://apps.apple.com/in/app/esp-ble-provisioning/id1473590141', 'https://apps.apple.com/in/app/esp-softap-provisioning/id1474040630']`

			`Link = namedtuple('Link', ['file', 'url'])`


			`class ReadmeLinkError(Exception):`
			`def __init__(self, file, url):`
			`self.file = file`
			`self.url = url`


			`class RelativeLinkError(ReadmeLinkError):`
			`def __str__(self):`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`return 'Relative link error, file - {} not found, linked from {}'.format(self.url, self.file)`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00

			`class UrlLinkError(ReadmeLinkError):`
			`def __init__(self, file, url, error_code):`
			`self.error_code = error_code`
			`super().__init__(file, url)`

			`def __str__(self):`
			`files = [str(f) for f in self.file]`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`return 'URL error, url - {} in files - {} is not accessible, request returned {}'.format(self.url, ', '.join(files), self.error_code)`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00

			`# we do not want a failed test just due to bad network conditions, for non 404 errors we simply print a warning`
			`def check_url(url, files, timeout):`
			`try:`
			`with urllib.request.urlopen(url, timeout=timeout):`
			`return`
			`except urllib.error.HTTPError as e:`
			`if e.code == 404:`
			`raise UrlLinkError(files, url, str(e))`
			`else:`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`print('Unable to access {}, err = {}'.format(url, str(e)))`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`except Exception as e:`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`print('Unable to access {}, err = {}'.format(url, str(e)))`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00

			`def check_web_links(web_links):`

			`with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:`
			`errors = []`
			`future_to_url = {executor.submit(check_url, url, files, timeout=30): (url, files) for url, files in web_links.items()}`
			`for future in concurrent.futures.as_completed(future_to_url):`
			`try:`
			`future.result()`
			`except UrlLinkError as e:`
			`errors.append(e)`

			`return errors`


			`def check_file_links(file_links):`
			`errors = []`

			`for link in file_links:`
			`link_path = link.file.parent / link.url`

			`if not Path.exists(link_path):`
			`errors.append(RelativeLinkError(link.file, link.url))`

style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`print('Found {} errors with relative links'.format(len(errors)))`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`return errors`


			`def get_md_links(folder):`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`MD_LINK_RE = r'\[.+?\]\((.+?)(#.+)?\)'`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00
			`idf_path = Path(os.getenv('IDF_PATH'))`
			`links = []`

			`for path in (idf_path / folder).rglob('*.md'):`
			`if any([path.relative_to(idf_path).match(exclude_doc) for exclude_doc in EXCLUDE_DOCS_LIST]):`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`print('{} - excluded'.format(path))`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`continue`

			`with path.open(encoding='utf8') as f:`
			`content = f.read()`

			`for url in re.findall(MD_LINK_RE, content):`
			`link = Link(path, url[0].lstrip())`
			`# Ignore "local" links`
			`if not link.url.startswith('#'):`
			`links.append(link)`

			`return links`


			`def check_readme_links(args):`

			`links = get_md_links('examples')`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`print('Found {} links'.format(len(links)))`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00
			`errors = []`

			`web_links = defaultdict(list)`
			`file_links = []`

			`# Sort links into file and web links`
			`for link in links:`
			`if link.url.startswith('http'):`
			`web_links[link.url].append(link.file)`
			`else:`
			`file_links.append(link)`

			`for url in EXCLUDE_URL_LIST:`
			`del web_links[url]`

			`errors.extend(check_file_links(file_links))`

			`if not args.skip_weburl:`
			`errors.extend(check_web_links(web_links))`

style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`print('Found {} errors:'.format(len(errors)))`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`for e in errors:`
			`print(e)`
			`if errors:`
			`raise e`


			`if __name__ == '__main__':`

			`parser = argparse.ArgumentParser(description='check_readme_links.py: Checks for dead links in example READMEs', prog='check_readme_links.py')`
style: format python files with isort and double-quote-string-fixer 2021-01-26 02:49:01 +00:00			`parser.add_argument('--skip-weburl', '-w', action='store_true', help='Skip checking of web URLs, only check links to local files')`
CI: add script for checking links in example READMEs Closes IDF-1846 2020-06-16 11:00:27 +00:00			`args = parser.parse_args()`

			`check_readme_links(args)`