158 lines
4.9 KiB
Python
158 lines
4.9 KiB
Python
# Modified from:
# https://github.com/allenai/allennlp/blob/main/scripts/check_links.py
|
|
|
|
import argparse
|
|
import logging
|
|
import os
|
|
import pathlib
|
|
import re
|
|
import sys
|
|
from multiprocessing.dummy import Pool
|
|
from typing import NamedTuple, Optional, Tuple
|
|
|
|
import requests
|
|
from mmcv.utils import get_logger
|
|
|
|
|
|
def parse_args():
    """Build the command-line parser and parse the process arguments.

    Returns:
        argparse.Namespace: Parsed options exposing ``num_threads`` (int,
            default 100), ``https_proxy`` (str or None) and ``out`` (str,
            default ``'link_reports.txt'``).
    """
    description = ('Goes through all the inline-links '
                   'in markdown files and reports the breakages')
    # (flag, keyword arguments) pairs, registered in the order listed.
    option_table = (
        ('--num-threads',
         dict(
             type=int,
             default=100,
             help='Number of processes to confirm the link')),
        ('--https-proxy', dict(type=str, help='https proxy')),
        ('--out',
         dict(
             type=str,
             default='link_reports.txt',
             help='output path of reports')),
    )

    parser = argparse.ArgumentParser(description=description)
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args()
|
|
|
|
|
|
# HTTP status codes that are accepted as "the link works", including a few
# non-success codes that still indicate the resource is there.
OK_STATUS_CODES = (
    200,  # plain success
    401,  # the resource exists but may require some sort of login
    403,  # same situation as 401
    405,  # the server does not allow the HEAD method
    # The resource exists, but our default 'Accept-' header may not match
    # what the server can provide.
    406,
)
|
|
|
|
|
|
class MatchTuple(NamedTuple):
    """One inline markdown link together with the file it was found in."""

    # Path of the markdown file that contains the link, as a string.
    source: str
    # Text captured between the square brackets of the link.
    name: str
    # Target captured between the parentheses: a URL or a relative path.
    link: str
|
|
|
|
|
|
def check_link(
        match_tuple: MatchTuple,
        http_session: requests.Session,
        logger: Optional[logging.Logger] = None
) -> Tuple[MatchTuple, bool, Optional[str]]:
    """Check a single link and report the outcome.

    Links beginning with ``http://`` or ``https://`` are checked over the
    network with ``check_url``; every other link is treated as a path
    relative to the markdown file and checked with ``check_path``.

    Args:
        match_tuple: The link to verify and the file it came from.
        http_session: Session used for the HTTP request.
        logger: Logger that receives the per-link status line. When it is
            None the line is printed to stdout instead.

    Returns:
        A ``(match_tuple, result_ok, reason)`` tuple. ``reason`` is only
        set for URL checks; it stays None for path checks.
    """
    reason: Optional[str] = None
    # Require the full scheme prefix: a bare ``startswith('http')`` would
    # also match relative paths such as ``http_api.md`` and hand them to
    # requests, which rejects URLs without a scheme.
    if match_tuple.link.startswith(('http://', 'https://')):
        result_ok, reason = check_url(match_tuple, http_session)
    else:
        result_ok = check_path(match_tuple)

    status_line = f" {'✓' if result_ok else '✗'} {match_tuple.link}"
    if logger is None:
        print(status_line)
    else:
        logger.info(status_line)
    return match_tuple, result_ok, reason
|
|
|
|
|
|
def check_url(match_tuple: MatchTuple,
              http_session: requests.Session) -> Tuple[bool, str]:
    """Check if a URL is reachable.

    A HEAD request (5 second timeout, following redirects) is sent to
    ``match_tuple.link``. The link counts as reachable when the response
    is ``ok`` or its status code is listed in ``OK_STATUS_CODES``.

    Args:
        match_tuple: The link to verify; only ``link`` is used.
        http_session: Session used to issue the request.

    Returns:
        A ``(reachable, reason)`` tuple, where ``reason`` is a short
        human-readable description of the status code or of the failure.
    """
    # Keep the try body down to the one call that can raise.
    try:
        result = http_session.head(
            match_tuple.link, timeout=5, allow_redirects=True)
    except (requests.ConnectionError, requests.Timeout):
        return False, 'connection error'
    except requests.RequestException as err:
        # Any other requests failure (e.g. a malformed URL or too many
        # redirects) marks this one link as broken rather than propagating
        # out of the worker and aborting the whole run.
        return False, f'request error: {type(err).__name__}'

    return (
        result.ok or result.status_code in OK_STATUS_CODES,
        f'status code = {result.status_code}',
    )
|
|
|
|
|
|
def check_path(match_tuple: MatchTuple) -> bool:
    """Check if a file in this repository exists.

    The link is resolved relative to the directory of the markdown file it
    was found in; anything from the first ``#`` onwards is ignored.

    Args:
        match_tuple: The link to verify and the file it came from.

    Returns:
        True when the resolved path exists on disk, False otherwise.
    """
    # Only the part before the first '#' names the file.
    target, _, _ = match_tuple.link.partition('#')
    source_dir = os.path.dirname(str(match_tuple.source))
    return os.path.exists(os.path.join(source_dir, target))
|
|
|
|
|
|
def main():
    """Collect every inline markdown link in the project and verify it.

    Steps:
        1. Parse the command-line options and create the logger with
           ``log_file=args.out``.
        2. Export the optional HTTPS proxy and build a shared HTTP session
           with retrying adapters.
        3. Scan all ``*.md`` files below the parent of this script's
           directory for ``[name](link)`` patterns, skipping links that
           contain ``localhost``.
        4. Check all links concurrently in a thread pool and log the
           unreachable ones.

    Raises:
        SystemExit: With exit code 1 when at least one link is unreachable.
    """
    args = parse_args()

    # setup logger
    logger = get_logger(name='mmdet', log_file=args.out)

    # setup https_proxy
    if args.https_proxy:
        os.environ['https_proxy'] = args.https_proxy

    # setup http_session
    http_session = requests.Session()
    for resource_prefix in ('http://', 'https://'):
        http_session.mount(
            resource_prefix,
            requests.adapters.HTTPAdapter(
                max_retries=5,
                pool_connections=20,
                # one pooled connection per worker thread
                pool_maxsize=args.num_threads),
        )

    logger.info('Finding all markdown files in the current directory...')

    # The search is rooted at the parent of this script's directory.
    project_root = (pathlib.Path(__file__).parent / '..').resolve()
    markdown_files = project_root.glob('**/*.md')

    # A set de-duplicates identical (source, name, link) triples.
    all_matches = set()
    url_regex = re.compile(r'\[([^!][^\]]+)\]\(([^)(]+)\)')
    for markdown_file in markdown_files:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding; undecodable bytes are replaced so a
        # single odd file cannot abort the scan.
        with open(
                markdown_file, encoding='utf-8',
                errors='replace') as handle:
            for line in handle:
                for name, link in url_regex.findall(line):
                    if 'localhost' in link:
                        continue
                    all_matches.add(
                        MatchTuple(
                            source=str(markdown_file),
                            name=name,
                            link=link))

    # all_matches holds links, not files, so report it as such.
    logger.info(f' {len(all_matches)} inline links found')
    logger.info('Checking to make sure we can retrieve each link...')

    # multiprocessing.dummy.Pool is thread-based, so the session and the
    # logger are shared by the workers rather than pickled.
    with Pool(processes=args.num_threads) as pool:
        results = pool.starmap(check_link, [(match, http_session, logger)
                                            for match in all_matches])

    # collect unreachable results
    unreachable_results = [(match_tuple, reason)
                           for match_tuple, success, reason in results
                           if not success]

    if unreachable_results:
        logger.info('================================================')
        logger.info(f'Unreachable links ({len(unreachable_results)}):')
        for match_tuple, reason in unreachable_results:
            logger.info(' > Source: ' + match_tuple.source)
            logger.info(' Name: ' + match_tuple.name)
            logger.info(' Link: ' + match_tuple.link)
            if reason is not None:
                logger.info(' Reason: ' + reason)
        sys.exit(1)
    logger.info('No Unreachable link found.')
|
|
|
|
|
|
# Run the link check only when this file is executed as a script.
if __name__ == '__main__':
    main()
|