"""Clean up markdown produced by an HTML-to-markdown conversion.

The helpers below repair list bullets, headings and links, and add a YAML
title header plus the Unicode copyright footer. ``fullCleanup`` applies the
whole pipeline to a file; ``fullCleanupString`` applies it to a string.
"""
import re
import requests
import urllib.parse
import unittest
from unittest.mock import patch

#head to place at start of all relative links
RELATIVE_LINK_HEAD = "https://cldr.unicode.org"

#seconds to wait when following a redirect, so one stalled server cannot hang the run
REQUEST_TIMEOUT_SECONDS = 10


#sometimes the html --> md conversion puts extra spaces between bullets
def fixBullets(content):
    """Normalize bullet spacing and drop blank lines inside lists.

    Args:
        content: markdown text.

    Returns:
        The text with ``-   item`` / ``1.  item`` collapsed to a single space
        after the marker and with empty lines that directly follow a list
        line removed. Lines are re-joined with ``\\n``, so a trailing newline
        is not preserved.
    """
    #only horizontal whitespace is collapsed: matching newlines here would
    #merge "...in CLDR 44.\n\nNext paragraph" into a single paragraph
    content = re.sub(r'-[^\S\r\n]{3}', '- ', content)
    content = re.sub(r'(\d+\.)[^\S\r\n]{2}', r'\1 ', content)

    processed_lines = []
    in_list = False
    for line in content.splitlines():
        if re.match(r'^\s*[-\d]', line):
            #line starts with a dash or a digit: treat it as a list line
            in_list = True
        elif in_list and not line.strip():
            #skip empty lines within lists
            continue
        else:
            in_list = False
        processed_lines.append(line)

    return '\n'.join(processed_lines)


#html-->md conversion puts link headings into md and messes up titles
def fixTitles(content):
    """Rebuild headings that were split around an empty anchor link.

    Turns ``## \\n[\\n](#anchor)\\nTitle`` into ``## Title``. Blank lines
    between the anchor link and the title are skipped; whatever follows the
    title line (including its newlines) is left untouched.

    Args:
        content: markdown text.

    Returns:
        The text with every such link heading collapsed onto one line.
    """
    #group 1: heading level (ex. ##), group 2: the title line.
    #nothing after the title is consumed, otherwise the following paragraph
    #would be glued onto the heading line
    pattern = re.compile(r'(#+)\s*\n*\[\n*\]\(#.*\)\n\s*(.*)')

    def replaceUnwanted(match):
        heading_level = match.group(1)
        title_text = match.group(2).strip()
        #heading and title on the same line
        return f"{heading_level} {title_text}"

    return pattern.sub(replaceUnwanted, content)


#add title at top and unicode copyright at bottom
def addHeaderAndFooter(content):
    """Add a YAML title header and the Unicode copyright footer if missing.

    The title is the rest of the line after the first ``#`` that is followed
    by whitespace; ``"Default Title"`` is used when there is none.

    Args:
        content: markdown text.

    Returns:
        The text with the header prepended and the footer appended, each only
        when it is not already present.
    """
    #get title from top of md file
    title_match = re.search(r'(?<=#\s).*', content)
    if title_match:
        title = title_match.group(0).strip()
    else:
        title = "Default Title"  #default if a heading could not be found

    header = f"---\ntitle: {title}\n---\n"
    footer = "\n![Unicode copyright](https://www.unicode.org/img/hb_notice.gif)\n"

    #look for an existing title in YAML front matter and an existing copyright image
    title_exists = re.search(r'^---\n.*title:.*\n---', content, re.MULTILINE)
    footer_exists = footer.strip() in content

    if not title_exists:
        content = header + content
    if not footer_exists:
        content = content + footer

    return content


#html-->md sometimes produces double bullets on indented lists
def fixIndentedBullets(content):
    """Replace ``- - item`` double bullets with a single bullet.

    Bullets that directly follow a repaired line are treated as part of the
    same list and lose their indentation.

    Args:
        content: markdown text.

    Returns:
        The text with double-hyphen bullets normalized.
    """
    #regex pattern to match the double hyphen bullets
    pattern = re.compile(r'^-\s-\s(.*)', re.MULTILINE)

    normalized_lines = []
    in_list = False
    for line in content.split('\n'):
        match = pattern.match(line)
        if match:
            #normalize the double hyphen bullet
            normalized_lines.append(f'- {match.group(1).strip()}')
            in_list = True
        elif in_list and re.match(r'^\s*-\s', line):
            #remove indentation from following bullets in the same list
            normalized_lines.append(line.strip())
        else:
            normalized_lines.append(line)
            in_list = False

    return '\n'.join(normalized_lines)


#links on text that is already a link
def removeRedundantLinks(content):
    """Collapse ``(url)[url]`` into just ``url``.

    Only ``http://`` / ``https://`` URLs are recognized, and both URLs must
    be identical; anything else is left as it is.

    Args:
        content: markdown text.

    Returns:
        The text with redundant links replaced by the bare URL.
    """
    #(link)[link] regex pattern
    link_pattern = re.compile(r'\((https?:\/\/[^\s\)]+)\)\[\1\]')
    #keep only the first URL
    return link_pattern.sub(lambda match: match.group(1), content)


#process links, google redirects, normal redirects, and relative links (takes in a url)
def convertLink(url):
    """Resolve one link target to its final URL.

    * URLs starting with ``/`` get ``RELATIVE_LINK_HEAD`` prepended.
    * Google redirect URLs are replaced by their ``q`` query parameter.
    * Any other URL is requested and the final URL after redirects is used.

    Args:
        url: the link target taken from a markdown link.

    Returns:
        The converted URL, or ``url`` unchanged when it cannot be resolved
        (no ``q`` parameter, or the request failed or timed out).
    """
    #relative links
    if url.startswith("/"):
        return RELATIVE_LINK_HEAD + url

    #google redirect links
    if "www.google.com/url" in url:
        parsed_url = urllib.parse.urlparse(url)
        query_params = urllib.parse.parse_qs(parsed_url.query)
        if 'q' in query_params:
            return query_params['q'][0]
        return url

    #redirects; the timeout keeps an unresponsive host from blocking forever
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as e:
        print(f"Error following redirects for {url}: {e}")
        return url
    return response.url


#finds all links and runs them through convertLink
def process_links(content):
    """Rewrite the target of every ``[text](url)`` link with ``convertLink``.

    Args:
        content: markdown text.

    Returns:
        The text with each link target converted; link text is unchanged.
    """
    #regex pattern for md links
    pattern = re.compile(r'\[(.*?)\]\((.*?)\)')

    def replace_link(match):
        text = match.group(1)
        new_url = convertLink(match.group(2))
        return f'[{text}]({new_url})'

    return pattern.sub(replace_link, content)


#given a file path to an md file, run it through every cleanup function and write into sample.md
def fullCleanup(file_path, output_path="sample.md"):
    """Clean up a markdown file and write the result to ``output_path``.

    Args:
        file_path: path of the markdown file to read.
        output_path: path of the file to write; defaults to ``sample.md``
            in the current working directory.
    """
    #explicit UTF-8 so the result does not depend on the platform default encoding
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()  #read entire file as a string

    content = fullCleanupString(content)

    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(content)


#given a md string, run through every cleanup function and return result
def fullCleanupString(str):
    """Run every cleanup step on a markdown string and return the result.

    Args:
        str: markdown text. (The name shadows the builtin; it is kept so
            existing keyword callers keep working.)

    Returns:
        The cleaned-up markdown text.
    """
    content = addHeaderAndFooter(str)
    content = fixTitles(content)
    content = fixBullets(content)
    content = removeRedundantLinks(content)
    content = fixIndentedBullets(content)
    content = process_links(content)
    return content


#TESTS
class TestMarkdownLinkProcessing(unittest.TestCase):
    def test_remove_redundant_links(self):
        #standard use cases
        markdown_content1 = '''
        redundant link (https://mail.google.com/mail/u/1/#inbox)[https://mail.google.com/mail/u/1/#inbox].
        not redundant link [example](https://www.example.com).
        '''
        expected_output1 = '''
        redundant link https://mail.google.com/mail/u/1/#inbox.
        not redundant link [example](https://www.example.com).
        '''
        self.assertEqual(removeRedundantLinks(markdown_content1), expected_output1)

        #edge cases:
        #If the link does not start with http:// or https:// it will not be picked up as a link
        #if the two links are different, it does not get corrected
        markdown_content2 = '''
        not link [www.example.com](www.example.com).
        Different links (https://mail.google.com/mail/u/1/#inbox)[https://emojipedia.org/japanese-symbol-for-beginner].
        '''
        expected_output2 = '''
        not link [www.example.com](www.example.com).
        Different links (https://mail.google.com/mail/u/1/#inbox)[https://emojipedia.org/japanese-symbol-for-beginner].
        '''
        self.assertEqual(removeRedundantLinks(markdown_content2), expected_output2)

    @patch('requests.get')
    def test_replace_links(self, mock_get):
        #mock responses for the redirect lookup; extra keyword arguments
        #(such as timeout) are accepted and ignored
        def mock_get_response(url, **kwargs):
            class MockResponse:
                def __init__(self, url):
                    self.url = url

            if url == 'http://www.google.com/url?q=http%3A%2F%2Fwww.typolexikon.de%2F&sa=D&sntz=1&usg=AOvVaw3SSbqyjrSIq8enzBt6Gltw':
                return MockResponse('http://www.typolexikon.de/')
            elif url == 'http://www.example.com/':
                return MockResponse('http://www.example.com/')
            return MockResponse(url)

        mock_get.side_effect = mock_get_response

        #standard use cases
        markdown_content1 = '''
        relative link [page](/relative-page).
        Google redirect link [typolexikon.de](http://www.google.com/url?q=http%3A%2F%2Fwww.typolexikon.de%2F&sa=D&sntz=1&usg=AOvVaw3SSbqyjrSIq8enzBt6Gltw).
        normal link [example.com](http://www.example.com/).
        '''
        expected_output1 = '''
        relative link [page](https://cldr.unicode.org/relative-page).
        Google redirect link [typolexikon.de](http://www.typolexikon.de/).
        normal link [example.com](http://www.example.com/).
        '''
        cleaned_content = removeRedundantLinks(markdown_content1)
        self.assertEqual(process_links(cleaned_content), expected_output1)

    def test_fix_bullets_keeps_paragraph_breaks(self):
        #extra spaces after markers are collapsed, but a sentence ending in a
        #number must not be merged with the paragraph after it
        content = "-   item one\n1.  first\nReleased in CLDR 44.\n\nNext paragraph"
        expected = "- item one\n1. first\nReleased in CLDR 44.\n\nNext paragraph"
        self.assertEqual(fixBullets(content), expected)

    def test_fix_titles_keeps_following_text(self):
        #title directly after the anchor link
        content1 = "## \n[\n](#TOC-Heading)\nHeading\n\nBody text"
        self.assertEqual(fixTitles(content1), "## Heading\n\nBody text")
        #title separated from the anchor link by a blank line
        content2 = "# \n[\n](#x)\n\nTitle\nBody"
        self.assertEqual(fixTitles(content2), "# Title\nBody")


if __name__ == '__main__':
    fullCleanup("testing.md")
    unittest.main()