import re
import requests
import urllib.parse
import unittest
from unittest.mock import patch

# head to place at start of all relative links
RELATIVE_LINK_HEAD = "https://cldr.unicode.org"


# sometimes the html --> md conversion puts extra spaces between bullets
def fixBullets(content):
    """Normalize bullet spacing produced by the html -> md conversion.

    Collapses the converter's extra spaces after "-" and after numbered
    bullets ("1."), and drops blank lines that fall inside a list so the
    list stays contiguous.
    """
    # remove extra spaces after dash in bullet points ("-   x" -> "- x")
    content = re.sub(r'-\s{3}', '- ', content)
    # remove extra space after numbered bullet points ("1.  x" -> "1. x")
    content = re.sub(r'(\d+\.)\s{2}', r'\1 ', content)

    # drop blank lines that appear between items of the same list
    processed_lines = []
    in_list = False
    for line in content.splitlines():
        if re.match(r'^\s*[-\d]', line):
            # line starts a dash bullet or a numbered item -> inside a list
            in_list = True
        elif in_list and not line.strip():
            # skip empty lines within lists
            continue
        else:
            in_list = False
        processed_lines.append(line)

    return '\n'.join(processed_lines)


# html-->md conversion puts link headings into md and messes up titles
def fixTitles(content):
    """Rejoin headings split by anchor artifacts like "##\\n[](#id)\\nTitle".

    The converter emits the heading marks, an empty in-page anchor link, and
    the title text on separate lines; this folds them back into a single
    "## Title" heading line.
    """
    # heading marks, optional whitespace/newlines, empty "[](#...)" anchor,
    # then the real title text on the following line
    pattern = re.compile(r'(#+)\s*\n*\[\n*\]\(#.*\)\n(.*)\n*')

    def replaceUnwanted(match):
        heading_level = match.group(1)          # heading marks, e.g. "##"
        title_text = match.group(2).strip()     # the title text, stripped
        # heading marks and title back on one line
        return f"{heading_level} {title_text}"

    processed_content = re.sub(pattern, replaceUnwanted, content)
    return processed_content


# add title at top and unicode copyright at bottom
def addHeaderAndFooter(content):
    """Wrap the document: YAML front-matter title on top, footer at the end.

    The title is taken from the first "# "-style heading in the document,
    falling back to "Default Title". Both additions are skipped when already
    present, so the function is safe to run repeatedly.
    """
    # get title from the first "# " heading in the md file
    title_match = re.search(r'(?<=#\s).*', content)
    if title_match:
        title = title_match.group(0).strip()
    else:
        title = "Default Title"  # default if no heading could be found

    # header
    header = f"---\ntitle: {title}\n---\n"
    # footer
    footer = "\n\n"

    # look for an existing title in the YAML front matter
    title_exists = re.search(r'^---\n.*title:.*\n---', content, re.MULTILINE)
    # BUG FIX: the old check was `footer.strip() in content`; footer.strip()
    # is "" and the empty string is a substring of every string, so the
    # footer was never appended. endswith() is correct and also idempotent
    # across repeated runs.
    footer_exists = content.endswith(footer)

    # add header
    if not title_exists:
        content = header + content

    # add footer
    if not footer_exists:
        content = content + footer

    return content


# html-->md sometimes produces double bullets on indented lists
def fixIndentedBullets(content):
    """Collapse converter-produced "- - item" double bullets to "- item".

    Also de-indents the bullets that follow a collapsed item within the same
    list, so the whole list ends up at one level.
    """
    # regex pattern matching a "- - item" double-hyphen bullet at line start
    pattern = re.compile(r'^-\s-\s(.*)', re.MULTILINE)

    normalized_lines = []
    in_list = False

    for line in content.split('\n'):
        match = pattern.match(line)
        if match:
            # normalize the double-hyphen bullet to a single one
            bullet_point = match.group(1)
            normalized_lines.append(f'- {bullet_point.strip()}')
            in_list = True
        elif in_list and re.match(r'^\s*-\s', line):
            # remove indentation from following bullets in the same list
            normalized_lines.append(line.strip())
        else:
            normalized_lines.append(line)
            in_list = False

    return '\n'.join(normalized_lines)
# links on text that is already a link, i.e. "(url)[url]" leftovers
def removeRedundantLinks(content):
    """Collapse reversed duplicate links of the form (URL)[URL] to the bare URL.

    Only http(s) URLs are recognized, and the two URLs must be identical
    (enforced via the \\1 backreference); anything else is left untouched.
    """
    # "(url)[url]" with both URLs identical
    link_pattern = re.compile(r'\((https?:\/\/[^\s\)]+)\)\[\1\]')

    def replace_link(match):
        return match.group(1)  # keep only the URL itself

    processed_content = re.sub(link_pattern, replace_link, content)
    return processed_content


# process links: google redirects, normal redirects, and relative links (takes in a url)
def convertLink(url):
    """Resolve a single URL to its canonical form.

    - relative URLs ("/...") are prefixed with RELATIVE_LINK_HEAD
    - Google redirect URLs are unwrapped via their "q" query parameter
    - anything else is fetched and the post-redirect URL is returned;
      on a network error the original URL is returned unchanged
    """
    # relative links
    if url.startswith("/"):
        return RELATIVE_LINK_HEAD + url
    # google redirect links: real target is in the "q" query parameter
    elif "www.google.com/url" in url:
        parsed_url = urllib.parse.urlparse(url)
        query_params = urllib.parse.parse_qs(parsed_url.query)
        if 'q' in query_params:
            return query_params['q'][0]
        return url
    # ordinary redirects: let requests follow them and keep the final URL
    else:
        try:
            # NOTE(review): no timeout, so a dead host can hang the cleanup;
            # adding one would break the existing one-positional-arg mock in
            # the test suite, so it is only flagged here.
            response = requests.get(url)
            return response.url
        except requests.RequestException as e:
            print(f"Error following redirects for {url}: {e}")
            return url


# finds all markdown links and runs each URL through convertLink
def process_links(content):
    """Rewrite every markdown link "[text](url)" with its converted URL."""
    # regex pattern for md links
    pattern = re.compile(r'\[(.*?)\]\((.*?)\)')

    def replace_link(match):
        text = match.group(1)
        url = match.group(2)
        new_url = convertLink(url)
        return f'[{text}]({new_url})'

    return pattern.sub(replace_link, content)


# given a file path to an md file, run it through every cleanup function
def fullCleanup(file_path, output_path="sample.md"):
    """Read the md file at file_path, apply every cleanup pass, write the result.

    output_path defaults to "sample.md" (the previously hard-coded target)
    so existing callers are unaffected; files are read/written as UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()  # read entire file as a string
    # order matters: header/title fixes first, link passes last
    content = addHeaderAndFooter(content)
    content = fixTitles(content)
    content = fixBullets(content)
    content = removeRedundantLinks(content)
    content = fixIndentedBullets(content)
    content = process_links(content)
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(content)
# given a md string, run through every cleanup function and return result
def fullCleanupString(str):
    """Apply every cleanup pass to the given markdown string and return it.

    Mirrors fullCleanup() but works on an in-memory string instead of a file.
    NOTE(review): the parameter shadows the builtin `str`; kept as-is to
    preserve the public signature.
    """
    content = addHeaderAndFooter(str)
    content = fixTitles(content)
    content = fixBullets(content)
    content = removeRedundantLinks(content)
    content = fixIndentedBullets(content)
    content = process_links(content)
    return content


# TESTS
class TestMarkdownLinkProcessing(unittest.TestCase):
    """Unit tests for the link cleanup passes (removeRedundantLinks, process_links)."""

    def test_remove_redundant_links(self):
        """removeRedundantLinks collapses (url)[url] pairs and leaves real links alone."""
        # standard use cases: one redundant pair, one normal markdown link
        markdown_content1 = '''
        redundant link (https://mail.google.com/mail/u/1/#inbox)[https://mail.google.com/mail/u/1/#inbox].
        not redundant link [example](https://www.example.com).
        '''
        expected_output1 = '''
        redundant link https://mail.google.com/mail/u/1/#inbox.
        not redundant link [example](https://www.example.com).
        '''
        self.assertEqual(removeRedundantLinks(markdown_content1), expected_output1)

        # edge cases:
        # - a link that does not start with http:// or https:// is not picked up
        # - if the two URLs differ, the pair is left uncorrected
        markdown_content2 = '''
        not link [www.example.com](www.example.com).
        Different links (https://mail.google.com/mail/u/1/#inbox)[https://emojipedia.org/japanese-symbol-for-beginner].
        '''
        expected_output2 = '''
        not link [www.example.com](www.example.com).
        Different links (https://mail.google.com/mail/u/1/#inbox)[https://emojipedia.org/japanese-symbol-for-beginner].
        '''
        self.assertEqual(removeRedundantLinks(markdown_content2), expected_output2)

    @patch('requests.get')
    def test_replace_links(self, mock_get):
        """process_links resolves relative, google-redirect, and normal links.

        requests.get is patched so the redirect-following branch of
        convertLink never touches the network.
        """
        # mock responses for the redirect-following branch of convertLink;
        # each known URL maps to an object exposing the final .url attribute
        def mock_get_response(url):
            class MockResponse:
                def __init__(self, url):
                    self.url = url
            if url == 'http://www.google.com/url?q=http%3A%2F%2Fwww.typolexikon.de%2F&sa=D&sntz=1&usg=AOvVaw3SSbqyjrSIq8enzBt6Gltw':
                return MockResponse('http://www.typolexikon.de/')
            elif url == 'http://www.example.com/':
                return MockResponse('http://www.example.com/')
            return MockResponse(url)

        mock_get.side_effect = mock_get_response

        # standard use cases: one of each link kind convertLink handles
        markdown_content1 = '''
        relative link [page](/relative-page).
        Google redirect link [typolexikon.de](http://www.google.com/url?q=http%3A%2F%2Fwww.typolexikon.de%2F&sa=D&sntz=1&usg=AOvVaw3SSbqyjrSIq8enzBt6Gltw).
        normal link [example.com](http://www.example.com/).
        '''
        expected_output1 = '''
        relative link [page](https://cldr.unicode.org/relative-page).
        Google redirect link [typolexikon.de](http://www.typolexikon.de/).
        normal link [example.com](http://www.example.com/).
        '''
        cleaned_content = removeRedundantLinks(markdown_content1)
        self.assertEqual(process_links(cleaned_content), expected_output1)

if __name__ == '__main__':
    # clean testing.md (result is written to sample.md), then run the tests
    fullCleanup("testing.md")
    unittest.main()