• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import re
2import requests
3import urllib.parse
4import unittest
5from unittest.mock import patch
6
#base URL prepended to site-relative links (those starting with "/")
RELATIVE_LINK_HEAD = "https://cldr.unicode.org"
9
#sometimes the html --> md conversion puts extra spaces between bullets
def fixBullets(content):
    """Normalize bullet spacing and drop blank lines that fall inside lists."""
    #collapse "-" followed by three whitespace chars into "- "
    content = re.sub(r'-\s{3}', '- ', content)
    #collapse the two extra whitespace chars after a numbered bullet ("1.  x")
    content = re.sub(r'(\d+\.)\s{2}', r'\1 ', content)

    #walk the lines, tracking whether we are inside a list
    kept = []
    inside_list = False
    for current in content.splitlines():
        if re.match(r'^\s*[-\d]', current):
            #bullet or numbered line: we are (still) in a list
            inside_list = True
        elif inside_list and not current.strip():
            #blank line in the middle of a list: drop it
            continue
        else:
            inside_list = False
        kept.append(current)

    return '\n'.join(kept)
32
#html-->md conversion puts link headings into md and messes up titles
def fixTitles(content):
    """Collapse heading-anchor artifacts (hashes, a [](#...) link, then the
    title on its own line) back into a single "## Title" heading line."""
    heading_pattern = re.compile(r'(#+)\s*\n*\[\n*\]\(#.*\)\n(.*)\n*')

    def rebuild(match):
        #join the hash marks and the stripped title text onto one line
        hashes = match.group(1)
        title = match.group(2).strip()
        return f"{hashes} {title}"

    return heading_pattern.sub(rebuild, content)
47
# add title at top and unicode copyright at bottom
def addHeaderAndFooter(content):
    """Ensure the content has a YAML title front matter at the top and the
    Unicode copyright image at the bottom, adding each only if missing."""
    #use the first markdown heading as the title, with a fallback
    found = re.search(r'(?<=#\s).*', content)
    title = found.group(0).strip() if found else "Default Title"

    yaml_header = f"---\ntitle: {title}\n---\n"
    copyright_footer = "\n![Unicode copyright](https://www.unicode.org/img/hb_notice.gif)\n"

    #detect a title in existing YAML front matter, and an existing footer
    has_title = re.search(r'^---\n.*title:.*\n---', content, re.MULTILINE)
    has_footer = copyright_footer.strip() in content

    if not has_title:
        content = yaml_header + content
    if not has_footer:
        content = content + copyright_footer

    return content
75
#html-->md sometimes produces double bullets on indented lists
def fixIndentedBullets(content):
    """Flatten "- - item" double bullets to "- item" and strip the
    indentation from the bullets that follow in the same list."""
    double_bullet = re.compile(r'^-\s-\s(.*)', re.MULTILINE)

    flattened = []
    within_list = False

    for raw in content.split('\n'):
        hit = double_bullet.match(raw)
        if hit:
            #collapse the doubled hyphen down to a single bullet
            flattened.append(f'- {hit.group(1).strip()}')
            within_list = True
        elif within_list and re.match(r'^\s*-\s', raw):
            #subsequent bullets of the same list lose their indentation
            flattened.append(raw.strip())
        else:
            flattened.append(raw)
            within_list = False

    return '\n'.join(flattened)
106
#links on text that is already a link
def removeRedundantLinks(content):
    """Collapse "(url)[url]" artifacts — the same http(s) URL twice —
    down to the bare URL; different or non-http pairs are left alone."""
    redundant = re.compile(r'\((https?:\/\/[^\s\)]+)\)\[\1\]')
    #keep only the first captured URL wherever the pattern matches
    return redundant.sub(lambda m: m.group(1), content)
119
#process links, google redirects, normal redirects, and relative links (takes in a url)
def convertLink(url):
    """Convert one URL: prepend the CLDR host to site-relative links, unwrap
    Google redirect links to their 'q' target, and resolve anything else
    through its redirect chain (returning the original URL on failure)."""
    if url.startswith("/"):
        #site-relative link: anchor it to the CLDR site
        return RELATIVE_LINK_HEAD + url

    if "www.google.com/url" in url:
        #google redirect: the real destination lives in the 'q' query param
        params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
        q_values = params.get('q')
        return q_values[0] if q_values else url

    #anything else: follow redirects to the final URL
    try:
        return requests.get(url).url
    except requests.RequestException as e:
        print(f"Error following redirects for {url}: {e}")
        return url
140
#finds all markdown links and runs each URL through convertLink
def process_links(content):
    """Rewrite every markdown link's URL via convertLink, keeping its text."""
    md_link = re.compile(r'\[(.*?)\]\((.*?)\)')

    def rewrite(match):
        label = match.group(1)
        target = match.group(2)
        return f'[{label}]({convertLink(target)})'

    return md_link.sub(rewrite, content)
154
#given a file path to an md file, run it through every cleanup function and write the result
def fullCleanup(file_path, output_path="sample.md"):
    """Run every cleanup pass over the markdown file at file_path and write
    the cleaned result to output_path.

    output_path defaults to "sample.md" for backward compatibility with the
    previous hard-coded destination. Files are read/written as UTF-8 so the
    result does not depend on the platform's default encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()  # read entire file as a string
    content = addHeaderAndFooter(content)
    content = fixTitles(content)
    content = fixBullets(content)
    content = removeRedundantLinks(content)
    content = fixIndentedBullets(content)
    content = process_links(content)
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(content)
167
#given a md string, run through every cleanup function and return result
def fullCleanupString(content):
    """Run every cleanup pass over a markdown string and return the result.

    NOTE(review): the parameter was renamed from `str`, which shadowed the
    builtin; no in-file caller uses the keyword form.
    """
    content = addHeaderAndFooter(content)
    content = fixTitles(content)
    content = fixBullets(content)
    content = removeRedundantLinks(content)
    content = fixIndentedBullets(content)
    content = process_links(content)
    return content
177
178
#TESTS
class TestMarkdownLinkProcessing(unittest.TestCase):
    """Unit tests for the link-cleanup passes: removeRedundantLinks and
    process_links (with requests.get mocked so no network is touched)."""
    def test_remove_redundant_links(self):
        """Redundant (url)[url] pairs collapse to the bare URL; normal
        markdown links, non-http links, and mismatched pairs are untouched."""
        #standard use cases
        markdown_content1 = '''
        redundant link (https://mail.google.com/mail/u/1/#inbox)[https://mail.google.com/mail/u/1/#inbox].
        not redundant link [example](https://www.example.com).
        '''
        expected_output1 = '''
        redundant link https://mail.google.com/mail/u/1/#inbox.
        not redundant link [example](https://www.example.com).
        '''
        self.assertEqual(removeRedundantLinks(markdown_content1), expected_output1)

        #edge cases:
        #If the link does not start with http:// or https:// it will not be picked up as a link
        #if the two links are different, it does not get corrected
        markdown_content2 = '''
        not link [www.example.com](www.example.com).
        Different links (https://mail.google.com/mail/u/1/#inbox)[https://emojipedia.org/japanese-symbol-for-beginner].
        '''
        expected_output2 = '''
        not link [www.example.com](www.example.com).
        Different links (https://mail.google.com/mail/u/1/#inbox)[https://emojipedia.org/japanese-symbol-for-beginner].
        '''
        self.assertEqual(removeRedundantLinks(markdown_content2), expected_output2)

    @patch('requests.get')
    def test_replace_links(self, mock_get):
        """process_links resolves relative, Google-redirect, and normal links.

        requests.get is patched so convertLink's redirect-following branch
        returns canned final URLs instead of hitting the network.
        """
        #mock responses for follow_redirects function
        def mock_get_response(url):
            #minimal stand-in for requests.Response: only .url is read
            class MockResponse:
                def __init__(self, url):
                    self.url = url
            if url == 'http://www.google.com/url?q=http%3A%2F%2Fwww.typolexikon.de%2F&sa=D&sntz=1&usg=AOvVaw3SSbqyjrSIq8enzBt6Gltw':
                return MockResponse('http://www.typolexikon.de/')
            elif url == 'http://www.example.com/':
                return MockResponse('http://www.example.com/')
            return MockResponse(url)

        mock_get.side_effect = mock_get_response

        #standard use cases
        markdown_content1 = '''
        relative link [page](/relative-page).
        Google redirect link [typolexikon.de](http://www.google.com/url?q=http%3A%2F%2Fwww.typolexikon.de%2F&sa=D&sntz=1&usg=AOvVaw3SSbqyjrSIq8enzBt6Gltw).
        normal link [example.com](http://www.example.com/).
        '''
        expected_output1 = '''
        relative link [page](https://cldr.unicode.org/relative-page).
        Google redirect link [typolexikon.de](http://www.typolexikon.de/).
        normal link [example.com](http://www.example.com/).
        '''
        cleaned_content = removeRedundantLinks(markdown_content1)
        self.assertEqual(process_links(cleaned_content), expected_output1)
234
if __name__ == '__main__':
    #run the full cleanup on the sample input, then the unit tests
    #NOTE(review): requires a local "testing.md" to exist — fullCleanup will
    #raise FileNotFoundError before the tests run if it is missing
    fullCleanup("testing.md")
    unittest.main()
238