// word2md - Word to Markdown conversion tool
//
// word2md converts a Microsoft Word document to Markdown formatted text. The tool uses the
// Word Automation APIs to start an instance of Word and access the contents of the document
// being converted. The tool must be run using the cscript.exe script host and requires Word
// to be installed on the target machine. The name of the document to convert and the name of
// the output file must be specified as command line arguments; the resulting Markdown is
// written to the output file as UTF-8 without a byte order mark. The tool recognizes the
// specific Word styles used in the TypeScript Language Specification.

// Minimal typings for the parts of the Word Automation object model used by this tool.
namespace Word {
    // Automation collection. This file always indexes collections starting at 1.
    export interface Collection<T> {
        count: number;
        item(index: number): T;
    }

    export interface Font {
        bold: boolean;
        italic: boolean;
        subscript: boolean;
        superscript: boolean;
    }

    export interface Find {
        font: Font;
        format: boolean;
        replacement: Replacement;
        style: any;
        text: string;
        clearFormatting(): void;
        execute(
            findText: string,
            matchCase: boolean,
            matchWholeWord: boolean,
            matchWildcards: boolean,
            matchSoundsLike: boolean,
            matchAllWordForms: boolean,
            forward: boolean,
            wrap: number,
            format: boolean,
            replaceWith: string,
            replace: number): boolean;
    }

    export interface Replacement {
        font: Font;
        style: any;
        text: string;
        clearFormatting(): void;
    }

    export interface ListFormat {
        listLevelNumber: number;
        listString: string;
    }

    export interface Column {
    }

    export interface Columns extends Collection<Column> {
    }

    export interface Table {
        columns: Columns;
    }

    export interface Tables extends Collection<Table> {
    }

    export interface Range {
        find: Find;
        listFormat: ListFormat;
        tables: Tables;
        text: string;
        textRetrievalMode: {
            includeHiddenText: boolean;
        }
        words: Ranges;
    }

    export interface Ranges extends Collection<Range> {
    }

    export interface Style {
        nameLocal: string;
    }

    export interface Paragraph {
        alignment: number;
        range: Range;
        style: Style;
        next(): Paragraph;
    }

    export interface Paragraphs extends Collection<Paragraph> {
        first: Paragraph;
    }

    export interface Field {
    }

    export interface Fields extends Collection<Field> {
        toggleShowCodes(): void;
    }

    export interface Hyperlink {
        address: string;
        textToDisplay: string;
        range: Range;
    }

    export interface Hyperlinks extends Collection<Hyperlink> {
    }

    export interface Document {
        fields: Fields;
        paragraphs: Paragraphs;
        hyperlinks: Hyperlinks;
        builtInDocumentProperties: Collection<any>;
        close(saveChanges: boolean): void;
        range(): Range;
    }

    export interface Documents extends Collection<Document> {
        open(filename: string): Document;
    }

    export interface Application {
        documents: Documents;
        quit(): void;
    }
}

// Thin wrapper around the script host facilities used by the tool: command line
// arguments, COM object creation, console output and UTF-8 file output.
const sys = (() => {
    const fileStream = new ActiveXObject("ADODB.Stream");
    fileStream.Type = 2 /* text */;
    const binaryStream = new ActiveXObject("ADODB.Stream");
    binaryStream.Type = 1 /* binary */;
    // Copy the host's argument collection into a plain array.
    const args: string[] = [];
    for (let i = 0; i < WScript.Arguments.length; i++) {
        args[i] = WScript.Arguments.Item(i);
    }
    return {
        args,
        createObject: (typeName: string) => new ActiveXObject(typeName),
        write(s: string): void {
            WScript.StdOut.Write(s);
        },
        // Writes `data` to `fileName` as UTF-8 without a byte order mark, overwriting any existing file.
        writeFile: (fileName: string, data: string): void => {
            fileStream.Open();
            binaryStream.Open();
            try {
                // Write characters in UTF-8 encoding
                fileStream.Charset = "utf-8";
                fileStream.WriteText(data);
                // We don't want the BOM, skip it by setting the starting location to 3 (size of BOM).
                fileStream.Position = 3;
                fileStream.CopyTo(binaryStream);
                binaryStream.SaveToFile(fileName, 2 /*overwrite*/);
            }
            finally {
                binaryStream.Close();
                fileStream.Close();
            }
        }
    };
})();

// Formatting criteria applied to either the search side or the replacement side of a
// find/replace operation. Properties are copied onto the corresponding Word object.
interface FindReplaceOptions {
    style?: any;
    font?: {
        bold?: boolean;
        italic?: boolean;
        subscript?: boolean;
    };
}

// Converts the given Word document to Markdown and returns the Markdown text.
// Note that the document is modified in place (find/replace and hyperlink rewriting)
// as part of the conversion; the caller is expected to close it without saving.
function convertDocumentToMarkdown(doc: Word.Document): string {

    // Alignment of the paragraphs in the first row of the current table, by column.
    const columnAlignment: number[] = [];
    // Number of paragraphs per table row (columns plus one, see the "Table" case below).
    let tableColumnCount = 0;
    // Index of the next table paragraph within the current table.
    let tableCellIndex = 0;
    // Whether the previously written paragraph was inside a table.
    let lastInTable = false;
    // Normalized style of the previously written paragraph, if any.
    let lastStyle: string | undefined;
    let result = "";

    // Recursively copies the own properties of `properties` onto `target`. Nested objects
    // (e.g. `font`) are merged into the existing target object rather than replacing it.
    function setProperties(target: any, properties: any) {
        for (const name in properties) {
            if (properties.hasOwnProperty(name)) {
                const value = properties[name];
                if (typeof value === "object") {
                    setProperties(target[name], value);
                }
                else {
                    target[name] = value;
                }
            }
        }
    }

    // Replaces every match of `findText` (restricted by `findOptions`) in the whole document
    // with `replaceText`, applying `replaceOptions` to the replacement formatting.
    function findReplace(findText: string, findOptions: FindReplaceOptions, replaceText: string, replaceOptions: FindReplaceOptions) {
        const find = doc.range().find;
        find.clearFormatting();
        setProperties(find, findOptions);
        const replace = find.replacement;
        replace.clearFormatting();
        setProperties(replace, replaceOptions);
        find.execute(findText,
            /* matchCase */ false,
            /* matchWholeWord */ false,
            /* matchWildcards */ false,
            /* matchSoundsLike */ false,
            /* matchAllWordForms */ false,
            /* forward */ true,
            /* wrap: wdFindStop */ 0,
            /* format */ true,
            replaceText,
            /* replace: wdReplaceAll */ 2
        );
    }

    // Rewrites the display text of every hyperlink that has an address into
    // Markdown link syntax: [text](address).
    function fixHyperlinks() {
        const count = doc.hyperlinks.count;
        for (let i = 0; i < count; i++) {
            const hyperlink = doc.hyperlinks.item(i + 1);
            const address = hyperlink.address;
            if (address && address.length > 0) {
                const textToDisplay = hyperlink.textToDisplay;
                hyperlink.textToDisplay = "[" + textToDisplay + "](" + address + ")";
            }
        }
    }

    // Appends text to the Markdown output.
    function write(s: string) {
        result += s;
    }

    // Returns a string of `width` spaces (empty for zero or negative widths).
    // Built with Array.join so that it does not depend on String.prototype.repeat,
    // which older script host engines may not provide.
    function indent(width: number) {
        return width > 0 ? new Array(width + 1).join(" ") : "";
    }

    // Writes the Markdown separator row that follows the first row of a table,
    // using the paragraph alignments recorded for that first row.
    function writeTableHeader() {
        for (let i = 0; i < tableColumnCount - 1; i++) {
            switch (columnAlignment[i]) {
                case 1: // wdAlignParagraphCenter
                    write("|:---:");
                    break;
                case 2: // wdAlignParagraphRight
                    write("|---:");
                    break;
                default:
                    write("|---");
            }
        }
        write("|\n");
    }

    // Strips trailing control characters (code points below 0x20) from the text.
    function trimEndFormattingMarks(text: string) {
        let i = text.length;
        while (i > 0 && text.charCodeAt(i - 1) < 0x20) i--;
        return text.substr(0, i);
    }

    // Terminates the block that was being written for the previous paragraph style.
    function writeBlockEnd() {
        switch (lastStyle) {
            case "Code":
                write("```\n\n");
                break;
            case "List Paragraph":
            case "Table":
            case "TOC":
                write("\n");
                break;
        }
    }

    // Writes the Markdown for a single paragraph, based on its Word style.
    function writeParagraph(p: Word.Paragraph) {

        const range = p.range;
        const inTable = range.tables.count > 0;
        // A form feed character in the paragraph text is treated as a section break.
        const sectionBreak = range.text.indexOf("\x0C") >= 0;

        let level = 1;
        let style = p.style.nameLocal;
        let text = range.text;

        text = trimEndFormattingMarks(text);
        if (text === "/") {
            // An inline image shows up in the text as a "/". When we see a paragraph
            // consisting of nothing but "/", we check to see if the paragraph contains
            // hidden text and, if so, emit that instead. The hidden text is assumed to
            // contain an appropriate markdown image link.
            range.textRetrievalMode.includeHiddenText = true;
            const fullText = range.text;
            range.textRetrievalMode.includeHiddenText = false;
            if (text !== fullText) {
                // Indent with two EM SPACE characters and drop the leading "/".
                text = "\u2003\u2003" + fullText.substr(1);
            }
        }

        if (inTable) {
            style = "Table";
        }
        else if (style.match(/\s\d$/)) {
            // Styles such as "Heading 2" carry their level as a trailing digit;
            // split it off so that the switch below sees just the base style name.
            level = +style.substr(style.length - 1);
            style = style.substr(0, style.length - 2);
        }
        if (lastStyle && style !== lastStyle) {
            writeBlockEnd();
        }

        switch (style) {

            case "Heading":
            case "Appendix": {
                // The heading depth is capped at four "#" characters.
                const section = range.listFormat.listString;
                write("####".substr(0, level) + ' <a name="' + section + '"/>' + section + " " + text + "\n\n");
                break;
            }

            case "Normal":
                if (text.length) {
                    write(text + "\n\n");
                }
                break;

            case "List Paragraph":
                // Two spaces of indentation per nesting level beyond the first.
                write(indent(range.listFormat.listLevelNumber * 2 - 2) + "* " + text + "\n");
                break;

            case "Grammar":
                // Indentation uses EM SPACE characters. Manual line breaks (\x0B) become Markdown
                // hard line breaks, which require two trailing spaces before the newline.
                write("\u2003\u2003" + text.replace(/\s\s\s/g, "\u2003").replace(/\x0B/g, "  \n\u2003\u2003\u2003") + "\n\n");
                break;

            case "Code":
                // Consecutive Code paragraphs share one fenced block, separated by blank lines.
                if (lastStyle !== "Code") {
                    write("```TypeScript\n");
                }
                else {
                    write("\n");
                }
                write(text.replace(/\x0B/g, " \n") + "\n");
                break;

            case "Table":
                if (!lastInTable) {
                    // First paragraph of a new table. Each row is counted as one paragraph
                    // more than the number of columns.
                    tableColumnCount = range.tables.item(1).columns.count + 1;
                    tableCellIndex = 0;
                }
                if (tableCellIndex < tableColumnCount) {
                    // Still in the first row: remember the alignment for the separator row.
                    columnAlignment[tableCellIndex] = p.alignment;
                }
                write("|" + text);
                tableCellIndex++;
                if (tableCellIndex % tableColumnCount === 0) {
                    write("\n");
                    if (tableCellIndex === tableColumnCount) {
                        writeTableHeader();
                    }
                }
                break;

            case "TOC Heading":
                write("## " + text + "\n\n");
                break;

            case "TOC": {
                // A TOC entry is tab separated; the first field is used as both the
                // section label and the link anchor, the second as the entry title.
                const strings = text.split("\t");
                write(indent(level * 2 - 2) + "* [" + strings[0] + " " + strings[1] + "](#" + strings[0] + ")\n");
                break;
            }
        }

        if (sectionBreak) {
            write("<br/>\n\n");
        }
        lastStyle = style;
        lastInTable = inTable;
    }

    // Writes the document title (built-in document property 1) followed by every paragraph.
    function writeDocument() {
        const title = doc.builtInDocumentProperties.item(1) + "";
        if (title.length) {
            write("# " + title + "\n\n");
        }
        for (let p = doc.paragraphs.first; p; p = p.next()) {
            writeParagraph(p);
        }
        writeBlockEnd();
    }

    // The output contains inline HTML, so escape "<" everywhere and then restore it in
    // the styles that are emitted as code (fenced blocks or backtick spans).
    findReplace("<", {}, "&lt;", {});
    findReplace("&lt;", { style: "Code" }, "<", {});
    findReplace("&lt;", { style: "Code Fragment" }, "<", {});
    findReplace("&lt;", { style: "Terminal" }, "<", {});
    // Turn character formatting into inline markup. "^&" stands for the matched text and
    // the replacement options remove the formatting so that a run is not matched twice.
    findReplace("", { font: { subscript: true } }, "<sub>^&</sub>", { font: { subscript: false } });
    findReplace("", { style: "Code Fragment" }, "`^&`", { style: -66 /* default font */ });
    findReplace("", { style: "Production" }, "*^&*", { style: -66 /* default font */ });
    findReplace("", { style: "Terminal" }, "`^&`", { style: -66 /* default font */ });
    findReplace("", { font: { bold: true, italic: true } }, "***^&***", { font: { bold: false, italic: false } });
    findReplace("", { font: { italic: true } }, "*^&*", { font: { italic: false } });

    // Field codes are only searchable while they are shown, so toggle them on,
    // rewrite REF fields as Markdown links, and toggle them back off.
    doc.fields.toggleShowCodes();
    findReplace("^19 REF", {}, "[^&](#^&)", {});
    doc.fields.toggleShowCodes();

    fixHyperlinks();

    writeDocument();

    // Map the code points 0x85, 0x96 and 0x97 to ellipsis, en dash and em dash.
    result = result.replace(/\x85/g, "\u2026");
    result = result.replace(/\x96/g, "\u2013");
    result = result.replace(/\x97/g, "\u2014");

    return result;
}

// Entry point. Expects exactly two arguments: the Word document to convert and the
// Markdown file to write. Prints a usage message otherwise.
function main(args: string[]) {
    if (args.length !== 2) {
        sys.write("Syntax: word2md <inputfile> <outputfile>\n");
        return;
    }
    const app: Word.Application = sys.createObject("Word.Application");
    try {
        const doc = app.documents.open(args[0]);
        try {
            sys.writeFile(args[1], convertDocumentToMarkdown(doc));
        }
        finally {
            // The conversion edits the document in place; never save those edits.
            doc.close(/* saveChanges */ false);
        }
    }
    finally {
        // Always shut Word down, even if the conversion fails, so that no
        // background Word process is left running.
        app.quit();
    }
}

main(sys.args);