• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// word2md - Word to Markdown conversion tool
2//
// word2md converts a Microsoft Word document to Markdown formatted text. The tool uses the
// Word Automation APIs to start an instance of Word and access the contents of the document
// being converted. The tool must be run using the cscript.exe script host and requires Word
// to be installed on the target machine. The name of the document to convert and the name of
// the Markdown file to write must be specified as command line arguments, in that order. The
// tool recognizes the specific Word styles used in the TypeScript Language Specification.
9
/**
 * Type declarations for the subset of the Word automation object model that this
 * script touches. Only the members actually used by the conversion are declared.
 */
namespace Word {
    /**
     * Generic automation collection. The callers in this file address items
     * starting at index 1 (for example `item(1)` and `item(i + 1)`).
     */
    export interface Collection<T> {
        count: number;
        item(index: number): T;
    }

    /** Character formatting flags that can be read or used as search criteria. */
    export interface Font {
        bold: boolean;
        italic: boolean;
        subscript: boolean;
        superscript: boolean;
    }

    /** Formatting and text applied to matches by a find-and-replace operation. */
    export interface Replacement {
        font: Font;
        style: any;
        text: string;
        clearFormatting(): void;
    }

    /** Search criteria and the operation that runs a find (and optional replace). */
    export interface Find {
        font: Font;
        format: boolean;
        replacement: Replacement;
        style: any;
        text: string;
        clearFormatting(): void;
        /** Runs the find operation; every argument is positional. */
        execute(
            findText: string,
            matchCase: boolean,
            matchWholeWord: boolean,
            matchWildcards: boolean,
            matchSoundsLike: boolean,
            matchAllWordForms: boolean,
            forward: boolean,
            wrap: number,
            format: boolean,
            replaceWith: string,
            replace: number
        ): boolean;
    }

    /** List numbering information attached to a range. */
    export interface ListFormat {
        listLevelNumber: number;
        listString: string;
    }

    /** A table column; no members are needed by this script. */
    export interface Column {}

    /** The columns of a table. */
    export interface Columns extends Collection<Column> {}

    /** A table; only its column collection is used. */
    export interface Table {
        columns: Columns;
    }

    /** The tables intersecting a range. */
    export interface Tables extends Collection<Table> {}

    /** Controls how `Range.text` is retrieved. */
    export interface TextRetrievalMode {
        includeHiddenText: boolean;
    }

    /** A contiguous span of document content. */
    export interface Range {
        find: Find;
        listFormat: ListFormat;
        tables: Tables;
        text: string;
        textRetrievalMode: TextRetrievalMode;
        words: Ranges;
    }

    /** A collection of ranges. */
    export interface Ranges extends Collection<Range> {}

    /** A named style; `nameLocal` is the name the conversion switches on. */
    export interface Style {
        nameLocal: string;
    }

    /** A single paragraph, linked to its successor through `next()`. */
    export interface Paragraph {
        alignment: number;
        range: Range;
        style: Style;
        next(): Paragraph;
    }

    /** The paragraphs of a document; iteration starts at `first`. */
    export interface Paragraphs extends Collection<Paragraph> {
        first: Paragraph;
    }

    /** A document field; no members are needed by this script. */
    export interface Field {}

    /** The fields of a document. */
    export interface Fields extends Collection<Field> {
        toggleShowCodes(): void;
    }

    /** A hyperlink with its target address and display text. */
    export interface Hyperlink {
        address: string;
        textToDisplay: string;
        range: Range;
    }

    /** The hyperlinks of a document. */
    export interface Hyperlinks extends Collection<Hyperlink> {}

    /** An open document. */
    export interface Document {
        fields: Fields;
        paragraphs: Paragraphs;
        hyperlinks: Hyperlinks;
        builtInDocumentProperties: Collection<any>;
        close(saveChanges: boolean): void;
        range(): Range;
    }

    /** The documents opened by an application instance. */
    export interface Documents extends Collection<Document> {
        open(filename: string): Document;
    }

    /** The application automation object. */
    export interface Application {
        documents: Documents;
        quit(): void;
    }
}
132
/**
 * Thin host layer over the Windows Script Host and ActiveX facilities the tool needs:
 * the command line arguments, COM object creation, console output and UTF-8 file output.
 */
const sys = (() => {
    // Both streams are created once and reopened for every writeFile call.
    const fileStream = new ActiveXObject("ADODB.Stream");
    fileStream.Type = 2 /* text */;
    const binaryStream = new ActiveXObject("ADODB.Stream");
    binaryStream.Type = 1 /* binary */;

    // Copy the script host argument collection into a plain array.
    const args: string[] = [];
    for (let i = 0; i < WScript.Arguments.length; i++) {
        args[i] = WScript.Arguments.Item(i);
    }

    /**
     * Writes `data` to `fileName` as UTF-8 without a byte order mark, overwriting any
     * existing file.
     *
     * Each stream is closed by its own `finally`, so a failure while opening the second
     * stream (or anywhere after) never leaves an already opened stream behind.
     */
    function writeFile(fileName: string, data: string): void {
        fileStream.Open();
        try {
            binaryStream.Open();
            try {
                // Write characters in UTF-8 encoding
                fileStream.Charset = "utf-8";
                fileStream.WriteText(data);
                // We don't want the BOM, skip it by setting the starting location to 3 (size of BOM).
                fileStream.Position = 3;
                fileStream.CopyTo(binaryStream);
                binaryStream.SaveToFile(fileName, 2 /*overwrite*/);
            }
            finally {
                binaryStream.Close();
            }
        }
        finally {
            fileStream.Close();
        }
    }

    return {
        args,
        /** Creates the automation object registered under `typeName`. */
        createObject: (typeName: string) => new ActiveXObject(typeName),
        /** Writes `s` to standard output. */
        write(s: string): void {
            WScript.StdOut.Write(s);
        },
        writeFile
    };
})();
167
/** Font flags that can be matched by a find or applied by a replacement. */
interface FindReplaceFontOptions {
    bold?: boolean;
    italic?: boolean;
    subscript?: boolean;
}

/**
 * Optional formatting settings copied onto a Word find or replacement object before a
 * find-and-replace runs. An omitted member is not copied.
 */
interface FindReplaceOptions {
    /** Style to match or apply; either a style name or a built-in style number. */
    style?: any;
    /** Character formatting to match or apply. */
    font?: FindReplaceFontOptions;
}
176
/**
 * Converts an open Word document to Markdown text.
 *
 * The conversion runs in two phases. First the document itself is rewritten in place with
 * a series of find-and-replace operations that turn character formatting, field
 * references and hyperlinks into Markdown markup. Then every paragraph is visited in
 * order and emitted according to its style name.
 *
 * Note that the document object is modified by the first phase; the caller decides
 * whether those changes are saved.
 *
 * @param doc The document to convert.
 * @returns The Markdown text for the whole document.
 */
function convertDocumentToMarkdown(doc: Word.Document): string {

    // Numeric values of the Word object model enumeration members used below.
    const wdFindStop = 0;                    // WdFindWrap
    const wdReplaceAll = 2;                  // WdReplace
    const wdStyleDefaultParagraphFont = -66; // WdBuiltinStyle
    const wdAlignParagraphCenter = 1;        // WdParagraphAlignment
    const wdAlignParagraphRight = 2;         // WdParagraphAlignment
    const wdPropertyTitle = 1;               // WdBuiltInProperty

    // Alignment of each column, captured while the first row of a table is written.
    const columnAlignment: number[] = [];
    // Cells per row plus one: every row also yields one end-of-row paragraph.
    let tableColumnCount = 0;
    let tableCellIndex = 0;
    let lastInTable = false;
    // Normalized style of the previously written paragraph ("" before the first one).
    let lastStyle = "";
    let result = "";

    /** Recursively copies the own properties of `properties` onto `target`. */
    function setProperties(target: any, properties: any) {
        for (const name in properties) {
            if (!Object.prototype.hasOwnProperty.call(properties, name)) {
                continue;
            }
            const value = properties[name];
            if (typeof value === "object") {
                // Nested settings (such as `font`) are applied to the existing sub-object.
                setProperties(target[name], value);
            }
            else {
                target[name] = value;
            }
        }
    }

    /**
     * Replaces every occurrence of `findText` matching `findOptions` in the whole
     * document with `replaceText`, applying `replaceOptions` to the replacement.
     */
    function findReplace(findText: string, findOptions: FindReplaceOptions, replaceText: string, replaceOptions: FindReplaceOptions) {
        const find = doc.range().find;
        find.clearFormatting();
        setProperties(find, findOptions);
        const replacement = find.replacement;
        replacement.clearFormatting();
        setProperties(replacement, replaceOptions);
        find.execute(findText,
            /* matchCase */ false,
            /* matchWholeWord */ false,
            /* matchWildcards */ false,
            /* matchSoundsLike */ false,
            /* matchAllWordForms */ false,
            /* forward */ true,
            /* wrap */ wdFindStop,
            /* format */ true,
            replaceText,
            /* replace */ wdReplaceAll
        );
    }

    /** Rewrites the display text of every hyperlink that has an address as a Markdown link. */
    function fixHyperlinks() {
        const links = doc.hyperlinks;
        const count = links.count;
        for (let index = 1; index <= count; index++) {
            const hyperlink = links.item(index);
            const address = hyperlink.address;
            if (address && address.length > 0) {
                hyperlink.textToDisplay = "[" + hyperlink.textToDisplay + "](" + address + ")";
            }
        }
    }

    /** Appends `s` to the Markdown output. */
    function write(s: string) {
        result += s;
    }

    /** Returns the indentation for a nested list item: two spaces per level below the first. */
    function indent(level: number): string {
        let spaces = "";
        for (let i = 1; i < level; i++) {
            spaces += "  ";
        }
        return spaces;
    }

    /** Writes the Markdown separator row that follows the first row of a table. */
    function writeTableHeader() {
        for (let i = 0; i < tableColumnCount - 1; i++) {
            switch (columnAlignment[i]) {
                case wdAlignParagraphCenter:
                    write("|:---:");
                    break;
                case wdAlignParagraphRight:
                    write("|---:");
                    break;
                default:
                    write("|---");
            }
        }
        write("|\n");
    }

    /** Removes trailing control characters (character codes below 0x20) from `text`. */
    function trimEndFormattingMarks(text: string) {
        let end = text.length;
        while (end > 0 && text.charCodeAt(end - 1) < 0x20) {
            end--;
        }
        return text.substr(0, end);
    }

    /** Closes the block opened by the previous paragraph's style, if that style needs it. */
    function writeBlockEnd() {
        switch (lastStyle) {
            case "Code":
                write("```\n\n");
                break;
            case "List Paragraph":
            case "Table":
            case "TOC":
                write("\n");
                break;
        }
    }

    /**
     * Returns the text of a paragraph without trailing formatting marks.
     *
     * An inline image shows up in the text as a "/". When a paragraph consists of
     * nothing but "/", the paragraph is re-read with hidden text included and, if that
     * reveals additional text, the hidden text is emitted instead. The hidden text is
     * assumed to contain an appropriate markdown image link.
     */
    function getParagraphText(p: Word.Paragraph): string {
        const range = p.range;
        const text = trimEndFormattingMarks(range.text);
        if (text !== "/") {
            return text;
        }
        range.textRetrievalMode.includeHiddenText = true;
        // Trim before comparing: the untrimmed text always differs from "/" because it
        // still carries the end-of-paragraph mark, which would also leak into the output.
        const fullText = trimEndFormattingMarks(range.text);
        range.textRetrievalMode.includeHiddenText = false;
        return fullText !== text ? "&emsp;&emsp;" + fullText.substr(1) : text;
    }

    /** Writes one table paragraph (a cell or the end-of-row mark) and finishes rows. */
    function writeTableCell(p: Word.Paragraph, text: string) {
        if (!lastInTable) {
            // First paragraph of a new table: start counting cells from scratch.
            tableColumnCount = p.range.tables.item(1).columns.count + 1;
            tableCellIndex = 0;
        }
        if (tableCellIndex < tableColumnCount) {
            // Still in the first row: remember each cell's alignment for the separator row.
            columnAlignment[tableCellIndex] = p.alignment;
        }
        write("|" + text);
        tableCellIndex++;
        if (tableCellIndex % tableColumnCount === 0) {
            write("\n");
            if (tableCellIndex === tableColumnCount) {
                writeTableHeader();
            }
        }
    }

    /** Writes the Markdown for a single paragraph according to its style. */
    function writeParagraph(p: Word.Paragraph) {

        const range = p.range;
        const inTable = range.tables.count > 0;
        const sectionBreak = range.text.indexOf("\x0C") >= 0;
        const text = getParagraphText(p);

        // A style name such as "Heading 2" is split into its base name and level.
        let level = 1;
        let style = p.style.nameLocal;
        if (inTable) {
            style = "Table";
        }
        else if (style.match(/\s\d$/)) {
            level = +style.substr(style.length - 1);
            style = style.substr(0, style.length - 2);
        }
        if (lastStyle && style !== lastStyle) {
            writeBlockEnd();
        }

        switch (style) {

            case "Heading":
            case "Appendix": {
                const section = range.listFormat.listString;
                // Markdown supports six heading levels; deeper styles are clamped to six.
                write("######".substr(0, level) + ' <a name="' + section + '"/>' + section + " " + text + "\n\n");
                break;
            }

            case "Normal":
                if (text.length) {
                    write(text + "\n\n");
                }
                break;

            case "List Paragraph":
                write(indent(range.listFormat.listLevelNumber) + "* " + text + "\n");
                break;

            case "Grammar":
                write("&emsp;&emsp;" + text.replace(/\s\s\s/g, "&emsp;").replace(/\x0B/g, "  \n&emsp;&emsp;&emsp;") + "\n\n");
                break;

            case "Code":
                if (lastStyle !== "Code") {
                    write("```TypeScript\n");
                }
                else {
                    write("\n");
                }
                write(text.replace(/\x0B/g, "  \n") + "\n");
                break;

            case "Table":
                writeTableCell(p, text);
                break;

            case "TOC Heading":
                write("## " + text + "\n\n");
                break;

            case "TOC": {
                // Tab separated entry: the first field is the link target, the second the title.
                const fields = text.split("\t");
                const target = fields[0];
                // An entry without a tab has no title; use the first field alone.
                const label = fields.length > 1 ? target + " " + fields[1] : target;
                write(indent(level) + "* [" + label + "](#" + target + ")\n");
                break;
            }
        }

        if (sectionBreak) {
            write("<br/>\n\n");
        }
        lastStyle = style;
        lastInTable = inTable;
    }

    /** Writes the document title followed by every paragraph of the document. */
    function writeDocument() {
        const title = doc.builtInDocumentProperties.item(wdPropertyTitle) + "";
        if (title.length) {
            write("# " + title + "\n\n");
        }
        for (let p = doc.paragraphs.first; p; p = p.next()) {
            writeParagraph(p);
        }
        writeBlockEnd();
    }

    // Escape every "<", then restore it inside the code styles where it must stay literal.
    findReplace("<", {}, "&lt;", {});
    findReplace("&lt;", { style: "Code" }, "<", {});
    findReplace("&lt;", { style: "Code Fragment" }, "<", {});
    findReplace("&lt;", { style: "Terminal" }, "<", {});

    // Wrap formatted runs in markup and clear the formatting so they are not matched again.
    // In the replacement text "^&" stands for the text that was found.
    findReplace("", { font: { subscript: true } }, "<sub>^&</sub>", { font: { subscript: false } });
    findReplace("", { style: "Code Fragment" }, "`^&`", { style: wdStyleDefaultParagraphFont });
    findReplace("", { style: "Production" }, "*^&*", { style: wdStyleDefaultParagraphFont });
    findReplace("", { style: "Terminal" }, "`^&`", { style: wdStyleDefaultParagraphFont });
    findReplace("", { font: { bold: true, italic: true } }, "***^&***", { font: { bold: false, italic: false } });
    findReplace("", { font: { italic: true } }, "*^&*", { font: { italic: false } });

    // Field codes are shown only while the REF fields are rewritten.
    doc.fields.toggleShowCodes();
    findReplace("^19 REF", {}, "[^&](#^&)", {});
    doc.fields.toggleShowCodes();

    fixHyperlinks();

    writeDocument();

    // Map leftover single-byte punctuation codes to their Unicode characters.
    result = result.replace(/\x85/g, "\u2026");
    result = result.replace(/\x96/g, "\u2013");
    result = result.replace(/\x97/g, "\u2014");

    return result;
}
410
/**
 * Entry point: converts the Word document named by the first argument and writes the
 * resulting Markdown to the file named by the second argument.
 *
 * The document is always closed without saving and the Word instance is always shut
 * down, even when opening, converting or writing fails, so that a failed run does not
 * leave a Word process behind. Any error is rethrown after the cleanup.
 *
 * @param args The command line arguments: `<inputfile> <outputfile>`.
 */
function main(args: string[]) {
    if (args.length !== 2) {
        sys.write("Syntax: word2md <inputfile> <outputfile>\n");
        return;
    }
    const app: Word.Application = sys.createObject("Word.Application");
    try {
        const doc = app.documents.open(args[0]);
        try {
            sys.writeFile(args[1], convertDocumentToMarkdown(doc));
        }
        finally {
            // The conversion rewrites the document in place; never persist those edits.
            doc.close(/* saveChanges */ false);
        }
    }
    finally {
        app.quit();
    }
}
422
423main(sys.args);
424