// extract site frontmatter and read from /sitemap.tsv, save to json

import * as fs from "node:fs/promises";
import * as path from "node:path";
import { default as process } from "node:process";
import { default as matter } from "gray-matter";
import { SitemapStream, streamToPromise } from "sitemap";
import { Readable } from "node:stream";
import { Dirent } from "node:fs";

// utilities and constants

// files to skip
const SKIP_THESE = /(node_modules|\.jekyll-cache|^sitemap.*)/;

// final URL of site
const SITE = "https://cldr.unicode.org";

// input file
const SITEMAPFILE = "sitemap.tsv";

// utility collator
const coll = new Intl.Collator(["und"]);

/**
 * Directory Crawler: process one file
 * @param {string} d parent directory
 * @param {string} fullPath path to this file
 * @param {object} out output object
 */
async function processFile(d, fullPath, out) {
  const f = await fs.readFile(fullPath, "utf-8");
  const m = matter(f);
  fullPath = fullPath.replace(/\\/g, "/"); // replace backslashes with slashes, for Windows
  if (m && m.data) {
    const { data } = m;
    out.all.push({ ...data, fullPath });
  } else {
    out.all.push({ fullPath }); // no frontmatter; synthesize data?
  }
}

/**
 * Directory Crawler: process one dirent
 * @param {string} d parent directory
 * @param {object} out output object
 * @param {Dirent} e directory entry
 */
async function processEntry(d, out, e) {
  const fullpath = path.join(d, e.name);
  if (SKIP_THESE.test(e.name)) return;
  if (e.isDirectory()) {
    return await traverse(fullpath, out);
  } else if (!e.isFile() || !/\.md$/.test(e.name)) {
    return;
  }
  await processFile(d, fullpath, out);
}

/**
 * Directory Crawler: kick off the crawl (or subcrawl) of a directory
 * @param {string} d path to directory
 * @param {object} out output struct
 */
async function traverse(d, out) {
  const dirents = await fs.readdir(d, { withFileTypes: true });
  const promises = dirents.map((e) => processEntry(d, out, e));
  return Promise.all(promises);
}

/** replace a/b/c.md with a/b */
function path2dir(p) {
  const dir = p.split("/").slice(0, -1).join("/");
  return dir;
}

/** replace a/b/c.md with a/b/c.html */
function md2html(p) {
  return p.replace(/\.md$/, ".html");
}

/** replace a/b/c.html with a/b/c.md */
function html2md(p) {
  return p.replace(/\.html$/, ".md");
}

/** replace a/b/c.md with a/b/c */
function dropmd(p) {
  return p.replace(/\.md$/, "");
}

/**
 * @param {number} n
 * @returns string with n tabs
 */
function tabs(n) {
  let s = [];
  for (let i = 0; i < n; i++) {
    s.push("\t");
  }
  return s.join("");
}

/** convert a markdown path to a final URL */
function mkurl(p) {
  return `${SITE}/${md2html(p)}`;
}

async function writeXmlSiteMap(out) {
  // simple list of links
  const links = await Promise.all(
    out.all.map(async ({ fullPath, title }) => {
      const stat = await fs.stat(fullPath);
      return {
        url: dropmd(`/${fullPath}`),
        lastmod: stat.mtime.toISOString(),
      };
    })
  );
  const stream = new SitemapStream({ hostname: SITE });
  const data = (
    await streamToPromise(Readable.from(links).pipe(stream))
  ).toString();
  await fs.writeFile("./sitemap.xml", data, "utf-8");
  console.log(`Wrote sitemap.xml with ${links.length} entries`);
}

async function readTsvSiteMap(out) {
  console.log(`Reading ${SITEMAPFILE}`);
  const lines = (await fs.readFile(SITEMAPFILE, "utf-8")).split("\n"); // don't skip comment lines here so we can get line numbers.
  const errors = [];

  // user's specified map
  const usermap = {
    /*
    index: {
      parent: null,
      title: 'CLDR Site',
      children: [
        'cldr-spec',
        'downloads',
        …
      ],
    },
    'cldr-spec': {
      parent: 'index',
      title: …,
      children: [
        'cldr-spec/collation-guidelines',
        …
      ],
    },
    'cldr-spec/collation-guidelines': {
      parent: 'cldr-spec',
      title: …,
      children: null,
    },
    */
  };
  // stack of parents, in order
  let parents = [];
  let n = 0;
  for (let line of lines) {
    n++;
    const location = `${SITEMAPFILE}:${n}: `; // for errors
    // skip comment or blank lines
    if (/^[ \t]*#/.test(line) || !line.trim()) continue;

    // # of leading tabs
    const tabs = /^[\t]*/.exec(line)[0].length;
    // rest of line: the actual path
    const path = line.slice(tabs).trim();
    if (usermap[path]) {
      errors.push(`${location} duplicate path: ${path}`);
      continue;
    }
    const foundItem = out.all.find(({ fullPath }) => fullPath === `${path}.md`);
    if (!foundItem) {
      errors.push(`${location} could not find file: ${path}.md`);
      continue;
    }
    if (!foundItem.title) {
      errors.push(`${location} missing title in ${path}.md`);
      // let this continue on
    }
    usermap[path] = {
      title: foundItem.title ?? path,
    };
    const parentCount = parents.length;
    if (tabs < parentCount) {
      /**
       * index [1]
       *   foo [2]
       */
      // outdent
      if (tabs == 0) {
        errors.push(`${location} can't have more than one root page!`);
        break;
      }
      // drop 'n' parents
      parents = parents.slice(0, tabs);
    } else if (tabs > parentCount) {
      // Error - wrong indent
      errors.push(
        `${location} indent too deep (expected ${parentCount} tabs at most)`
      );
      continue;
    }
    const parent = parents.slice(-1)[0] || null; // calculate parent (null for index page)
    usermap[path].parent = parent;
    if (parent) {
      // not for index
      usermap[parent].children = usermap[parent].children ?? [];
      usermap[parent].children.push(path);
    }
    parents.push(path); // for next time
  }
  out.usermap = usermap;
  out.all.forEach(({ fullPath }) => {
    if (!usermap[dropmd(fullPath)]) {
      errors.push(`${SITEMAPFILE}: missing: ${dropmd(fullPath)}`);
    }
  });
  if (errors.length) {
    errors.forEach((l) => console.error(l));
    throw Error(`${errors.length} errors reading tsv`);
  } else {
    console.log(`${SITEMAPFILE} Valid.`);
  }
}

/** top level async */
async function main() {
  const out = {
    all: [],
  };
  await fs.mkdir("assets/json/", { recursive: true });
  await traverse(".", out);
  await writeXmlSiteMap(out);
  await readTsvSiteMap(out);
  // write final json asset
  delete out.all; // not needed at this phase, so trim out of the deploy
  await fs.writeFile("assets/json/tree.json", JSON.stringify(out, null, " "));
  console.log("Wrote assets/json/tree.json");
}

main().then(
  () => console.log("Done."),
  (e) => {
    console.error(e);
    process.exitCode = 1;
  }
);
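
// For reference: a minimal sketch of the sitemap.tsv layout that
// readTsvSiteMap() expects, based on the parsing logic above. One path per
// line (relative to the repo root, without the ".md" extension), each child
// indented one tab deeper than its parent, and "#" starting a comment line.
// The paths below are illustrative only, echoing the usermap example above;
// they are not necessarily the real contents of the repository's sitemap.tsv:
//
//   # tab-indented page tree; exactly one root page
//   index
//   <TAB>cldr-spec
//   <TAB><TAB>cldr-spec/collation-guidelines
//   <TAB>downloads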