import { execSync } from "child_process";
import * as matter from "gray-matter";
import * as fs from "fs";
import * as path from "path";

// Note: this must be executed within the docs-website directory.

// Constants.
const HOSTED_SITE_URL = "https://datahubproject.io";
const GITHUB_EDIT_URL = "https://github.com/linkedin/datahub/blob/master";
const GITHUB_BROWSE_URL = "https://github.com/linkedin/datahub/blob/master";

const SIDEBARS_DEF_PATH = "./sidebars.js";
const sidebars = require(SIDEBARS_DEF_PATH);
// Serialized once up front so that sidebar membership checks are cheap.
const sidebars_json = JSON.stringify(sidebars);

// Extensions of local links that are rewritten to point at the file on GitHub.
// Matching uses startsWith so that anchors after the extension are tolerated.
const GITHUB_LINKED_EXTENSIONS = [
  ".java",
  ".conf",
  ".xml",
  ".pdl",
  ".json",
  ".py",
  ".ts",
  ".yml",
  ".sh",
  ".env",
  ".sql",
];

// Extensions of local links that are rewritten to relative static-asset paths.
const STATIC_ASSET_EXTENSIONS = [".png", ".svg", ".gif", ".pdf"];

// A markdown document parsed by gray-matter from a string input.
type MarkdownFile = matter.GrayMatterFile<string>;

/**
 * Reports whether the given markdown file is referenced by the sidebar
 * definition.
 *
 * The lookup searches the serialized sidebar for the quoted doc id (the file
 * path without its extension).
 *
 * @param filepath Path of the markdown file, relative to the repo root.
 * @returns true when the doc id appears in the sidebar definition.
 */
function actually_in_sidebar(filepath: string): boolean {
  const doc_id = get_id(filepath);
  return sidebars_json.includes(`"${doc_id}"`);
}

/**
 * Lists the git-tracked markdown files of the repository (one directory up)
 * that should be turned into hosted docs.
 *
 * @returns Paths relative to the repo root, with excluded paths removed.
 */
function list_markdown_files(): string[] {
  // The dot is escaped so that only a literal ".md" suffix matches.
  const all_markdown_files = execSync(
    "cd .. && git ls-files . | grep '\\.md$'"
  )
    .toString()
    .trim()
    .split("\n");

  const filter_patterns = [
    // We don't need our issue and pull request templates.
    /^\.github\//,
    // Ignore everything within this directory.
    /^docs-website\//,
    // Don't want hosted docs for these.
    /^contrib\/(?!kubernetes\/README\.md)/, // Keeps the main Kubernetes docs.
    /^datahub-web\//,
    /^metadata-ingestion-examples\//,
    /^docs\/rfc\/templates\/000-template\.md$/,
    /^docs\/docker\/README\.md/, // This one is just a pointer to another file.
  ];

  return all_markdown_files.filter(
    (filepath) => !filter_patterns.some((rule) => rule.test(filepath))
  );
}

const markdown_files = list_markdown_files();
// console.log(markdown_files);

/**
 * Derives the doc id of a file by removing its file extension (e.g. md).
 *
 * @param filepath Path of the markdown file, relative to the repo root.
 * @returns The path without its trailing extension.
 */
function get_id(filepath: string): string {
  const id = filepath.replace(/\.[^/.]+$/, "");
  // console.log(id);
  return id;
}

// Slugs that cannot be derived from the file path.
const hardcoded_slugs: Record<string, string> = {
  "README.md": "/",
  "docs/README.md": "docs/overview",
};

/**
 * Computes the URL slug for a markdown file.
 *
 * Unless the path has a hardcoded slug, the slug is the doc id with a leading
 * "docs/" removed, a leading slash added, a trailing "/README" removed, and
 * everything lower-cased.
 *
 * @param filepath Path of the markdown file, relative to the repo root.
 * @returns The slug for the file.
 */
function get_slug(filepath: string): string {
  if (filepath in hardcoded_slugs) {
    return hardcoded_slugs[filepath];
  }

  let slug = get_id(filepath);
  if (slug.startsWith("docs/")) {
    slug = slug.slice("docs/".length);
  }
  slug = `/${slug}`;
  if (slug.endsWith("/README")) {
    slug = slug.slice(0, -"/README".length);
  }
  return slug.toLowerCase();
}

// Titles that cannot be derived from the first h1 header.
const hardcoded_titles: Record<string, string> = {
  "README.md": "Introduction",
  "docs/demo.md": "Demo",
};

/**
 * Sets `title` and `hide_title` in the front matter unless a title is already
 * present.
 *
 * The title is taken from the hardcoded table or, failing that, from the first
 * h1 header, with a leading "DataHub " removed.
 *
 * @param contents Parsed markdown file; its front matter data is mutated.
 * @param filepath Path of the markdown file, used for lookups and messages.
 * @throws Error when the file has no h1 header, or has several h1 headers and
 *   contains no code fence.
 */
function markdown_guess_title(contents: MarkdownFile, filepath: string): void {
  if (contents.data.title) {
    return;
  }

  let title: string;
  if (filepath in hardcoded_titles) {
    title = hardcoded_titles[filepath];
  } else {
    // Find first h1 header and use it as the title.
    const headers = contents.content.match(/^# (.+)$/gm);
    if (headers === null) {
      // match() returns null when nothing matched; fail with a useful message
      // instead of a TypeError on `headers.length`.
      throw new Error(`no h1 header found in ${filepath}`);
    }
    // Files containing a code fence are exempt from the single-h1 rule,
    // presumably because "# ..." lines inside code blocks also match.
    if (headers.length > 1 && !contents.content.includes("```")) {
      throw new Error(`too many h1 headers in ${filepath}`);
    }
    title = headers[0].slice(2).trim();
    if (title.startsWith("DataHub ")) {
      title = title.slice("DataHub ".length).trim();
    }
  }

  contents.data.title = title;
  contents.data.hide_title = true;
}

/**
 * Sets `custom_edit_url` in the front matter to the file's location under
 * GITHUB_EDIT_URL.
 *
 * @param contents Parsed markdown file; its front matter data is mutated.
 * @param filepath Path of the markdown file, relative to the repo root.
 */
function markdown_add_edit_url(contents: MarkdownFile, filepath: string): void {
  contents.data.custom_edit_url = `${GITHUB_EDIT_URL}/${filepath}`;
}

/**
 * Sets `slug` in the front matter unless one is already present.
 *
 * @param contents Parsed markdown file; its front matter data is mutated.
 * @param filepath Path of the markdown file, relative to the repo root.
 */
function markdown_add_slug(contents: MarkdownFile, filepath: string): void {
  if (contents.data.slug) {
    return;
  }
  contents.data.slug = get_slug(filepath);
}

/**
 * Rewrites a single link target found in a markdown file.
 *
 * @param original The link target as written in the markdown source.
 * @param filepath Path of the markdown file containing the link.
 * @returns The rewritten link target.
 * @throws Error for absolute GitHub links to .md/.pdf files in this repo, for
 *   dangling repo links in docs that are in the sidebar, and for local links
 *   with an unrecognized extension.
 */
function new_url(original: string, filepath: string): string {
  if (original.toLowerCase().startsWith(HOSTED_SITE_URL)) {
    // For absolute links to the hosted docs site, we transform them into local ones.
    // Note that HOSTED_SITE_URL does not have a trailing slash, so after the
    // prefix is removed, the url will start with a slash.
    // The prefix is sliced off rather than string-replaced because the check
    // above is case-insensitive and a case-sensitive replace could miss it.
    return original.slice(HOSTED_SITE_URL.length);
  }

  if (original.startsWith("http://") || original.startsWith("https://")) {
    const lowered = original.toLowerCase();
    const points_into_repo =
      lowered.startsWith("https://github.com/linkedin/datahub/blob") ||
      lowered.startsWith("https://github.com/linkedin/datahub/tree");
    if (
      points_into_repo &&
      (original.endsWith(".md") || original.endsWith(".pdf"))
    ) {
      throw new Error(`absolute link (${original}) found in ${filepath}`);
    }
    return original;
  }

  if (original.startsWith("#")) {
    // These are anchor links that reference within the document itself.
    return original;
  }

  // Now we assume this is a local reference.
  const suffix = path.extname(original);
  if (
    suffix === "" ||
    GITHUB_LINKED_EXTENSIONS.some((ext) => suffix.startsWith(ext))
  ) {
    // A reference to a file or directory in the Github repo.
    const relation = path.dirname(filepath);
    const updated_path = path.normalize(`${relation}/${original}`);
    if (!fs.existsSync(`../${updated_path}`) && actually_in_sidebar(filepath)) {
      // Detects when the path is a dangling reference, according to the locally
      // checked out repo.
      throw new Error(
        `broken github repo link: ${updated_path} in ${filepath}`
      );
    }
    return `${GITHUB_BROWSE_URL}/${updated_path}`;
  } else if (suffix.startsWith(".md")) {
    // Leave as-is.
    // We use startsWith above so that we can allow anchor tags on links.
    return original;
  } else if (STATIC_ASSET_EXTENSIONS.includes(suffix)) {
    // Let docusaurus bundle these as static assets.
    // Climb one level per directory separator in the file path, plus two more,
    // then descend to the asset relative to the repo root.
    const up_levels = (filepath.match(/\//g) ?? []).length;
    const relation = path.dirname(filepath);
    return path.normalize(
      `${"../".repeat(up_levels + 2)}/${relation}/${original}`
    );
  } else {
    throw new Error(`unknown extension - ${original} in ${filepath}`);
  }
}

/**
 * Rewrites every inline link/image and every reference-style link definition
 * in the document through new_url.
 *
 * @param contents Parsed markdown file; its content is replaced.
 * @param filepath Path of the markdown file, relative to the repo root.
 */
function markdown_rewrite_urls(contents: MarkdownFile, filepath: string): void {
  const new_content = contents.content
    .replace(
      // Look for the [text](url) syntax. Note that this will also capture images.
      //
      // We do a little bit of parenthesis matching here to account for parens in URLs.
      // See https://stackoverflow.com/a/17759264 for explanation of the second capture group.
      /\[(.+?)\]\(((?:[^)(]+|\((?:[^)(]+|\([^)(]*\))*\))*)\)/g,
      (_, text, url) => {
        const updated = new_url(url.trim(), filepath);
        return `[${text}](${updated})`;
      }
    )
    .replace(
      // Also look for the [text]: url syntax.
      /^\[(.+?)\]\s*:\s*(.+?)\s*$/gm,
      (_, text, url) => {
        const updated = new_url(url, filepath);
        return `[${text}]: ${updated}`;
      }
    );
  contents.content = new_content;
}

/**
 * Applies the "specials" replacement to the document content.
 *
 * NOTE(review): as written, the pattern matches only empty lines and replaces
 * them with an empty string, so the content is effectively unchanged. The
 * pattern may be incomplete - confirm the intended marker text.
 *
 * @param contents Parsed markdown file; its content is reassigned.
 * @param filepath Path of the markdown file (currently unused).
 */
function markdown_enable_specials(
  contents: MarkdownFile,
  filepath: string
): void {
  const new_content = contents.content.replace(/^$/gm, "");
  contents.content = new_content;
}

// Generate the processed copy of every markdown file under genDocs/.
for (const filepath of markdown_files) {
  // console.log("Processing:", filepath);
  const contents_string = fs.readFileSync(`../${filepath}`).toString();
  const contents = matter(contents_string);

  markdown_guess_title(contents, filepath);
  markdown_add_slug(contents, filepath);
  markdown_add_edit_url(contents, filepath);
  markdown_rewrite_urls(contents, filepath);
  markdown_enable_specials(contents, filepath);
  // console.log(contents);

  const outpath = `genDocs/${filepath}`;
  const pathname = path.dirname(outpath);
  fs.mkdirSync(pathname, { recursive: true });
  fs.writeFileSync(outpath, contents.stringify(""));
}

// Verify that every doc is accounted for in a sidebar.
// Read the raw sidebar definition and fail on the first markdown file whose
// quoted doc id does not appear anywhere in it.
const sidebar = fs.readFileSync(SIDEBARS_DEF_PATH).toString();
const unlisted_doc = markdown_files.find(
  (filepath: string) => !sidebar.includes(`"${get_id(filepath)}"`)
);
if (unlisted_doc !== undefined) {
  throw new Error(`File not accounted for in sidebar ${unlisted_doc}`);
}