datahub/docs-website/generateDocsDir.ts

import { execSync } from "child_process";
import * as matter from "gray-matter";
import * as fs from "fs";
import * as path from "path";

// Note: this must be executed within the docs-website directory.

// Constants.
// Base URL of the hosted docs site. Absolute links that start with it are
// turned into site-local links by new_url.
const HOSTED_SITE_URL = "https://datahubproject.io";
// Prefix of the custom_edit_url front-matter value written for every page.
const GITHUB_EDIT_URL = "https://github.com/linkedin/datahub/blob/master";
// Prefix used by new_url when a relative link points at a non-doc file or a
// directory inside the repository.
const GITHUB_BROWSE_URL = "https://github.com/linkedin/datahub/blob/master";

// Location of the sidebar definition. It is loaded as a module here and is
// also re-read as raw text by the sidebar check at the end of this script.
const SIDEBARS_DEF_PATH = "./sidebars.js";
const sidebars = require(SIDEBARS_DEF_PATH);

/**
 * Reports whether a markdown file is referenced by the sidebar definition.
 *
 * The check is a plain substring search for the file's slug inside the
 * JSON-serialized `sidebars` object.
 *
 * NOTE(review): the sidebar check at the end of this script matches on the
 * quoted doc id (`get_id`) rather than the slug; confirm which key is the
 * intended one here.
 *
 * @param filepath - Markdown file path, as produced by `list_markdown_files`.
 * @returns `true` when the slug occurs in the serialized sidebar, otherwise `false`.
 */
function actually_in_sidebar(filepath: string): boolean {
  const slug = get_slug(filepath);
  const serialized_sidebars = JSON.stringify(sidebars);
  // A hit means the file IS in the sidebar. The previous `indexOf(slug) < 0`
  // comparison returned the opposite of what the function name promises.
  return serialized_sidebars.includes(slug);
}

/**
 * Lists the git-tracked markdown files of the parent directory that should be
 * turned into hosted docs.
 *
 * Runs `git ls-files` from the parent directory (this script is expected to be
 * executed from within docs-website), keeps the paths ending in `.md`, and
 * then drops every path matched by one of the exclusion patterns below.
 *
 * @returns File paths relative to the parent directory.
 */
function list_markdown_files(): string[] {
  // The dot is escaped so that only a literal ".md" extension matches; an
  // unescaped "." would also accept names such as "run.cmd".
  const all_markdown_files = execSync("cd .. && git ls-files . | grep '\\.md$'")
    .toString()
    .trim()
    .split("\n")
    // Guard against blank lines so an empty string is never treated as a path.
    .filter((line) => line.length > 0);

  const filter_patterns = [
    // We don't need our issue and pull request templates.
    /^\.github\//,
    // Ignore everything within this directory.
    /^docs-website\//,
    // Don't want hosted docs for these.
    /^contrib\/(?!kubernetes\/README\.md)/, // Keeps the main Kubernetes docs.
    /^datahub-web\//,
    /^metadata-ingestion-examples\//,
    /^docs\/rfc\/templates\/000-template\.md$/,
    /^docs\/docker\/README\.md/, // This one is just a pointer to another file.
  ];

  // Keep a file only when none of the exclusion patterns match it.
  return all_markdown_files.filter(
    (filepath) => !filter_patterns.some((rule) => rule.test(filepath))
  );
}

// Computed once at load time; both the generation loop and the final sidebar
// check below iterate over this list.
const markdown_files = list_markdown_files();
// console.log(markdown_files);

/**
 * Derives a doc id from a file path by dropping its trailing extension
 * (for example ".md"). Paths without an extension are returned unchanged.
 */
function get_id(filepath: string): string {
  // A dot followed by characters that are neither "/" nor ".", at the very end.
  const trailing_extension = /\.[^/.]+$/;
  return filepath.replace(trailing_extension, "");
}

// Slugs keyed by file path. get_slug returns these verbatim instead of
// deriving a slug from the path.
const hardcoded_slugs = {
  "README.md": "/",
  "docs/README.md": "docs/overview",
};

/**
 * Computes the URL slug for a markdown file.
 *
 * Files listed in `hardcoded_slugs` use the slug given there. Otherwise the
 * slug is the doc id with a leading "docs/" removed, prefixed with "/",
 * stripped of a trailing "/README", and lowercased.
 */
function get_slug(filepath: string): string {
  if (filepath in hardcoded_slugs) {
    return hardcoded_slugs[filepath];
  }

  const docs_prefix = "docs/";
  const readme_suffix = "/README";

  const id = get_id(filepath);
  const without_docs_dir = id.startsWith(docs_prefix)
    ? id.slice(docs_prefix.length)
    : id;
  const rooted = `/${without_docs_dir}`;
  // A directory's README is served at the directory's own path.
  const without_readme = rooted.endsWith(readme_suffix)
    ? rooted.slice(0, -readme_suffix.length)
    : rooted;
  return without_readme.toLowerCase();
}

// Titles keyed by file path. For these files the title below is used instead
// of the first h1 header of the document.
const hardcoded_titles: Record<string, string> = {
  "README.md": "Introduction",
  "docs/demo.md": "Demo",
};

/**
 * Fills in the `title` front-matter field of a document when it has none.
 *
 * The title comes from `hardcoded_titles` when the file is listed there,
 * otherwise from the first h1 header in the content, with a leading
 * "DataHub " removed. Whenever a title is filled in, `hide_title` is set to
 * true as well. Documents that already carry a title are left untouched.
 *
 * @param contents - Parsed front-matter document; mutated in place.
 * @param filepath - Path of the source file, used for lookups and error messages.
 * @throws Error when no h1 header exists, or when several h1 headers exist
 *   and the content contains no code fence.
 */
function markdown_guess_title(
  contents: matter.GrayMatterFile<string>,
  filepath: string
): void {
  if (contents.data.title) {
    return;
  }

  let title: string;
  if (filepath in hardcoded_titles) {
    title = hardcoded_titles[filepath];
  } else {
    // Find first h1 header and use it as the title.
    const headers = contents.content.match(/^# (.+)$/gm);
    if (headers === null) {
      // match() yields null when nothing matches; fail with a message that
      // names the file instead of crashing on `null.length`.
      throw new Error(`no h1 header found in ${filepath}`);
    }
    // "# ..." lines also occur as comments inside fenced code blocks, so the
    // multiple-header check is only enforced when there is no code fence.
    if (headers.length > 1 && !contents.content.includes("```")) {
      throw new Error(`too many h1 headers in ${filepath}`);
    }
    title = headers[0].slice(2).trim();
    if (title.startsWith("DataHub ")) {
      title = title.slice(8).trim();
    }
  }

  contents.data.title = title;
  contents.data.hide_title = true;
}

/**
 * Sets the `custom_edit_url` front-matter field to the file's location under
 * `GITHUB_EDIT_URL`. Any existing value is overwritten.
 */
function markdown_add_edit_url(
  contents: matter.GrayMatterFile<string>,
  filepath: string
): void {
  contents.data.custom_edit_url = [GITHUB_EDIT_URL, filepath].join("/");
}

/**
 * Sets the `slug` front-matter field from `get_slug`, unless the document
 * already declares a (truthy) slug of its own.
 */
function markdown_add_slug(
  contents: matter.GrayMatterFile<string>,
  filepath: string
): void {
  if (!contents.data.slug) {
    contents.data.slug = get_slug(filepath);
  }
}

/**
 * Rewrites a single link target found in a markdown file so that it works on
 * the generated docs site.
 *
 * - Absolute links to the hosted site become site-local links.
 * - Other http(s) links are returned unchanged, except that absolute GitHub
 *   links into this repo which end in `.md` or `.pdf` are rejected.
 * - `#anchor` links are returned unchanged.
 * - Relative links to source/config files or to paths without an extension
 *   become links under `GITHUB_BROWSE_URL`.
 * - Relative `.md` links are returned unchanged.
 * - Relative image/pdf links are re-based so they resolve from the generated
 *   file's location.
 *
 * @param original - The link target exactly as written in the markdown.
 * @param filepath - Path of the markdown file that contains the link.
 * @returns The rewritten link target.
 * @throws Error for a rejected absolute GitHub link, a repo link whose target
 *   does not exist locally (only when the file is in the sidebar), or a
 *   relative link with an unrecognized extension.
 */
function new_url(original: string, filepath: string): string {
  const lowered = original.toLowerCase();

  if (lowered.startsWith(HOSTED_SITE_URL)) {
    // For absolute links to the hosted docs site, we transform them into local ones.
    // The prefix was detected case-insensitively, so it is removed by length
    // rather than by a case-sensitive replace (which would silently leave a
    // mixed-case URL untouched). HOSTED_SITE_URL has no trailing slash, so the
    // remainder normally starts with one; a bare site-root link maps to "/"
    // instead of an empty target.
    return original.slice(HOSTED_SITE_URL.length) || "/";
  }

  if (original.startsWith("http://") || original.startsWith("https://")) {
    const points_into_repo =
      lowered.startsWith("https://github.com/linkedin/datahub/blob") ||
      lowered.startsWith("https://github.com/linkedin/datahub/tree");
    const is_doc_file = original.endsWith(".md") || original.endsWith(".pdf");
    if (points_into_repo && is_doc_file) {
      throw new Error(`absolute link (${original}) found in ${filepath}`);
    }
    return original;
  }

  if (original.startsWith("#")) {
    // These are anchor links that reference within the document itself.
    return original;
  }

  // Now we assume this is a local reference.
  const suffix = path.extname(original);
  const repo_file_extensions = [
    ".java",
    ".conf",
    ".xml",
    ".pdl",
    ".json",
    ".py",
    ".ts",
    ".yml",
    ".sh",
    ".env",
    ".sql",
  ];

  if (
    suffix === "" ||
    // startsWith (not equality) so that a trailing "#fragment" is tolerated.
    repo_file_extensions.some((ext) => suffix.startsWith(ext))
  ) {
    // A reference to a file or directory in the Github repo.
    const relation = path.dirname(filepath);
    const updated_path = path.normalize(`${relation}/${original}`);
    // The "#fragment" part is not part of the on-disk name, so it is dropped
    // for the existence check only; the emitted URL keeps it.
    const path_on_disk = updated_path.split("#")[0];
    if (!fs.existsSync(`../${path_on_disk}`) && actually_in_sidebar(filepath)) {
      // Detects when the path is a dangling reference, according to the locally
      // checked out repo.
      throw new Error(
        `broken github repo link: ${updated_path} in ${filepath}`
      );
    }
    return `${GITHUB_BROWSE_URL}/${updated_path}`;
  }

  if (suffix.startsWith(".md")) {
    // Leave as-is. startsWith allows anchor tags on links to other docs.
    return original;
  }

  if ([".png", ".svg", ".gif", ".pdf"].includes(suffix)) {
    // Let docusaurus bundle these as static assets.
    // Climb out of the generated file's directory (one level per "/" in the
    // path, plus two more) and then descend along the source file's path.
    const up_levels = (filepath.match(/\//g) ?? []).length;
    const relation = path.dirname(filepath);
    return path.normalize(
      `${"../".repeat(up_levels + 2)}/${relation}/${original}`
    );
  }

  throw new Error(`unknown extension - ${original} in ${filepath}`);
}

/**
 * Passes every link target in the document through `new_url` and stores the
 * rewritten text back on `contents.content`.
 *
 * Inline `[text](url)` links (which includes images) are handled first,
 * followed by `[text]: url` reference definitions.
 */
function markdown_rewrite_urls(
  contents: matter.GrayMatterFile<string>,
  filepath: string
): void {
  // The [text](url) syntax. The second capture group does a little bit of
  // parenthesis matching to account for parens in URLs.
  // See https://stackoverflow.com/a/17759264 for an explanation.
  const inline_link = /\[(.+?)\]\(((?:[^)(]+|\((?:[^)(]+|\([^)(]*\))*\))*)\)/g;
  // The [text]: url syntax, one definition per line.
  const reference_definition = /^\[(.+?)\]\s*:\s*(.+?)\s*$/gm;

  const inline_rewritten = contents.content.replace(
    inline_link,
    (_match, label, target) => {
      const rewritten = new_url(target.trim(), filepath);
      return `[${label}](${rewritten})`;
    }
  );

  contents.content = inline_rewritten.replace(
    reference_definition,
    (_match, label, target) => {
      const rewritten = new_url(target, filepath);
      return `[${label}]: ${rewritten}`;
    }
  );
}

/**
 * Removes the `<!--HOSTED_DOCS_ONLY` and `HOSTED_DOCS_ONLY-->` marker lines
 * from the document, leaving whatever sat between them in the output.
 *
 * Only lines consisting of exactly one of the markers are affected; the line
 * break after each marker is kept. `filepath` is not used.
 */
function markdown_enable_specials(
  contents: matter.GrayMatterFile<string>,
  filepath: string
): void {
  const marker_lines = [/^<!--HOSTED_DOCS_ONLY$/gm, /^HOSTED_DOCS_ONLY-->$/gm];
  let text = contents.content;
  for (const marker of marker_lines) {
    text = text.replace(marker, "");
  }
  contents.content = text;
}

// Main generation pass: read each selected markdown file from the parent
// directory, apply the front-matter and content transforms, and write the
// result to the same relative path under genDocs/.
for (const filepath of markdown_files) {
  const source_text = fs.readFileSync(`../${filepath}`).toString();
  const contents = matter(source_text);

  // Applied in this fixed order; each step mutates `contents` in place.
  const transforms = [
    markdown_guess_title,
    markdown_add_slug,
    markdown_add_edit_url,
    markdown_rewrite_urls,
    markdown_enable_specials,
  ];
  for (const transform of transforms) {
    transform(contents, filepath);
  }

  const outpath = `genDocs/${filepath}`;
  // Make sure the destination directory exists before writing into it.
  fs.mkdirSync(path.dirname(outpath), { recursive: true });
  fs.writeFileSync(outpath, contents.stringify(""));
}

// Report every doc that is not mentioned in the sidebar definition, then fail.
// All offending files are collected first so that a single run lists them all
// instead of stopping at the first one.
//
// The raw text of the sidebar file is searched for the quoted doc id.
// NOTE(review): searching the text (rather than the loaded `sidebars` object)
// means an id that only appears inside a comment in that file also counts —
// presumably intentional; confirm.
const sidebar = fs.readFileSync(SIDEBARS_DEF_PATH).toString();
const docs_missing_from_sidebar = markdown_files.filter(
  (filepath) => !sidebar.includes(`"${get_id(filepath)}"`)
);
if (docs_missing_from_sidebar.length > 0) {
  throw new Error(
    `Files not accounted for in sidebar:\n  ${docs_missing_from_sidebar.join(
      "\n  "
    )}`
  );
}