# Source code for notion.markdown

import re

from commonmark import Parser
from commonmark.dump import prepare

# Characters treated as delimiters when looking at the edge of a markdown
# chunk: ASCII punctuation, the "☃" placeholder and whitespace characters.
delimiters = set(
    # ASCII punctuation
    "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
    # placeholder used by the markers in _NOTION_TO_MARKDOWN_MAPPER
    "☃"
    # ASCII whitespace and separator control characters
    " \t\n\x0b\x0c\r\x1c\x1d\x1e\x1f"
    # Latin-1 and Unicode whitespace
    "\x85\xa0\u1680"
    "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a"
    "\u2028\u2029\u202f\u205f\u3000"
)

# Markdown marker emitted for each Notion format code.
_NOTION_TO_MARKDOWN_MAPPER = {
    "i": "☃",  # italic
    "b": "☃☃",  # bold
    "s": "~~",  # strikethrough
    "c": "`",  # inline code
}

# Order used to sort the formats of a segment before rendering them.
FORMAT_PRECEDENCE = [
    "s",
    "b",
    "i",
    "a",
    "c",
]


def _extract_text_and_format_from_ast(item: dict):
    """
    Pull the literal text and the inline format out of one AST node.

    Arguments
    ---------
    item : dict
        AST node. Must contain ``"type"``; ``"literal"`` and
        ``"destination"`` are optional and default to ``""``.

    Returns
    -------
    tuple
        ``(literal, format)``. ``format`` is ``("s",)``, ``("b",)``,
        ``("i",)``, ``("c",)`` or ``("a", destination)``, and the empty
        tuple for every other node.
    """
    text = item.get("literal", "")
    kind = item["type"]

    # an opening strikethrough tag carries a format but no text of its own
    if kind == "html_inline" and text == "<s>":
        return "", ("s",)

    # links are the only format that carries an extra value
    if kind == "link":
        return text, ("a", item.get("destination", ""))

    for node_type, code in (("strong", "b"), ("emph", "i"), ("code", "c")):
        if kind == node_type:
            return text, (code,)

    return text, ()


def _get_format(notion_segment, as_set=False):
    """
    Return the formats attached to a Notion text segment.

    Arguments
    ---------
    notion_segment : list
        Segment shaped ``[text]`` or ``[text, formats]``.

    as_set : bool, optional
        When True, return a set of tuples instead of the stored list.
        Defaults to False.

    Returns
    -------
    list or set
        The segment's formats; empty when the segment has only one element.
    """
    has_formats = len(notion_segment) != 1

    if as_set:
        # tuples are hashable, so format sets can be compared regardless of order
        return {tuple(f) for f in notion_segment[1]} if has_formats else set()

    return notion_segment[1] if has_formats else []


def _cleanup_dashes(thing):
    """
    Replace every "⸻" (or its percent-encoded form) with "-".

    Strings are returned as a new value. Lists are walked recursively and
    updated in place. Any other value is returned untouched.

    Arguments
    ---------
    thing : str or list or any
        Value to clean up.

    Returns
    -------
    str or list or any
        Cleaned string, the same (mutated) list, or the original value.
    """
    kind = type(thing)

    if kind is str:
        return re.sub("⸻|%E2%B8%BB", "-", thing)

    if kind is list:
        # slice assignment keeps the caller's list object
        thing[:] = [_cleanup_dashes(entry) for entry in thing]

    return thing


def markdown_to_notion(markdown: str) -> list:
    """
    Convert Markdown formatted string to Notion.

    Arguments
    ---------
    markdown : str
        Text to convert.

    Returns
    -------
    list
        Text segments, each shaped ``[text]`` or ``[text, formats]``, where
        ``formats`` is a list of format lists such as ``["b"]`` or
        ``["a", destination]``. Adjacent segments with identical formats
        are merged.
    """
    # commonmark doesn't support strikethrough,
    # so we need to handle it ourselves
    while markdown.count("~~") >= 2:
        markdown = markdown.replace("~~", "<s>", 1)
        markdown = markdown.replace("~~", "</s>", 1)

    # we don't want to touch dashes, so temporarily replace them here
    markdown = markdown.replace("-", "⸻")

    parser = Parser()
    ast = prepare(parser.parse(markdown))

    active_formats = set()
    notion = []

    for section in ast:
        _, ended_format = _extract_text_and_format_from_ast(section)
        if ended_format:
            active_formats.discard(ended_format)

        if section["type"] == "paragraph":
            notion.append(["\n\n"])

        for item in section.get("children", []):
            literal, new_format = _extract_text_and_format_from_ast(item)

            if new_format:
                active_formats.add(new_format)

            if item["type"] == "html_inline" and literal == "</s>":
                # discard, not remove: a closing tag without a matching
                # opening one must not raise KeyError
                active_formats.discard(("s",))
                literal = ""

            if item["type"] == "softbreak":
                literal = "\n"

            if literal:
                notion.append(
                    [literal, [list(f) for f in sorted(active_formats)]]
                    if active_formats
                    else [literal]
                )

            # in the ast format, code blocks are meant
            # to be immediately self-closing
            active_formats.discard(("c",))

    # remove any trailing newlines from automatic closing paragraph markers
    if notion:
        notion[-1][0] = notion[-1][0].rstrip("\n")

    # consolidate any adjacent text blocks with identical styles
    consolidated = []
    for item in notion:
        if consolidated and _get_format(consolidated[-1], as_set=True) == _get_format(
            item, as_set=True
        ):
            consolidated[-1][0] += item[0]
        elif item[0]:
            consolidated.append(item)

    # restore the dashes that were swapped out before parsing
    return _cleanup_dashes(consolidated)
def _render_notion_segment(text, formats, whitespace_pattern):
    """Render one Notion segment as a markdown chunk that still uses the "☃" placeholder."""
    match = whitespace_pattern.match(text)
    if not match:
        raise Exception("Unable to extract text from: %r" % text)

    leading_whitespace = match.group("leading")
    stripped = match.group("stripped")
    trailing_whitespace = match.group("trailing")

    sorted_formats = sorted(
        formats,
        key=lambda f: FORMAT_PRECEDENCE.index(f[0]) if f[0] in FORMAT_PRECEDENCE else -1,
    )

    # markers go inside the surrounding whitespace, and only around real text
    parts = [leading_whitespace]
    for f in sorted_formats:
        if f[0] in _NOTION_TO_MARKDOWN_MAPPER and stripped:
            parts.append(_NOTION_TO_MARKDOWN_MAPPER[f[0]])
        if f[0] == "a":
            parts.append("[")

    parts.append(stripped)

    # close the markers in the opposite order they were opened
    for f in reversed(sorted_formats):
        if f[0] in _NOTION_TO_MARKDOWN_MAPPER and stripped:
            parts.append(_NOTION_TO_MARKDOWN_MAPPER[f[0]])
        if f[0] == "a":
            parts.append("]({})".format(f[1]))

    parts.append(trailing_whitespace)

    # to make it parseable, add a space after if it combines
    # code/links and emphasis formatting
    format_types = [f[0] for f in formats]
    if (
        ("c" in format_types or "a" in format_types)
        and ("b" in format_types or "i" in format_types)
        and not trailing_whitespace
    ):
        parts.append(" ")

    return "".join(parts)


def _join_markdown_chunks(markdown_chunks):
    """Resolve the "☃" placeholders of every chunk and concatenate the chunks."""
    rendered_chunks = []
    last_used_underscores = False
    final_index = len(markdown_chunks) - 1

    for index, chunk in enumerate(markdown_chunks):
        previous_chunk = markdown_chunks[index - 1] if index > 0 else ""
        next_chunk = markdown_chunks[index + 1] if index < final_index else ""

        prev_ended_in_delimiter = not previous_chunk or previous_chunk[-1] in delimiters
        next_starts_with_delimiter = not next_chunk or next_chunk[0] in delimiters

        # use underscores as needed to separate adjacent chunks
        # to avoid ambiguous runs of asterisks
        if (
            prev_ended_in_delimiter
            and next_starts_with_delimiter
            and not last_used_underscores
            # a lone placeholder character cannot be both an opening and a
            # closing marker; this also keeps chunk[1] below in range
            and len(chunk) > 1
            and chunk.startswith("☃")
            and chunk.endswith("☃")
        ):
            count = 2 if chunk[1] == "☃" else 1
            chunk = "_" * count + chunk[count:-count] + "_" * count
            last_used_underscores = True
        else:
            last_used_underscores = False

        final_markdown = chunk.replace("☃", "*")

        # to make it parseable, convert emphasis/strong combinations
        # to use a mix of _ and *
        if "***" in final_markdown:
            final_markdown = final_markdown.replace("***", "**_", 1)
            final_markdown = final_markdown.replace("***", "_**", 1)

        rendered_chunks.append(final_markdown)

    return "".join(rendered_chunks)


def notion_to_markdown(notion: list) -> str:
    """
    Convert list of notion text segments to markdown text.

    Arguments
    ---------
    notion : list
        Segments shaped ``[text]`` or ``[text, formats]``, where ``formats``
        is a list of format lists such as ``["b"]`` or ``["a", url]``.
        ``None`` is treated as an empty list.

    Raises
    ------
    Exception
        When it's unable to extract text.

    Returns
    -------
    str
        Converted Markdown text.
    """
    # splits a segment into leading whitespace, content and trailing whitespace
    pattern = re.compile(r"^(?P<leading>\s*)(?P<stripped>(\s|.)*?)(?P<trailing>\s*)$")

    markdown_chunks = [
        _render_notion_segment(item[0], item[1] if len(item) == 2 else [], pattern)
        for item in notion or []
    ]

    return _join_markdown_chunks(markdown_chunks)
def notion_to_plaintext(notion: list, client=None) -> str:
    """
    Convert list of notion text segments to plain text.

    Arguments
    ---------
    notion : list
        Text in a Notion specific API format i.e. [["some text"]]
        ``None`` is treated as an empty list.

    client : NotionClient, optional
        Used for getting blocks and users, if passed.
        Without it, page and user links are rendered as
        ``"page:<id>"`` and ``"user:<id>"``.
        Defaults to None.

    Returns
    -------
    str
        Converted text.
    """
    parts = []

    for item in notion or []:
        text = item[0]
        formats = item[1] if len(item) == 2 else []

        if text != "‣":
            parts.append(text)
            continue

        # "‣" itself is never emitted; only its page/user links are
        for f in formats:
            if f[0] == "p":  # page link
                if client is None:
                    parts.append("page:" + f[1])
                else:
                    parts.append(client.get_block(f[1]).title_plaintext)
            elif f[0] == "u":  # user link
                if client is None:
                    parts.append("user:" + f[1])
                else:
                    parts.append(client.get_user(f[1]).full_name)

    return "".join(parts)
def plaintext_to_notion(plaintext: str) -> list:
    """
    Convert plain text to list of notion text segments.

    Arguments
    ---------
    plaintext : str
        Text to be converted.

    Returns
    -------
    list
        A single unformatted segment wrapping the text, i.e. ``[[plaintext]]``.
    """
    return [[plaintext]]