#!/usr/bin/env python
# -*- coding: utf-8 -*-
r"""
extract book highlights to orgmode
==================================

Extracts from exported html notes from kindle to org subtree
"""
# Created: Sat Jul 14 02:10:24 2018
# Author: Óscar Nájera
# License: GPL-3

import argparse
from textwrap import fill as filltxt

from bs4 import BeautifulSoup


def html_notes2org(page):
    """Convert an HTML notes export into org-mode text.

    Every ``<div>`` in the document is visited in document order and
    rendered according to the CSS classes it carries:

    * ``bookTitle``      -> start of a level-1 heading (``* title``)
    * ``authors``        -> `` -- authors`` plus a newline, which completes
      the heading line when it directly follows the title
    * ``sectionHeading`` -> a level-2 heading (``** heading``)
    * ``noteText``       -> a ``#+begin_quote`` block holding the text
      re-wrapped by ``textwrap.fill`` at its default width

    Divs with no class attribute, or with none of the classes above,
    contribute nothing.

    Parameters
    ----------
    page : str or bytes
        HTML markup, passed unchanged to ``BeautifulSoup`` with the
        ``lxml`` parser.

    Returns
    -------
    str
        The concatenated org-mode text; empty when no div matched.
    """
    soup = BeautifulSoup(page, "lxml")
    parts = []
    for div in soup.find_all("div"):
        # A div without a class attribute yields None here; fall back to an
        # empty tuple so the membership tests below cannot raise TypeError.
        classes = div.attrs.get("class") or ()

        # Independent ifs (not elif): a div carrying several of these
        # classes emits every matching fragment, in this order.
        if "bookTitle" in classes:
            # No trailing newline: the authors fragment finishes this line.
            parts.append("* {}".format(div.text.strip()))
        if "authors" in classes:
            parts.append(" -- {}\n".format(div.text.strip()))
        if "sectionHeading" in classes:
            heading = "** " + div.text.lstrip()
            # Only the left side is stripped, so the heading's line break
            # normally comes from the div text itself. Add one when it is
            # missing so the heading never merges with what follows.
            if not heading.endswith("\n"):
                heading += "\n"
            parts.append(heading)
        if "noteText" in classes:
            parts.append(
                "#+begin_quote\n{}\n#+end_quote\n\n".format(
                    filltxt(div.text.strip())
                )
            )
    return "".join(parts)


def main():
    """Command-line entry point.

    Reads the HTML file named by the single positional argument, converts
    it with ``html_notes2org`` and writes the result to ``sub.org`` in the
    current working directory, replacing any existing file of that name.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("file", help="HTML file from kindle export")
    args = parser.parse_args()

    # Read raw bytes and hand them to the parser undecoded.
    with open(args.file, "rb") as fid:
        page = fid.read()

    docs = html_notes2org(page)

    # Explicit UTF-8 so non-ASCII highlight text is written the same way
    # regardless of the platform's locale encoding.
    with open("sub.org", "w", encoding="utf-8") as fid:
        fid.write(docs)


if __name__ == "__main__":
    main()