#!/usr/bin/env python3
"""Turn the org-roam nodes of an Org file into a pandas DataFrame.

An "org-roam node" here is any orgparse node that carries an ``ID`` property.
When run as a script, a small sentence-embedding similarity demo is executed.
"""
import re

import orgparse
import pandas as pd

# Matches a ``#+title:`` keyword line anywhere in a body.  MULTILINE is needed
# so that ``^``/``$`` anchor to individual lines rather than the whole body;
# ``[ \t]*`` (not ``\s*``) keeps an empty title from swallowing the next line.
_TITLE_LINE = re.compile(r"^#\+title:[ \t]*(.*)$", re.IGNORECASE | re.MULTILINE)

# Column order of the resulting DataFrame (same order as the row dict keys).
_COLUMNS = (
    "file_name",
    "node_id",
    "node_title",
    "node_hierarchy",
    "node_text",
    "node_text_nested",
    "node_text_nested_exclusive",
)


def _extract_title(node):
    """Return the title of *node*.

    The heading is used when it is non-empty.  Otherwise the first
    ``#+title:`` line found in the body is used, and if there is none, the
    first body line (with any ``#+title:`` marker removed) is returned.
    """
    if node.heading:
        return node.heading

    match = _TITLE_LINE.search(node.body)
    if match:
        return match.group(1).strip()

    first_line = node.body.split("\n")[0]
    return re.sub(r"#\+title:", "", first_line, flags=re.IGNORECASE).strip()


def _is_plain_heading(node):
    """Return True when *node* has neither an ``ID`` nor a ``SEARCH`` property."""
    return not node.properties.get("ID") and not node.properties.get("SEARCH")


def _nested_body(node, include_child=None):
    """Return the body of *node* followed by its rendered descendants.

    Each included child is rendered as a ``*``-prefixed heading line (one star
    per level) followed by its own nested body.  When *include_child* is
    given, children for which it returns False are skipped together with their
    whole subtree.  The result is stripped of surrounding whitespace.
    """
    parts = [node.body]
    for child in node.children:
        if include_child is not None and not include_child(child):
            continue
        stars = "*" * child.level
        parts.append(f"{stars} {child.heading}\n{_nested_body(child, include_child)}")
    return "\n".join(parts).strip()


def _build_node_hierarchy(node):
    """Return the titles from the outermost ancestor down to *node*, joined by `` > ``."""
    titles = [_extract_title(node)]
    parent = node.parent
    # The walk continues until there is no parent, so the file-level node's
    # title is part of the path.  Stop at the root instead to leave it out.
    while parent is not None:
        titles.append(_extract_title(parent))
        parent = parent.parent
    return " > ".join(reversed(titles)).strip()


def _node_to_dict(node, file_name):
    """Build one DataFrame row (a dict keyed by ``_COLUMNS``) for *node*."""
    return {
        "file_name": file_name,
        "node_id": node.properties.get("ID"),
        "node_title": _extract_title(node),
        "node_hierarchy": _build_node_hierarchy(node),
        "node_text": node.body,
        "node_text_nested": _nested_body(node),
        "node_text_nested_exclusive": _nested_body(node, _is_plain_heading),
    }


def org_roam_nodes_to_dataframe(org_file):
    """Load *org_file* with orgparse and return its org-roam nodes as a DataFrame.

    Args:
        org_file: Value passed to ``orgparse.load`` and stored verbatim in the
            ``file_name`` column.

    Returns:
        A ``pandas.DataFrame`` with one row per node that has a truthy ``ID``
        property and the columns listed in ``_COLUMNS``.  When no node
        qualifies the frame is empty but still has those columns.
    """
    org_data = orgparse.load(org_file)

    # Full slice starting from the first node, exactly as the original did.
    # NOTE(review): per the orgparse README this yields the root node and
    # every heading below it -- confirm against the installed version.
    rows = [
        _node_to_dict(node, org_file)
        for node in org_data[0][:]
        if node.properties.get("ID")
    ]
    return pd.DataFrame(rows, columns=list(_COLUMNS))


def _embedding_demo():
    """Encode a few sentences and print query/passage dot-product scores."""
    # NOTE(review): the original script used ``SentenceTransformer`` and
    # ``util`` without importing them; the names match the
    # sentence-transformers package API -- confirm this is the intended source.
    from sentence_transformers import SentenceTransformer, util

    # The model has to exist before the first ``encode`` call.
    model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

    # The return value of this call is not used.
    model.encode(
        "What you need is the gist. Your mind needs the repetition to absorb "
        "the information. The true learning is on *doing* not in the input. "
        "If you can't use the material, you don't know. There is just a "
        "limited amount you can learn by listening. The rest you must do "
        "yourself. Courses give you ideas, you must figure out what to do "
        "with them."
    )

    query_embedding = model.encode("How big is London")
    passage_embedding = model.encode(
        [
            "London has 9,787,426 inhabitants at the 2011 census",
            "London is known for its finacial district",
            "London is full of criminals",
            "Cairo is small",
        ]
    )

    print("Similarity:", util.dot_score(query_embedding, passage_embedding))


if __name__ == "__main__":
    _embedding_demo()