1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
|
#!/usr/bin/env python3
import orgparse
import pandas as pd
import re
def org_roam_nodes_to_dataframe(org_file) -> pd.DataFrame:
    """Collect the org-roam nodes of an org file into a DataFrame.

    The file is parsed with ``orgparse.load`` and every parsed node that has
    a truthy ``ID`` property becomes one row.

    Columns:
        file_name: ``org_file`` exactly as it was passed in.
        node_id: The node's ``ID`` property.
        node_title: The node's heading, or a title recovered from its body.
        node_hierarchy: Titles of all ancestors and the node itself, outermost
            first, joined with ``" > "``.
        node_text: The node's own body.
        node_text_nested: The body plus the headings and bodies of all
            descendants.
        node_text_nested_exclusive: Like ``node_text_nested`` but skipping
            every child subtree whose root has an ``ID`` or ``SEARCH``
            property.

    Args:
        org_file: Passed unchanged to ``orgparse.load``.

    Returns:
        A DataFrame with one row per node that has an ``ID`` property. The
        columns listed above are always present, even when there are no rows.
    """
    org_data = orgparse.load(org_file)

    # MULTILINE makes ``^``/``$`` anchor at every line, so the keyword is found
    # even when other lines precede or follow it in the body. ``[ \t]*`` (not
    # ``\s*``) keeps the match from running onto the following line when the
    # title is empty.
    title_pattern = re.compile(
        r"^#\+title:[ \t]*(.*)$", re.IGNORECASE | re.MULTILINE
    )

    def extract_title(node):
        """Return the node's heading, or a title recovered from its body."""
        if node.heading:
            return node.heading
        match = title_pattern.search(node.body)
        if match:
            return match.group(1).strip()
        # No ``#+title:`` at the start of any line: fall back to the first
        # body line, dropping a keyword that appears elsewhere in that line.
        first_line = node.body.split("\n")[0]
        return re.sub(
            r"#\+title:", "", first_line, flags=re.IGNORECASE
        ).strip()

    def is_separate_node(node):
        """Tell whether the node has a truthy ``ID`` or ``SEARCH`` property."""
        return bool(node.properties.get("ID") or node.properties.get("SEARCH"))

    def extract_node_nested_body(node, exclusive=False):
        """Return the node's body followed by its rendered descendants.

        Each descendant is rendered as a line of ``*`` repeated ``level``
        times, a space and the heading, followed by its own nested body.
        With ``exclusive=True`` child subtrees rooted at a separate node
        (see ``is_separate_node``) are left out.
        """
        parts = [node.body]
        for child in node.children:
            if exclusive and is_separate_node(child):
                continue
            parts.append(
                "\n"
                + child.level * "*"
                + " "
                + child.heading
                + "\n"
                + extract_node_nested_body(child, exclusive)
            )
        return "".join(parts).strip()

    def build_node_hierarchy(node):
        """Return ``"outermost > ... > node"`` built from the node's ancestors."""
        titles = [extract_title(node)]
        parent = node.parent
        # Every ancestor is included, up to the topmost one.
        while parent is not None:
            titles.append(extract_title(parent))
            parent = parent.parent
        return " > ".join(reversed(titles)).strip()

    def node_to_dict(node, file_name):
        """Flatten one node into the row dictionary used for the DataFrame."""
        return {
            "file_name": file_name,
            "node_id": node.properties.get("ID"),
            "node_title": extract_title(node),
            "node_hierarchy": build_node_hierarchy(node),
            "node_text": node.body,
            "node_text_nested": extract_node_nested_body(node),
            "node_text_nested_exclusive": extract_node_nested_body(
                node, exclusive=True
            ),
        }

    # Only nodes carrying an ID property are treated as org-roam nodes.
    nodes = [
        node_to_dict(node, org_file)
        for node in org_data[0][:]
        if node.properties.get("ID")
    ]
    # Naming the columns keeps the schema stable when ``nodes`` is empty.
    return pd.DataFrame(
        nodes,
        columns=[
            "file_name",
            "node_id",
            "node_title",
            "node_hierarchy",
            "node_text",
            "node_text_nested",
            "node_text_nested_exclusive",
        ],
    )
# NOTE(review): `SentenceTransformer` and `util` are not imported anywhere in
# the visible part of this file; presumably they come from the
# `sentence_transformers` package -- confirm and add the import.

# Create the model before any `encode` call so the name is bound when used.
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# The result of this call is not stored.
model.encode(
    "What you need is the gist. Your mind needs the repetition to absorb the information. The true learning is on *doing* not in the input. If you can't use the material, you don't know. There is just a limited amount you can learn by listening. The rest you must do yourself. Courses give you ideas, you must figure out what to do with them."
)

# Encode one query and several candidate passages, then print the result of
# `util.dot_score` for the query against the passages.
query_embedding = model.encode("How big is London")
passage_embedding = model.encode(
    [
        "London has 9,787,426 inhabitants at the 2011 census",
        "London is known for its finacial district",
        "London is full of criminals",
        "Cairo is small",
    ]
)
print("Similarity:", util.dot_score(query_embedding, passage_embedding))
|