scratch/semgrep/parse.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109

#!/usr/bin/env python3

import orgparse
import pandas as pd
import re


def org_roam_nodes_to_dataframe(org_file):
    """Load an Org file and return its org-roam nodes as a DataFrame.

    Every entry yielded by orgparse for the file (the file-level root entry
    and each heading) that carries an ``ID`` property becomes one row.

    Args:
        org_file: Passed unchanged to ``orgparse.load`` and stored as-is in
            the ``file_name`` column.

    Returns:
        pandas.DataFrame with the columns ``file_name``, ``node_id``,
        ``node_title``, ``node_hierarchy`` (titles from the outermost
        ancestor down to the node, joined by ``" > "``), ``node_text`` (the
        node's own body), ``node_text_nested`` (body plus all descendant
        headings and bodies) and ``node_text_nested_exclusive`` (same, but
        skipping children that have an ``ID`` or ``SEARCH`` property).
        The columns are present even when no node has an ``ID``.
    """
    columns = [
        "file_name",
        "node_id",
        "node_title",
        "node_hierarchy",
        "node_text",
        "node_text_nested",
        "node_text_nested_exclusive",
    ]

    org_data = orgparse.load(org_file)

    # re.MULTILINE makes "^"/"$" anchor on every line, so the keyword is found
    # wherever it sits in the body rather than only when it is the first line.
    # "[ \t]*" (not "\s*") keeps an empty "#+title:" from swallowing the
    # following line as its value.
    title_pattern = re.compile(
        r"^#\+title:[ \t]*(.*)$", re.IGNORECASE | re.MULTILINE
    )

    def extract_title(node):
        """Return the node's heading, else its ``#+title:``, else its first body line."""
        if node.heading:
            return node.heading
        match = title_pattern.search(node.body)
        if match:
            return match.group(1).strip()
        # No title keyword on a line of its own: fall back to the first body
        # line, dropping any inline "#+title:" marker it may contain.
        first_line = node.body.split("\n", 1)[0]
        return re.sub(r"#\+title:", "", first_line, flags=re.IGNORECASE).strip()

    def render_subtree(node, include_child):
        """Return node.body followed by the headings/bodies of accepted descendants."""
        parts = [node.body]
        for child in node.children:
            if include_child(child):
                parts.extend(
                    (
                        "\n",
                        child.level * "*",
                        " ",
                        child.heading,
                        "\n",
                        render_subtree(child, include_child),
                    )
                )
        return "".join(parts).strip()

    def accept_any_child(child):
        return True

    def is_not_own_node(child):
        # Children with an ID or SEARCH property are left out of the
        # "exclusive" text of their parent.
        return not child.properties.get("ID") and not child.properties.get("SEARCH")

    def build_node_hierarchy(node):
        """Return the titles from the outermost ancestor down to ``node``."""
        hierarchy = [extract_title(node)]
        parent = node.parent
        while parent:
            hierarchy.append(extract_title(parent))
            parent = parent.parent
        return " > ".join(reversed(hierarchy)).strip()

    def node_to_dict(node, file_name):
        """Flatten one node into a row dictionary keyed by ``columns``."""
        return {
            "file_name": file_name,
            "node_id": node.properties.get("ID"),
            "node_title": extract_title(node),
            "node_hierarchy": build_node_hierarchy(node),
            "node_text": node.body,
            "node_text_nested": render_subtree(node, accept_any_child),
            "node_text_nested_exclusive": render_subtree(node, is_not_own_node),
        }

    # Only entries that carry an ID are kept.
    nodes = [
        node_to_dict(node, org_file)
        for node in org_data[0][:]
        if node.properties.get("ID")
    ]

    # Explicit columns keep the schema stable when `nodes` is empty.
    return pd.DataFrame(nodes, columns=columns)


# Scratch demo: embed a query and a few passages, then print their dot-product
# similarity scores. These statements run at module import time.
#
# SentenceTransformer and util are not imported anywhere else in this file, so
# they are imported here; the model must exist before its first encode() call.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Stand-alone encode of a longer passage; the resulting embedding is not used.
model.encode(
    "What you need is the gist. Your mind needs the repetition to absorb the information. The true learning is on *doing* not in the input. If you can't use the material, you don't know. There is just a limited amount you can learn by listening. The rest you must do yourself. Courses give you ideas, you must figure out what to do with them."
)

query_embedding = model.encode("How big is London")
passage_embedding = model.encode(
    [
        "London has 9,787,426 inhabitants at the 2011 census",
        "London is known for its finacial district",
        "London is full of criminals",
        "Cairo is small",
    ]
)

print("Similarity:", util.dot_score(query_embedding, passage_embedding))