aboutsummaryrefslogtreecommitdiffstats
path: root/scratch/semgrep/parse.py
diff options
context:
space:
mode:
Diffstat (limited to 'scratch/semgrep/parse.py')
-rw-r--r--scratch/semgrep/parse.py109
1 files changed, 0 insertions, 109 deletions
diff --git a/scratch/semgrep/parse.py b/scratch/semgrep/parse.py
deleted file mode 100644
index 3846e3e..0000000
--- a/scratch/semgrep/parse.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/usr/bin/env python3
-
-import orgparse
-import pandas as pd
-import re
-
-
-def org_roam_nodes_to_dataframe(org_file):
- # Load the org file into an OrgData object
- org_data = orgparse.load(org_file)
-
- # Define a function to extract the title of a node
- def extract_title(node):
- if node.heading:
- # If the node has a heading, return it
- return node.heading
- else:
- # Otherwise, extract the title from the org file using a regular expression
- title_pattern = re.compile(r"^#\+title:\s*(.*)$", re.IGNORECASE)
- match = title_pattern.search(node.body)
- if match:
- return match.group(1)
- else:
- # If the title is not found, extract it from the first line of the body
- return re.sub(
- r"#\+title:", "", node.body.split("\n")[0], flags=re.IGNORECASE
- ).strip()
-
- # Define a function to recursively extract the bodies of a node and its descendants
- def extract_node_nested_body(node):
- body = node.body
- for child in node.children:
- body += (
- "\n"
- + child.level * "*"
- + " "
- + child.heading
- + "\n"
- + extract_node_nested_body(child)
- )
- return body.strip()
-
- # Define a function to recursively extract the bodies of a node
- # and its descendants when they are not other nodes
- def extract_node_nested_body_exclusive(node):
- body = node.body
- for child in node.children:
- if not child.properties.get("ID") and not child.properties.get("SEARCH"):
- body += (
- "\n"
- + child.level * "*"
- + " "
- + child.heading
- + "\n"
- + extract_node_nested_body_exclusive(child)
- )
- return body.strip()
-
- # Define a function to build the hierarchy of a node
- def build_node_hierarchy(node):
- hierarchy = [extract_title(node)]
- parent = node.parent
-
- # while parent and parent != org_data[0]:
- while parent:
- hierarchy.append(extract_title(parent))
- parent = parent.parent
- return " > ".join(reversed(hierarchy)).strip()
-
- # Define a function to convert a node to a dictionary
- def node_to_dict(node, file_name):
- node_dict = {
- "file_name": file_name,
- "node_id": node.properties.get("ID"),
- "node_title": extract_title(node),
- "node_hierarchy": build_node_hierarchy(node),
- "node_text": node.body,
- "node_text_nested": extract_node_nested_body(node),
- "node_text_nested_exclusive": extract_node_nested_body_exclusive(node),
- }
-
- return node_dict
-
- # Create a list of all org-roam nodes in the OrgData object
- nodes = [
- node_to_dict(node, org_file)
- for node in org_data[0][:]
- if node.properties.get("ID")
- ]
-
- return pd.DataFrame(nodes)
-
-
-model.encode(
- "What you need is the gist. Your mind needs the repetition to absorb the information. The true learning is on *doing* not in the input. If you can't use the material, you don't know. There is just a limited amount you can learn by listening. The rest you must do yourself. Courses give you ideas, you must figure out what to do with them."
-)
-model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
-
-query_embedding = model.encode("How big is London")
-passage_embedding = model.encode(
- [
- "London has 9,787,426 inhabitants at the 2011 census",
- "London is known for its finacial district",
- "London is full of criminals",
- "Cairo is small",
- ]
-)
-
-print("Similarity:", util.dot_score(query_embedding, passage_embedding))