aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorOscar Najera <hi@oscarnajera.com>2023-07-31 21:51:44 +0200
committerOscar Najera <hi@oscarnajera.com>2024-01-13 02:27:42 +0100
commit20fb1a21587dc72d5e314119d6986fd3f2a45f13 (patch)
tree085993f4209c373733412dc334296558c9794e9a
parent88fead2e2b6082e88d58a1af7c0afa1765a25253 (diff)
downloaddotfiles-20fb1a21587dc72d5e314119d6986fd3f2a45f13.tar.gz
dotfiles-20fb1a21587dc72d5e314119d6986fd3f2a45f13.tar.bz2
dotfiles-20fb1a21587dc72d5e314119d6986fd3f2a45f13.zip
scratch sem grep
-rw-r--r--scratch/semgrep/parse.py109
-rw-r--r--scratch/semgrep/test.org20
2 files changed, 129 insertions, 0 deletions
diff --git a/scratch/semgrep/parse.py b/scratch/semgrep/parse.py
new file mode 100644
index 0000000..3846e3e
--- /dev/null
+++ b/scratch/semgrep/parse.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+
+import orgparse
+import pandas as pd
+import re
+
+
def org_roam_nodes_to_dataframe(org_file):
    """Collect the org-roam nodes of an org file into a DataFrame.

    A node is any entry returned by orgparse (the file-level root or a
    heading) that carries an ``ID`` property.

    Args:
        org_file: Whatever ``orgparse.load`` accepts; the value is also
            stored verbatim in the ``file_name`` column.

    Returns:
        A ``pandas.DataFrame`` with one row per node and the columns
        ``file_name``, ``node_id``, ``node_title``, ``node_hierarchy``,
        ``node_text``, ``node_text_nested`` and
        ``node_text_nested_exclusive``. The columns are present even when
        the file contains no node.
    """
    columns = [
        "file_name",
        "node_id",
        "node_title",
        "node_hierarchy",
        "node_text",
        "node_text_nested",
        "node_text_nested_exclusive",
    ]

    org_data = orgparse.load(org_file)

    # MULTILINE lets ``^``/``$`` anchor on every body line, so a title keyword
    # that is not on the very first line (or is followed by more text) is
    # still found.  ``[ \t]*`` instead of ``\s*`` keeps the match on one line.
    title_pattern = re.compile(
        r"^#\+title:[ \t]*(.*)$", re.IGNORECASE | re.MULTILINE
    )

    def extract_title(node):
        """Return the heading, else the ``#+title:`` value, else the first body line."""
        if node.heading:
            return node.heading
        match = title_pattern.search(node.body)
        if match:
            return match.group(1).strip()
        # No title keyword at the start of any line: fall back to the first
        # line of the body, dropping a stray keyword if one is embedded in it.
        first_line = node.body.split("\n")[0]
        return re.sub(r"#\+title:", "", first_line, flags=re.IGNORECASE).strip()

    def is_separate_node(node):
        """Tell whether a heading carries its own ``ID`` or ``SEARCH`` property."""
        return bool(node.properties.get("ID") or node.properties.get("SEARCH"))

    def render_subtree(node, skip_child=None):
        """Return the node body followed by its re-serialised descendants.

        Children for which ``skip_child(child)`` is true are left out
        together with their whole subtree.
        """
        parts = [node.body]
        for child in node.children:
            if skip_child is not None and skip_child(child):
                continue
            parts.append(
                "\n"
                + child.level * "*"
                + " "
                + child.heading
                + "\n"
                + render_subtree(child, skip_child)
            )
        return "".join(parts).strip()

    def build_node_hierarchy(node):
        """Return the titles from the outermost ancestor down to ``node``, joined by `` > ``."""
        titles = [extract_title(node)]
        ancestor = node.parent
        while ancestor is not None:
            titles.append(extract_title(ancestor))
            ancestor = ancestor.parent
        return " > ".join(reversed(titles)).strip()

    def node_to_dict(node, file_name):
        """Flatten one node into the row layout described by ``columns``."""
        return {
            "file_name": file_name,
            "node_id": node.properties.get("ID"),
            "node_title": extract_title(node),
            "node_hierarchy": build_node_hierarchy(node),
            "node_text": node.body,
            "node_text_nested": render_subtree(node),
            "node_text_nested_exclusive": render_subtree(node, is_separate_node),
        }

    # Keep only the entries that carry an ID property.
    nodes = [
        node_to_dict(node, org_file)
        for node in org_data[0][:]
        if node.properties.get("ID")
    ]

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(nodes, columns=columns)
+
+
# Scratch demo: embed a query and a few passages, then print their scores.
# SentenceTransformer and util were used without being imported; they are
# imported here, next to their only user.
from sentence_transformers import SentenceTransformer, util

# The model has to exist before the first encode() call; the original order
# called model.encode() ahead of this assignment and raised NameError.
model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Result intentionally discarded, as in the original scratch snippet.
model.encode(
    "What you need is the gist. Your mind needs the repetition to absorb the information. The true learning is on *doing* not in the input. If you can't use the material, you don't know. There is just a limited amount you can learn by listening. The rest you must do yourself. Courses give you ideas, you must figure out what to do with them."
)

query_embedding = model.encode("How big is London")
passage_embedding = model.encode(
    [
        "London has 9,787,426 inhabitants at the 2011 census",
        "London is known for its finacial district",
        "London is full of criminals",
        "Cairo is small",
    ]
)

# One score per passage for the single query.
print("Similarity:", util.dot_score(query_embedding, passage_embedding))
diff --git a/scratch/semgrep/test.org b/scratch/semgrep/test.org
new file mode 100644
index 0000000..aac16d0
--- /dev/null
+++ b/scratch/semgrep/test.org
@@ -0,0 +1,20 @@
+#+title: Test
+
+It has started all wars.
+
+No more data. It is a case of food suply. No more data. It is a case of food
+supply and nothing else for the.
+
+How an that
+
+#+begin_src emacs-lisp :results value raw
+(json-encode
+ (org-element-map
+ (org-element-parse-buffer)
+ 'paragraph
+ (lambda (hl)
+ (car
+ (org-element-contents hl)))))
+#+end_src
+
+#+RESULTS: